I have a task to download 200M images from a provider's image server as quickly as possible. We generally use the requests library for HTTP communication. To speed things up, I first tried Python's concurrent.futures package to run the downloads in multiple threads, but Python threads are not truly parallel because of the Global Interpreter Lock (GIL): only one thread executes Python bytecode at a time. In my case it did not help that much, even though the work is I/O-heavy.

asyncio

The asyncio package really fits this requirement. It uses an event loop to switch between coroutines whenever one of them is waiting on I/O, which speeds things up. asyncio was added in Python 3.4, and Python 3.5 introduced two new keywords, async and await, to work with it.

Here is a simple code sample that shows the difference. To download 20 images, the async version takes 5 seconds, while the sync version takes 31 seconds, which is about 6 times slower than the async version.

import asyncio

import aiohttp
import requests


async def async_download_image(image, session=None):
    """Download one image and save it under ``/tmp/async/``.

    Args:
        image: Path of the image relative to the image server root, e.g.
            ``"<folder>/<name>.png"``. The part after the last ``/`` is used
            as the local file name.
        session: Optional ``aiohttp.ClientSession`` to issue the request on.
            When omitted, a session is created for this single download and
            closed afterwards.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (4xx/5xx).
    """
    if session is None:
        # Standalone call: own the session for the duration of this download.
        async with aiohttp.ClientSession() as own_session:
            await async_download_image(image, session=own_session)
        return

    async with session.get(f"http://images.ipsensus.com:8080/{image}") as res:
        # Fail loudly instead of saving an error page under a .png name.
        res.raise_for_status()
        # Read the whole body before touching the disk so that a failed
        # transfer does not leave an empty or truncated file behind.
        payload = await res.read()

    file = image.split("/")[-1]
    with open(f"/tmp/async/{file}", "wb") as f:
        f.write(payload)
    # Report success only once the bytes have actually been written.
    print(f"downloaded {file}")


async def async_download_images(images, max_concurrency=100):
    """Download all ``images`` concurrently on the running event loop.

    Args:
        images: Iterable of image paths, each passed to
            ``async_download_image``.
        max_concurrency: Upper bound on the number of downloads in flight at
            the same time. Must be at least 1.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Without a cap, every image opens its connection and output file at
    # once, which exhausts sockets/file descriptors on large batches.
    limiter = asyncio.Semaphore(max_concurrency)

    async def _bounded_download(image):
        async with limiter:
            await async_download_image(image)

    tasks = [asyncio.create_task(_bounded_download(image)) for image in images]
    await asyncio.gather(*tasks)


def sync_download_images(images):
    """Download ``images`` one after another and save them under ``/tmp/sync/``.

    Args:
        images: Iterable of image paths relative to the image server root.
            The part after the last ``/`` is used as the local file name.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status
            (4xx/5xx) for any image.
    """
    # A Session keeps the connection to the server open between requests
    # instead of reconnecting for every single image.
    with requests.Session() as session:
        for image in images:
            r = session.get(f"http://images.ipsensus.com:8080/{image}")
            # Fail loudly instead of saving an error page under a .png name.
            r.raise_for_status()
            file = image.split("/")[-1]
            with open(f"/tmp/sync/{file}", "wb") as f:
                f.write(r.content)
            # Report success only once the bytes have actually been written.
            print(f"downloaded {file}")


if __name__ == "__main__":
    # Benchmark entry point: times one download variant over a fixed set of
    # 20 images. Usage: python <script> [sync|async]   (default: sync)
    import os
    import sys
    import time

    images = [
        '003-007-001-12752/7f9695b3-6eaf-4d68-b2e3-8e00aafb7d73.png',
        '003-007-001-12753/899b781b-7584-4367-baf6-5a0224d70960.png',
        '003-007-001-12711/66553f65-f321-4a23-b747-9d7345717d35.png',
        '003-007-001-12711/b6399c73-f6b4-4911-a9d3-3fd17a5a8185.png',
        '003-007-001-12711/965db348-524a-4ac3-af7b-7048dde599d1.png',
        '003-007-001-12750/073e4d64-15c5-4f20-95f8-e7d29035277d.png',
        '003-007-001-12749/070e467b-c013-4e44-8e84-2b5645a803a3.png',
        '003-007-001-12749/9fc928d9-bd31-4fcf-b092-357392033eed.png',
        '003-007-001-12749/83570dbe-1bfa-4597-8500-0419cacfcd6b.png',
        '003-007-001-12751/20917d0e-d177-4b8e-ac57-407336cef4bf.png',
        '003-007-001-12747/bb7ed0ed-4783-40a2-87df-96338c4f9962.png',
        '003-007-001-12714/78b5126a-d374-4818-91db-acdc8e984c76.png',
        '003-007-001-12710/7307f5af-dab9-4cd8-bf91-94d39d394206.png',
        '003-007-001-12714/2e067bf9-ac31-4fff-9705-dd630bdd73a8.png',
        '003-007-001-12711/377d2fe3-cd2d-4c05-a06b-3c919f72682b.png',
        '003-007-001-12715/60de6045-2d0b-44a4-8a52-2281a6cdce4b.png',
        '003-007-001-12716/a798f5e0-4e46-4193-9c19-641c1e7d1858.png',
        '003-007-001-12759/ca3b678c-358c-4a68-a8dd-0b41d8cbc1ba.png',
        '003-007-001-12760/c8c4a7fd-01e6-4bb7-8d40-3e909a904777.png',
        '003-007-001-12760/d44dc080-6238-4c8e-991f-97126112c0d3.png',
    ]

    # Pick the variant from the command line instead of commenting code in
    # and out; with no argument the sync variant runs.
    mode = sys.argv[1] if len(sys.argv) > 1 else "sync"
    if mode not in ("sync", "async"):
        sys.exit(f"usage: {sys.argv[0]} [sync|async]")

    # open() does not create missing directories, so make sure the target
    # folder exists before the timed section starts.
    os.makedirs(f"/tmp/{mode}", exist_ok=True)

    s = time.perf_counter()
    if mode == "async":
        asyncio.run(async_download_images(images))
    else:
        sync_download_images(images)
    elapsed = time.perf_counter() - s
    print(f"{__file__} executed in {elapsed:0.2f} seconds.")