I have a task to download 200M images from a provider's image server as quickly as possible. We generally use the requests library for HTTP communication. To speed things up, I tried Python's concurrent.futures package to run the downloads in multiple threads, but Python threads are not truly parallel because of the Global Interpreter Lock (GIL), so it did not help that much for this I/O-heavy operation.
The asyncio package really fits this requirement: it uses an event loop to switch between coroutines while they wait on I/O, which speeds things up. asyncio was added in Python 3.4, and Python 3.5 introduced two new keywords, async and await, to work with it.
Here is a simple code sample that shows the difference: to download 20 images, the async version takes 5 seconds, while the sync version takes 31 seconds, which is about 6 times slower than the async version.
import asyncio
import aiohttp
import requests
async def async_download_image(image):
    """Download one image over HTTP and save it under /tmp/async/.

    Args:
        image: Path of the image relative to the image server root. The part
            after the last "/" is used as the local file name.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status, so that an error body is never saved as an image.
    """
    file = image.split("/")[-1]
    async with aiohttp.ClientSession() as session:
        async with session.get(f"http://images.ipsensus.com:8080/{image}") as res:
            # Fail loudly on HTTP errors instead of writing the error body to disk.
            res.raise_for_status()
            # Read the whole body before touching the file system so that a
            # failed transfer does not leave an empty file behind.
            payload = await res.read()
    with open(f"/tmp/async/{file}", "wb") as f:
        f.write(payload)
    # Report success only after the bytes have actually been written.
    print(f"downloaded {file}")
async def async_download_images(images):
    """Download all given images concurrently on the running event loop.

    One task is created per image so the downloads overlap while each of them
    waits on the network; the coroutine returns once every task has finished.

    Args:
        images: Iterable of image paths, each passed to
            ``async_download_image``.
    """
    tasks = [asyncio.create_task(async_download_image(image)) for image in images]
    # gather() waits for all tasks and re-raises the first exception, if any.
    await asyncio.gather(*tasks)
def sync_download_images(images):
    """Download the given images one after another and save them under /tmp/sync/.

    Args:
        images: Iterable of image paths relative to the image server root. The
            part after the last "/" of each path is used as the local file name.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error body is never saved as an image.
    """
    for image in images:
        r = requests.get(f"http://images.ipsensus.com:8080/{image}")
        # Fail loudly on HTTP errors instead of writing the error body to disk.
        r.raise_for_status()
        file = image.split("/")[-1]
        with open(f"/tmp/sync/{file}", "wb") as f:
            f.write(r.content)
        # Report success only after the bytes have actually been written.
        print(f"downloaded {file}")
if __name__ == "__main__":
    # Small benchmark: time how long it takes to download the 20 sample images.
    import time

    images = [
        '003-007-001-12752/7f9695b3-6eaf-4d68-b2e3-8e00aafb7d73.png',
        '003-007-001-12753/899b781b-7584-4367-baf6-5a0224d70960.png',
        '003-007-001-12711/66553f65-f321-4a23-b747-9d7345717d35.png',
        '003-007-001-12711/b6399c73-f6b4-4911-a9d3-3fd17a5a8185.png',
        '003-007-001-12711/965db348-524a-4ac3-af7b-7048dde599d1.png',
        '003-007-001-12750/073e4d64-15c5-4f20-95f8-e7d29035277d.png',
        '003-007-001-12749/070e467b-c013-4e44-8e84-2b5645a803a3.png',
        '003-007-001-12749/9fc928d9-bd31-4fcf-b092-357392033eed.png',
        '003-007-001-12749/83570dbe-1bfa-4597-8500-0419cacfcd6b.png',
        '003-007-001-12751/20917d0e-d177-4b8e-ac57-407336cef4bf.png',
        '003-007-001-12747/bb7ed0ed-4783-40a2-87df-96338c4f9962.png',
        '003-007-001-12714/78b5126a-d374-4818-91db-acdc8e984c76.png',
        '003-007-001-12710/7307f5af-dab9-4cd8-bf91-94d39d394206.png',
        '003-007-001-12714/2e067bf9-ac31-4fff-9705-dd630bdd73a8.png',
        '003-007-001-12711/377d2fe3-cd2d-4c05-a06b-3c919f72682b.png',
        '003-007-001-12715/60de6045-2d0b-44a4-8a52-2281a6cdce4b.png',
        '003-007-001-12716/a798f5e0-4e46-4193-9c19-641c1e7d1858.png',
        '003-007-001-12759/ca3b678c-358c-4a68-a8dd-0b41d8cbc1ba.png',
        '003-007-001-12760/c8c4a7fd-01e6-4bb7-8d40-3e909a904777.png',
        '003-007-001-12760/d44dc080-6238-4c8e-991f-97126112c0d3.png',
    ]

    s = time.perf_counter()
    # Swap which of the next two lines is commented out to time the async
    # version instead of the sync one.
    # asyncio.run(async_download_images(images))
    sync_download_images(images)
    elapsed = time.perf_counter() - s
    print(f"{__file__} executed in {elapsed:0.2f} seconds.")