-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper_async.py
86 lines (63 loc) · 1.73 KB
/
scraper_async.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import asyncio
import aiohttp
import re
from datetime import datetime
import requests
import sys
'''
Synchronous requests
'''
def fetch_sync(count, url, print_titles):
    """Fetch `url` sequentially `count` times and print the elapsed time.

    Each request is a blocking `requests.get` call that finishes before the
    next one starts, so the reported time is the sum of all round trips.

    Args:
        count: Number of requests to issue.
        url: Address to request on every iteration.
        print_titles: When truthy, hand the downloaded bodies to
            `print_responses` after the timing line has been printed.
    """
    began = datetime.now()
    pages = [requests.get(url).text for _ in range(count)]
    elapsed = datetime.now() - began

    print('Sync time taken in (s): ', elapsed.total_seconds())

    if print_titles:
        print_responses(pages)
'''
Asynchronous requests
'''
async def fetch(url, session=None):
    """Download `url` asynchronously and return the response body as text.

    Args:
        url: Address to request.
        session: Optional, already-open client session to issue the request
            on. Passing one lets many calls share a single connection pool;
            it is left open for the caller to close. When omitted, a
            throwaway `aiohttp.ClientSession` is created for this one request
            and closed afterwards (the original behaviour).

    Returns:
        The decoded response body.
    """
    if session is not None:
        # Caller owns the session: use it, but never close it here.
        async with session.get(url) as response:
            return await response.text()

    async with aiohttp.ClientSession() as own_session:
        async with own_session.get(url) as response:
            return await response.text()
async def run(loop, r, url, print_titles):
    """Fetch `url` concurrently `r` times and print the elapsed time.

    All requests are scheduled up front as tasks and awaited together, so
    the reported time covers the whole batch rather than each request.

    Args:
        loop: Event loop supplied by the caller. It is not used inside this
            function; it is kept so existing call sites keep working.
        r: Number of concurrent requests to issue.
        url: Address each request targets.
        print_titles: When truthy, hand the downloaded bodies to
            `print_responses` after the timing line has been printed.
    """
    started = datetime.now()

    # Schedule every download before awaiting any of them.
    pending = [asyncio.ensure_future(fetch(url)) for _ in range(r)]
    pages = await asyncio.gather(*pending)

    elapsed = datetime.now() - started
    print('Async time taken in (s): ', elapsed.total_seconds())

    if print_titles:
        print_responses(pages)
'''
Find <title> and print
'''
# Matches the first <title> element. Tolerates attributes on the tag, any
# letter case, and titles that span several lines; the non-greedy group stops
# at the first closing tag instead of the last one on the line.
_TITLE_RE = re.compile(
    r'<title\b[^>]*>(.*?)</title\s*>',
    re.IGNORECASE | re.DOTALL,
)


def print_responses(responses):
    """Print the <title> text of each HTML document in `responses`.

    For every document the first <title> element is located and its text is
    printed on one line, with runs of whitespace (including line breaks)
    collapsed to single spaces. Documents without a <title> element produce
    the line 'No title found.'.

    Args:
        responses: Iterable of HTML documents as strings.
    """
    for body in responses:
        match = _TITLE_RE.search(body)
        if match:
            # Collapse internal whitespace so a multi-line title stays on
            # one output line.
            print(' '.join(match.group(1).split()))
        else:
            print('No title found.')
'''
Run time comparison
'''
DEFAULT_URL = "https://en.wikipedia.org/wiki/Special:Random"
DEFAULT_NUM_LOOPS = 5


def main(argv=None):
    """Time the synchronous and asynchronous fetchers against the same URL.

    Usage: scraper_async.py [URL [NUM_REQUESTS]]

    Args:
        argv: Command-line arguments without the program name. Defaults to
            `sys.argv[1:]`. The first item overrides the URL, the second the
            number of requests per run.

    Raises:
        ValueError: If the request count argument is not an integer.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    url = args[0] if args else DEFAULT_URL
    num_loops = int(args[1]) if len(args) > 1 else DEFAULT_NUM_LOOPS
    print_titles = False

    fetch_sync(num_loops, url, print_titles)
    print('')

    # Create the loop explicitly: calling asyncio.get_event_loop() with no
    # running loop is deprecated in current Python releases. Closing it in
    # `finally` releases its resources even if a request fails.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(run(loop, num_loops, url, print_titles))
    finally:
        loop.close()


# Only hit the network when executed as a script, not when imported.
if __name__ == "__main__":
    main()