-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvk_scraper.py
144 lines (123 loc) · 6.04 KB
/
vk_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import asyncio
import aiohttp
import time
from db_api.commands import add_group_user_from_json, count_group_users, get_random_token, count_tokens
from loguru import logger
from db_api.config import bot
async def generate_data_idsgroup(start_num):
    """Take the first id of a range and return the code for the request.

    Starting at ``start_num``, the ids are split into 20 consecutive chunks
    of 500 ids each. Every chunk becomes one ``API.groups.getById`` call
    inside the returned ``return [...];`` statement.
    """
    chunk_size = 500
    calls = []
    for chunk_index in range(20):
        first_id = start_num + chunk_index * chunk_size
        # Comma-separated ids first_id .. first_id + chunk_size - 1.
        id_list = ','.join(map(str, range(first_id, first_id + chunk_size)))
        calls.append(
            'API.groups.getById({{group_ids: "{ids}", fields: "id,name,contacts", v: "5.131"}}),'.format(
                ids=id_list)
        )
    # Every call keeps its trailing comma, including the one right before "]".
    return 'return [' + ''.join(calls) + '];'
async def send_telegram_notification(count, i, error_list, token_count):
    """Send a best-effort VK group parsing status report to Telegram.

    Args:
        count: value shown under the "groups in DB" heading.
        i: value shown as collected so far, displayed against 220000000.
        error_list: value shown after the "failed to collect" label.
        token_count: value shown as the number of tokens left.

    A delivery failure is logged and otherwise ignored, so a Telegram
    problem never interrupts the caller.
    """
    text = (
        "Статус парсинга групп VK:\n\n"
        "Всего собранно:\n"
        f"<code>{i}</code> / 220000000\n\n"
        "Групп в БД:\n"
        # Blank line added here: the count and the token line used to run
        # together on one line of the message.
        f"<code>{count}</code>\n\n"
        f"Токенов осталось <code>{token_count}</code>\n\n"
        f"Не удалось собрать: {error_list}"
    )
    try:
        await bot.send_message(chat_id=-935547037, text=text)
    except Exception as exc:
        # Still best-effort, but no longer silent; ``Exception`` (unlike a
        # bare ``except``) lets task cancellation propagate.
        logger.warning(f'Failed to send Telegram notification: {exc!r}')
async def fetch(session, url, data, delay=30, timeout=100):
    """POST ``data`` to ``url`` after a pause and return the decoded JSON.

    Args:
        session: aiohttp client session used to send the request.
        url: target URL.
        data: payload passed as ``data=`` to ``session.post``.
        delay: seconds to sleep before sending the request. Defaults to 30,
            the value that used to be hard-coded.
        timeout: total request timeout in seconds. Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        The result of ``response.json()``.
    """
    await asyncio.sleep(delay)
    # Pass an explicit ClientTimeout instead of a bare number.
    request_timeout = aiohttp.ClientTimeout(total=timeout)
    async with session.post(url, data=data, timeout=request_timeout) as response:
        return await response.json()
async def get_groups_async(start_num, request_count):
    """Fetch ``request_count`` id ranges concurrently and store the results.

    NOTE: a later definition of ``get_groups_async`` in this file rebinds
    the name, so this variant is not the one the script entry point runs.

    Each request covers 10000 ids starting at the current position and is
    sent to the VK ``execute`` method with a token from
    ``get_random_token``. At most 10 requests are in flight at a time.
    Every response is stored via ``add_group_user_from_json`` together with
    the token that was used for that request. Whatever those calls return
    is accumulated, de-duplicated, sorted and written to
    ``ended_with_err.txt``.

    Args:
        start_num: first group id of the first range.
        request_count: number of requests to send.
    """
    iterator = start_num
    ended_with_err = []
    semaphore = asyncio.Semaphore(10)
    url = "https://api.vk.com/method/execute"

    async def bounded_fetch(session, params):
        # The semaphore must be held for the whole request. Acquiring it only
        # around ``asyncio.create_task`` releases it immediately and does not
        # limit how many requests run at once.
        async with semaphore:
            return await fetch(session, url, data=params)

    async with aiohttp.ClientSession() as session:
        # (task, iterator value after this request's range, token it used)
        pending = []
        for i in range(request_count):  # how many requests to send
            token = await get_random_token()
            code = await generate_data_idsgroup(iterator)
            logger.info(f'Processing a request with groups: {iterator}')
            params = {
                'code': code,
                'fields': "id, name,contacts",
                'access_token': token,
                'v': '5.131'
            }
            task = asyncio.create_task(bounded_fetch(session, params))
            iterator += 10000
            pending.append((task, iterator, token))
            if (i + 1) % 50 == 0:
                count = await count_group_users()
                token_count = await count_tokens()
                await send_telegram_notification(
                    count=count, i=iterator, error_list=ended_with_err, token_count=token_count)

        responses = await asyncio.gather(*(task for task, _, _ in pending))
        for response_data, (_, range_end, request_token) in zip(responses, pending):
            # Each response goes with its own token and range boundary rather
            # than the values left over from the last loop iteration. The
            # post-increment boundary matches what the later definition in
            # this file passes.
            ended_with_err += await add_group_user_from_json(
                response_data, range_end, token=request_token)
            count = await count_group_users()
            logger.info(f'Total number of records in group_user table: {count}')
        logger.info(f'Next start_num is: {iterator}')

    with open('ended_with_err.txt', 'w') as f:
        f.writelines("%s\n" % item for item in sorted(set(ended_with_err)))
async def get_groups_async(start_num, request_count):
    """Fetch ``request_count`` id ranges one request at a time and store them.

    For every request: pick a token with ``get_random_token``, build the
    code for the next 10000 ids, POST it to the VK ``execute`` method via
    ``fetch`` and hand the response to ``add_group_user_from_json``.
    Whatever that call returns is accumulated and finally written,
    de-duplicated and sorted, to ``ended_with_err.txt``. After every 50th
    request a status report is sent with ``send_telegram_notification``.

    Requests are awaited one by one. The previous task list was flushed as
    soon as it held a single task, so there was never more than one request
    in flight and the "remaining tasks" pass after the loop could not run.

    Args:
        start_num: first group id of the first range.
        request_count: number of requests to send.
    """
    iterator = start_num
    ended_with_err = []
    url = "https://api.vk.com/method/execute"

    async with aiohttp.ClientSession() as session:
        for i in range(request_count):  # how many requests to send
            token = await get_random_token()
            code = await generate_data_idsgroup(iterator)
            logger.info(f'Processing a request with groups: {iterator}')
            params = {
                'code': code,
                'fields': "id, name,contacts",
                'access_token': token,
                'v': '5.131'
            }
            response_data = await fetch(session, url, data=params)

            # The position is advanced before the response is stored, so the
            # storage call receives the start of the *next* range.
            iterator += 10000
            ended_with_err += await add_group_user_from_json(response_data, iterator, token=token)
            count = await count_group_users()
            if (i + 1) % 50 == 0:
                token_count = await count_tokens()
                await send_telegram_notification(
                    count=count, i=iterator, error_list=ended_with_err, token_count=token_count)
            logger.info(f'Total number of records in group_user table: {count}')
        logger.info(f'Next start_num is: {iterator}')

    with open('ended_with_err.txt', 'w') as f:
        f.writelines("%s\n" % item for item in sorted(set(ended_with_err)))
if __name__ == '__main__':
    # Run one scraping pass and report how long it took (wall-clock seconds).
    started_at = time.time()
    # NOTE(review): an asyncio.run() variant of this call was previously left
    # commented out here; confirm that nothing imported at module level is
    # tied to the default event loop before switching to it.
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(
        get_groups_async(start_num=5112500, request_count=2)
    )
    finished_at = time.time()
    print(f"Время выполнения функции: {finished_at - started_at} секунд")