main.py
import re
import tweepy
import json
import os
import csv
import requests
import progressbar
import time
from bs4 import BeautifulSoup
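# Read credentials and run options from settings.json. Judging from the reads
# below, the file is expected to look roughly like this (placeholder values,
# not real credentials):
#
#   {
#     "keys": {
#       "access_token": "...", "access_secret": "...",
#       "consumer_key": "...", "consumer_secret": "..."
#     },
#     "format": ".csv",
#     "count_only": false,
#     "no_metadata": false,
#     "combine_files": true,
#     "all_urls": ["https://twitter.com/<owner>/lists/<list-name>"]
#   }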
config = json.loads(open("settings.json").read())
ACCESS_TOKEN = config["keys"]["access_token"]
ACCESS_SECRET = config["keys"]["access_secret"]
CONSUMER_KEY = config["keys"]["consumer_key"]
CONSUMER_SECRET = config["keys"]["consumer_secret"]
extension = config["format"]
only_count = config["count_only"]
metadata_toggle = config["no_metadata"]
combine_files = config["combine_files"]
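# Authenticate with the Twitter API; wait_on_rate_limit makes tweepy sleep
# through rate-limit windows instead of raising.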
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True)
widgets = [progressbar.Percentage(), progressbar.Bar()]
all_urls = config["all_urls"]
todo = 0
done = 0
all_files = []
if extension not in (".txt", ".csv"):
    print("Invalid format. Check the config file.")
    exit()
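# Scrape the public member count from each list page so the extraction
# progress bar has a meaningful maximum.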
def handle_counter(todo):
    todobar = progressbar.ProgressBar(widgets=widgets, max_value=len(all_urls)).start()
    for list_url in all_urls:
        try:
            response = requests.get(list_url, headers={"User-Agent": "Mozilla/5.0"})
            response.raise_for_status()
            soup = BeautifulSoup(response.text, features="html.parser")
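            # The member count appears on the list page inside
            # <ul class="stats"> as "<strong>N</strong> Members".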
            for el in soup.find_all("ul", attrs={"class": "stats"}):
                for i in el.find_all("li"):
                    txt = str(i)
                    if "Members" in txt:
                        comma = re.search("<strong>(.*)</strong>", txt)
                        comma = comma.group(1)
                        number = int(comma.replace(",", ""))
                        todo += number
        except Exception as e:
            print("Something is wrong with this list URL:")
            print(list_url + " -- " + str(e))
            continue
        todobar.update(todobar.value + 1)
    todobar.finish()
    print()
    print("Total handles to fetch: " + str(todo))
    print()
    return todo
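# Fetch every member of every configured list through the API, optionally
# scrape extra metadata from each member's bio URL, and write one results
# file per list.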
def api_and_scrape(done):
    errfile = open("errors.log", "w")
    donebar = progressbar.ProgressBar(widgets=widgets, max_value=todo).start()
    for list_url in all_urls:
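        # A list URL such as https://twitter.com/<owner>/lists/<name> splits on
        # "/" so that components[3] is the owner and components[5] is the slug.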
        components = re.split(r"; |, |\*|\n|\/|\?", list_url)
        list_name = components[5]
        list_owner = components[3]
        errfile.write(
            "Log for " + list_name + " :: " + list_owner + " @ " + list_url + "\n"
        )
        filename = "results/" + list_name + extension
        file = open(filename, "w")
        all_files.append(filename)
        if extension == ".csv":
            writer = csv.writer(file)
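        # Cursor pages through the full membership of the list; rate limits are
        # handled by wait_on_rate_limit above.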
        for member in tweepy.Cursor(api.list_members, list_owner, list_name).items():
            user = str("@" + member.screen_name)
            name = str(member.name)
            loc = str(member.location)
            bio = str(member.description)
            url = str(member.url)
            metadata = ""
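            # Unless disabled in settings, fetch the member's bio URL and keep
            # the content of its <meta name="description"> tag as metadata.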
            if not metadata_toggle:
                try:
                    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
                    response.raise_for_status()
                    soup = BeautifulSoup(response.text, features="html.parser")
                    metas = soup.find_all("meta")
                    metadata = [
                        meta.attrs["content"]
                        for meta in metas
                        if "name" in meta.attrs and meta.attrs["name"] == "description"
                    ][0]
                except Exception as e:
                    errfile.write(url + " -- " + str(e) + "\n")
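            # Fixed-width columns for .txt output: handle, name, location,
            # bio, URL, metadata.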
            if extension == ".txt":
                file.write(
                    "{:<15s}{:^50s}{:^50s}{:^150s}{:^50s}{:>150s}".format(
                        user, name, loc, bio, url, metadata
                    )
                )
                file.write("\n")
            elif extension == ".csv":
                row = [user, name, loc, bio, url, metadata]
                writer.writerow(row)
            done += 1
            donebar.update(donebar.value + 1)
        file.close()
    errfile.close()
    donebar.finish()
    print()
    print("Total extracted handles = " + str(done))
    print()
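# Concatenate the per-list result files into a single combined file in the
# results directory.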
def file_combiner():
    combinebar = progressbar.ProgressBar(widgets=widgets, max_value=len(all_files)).start()
    with open("results/combined_file" + extension, "w") as outfile:
        for fname in all_files:
            with open(fname) as infile:
                outfile.write(infile.read())
            combinebar.update(combinebar.value + 1)
    combinebar.finish()
##### Execution of program #####
print()
print("Calculating total handles to extract...")
print()
todo = handle_counter(todo)
if only_count:
    print("Count mode only")
    print("Fin.")
    exit()
print("Extracting now")
os.makedirs("results", exist_ok=True)
print()
print("Progress:")
api_and_scrape(done)
if not only_count:
    if combine_files:
        print("Combining into 1 file")
        print()
        file_combiner()
print()
print("Finised.")
print("Check 'errors.log' for details")