scraper.py
import argparse
import datetime
import json
import os
import os.path
import sys

import praw
import requests as r
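
# Post fields that change over time; when --update is supplied, these are
# refreshed from the Reddit API after the pushshift download.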
FIELDS_TO_UPDATE = [
'locked',
'num_comments',
'num_crossposts',
'over_18',
'pinned',
'score',
'selftext',
'spoiler',
'stickied',
'subreddit_subscribers',
]
class DateAction(argparse.Action):
    """Parse a YYYY-MM-DD argument into a UTC datetime."""
    def __call__(self, parser, namespace, values, option_string=None):
        try:
            date = datetime.datetime.strptime(values, '%Y-%m-%d')
            setattr(namespace, self.dest, date.replace(tzinfo=datetime.timezone.utc))
        except ValueError:
            parser.error(f'{values} is not a valid YYYY-MM-DD date')
def main():
parser = argparse.ArgumentParser(description='Chronologically scrape reddit')
parser.add_argument('--sub', help='Subreddit to scrape (e.g. "me_irl")')
parser.add_argument('--sub_list', help='Path to a .txt file with one subreddit name per line to scrape')
parser.add_argument('--field_list', help='Path to a .txt file with one field per line to keep from posts. If not supplied, keep all fields.')
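    # A list file for either option is plain text, one entry per line, e.g.:
    #   me_irl
    #   AskReddit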
parser.add_argument('--start_date', help='Start date in YYYY-MM-DD format (starts at 00:00 UTC)', action=DateAction)
    parser.add_argument('--end_date', help='End date in YYYY-MM-DD format (ends at 23:59:59 UTC)', action=DateAction)
    parser.add_argument('--count', type=int, default=1000, help='Maximum number of posts to download for each subreddit')
parser.add_argument('--update', help='Update the data from the Reddit API after downloading it. Recommended for anything that requires up-to-date karma counts.', action='store_true')
args = parser.parse_args()
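
    # Example invocation (hypothetical subreddit name, shown for illustration):
    #   python scraper.py --sub me_irl --start_date 2020-01-01 \
    #       --end_date 2020-01-31 --count 2000 --update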
    if not args.sub_list and not args.sub:
        print('You must provide either --sub or --sub_list.')
        sys.exit(1)
"""
Things that are okay:
- Supplying a start_date, end_date, and count
- Supplying a start_date and an end_date
- Supplying a start_date and a count
- Supplying only a count
"""
if not ((args.start_date is not None and args.end_date is not None) or (args.start_date is not None and args.count is not None) or (args.count is not None)):
print('Invalid combination of --start_date, --end_date, and --count. Make sure these arguments make sense.')
exit(1)
if args.sub_list:
subreddits = read_lines_of_file(args.sub_list)
else:
subreddits = [args.sub]
    # If no field list is supplied, keep every field
    fields = read_lines_of_file(args.field_list) if args.field_list else None
for subreddit in subreddits:
scrape_subreddit(
subreddit,
args.update,
args.count,
fields,
            args.start_date if args.start_date else datetime.datetime(2000, 1, 1, tzinfo=datetime.timezone.utc),
            args.end_date if args.end_date else datetime.datetime.now(datetime.timezone.utc),
)
def read_lines_of_file(path):
    with open(path, 'r') as f:
        return [line.strip() for line in f if line.strip()]
def scrape_subreddit(subreddit, update, count, fields, start_date, end_date):
    print(f'Scraping {subreddit}')
    if update:
        reddit = login_to_reddit()
    after_epoch = int(start_date.timestamp())
    # Add 23 hours, 59 minutes, 59 seconds so posts made on end_date are included
    before_epoch = int((end_date + datetime.timedelta(days=1) - datetime.timedelta(seconds=1)).timestamp())
    # Posts are fetched newest-first, so the cursor starts at the upper bound
    current_epoch = before_epoch
    output = []
    while count is None or len(output) < count:
        payload = {
            'subreddit': subreddit,
            'sort': 'desc',
            'size': 500,
            'before': current_epoch,
            'after': after_epoch,
            'sort_type': 'created_utc',
        }
        posts = query_pushshift(payload)
        if len(posts) == 0:
            break
        ids = [f't3_{post["id"]}' for post in posts]
        # Pair each pushshift post with its live PRAW submission if we plan to
        # update fields (assumes every post still exists on Reddit)
        if update:
            zipped_posts = zip(posts, reddit.info(fullnames=ids))
else:
zipped_posts = zip(posts, [None] * len(posts))
for post, submission in zipped_posts:
# Update the time of the last post we downloaded
current_epoch = int(post['created_utc'])
# Lose the fields we don't care about
if fields:
post = keep_whitelisted_fields(post, fields)
# Query reddit for up-to-date info on fields we need to update
if update:
for field in FIELDS_TO_UPDATE:
if field in post:
post[field] = getattr(submission, field)
# Stick it on the end
output.append(post)
if len(output) % 500 == 0:
print(f'Downloaded {len(output)} posts so far')
    output_path = os.path.abspath(f'{subreddit}-{start_date.strftime("%Y-%m-%d")}-{end_date.strftime("%Y-%m-%d")}.json')
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(output, f, ensure_ascii=False)
print(f'Downloaded {len(output)} posts from {subreddit}.')
def keep_whitelisted_fields(post, whitelist):
    # Not every pushshift post carries every field, so skip any that are missing
    return {field: post[field] for field in whitelist if field in post}
def login_to_reddit():
    if not os.path.isfile('praw.ini'):
        print("Couldn't find praw.ini in this directory! Create one as described at: https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html")
        sys.exit(1)
reddit = praw.Reddit('DEFAULT', user_agent='python:reddit-scraper:v0.2.0 (by /u/notverycreative1)')
print('Logged in!')
return reddit
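
# A minimal praw.ini sketch (placeholder values, not real credentials):
#   [DEFAULT]
#   client_id=YOUR_CLIENT_ID
#   client_secret=YOUR_CLIENT_SECRET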
def query_pushshift(payload):
"""
Query the pushshift API.
:param payload: Payload of params to query with
:returns: list of post objects
"""
    response = r.get('https://api.pushshift.io/reddit/search/submission', params=payload)
    response.raise_for_status()
    return response.json()['data']
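
# Example call (assumes the pushshift endpoint is reachable; available fields
# vary by post):
#   posts = query_pushshift({'subreddit': 'me_irl', 'size': 5,
#                            'sort': 'desc', 'sort_type': 'created_utc'})
#   print(posts[0]['title'], posts[0]['created_utc'])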
if __name__ == '__main__':
main()