-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathreddit_grab
209 lines (194 loc) · 6.56 KB
/
reddit_grab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/bin/bash
# Download the comment history for a given user
# Provenance: https://github.com/rawiriblundell/scripts/blob/master/reddit_grab
# Pop this at the top so that it's available in the output of 'head'
usage() {
cat >&2 << EOF
Usage: reddit_grab [username]
Optional args: --start {date}, --finish {date}
Start must be older than finish e.g. '--start 25/12/2010 --finish 25/12/2016'
If no args are given, then yesterday's posts will be downloaded
EOF
}
# Ensure that we have our dependencies
for cmd in curl jq; do
if ! command -v "${cmd}" >/dev/null 2>&1; then
missing_cmd+=",${cmd}"
fi
done
command -v gdate >/dev/null 2>&1 && date() { gdate "${@}"; } # MacOS tweak
{ date --version 2>&1 | grep -q GNU; } || missing_cmd+=",GNU date"
if (( "${#missing_cmd}" > 0 )); then
printf -- '%s\n' "The following requirements were missing: ${missing_cmd/,/}" >&2
exit 1
fi
# Ensure that at least one arg is given
(( "${#}" == 0 )) && { usage; exit 1; }
# Get our user string
user="${1}"
# Validate that the user exists
usertest="$(curl -s "https://www.reddit.com/api/username_available.json?user=${user}")"
case "${usertest}" in
(false) :;;
(*'Too Many Requests'*)
printf -- '%s\n' "Too many requests made to Reddit's API. Wait a bit and try again." >&2
exit 1
;;
(*)
printf -- '%s\n' "'${user}' does not appear to exist on Reddit" >&2
exit 1
;;
esac
shift 1
# Things are coming along nicely... Let's set our base variables...
base_url="https://api.pushshift.io/reddit/comment/search"
base_url+="/?author=${user}&size=500&sort=desc&sort_type=created_utc"
base_dir="${HOME}/reddit/${user}/comments"
req_count=0
req_limit=50
# Set the defaults for these two vars
end_epoch=$(date --date="00:00:00" +%s)
start_epoch=$(date --date="00:00:00 yesterday" +%s)
cur_epoch="${EPOCHSECONDS:-$(date +%s)}"
# Reddit enabled comments on 12 December 2005.
reddit_epoch=1134298800
# Parse through any extra args
# A lot of code here to basically cover:
# --start [anything here] --finish [anything here]
# OR
# --finish [anything here] --start [anything here]
# This is to cater for different date formats which may have spaces e.g.
# --start 01 Feb 2015 --finish 2016/15/06 08:00
while (( "${#}" > 0 )); do
case "${1}" in
(--start)
shift 1
# We make sure that we exclude "--finish *"
case "${*}" in
(*--finish*)
while [[ "${1}" != "--finish" ]]; do
start_date+="${1} "
shift 1
(( "${#}" == 0 )) && exit 1 # prevent infinite loop
done
;;
(*)
start_date="${*}"
shift "${#}"
;;
esac
if ! start_epoch=$(date --date="${start_date}" +%s) >/dev/null 2>&1; then
printf -- '%s\n' "'${start_date}' does not appear to be a valid date format" >&2
exit 1
fi
;;
(--finish)
shift 1
# We make sure that we exclude "--start *"
case "${*}" in
(*--start*)
while [[ "${1}" != "--start" ]]; do
finish_date+="${1} "
shift 1
(( "${#}" == 0 )) && exit 1 # prevent infinite loop
done
;;
(*)
finish_date="${*}"
shift "${#}"
;;
esac
if ! end_epoch=$(date --date="${finish_date}" +%s) >/dev/null 2>&1; then
printf -- '%s\n' "'${finish_date}' does not appear to be a valid date format" >&2
exit 1
fi
;;
(*)
printf -- '%s\n' "DEBUG: ${1}"
exit 1
;;
esac
done
# Make sure no goofs are being goofs. Those goofs.
if (( start_epoch < reddit_epoch )); then
printf -- '%s\n' "Reddit has supported comment functionality since 12 December 2005." \
"${start_epoch} is a timestamp from before that date. Please try again." >&2
exit 1
fi
if (( start_epoch > cur_epoch ))||(( end_epoch > cur_epoch )); then
printf -- '%s\n' "START time and/or END time cannot be in the future." >&2
exit 1
fi
if (( start_epoch > end_epoch )); then
printf -- '%s\n' "START time cannot be newer than END time." >&2
exit 1
fi
# If we make it to this point, things are going great!
# Let's throw this function into the mix
validate_file() {
local test_epoch
test_epoch=$(jq -r 'last(.[] | .[].created_utc)' "${1:-No file specified}")
case "${test_epoch}" in
([1-9]*)
# Epoch seems legit, so no-op
:
;;
(null)
printf -- '%s\n' "Seems no comments were posted by ${user} that day, removing ${1}..."
rm "${1}"
;;
(*)
# If we have a request limit warning in our data
# Remove the file and loop up to the sleep 60 before continuing
# TO-DO: Auto-figure out and re-download the failed data
if grep -q "429 Too Many Requests" ./*; then
rm "$(grep -l "429 Too Many Requests" ./*)"
req_count="${req_limit}"
fi
;;
esac
}
mkdir -p "${base_dir}" || exit 1
# The logic in this subshell isn't 100% - it downloads either yesterday's
# comments or it downloads a whole bunch between two timestamps
# It doesn't necessarily download both... and my carefactor is honestly minimal...
(
cd "${base_dir}" || { printf -- '%s\n' "Could not enter ${base_dir}"; exit 1; }
# If used with default start and end e.g. cron'd task, we cater for that
# This download's yesterday's comments
if (( (end_epoch - 86400) == start_epoch )); then
date_stamp="$(date --date="@${start_epoch}" +%Y%m%d)"
out_file="${base_dir}/${date_stamp}.json"
if [[ -f "${out_file}" ]]; then
printf -- '%s\n' "Exists: ${out_file}"
else
printf -- '%s\n' "Downloading into ${out_file}. ($(date -d "@${start_epoch}" '+%a %d %b %Y'))"
curl -s "${base_url}&before=${end_epoch}&after=${start_epoch}" > "${out_file}"
fi
validate_file "${out_file}"
else
until (( (end_epoch - 86400) == start_epoch )); do
(( start_epoch > end_epoch )) && break
(( start_epoch > cur_epoch )) && break
if (( req_count >= req_limit )); then
printf -- '%s\n' "Snoozing for 60 seconds to keep the remote server happy..."
sleep 60
req_count=0
fi
temp_end_epoch=$(( start_epoch + 86400 ))
date_stamp="$(date --date="@${start_epoch}" +%Y%m%d)"
out_file="${base_dir}/${date_stamp}.json"
if [[ -f "${out_file}" ]]; then
printf -- '%s\n' "Exists: ${out_file}"
else
printf -- '%s\n' "Downloading into ${out_file}. ($(date -d "@${start_epoch}" '+%a %d %b %Y'))"
curl -s "${base_url}&before=${temp_end_epoch}&after=${start_epoch}" > "${out_file}"
sleep 1
validate_file "${out_file}"
(( req_count++ ))
fi
start_epoch=$(( start_epoch + 86400 ))
done
fi
)
exit 0