forked from ankushagrawal94/TheHackerNewsBump
-
Notifications
You must be signed in to change notification settings - Fork 0
/
getRelevantGHEventsAvgCase.py
131 lines (114 loc) · 3.94 KB
/
getRelevantGHEventsAvgCase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Copies the relevant GitHub events from event_table_two into event_table_general_condensed, where "relevant" means within 1 week (7 days either side) of a repository's expected_hn_mention_date.
import MySQLdb
import time
import re
import datetime
# Wall-clock reference; the main loop uses it for its elapsed-time and
# remaining-time progress output.
start_time = time.time()

# Connection settings for the MySQL server this script talks to.
DB_SETTINGS = {
    "host": "localhost",   # your host, usually localhost
    "user": "root",        # your username
    "passwd": "password",  # your password
    "db": "githubDB",      # name of the data base
}
db = MySQLdb.connect(**DB_SETTINGS)
cur = db.cursor()
cur.execute("USE githubDB")

# Destination table definition, kept for reference only: the DROP/CREATE
# statements are disabled, so the table must already exist.
# cur.execute("DROP TABLE IF EXISTS event_table_general_condensed")
# cur.execute("""
#     CREATE TABLE event_table_general_condensed
#     (
#         repo_name VARCHAR(255),
#         stars INT(6),
#         event_time DATE,
#         repo_created DATE
#     )
# """)
db.commit()
# Single-argument print(...) behaves the same as the print statement under
# Python 2. The message text is unchanged from the original script.
print("succesfully created the DB")
# Compute avg_days_after: the mean of the per-row average stored in column 2
# of days_after, taken over the rows whose data-point count (column 4) is at
# least MIN_DATA_POINTS.
# NOTE(review): column positions come from SELECT *; the original script
# labelled columns 0, 1 and 3 as stars, HN points and mode days-after --
# confirm against the table schema.
MIN_DATA_POINTS = 10

cur.execute("SELECT * FROM days_after")
days_after = cur.fetchall()

avg_days_after = 0
days_used = 0
for day in days_after:
    # Rows backed by too few data points are left out of the average.
    if day[4] < MIN_DATA_POINTS:
        continue
    avg_days_after += day[2]
    days_used += 1

# Without this guard an empty or all-sparse days_after table ends in a bare
# ZeroDivisionError; stop with an explanation instead.
if days_used == 0:
    raise SystemExit(
        "No days_after row has at least %s data points; "
        "cannot compute the average number of days after." % MIN_DATA_POINTS
    )

# NOTE(review): under Python 2 this is integer (floor) division when the
# summed column is integer-typed -- confirm whether truncation is intended.
avg_days_after /= days_used
print("The average number of days after is: %s" % avg_days_after)
# For every repository row in latest_repo_events with more than 5 stars,
# copy its events from event_table_two into event_table_general_condensed
# when they fall within WINDOW_DAYS either side of the expected HN mention
# date (repo_created + avg_days_after).
#
# Ctrl-C pauses the run: ENTER resumes with a fresh connection (the
# interrupted repository is not retried, the loop moves on to the next one),
# typing "quit" or pressing Ctrl-C again stops the run.
RESUME_FROM_EVENT = 176602  # rows before this 1-based position are skipped
SKIP_EVERY = 50             # every 50th row is skipped as well
WINDOW_DAYS = 7             # days before and after the expected mention date

# Get a list of all repositories with their respective creation dates and
# current number of stars.
cur.execute("SELECT * FROM latest_repo_events WHERE stars > 5")
global_event_list = cur.fetchall()
total_events = len(global_event_list)
prev_row = ''
print("pausing now permitted by typing CTRL-C")

# The cleanup wraps the whole loop so the cursor and connection are closed
# exactly once, after the last repository (or on quit / unexpected error),
# instead of from inside the loop body.
try:
    for event_count, event in enumerate(global_event_list, 1):
        try:
            repo_name = event[0]
            repo_created = event[3]

            # NOTE(review): the original labelled these two checks "random
            # sample"; what they do is skip the first rows and every 50th
            # row. The former "% 100" check was unreachable (every multiple
            # of 100 is already a multiple of 50) and has been removed.
            if event_count < RESUME_FROM_EVENT:
                continue
            if event_count % SKIP_EVERY == 0:
                continue

            # Skip a repository that repeats the previously processed one.
            if repo_name == prev_row:
                continue
            prev_row = repo_name

            expected_hn_mention_date = repo_created + datetime.timedelta(days=avg_days_after)
            # Window of WINDOW_DAYS before and after, both ends inclusive.
            start_date = expected_hn_mention_date - datetime.timedelta(days=WINDOW_DAYS)
            end_date = expected_hn_mention_date + datetime.timedelta(days=WINDOW_DAYS)

            # Values are passed as driver parameters (as the INSERT below
            # already does) so a quote in repo_name cannot break or alter
            # the statement.
            cur.execute(
                "SELECT * FROM event_table_two "
                "WHERE repo_name = %s AND event_time BETWEEN %s AND %s",
                (repo_name, start_date, end_date),
            )

            prev_entry = ''
            for row in cur.fetchall():
                # ET refers to event_table
                ET_repo_name = row[0]
                ET_stars = row[1]
                ET_event_time = row[2]
                ET_repo_created = row[3]
                # Consecutive rows with the same event_time are stored once.
                if prev_entry == ET_event_time:
                    print("duplicate event_time")
                    continue
                prev_entry = ET_event_time
                cur.execute(
                    "INSERT INTO event_table_general_condensed "
                    "(repo_name, stars, event_time, repo_created) "
                    "VALUES (%s, %s, %s, %s)",
                    (ET_repo_name, ET_stars, ET_event_time, ET_repo_created),
                )
            # One commit per repository, after all of its rows are inserted.
            db.commit()

            elapsed = time.time() - start_time
            print("completed event #%s of %s" % (event_count, total_events))
            print("time elapsed is: %s seconds" % int(elapsed))
            # event_count and total_events are both >= 1 inside the loop, so
            # the fraction is never zero and the division cannot fail.
            fraction_done = float(event_count) / total_events
            print("Expected time until completion: %s seconds"
                  % (int(elapsed / fraction_done) - int(elapsed)))
        except KeyboardInterrupt:
            print('\nPausing... (Hit ENTER to continue, type quit to exit.)')
            try:
                response = raw_input()  # Python 2 built-in
                if response == 'quit':
                    break
                print('Resuming...')
                # Reconnect so the run continues on a fresh connection.
                db.close()
                db = MySQLdb.connect(host="localhost",  # your host, usually localhost
                                     user="root",  # your username
                                     passwd="password",  # your password
                                     db="githubDB")  # name of the data base
                cur = db.cursor()
            except KeyboardInterrupt:
                print('Quitting...')
                break
finally:
    cur.close()
    db.close()

print("\n\nEvent collection complete.")