# api_request.py
import json
import os

import pandas as pd
import requests


# def FloodTagsAPI_estimated_time_for_request(rq_until="2016-09-06T17:00:00.000Z", rq_since="2016-09-04T17:00:00.000Z"):
#     data = FloodTagsAPI_refined_tweets(rq_limit=1, rq_until=rq_until, rq_since=rq_since)
#     total_tweets_in_time_period = data['meta']['total']
#     return total_tweets_in_time_period


def FloodTagsAPI_refined_tweets(
        rq_skip=0,
        rq_limit=100,
        rq_database="indonesia",
        rq_until="2016-09-06T17:00:00.000Z",
        rq_since="2016-09-04T17:00:00.000Z",
        rq_hasWaterdepth="false",
        rq_hasLocations="false"
):
    """Fetch all tags for the given database and time window, save them to
    temp/tweets_dataframe.csv, and return them as a pandas DataFrame."""

    def get_request(
            rq_skip=rq_skip,          # the number of tags to skip from the beginning
            rq_limit=rq_limit,        # the number of tags to return per request
            rq_database=rq_database,
            rq_until=rq_until,
            rq_since=rq_since,
            rq_hasWaterdepth=rq_hasWaterdepth,
            rq_hasLocations=rq_hasLocations
    ):
        # Build the query URL by hand and return the decoded JSON response.
        rq_base_url_start = "https://api.floodtags.com/v1/tags/"
        rq_base_url_end = "/index"
        rq_url = rq_base_url_start \
            + rq_database \
            + rq_base_url_end \
            + "?until=" + rq_until \
            + "&since=" + rq_since \
            + "&hasWaterdepth=" + rq_hasWaterdepth \
            + "&hasLocations=" + rq_hasLocations \
            + "&skip=" + str(rq_skip) \
            + "&limit=" + str(rq_limit)
        r = requests.get(rq_url)
        r_json = json.loads(r.text)
        return r_json
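
    # Sketch of an equivalent request that lets requests build the query string
    # (an alternative to the manual concatenation above; it assumes the API accepts
    # the same parameter names, which is all the sketch relies on):
    #
    # def get_request_alt():
    #     params = {
    #         "until": rq_until,
    #         "since": rq_since,
    #         "hasWaterdepth": rq_hasWaterdepth,
    #         "hasLocations": rq_hasLocations,
    #         "skip": rq_skip,
    #         "limit": rq_limit,
    #     }
    #     r = requests.get("https://api.floodtags.com/v1/tags/" + rq_database + "/index",
    #                      params=params)
    #     return r.json()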
    # TODO: rename the "data" parameter; it means different things below
    # (the full API response for request_total, the list of tags for the list helpers).
    def request_total(data):
        total_tweets_in_time_period = data['meta']['total']
        return total_tweets_in_time_period

    def user_list(data):
        list_of_users = []
        for tag in data:
            list_of_users.append(tag['source']['username'])
        return list_of_users

    def tweet_list(data):
        list_of_tweets = []
        for tag in data:
            list_of_tweets.append(tag['text'])
        return list_of_tweets

    def date_list(data):
        list_of_dates = []
        for tag in data:
            list_of_dates.append(tag['date'])
        return list_of_dates
    # Get the total number of tweets in the specified period.
    initial_query = get_request(rq_limit=1)
    total_number_tweets = request_total(initial_query)

    # Page through all tags in the period, 100 at a time.
    # TODO: collect all the information from each request (not just the user list)
    # and keep it in memory, to minimise the number of requests made.
    page_size = 100
    total_cycles = (total_number_tweets // page_size) + 1
    total_user_list = []
    total_tweet_list = []
    total_dates_list = []
    skip = 0
    for i in range(total_cycles):
        r_json = get_request(rq_skip=skip, rq_limit=page_size)
        tweets = r_json['tags']
        total_user_list = total_user_list + user_list(tweets)
        total_tweet_list = total_tweet_list + tweet_list(tweets)
        total_dates_list = total_dates_list + date_list(tweets)
        skip += page_size
        # time.sleep(1)

    # Convert and merge the returned lists into a single pandas DataFrame.
    pd_tweets = pd.DataFrame()
    pd_tweets['username'] = total_user_list
    pd_tweets['tweet'] = total_tweet_list
    pd_tweets['date'] = total_dates_list
    os.makedirs('temp', exist_ok=True)
    pd_tweets.to_csv('temp/tweets_dataframe.csv')
    return pd_tweets
#print(FloodTagsAPI_refined_tweets())
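
# Minimal usage sketch (assumes the FloodTags API above is reachable and that the
# working directory is writable; the date window is just the function's default):
if __name__ == "__main__":
    df = FloodTagsAPI_refined_tweets(
        rq_database="indonesia",
        rq_since="2016-09-04T17:00:00.000Z",
        rq_until="2016-09-06T17:00:00.000Z",
    )
    print(df.head())
    print("tweets collected:", len(df))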
# structure of a returned tweet (tag) is:
# {'retweet': False,
#  'date': '2016-09-04T16:10:49.000Z',
#  'keywords': ['banjir'],
#  'id': 't-772466960437170178',
#  'source': {
#      'id': '772466960437170178',
#      'username': 'bewidha',
#      'userId': '69587534'},
#  'text': "setelah 'Hujan di Bulan Juni', perlu ada novel berjudul 'Banjir di Bulan Agustus'",
#  'waterDepth': -1,
#  'classes': [],
#  'photos': [],
#  'locations': [],
#  'urls': [],
#  'labels': []
# }
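
# Sketch of a helper that pulls more of the documented fields out of a single tag,
# a possible step toward the TODO above about keeping everything from each request.
# The field names follow the example structure shown here, nothing beyond it:
def tag_to_record(tag):
    # Flatten one tag dict into a plain record suitable for a DataFrame row.
    return {
        'id': tag.get('id'),
        'date': tag.get('date'),
        'username': tag.get('source', {}).get('username'),
        'text': tag.get('text'),
        'retweet': tag.get('retweet'),
        'keywords': tag.get('keywords', []),
        'waterDepth': tag.get('waterDepth'),
        'locations': tag.get('locations', []),
    }

# Example: pd.DataFrame([tag_to_record(t) for t in r_json['tags']])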