From f678f2d36e979b98c202b32cf9c85c64419254dc Mon Sep 17 00:00:00 2001 From: eddvrs Date: Sun, 18 Dec 2022 23:01:38 +0000 Subject: [PATCH 1/2] Pushshift updates (#1) * Update parameter names to match new API params * Update reading of total_available after changes to response json structure * Update checks on meta_data returned for batching requests * Update gitignore --- .gitignore | 3 ++- pmaw/PushshiftAPIBase.py | 32 ++++++++++++++++++++++---------- pmaw/Request.py | 28 ++++++++++++++++------------ pmaw/utils/slices.py | 12 ++++++------ 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index 22825df..5c402db 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,5 @@ pmaw.code-workspace pmaw.egg-info /**/__pycache__ -/**/cache \ No newline at end of file +/**/cache +.idea/ diff --git a/pmaw/PushshiftAPIBase.py b/pmaw/PushshiftAPIBase.py index 11ecb10..e9c7e45 100644 --- a/pmaw/PushshiftAPIBase.py +++ b/pmaw/PushshiftAPIBase.py @@ -77,9 +77,14 @@ def _get(self, url, payload={}): @property def shards_are_down(self): - shards = self.metadata_.get('shards') + try: + shards = self.metadata_['es'].get('_shards') + except KeyError: + return True + if shards is None: - return + return True + return shards['successful'] != shards['total'] def _multithread(self, check_total=False): @@ -146,13 +151,13 @@ def _futures_handler(self, futures, check_total): break # handle time slicing logic - if 'before' in payload and 'after' in payload: - before = payload['before'] - after = payload['after'] + if 'until' in payload and 'since' in payload: + until = payload['until'] + since = payload['since'] log.debug( - f"Time slice from {after} - {before} returned {len(data)} results") + f"Time slice from {since} - {until} returned {len(data)} results") total_results = self.resp_dict.get( - (after, before), 0) + (since, until), 0) log.debug( f'{total_results} total results for this time slice') # calculate remaining results @@ -171,10 +176,10 @@ def _futures_handler(self, futures, check_total): # Fix issue where Pushshift occasionally reports remaining results that it is # unable to return - len(data) == 0 when this happens if len(data) > 0: - before = data[-1]['created_utc'] + until = data[-1]['created_utc'] # generate payloads self.req.gen_slices( - url, payload, after, before, num) + url, payload, since, until, num) except HTTPError as exc: log.debug(f"Request Failed -- {exc}") @@ -249,7 +254,14 @@ def _search(self, # check to see how many results are remaining self.req.req_list.appendleft((url, self.req.payload)) self._multithread(check_total=True) - total_avail = self.metadata_.get('total_results', 0) + if len(self.metadata_) != 0: + try: + total_avail = self.metadata_['es']['hits']['total']['value'] + except KeyError: + log.info(f'Result(s) in Pushshift undetermined') + total_avail = 0 # ¯\_(ツ)_/¯ + else: + total_avail = 0 # ¯\_(ツ)_/¯ if self.req.limit is None: log.info(f'{total_avail} result(s) available in Pushshift') diff --git a/pmaw/Request.py b/pmaw/Request.py index 67f376b..76d7030 100644 --- a/pmaw/Request.py +++ b/pmaw/Request.py @@ -37,12 +37,12 @@ def __init__(self, payload, filter_fn, kind, max_results_per_request, max_ids_pe if filter_fn is not None and not callable(filter_fn): raise ValueError('filter_fn must be a callable function') - if safe_exit and self.payload.get('before', None) is None: + if safe_exit and self.payload.get('until', None) is None: # warn the user not to use safe_exit without setting before, # doing otherwise will make it impossible to resume without modifying # future query to use before value from first run before = int(dt.datetime.now().timestamp()) - payload['before'] = before + payload['until'] = before warnings.warn(f'Using safe_exit without setting before value is not recommended. Setting before to {before}') if self.praw is not None: @@ -186,21 +186,25 @@ def _add_nec_args(self, payload): payload['size'] = self.max_results_per_request if 'sort' not in payload: - payload['sort'] = 'desc' - elif payload.get('sort') != 'desc': + payload['sort'] = 'created_utc' + elif payload.get('sort') != 'created_utc': err_msg = "Support for non-default sort has not been implemented as it may cause unexpected results" raise NotImplementedError(err_msg) + if 'order' not in payload: + payload['order'] = 'asc' + if 'metadata' not in payload: payload['metadata'] = 'true' - if 'before' not in payload: - payload['before'] = int(dt.datetime.now().timestamp()) + if 'until' not in payload: + payload['until'] = int(dt.datetime.now().timestamp()) if 'filter' in payload: if not isinstance(payload['filter'], list): if isinstance(payload['filter'], str): payload['filter'] = [payload['filter']] else: payload['filter'] = list(payload['filter']) + # make sure that the created_utc field is returned if 'created_utc' not in payload['filter']: payload['filter'].append('created_utc') @@ -255,22 +259,22 @@ def gen_url_payloads(self, url, batch_size, search_window): self.req_list.extend(url_payloads) else: - if 'after' not in self.payload: + if 'since' not in self.payload: search_window = dt.timedelta(days=search_window) num = batch_size - before = self.payload['before'] + before = self.payload['until'] after = int((dt.datetime.fromtimestamp( before) - search_window).timestamp()) # set before to after for future time slices - self.payload['before'] = after + self.payload['until'] = after else: - before = self.payload['before'] - after = self.payload['after'] + before = self.payload['until'] + after = self.payload['since'] # set before to avoid repeated time slices when there are missed responses - self.payload['before'] = after + self.payload['until'] = after num = batch_size # generate payloads diff --git a/pmaw/utils/slices.py b/pmaw/utils/slices.py index a99de4d..f389107 100644 --- a/pmaw/utils/slices.py +++ b/pmaw/utils/slices.py @@ -3,13 +3,13 @@ log = logging.getLogger(__name__) -def timeslice(after, before, num): +def timeslice(since, until, num): log.debug( - f'Generating {num} slices between {after} and {before}') - return [int((before-after)*i/num + after) for i in range(num+1)] + f'Generating {num} slices between {since} and {until}') + return [int((until-since)*i/num + since) for i in range(num+1)] -def mapslice(payload, after, before): - payload['before'] = before - payload['after'] = after +def mapslice(payload, since, until): + payload['until'] = until + payload['since'] = since return payload From 3d9f0da2271570906d6dc3df67ab91ba4f5f1fb1 Mon Sep 17 00:00:00 2001 From: eddvrs Date: Sun, 18 Dec 2022 23:31:56 +0000 Subject: [PATCH 2/2] Pushshift updates (#2) * Update parameter names to match new API params * Update reading of total_available after changes to response json structure * Update checks on meta_data returned for batching requests * Update gitignore * Update documentation --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b5430ad..989de33 100644 --- a/README.md +++ b/README.md @@ -135,8 +135,8 @@ A user-defined function can be provided using the `filter_fn` parameter for eith ## Unsupported Parameters -- `sort='asc'` is unsupported as it can have unexpected results -- `before` and `after` only support epoch time (float or int) +- `order='asc'` is unsupported as it can have unexpected results +- `until` and `since` only support epoch time (float or int) - `aggs` are unsupported, as **PMAW** is intended to be used for collecting large numbers of submissions or comments. Use [PSAW](https://github.com/dmarx/psaw) for aggregation requests. ### Feature Requests @@ -180,7 +180,7 @@ A user-defined function can be provided using the `filter_fn` parameter for eith ### Keyword Arguments -- Unlike the Pushshift API, the `before` and `after` keyword arguments must be in epoch time +- Unlike the Pushshift API, the `until` and `since` keyword arguments must be in epoch time - `limit` is the number of submissions/comments to return. If set to `None` or if the set `limit` is higher than the number of available submissions/comments for the provided parameters then `limit` will be set to the amount available. - Other accepted parameters are covered in the Pushshift documentation for [submissions](https://github.com/pushshift/api#searching-submissions) and [comments](https://github.com/pushshift/api#searching-comments). @@ -287,7 +287,7 @@ reddit = praw.Reddit( ) api_praw = PushshiftAPI(praw=reddit) -comments = api_praw.search_comments(q="quantum", subreddit="science", limit=100, before=1629990795) +comments = api_praw.search_comments(q="quantum", subreddit="science", limit=100, until=1629990795) ``` ## Custom Filter @@ -339,11 +339,11 @@ If you expect that your query may be interrupted while its running, setting `saf from pmaw import PushshiftAPI api = PushshiftAPI() -posts = api.search_submissions(subreddit="science", limit=700000, before=1613234822, safe_exit=True) +posts = api.search_submissions(subreddit="science", limit=700000, until=1613234822, safe_exit=True) print(f'{len(posts)} posts retrieved from Pushshift') ``` -A `before` value is required to load previous responses / requests when using non-id search, as `before` is set to the current time when the `search` method is called, which would result in a different set of parameters then when you last ran the search despite all other parameters being the same. +A `until` value is required to load previous responses / requests when using non-id search, as `until` is set to the current time when the `search` method is called, which would result in a different set of parameters then when you last ran the search despite all other parameters being the same. ### Loading Cache with Key