From 477676c7df1eb9525c1a73c8044c1110caef4350 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Thu, 6 May 2021 22:02:06 +0100 Subject: [PATCH 1/8] move time related functions to helpers --- twarc/client2.py | 28 ---------------------------- twarc/helpers.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 28 deletions(-) create mode 100644 twarc/helpers.py diff --git a/twarc/client2.py b/twarc/client2.py index 2ee2f29f..d2e43e37 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -740,34 +740,6 @@ def _ensure_user_id(self, user): else: raise ValueError(f"No such user {user}") -def _ts(dt): - """ - Return ISO 8601 / RFC 3339 datetime in UTC. If no timezone is specified it - is assumed to be in UTC. The Twitter API does not accept microseconds. - - Args: - dt (datetime): a `datetime` object to format. - - Returns: - str: an ISO 8601 / RFC 3339 datetime in UTC. - """ - if dt.tzinfo: - dt = dt.astimezone(datetime.timezone.utc) - else: - dt = dt.replace(tzinfo=datetime.timezone.utc) - return dt.isoformat(timespec='seconds') - -def _utcnow(): - """ - Return _now_ in ISO 8601 / RFC 3339 datetime in UTC. - - Returns: - datetime: Current timestamp in UTC. - """ - return datetime.datetime.now(datetime.timezone.utc).isoformat( - timespec='seconds' - ) - def _append_metadata(result, url): """ Appends `__twarc` metadata to the result. diff --git a/twarc/helpers.py b/twarc/helpers.py new file mode 100644 index 00000000..cfb259d0 --- /dev/null +++ b/twarc/helpers.py @@ -0,0 +1,31 @@ +""" +Useful functions for converting things into different types +""" + +def _ts(dt): + """ + Return ISO 8601 / RFC 3339 datetime in UTC. If no timezone is specified it + is assumed to be in UTC. The Twitter API does not accept microseconds. + + Args: + dt (datetime): a `datetime` object to format. + + Returns: + str: an ISO 8601 / RFC 3339 datetime in UTC. + """ + if dt.tzinfo: + dt = dt.astimezone(datetime.timezone.utc) + else: + dt = dt.replace(tzinfo=datetime.timezone.utc) + return dt.isoformat(timespec='seconds') + +def _utcnow(): + """ + Return _now_ in ISO 8601 / RFC 3339 datetime in UTC. + + Returns: + datetime: Current timestamp in UTC. 
+ """ + return datetime.datetime.now(datetime.timezone.utc).isoformat( + timespec='seconds' + ) From 05c6f27f5e28a629fc08cdf2c5d2a457f8f81e5d Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Thu, 6 May 2021 22:03:08 +0100 Subject: [PATCH 2/8] move time related functions to helpers --- twarc/client2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/twarc/client2.py b/twarc/client2.py index d2e43e37..ce36501c 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -19,6 +19,7 @@ from twarc import expansions from twarc.decorators import * +from twarc.helpers import * from twarc.version import version From 14c733681c90d02664a5cbd5adae97261db3f863 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Tue, 11 May 2021 00:06:22 +0100 Subject: [PATCH 3/8] add and move ts functions to helpers --- twarc/client2.py | 11 +++++------ twarc/helpers.py | 39 +++++++++++++++++++++++++++++++++------ twarc/version.py | 2 +- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index ce36501c..b7457a6b 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -9,7 +9,6 @@ import json import logging import requests -import datetime import time from oauthlib.oauth2 import BackendApplicationClient @@ -121,9 +120,9 @@ def _search( if until_id: params["until_id"] = until_id if start_time: - params["start_time"] = _ts(start_time) + params["start_time"] = ts(start_time) if end_time: - params["end_time"] = _ts(end_time) + params["end_time"] = ts(end_time) count = 0 made_call = time.monotonic() @@ -487,9 +486,9 @@ def _timeline( if until_id: params["until_id"] = until_id if start_time: - params["start_time"] = _ts(start_time) + params["start_time"] = ts(start_time) if end_time: - params["end_time"] = _ts(end_time) + params["end_time"] = ts(end_time) count = 0 for response in self.get_paginated(url, params=params): @@ -757,6 +756,6 @@ def _append_metadata(result, url): result["__twarc"] = { "url": url, "version": version, - "retrieved_at": _utcnow() + "retrieved_at": utcnow() } return result diff --git a/twarc/helpers.py b/twarc/helpers.py index cfb259d0..11f32098 100644 --- a/twarc/helpers.py +++ b/twarc/helpers.py @@ -1,8 +1,10 @@ """ Useful functions for converting things into different types """ +import datetime -def _ts(dt): + +def ts(dt): """ Return ISO 8601 / RFC 3339 datetime in UTC. If no timezone is specified it is assumed to be in UTC. The Twitter API does not accept microseconds. @@ -17,15 +19,40 @@ def _ts(dt): dt = dt.astimezone(datetime.timezone.utc) else: dt = dt.replace(tzinfo=datetime.timezone.utc) - return dt.isoformat(timespec='seconds') + return dt.isoformat(timespec="seconds") + -def _utcnow(): +def utcnow(): """ Return _now_ in ISO 8601 / RFC 3339 datetime in UTC. Returns: datetime: Current timestamp in UTC. """ - return datetime.datetime.now(datetime.timezone.utc).isoformat( - timespec='seconds' - ) + return datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds") + + +def _snowflake2millis(snowflake_id): + return (snowflake_id >> 22) + 1288834974657 + + +def _millis2snowflake(milliseconds): + return (int(milliseconds) - 1288834974657) << 22 + + +def _get_millis(ms): + return ms % 1000 + + +def _sample_windows(start_ts, end_ts, sample_type): + """ + todo: Generate tuples of start and end snowflake ids between two timestamps + + sample_type - type of random sample and millisecond range: + _1% "Spritzer" Sample [657-666] + 10% "Gardenhose" Sample [657-756] + 10% "Enterprise" Sample [*0*] + _1% v2 Sample [?] + _N% v2 Sample [?] 
+ """ + pass diff --git a/twarc/version.py b/twarc/version.py index 9febec5a..e0c5a628 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1 +1 @@ -version = '2.0.12' +version = '2.0.13' From f15692f051c242c856c830b03b94eba0ffb33adf Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Wed, 27 Oct 2021 11:47:07 +0100 Subject: [PATCH 4/8] Revert "merge main" This reverts commit c5b777aa6f74a80d6e7b0f6923ecfc9d1820f07f, reversing changes made to 730e98dab72172a4f148358393905b62d5e9d6cd. --- docs/README.md | 2 +- docs/api/client.md | 4 + docs/api/client2.md | 1 + docs/api/expansions.md | 4 - docs/plugins.md | 2 +- docs/twitter-developer-access.md | 2 - mkdocs.yml | 1 - setup.py | 2 +- test_twarc2.py | 59 ++----- twarc/__init__.py | 1 - twarc/__main__.py | 4 +- twarc/client2.py | 69 ++------ twarc/command2.py | 289 ++++++++----------------------- twarc/config.py | 16 -- twarc/expansions.py | 69 ++------ twarc/handshake.py | 7 +- twarc/version.py | 2 +- twarc2.py | 4 + utils/source.py | 7 +- utils/wall.py | 30 ++-- 20 files changed, 158 insertions(+), 417 deletions(-) delete mode 100644 docs/api/expansions.md delete mode 100644 twarc/config.py create mode 100644 twarc2.py diff --git a/docs/README.md b/docs/README.md index 4961bc65..6cec7483 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,7 +1,7 @@ twarc ===== -***For information about working with the Twitter V2 API please see the [twarc2](https://twarc-project.readthedocs.io/en/latest/twarc2/) page.*** +***For information about working with the Twitter V2 API please see the [twarc2](twarc2) page.*** --- diff --git a/docs/api/client.md b/docs/api/client.md index dacadbfb..4ccaac08 100644 --- a/docs/api/client.md +++ b/docs/api/client.md @@ -2,3 +2,7 @@ ::: twarc.client handler: python + + + + diff --git a/docs/api/client2.md b/docs/api/client2.md index f9dcbb2d..21506c7a 100644 --- a/docs/api/client2.md +++ b/docs/api/client2.md @@ -2,3 +2,4 @@ ::: twarc.client2 handler: python + diff --git a/docs/api/expansions.md b/docs/api/expansions.md deleted file mode 100644 index 1e6c763d..00000000 --- a/docs/api/expansions.md +++ /dev/null @@ -1,4 +0,0 @@ -# twarc.expansions - -::: twarc.expansions - handler: python diff --git a/docs/plugins.md b/docs/plugins.md index e7905d6f..1d037504 100644 --- a/docs/plugins.md +++ b/docs/plugins.md @@ -16,7 +16,7 @@ add it to this list): * [twarc-ids](https://pypi.org/project/twarc-ids/): extract tweet ids from tweets * [twarc-videos](https://pypi.org/project/twarc-videos): extract videos from tweets * [twarc-csv](https://pypi.org/project/twarc-csv/): export tweets to CSV -* [twarc-timeline-archive](https://pypi.org/project/twarc-timeline-archive): routinely download tweet timelines for a list of users +* [twarc-timelines](https://pypi.org/project/twarc-timelines): download tweet timelines for a list of users ## Writing a Plugin diff --git a/docs/twitter-developer-access.md b/docs/twitter-developer-access.md index 3f0e53fd..64a440dd 100644 --- a/docs/twitter-developer-access.md +++ b/docs/twitter-developer-access.md @@ -61,8 +61,6 @@ Now that you have your keys and tokens, you can start using the API. You may be Be careful not to commit your keys into a public repository or make them visible to the public - do not include them in a client side js script for example. Most apps will ask for API Key and Secret, but "Consumer Key" is "API Key" and "Consumer Secret" is "API Secret". 
-For Academic Access, there is only one endpoint that takes Bearer (App Only) authentication, so in most cases, the Bearer Token is all you need to share. - ## Step 5: Next Steps Install `twarc`, and run `twarc2 configure` to set it up. diff --git a/mkdocs.yml b/mkdocs.yml index 08662270..b5d0de77 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,7 +28,6 @@ nav: - Library API: - api/client.md - api/client2.md - - api/expansions.md plugins: - search diff --git a/setup.py b/setup.py index 64e0fb32..bc95d3f0 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ url='https://github.com/docnow/twarc', author='Ed Summers', author_email='ehs@pobox.com', - packages=['twarc'], + packages=['twarc', ], description='Archive tweets from the command line', long_description=long_description, long_description_content_type="text/markdown", diff --git a/test_twarc2.py b/test_twarc2.py index ca639cc4..9015b2a3 100644 --- a/test_twarc2.py +++ b/test_twarc2.py @@ -5,7 +5,6 @@ import dotenv import pytest import logging -import pathlib import datetime import threading @@ -16,7 +15,6 @@ access_token = os.environ.get('ACCESS_TOKEN') access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET') -test_data = pathlib.Path('test-data') logging.basicConfig(filename="test.log", level=logging.INFO) # Implicitly test the constructor in application auth mode. This ensures that @@ -294,7 +292,6 @@ def test_follows(): break assert found >= 1000 - def test_follows_username(): """ Test followers and and following by username. @@ -333,20 +330,16 @@ def test_flattened(): found_referenced_tweets = False event = threading.Event() - for count, response in enumerate(T.sample(event=event)): - - # streaming api always returns a tweet at a time but flatten - # will put these in a list so they can be treated uniformly - tweets = twarc.expansions.flatten(response) - assert len(tweets) == 1 - tweet = tweets[0] + for count, result in enumerate(T.sample(event=event)): + result = twarc.expansions.flatten(result) + tweet = result["data"] assert "id" in tweet logging.info("got sample tweet #%s %s", count, tweet["id"]) author_id = tweet["author_id"] assert "author" in tweet - assert tweet["author"]["id"] == author_id + assert result["data"]["author"]["id"] == author_id if "in_reply_to_user_id" in tweet: assert "in_reply_to_user" in tweet @@ -369,11 +362,8 @@ def test_flattened(): assert tweet["entities"]["mentions"][0]["username"] found_entities_mentions = True - # need to ensure there are no errors because a referenced tweet - # might be protected or deleted in which case it would not have been - # included in the response and would not have been flattened - if "errors" not in response and "referenced_tweets" in tweet: - assert tweet["referenced_tweets"][0]["text"] + if "referenced_tweets" in tweet: + assert tweet["referenced_tweets"][0]["id"] found_referenced_tweets = True if found_geo and found_in_reply_to_user and found_attachments_media \ @@ -393,33 +383,18 @@ def test_flattened(): assert found_referenced_tweets, "found referenced tweets" -def test_ensure_flattened(): - resp = next(T.search_recent('twitter')) - - # flatten a response - flat1 = twarc.expansions.ensure_flattened(resp) - assert isinstance(flat1, list) - assert len(flat1) > 1 - assert 'author' in flat1[0] - - # flatten the flattened list - flat2 = twarc.expansions.ensure_flattened(flat1) - assert isinstance(flat2, list) - assert len(flat2) == len(flat1) - assert 'author' in flat2[0] +def test_flatten_noop(): + """ + Flattening twice should be a no-op. 
+ """ + resp = next(T.tweet_lookup(range(1000, 2000))) - # flatten a tweet object which will force it into a list - flat3 = twarc.expansions.ensure_flattened(flat2[0]) - assert isinstance(flat3, list) - assert len(flat3) == 1 + flat1 = twarc.expansions.flatten(resp) + assert len(flat1) > 0 - with pytest.raises(ValueError): - twarc.expansions.ensure_flattened({'fake': 'tweet'}) - with pytest.raises(ValueError): - twarc.expansions.ensure_flattened([{'fake': 'tweet'}]) - with pytest.raises(ValueError): - flat1[0].pop('author') - twarc.expansions.ensure_flattened(flat1) + flat2 = twarc.expansions.flatten(flat1) + assert len(flat2) > 0 + assert len(flat1) == len(flat2) def test_twarc_metadata(): @@ -433,7 +408,7 @@ def test_twarc_metadata(): for response in T.tweet_lookup(range(1000, 2000)): assert "__twarc" in response - assert "__twarc" in twarc.expansions.flatten(response)[0] + assert "__twarc" in twarc.expansions.flatten(response) # Witout metadata T.metadata = False diff --git a/twarc/__init__.py b/twarc/__init__.py index e3773bc6..38e546e8 100644 --- a/twarc/__init__.py +++ b/twarc/__init__.py @@ -1,4 +1,3 @@ from .client import Twarc from .client2 import Twarc2 from .version import version -from .expansions import ensure_flattened diff --git a/twarc/__main__.py b/twarc/__main__.py index 85497b8c..53f093ef 100644 --- a/twarc/__main__.py +++ b/twarc/__main__.py @@ -1,5 +1,5 @@ -from twarc.command2 import twarc2 +import twarc.command if __name__ == "__main__": - twarc2(prog_name="python -m twarc2") + twarc.command.main() diff --git a/twarc/client2.py b/twarc/client2.py index 4b017ecb..b7457a6b 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -7,10 +7,9 @@ import re import ssl import json -import time import logging -import datetime import requests +import time from oauthlib.oauth2 import BackendApplicationClient from requests.exceptions import ConnectionError @@ -25,8 +24,6 @@ log = logging.getLogger("twarc") -TWITTER_EPOCH = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc) - class Twarc2: """ @@ -136,19 +133,17 @@ def _search( count += len(response['data']) yield response - else: - log.info(f'Retrieved an empty page of results.') - - # Calculate the amount of time to sleep, accounting for any - # processing time used by the rest of the application. - # This is to satisfy the 1 request / 1 second rate limit - # on the search/all endpoint. - time.sleep( - max(0, sleep_between - (time.monotonic() - made_call)) - ) - made_call = time.monotonic() + # Calculate the amount of time to sleep, accounting for any + # processing time used by the rest of the application. + # This is to satisfy the 1 request / 1 second rate limit + # on the search/all endpoint. - log.info(f'No more results for search {query}.') + time.sleep( + max(0, sleep_between - (time.monotonic() - made_call)) + ) + made_call = time.monotonic() + else: + log.info(f'no more results for search') def search_recent( self, query, since_id=None, until_id=None, start_time=None, @@ -211,13 +206,6 @@ def search_all( generator[dict]: a generator, dict for each paginated response. """ url = "https://api.twitter.com/2/tweets/search/all" - - # start time defaults to the beginning of Twitter to override the - # default of the last month. 
Only do this if start_time is not already - # specified and since_id isn't being used - if start_time is None and since_id is None: - start_time = TWITTER_EPOCH - return self._search( url, query, since_id, until_id, start_time, end_time, max_results, sleep_between=1.05 @@ -365,21 +353,6 @@ def sample(self, event=None, record_keepalive=False): data = _append_metadata(data, resp.url) yield data - # Check for an operational disconnect error in the response - if data.get("errors", []): - for error in data["errors"]: - if error.get("disconnect_type") == "OperationalDisconnect": - log.info( - "Received operational disconnect message: " - "This stream has fallen too far behind in " - "processing tweets. Some data may have been " - "lost." - ) - # Sleep briefly, then break this get call and - # attempt to reconnect. - time.sleep(5) - break - except requests.exceptions.HTTPError as e: errors += 1 log.error("caught http error %s on %s try", e, errors) @@ -524,9 +497,7 @@ def _timeline( count += len(response['data']) yield response else: - log.info(f'Retrieved an empty page of results for timeline {user_id}') - - log.info(f'No more results for timeline {user_id}.') + log.info(f'no more results for timeline') def timeline( self, user, since_id=None, until_id=None, start_time=None, @@ -735,15 +706,13 @@ def connect(self): self.client.close() if self.auth_type == "application" and self.bearer_token: - log.info('creating HTTP session headers for app auth.') - auth = f"Bearer {self.bearer_token}" - log.debug('authorization: %s', auth) + log.info('Creating HTTP session headers for app auth.') self.client = requests.Session() - self.client.headers.update({"Authorization": auth}) + self.client.headers.update( + {"Authorization": f"Bearer {self.bearer_token}"} + ) elif self.auth_type == "application": - log.info('creating app auth client via OAuth2') - log.debug('client_id: %s', self.consumer_key) - log.debug('client_secret: %s', self.consumer_secret) + log.info('Creating app auth client via OAuth2') client = BackendApplicationClient(client_id=self.consumer_key) self.client = OAuth2Session(client=client) self.client.fetch_token( @@ -753,10 +722,6 @@ def connect(self): ) else: log.info('creating user auth client') - log.debug('client_id: %s', self.consumer_key) - log.debug('client_secret: %s', self.consumer_secret) - log.debug('resource_owner_key: %s', self.access_token) - log.debug('resource_owner_secret: %s', self.access_token_secret) self.client = OAuth1Session( client_key=self.consumer_key, client_secret=self.consumer_secret, diff --git a/twarc/command2.py b/twarc/command2.py index fdb58c50..b1de6f92 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -19,12 +19,10 @@ from twarc.version import version from twarc.handshake import handshake -from twarc.config import ConfigProvider from twarc.decorators import cli_api_error -from twarc.expansions import ensure_flattened +from twarc.expansions import flatten as flat from click_config_file import configuration_option -config_provider = ConfigProvider() @with_plugins(iter_entry_points('twarc.plugins')) @click.group() @@ -44,26 +42,23 @@ show_default=True, ) @click.option('--log', default='twarc.log') -@click.option('--verbose', is_flag=True, default=False) @click.option('--metadata/--no-metadata', default=True, show_default=True, help="Include/don't include metadata about when and how data was collected.") -@configuration_option(cmd_name='twarc', config_file_name='config', provider=config_provider) +@configuration_option(cmd_name='twarc') 
@click.pass_context def twarc2( ctx, consumer_key, consumer_secret, access_token, access_token_secret, bearer_token, - log, metadata, app_auth, verbose + log, metadata, app_auth ): """ Collect data from the Twitter V2 API. """ logging.basicConfig( filename=log, - level=logging.DEBUG if verbose else logging.INFO, + level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s" ) - logging.info("using config %s", config_provider.file_path) - if bearer_token or (consumer_key and consumer_secret): if app_auth and (bearer_token or (consumer_key and consumer_secret)): ctx.obj = twarc.Twarc2( @@ -108,19 +103,15 @@ def configure(ctx): """ Set up your Twitter app keys. """ - - config_file = config_provider.file_path - logging.info('creating config file: %s', config_file) - - config_dir = pathlib.Path(config_file).parent - if not config_dir.is_dir(): - logging.info('creating config directory: %s', config_dir) - config_dir.mkdir(parents=True) - keys = handshake() if keys is None: raise click.ClickException("Unable to authenticate") + config_dir = pathlib.Path(click.get_app_dir('twarc')) + if not config_dir.is_dir(): + config_dir.mkdir(parents=True) + config_file = config_dir / 'config' + config = configobj.ConfigObj(unrepr=True) config.filename = config_file @@ -167,12 +158,14 @@ def get_version(): help='Search the full archive (requires Academic Research track)') @click.option('--limit', default=0, help='Maximum number of tweets to save') @click.option('--max-results', default=0, help='Maximum number of tweets per API response') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet') @click.argument('query', type=str) @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj @cli_api_error def search(T, query, outfile, since_id, until_id, start_time, end_time, limit, - max_results, archive): + max_results, archive, flatten): """ Search for tweets. """ @@ -184,6 +177,12 @@ def search(T, query, outfile, since_id, until_id, start_time, end_time, limit, # default number of tweets per response 500 when not set otherwise if max_results == 0: max_results = 500 + + # if the user is searching the historical archive the assumption is that + # they want to search everything, and not just the previous month which + # is the default: https://github.com/DocNow/twarc/issues/434 + if start_time == None and since_id == None: + start_time = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc) else: if max_results == 0: max_results = 100 @@ -191,19 +190,21 @@ def search(T, query, outfile, since_id, until_id, start_time, end_time, limit, for result in search_method(query, since_id, until_id, start_time, end_time, max_results): - _write(result, outfile) + _write(result, outfile, flatten) count += len(result['data']) if limit != 0 and count >= limit: break @twarc2.command('tweet') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet') @click.option('--pretty', is_flag=True, default=False, help='Pretty print the JSON') @click.argument('tweet_id', type=str) @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj @cli_api_error -def tweet(T, tweet_id, outfile, pretty): +def tweet(T, tweet_id, outfile, flatten, pretty): """ Look up a tweet using its tweet id or URL. 
""" @@ -212,23 +213,25 @@ def tweet(T, tweet_id, outfile, pretty): if not re.match('^\d+$', tweet_id): click.echo(click.style("Please enter a tweet URL or ID", fg="red"), err=True) result = next(T.tweet_lookup([tweet_id])) - _write(result, outfile, pretty=pretty) + _write(result, outfile, flatten, pretty=pretty) @twarc2.command('followers') @click.option('--limit', default=0, help='Maximum number of followers to save') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with users, and one line per user') @click.argument('user', type=str) @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj @cli_api_error -def followers(T, user, outfile, limit): +def followers(T, user, outfile, limit, flatten): """ Get the followers for a given user. """ count = 0 for result in T.followers(user): - _write(result, outfile) + _write(result, outfile, flatten) count += len(result['data']) if limit != 0 and count >= limit: break @@ -236,18 +239,20 @@ def followers(T, user, outfile, limit): @twarc2.command('following') @click.option('--limit', default=0, help='Maximum number of friends to save') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with users, and one line per user') @click.argument('userd', type=str) @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj @cli_api_error -def following(T, user, outfile, limit): +def following(T, user, outfile, limit, flatten): """ Get the users who are following a given user. """ count = 0 for result in T.following(user): - _write(result, outfile) + _write(result, outfile, flatten) count += len(result['data']) if limit != 0 and count >= limit: break @@ -255,10 +260,12 @@ def following(T, user, outfile, limit): @twarc2.command('sample') @click.option('--limit', default=0, help='Maximum number of tweets to save') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet.') @click.argument('outfile', type=click.File('a+'), default='-') @click.pass_obj @cli_api_error -def sample(T, outfile, limit): +def sample(T, flatten, outfile, limit): """ Fetch tweets from the sample stream. """ @@ -269,35 +276,38 @@ def sample(T, outfile, limit): count += 1 if limit != 0 and count >= limit: event.set() - _write(result, outfile) + _write(result, outfile, flatten) @twarc2.command('hydrate') @click.argument('infile', type=click.File('r'), default='-') @click.argument('outfile', type=click.File('w'), default='-') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet.') @click.pass_obj @cli_api_error -def hydrate(T, infile, outfile): +def hydrate(T, infile, outfile, flatten): """ Hydrate tweet ids. """ for result in T.tweet_lookup(infile): - _write(result, outfile) + _write(result, outfile, flatten) @twarc2.command('users') @click.option('--usernames', is_flag=True, default=False) +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet.') @click.argument('infile', type=click.File('r'), default='-') @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj @cli_api_error -def users(T, infile, outfile, usernames): +def users(T, infile, outfile, usernames, flatten): """ Get data for user ids or usernames. 
""" for result in T.user_lookup(infile, usernames): - _write(result, outfile) - + _write(result, outfile, flatten) @twarc2.command('mentions') @click.option('--since-id', type=int, @@ -310,20 +320,20 @@ def users(T, infile, outfile, usernames): @click.option('--end-time', type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), help='Match tweets sent before time (ISO 8601/RFC 3339)') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet') @click.argument('user_id', type=str) @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj @cli_api_error -def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time): +def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time, flatten): """ Retrieve the most recent tweets mentioning the given user. """ for result in T.mentions(user_id, since_id, until_id, start_time, end_time): - _write(result, outfile) - + _write(result, outfile, flatten) @twarc2.command('timeline') -@click.option('--limit', default=0, help='Maximum number of tweets to return') @click.option('--since-id', type=int, help='Match tweets sent after tweet id') @click.option('--until-id', type=int, @@ -334,183 +344,18 @@ def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time): @click.option('--end-time', type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), help='Match tweets sent before time (ISO 8601/RFC 3339)') -@click.option('--use-search', is_flag=True, default=False, - help='Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet') @click.argument('user_id', type=str) @click.argument('outfile', type=click.File('w'), default='-') @click.pass_obj @cli_api_error -def timeline(T, user_id, outfile, since_id, until_id, start_time, end_time, - use_search, limit): - """ - Retrieve recent tweets for the given user. - """ - - if use_search: - q = f'from:{user_id}' - tweets = T.search_all(q, since_id, until_id, start_time, end_time) - else: - tweets = T.timeline(user_id, since_id, until_id, start_time, end_time) - - count = 0 - for result in tweets: - _write(result, outfile) - - count += len(result['data']) - if limit != 0 and count >= limit: - break - - -@twarc2.command('timelines') -@click.option('--limit', default=0, help='Maximum number of tweets to return') -@click.option('--timeline-limit', default=0, - help='Maximum number of tweets to return per-timeline') -@click.option('--use-search', is_flag=True, default=False, - help='Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.') -@click.argument('infile', type=click.File('r'), default='-') -@click.argument('outfile', type=click.File('w'), default='-') -@click.pass_obj -def timelines(T, infile, outfile, limit, timeline_limit, use_search): +def timeline(T, user_id, outfile, since_id, until_id, start_time, end_time, flatten): """ - Fetch the timelines of every user in an input source of tweets. If - the input is a line oriented text file of user ids or usernames that will - be used instead. + Retrieve the 3200 most recent tweets for the given user. 
""" - total_count = 0 - seen = set() - for line in infile: - line = line.strip() - if line == "": - continue - - users = [] - try: - data = ensure_flattened(json.loads(line)) - users = set([t['author']['id'] for t in ensure_flattened(data)]) - except json.JSONDecodeError: - users = set([line]) - except ValueError: - users = set([line]) - - for user in users: - - # only process a given user once - if user in seen: - continue - seen.add(user) - - # which api endpoint to use - if use_search and since_id: - tweets = T.search_all(f'from:{user}', since_id=since_id) - elif use_search: - tweets = T.search_all(f'from:{user}') - else: - tweets = T.timeline(user) - - timeline_count = 0 - for response in tweets: - _write(response, outfile) - - timeline_count += len(response['data']) - if timeline_limit != 0 and timeline_count >= timeline_limit: - break - - total_count += len(response['data']) - if limit != 0 and total_count >= limit: - return - - -@twarc2.command('conversation') -@click.option('--archive', is_flag=True, default=False, - help='Search the full archive (requires Academic Research track)') -@click.argument('tweet_id', type=str) -@click.argument('outfile', type=click.File('w'), default='-') -@click.pass_obj -@cli_api_error -def conversation(T, tweet_id, archive, outfile): - """ - Retrieve a conversation thread using the tweet id. - """ - q = f'conversation_id:{tweet_id}' - if archive: - search = T.search_all(q) - else: - search = T.search_recent(q) - for resp in search: - _write(resp, outfile) - - -@twarc2.command('conversations') -@click.option('--limit', default=0, help='Maximum number of tweets to return') -@click.option('--conversation-limit', default=0, - help='Maximum number of tweets to return per-conversation') -@click.option('--archive', is_flag=True, default=False, - help='Use the Academic Research project track access to the full archive') -@click.argument('infile', type=click.File('r'), default='-') -@click.argument('outfile', type=click.File('w'), default='-') -@click.pass_obj -@cli_api_error -def conversations(T, infile, outfile, archive, limit, conversation_limit): - """ - Fetch the full conversation threads that the input tweets are a part of. - Alternatively the input can be a line oriented file of conversation ids. - """ - - # keep track of converstation ids that have been fetched so that they - # aren't fetched twice - seen = set() - - # use the archive or recent search? 
- search = T.search_all if archive else T.search_recent - - count = 0 - stop = False - for line in infile: - conv_ids = [] - - # stop will get set when the total tweet limit has been met - if stop: - break - - # get a specific conversation id - line = line.strip() - if re.match(r'^\d+$', line): - if line in seen: - continue - conv_ids = [line] - - # generate all conversation_ids that are referenced in tweets input - else: - def f(): - for tweet in ensure_flattened(json.loads(line)): - yield tweet.get('conversation_id') - conv_ids = f() - - # output results while paying attention to the set limits - conv_count = 0 - - for conv_id in conv_ids: - - if conv_id in seen: - logging.info(f'already fetched conversation_id {conv_id}') - seen.add(conv_id) - - conv_count = 0 - - logging.info(f'fetching conversation {conv_id}') - for result in search(f'conversation_id:{conv_id}'): - _write(result, outfile, False) - - count += len(result['data']) - if limit != 0 and count >= limit: - logging.info(f'reached tweet limit of {limit}') - stop = True - break - - conv_count += len(result['data']) - if conversation_limit !=0 and conv_count >= conversation_limit: - logging.info(f'reached conversation limit {conversation_limit}') - break + for result in T.timeline(user_id, since_id, until_id, start_time, end_time): + _write(result, outfile, flatten) @twarc2.command('flatten') @@ -519,24 +364,25 @@ def f(): @cli_api_error def flatten(infile, outfile): """ - "Flatten" tweets, or move expansions inline with tweet objects and ensure - that each line of output is a single tweet. + "Flatten" tweets, or move expansions inline with tweet objects. """ if (infile.name == outfile.name): click.echo(click.style(f"💔 Cannot flatten files in-place, specify a different output file!", fg='red'), err=True) return for line in infile: - for tweet in ensure_flattened(json.loads(line)): - _write(tweet, outfile, False) + result = json.loads(line) + _write(result, outfile, True) @twarc2.command('stream') @click.option('--limit', default=0, help='Maximum number of tweets to return') +@click.option('--flatten', is_flag=True, default=False, + help='Include expansions inline with tweets, and one line per tweet') @click.argument('outfile', type=click.File('a+'), default='-') @click.pass_obj @cli_api_error -def stream(T, outfile, limit): +def stream(T, flatten, outfile, limit): """ Fetch tweets from the live stream. 
""" @@ -552,7 +398,7 @@ def stream(T, outfile, limit): if limit != 0 and count == limit: logging.info(f'reached limit {limit}') event.set() - _write(result, outfile) + _write(result, outfile, flatten) @twarc2.group() @@ -690,6 +536,17 @@ def _error_str(errors): return click.style("\n".join(parts), fg="red") -def _write(results, outfile, pretty=False): +def _write(results, outfile, flatten, pretty=False): indent = 2 if pretty else None - click.echo(json.dumps(results, indent=indent), file=outfile) + if 'data' in results: + if flatten: + if isinstance(results['data'], list): + for r in flat(results)['data']: + click.echo(json.dumps(r, indent=indent), file=outfile) + else: + r = flat(results)['data'] + click.echo(json.dumps(r, indent=indent), file=outfile) + else: + click.echo(json.dumps(results, indent=indent), file=outfile) + else: + click.echo(json.dumps(results, indent=indent), file=outfile) diff --git a/twarc/config.py b/twarc/config.py deleted file mode 100644 index 3fe2096d..00000000 --- a/twarc/config.py +++ /dev/null @@ -1,16 +0,0 @@ -import logging -import configobj - -# Adapted from click_config_file.configobj_provider so that we can store the -# file path that the config was loaded from in order to log it later. - -log = logging - -class ConfigProvider(): - - def __init__(self): - self.file_path = None - - def __call__(self, file_path, cmd_name): - self.file_path = file_path - return configobj.ConfigObj(file_path, unrepr=True) diff --git a/twarc/expansions.py b/twarc/expansions.py index 22a57c24..64ec6fcc 100644 --- a/twarc/expansions.py +++ b/twarc/expansions.py @@ -1,10 +1,8 @@ """ This module contains a list of the known Twitter V2+ API expansions and fields -for each expansion, and a function flatten() for "flattening" a result set, -including all expansions inline. +for each expansion, and a function for "flattening" a result set, including all +expansions inline -ensure_flattened() can be used in tweet processing programs that need to make -sure that data is flattened. """ from collections import defaultdict @@ -117,13 +115,9 @@ def extract_includes(response, expansion, _id="id"): def flatten(response): """ - Flatten an API response by moving all "included" entities inline with the - tweets they are referenced from. flatten expects an entire page response - from the API (data, includes, meta) and will raise a ValueError if what is - passed in does not appear to be an API response. It will return a list of - dictionaries where each dictionary represents a tweet. Empty objects will - be returned for things that are missing in includes, which can happen when - protected or delete users or tweets are referenced. + Flatten the response. Expects an entire page response from the API (data, + includes, meta) Defaults: Return empty objects for things missing in + includes. Doesn't modify tweets, only adds extra data. 
""" # Users extracted both by id and by username for expanding mentions @@ -197,60 +191,17 @@ def expand_payload(payload): return payload - # First expand the included tweets, before processing actual result tweets: + # First, expand the included tweets, before processing actual result tweets: for included_id, included_tweet in extract_includes(response, "tweets").items(): includes_tweets[included_id] = expand_payload(included_tweet) # Now flatten the list of tweets or an individual tweet - tweets = [] if "data" in response: - data = response['data'] - - if isinstance(data, list): - tweets = expand_payload(response["data"]) - elif isinstance(data, dict): - tweets = [expand_payload(response["data"])] + response["data"] = expand_payload(response["data"]) # Add the __twarc metadata to each tweet if it's a result set - if "__twarc" in response: - for tweet in tweets: + if "__twarc" in response and isinstance(response["data"], list): + for tweet in response["data"]: tweet["__twarc"] = response["__twarc"] - else: - raise ValueError(f'missing data stanza in response: {response}') - - return tweets - - -def ensure_flattened(data): - """ - Will ensure that the supplied data is "flattened". The input data can be a - response from the Twitter API, a list of tweet dictionaries, or a single tweet - dictionary. It will always return a list of tweet dictionaries. A ValueError - will be thrown if the supplied data is not recognizable or it cannot be - flattened. - - ensure_flattened is designed for use in twarc plugins and other tweet - processing applications that want to operate on a stream of tweets, and - examine included entities like users and tweets without hunting and - pecking in the response data. - """ - if isinstance(data, dict) and 'data' in data: - return flatten(data) - - elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict): - # if author is present it is already flattened - if 'author' in data[0]: - return data - else: - raise ValueError('unable to flatten list of tweets without original response data: {data}') - - elif isinstance(data, dict) and 'author' in data: - # if author is present it is already flattened - if 'author' in data: - return [data] - else: - raise ValueError(f'unable to flatten tweet dictionary without original response data: {data}') - - else: - raise ValueError(f'cannot flatten unrecognized data: {data}') + return response diff --git a/twarc/handshake.py b/twarc/handshake.py index cafacd0d..dce86c0a 100644 --- a/twarc/handshake.py +++ b/twarc/handshake.py @@ -6,6 +6,7 @@ from requests_oauthlib import OAuth1 from urllib.parse import parse_qs +from getpass import getpass def handshake(): @@ -15,7 +16,7 @@ def handshake(): access_token = "" access_token_secret = "" - bearer_token = input( + bearer_token = getpass( "Please enter your Bearer Token (leave blank to skip to API key configuration): " ) @@ -31,7 +32,7 @@ def handshake(): "Configure API keys and secrets." 
consumer_key = input("Please enter your API key: ") - consumer_secret = input("Please enter your API secret: ") + consumer_secret = getpass("Please enter your API secret: ") # verify that the keys work to get the bearer token url = "https://api.twitter.com/oauth2/token" @@ -95,7 +96,7 @@ def handshake(): screen_name = credentials.get('screen_name')[0] else: access_token = input("Enter your Access Token: ") - access_token_secret = input("Enter your Access Token Secret: ") + access_token_secret = getpass("Enter your Access Token Secret: ") screen_name = "default" return { diff --git a/twarc/version.py b/twarc/version.py index 4f58e967..e0c5a628 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1 +1 @@ -version = '2.1.8' +version = '2.0.13' diff --git a/twarc2.py b/twarc2.py new file mode 100644 index 00000000..aa5f2278 --- /dev/null +++ b/twarc2.py @@ -0,0 +1,4 @@ +from twarc.command2 import twarc2 + +if __name__ == "__main__": + twarc2(prog_name="python -m twarc2") diff --git a/utils/source.py b/utils/source.py index 18d87c05..4f9d4284 100755 --- a/utils/source.py +++ b/utils/source.py @@ -5,6 +5,7 @@ Example usage: utils/source.py tweets.jsonl > sources.html """ +from __future__ import print_function import json import fileinput from collections import defaultdict @@ -54,14 +55,14 @@

Twitter client sources

- created on the command line with twarc + created on the command line with twarc
""") for source in sumsort: - print(''.format(source, summary[source])) + print(''.format(source.encode('utf-8'), summary[source])) print(""" @@ -70,7 +71,7 @@


-created on the command line with twarc. +created on the command line with twarc.

diff --git a/utils/wall.py b/utils/wall.py index 836628df..9b865602 100755 --- a/utils/wall.py +++ b/utils/wall.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- """ Feed wall.py your JSON and get a wall of tweets as HTML. If you want to get the @@ -7,6 +8,7 @@ % tail -r tweets.jsonl | ./wall.py > wall.html """ +from __future__ import print_function import os import re @@ -109,7 +111,7 @@ def text(t):

Title Here

- created on the command line with twarc + created on the command line with twarc
@@ -147,10 +149,10 @@ def text(t): "created_at": tweet["created_at"], "name": tweet["user"]["name"], "username": tweet["user"]["screen_name"], - "user_url": "https://twitter.com/" + tweet["user"]["screen_name"], + "user_url": "http://twitter.com/" + tweet["user"]["screen_name"], "text": text(tweet), "avatar": AVATAR_DIR + "/" + filename, - "url": "https://twitter.com/" + tweet["user"]["screen_name"] + "/status/" + tweet["id_str"], + "url": "http://twitter.com/" + tweet["user"]["screen_name"] + "/status/" + tweet["id_str"], } if 'retweet_status' in tweet: @@ -158,19 +160,20 @@ def text(t): else: t['retweet_count'] = tweet.get('retweet_count', 0) - t['favorite_count'] = tweet.get('favorite_count', 0) - t['retweet_string'] = 'retweet' if t['retweet_count'] == 1 else 'retweets' - t['favorite_string'] = 'like' if t['favorite_count'] == 1 else 'likes' + if t['retweet_count'] == 1: + t['retweet_string'] = 'retweet' + else: + t['retweet_string'] = 'retweets' for url in tweet['entities']['urls']: a = '%(url)s' % url start, end = url['indices'] t['text'] = t['text'][0:start] + a + t['text'][end:] - t['text'] = re.sub('@([A-Za-z0-9_]+)', r'@\g<1>', t['text']) - t['text'] = re.sub(' #([^ ]+)', r' #\g<1>', t['text']) + t['text'] = re.sub(' @([^ ]+)', ' @\g<1>', t['text']) + t['text'] = re.sub(' #([^ ]+)', ' #\g<1>', t['text']) - html = """ + html = u"""
%(name)s
@@ -178,13 +181,16 @@ def text(t):
%(text)s

- %(retweet_count)s %(retweet_string)s, %(favorite_count)s %(favorite_string)s
+ %(retweet_count)s %(retweet_string)s
""" % t - print(html) + if sys.version_info.major == 2: + print(html.encode('utf8')) + else: + print(html) print(""" @@ -193,7 +199,7 @@ def text(t):


-created on the command line with twarc. +created on the command line with twarc.

From 1303abcd338e258be338f01534e4e01d2db98e74 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Wed, 27 Oct 2021 11:47:27 +0100 Subject: [PATCH 5/8] Revert "Merge branch 'main' into random-sample" This reverts commit 730e98dab72172a4f148358393905b62d5e9d6cd, reversing changes made to 14c733681c90d02664a5cbd5adae97261db3f863. --- README.md | 2 +- twarc/__main__.py | 5 ----- twarc/command2.py | 14 ++++++-------- twarc2.py | 4 ---- 4 files changed, 7 insertions(+), 18 deletions(-) delete mode 100644 twarc/__main__.py delete mode 100644 twarc2.py diff --git a/README.md b/README.md index c0551a68..51fc74b8 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ If you prefer you can create a page on the [wiki](https://github.com/docnow/twar If you are interested in adding functionality to twarc or fixing something that's broken here are the steps to setting up your development environment: - git clone https://github.com/docnow/twarc + git clone https://github.io/docnow/twarc cd twarc pip install -r requirements.txt diff --git a/twarc/__main__.py b/twarc/__main__.py deleted file mode 100644 index 53f093ef..00000000 --- a/twarc/__main__.py +++ /dev/null @@ -1,5 +0,0 @@ -import twarc.command - -if __name__ == "__main__": - twarc.command.main() - diff --git a/twarc/command2.py b/twarc/command2.py index b1de6f92..c0df34bf 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -150,10 +150,10 @@ def get_version(): help='Match tweets sent prior to tweet id') @click.option('--start-time', type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets created after UTC time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04') + help='Match tweets created after time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04') @click.option('--end-time', type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets sent before UTC time (ISO 8601/RFC 3339)') + help='Match tweets sent before time (ISO 8601/RFC 3339)') @click.option('--archive', is_flag=True, default=False, help='Search the full archive (requires Academic Research track)') @click.option('--limit', default=0, help='Maximum number of tweets to save') @@ -388,11 +388,9 @@ def stream(T, flatten, outfile, limit): """ event = threading.Event() count = 0 - click.echo(click.style(f'Started a stream with rules:', fg='green'), - err=True) + click.echo(click.style(f'Started a stream with rules:', fg='green')) _print_stream_rules(T) - click.echo(click.style(f'Writing to {outfile.name}\nCTRL+C to stop...', - fg='green'), err=True) + click.echo(click.style(f'Writing to {outfile.name}\nCTRL+C to stop...', fg='green')) for result in T.stream(event=event): count += 1 if limit != 0 and count == limit: @@ -425,7 +423,7 @@ def _print_stream_rules(T): """ result = T.get_stream_rules() if 'data' not in result or len(result['data']) == 0: - click.echo('No rules yet. Add them with ' + click.style('twarc2 stream-rules add', bold=True), err=True) + click.echo('No rules yet. 
Add them with ' + click.style('twarc2 stream-rules add', bold=True)) else: count = 0 for rule in result['data']: @@ -434,7 +432,7 @@ def _print_stream_rules(T): s = rule['value'] if 'tag' in rule: s += f" (tag: {rule['tag']})" - click.echo(click.style(f'☑ {s}'), err=True) + click.echo(click.style(f'☑ {s}')) count += 1 diff --git a/twarc2.py b/twarc2.py deleted file mode 100644 index aa5f2278..00000000 --- a/twarc2.py +++ /dev/null @@ -1,4 +0,0 @@ -from twarc.command2 import twarc2 - -if __name__ == "__main__": - twarc2(prog_name="python -m twarc2") From 1a8358a0282b4ed464dc8be08d2279e2e5c9e32a Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Wed, 27 Oct 2021 11:47:51 +0100 Subject: [PATCH 6/8] Revert "add and move ts functions to helpers" This reverts commit 14c733681c90d02664a5cbd5adae97261db3f863. --- twarc/client2.py | 11 ++++++----- twarc/helpers.py | 39 ++++++--------------------------------- twarc/version.py | 2 +- 3 files changed, 13 insertions(+), 39 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index b7457a6b..ce36501c 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -9,6 +9,7 @@ import json import logging import requests +import datetime import time from oauthlib.oauth2 import BackendApplicationClient @@ -120,9 +121,9 @@ def _search( if until_id: params["until_id"] = until_id if start_time: - params["start_time"] = ts(start_time) + params["start_time"] = _ts(start_time) if end_time: - params["end_time"] = ts(end_time) + params["end_time"] = _ts(end_time) count = 0 made_call = time.monotonic() @@ -486,9 +487,9 @@ def _timeline( if until_id: params["until_id"] = until_id if start_time: - params["start_time"] = ts(start_time) + params["start_time"] = _ts(start_time) if end_time: - params["end_time"] = ts(end_time) + params["end_time"] = _ts(end_time) count = 0 for response in self.get_paginated(url, params=params): @@ -756,6 +757,6 @@ def _append_metadata(result, url): result["__twarc"] = { "url": url, "version": version, - "retrieved_at": utcnow() + "retrieved_at": _utcnow() } return result diff --git a/twarc/helpers.py b/twarc/helpers.py index 11f32098..cfb259d0 100644 --- a/twarc/helpers.py +++ b/twarc/helpers.py @@ -1,10 +1,8 @@ """ Useful functions for converting things into different types """ -import datetime - -def ts(dt): +def _ts(dt): """ Return ISO 8601 / RFC 3339 datetime in UTC. If no timezone is specified it is assumed to be in UTC. The Twitter API does not accept microseconds. @@ -19,40 +17,15 @@ def ts(dt): dt = dt.astimezone(datetime.timezone.utc) else: dt = dt.replace(tzinfo=datetime.timezone.utc) - return dt.isoformat(timespec="seconds") - + return dt.isoformat(timespec='seconds') -def utcnow(): +def _utcnow(): """ Return _now_ in ISO 8601 / RFC 3339 datetime in UTC. Returns: datetime: Current timestamp in UTC. """ - return datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds") - - -def _snowflake2millis(snowflake_id): - return (snowflake_id >> 22) + 1288834974657 - - -def _millis2snowflake(milliseconds): - return (int(milliseconds) - 1288834974657) << 22 - - -def _get_millis(ms): - return ms % 1000 - - -def _sample_windows(start_ts, end_ts, sample_type): - """ - todo: Generate tuples of start and end snowflake ids between two timestamps - - sample_type - type of random sample and millisecond range: - _1% "Spritzer" Sample [657-666] - 10% "Gardenhose" Sample [657-756] - 10% "Enterprise" Sample [*0*] - _1% v2 Sample [?] - _N% v2 Sample [?] 
- """ - pass + return datetime.datetime.now(datetime.timezone.utc).isoformat( + timespec='seconds' + ) diff --git a/twarc/version.py b/twarc/version.py index e0c5a628..9febec5a 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1 +1 @@ -version = '2.0.13' +version = '2.0.12' From c458799fa15a2795c091bbc26a086f8426a4c105 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Wed, 27 Oct 2021 11:48:08 +0100 Subject: [PATCH 7/8] Revert "move time related functions to helpers" This reverts commit 05c6f27f5e28a629fc08cdf2c5d2a457f8f81e5d. --- twarc/client2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/twarc/client2.py b/twarc/client2.py index ce36501c..d2e43e37 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -19,7 +19,6 @@ from twarc import expansions from twarc.decorators import * -from twarc.helpers import * from twarc.version import version From 42b9306deb0f8c327921b801fdb4997ae04fbf66 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Wed, 27 Oct 2021 11:48:22 +0100 Subject: [PATCH 8/8] Revert "move time related functions to helpers" This reverts commit 477676c7df1eb9525c1a73c8044c1110caef4350. --- twarc/client2.py | 28 ++++++++++++++++++++++++++++ twarc/helpers.py | 31 ------------------------------- 2 files changed, 28 insertions(+), 31 deletions(-) delete mode 100644 twarc/helpers.py diff --git a/twarc/client2.py b/twarc/client2.py index d2e43e37..2ee2f29f 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -740,6 +740,34 @@ def _ensure_user_id(self, user): else: raise ValueError(f"No such user {user}") +def _ts(dt): + """ + Return ISO 8601 / RFC 3339 datetime in UTC. If no timezone is specified it + is assumed to be in UTC. The Twitter API does not accept microseconds. + + Args: + dt (datetime): a `datetime` object to format. + + Returns: + str: an ISO 8601 / RFC 3339 datetime in UTC. + """ + if dt.tzinfo: + dt = dt.astimezone(datetime.timezone.utc) + else: + dt = dt.replace(tzinfo=datetime.timezone.utc) + return dt.isoformat(timespec='seconds') + +def _utcnow(): + """ + Return _now_ in ISO 8601 / RFC 3339 datetime in UTC. + + Returns: + datetime: Current timestamp in UTC. + """ + return datetime.datetime.now(datetime.timezone.utc).isoformat( + timespec='seconds' + ) + def _append_metadata(result, url): """ Appends `__twarc` metadata to the result. diff --git a/twarc/helpers.py b/twarc/helpers.py deleted file mode 100644 index cfb259d0..00000000 --- a/twarc/helpers.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Useful functions for converting things into different types -""" - -def _ts(dt): - """ - Return ISO 8601 / RFC 3339 datetime in UTC. If no timezone is specified it - is assumed to be in UTC. The Twitter API does not accept microseconds. - - Args: - dt (datetime): a `datetime` object to format. - - Returns: - str: an ISO 8601 / RFC 3339 datetime in UTC. - """ - if dt.tzinfo: - dt = dt.astimezone(datetime.timezone.utc) - else: - dt = dt.replace(tzinfo=datetime.timezone.utc) - return dt.isoformat(timespec='seconds') - -def _utcnow(): - """ - Return _now_ in ISO 8601 / RFC 3339 datetime in UTC. - - Returns: - datetime: Current timestamp in UTC. - """ - return datetime.datetime.now(datetime.timezone.utc).isoformat( - timespec='seconds' - )
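PATCH 3 also stubs out snowflake helpers for the planned random-sample work: a
tweet ID encodes its creation time in the bits above the low 22 worker/sequence
bits, as a millisecond offset from the snowflake epoch (1288834974657 ms, i.e.
2010-11-04T01:42:54.657Z). A round-trip sketch using the `_snowflake2millis` and
`_millis2snowflake` definitions from that patch; the example tweet ID is
hypothetical:

    import datetime

    TWITTER_EPOCH_MS = 1288834974657  # snowflake epoch, ms since the Unix epoch

    def _snowflake2millis(snowflake_id):
        # Drop the low 22 bits (worker + sequence) and add the snowflake epoch.
        return (snowflake_id >> 22) + TWITTER_EPOCH_MS

    def _millis2snowflake(milliseconds):
        # Inverse mapping; the low 22 bits come back as zeros.
        return (int(milliseconds) - TWITTER_EPOCH_MS) << 22

    tweet_id = 1390000000000000000  # hypothetical ID from early May 2021
    millis = _snowflake2millis(tweet_id)
    created = datetime.datetime.fromtimestamp(millis / 1000, tz=datetime.timezone.utc)
    print(created.isoformat(timespec="seconds"))

    # Converting back zeroes the worker/sequence bits but preserves the timestamp.
    assert _snowflake2millis(_millis2snowflake(millis)) == millis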