Random sample option #459

Closed
wants to merge 10 commits
2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ If you prefer you can create a page on the [wiki](https://github.com/docnow/twarc

If you are interested in adding functionality to twarc or fixing something that's broken, here are the steps to set up your development environment:

git clone https://github.com/docnow/twarc
git clone https://github.io/docnow/twarc
cd twarc
pip install -r requirements.txt

2 changes: 1 addition & 1 deletion docs/README.md
@@ -1,7 +1,7 @@
twarc
=====

***For information about working with the Twitter V2 API please see the [twarc2](https://twarc-project.readthedocs.io/en/latest/twarc2/) page.***
***For information about working with the Twitter V2 API please see the [twarc2](twarc2) page.***

---

4 changes: 4 additions & 0 deletions docs/api/client.md
@@ -2,3 +2,7 @@

::: twarc.client
handler: python




1 change: 1 addition & 0 deletions docs/api/client2.md
@@ -2,3 +2,4 @@

::: twarc.client2
handler: python

4 changes: 0 additions & 4 deletions docs/api/expansions.md

This file was deleted.

2 changes: 1 addition & 1 deletion docs/plugins.md
@@ -16,7 +16,7 @@ add it to this list):
* [twarc-ids](https://pypi.org/project/twarc-ids/): extract tweet ids from tweets
* [twarc-videos](https://pypi.org/project/twarc-videos): extract videos from tweets
* [twarc-csv](https://pypi.org/project/twarc-csv/): export tweets to CSV
* [twarc-timeline-archive](https://pypi.org/project/twarc-timeline-archive): routinely download tweet timelines for a list of users
* [twarc-timelines](https://pypi.org/project/twarc-timelines): download tweet timelines for a list of users

## Writing a Plugin

2 changes: 0 additions & 2 deletions docs/twitter-developer-access.md
@@ -61,8 +61,6 @@ Now that you have your keys and tokens, you can start using the API. You may be

Be careful not to commit your keys into a public repository or make them visible to the public - do not include them in a client side js script for example. Most apps will ask for API Key and Secret, but "Consumer Key" is "API Key" and "Consumer Secret" is "API Secret".

For Academic Access, there is only one endpoint that takes Bearer (App Only) authentication, so in most cases, the Bearer Token is all you need to share.

## Step 5: Next Steps

Install `twarc`, and run `twarc2 configure` to set it up.
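For reference, the `twarc2 configure` step described above can also be bypassed by handing credentials straight to the client. A minimal sketch, not part of this diff; the environment variable names follow the style of `test_twarc2.py` later in the diff, and `BEARER_TOKEN` in particular is an assumption:

```python
import os

from twarc import Twarc2

# Bearer (app-only) auth is enough for most read-only v2 endpoints.
T = Twarc2(bearer_token=os.environ["BEARER_TOKEN"])

# User (OAuth 1.0a) auth, when an endpoint requires it. The developer
# portal's "API Key"/"API Secret" are the "Consumer Key"/"Consumer Secret"
# expected here.
# T = Twarc2(
#     consumer_key=os.environ["CONSUMER_KEY"],
#     consumer_secret=os.environ["CONSUMER_SECRET"],
#     access_token=os.environ["ACCESS_TOKEN"],
#     access_token_secret=os.environ["ACCESS_TOKEN_SECRET"],
# )
```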
1 change: 0 additions & 1 deletion mkdocs.yml
@@ -28,7 +28,6 @@ nav:
- Library API:
- api/client.md
- api/client2.md
- api/expansions.md

plugins:
- search
2 changes: 1 addition & 1 deletion setup.py
@@ -17,7 +17,7 @@
url='https://github.com/docnow/twarc',
author='Ed Summers',
author_email='[email protected]',
packages=['twarc'],
packages=['twarc', ],
description='Archive tweets from the command line',
long_description=long_description,
long_description_content_type="text/markdown",
59 changes: 17 additions & 42 deletions test_twarc2.py
@@ -5,7 +5,6 @@
import dotenv
import pytest
import logging
import pathlib
import datetime
import threading

@@ -16,7 +15,6 @@
access_token = os.environ.get('ACCESS_TOKEN')
access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')

test_data = pathlib.Path('test-data')
logging.basicConfig(filename="test.log", level=logging.INFO)

# Implicitly test the constructor in application auth mode. This ensures that
@@ -294,7 +292,6 @@ def test_follows():
break
assert found >= 1000


def test_follows_username():
"""
Test followers and following by username.
@@ -333,20 +330,16 @@ def test_flattened():
found_referenced_tweets = False

event = threading.Event()
for count, response in enumerate(T.sample(event=event)):

# streaming api always returns a tweet at a time but flatten
# will put these in a list so they can be treated uniformly
tweets = twarc.expansions.flatten(response)
assert len(tweets) == 1
tweet = tweets[0]
for count, result in enumerate(T.sample(event=event)):
result = twarc.expansions.flatten(result)

tweet = result["data"]
assert "id" in tweet
logging.info("got sample tweet #%s %s", count, tweet["id"])

author_id = tweet["author_id"]
assert "author" in tweet
assert tweet["author"]["id"] == author_id
assert result["data"]["author"]["id"] == author_id

if "in_reply_to_user_id" in tweet:
assert "in_reply_to_user" in tweet
@@ -369,11 +362,8 @@
assert tweet["entities"]["mentions"][0]["username"]
found_entities_mentions = True

# need to ensure there are no errors because a referenced tweet
# might be protected or deleted in which case it would not have been
# included in the response and would not have been flattened
if "errors" not in response and "referenced_tweets" in tweet:
assert tweet["referenced_tweets"][0]["text"]
if "referenced_tweets" in tweet:
assert tweet["referenced_tweets"][0]["id"]
found_referenced_tweets = True

if found_geo and found_in_reply_to_user and found_attachments_media \
@@ -393,33 +383,18 @@
assert found_referenced_tweets, "found referenced tweets"


def test_ensure_flattened():
resp = next(T.search_recent('twitter'))

# flatten a response
flat1 = twarc.expansions.ensure_flattened(resp)
assert isinstance(flat1, list)
assert len(flat1) > 1
assert 'author' in flat1[0]

# flatten the flattened list
flat2 = twarc.expansions.ensure_flattened(flat1)
assert isinstance(flat2, list)
assert len(flat2) == len(flat1)
assert 'author' in flat2[0]
def test_flatten_noop():
"""
Flattening twice should be a no-op.
"""
resp = next(T.tweet_lookup(range(1000, 2000)))

# flatten a tweet object which will force it into a list
flat3 = twarc.expansions.ensure_flattened(flat2[0])
assert isinstance(flat3, list)
assert len(flat3) == 1
flat1 = twarc.expansions.flatten(resp)
assert len(flat1) > 0

with pytest.raises(ValueError):
twarc.expansions.ensure_flattened({'fake': 'tweet'})
with pytest.raises(ValueError):
twarc.expansions.ensure_flattened([{'fake': 'tweet'}])
with pytest.raises(ValueError):
flat1[0].pop('author')
twarc.expansions.ensure_flattened(flat1)
flat2 = twarc.expansions.flatten(flat1)
assert len(flat2) > 0
assert len(flat1) == len(flat2)


def test_twarc_metadata():
@@ -433,7 +408,7 @@

for response in T.tweet_lookup(range(1000, 2000)):
assert "__twarc" in response
assert "__twarc" in twarc.expansions.flatten(response)[0]
assert "__twarc" in twarc.expansions.flatten(response)

# Without metadata
T.metadata = False
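The reworked `test_flattened` above exercises the new sample support end to end. A minimal usage sketch based on that test; the placeholder bearer token and the ten-tweet stopping condition are illustrative assumptions, and setting the event to stop the stream mirrors how the test signals that it is done:

```python
import threading

import twarc
from twarc import Twarc2

T = Twarc2(bearer_token="...")  # placeholder credentials

event = threading.Event()
for count, result in enumerate(T.sample(event=event)):
    # flatten() inlines the expansions, so the tweet carries its author,
    # referenced tweets, media, etc. directly.
    tweet = twarc.expansions.flatten(result)["data"]
    print(tweet["id"], tweet["author"]["id"])
    if count >= 10:
        event.set()  # signal the stream to stop
        break
```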
1 change: 0 additions & 1 deletion twarc/__init__.py
@@ -1,4 +1,3 @@
from .client import Twarc
from .client2 import Twarc2
from .version import version
from .expansions import ensure_flattened
5 changes: 0 additions & 5 deletions twarc/__main__.py

This file was deleted.

69 changes: 17 additions & 52 deletions twarc/client2.py
@@ -7,11 +7,10 @@
import re
import ssl
import json
import time
import logging
import datetime
import requests
import datetime
import time

from oauthlib.oauth2 import BackendApplicationClient
from requests.exceptions import ConnectionError
@@ -25,8 +24,6 @@

log = logging.getLogger("twarc")

TWITTER_EPOCH = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc)


class Twarc2:
"""
@@ -136,19 +133,17 @@ def _search(
count += len(response['data'])
yield response

else:
log.info(f'Retrieved an empty page of results.')

# Calculate the amount of time to sleep, accounting for any
# processing time used by the rest of the application.
# This is to satisfy the 1 request / 1 second rate limit
# on the search/all endpoint.
time.sleep(
max(0, sleep_between - (time.monotonic() - made_call))
)
made_call = time.monotonic()
# Calculate the amount of time to sleep, accounting for any
# processing time used by the rest of the application.
# This is to satisfy the 1 request / 1 second rate limit
# on the search/all endpoint.

log.info(f'No more results for search {query}.')
time.sleep(
max(0, sleep_between - (time.monotonic() - made_call))
)
made_call = time.monotonic()
else:
log.info(f'no more results for search')

def search_recent(
self, query, since_id=None, until_id=None, start_time=None,
@@ -211,13 +206,6 @@ def search_all(
generator[dict]: a generator, dict for each paginated response.
"""
url = "https://api.twitter.com/2/tweets/search/all"

# start time defaults to the beginning of Twitter to override the
# default of the last month. Only do this if start_time is not already
# specified and since_id isn't being used
if start_time is None and since_id is None:
start_time = TWITTER_EPOCH

return self._search(
url, query, since_id, until_id, start_time, end_time, max_results,
sleep_between=1.05
@@ -365,21 +353,6 @@ def sample(self, event=None, record_keepalive=False):
data = _append_metadata(data, resp.url)
yield data

# Check for an operational disconnect error in the response
if data.get("errors", []):
for error in data["errors"]:
if error.get("disconnect_type") == "OperationalDisconnect":
log.info(
"Received operational disconnect message: "
"This stream has fallen too far behind in "
"processing tweets. Some data may have been "
"lost."
)
# Sleep briefly, then break this get call and
# attempt to reconnect.
time.sleep(5)
break

except requests.exceptions.HTTPError as e:
errors += 1
log.error("caught http error %s on %s try", e, errors)
@@ -524,9 +497,7 @@ def _timeline(
count += len(response['data'])
yield response
else:
log.info(f'Retrieved an empty page of results for timeline {user_id}')

log.info(f'No more results for timeline {user_id}.')
log.info(f'no more results for timeline')

def timeline(
self, user, since_id=None, until_id=None, start_time=None,
@@ -735,15 +706,13 @@ def connect(self):
self.client.close()

if self.auth_type == "application" and self.bearer_token:
log.info('creating HTTP session headers for app auth.')
auth = f"Bearer {self.bearer_token}"
log.debug('authorization: %s', auth)
log.info('Creating HTTP session headers for app auth.')
self.client = requests.Session()
self.client.headers.update({"Authorization": auth})
self.client.headers.update(
{"Authorization": f"Bearer {self.bearer_token}"}
)
elif self.auth_type == "application":
log.info('creating app auth client via OAuth2')
log.debug('client_id: %s', self.consumer_key)
log.debug('client_secret: %s', self.consumer_secret)
log.info('Creating app auth client via OAuth2')
client = BackendApplicationClient(client_id=self.consumer_key)
self.client = OAuth2Session(client=client)
self.client.fetch_token(
@@ -753,10 +722,6 @@
)
else:
log.info('creating user auth client')
log.debug('client_id: %s', self.consumer_key)
log.debug('client_secret: %s', self.consumer_secret)
log.debug('resource_owner_key: %s', self.access_token)
log.debug('resource_owner_secret: %s', self.access_token_secret)
self.client = OAuth1Session(
client_key=self.consumer_key,
client_secret=self.consumer_secret,
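The comments in `_search` above explain the pacing between pages: the full-archive `search/all` endpoint is limited to 1 request per second, so the client sleeps only for whatever part of that interval was not already spent processing the previous page. The same pattern in isolation, as a sketch with a hypothetical `fetch_page` callable standing in for the HTTP request:

```python
import time

def paced_pages(fetch_page, sleep_between=1.05):
    """Yield pages from fetch_page while keeping at least sleep_between
    seconds between calls, minus time the consumer already used."""
    made_call = time.monotonic()
    while True:
        page = fetch_page()  # hypothetical: one paginated API request
        if page is None:
            break
        yield page
        # Sleep only for the part of the interval not already consumed by
        # the request and the caller's processing since the previous call.
        time.sleep(max(0, sleep_between - (time.monotonic() - made_call)))
        made_call = time.monotonic()
```

The default of 1.05 seconds matches the `sleep_between=1.05` that `search_all` passes to `_search` in this diff.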