Skip to content

Commit

Permalink
fix typing; update gql endpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
vladkens committed Dec 27, 2023
1 parent 7b62efb commit 61d159c
Show file tree
Hide file tree
Showing 17 changed files with 55,761 additions and 54,303 deletions.
2 changes: 1 addition & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1 @@
tests/mocked-data/* binary merge
tests/mocked-data/* binary merge
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pylint:
test:
@pytest -s --cov=twscrape tests/

show-cov:
test-cov:
@pytest -s --cov=twscrape tests/
@coverage html
@open htmlcov/index.html
Expand Down
83 changes: 71 additions & 12 deletions _get_gql_ops.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import json
import os
import re

import httpx
from fake_useragent import UserAgent

# note: update this url on next run
# url = "https://abs.twimg.com/responsive-web/client-web/api.f4ff3bfa.js"
# url = "https://abs.twimg.com/responsive-web/client-web/api.bb81931a.js"
url = "https://abs.twimg.com/responsive-web/client-web/main.45d48c6a.js"
client = httpx.Client(headers={"user-agent": UserAgent().chrome})

ops = """
SearchTimeline
Expand All @@ -23,13 +23,72 @@

ops = [op.strip() for op in ops.split("\n") if op.strip()]

script: str = httpx.get(url).text
pairs = re.findall(r'queryId:"(.+?)".+?operationName:"(.+?)"', script)
pairs = {op_name: op_id for op_id, op_name in pairs}

for x in ops:
print(f'OP_{x} = "{pairs.get(x, "???")}/{x}"')
def script_url(k: str, v: str) -> str:
    """Build the URL of a client-web JavaScript bundle.

    Args:
        k: Bundle (chunk) name, e.g. ``"main"``.
        v: Hash segment that sits between the bundle name and the ``.js`` suffix.

    Returns:
        The URL ``https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js``.
    """
    return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"


def _collect_script_urls(html: str) -> list:
    """Extract the client-web bundle URLs referenced by a page's HTML.

    Args:
        html: Text of the page that embeds the chunk map and the ``main`` bundle link.

    Returns:
        Chunk URLs followed by the ``main`` bundle URL, with i18n, icon and
        react-syntax-highlighter bundles filtered out.

    Raises:
        IndexError: If one of the expected markers is missing from ``html``.
        SystemExit: With code 1 if the embedded chunk map is not valid JSON
            (the raw text and the decode error are printed first).
    """
    # The chunk-name -> hash mapping is the JSON object sitting between these
    # two markers of the page's inline script.
    chunk_map_src = html.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
    try:
        chunk_map = json.loads(chunk_map_src)
    except json.JSONDecodeError as e:
        print(chunk_map_src)
        print(e)
        # `raise SystemExit` instead of `exit()`: the latter is injected by the
        # `site` module and is not guaranteed to exist.
        raise SystemExit(1) from e

    # The page builds chunk file names as "<name>.<hash>a.js", hence the "a" suffix.
    urls = [script_url(k, f"{v}a") for k, v in chunk_map.items()]

    main_hash = html.split("/client-web/main.")[1].split(".")[0]
    urls.append(script_url("main", main_hash))

    return [
        x
        for x in urls
        if "/i18n/" not in x and "/icons/" not in x and "react-syntax-highlighter" not in x
    ]


def get_scripts(cache_dir: str = "/tmp/twscrape-ops"):
    """Return the text of every relevant client-web bundle, using a disk cache.

    Fetches https://twitter.com/elonmusk with the module-level ``client``,
    derives the bundle URLs from it, then reads each bundle from ``cache_dir``
    when a file with the same name exists there, downloading and caching it
    otherwise.

    Args:
        cache_dir: Directory for cached bundles; created if missing.

    Returns:
        A list of bundle sources (``str``), in URL order.

    Raises:
        IndexError: If the page no longer contains the expected markers.
        SystemExit: If the embedded chunk map cannot be parsed as JSON.
        Exception: Whatever ``raise_for_status()`` raises on an error response.
    """
    os.makedirs(cache_dir, exist_ok=True)

    rep = client.get("https://twitter.com/elonmusk")
    rep.raise_for_status()
    urls = _collect_script_urls(rep.text)

    scripts = []
    for i, x in enumerate(urls, 1):
        # Cache key is the file name part of the URL, without any query string.
        cache_path = os.path.join(cache_dir, x.split("/")[-1].split("?")[0])
        if os.path.exists(cache_path):
            # Explicit encoding so the cache does not depend on the locale.
            with open(cache_path, encoding="utf-8") as fp:
                scripts.append(fp.read())
            continue

        print(f"({i:3d} / {len(urls):3d}) {x}")
        rep = client.get(x)
        rep.raise_for_status()

        with open(cache_path, "w", encoding="utf-8") as fp:
            fp.write(rep.text)
        scripts.append(rep.text)

    # for ??? check urls:
    # https://twitter.com/SpaceX/status/1719132541632864696/likes
    # https://twitter.com/i/lists/1494877848087187461
    # NOTE(review): "???" presumably refers to operations whose id was not
    # found in any bundle - confirm.
    return scripts


# Captures (queryId, operationName) for each GraphQL operation in a bundle.
# Compiled once instead of handing the pattern string to `re` on every pass.
_OP_RE = re.compile(r'queryId:"(.+?)".+?operationName:"(.+?)"')

# operationName -> queryId, merged over every downloaded script.
all_pairs = {}
for txt in get_scripts():
    # Within one script, the last match for an operation name wins.
    pairs = {op_name: op_id for op_id, op_name in _OP_RE.findall(txt)}

    for k, v in pairs.items():
        # Report operations whose id differs between scripts; the later one is kept.
        if k in all_pairs and v != all_pairs[k]:
            print(f"DIFF: {k} = {v} != {all_pairs[k]}")

        all_pairs[k] = v


# Dump every operation that was found.
for k, v in all_pairs.items():
    print(f'OP_{k} = "{v}/{k}"')

print("-" * 40)

# Then only the operations listed in `ops`; missing ones get "???" as the id.
for x in ops:
    print(f'OP_{x} = "{all_pairs.get(x, "???")}/{x}"')
Loading

0 comments on commit 61d159c

Please sign in to comment.