fix typing; update gql endpoints

vladkens · Dec 27, 2023 · 61d159c
1 parent 7b62efb
commit 61d159c
Show file tree

Hide file tree

Showing 17 changed files with 55,761 additions and 54,303 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1 +1 @@
-tests/mocked-data/*   binary merge
+tests/mocked-data/*   binary merge
diff --git a/Makefile b/Makefile
@@ -29,7 +29,7 @@ pylint:
 test:
 	@pytest -s --cov=twscrape tests/
 
-show-cov:
+test-cov:
 	@pytest -s --cov=twscrape tests/
 	@coverage html
 	@open htmlcov/index.html

diff --git a/_get_gql_ops.py b/_get_gql_ops.py
@@ -1,11 +1,11 @@
+import json
+import os
 import re
 
 import httpx
+from fake_useragent import UserAgent
 
-# note: update this url on next run
-# url = "https://abs.twimg.com/responsive-web/client-web/api.f4ff3bfa.js"
-# url = "https://abs.twimg.com/responsive-web/client-web/api.bb81931a.js"
-url = "https://abs.twimg.com/responsive-web/client-web/main.45d48c6a.js"
+client = httpx.Client(headers={"user-agent": UserAgent().chrome})
 
 ops = """
 SearchTimeline
@@ -23,13 +23,72 @@
 
 ops = [op.strip() for op in ops.split("\n") if op.strip()]
 
-script: str = httpx.get(url).text
-pairs = re.findall(r'queryId:"(.+?)".+?operationName:"(.+?)"', script)
-pairs = {op_name: op_id for op_id, op_name in pairs}
 
-for x in ops:
-    print(f'OP_{x} = "{pairs.get(x, "???")}/{x}"')
+def script_url(k: str, v: str):
+    return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
+
+
+def get_scripts():
+    cache_dir = "/tmp/twscrape-ops"
+    os.makedirs(cache_dir, exist_ok=True)
+
+    rep = client.get("https://twitter.com/elonmusk")
+    rep.raise_for_status()
+    urls = []
+
+    scripts = rep.text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
+    try:
+        for k, v in json.loads(scripts).items():
+            urls.append(script_url(k, f"{v}a"))
+    except json.decoder.JSONDecodeError as e:
+        print(scripts)
+        print(e)
+        exit(1)
+
+    v = rep.text.split("/client-web/main.")[1].split(".")[0]
+    urls.append(script_url("main", v))
+
+    urls = [
+        x
+        for x in urls
+        if "/i18n/" not in x and "/icons/" not in x and "react-syntax-highlighter" not in x
+    ]
+
+    scripts = []
+    for i, x in enumerate(urls, 1):
+        cache_path = os.path.join(cache_dir, x.split("/")[-1].split("?")[0])
+        if os.path.exists(cache_path):
+            with open(cache_path) as fp:
+                scripts.append(fp.read())
+            continue
+
+        print(f"({i:3d} / {len(urls):3d}) {x}")
+        rep = client.get(x)
+        rep.raise_for_status()
+
+        with open(cache_path, "w") as fp:
+            fp.write(rep.text)
+        scripts.append(rep.text)
 
-# for ??? check urls:
-# https://twitter.com/SpaceX/status/1719132541632864696/likes
-# https://twitter.com/i/lists/1494877848087187461
+    return scripts
+
+
+all_pairs = {}
+for txt in get_scripts():
+    pairs = re.findall(r'queryId:"(.+?)".+?operationName:"(.+?)"', txt)
+    pairs = {op_name: op_id for op_id, op_name in pairs}
+
+    for k, v in pairs.items():
+        if k in all_pairs and v != all_pairs[k]:
+            print(f"DIFF: {k} = {v} != {all_pairs[k]}")
+
+        all_pairs[k] = v
+
+
+for k, v in all_pairs.items():
+    print(f'OP_{k} = "{v}/{k}"')
+
+print("-" * 40)
+
+for x in ops:
+    print(f'OP_{x} = "{all_pairs.get(x, "???")}/{x}"')
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-tests/mocked-data/* binary merge
		+tests/mocked-data/* binary merge