concurrent requests (breaking)

vishalmhjn · Oct 17, 2023 · b3948b3 · b3948b3
1 parent 7f87877
commit b3948b3
Showing 1 changed file with 51 additions and 37 deletions.
diff --git a/src/call_semanticscholar.py b/src/call_semanticscholar.py
@@ -1,53 +1,67 @@
+import aiohttp
+import asyncio
 import requests
 import sys
 import time
 import pandas as pd
 from random import choice
+from tqdm import tqdm
+
+# Pool of User-Agent strings that random_headers() draws from.
+# Add your own agent here; to find it, visit
+# https://www.whatismybrowser.com/detect/what-is-my-user-agent/
+desktop_agents = [""]
+# HTTPS so DOIs and responses are not sent in cleartext.
+BASE_API_URL = "https://api.semanticscholar.org/v1/paper/"
 
-# add you desktop agent here. for this go to: https://www.whatismybrowser.com/detect/what-is-my-user-agent/
-# and copy your agent here
-# Example ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36']
-desktop_agents = ['']
 
-
 def random_headers():
-    return {'User-Agent': choice(desktop_agents),'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
+    """Return request headers with a User-Agent picked at random from desktop_agents."""
+    accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
+    headers = {"User-Agent": choice(desktop_agents)}
+    headers["Accept"] = accept
+    return headers
 
-def call_api(doi):
 
-    search_url = "http://api.semanticscholar.org/v1/paper/" + \
-        doi+"?include_unknown_references=true"
+async def call_api_async(session, doi):
+    """Fetch the record for one DOI and return the decoded JSON body.
+
+    Args:
+        session: object whose ``get`` is used as an async context manager
+            (the caller in this file passes an aiohttp.ClientSession).
+        doi: DOI string appended to BASE_API_URL.
+
+    Returns:
+        Whatever ``response.json()`` decodes from the reply.
+    """
+    query = "?include_unknown_references=true"
+    url = BASE_API_URL + doi + query
 
-    resp = requests.get(search_url, headers=random_headers())
-    content = resp.json()
-    return content
+    request_headers = random_headers()
 
+    async with session.get(url, headers=request_headers) as resp:
+        return await resp.json()
 
-if __name__ == '__main__':
-
-    df = pd.read_csv('../data/'+sys.argv[1])
-    filename = sys.argv[2]
-
-    print(len(df))
-    df = df[df.doi != "No Doi"]
-    print(len(df))
-    list_doi = list(df['doi'])
+
+async def fetch_articles_async(df):
+    """Fetch abstract and topics for every DOI in ``df["doi"]`` concurrently.
+
+    Args:
+        df: DataFrame with a "doi" column.
+
+    Returns:
+        Tuple ``(list_abstracts, list_topics)``, each in the same order as
+        ``df["doi"]``. A DOI whose request failed, or whose response lacks
+        the "abstract"/"topics" keys, gets the string "None" in both lists.
+    """
+    # One overall 10-minute budget for the session's requests.
+    timeout = aiohttp.ClientTimeout(total=10 * 60)
+    # At most 5 simultaneous connections.
+    connector = aiohttp.TCPConnector(limit=5)
+
+    list_doi = list(df["doi"])
     list_abstracts = []
     list_topics = []
-    i = 0
-    for doi in list_doi:
-        i = i+1
-        print(i)
-        try:
-            content = call_api(doi)
-            list_abstracts.append(content['abstract'])
-            list_topics.append(content['topics'])
-        except Exception as e:
-            print(e)
-            list_abstracts.append("None")
-            list_topics.append("None")
-        time.sleep(2)
-    df['abstract'] = list_abstracts
-    df['topics'] = list_topics
-    df.to_csv('../data/abstracts_'+filename+'.csv', index=None)
 
+    async with aiohttp.ClientSession(
+        connector=connector, headers=random_headers(), timeout=timeout
+    ) as session:
+        tasks = [call_api_async(session, doi) for doi in list_doi]
+        # return_exceptions=True: a failed request comes back as an exception
+        # object in `results` instead of aborting the whole batch.
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    for doi, content in zip(list_doi, results):
+        # `content` is either the decoded JSON or the exception raised for
+        # this DOI; only a dict carrying both keys is usable.
+        if isinstance(content, dict) and {"abstract", "topics"} <= content.keys():
+            list_abstracts.append(content["abstract"])
+            list_topics.append(content["topics"])
+        else:
+            print(f"Failed to fetch {doi}: {content!r}")
+            list_abstracts.append("None")
+            list_topics.append("None")
+
+    return list_abstracts, list_topics
+
+
+if __name__ == "__main__":
+    # Usage: python call_semanticscholar.py <path/to/articles.csv>
+    # Local import: only the script entry point needs it.
+    from pathlib import Path
+
+    if len(sys.argv) < 2:
+        sys.exit("usage: call_semanticscholar.py <input_csv>")
+
+    input_path = sys.argv[1]
+    df = pd.read_csv(input_path)
+
+    print(f"Total articles: {len(df)}")
+
+    # Rows without a DOI cannot be looked up. .copy() makes the filtered
+    # frame independent so the column assignments below do not write to a
+    # slice of the original.
+    df = df[df.doi != "No Doi"].copy()
+    print(f"Articles with DOI: {len(df)}")
+
+    # asyncio.run creates the event loop, runs the coroutine and closes the
+    # loop; asyncio.get_event_loop() without a running loop is deprecated.
+    list_abstracts, list_topics = asyncio.run(fetch_articles_async(df))
+
+    df["abstract"] = list_abstracts
+    df["topics"] = list_topics
+
+    # Path.stem drops the directory and the final extension whatever their
+    # separator or length.
+    output_file = "../data/abstracts_" + Path(input_path).stem + ".csv"
+    df.to_csv(output_file, index=None)