concurrent requests (breaking)
vishalmhjn committed Oct 17, 2023
1 parent 7f87877 commit b3948b3
Showing 1 changed file with 51 additions and 37 deletions.
88 changes: 51 additions & 37 deletions src/call_semanticscholar.py
@@ -1,53 +1,67 @@
+import aiohttp
+import asyncio
 import requests
 import sys
 import time
 import pandas as pd
 from random import choice
 from tqdm import tqdm

-# add your desktop agent here. For this, go to: https://www.whatismybrowser.com/detect/what-is-my-user-agent/
-# and copy your agent here
-# Example ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36']
-desktop_agents = ['']
+desktop_agents = [""]
+BASE_API_URL = "http://api.semanticscholar.org/v1/paper/"


 def random_headers():
-    return {'User-Agent': choice(desktop_agents),'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
+    return {
+        "User-Agent": choice(desktop_agents),
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    }

-def call_api(doi):
+
+async def call_api_async(session, doi):
+    search_url = BASE_API_URL + doi + "?include_unknown_references=true"

-    search_url = "http://api.semanticscholar.org/v1/paper/" + \
-        doi+"?include_unknown_references=true"
+    headers = random_headers()

-    resp = requests.get(search_url, headers=random_headers())
-    content = resp.json()
-    return content
+    async with session.get(search_url, headers=headers) as response:
+        content = await response.json()
+        return content

-if __name__ == '__main__':
-
-    df = pd.read_csv('../data/'+sys.argv[1])
-    filename = sys.argv[2]
-
-    print(len(df))
-    df = df[df.doi != "No Doi"]
-    print(len(df))
-    list_doi = list(df['doi'])
-
+
+async def fetch_articles_async(df):
+    timeout = aiohttp.ClientTimeout(total=10 * 60)
+    connector = aiohttp.TCPConnector(limit=5)
+
+    list_doi = list(df["doi"])
     list_abstracts = []
     list_topics = []
-    i = 0
-    for doi in list_doi:
-        i = i+1
-        print(i)
-        try:
-            content = call_api(doi)
-            list_abstracts.append(content['abstract'])
-            list_topics.append(content['topics'])
-        except Exception as e:
-            print(e)
-            list_abstracts.append("None")
-            list_topics.append("None")
-        time.sleep(2)
-    df['abstract'] = list_abstracts
-    df['topics'] = list_topics
-    df.to_csv('../data/abstracts_'+filename+'.csv', index=None)
+
+    async with aiohttp.ClientSession(
+        connector=connector, headers=random_headers(), timeout=timeout
+    ) as session:
+        tasks = [call_api_async(session, doi) for doi in list_doi]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    for content in results:
+        list_abstracts.append(content["abstract"])
+        list_topics.append(content["topics"])
+
+    return list_abstracts, list_topics
+
+
+if __name__ == "__main__":
+    df = pd.read_csv(sys.argv[1])
+
+    print(f"Total articles: {len(df)}")
+
+    df = df[df.doi != "No Doi"]
+    print(f"Articles with abstracts: {len(df)}")
+
+    loop = asyncio.get_event_loop()
+    list_abstracts, list_topics = loop.run_until_complete(fetch_articles_async(df))
+
+    df["abstract"] = list_abstracts
+    df["topics"] = list_topics
+
+    output_file = "../data/abstracts_" + sys.argv[1].split("/")[-1][:-4] + ".csv"
+    df.to_csv(output_file, index=None)
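A note on the gathered results: with return_exceptions=True, asyncio.gather() returns the exception object itself in place of a response for any request that failed, so the loop that indexes content["abstract"] raises a TypeError as soon as one DOI errors out (the pre-commit loop caught failures and appended "None" instead). A minimal defensive sketch, not part of this commit, reusing random_headers and call_api_async from this file; fetch_articles_safely is a hypothetical name:

    import asyncio
    import aiohttp

    async def fetch_articles_safely(list_doi):
        # Same timeout and connection limit as fetch_articles_async above.
        timeout = aiohttp.ClientTimeout(total=10 * 60)
        connector = aiohttp.TCPConnector(limit=5)
        list_abstracts, list_topics = [], []

        async with aiohttp.ClientSession(
            connector=connector, headers=random_headers(), timeout=timeout
        ) as session:
            tasks = [call_api_async(session, doi) for doi in list_doi]
            results = await asyncio.gather(*tasks, return_exceptions=True)

        for content in results:
            if isinstance(content, Exception):
                # A failed request arrives as the exception object itself.
                print(content)
                list_abstracts.append("None")  # placeholder the old loop used
                list_topics.append("None")
            else:
                list_abstracts.append(content.get("abstract"))
                list_topics.append(content.get("topics"))

        return list_abstracts, list_topics

On Python 3.7+ this could be driven with asyncio.run(fetch_articles_safely(list_doi)) rather than the get_event_loop()/run_until_complete pair used in the commit, which newer Python versions discourage.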
