# arxiv_rss.py (forked from cjlee112/spnet)
import datetime
import os
import pickle
import re
import sys
import time

import feedparser
a_re = re.compile(r"""<a.*?>(.*?)</a>""", re.DOTALL)
p_re = re.compile(r"""<p>(.*?)</p>""", re.DOTALL)
title_re = re.compile(r"""(.*?)\.\s\(arXiv:(.*?)\[(.*?)\].*?\)""", re.DOTALL)
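
# Illustrative example of what title_re extracts; the feed title shown here is
# an assumption based on the regex itself, not taken from the arXiv docs:
#
#     >>> title_re.findall("Some Paper Title. (arXiv:1311.1234v1 [q-bio.PE])")
#     [('Some Paper Title', '1311.1234v1 ', 'q-bio.PE')]
#
# i.e. the three groups are the paper title, the arXiv id (with a trailing
# space, stripped below), and the topic area.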
arxiv_categories = [c.strip() for c in "astro-ph, cond-mat, cs, gr-qc, hep-ex, hep-lat, hep-ph, hep-th, math, math-ph, nlin, nucl-ex, nucl-th, physics, q-bio, q-fin, quant-ph, stat".split(',')]
cache_filename = "arxiv.pickle"
#def load_categories(filename="arxiv_categories.txt"):
#    categories = []
#    with open(filename) as handle:
#        for line in handle:
#            categories.append(line.strip())
#    return categories

def rss_gen(categories=None, base_url="http://export.arxiv.org/rss/", verbose=False):
    """Generator for arXiv RSS feeds: yields one dict per paper."""
    #attribute_map = {'title': 'title', 'url': 'link', 'author': 'authors', }
    if not categories:
        categories = arxiv_categories
    for category in categories:
        url = base_url + category
        feed = feedparser.parse(url)
        if verbose:
            print "Downloading category %s: %s" % (category, url)
        num_entries = len(feed['entries'])
        for feed_entry in feed['entries']:
            paper = dict()
            paper['url'] = feed_entry['links'][0]['href'].strip()
            paper['authors'] = tuple(a_re.findall(feed_entry['author']))
            m = p_re.findall(feed_entry['summary'])
            paper['abstract'] = m[0].strip()
            m = title_re.findall(feed_entry['title'])[0]
            paper['title'] = m[0].strip()
            paper['arxiv-topic-area'] = m[2].strip()
            paper['id'] = m[1].strip()
            paper['year'] = datetime.datetime.now().year
            yield paper
        if verbose:
            print "  Received %s entries. Sleeping for 20 seconds as requested in robots.txt" % str(num_entries)
        time.sleep(20)  # robots.txt asks for a 20-second delay between requests
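
# Example usage of rss_gen (illustrative sketch; needs network access to
# export.arxiv.org and waits 20 seconds between categories):
#
#     for paper in rss_gen(categories=['q-bio'], verbose=True):
#         print paper['title'], paper['id']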

def load_cache(filename=cache_filename):
    """Return the list of papers stored in the pickle cache."""
    if not os.path.isfile(filename):
        print "Cache file %s does not exist. You must invoke the feed downloader at least once with get_papers(download=True)" % filename
        sys.exit(1)
    with open(filename, 'rb') as cache_file:
        papers = pickle.load(cache_file)
    return papers

def append_to_cache(papers, filename=cache_filename):
    """Add newly downloaded papers to the cache, making sure not to add duplicates. May not scale well for a large number of papers."""
    hashable_papers = set()
    if os.path.isfile(filename):
        with open(filename, 'rb') as cache_file:
            cached_papers = pickle.load(cache_file)
        for paper in cached_papers:
            hashable_papers.add(frozenset(paper.items()))
    for paper in papers:
        hashable_papers.add(frozenset(paper.items()))
    new_cache = [dict(paper) for paper in hashable_papers]
    with open(filename, 'wb') as cache_file:
        pickle.dump(new_cache, cache_file)
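
# Illustrative sketch of the de-duplication above: each paper dict is converted
# to a frozenset of its (key, value) pairs, which is hashable, so adding it to
# a set silently drops exact duplicates. For example:
#
#     >>> a = {'id': '1311.1234v1', 'title': 'Some Paper Title'}
#     >>> b = dict(a)
#     >>> frozenset(a.items()) == frozenset(b.items())
#     True
#
# This only works because every value in the paper dict is hashable (note that
# rss_gen stores the authors as a tuple, not a list).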

def get_papers(download=False, verbose=False, filename=cache_filename):
    """Return the papers in the cache. If download=True, fetch new papers from the feeds and add them to the cache first."""
    if not download:
        return load_cache(filename)
    all_papers = []
    for paper in rss_gen(verbose=verbose):
        all_papers.append(paper)
    append_to_cache(all_papers, filename)
    if verbose:
        print "Papers cached in %s." % filename
    return load_cache(filename)
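
# Example usage of get_papers (illustrative; the first call needs network
# access, later calls read arxiv.pickle):
#
#     papers = get_papers(download=True, verbose=True)   # populate the cache
#     papers = get_papers()                               # read from the cache
#     q_bio = [p for p in papers if p['arxiv-topic-area'].startswith('q-bio')]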

if __name__ == '__main__':
    if os.path.isfile(cache_filename):
        papers = load_cache()
    else:
        print "No cache file. Downloading papers from the arXiv."
        papers = get_papers(download=True, verbose=True)
    print "Cache contains %s papers." % len(papers)