GoogleScholarCrawler.py
from selenium import webdriver
import pandas as pd
import numpy as np
import time
import datetime
import subprocess
import re
import os
import requests
import sys
import json
import pdb
author = "Nico Zheng"
email = "nico921113[at]gmail.com"
# keywords = ["online communities"] # search keywords
# journals = ['information systems research', 'mis quarterly', 'journal of management information systems',
# 'journal of the association for information systems', 'management science', 'operational research',
# 'information & management', "decision support systems", "european journal of information systems"]
# fpath = "/Users/Nico/test/test_googlecrawer2" # output file folder
# chromedriver_path = "" # modify this if you need to use local chromedriver
alias = {'information systems research': "ISR",
         'mis quarterly': 'MISQ',
         'journal of management information systems': "JMIS",
         'journal of the association for information systems': "JAIS",
         'management science': 'MS',
         'operational research': "OR",
         "information & management": "I&M",
         "decision support systems": "DSS",
         "european journal of information systems": "EJIS"}
'''
crawl google scholar search results and save the pdf if the file is available.
requirements:
- selenium `pip install selenium`
- chromedriver (e.g. via `brew install --cask chromedriver`)
- requests `pip install requests`
- openpyxl `pip install openpyxl`
will create local file folders based on keywords and journals as follows:
$ tree test_googlecrawer -d
test_googlecrawer
└── wikipedia
    ├── information systems research
    ├── journal of management information systems
    ├── journal of the association for information systems
    ├── management science
    ├── mis quarterly
    └── operational research
in each folder, there is a log file recording the detailed search results (author, title, journal, year and whether a pdf is available)
as well as the available pdf files.
each pdf file is renamed as:
    author-year-title-journal.pdf
'''
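# An example config file passed as the first command-line argument (keys taken
# from the __main__ block below; the values here are illustrative only):
# {
#     "keywords": ["online communities"],
#     "journals": ["information systems research", "mis quarterly"],
#     "fpath": "/Users/Nico/test/test_googlecrawer2",
#     "chromedriver_path": ""
# }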
## global functions
def downloadPdf(output, link):
    '''
    write pdf file based on link address.
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(link, headers=headers)
    with open(output, 'wb') as f:
        f.write(response.content)
def parse(infobox):
    infobox = infobox.lower().split("-")
    infobox = [c.strip() for c in infobox]
    author = infobox[0].split(",")[0].split(" ")[1]  # extract last name
    journal = infobox[1].split(",")[0]  # extract journal; "..." may be in it, otherwise we need to get the bibtex directly, which is much more time-consuming
    year = infobox[1].split(",")[1].strip()  # extract year
    return author, journal, year
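# For example, assuming the infobox string has the usual "authors - venue, year - site" layout:
#   parse("J Smith, A Lee - mis quarterly, 2018 - example.com")
# returns ("smith", "mis quarterly", "2018").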
class Article:
    def __init__(self, keywords, target_journal, folder):
        self.keywords = keywords  # search keywords
        self.target_journal = target_journal  # searched journal
        self.output_folder = folder  # output folder
        self.createFolder()
        self.total_articles = {}

    def createFolder(self):
        self.output_fpath = "/".join([self.output_folder, self.keywords, self.target_journal])
        if not os.path.exists(self.output_fpath):
            os.makedirs(self.output_fpath)  # create output folder if there is not one.
            print('creating folder {0}'.format(self.output_fpath))
    def getInfo(self, article, driver):
        default = {"title": "NA", "author": "NA", "journal": "NA", "year": "NA", "log": "NA", "citation": "NA"}
        a = article.find_element_by_css_selector('a')
        title = a.text
        default['title'] = title
        return default

    def getPdf(self, article, driver):
        pdf_link = "NA"
        try:
            tmp = article.find_element_by_css_selector("div[class=gs_or_ggsm]")
            pdf_link = tmp.find_element_by_tag_name("a").get_attribute("href")
            if not pdf_link.endswith('.pdf'):
                pdf_link = "NA"
        except Exception:
            self.info['log'] = "pdf missing"
        return pdf_link
    def getFileName(self, alias=alias):
        by = ["author", "year", "title", "journal"]
        if alias:
            if self.info['journal'] in alias.keys():
                self.info['journal-short'] = alias[self.info['journal']]
                by = ["author", "year", "title", "journal-short"]
        if len(self.info['title'].split(" ")) > 10:
            self.info['title-short'] = " ".join(self.info['title'].split(" ")[:10])
            if "journal-short" in self.info.keys():
                by = ["author", "year", "title-short", "journal-short"]
            else:
                by = ["author", "year", "title-short", "journal"]
        filename = "-".join([self.info[c] for c in by]) + ".pdf"
        return filename
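    # Filenames follow the author-year-title-journal.pdf pattern described in the
    # module docstring: the journal name is shortened via the alias table when
    # possible (e.g. 'mis quarterly' -> 'MISQ') and titles longer than 10 words
    # are truncated to their first 10 words.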
    def fit(self, article, driver, num):
        default = {"title": "NA", "author": "NA", "journal": "NA", "year": "NA", "log": "NA", "citation": "NA"}
        try:
            self.info = self.getInfo(article, driver)
        except Exception:
            self.info = default
        if self.info is not default:
            self.pdf = self.getPdf(article, driver)
            self.filename = self.getFileName()
            self.info['filename'] = self.filename
            output = self.output_fpath + "/" + self.filename
            if self.pdf != "NA":
                try:
                    downloadPdf(output, self.pdf)
                except Exception:
                    self.info['log'] = self.info['log'] + "||| pdf download error"
        self.total_articles[num] = self.info
def run(keywords, journals, recursive=30):
    '''
    search based on keyword and journal combinations.
    recursive is the maximum number of result pages requested per combination.
    '''
    if not chromedriver_path:
        driver = webdriver.Chrome()
    else:
        driver = webdriver.Chrome(chromedriver_path)
    driver.get('https://xueshu.baidu.com/')
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    for i in keywords:
        for j in journals:
            cnt = 1
            page = 1
            articles = Article(i, j, fpath)
            search_keyword = "{0} source: '{1}'".format(i, j)  ## generate search keyword like "wikipedia source: 'mis quarterly'"
            print("current search key: {0}".format(search_keyword))
            input_element = driver.find_element_by_id("kw")
            input_element.clear()
            input_element.send_keys(search_keyword)
            input_element.submit()
            while page <= recursive:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                elements = driver.find_elements_by_css_selector('h3[class="t c_font"]')  # find article boxes
                for e in elements:
                    try:
                        articles.fit(e, driver, cnt)
                    except Exception:
                        print("page {} number {} parse error!".format(page, cnt))
                    cnt += 1
                try:
                    driver.find_element_by_css_selector("i[class=c-icon-pager-next]").click()
                    print("next page")
                    page += 1
                except Exception:
                    print("error next page!")
                    break  # break the loop if we can't find the next page
            log = pd.DataFrame(articles.total_articles).T  # generate log files
            now = datetime.datetime.now()
            log.to_excel(os.path.join(articles.output_fpath, "logfile_{}.xlsx".format(now.strftime("%m-%d-%Y"))))
    driver.quit()
if __name__ == '__main__':
    with open(sys.argv[1]) as f:
        config = json.load(f)  # read config
    keywords = config['keywords']
    journals = config['journals']
    fpath = config['fpath']
    chromedriver_path = config['chromedriver_path']
    run(keywords, journals, recursive=50)
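# Usage (the config filename is just an example; any path to a JSON file with the
# keys shown above will work):
#   python GoogleScholarCrawler.py config.json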