Showing 3 changed files with 244 additions and 0 deletions.
@@ -0,0 +1,138 @@
#!/usr/bin/python

__title__ = "TTSL - Tool to Scrape LinkedIn"
__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "A recon tool that allows you to scrape profile search results from LinkedIn"
__disclaimer__ = "This tool violates the TOS of LinkedIn.com. For educational purposes only. Use at your own risk"
__version__ = '2.0'

import sys
import re
import time
import xlsxwriter
import json
import argparse
import requests
import subprocess
import urllib
import math
from thready import threaded

# Py2 hack: force UTF-8 as the default encoding so profile text
# with non-ASCII characters doesn't raise UnicodeDecodeError
reload(sys)
sys.setdefaultencoding('utf-8')

""" Setup Argument Parameters """
parser = argparse.ArgumentParser(description='Discovery LinkedIn')
parser.add_argument('-u', '--keywords', help='Keywords to search')
parser.add_argument('-o', '--output', help='Output file (do not include extensions)')
args = parser.parse_args()

def get_search():
    # Fetch the initial page to get the total result count
    url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=0" % search
    headers = {'Csrf-Token': 'ajax:7736867257193100830'}
    cookies['JSESSIONID'] = 'ajax:7736867257193100830'
    r = requests.get(url, cookies=cookies, headers=headers)
    content = json.loads(r.text)
    data_total = content['elements'][0]['total']

    # Calculate the page count at 40 results/page, rounding up so a
    # partially filled final page is still fetched
    pages = int(math.ceil(data_total / 40.0))

    print "[Info] %i Results Found" % data_total
    if data_total > 1000:
        # LinkedIn caps search results at 1000, i.e. 25 pages of 40
        pages = 25
        print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data"
    print "[Info] Fetching %i Pages" % pages

    # Set record position for XLSX
    recordpos = 1

    for p in range(pages):
        # Request results for each page using the start offset
        url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i" % (search, p * 40)
        r = requests.get(url, cookies=cookies, headers=headers)
        content = json.loads(r.text.encode('UTF-8'))
        print "[Info] Fetching page %i with %i results" % (p, len(content['elements'][0]['elements']))
        for c in content['elements'][0]['elements']:
            profile = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']
            if profile['headless'] == False:
                # Not every profile carries an industry; default to empty
                try:
                    data_industry = profile['industry']
                except KeyError:
                    data_industry = ""
                data_firstname = profile['miniProfile']['firstName']
                data_lastname = profile['miniProfile']['lastName']
                data_slug = "https://www.linkedin.com/in/%s" % profile['miniProfile']['publicIdentifier']
                data_occupation = profile['miniProfile']['occupation']
                data_location = profile['location']
                try:
                    data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % profile['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
                except KeyError:
                    print "[Notice] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
                    data_picture = ""
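                # Minimal sketch of the Voyager response shape consumed above, derived
                # from the key accesses in this loop (field values are illustrative):
                #   {"elements": [{"total": 312,
                #     "elements": [{"hitInfo": {"com.linkedin.voyager.search.SearchProfile": {
                #       "headless": false, "industry": "...", "location": "...",
                #       "miniProfile": {"firstName": "...", "lastName": "...", "occupation": "...",
                #         "publicIdentifier": "...",
                #         "picture": {"com.linkedin.voyager.common.MediaProcessorImage": {"id": "..."}}}}}}]}]}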

                # Write raw data to the 'dataset' sheet
                worksheet1.write('A%i' % recordpos, data_firstname)
                worksheet1.write('B%i' % recordpos, data_lastname)
                worksheet1.write('C%i' % recordpos, data_occupation)
                worksheet1.write('D%i' % recordpos, data_location)
                worksheet1.write('E%i' % recordpos, data_industry)
                worksheet1.write('F%i' % recordpos, data_slug)
                worksheet1.write('G%i' % recordpos, data_picture)
                # Build the 'report' sheet from formulas referencing the dataset
                worksheet2.write('A%i' % recordpos, '=IMAGE(dataset!G%i)' % recordpos)
                worksheet2.write('B%i' % recordpos, '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i' % (recordpos, recordpos, recordpos))
                worksheet2.write('C%i' % recordpos, '=HYPERLINK(dataset!F%i)' % recordpos)
                worksheet2.set_row(recordpos - 1, 125)
                # Increment record position
                recordpos = recordpos + 1
            else:
                print "[Notice] Headless profile found. Skipping"
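# For reference, the workbook produced by get_search() (as written above):
#   'dataset' sheet: A=first name, B=last name, C=occupation, D=location,
#                    E=industry, F=profile URL, G=picture URL
#   'report' sheet:  A=IMAGE() of the picture, B=name plus occupation, C=HYPERLINK() to the profile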

def authenticate():
    try:
        # TTSL_login.py prints the li_at session cookie to stdout; capture it
        session = subprocess.Popen(['python', 'TTSL_login.py'], stdout=subprocess.PIPE).communicate()[0].replace("\n", "")
        print "[Info] Obtained new session: %s" % session
        cookies = dict(li_at=session)
    except Exception, e:
        sys.exit("[Fatal] Could not authenticate to LinkedIn. %s" % e)
    return cookies

if __name__ == '__main__':
    title = """
     __ __| __ __|  __|  |
        |      |  \__ \  |
       _|     _|  ____/ ____|
       tool to scrape linkedin v2.0
    """
    print title.decode('UTF-8')

    # Prompt user for data variables
    search = args.keywords if args.keywords != None else raw_input("Enter search keywords (use quotes for more precise results)\n")
    outfile = args.output if args.output != None else raw_input("Enter filename for output (exclude file extension)\n")

    # URL-encode the keywords for the querystring
    search = urllib.quote_plus(search)
    cookies = authenticate()

    # Initiate XLSX file
    workbook = xlsxwriter.Workbook('%s.xlsx' % outfile)
    worksheet1 = workbook.add_worksheet('dataset')
    worksheet2 = workbook.add_worksheet('report')
    worksheet2.set_column(0, 0, 25)
    worksheet2.set_column(1, 2, 75)

    # Initialize scraping
    get_search()

    # Close XLSX file
    workbook.close()
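
# Usage sketch (assuming this main script is saved as TTSL.py alongside
# TTSL_login.py and config.py; the keywords and counts are illustrative):
#   $ python TTSL.py -u "security engineer" -o results
#   [Info] Obtained new session: <li_at value>
#   [Info] 312 Results Found
#   [Info] Fetching 8 Pages
#   ...writes results.xlsx with 'dataset' and 'report' sheets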
@@ -0,0 +1,83 @@
#!/usr/bin/python

__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "Python Requests doesn't handle LinkedIn authentication well. This uses urllib instead"
__version__ = '0.2'

import cookielib
import os
import urllib
import urllib2
import re
import string
import sys
import config
from bs4 import BeautifulSoup

def linkedIn():
    global opener
    cookie_filename = "cookies.txt"

    # Simulate browser with cookies enabled
    cj = cookielib.MozillaCookieJar(cookie_filename)
    if os.access(cookie_filename, os.F_OK):
        cj.load()

    # Load proxy settings
    if len(config.proxylist) > 0:
        print "[Status] Setting up proxy (%s)" % config.proxylist[0]
        proxy_handler = urllib2.ProxyHandler({'https': config.proxylist[0]})
        opener = urllib2.build_opener(
            proxy_handler,
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )
    else:
        opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )

    # Get the CSRF token from the login page
    html = loadPage("https://www.linkedin.com/")
    soup = BeautifulSoup(html, "html.parser")
    csrf = soup.find(id="loginCsrfParam-login")['value']

    # Authenticate
    login_data = urllib.urlencode({
        'session_key': config.linkedin['username'],
        'session_password': config.linkedin['password'],
        'loginCsrfParam': csrf,
    })
    html = loadPage("https://www.linkedin.com/uas/login-submit", login_data)
    soup = BeautifulSoup(html, "html.parser")
    # Print the li_at session cookie so the calling script can capture it from stdout
    print cj._cookies['.www.linkedin.com']['/']['li_at'].value
    cj.save()
    os.remove(cookie_filename)

def loadPage(url, data=None):
    try:
        if data is not None:
            response = opener.open(url, data)
        else:
            response = opener.open(url)
        return ''.join(response.readlines())
    except:
        # Quick and dirty catch-all: if the URL fails to load for ANY
        # reason, assume the worst and bail out
        print "\n[Fatal] Your IP may have been temporarily blocked"
        sys.exit(0)

linkedIn()
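# Run standalone to verify credentials; on success the script prints the li_at
# cookie value. Note (an observation, not documented behavior): if a proxy is
# configured, the "[Status]" line above is also printed, which would pollute
# the value the main script captures from stdout.
#   $ python TTSL_login.py
#   AQEDAR...  (illustrative token)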
@@ -0,0 +1,23 @@
#!/usr/bin/python

## [LINKEDIN CREDENTIALS] ##
# It may be preferable to use a fake
# account to avoid account suspension

linkedin = dict(
    username = '',
    password = '',
)

## [PROXY LIST] ##
# Leave empty to use your own IP address.
# By using a proxy you can avoid being
# blocked for sending too much traffic

proxylist = []
#proxylist.append('http://127.0.0.1:8080')

## [MISCELLANEOUS] ##

# Request timeout in seconds (not referenced by the scripts in this commit)
timeout = 10