Commit 854ca6a: Initial commit
dchrastil authored Mar 15, 2017 (1 parent: e077a1e)
Showing 3 changed files with 244 additions and 0 deletions.
138 changes: 138 additions & 0 deletions TTSL.py
@@ -0,0 +1,138 @@
#!/usr/bin/python

__title__ = "TTSL - Tool to Scrape LinkedIn"
__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "A recon tool that allows you to scrape profile search results from LinkedIn"
__disclaimer__ = "This tool violates TOS of LinkedIn.com. For educational purposes only. Use at your own risk"
__version__ = '2.0'

import sys
import math
import json
import urllib
import argparse
import subprocess
import requests
import xlsxwriter
reload(sys)
sys.setdefaultencoding('utf-8')

""" Setup Argument Parameters """
parser = argparse.ArgumentParser(description='Scrape LinkedIn profile search results')
parser.add_argument('-u', '--keywords', help='Keywords to search')
parser.add_argument('-o', '--output', help='Output file (do not include extension)')
args = parser.parse_args()
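# Example invocation (hypothetical values):
#   ./TTSL.py -u "python developer" -o results
# This writes results.xlsx with a 'dataset' and a 'report' worksheet.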

def get_search():
    # Fetch the initial page to get results/page counts
    url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=0" % search
    headers = {'Csrf-Token': 'ajax:7736867257193100830'}
    cookies['JSESSIONID'] = 'ajax:7736867257193100830'
    r = requests.get(url, cookies=cookies, headers=headers)
    content = json.loads(r.text)
    data_total = content['elements'][0]['total']

    # Calculate the number of pages at 40 results/page, rounding up
    # so a partial final page is still fetched
    pages = int(math.ceil(data_total / 40.0))
    if pages == 0:
        pages = 1

print "[Info] %i Results Found" % data_total
if data_total > 1000:
pages = 24
print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data"
print "[Info] Fetching %i Pages" % pages
print

    # Set record position for XLSX
    recordpos = 1

    for p in range(pages):
        # Request results for each page using the start offset
        url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i" % (search, p*40)
        r = requests.get(url, cookies=cookies, headers=headers)
        content = json.loads(r.text.encode('UTF-8'))
        print "[Info] Fetching page %i with %i results" % (p+1, len(content['elements'][0]['elements']))
for c in content['elements'][0]['elements']:
if c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['headless'] == False:
try:
data_industry = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['industry']
except:
data_industry = ""
data_firstname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['firstName']
data_lastname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['lastName']
data_slug = "https://www.linkedin.com/in/%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['publicIdentifier']
data_occupation = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['occupation']
data_location = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['location']
try:
data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
except:
print "[Notice] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
data_picture = ""

# Write data to XLSX file
worksheet1.write('A%i' % recordpos, data_firstname)
worksheet1.write('B%i' % recordpos, data_lastname)
worksheet1.write('C%i' % recordpos, data_occupation)
worksheet1.write('D%i' % recordpos, data_location)
worksheet1.write('E%i' % recordpos, data_industry)
worksheet1.write('F%i' % recordpos, data_slug)
worksheet1.write('G%i' % recordpos, data_picture)
worksheet2.write('A%i' % recordpos, '=IMAGE(dataset!G%i)' % recordpos)
worksheet2.write('B%i' % recordpos, '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i' % (recordpos,recordpos,recordpos))
worksheet2.write('C%i' % recordpos, '=HYPERLINK(dataset!F%i)' % recordpos)
worksheet2.set_row(recordpos-1,125)
# Increment Record Position
recordpos = recordpos + 1
else:
print "[Notice] Headless profile found. Skipping"
print

def authenticate():
    try:
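        # TTSL_login.py authenticates with urllib2 and prints the li_at
        # session cookie to stdout (see TTSL_login.py below)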
        session = subprocess.Popen(['python', 'TTSL_login.py'], stdout=subprocess.PIPE).communicate()[0].replace("\n", "")
        print "[Info] Obtained new session: %s" % session
        cookies = dict(li_at=session)
    except Exception, e:
        sys.exit("[Fatal] Could not authenticate to LinkedIn. %s" % e)
    return cookies

if __name__ == '__main__':
    title = """
 __ __| __ __| __|  |
    |      |  \__ \  |
   _|     _|  ____/ ____|
        tool to scrape linkedin v2.0
"""
    print title.decode('UTF-8')

    # Prompt user for data variables
    search = args.keywords if args.keywords != None else raw_input("Enter search keywords (use quotes for more precise results)\n")
    outfile = args.output if args.output != None else raw_input("Enter filename for output (exclude file extension)\n")
    print

    # URL-encode the keywords for the querystring
    search = urllib.quote_plus(search)
    cookies = authenticate()

    # Initialize the XLSX file
    workbook = xlsxwriter.Workbook('%s.xlsx' % outfile)
    worksheet1 = workbook.add_worksheet('dataset')
    worksheet2 = workbook.add_worksheet('report')
    worksheet2.set_column(0, 0, 25)
    worksheet2.set_column(1, 2, 75)

    # Start scraping
    get_search()

    # Close the XLSX file
    workbook.close()
83 changes: 83 additions & 0 deletions TTSL_login.py
@@ -0,0 +1,83 @@
#!/usr/bin/python

__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "Python Requests doesn't handle LinkedIn authentication well. This uses urllib2 instead"
__version__ = '0.2'

import cookielib
import os
import sys
import urllib
import urllib2
import config
from bs4 import BeautifulSoup

def linkedIn():
    global opener
    cookie_filename = "cookies.txt"

    # Simulate a browser with cookies enabled
    cj = cookielib.MozillaCookieJar(cookie_filename)
    if os.access(cookie_filename, os.F_OK):
        cj.load()

    # Load proxy settings
    if len(config.proxylist) > 0:
        print "[Status] Setting up proxy (%s)" % config.proxylist[0]
        proxy_handler = urllib2.ProxyHandler({'https': config.proxylist[0]})
        opener = urllib2.build_opener(
            proxy_handler,
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )
    else:
        opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )

    # Get CSRF token
    html = loadPage("https://www.linkedin.com/")
    soup = BeautifulSoup(html, "html.parser")
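    # loginCsrfParam is embedded as a hidden input in the homepage login form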
    csrf = soup.find(id="loginCsrfParam-login")['value']
    # Authenticate
    login_data = urllib.urlencode({
        'session_key': config.linkedin['username'],
        'session_password': config.linkedin['password'],
        'loginCsrfParam': csrf,
    })
    html = loadPage("https://www.linkedin.com/uas/login-submit", login_data)
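    # Print the li_at session cookie so TTSL.py's authenticate() can
    # capture it from stdout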
    print cj._cookies['.www.linkedin.com']['/']['li_at'].value
    cj.save()
    os.remove(cookie_filename)

def loadPage(url, data=None):
    try:
        if data is not None:
            response = opener.open(url, data)
        else:
            response = opener.open(url)
        return ''.join(response.readlines())
    except:
        # If the page fails to load for ANY reason, bail out rather than
        # retrying; repeated failures usually mean a temporary IP block
        print "\n[Fatal] Could not load page. Your IP may have been temporarily blocked"
        sys.exit(0)

linkedIn()

23 changes: 23 additions & 0 deletions config.py
@@ -0,0 +1,23 @@
#!/usr/bin/python

## [LINKEDIN CREDENTIALS] ##
# it may be preferable to use a fake
# account to avoid account suspension

linkedin = dict(
username = '',
password = '',
)

## [PROXY LIST] ##
# Leave empty to use your own IP address.
# Using a proxy helps you avoid being
# blocked for sending too much traffic.

proxylist = []
#proxylist.append('http://127.0.0.1:8080')

## [MISCELLANEOUS] ##

timeout = 10  # seconds (intended request timeout; not yet referenced by the scripts)
