Commit 854ca6a: Initial commit
dchrastil authored Mar 15, 2017 (1 parent: e077a1e)
Showing 3 changed files with 244 additions and 0 deletions.
138 changes: 138 additions & 0 deletions TTSL.py
@@ -0,0 +1,138 @@
#!/usr/bin/python

__title__ = "TTSL - Tool to Scrape LinkedIn"
__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "A recon tool that allows you to scrape profile search results from LinkedIn"
__disclaimer__ = "This tool violates TOS of LinkedIn.com. For educational purposes only. Use at your own risk"
__version__ = '2.0'

import sys
import math
import json
import urllib
import argparse
import subprocess
import requests
import xlsxwriter
reload(sys)
sys.setdefaultencoding('utf-8')

""" Setup Argument Parameters """
parser = argparse.ArgumentParser(description='Scrape LinkedIn profile search results')
parser.add_argument('-u', '--keywords', help='Keywords to search')
parser.add_argument('-o', '--output', help='Output file (do not include extension)')
args = parser.parse_args()
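# Example invocation (hypothetical values):
#   ./TTSL.py -u "python developer" -o results
# This writes results.xlsx with a 'dataset' and a 'report' worksheet.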

def get_search():
    # Fetch the initial page to get results/page counts
    url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=0" % search
    headers = {'Csrf-Token': 'ajax:7736867257193100830'}
    cookies['JSESSIONID'] = 'ajax:7736867257193100830'
    r = requests.get(url, cookies=cookies, headers=headers)
    content = json.loads(r.text)
    data_total = content['elements'][0]['total']

    # Calculate the number of pages at 40 results/page, rounding up
    # so a partial final page is still fetched
    pages = int(math.ceil(data_total / 40.0))
    if pages == 0:
        pages = 1

print "[Info] %i Results Found" % data_total
if data_total > 1000:
pages = 24
print "[Notice] LinkedIn only allows 1000 results. Refine keywords to capture all data"
print "[Info] Fetching %i Pages" % pages
print

    # Set record position for XLSX
    recordpos = 1

    for p in range(pages):
        # Request results for each page using the start offset
        url = "https://www.linkedin.com/voyager/api/search/cluster?count=40&guides=List()&keywords=%s&origin=GLOBAL_SEARCH_HEADER&q=guided&searchId=1489295486936&start=%i" % (search, p*40)
        r = requests.get(url, cookies=cookies, headers=headers)
        content = json.loads(r.text.encode('UTF-8'))
        print "[Info] Fetching page %i with %i results" % (p+1, len(content['elements'][0]['elements']))
for c in content['elements'][0]['elements']:
if c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['headless'] == False:
try:
data_industry = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['industry']
except:
data_industry = ""
data_firstname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['firstName']
data_lastname = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['lastName']
data_slug = "https://www.linkedin.com/in/%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['publicIdentifier']
data_occupation = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['occupation']
data_location = c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['location']
try:
data_picture = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" % c['hitInfo']['com.linkedin.voyager.search.SearchProfile']['miniProfile']['picture']['com.linkedin.voyager.common.MediaProcessorImage']['id']
except:
print "[Notice] No picture found for %s %s, %s" % (data_firstname, data_lastname, data_occupation)
data_picture = ""

# Write data to XLSX file
worksheet1.write('A%i' % recordpos, data_firstname)
worksheet1.write('B%i' % recordpos, data_lastname)
worksheet1.write('C%i' % recordpos, data_occupation)
worksheet1.write('D%i' % recordpos, data_location)
worksheet1.write('E%i' % recordpos, data_industry)
worksheet1.write('F%i' % recordpos, data_slug)
worksheet1.write('G%i' % recordpos, data_picture)
worksheet2.write('A%i' % recordpos, '=IMAGE(dataset!G%i)' % recordpos)
worksheet2.write('B%i' % recordpos, '=dataset!A%i&" "&dataset!B%i&"\n"&dataset!C%i' % (recordpos,recordpos,recordpos))
worksheet2.write('C%i' % recordpos, '=HYPERLINK(dataset!F%i)' % recordpos)
worksheet2.set_row(recordpos-1,125)
# Increment Record Position
recordpos = recordpos + 1
else:
print "[Notice] Headless profile found. Skipping"
print

def authenticate():
    try:
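        # TTSL_login.py authenticates with urllib2 and prints the li_at
        # session cookie to stdout (see TTSL_login.py below)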
        session = subprocess.Popen(['python', 'TTSL_login.py'], stdout=subprocess.PIPE).communicate()[0].replace("\n", "")
        print "[Info] Obtained new session: %s" % session
        cookies = dict(li_at=session)
    except Exception, e:
        sys.exit("[Fatal] Could not authenticate to LinkedIn. %s" % e)
    return cookies

if __name__ == '__main__':
    title = """
 __ __| __ __| __|  |
    |      |  \__ \  |
   _|     _|  ____/ ____|
        tool to scrape linkedin v2.0
"""
    print title.decode('UTF-8')

    # Prompt user for data variables
    search = args.keywords if args.keywords != None else raw_input("Enter search keywords (use quotes for more precise results)\n")
    outfile = args.output if args.output != None else raw_input("Enter filename for output (exclude file extension)\n")
    print

    # URL-encode the keywords for the querystring
    search = urllib.quote_plus(search)
    cookies = authenticate()

    # Initialize the XLSX file
    workbook = xlsxwriter.Workbook('%s.xlsx' % outfile)
    worksheet1 = workbook.add_worksheet('dataset')
    worksheet2 = workbook.add_worksheet('report')
    worksheet2.set_column(0, 0, 25)
    worksheet2.set_column(1, 2, 75)

    # Start scraping
    get_search()

    # Close the XLSX file
    workbook.close()
83 changes: 83 additions & 0 deletions TTSL_login.py
@@ -0,0 +1,83 @@
#!/usr/bin/python

__author__ = 'Danny Chrastil'
__email__ = '[email protected]'
__description__ = "Python Requests doesn't handle LinkedIn authentication well. This uses urllib2 instead"
__version__ = '0.2'

import cookielib
import os
import sys
import urllib
import urllib2
import config
from bs4 import BeautifulSoup

def linkedIn():
    global opener
    cookie_filename = "cookies.txt"

    # Simulate a browser with cookies enabled
    cj = cookielib.MozillaCookieJar(cookie_filename)
    if os.access(cookie_filename, os.F_OK):
        cj.load()

    # Load proxy settings
    if len(config.proxylist) > 0:
        print "[Status] Setting up proxy (%s)" % config.proxylist[0]
        proxy_handler = urllib2.ProxyHandler({'https': config.proxylist[0]})
        opener = urllib2.build_opener(
            proxy_handler,
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )
    else:
        opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )

    # Get CSRF token
    html = loadPage("https://www.linkedin.com/")
    soup = BeautifulSoup(html, "html.parser")
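    # loginCsrfParam is embedded as a hidden input in the homepage login form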
    csrf = soup.find(id="loginCsrfParam-login")['value']
    # Authenticate
    login_data = urllib.urlencode({
        'session_key': config.linkedin['username'],
        'session_password': config.linkedin['password'],
        'loginCsrfParam': csrf,
    })
    html = loadPage("https://www.linkedin.com/uas/login-submit", login_data)
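    # Print the li_at session cookie so TTSL.py's authenticate() can
    # capture it from stdout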
    print cj._cookies['.www.linkedin.com']['/']['li_at'].value
    cj.save()
    os.remove(cookie_filename)

def loadPage(url, data=None):
    try:
        if data is not None:
            response = opener.open(url, data)
        else:
            response = opener.open(url)
        return ''.join(response.readlines())
    except:
        # If the page fails to load for ANY reason, bail out rather than
        # retrying; repeated failures usually mean a temporary IP block
        print "\n[Fatal] Could not load page. Your IP may have been temporarily blocked"
        sys.exit(0)

linkedIn()

23 changes: 23 additions & 0 deletions config.py
@@ -0,0 +1,23 @@
#!/usr/bin/python

## [LINKEDIN CREDENTIALS] ##
# it may be preferable to use a fake
# account to avoid account suspension

linkedin = dict(
username = '',
password = '',
)

## [PROXY LIST] ##
# Leave empty to use your own IP address.
# Using a proxy helps you avoid being
# blocked for sending too much traffic.

proxylist = []
#proxylist.append('http://127.0.0.1:8080')

## [MISCELLANEOUS] ##

timeout = 10  # seconds (intended request timeout; not yet referenced by the scripts)
