"""Multiscrape: a performer scraper that merges the results of other scrapers.

For every performer field the scrapers listed in ``multiscrape.config`` are
tried in order through the GraphQL endpoint at ``multiscrape.url``; the first
scraper that yields a non-empty value for the field wins.

Command line modes (first argument):

* ``query`` - read a JSON fragment from stdin and print the candidate
  performers found by all configured scrapers.
* ``fetch`` - read a JSON fragment from stdin and print one merged performer.
* ``test``  - print the list of scraper ids referenced by the configuration.

Companion scraper definition (``scrapers/multiscrape.yml``)::

    name: multiscrape

    performerByFragment:
      action: script
      script:
        - python
        - multiscrape.py
        - fetch

    performerByName:
      action: script
      script:
        - python
        - multiscrape.py
        - query

    # Last Updated June 9, 2021
"""

import json
import sys


class multiscrape:
    """Combine several performer scrapers into one, field by field."""

    url = "http://localhost:9999/graphql"
    headers = {
        "Accept-Encoding": "gzip, deflate, br",
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Connection": "keep-alive",
        "DNT": "1"
    }

    # Update the config below with the preferred scraper order for each field.
    # If a scraper has no result for a performer and field, the next scraper
    # in the list is used. Scraper results are cached for the duration of one
    # fetch so each scraper runs at most once per performer.
    # NOTE(review): 'IMBD' may be a typo for an IMDb scraper id - confirm
    # against the ids of the installed scrapers.
    config = {
        "gender": ['stash-sqlite'],
        "url": ['Babepedia', 'stash-sqlite', 'FreeonesCommunity', 'Brazzers', 'Pornhub'],
        "twitter": ['Babepedia', 'stash-sqlite'],
        "instagram": ['Babepedia'],
        "birthdate": ['IMBD', 'FreeonesCommunity', 'Babepedia', 'stash-sqlite'],
        "ethnicity": ['Babepedia', 'stash-sqlite'],
        "country": ['Babepedia', 'stash-sqlite'],
        "eye_color": ['Babepedia', 'stash-sqlite'],
        "height": ['Babepedia', 'Pornhub', 'stash-sqlite'],
        "measurements": ['Babepedia', 'Pornhub', 'FreeonesCommunity', 'stash-sqlite'],
        "fake_tits": ['Babepedia', 'stash-sqlite'],
        "career_length": ['Pornhub', 'Babepedia', 'stash-sqlite'],
        "tattoos": ['Babepedia', 'stash-sqlite'],
        "piercings": ['Babepedia', 'stash-sqlite'],
        "aliases": ['Babepedia', 'stash-sqlite'],
        "tags": ['Babepedia'],
        "details": ['FreeonesCommunity', 'Babepedia', 'Brazzers'],
        "death_date": ['Babepedia'],
        "hair_color": ['Babepedia'],
        "weight": ['Babepedia', 'FreeonesCommunity'],
        "image": ['performer-image-dir', 'Babepedia', 'FreeonesCommunity']
    }

    # Selection set shared by both performer queries (GraphQL treats any
    # whitespace as a field separator).
    _PERFORMER_FIELDS = (
        "name url gender twitter instagram birthdate ethnicity country "
        "eye_color height measurements fake_tits career_length tattoos "
        "piercings aliases image"
    )

    def __prefix(self, levelChar):
        """Return the log-level marker: SOH + level character + STX."""
        return (b"\x01" + levelChar + b"\x02").decode()

    def __log(self, levelChar, s):
        """Write *s* to stderr, prefixed with the marker for *levelChar*."""
        if not levelChar:
            return

        print(self.__prefix(levelChar) + s + "\n", file=sys.stderr, flush=True)

    def trace(self, s):
        """Log *s* at trace level."""
        self.__log(b't', s)

    def debug(self, s):
        """Log *s* at debug level."""
        self.__log(b'd', s)

    def info(self, s):
        """Log *s* at info level."""
        self.__log(b'i', s)

    def warning(self, s):
        """Log *s* at warning level."""
        self.__log(b'w', s)

    def error(self, s):
        """Log *s* at error level."""
        self.__log(b'e', s)

    def __callGraphQL(self, query, variables=None):
        """POST a GraphQL request and return its ``data`` member.

        Returns ``None`` when the response carries no (or empty) data.
        Raises ``Exception`` when the HTTP status is not 200 or when the
        response contains an ``error`` object.
        """
        # Imported here so the module (and its ``test`` mode) can be loaded
        # without the HTTP dependency being installed.
        import requests

        payload = {"query": query}
        if variables is not None:
            payload["variables"] = variables

        response = requests.post(self.url, json=payload, headers=self.headers)
        if response.status_code != 200:
            raise Exception(
                "GraphQL query failed:{} - {}. Query: {}. Variables: {}".format(
                    response.status_code, response.content, query, variables))

        result = response.json()
        if result.get("error"):
            for error in result["error"]["errors"]:
                raise Exception("GraphQL error: {}".format(error))
        # A failing sub-scraper is reported in the top-level "errors" list.
        # Log it instead of raising so the remaining scrapers still run.
        for error in result.get("errors") or []:
            self.warning("GraphQL error: {}".format(error))
        return result.get("data") or None

    def list_scrapers(self, type):
        """Return the ids of performer scrapers that support scrape *type*."""
        query = """query listPerformerScrapers {
            listPerformerScrapers {
                id
                name
                performer {
                    supported_scrapes
                }
            }
        }"""
        result = self.__callGraphQL(query) or {}
        return [
            r["id"]
            for r in result.get("listPerformerScrapers") or []
            if type in ((r.get("performer") or {}).get("supported_scrapes") or [])
        ]

    def scrape_performer_list(self, scraper_id, performer):
        """Search *scraper_id* for performers matching the name *performer*.

        Returns the list of candidate dicts, or ``None`` when the server
        returned no data.
        """
        query = (
            "query scrapePerformerList($scraper_id: ID!, $performer: String!) {"
            " scrapePerformerList(scraper_id: $scraper_id, query: $performer) { "
            + self._PERFORMER_FIELDS + " } }"
        )
        variables = {'scraper_id': scraper_id, 'performer': performer}
        result = self.__callGraphQL(query, variables)
        if result is not None:
            return result["scrapePerformerList"]
        return None

    def scrape_performer(self, scraper_id, performer):
        """Scrape full details for the *performer* fragment with *scraper_id*.

        Returns the scraped performer dict, or ``None`` when the server
        returned no data.
        """
        query = (
            "query scrapePerformer($scraper_id: ID!, $performer: ScrapedPerformerInput!) {"
            " scrapePerformer(scraper_id: $scraper_id, scraped_performer: $performer) { "
            + self._PERFORMER_FIELDS + " } }"
        )
        variables = {'scraper_id': scraper_id, 'performer': performer}
        result = self.__callGraphQL(query, variables)
        if result is None:
            return None
        return result.get("scrapePerformer")

    def requred_scrapers(self):
        """Return every scraper id used in ``config``, in first-seen order."""
        return list(dict.fromkeys(
            s for scrapers in self.config.values() for s in scrapers))

    # Correctly spelled alias; the original name is kept for existing callers.
    required_scrapers = requred_scrapers

    def query_performers(self, name):
        """Return candidates for *name* from all scrapers, unique by name."""
        ret = []
        seen = set()
        for scraper in self.requred_scrapers():
            print("Querying performers " + scraper, file=sys.stderr)
            for candidate in self.scrape_performer_list(scraper, name) or []:
                if candidate["name"] not in seen:
                    seen.add(candidate["name"])
                    ret.append(candidate)
        return ret

    def _scrape_exact_match(self, scraper_id, name):
        """Scrape the candidate whose name equals *name* (case-insensitive).

        Returns the scraped dict, or ``{}`` when the scraper has no usable
        match.
        """
        wanted = name.lower()
        for candidate in self.scrape_performer_list(scraper_id, name) or []:
            if (candidate.get("name") or "").lower() != wanted:
                continue
            details = self.scrape_performer(scraper_id, {"name": candidate["name"]})
            if details is not None:
                return details
        return {}

    def fetch_performer(self, name):
        """Build one performer dict for *name* following ``config`` priority.

        For each field the scrapers are consulted in configured order and the
        first non-empty value is kept. Each scraper is run lazily and at most
        once; its result (or ``{}`` on failure) is cached for later fields.
        """
        ret = {"name": name}
        scraper_cache = {}

        for field, scrapers in self.config.items():
            for s in scrapers:
                if s in scraper_cache:
                    source = "cache"
                else:
                    print("Running scraper: " + s + " " + field, file=sys.stderr)
                    scraper_cache[s] = self._scrape_exact_match(s, name)
                    source = "scraper"

                value = scraper_cache[s].get(field)
                if not value:
                    # Missing, None or empty: fall through to the next scraper.
                    continue

                ret[field] = value
                if source == "cache":
                    print("updating field from cache using scraper: " + s + " for field: " + field, file=sys.stderr)
                else:
                    print("Saving results from scraper: " + field + " " + s, file=sys.stderr)
                # Stop here so a lower-priority scraper cannot overwrite it.
                break
        return ret


def main(argv=None):
    """Run the mode named by ``argv[1]``; return the process exit code."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2 or argv[1] not in ("query", "fetch", "test"):
        print("usage: multiscrape.py {query|fetch|test}", file=sys.stderr)
        return 1

    mode = argv[1]
    scraper = multiscrape()

    if mode == "test":
        print(scraper.requred_scrapers())
        return 0

    fragment = json.loads(sys.stdin.read())
    print("input: " + json.dumps(fragment), file=sys.stderr)
    if mode == "query":
        result = scraper.query_performers(fragment['name'])
    else:
        result = scraper.fetch_performer(fragment['name'])

    if not result:
        print(f"Could not determine details for performer: `{fragment['name']}`", file=sys.stderr)
        print("{}")
    else:
        print(json.dumps(result))
    return 0


if __name__ == "__main__":
    sys.exit(main())