-
-
Notifications
You must be signed in to change notification settings - Fork 442
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding multiscrape python based performer scraper. (#594)
- Loading branch information
1 parent
5d7e03d
commit 4674d55
Showing
2 changed files
with
270 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,253 @@ | ||
|
||
import json | ||
import requests | ||
import sys | ||
|
||
|
||
class multiscrape:
    """Aggregate performer metadata from several stash performer scrapers.

    For every performer field listed in ``config`` the scrapers are tried in
    their configured preference order; the first scraper that yields a value
    for the field wins. Each scraper's full result is cached per performer so
    a scraper runs at most once.
    """

    # GraphQL endpoint of the local stash instance this script talks to.
    url = "http://localhost:9999/graphql"
    headers = {
        "Accept-Encoding": "gzip, deflate, br",
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Connection": "keep-alive",
        "DNT": "1"
    }

    # Update the below config in the preferred order for each field.
    # If a scraper has no result for a performer/field, the next scraper in
    # the list is consulted; full scraper results are cached along the way.
    config = {
        "gender": ['stash-sqlite'],
        "url": ['Babepedia', 'stash-sqlite', 'FreeonesCommunity', 'Brazzers', 'Pornhub'],
        "twitter": ['Babepedia', 'stash-sqlite'],
        "instagram": ['Babepedia'],
        "birthdate": ['IMBD', 'FreeonesCommunity', 'Babepedia', 'stash-sqlite'],
        "ethnicity": ['Babepedia', 'stash-sqlite'],
        "country": ['Babepedia', 'stash-sqlite'],
        "eye_color": ['Babepedia', 'stash-sqlite'],
        "height": ['Babepedia', 'Pornhub', 'stash-sqlite'],
        "measurements": ['Babepedia', 'Pornhub', 'FreeonesCommunity', 'stash-sqlite'],
        "fake_tits": ['Babepedia', 'stash-sqlite'],
        "career_length": ['Pornhub', 'Babepedia', 'stash-sqlite'],
        "tattoos": ['Babepedia', 'stash-sqlite'],
        "piercings": ['Babepedia', 'stash-sqlite'],
        "aliases": ['Babepedia', 'stash-sqlite'],
        "tags": ['Babepedia'],
        "details": ['FreeonesCommunity', 'Babepedia', 'Brazzers'],
        "death_date": ['Babepedia'],
        "hair_color": ['Babepedia'],
        "weight": ['Babepedia', 'FreeonesCommunity'],
        "image": ['performer-image-dir', 'Babepedia', 'FreeonesCommunity']
    }

    def __log(self, levelChar, s):
        """Write *s* to stderr framed with stash's plugin-log level markers.

        FIX: the original called an undefined ``self.__prefix`` and passed
        ``bytes`` level characters, so every log call raised. The prefix is
        the \\x01<level>\\x02 control-character framing stash's log parser
        expects.
        """
        if levelChar == "":
            return
        print("\x01" + levelChar + "\x02" + s + "\n", file=sys.stderr, flush=True)

    def trace(self, s):
        """Log *s* at trace level."""
        self.__log("t", s)

    def debug(self, s):
        """Log *s* at debug level."""
        self.__log("d", s)

    def info(self, s):
        """Log *s* at info level."""
        self.__log("i", s)

    def warning(self, s):
        """Log *s* at warning level."""
        self.__log("w", s)

    def error(self, s):
        """Log *s* at error level."""
        self.__log("e", s)

    def __callGraphQL(self, query, variables=None):
        """POST *query* (with optional *variables*) to stash's GraphQL API.

        Returns the ``data`` dict of the response, or None when the response
        carries no data. Raises Exception on HTTP failure or GraphQL errors.
        """
        # NOTE: renamed from `json` — the original shadowed the json module.
        body = {'query': query}
        if variables is not None:
            body['variables'] = variables

        # handle cookies
        response = requests.post(self.url, json=body, headers=self.headers)

        if response.status_code == 200:
            result = response.json()
            # Per the GraphQL spec failures arrive in an "errors" list;
            # the original read a non-existent "error" key and never
            # surfaced server-side errors.
            if result.get("errors", None):
                for error in result["errors"]:
                    raise Exception("GraphQL error: {}".format(error))
            if result.get("data", None):
                return result.get("data")
        else:
            raise Exception(
                "GraphQL query failed:{} - {}. Query: {}. Variables: {}".format(
                    response.status_code, response.content, query, variables))

    def list_scrapers(self, type):
        """Return ids of performer scrapers that support scrape type *type*.

        *type* is a value of stash's ``ScrapeType`` enum (e.g. ``NAME``).
        (Parameter name kept for interface compatibility even though it
        shadows the builtin.)
        """
        query = """query listPerformerScrapers {
    listPerformerScrapers {
      id
      name
      performer{
        supported_scrapes
      }
    }
  }"""
        ret = []
        result = self.__callGraphQL(query)
        # FIX: the original indexed the result with scene-scraper keys
        # ("listSceneScrapers"/"scene") although the query requests
        # performer scrapers, so this method always raised KeyError.
        for r in result["listPerformerScrapers"]:
            if type in r["performer"]["supported_scrapes"]:
                ret.append(r["id"])
        return ret

    def scrape_performer_list(self, scraper_id, performer):
        """Run a by-name search on one scraper.

        Returns the list of scraped performer dicts, or None when the
        GraphQL call yielded no data.
        """
        query = """query scrapePerformerList($scraper_id: ID!, $performer: String!) {
    scrapePerformerList(scraper_id: $scraper_id, query: $performer) {
      name
      url
      gender
      birthdate
      ethnicity
      country
      eye_color
      height
      measurements
      fake_tits
      career_length
      tattoos
      piercings
      aliases
      image
    }
  }"""

        variables = {'scraper_id': scraper_id, 'performer': performer}
        result = self.__callGraphQL(query, variables)
        if result is not None:
            return result["scrapePerformerList"]
        return None

    def scrape_performer(self, scraper_id, performer):
        """Run a full fragment scrape of *performer* on one scraper.

        *performer* is a ``ScrapedPerformerInput`` dict (at minimum
        ``{"name": ...}``). Returns the scraped performer dict.
        """
        query = """query scrapePerformer($scraper_id: ID!, $performer: ScrapedPerformerInput!) {
    scrapePerformer(scraper_id: $scraper_id, scraped_performer: $performer) {
      name
      url
      gender
      birthdate
      ethnicity
      country
      eye_color
      height
      measurements
      fake_tits
      career_length
      tattoos
      piercings
      aliases
      image
    }
  }"""
        variables = {'scraper_id': scraper_id, 'performer': performer}
        result = self.__callGraphQL(query, variables)
        return result["scrapePerformer"]

    def requred_scrapers(self):
        """Return every scraper named in ``config``, de-duplicated, in
        first-use order.

        (Method name keeps the original spelling for compatibility with
        existing callers.)
        """
        scrapers = []
        for field_scrapers in self.config.values():
            for s in field_scrapers:
                if s not in scrapers:
                    scrapers.append(s)
        return scrapers

    def query_performers(self, name):
        """Search all configured scrapers for *name*.

        Returns the union of every scraper's search results, de-duplicated
        by performer name (first scraper's entry wins).
        """
        ret = []

        for scraper in self.requred_scrapers():
            print("Querying performers " + scraper, file=sys.stderr)
            tmp = self.scrape_performer_list(scraper, name)
            if tmp is not None:
                for s in tmp:
                    # keep only the first result seen for each name
                    if not any(s["name"] == t["name"] for t in ret):
                        ret.append(s)
        return ret

    def fetch_performer(self, name):
        """Build a combined performer dict for *name*.

        Each field is filled from the first scraper in its configured
        preference list that returns it; scraper results are cached so each
        scraper runs at most once per performer.
        """
        ret = {"name": name}

        scraper_cache = {}

        for field in self.config.keys():
            found = False
            for s in self.config[field]:
                if s in scraper_cache.keys():
                    if field in scraper_cache[s]:
                        ret[field] = scraper_cache[s][field]
                        print("updating field from cache using scraper: " + s + " for field: " + field, file=sys.stderr)
                        found = True
                if s not in scraper_cache.keys() and not found:
                    print("Running scraper: " + s + " " + field, file=sys.stderr)
                    spl = self.scrape_performer_list(s, name)
                    if spl is not None:
                        for spli in spl:
                            if spli["name"].lower() == name.lower():
                                r = self.scrape_performer(s, {"name": spli["name"]})
                                if r is not None:
                                    scraper_cache[s] = r
                                    found = True
                                    break
                        if found:
                            print("Saving results from scraper: " + field + " " + s, file=sys.stderr)
                            if field in scraper_cache[s]:
                                ret[field] = scraper_cache[s][field]
                            else:
                                found = False
                    else:
                        # remember that this scraper returned nothing so it
                        # is not queried again for later fields
                        scraper_cache[s] = {}
                if found:
                    # FIX: stop at the first scraper that supplied the field;
                    # the original kept iterating and let lower-preference
                    # cached scrapers overwrite the preferred value.
                    break
        return ret
|
||
|
||
|
||
# Script entry point: stash invokes this file with a single mode argument
# ("query", "fetch" or "test") and, for query/fetch, a JSON fragment on stdin.
# FIX: guard sys.argv so running without an argument no longer raises
# IndexError.
mode = sys.argv[1] if len(sys.argv) > 1 else ""

if mode == "query":
    # by-name search: print the de-duplicated result list as JSON on stdout
    fragment = json.loads(sys.stdin.read())
    print("input: " + json.dumps(fragment), file=sys.stderr)
    scraper = multiscrape()
    result = scraper.query_performers(fragment['name'])
    if not result:
        print(f"Could not determine details for performer: `{fragment['name']}`", file=sys.stderr)
        print("{}")
    else:
        print(json.dumps(result))

if mode == "fetch":
    # full fragment scrape: print the combined performer dict as JSON
    fragment = json.loads(sys.stdin.read())
    print("input: " + json.dumps(fragment), file=sys.stderr)
    scraper = multiscrape()
    result = scraper.fetch_performer(fragment['name'])
    if not result:
        print(f"Could not determine details for performer: `{fragment['name']}`", file=sys.stderr)
        print("{}")
    else:
        # (dead `True` statement from the original removed)
        print(json.dumps(result))

if mode == "test":
    # debugging helper: list the scrapers the current config requires
    scraper = multiscrape()
    scrapers = scraper.requred_scrapers()
    print(scrapers)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Stash scraper definition for the aggregating "multiscrape" performer
# scraper. Both actions shell out to multiscrape.py with a mode argument;
# the script reads a JSON fragment on stdin and writes JSON on stdout.
name: multiscrape

# Full fragment scrape of a selected performer (`python multiscrape.py fetch`).
performerByFragment:
  action: script
  script:
    - python
    - multiscrape.py
    - fetch

# By-name search across all configured scrapers (`python multiscrape.py query`).
performerByName:
  action: script
  script:
    - python
    - multiscrape.py
    - query

# Last Updated June 9, 2021