From db57b55657cf66c0f1d37d9025aa8abffdcba79a Mon Sep 17 00:00:00 2001 From: SirCumAlot1988 Date: Fri, 30 Dec 2022 23:54:09 +0100 Subject: [PATCH 1/6] Revision of Boobpedia scraper Fixes the following problems: -Performer Image not scraped anymore due to minor changes in the website -Most of the metadata is not scraped anymore due to minor changes in the website (Birthdate, Country, Ethnicity, Nationality, Eye Color, Height, Weight, Fake Tits, Career Length, Twitter Instagram) -Birthdate not scraped properly in some cases -Hair Color not scraped properly in some cases -Measurements not scraped properly in some cases -Gender defaults to female now -Some cosmetic corrections to career length and details --- scrapers/Boobpedia.yml | 71 ++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/scrapers/Boobpedia.yml b/scrapers/Boobpedia.yml index c5ad462fa..11c955543 100644 --- a/scrapers/Boobpedia.yml +++ b/scrapers/Boobpedia.yml @@ -22,19 +22,24 @@ xPathScrapers: with: "https://www.boobpedia.com" performerScraper: performer: - Name: //h1 - Twitter: //table//tr/td/b/a[text()='Twitter']/@href - Instagram: //table//tr/td/b/a[text()='Instagram']/@href + Name: //h1 + Gender: + fixed: "Female" + Twitter: //table//tr/td/b[text()='Twitter']/../following-sibling::td//@href + Instagram: //table//tr/td/b[text()='Instagram']/../following-sibling::td//@href Birthdate: - selector: //table//tr/td//b[text()='Born:']/../following-sibling::td/a + selector: //table//tr/td//b[text()='Born']/../following-sibling::td/a concat: " " postProcess: + - replace: + - regex: (.*\d\d\d\d).* + with: $1 - parseDate: January 2 2006 - Ethnicity: //table//tr/td/b[text()='Ethnicity:']/../following-sibling::td/a - Country: //table//tr/td/b[text()='Nationality:']/../following-sibling::td/a - EyeColor: //table//tr/td/b[text()='Eye color:']/../following-sibling::td + Ethnicity: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a + Country: 
//table//tr/td/b[text()='Nationality']/../following-sibling::td/a + EyeColor: //table//tr/td/b[text()='Eye color']/../following-sibling::td Height: - selector: //table//tr/td/b[text()='Height:']/../following-sibling::td + selector: //table//tr/td/b[text()='Height']/../following-sibling::td postProcess: - replace: - regex: (?:.+\D)?(\d+\.\d+)\Dm.+ @@ -42,28 +47,47 @@ xPathScrapers: - regex: \. with: "" Weight: - selector: //table//tr/td/b[text()='Weight:']/../following-sibling::td + selector: //table//tr/td/b[text()='Weight']/../following-sibling::td postProcess: - replace: - regex: (?:.+\D)?(\d+)\Dkg.+ with: $1 Measurements: - selector: //table//tr/td/b[text()='Measurements:']/../following-sibling::td|//table//tr/td[contains(b,'cup')]/following-sibling::td + selector: //table//tr/td/b[text()='Measurements']/../following-sibling::td|//table//tr/td[contains(b,'cup')]/following-sibling::td concat: "|" postProcess: - replace: - - regex: (\d+)-(\d+)-(\d+)[^|]+\|(\S+).+ # get measurements + cup - with: $1$4-$2-$3 + - regex: (\d+)-(\d+)-(\d+)[^|]*\|(\S+).+ # get measurements + cup + with: $4-$2-$3 - regex: \|.+$ # fallback to clear non matching regexes with: "" - FakeTits: //table//tr/td/b[text()='Boobs:']/../following-sibling::td/a - HairColor: //table//tr/td[contains(b,'Hair')]/following-sibling::td + - regex: \[\d*\] # Remove References + with: "" + - regex: ( in) # Remove Unit Inches + with: "" + FakeTits: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a + HairColor: + selector: //table//tr/td[contains(b,'Hair')]/following-sibling::td//text() + concat: ", " + postProcess: + - replace: + - regex: (,,) + with: "," + - regex: ( , ) + with: " " # nbsp; screws up the parsing, so use contains instead - CareerLength: //table//tr/td/b[text()[contains(.,'active:')]]/../following-sibling::td + CareerLength: + selector: //table//tr/td/b[text()[contains(.,'active')]]/../following-sibling::td + postProcess: + - replace: + - regex: (present) + with: "Present" + - regex: 
(current) + with: "Present" Aliases: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td Image: #selector: //table[@class="infobox"]//img/@src #alterntive image, no need for subScraper but gets lq image - selector: //table[@class="infobox"]//a[img[@src]]/@href + selector: //table[@class="infobox plainlinks"]//a[img[@src]]/@href postProcess: - replace: - regex: ^ @@ -83,4 +107,17 @@ xPathScrapers: Details: selector: //div[@class="mw-parser-output"]/p concat: "\n\n" -# Last Updated February 01, 2022 + postProcess: + - replace: + # Remove References + - regex: \[\d*\] + with: "" + # Remove and , which appears in the details of some performers (e.g. Jenna Jameson) + - regex: () + with: "" + - regex: () + with: "" + # Remove triple line breaks + - regex: \n\n\n + with: "\n" +# Last Updated December 30, 2022 From a7cdf0f284cd2e0ea44971c427571dd61e230aa0 Mon Sep 17 00:00:00 2001 From: SirCumAlot1988 Date: Wed, 4 Jan 2023 20:31:28 +0100 Subject: [PATCH 2/6] Improvements: -Added regex for removing references to further fields (Ethnicity, Eye Color, Fake Tits, Hair Color, Career Length, Aliases) -Career Length: Maps "Present" and "Current" to empty string -Country: Maps nationality to country -Career Length: Maps em dash to hyphen -Fake Tits: Maps "Enhanced" to "Fake" and "Natural" to "Natural" --- scrapers/Boobpedia.yml | 347 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 337 insertions(+), 10 deletions(-) diff --git a/scrapers/Boobpedia.yml b/scrapers/Boobpedia.yml index 11c955543..f1c46ed16 100644 --- a/scrapers/Boobpedia.yml +++ b/scrapers/Boobpedia.yml @@ -35,9 +35,18 @@ xPathScrapers: - regex: (.*\d\d\d\d).* with: $1 - parseDate: January 2 2006 - Ethnicity: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a - Country: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a - EyeColor: //table//tr/td/b[text()='Eye color']/../following-sibling::td + Ethnicity: + selector: 
//table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a + postProcess: + - replace: + - regex: \[\d*\] + with: "" + EyeColor: + selector: //table//tr/td/b[text()='Eye color']/../following-sibling::td + postProcess: + - replace: + - regex: \[\d*\] + with: "" Height: selector: //table//tr/td/b[text()='Height']/../following-sibling::td postProcess: @@ -57,7 +66,7 @@ xPathScrapers: concat: "|" postProcess: - replace: - - regex: (\d+)-(\d+)-(\d+)[^|]*\|(\S+).+ # get measurements + cup + - regex: (\d+)-(\d+)-(\d+)[^|]*\|(\d+\S+).+ # get measurements + cup with: $4-$2-$3 - regex: \|.+$ # fallback to clear non matching regexes with: "" @@ -65,7 +74,15 @@ xPathScrapers: with: "" - regex: ( in) # Remove Unit Inches with: "" - FakeTits: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a + FakeTits: + selector: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a + postProcess: + - replace: + - regex: \[\d*\] # Remove References + with: "" + - map: + "Enhanced": "Fake" + "Natural": "Natural" HairColor: selector: //table//tr/td[contains(b,'Hair')]/following-sibling::td//text() concat: ", " @@ -75,16 +92,27 @@ xPathScrapers: with: "," - regex: ( , ) with: " " + - regex: \[\d*\] + with: "" # nbsp; screws up the parsing, so use contains instead CareerLength: selector: //table//tr/td/b[text()[contains(.,'active')]]/../following-sibling::td postProcess: - replace: - - regex: (present) - with: "Present" - - regex: (current) - with: "Present" - Aliases: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td + - regex: \[\d*\] # Remove References + with: "" + - regex: (—|–) + with: "-" + - regex: (\S)-(\S) + with: $1 - $2 + - regex: (Present|present|Current|current) + with: "" + Aliases: + selector: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td + postProcess: + - replace: + - regex: \[\d*\] + with: "" Image: #selector: //table[@class="infobox"]//img/@src #alterntive image, no need for subScraper but gets lq image 
selector: //table[@class="infobox plainlinks"]//a[img[@src]]/@href @@ -120,4 +148,303 @@ xPathScrapers: # Remove triple line breaks - regex: \n\n\n with: "\n" + Country: + selector: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a + postProcess: + - map: + "Abkhaz": "Abkhazia" + "Abkhazian": "Abkhazia" + "Afghan": "Afghanistan" + "Albanian": "Albania" + "Algerian": "Algeria" + "American Samoan": "American Samoa" + "American": "United States of America" + "Andorran": "Andorra" + "Angolan": "Angola" + "Anguillan": "Anguilla" + "Antarctic": "Antarctica" + "Antiguan": "Antigua and Barbuda" + "Argentine": "Argentina" + "Argentinian": "Argentina" + "Armenian": "Armenia" + "Aruban": "Aruba" + "Australian": "Australia" + "Austrian": "Austria" + "Azerbaijani": "Azerbaijan" + "Azeri": "Azerbaijan" + "Bahamian": "Bahamas" + "Bahraini": "Bahrain" + "Bangladeshi": "Bangladesh" + "Barbadian": "Barbados" + "Barbudan": "Antigua and Barbuda" + "Basotho": "Lesotho" + "Belarusian": "Belarus" + "Belgian": "Belgium" + "Belizean": "Belize" + "Beninese": "Benin" + "Beninois": "Benin" + "Bermudan": "Bermuda" + "Bermudian": "Bermuda" + "Bhutanese": "Bhutan" + "BIOT": "British Indian Ocean Territory" + "Bissau-Guinean": "Guinea-Bissau" + "Bolivian": "Bolivia" + "Bonaire": "Bonaire" + "Bonairean": "Bonaire" + "Bosnian": "Bosnia and Herzegovina" + "Botswanan": "Botswana" + "Bouvet Island": "Bouvet Island" + "Brazilian": "Brazil" + "British Virgin Island": "Virgin Islands British" + "British": "United Kingdom" + "Bruneian": "Brunei" + "Bulgarian": "Bulgaria" + "Burkinabé": "Burkina Faso" + "Burmese": "Burma" + "Burundian": "Burundi" + "Cabo Verdean": "Cabo Verde" + "Cambodian": "Cambodia" + "Cameroonian": "Cameroon" + "Canadian": "Canada" + "Cantonese": "Hong Kong" + "Caymanian": "Cayman Islands" + "Central African": "Central African Republic" + "Chadian": "Chad" + "Channel Island": "Guernsey" + #Channel Island: "Jersey" + "Chilean": "Chile" + "Chinese": "China" + "Christmas 
Island": "Christmas Island" + "Cocos Island": "Cocos (Keeling) Islands" + "Colombian": "Colombia" + "Comoran": "Comoros" + "Comorian": "Comoros" + "Congolese": "Congo" + "Cook Island": "Cook Islands" + "Costa Rican": "Costa Rica" + "Croatian": "Croatia" + "Cuban": "Cuba" + "Curaçaoan": "Curaçao" + "Cypriot": "Cyprus" + "Czech": "Czech Republic" + "Danish": "Denmark" + "Djiboutian": "Djibouti" + "Dominican": "Dominica" + "Dutch": "Netherlands" + "Ecuadorian": "Ecuador" + "Egyptian": "Egypt" + "Emirati": "United Arab Emirates" + "Emiri": "United Arab Emirates" + "Emirian": "United Arab Emirates" + "English people": "England" + "English": "England" + "Equatoguinean": "Equatorial Guinea" + "Equatorial Guinean": "Equatorial Guinea" + "Eritrean": "Eritrea" + "Estonian": "Estonia" + "Ethiopian": "Ethiopia" + "European": "European Union" + "Falkland Island": "Falkland Islands" + "Faroese": "Faroe Islands" + "Fijian": "Fiji" + "Filipino": "Philippines" + "Finnish": "Finland" + "Formosan": "Taiwan" + "French Guianese": "French Guiana" + "French Polynesian": "French Polynesia" + "French Southern Territories": "French Southern Territories" + "French": "France" + "Futunan": "Wallis and Futuna" + "Gabonese": "Gabon" + "Gambian": "Gambia" + "Georgian": "Georgia" + "German": "Germany" + "Ghanaian": "Ghana" + "Gibraltar": "Gibraltar" + "Greek": "Greece" + "Greenlandic": "Greenland" + "Grenadian": "Grenada" + "Guadeloupe": "Guadeloupe" + "Guamanian": "Guam" + "Guatemalan": "Guatemala" + "Guinean": "Guinea" + "Guyanese": "Guyana" + "Haitian": "Haiti" + "Heard Island": "Heard Island and McDonald Islands" + "Hellenic": "Greece" + "Herzegovinian": "Bosnia and Herzegovina" + "Honduran": "Honduras" + "Hong Kong": "Hong Kong" + "Hong Konger": "Hong Kong" + "Hungarian": "Hungary" + "Icelandic": "Iceland" + "Indian": "India" + "Indonesian": "Indonesia" + "Iranian": "Iran" + "Iraqi": "Iraq" + "Irish": "Ireland" + "Israeli": "Israel" + "Israelite": "Israel" + "Italian": "Italy" + "Ivorian": 
"Ivory Coast" + "Jamaican": "Jamaica" + "Jan Mayen": "Jan Mayen" + "Japanese": "Japan" + "Jordanian": "Jordan" + "Kazakh": "Kazakhstan" + "Kazakhstani": "Kazakhstan" + "Kenyan": "Kenya" + "Kirghiz": "Kyrgyzstan" + "Kirgiz": "Kyrgyzstan" + "Kiribati": "Kiribati" + "Korean": "South Korea" + "Kosovan": "Kosovo" + "Kosovar": "Kosovo" + "Kuwaiti": "Kuwait" + "Kyrgyz": "Kyrgyzstan" + "Kyrgyzstani": "Kyrgyzstan" + "Lao": "Lao People's Democratic Republic" + "Laotian": "Lao People's Democratic Republic" + "Latvian": "Latvia" + "Lebanese": "Lebanon" + "Lettish": "Latvia" + "Liberian": "Liberia" + "Libyan": "Libya" + "Liechtensteiner": "Liechtenstein" + "Lithuanian": "Lithuania" + "Luxembourg": "Luxembourg" + "Luxembourgish": "Luxembourg" + "Macanese": "Macau" + "Macedonian": "North Macedonia" + "Magyar": "Hungary" + "Mahoran": "Mayotte" + "Malagasy": "Madagascar" + "Malawian": "Malawi" + "Malaysian": "Malaysia" + "Maldivian": "Maldives" + "Malian": "Mali" + "Malinese": "Mali" + "Maltese": "Malta" + "Manx": "Isle of Man" + "Marshallese": "Marshall Islands" + "Martinican": "Martinique" + "Martiniquais": "Martinique" + "Mauritanian": "Mauritania" + "Mauritian": "Mauritius" + "McDonald Islands": "Heard Island and McDonald Islands" + "Mexican": "Mexico" + "Moldovan": "Moldova" + "Monacan": "Monaco" + "Mongolian": "Mongolia" + "Montenegrin": "Montenegro" + "Montserratian": "Montserrat" + "Monégasque": "Monaco" + "Moroccan": "Morocco" + "Motswana": "Botswana" + "Mozambican": "Mozambique" + "Myanma": "Myanmar" + "Namibian": "Namibia" + "Nauruan": "Nauru" + "Nepalese": "Nepal" + "Nepali": "Nepal" + "Netherlandic": "Netherlands" + "New Caledonian": "New Caledonia" + "New Zealand": "New Zealand" + "Ni-Vanuatu": "Vanuatu" + "Nicaraguan": "Nicaragua" + "Nigerian": "Nigeria" + "Nigerien": "Niger" + "Niuean": "Niue" + "Norfolk Island": "Norfolk Island" + "Northern Irish": "Northern Ireland" + "Northern Marianan": "Northern Mariana Islands" + "Norwegian": "Norway" + "Omani": "Oman" + 
"Pakistani": "Pakistan" + "Palauan": "Palau" + "Palestinian": "Palestine" + "Panamanian": "Panama" + "Papua New Guinean": "Papua New Guinea" + "Papuan": "Papua New Guinea" + "Paraguayan": "Paraguay" + "Persian": "Iran" + "Peruvian": "Peru" + "Philippine": "Philippines" + "Pitcairn Island": "Pitcairn Islands" + "Polish": "Poland" + "Portuguese": "Portugal" + "Puerto Rican": "Puerto Rico" + "Qatari": "Qatar" + "Romanian": "Romania" + "Russian": "Russia" + "Rwandan": "Rwanda" + "Saba": "Saba" + "Saban": "Saba" + "Sahraouian": "Western Sahara" + "Sahrawi": "Western Sahara" + "Sahrawian": "Western Sahara" + "Salvadoran": "El Salvador" + "Sammarinese": "San Marino" + "Samoan": "Samoa" + "Saudi Arabian": "Saudi Arabia" + "Saudi": "Saudi Arabia" + "Scottish": "Scotland" + "Senegalese": "Senegal" + "Serbian": "Serbia" + "Seychellois": "Seychelles" + "Sierra Leonean": "Sierra Leone" + "Singapore": "Singapore" + "Singaporean": "Singapore" + "Slovak": "Slovakia" + "Slovene": "Slovenia" + "Slovenian": "Slovenia" + "Solomon Island": "Solomon Islands" + "Somali": "Somalia" + "Somalilander": "Somaliland" + "South African": "South Africa" + "South Georgia Island": "South Georgia and the South Sandwich Islands" + "South Ossetian": "South Ossetia" + "South Sandwich Island": "South Georgia and the South Sandwich Islands" + "South Sudanese": "South Sudan" + "Spanish": "Spain" + "Sri Lankan": "Sri Lanka" + "Sudanese": "Sudan" + "Surinamese": "Suriname" + "Svalbard resident": "Svalbard" + "Swati": "Eswatini" + "Swazi": "Eswatini" + "Swedish": "Sweden" + "Swiss": "Switzerland" + "Syrian": "Syrian Arab Republic" + "Taiwanese": "Taiwan" + "Tajikistani": "Tajikistan" + "Tanzanian": "Tanzania" + "Thai": "Thailand" + "Timorese": "Timor-Leste" + "Tobagonian": "Trinidad and Tobago" + "Togolese": "Togo" + "Tokelauan": "Tokelau" + "Tongan": "Tonga" + "Trinidadian": "Trinidad and Tobago" + "Tunisian": "Tunisia" + "Turkish": "Turkey" + "Turkmen": "Turkmenistan" + "Turks and Caicos Island": "Turks 
and Caicos Islands" + "Tuvaluan": "Tuvalu" + "Ugandan": "Uganda" + "Ukrainian": "Ukraine" + "Uruguayan": "Uruguay" + "Uzbek": "Uzbekistan" + "Uzbekistani": "Uzbekistan" + "Vanuatuan": "Vanuatu" + "Vatican": "Vatican City State" + "Venezuelan": "Venezuela" + "Vietnamese": "Vietnam" + "Wallis and Futuna": "Wallis and Futuna" + "Wallisian": "Wallis and Futuna" + "Welsh": "Wales" + "Yemeni": "Yemen" + "Zambian": "Zambia" + "Zimbabwean": "Zimbabwe" + "Åland Island": "Åland Islands" # Last Updated December 30, 2022 From a21b3d6293a8517b4c41d36dc8b894a3f9c5be3b Mon Sep 17 00:00:00 2001 From: bnkai <48220860+bnkai@users.noreply.github.com> Date: Wed, 18 Jan 2023 18:13:24 +0200 Subject: [PATCH 3/6] Update Boobpedia.yml - format - removed fixed Gender as not all performers where female - fixed twitter/instagram selectors - tweaked a couple of regexes --- scrapers/Boobpedia.yml | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/scrapers/Boobpedia.yml b/scrapers/Boobpedia.yml index f1c46ed16..b17f0bf09 100644 --- a/scrapers/Boobpedia.yml +++ b/scrapers/Boobpedia.yml @@ -22,11 +22,9 @@ xPathScrapers: with: "https://www.boobpedia.com" performerScraper: performer: - Name: //h1 - Gender: - fixed: "Female" - Twitter: //table//tr/td/b[text()='Twitter']/../following-sibling::td//@href - Instagram: //table//tr/td/b[text()='Instagram']/../following-sibling::td//@href + Name: //h1 + Twitter: //table//tr/td/a[b[text()='Twitter']]/@href + Instagram: //table//tr/td/a[b[text()='Instagram']]/@href Birthdate: selector: //table//tr/td//b[text()='Born']/../following-sibling::td/a concat: " " @@ -35,8 +33,8 @@ xPathScrapers: - regex: (.*\d\d\d\d).* with: $1 - parseDate: January 2 2006 - Ethnicity: - selector: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a + Ethnicity: + selector: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a postProcess: - replace: - regex: \[\d*\] @@ -74,7 +72,7 @@ xPathScrapers: with: "" - regex: ( in) # 
Remove Unit Inches with: "" - FakeTits: + FakeTits: selector: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a postProcess: - replace: @@ -105,7 +103,7 @@ xPathScrapers: with: "-" - regex: (\S)-(\S) with: $1 - $2 - - regex: (Present|present|Current|current) + - regex: (?i)(present|current) with: "" Aliases: selector: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td @@ -141,14 +139,12 @@ xPathScrapers: - regex: \[\d*\] with: "" # Remove and , which appears in the details of some performers (e.g. Jenna Jameson) - - regex: () - with: "" - - regex: () + - regex: with: "" # Remove triple line breaks - regex: \n\n\n with: "\n" - Country: + Country: selector: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a postProcess: - map: From 2acc4ee8ad4795d5dafe4310f640ad6b17b3de40 Mon Sep 17 00:00:00 2001 From: SirCumAlot1988 Date: Sun, 12 Mar 2023 22:55:29 +0100 Subject: [PATCH 4/6] Revised AEBN scraper: -Implemented ability to scrape movie scenes -Fixed movie performers not scraped properly -Added functionality to scrape performer tattoos, piercings and aliases --- scrapers/AEBN.py | 491 ++++++++++++++++++++++++++++++++++++++++++++++ scrapers/AEBN.yml | 134 +++---------- 2 files changed, 513 insertions(+), 112 deletions(-) create mode 100644 scrapers/AEBN.py diff --git a/scrapers/AEBN.py b/scrapers/AEBN.py new file mode 100644 index 000000000..2527cf198 --- /dev/null +++ b/scrapers/AEBN.py @@ -0,0 +1,491 @@ +import sys +import random +import json +import base64 +import re +import datetime + +# Seperators to append scene nr to url +seperators = "+.," + +# Seperator between movie title and Scene Nr string +title_seperator = ": " + +try: + import py_common.log as log +except ModuleNotFoundError: + print("You need to download the folder 'py_common' from the community repo! 
class Scene:
    """Metadata scraped for one scene; also the base class of :class:`Movie`."""

    def __init__(self):
        self.title = ""
        self.performers = []  # performer names (str)
        self.tags = []  # tag names (str)
        self.thumbnail = ""  # absolute thumbnail URL, "" when none was found
        self.scene_nr = ""  # replaced by an int in parse_scene
        self.scene_id = ""  # value of the <section id="..."> attribute
        self.movie = None  # owning Movie, set by parse_movie


class Movie(Scene):
    """Metadata scraped for a movie page, including all of its scenes."""

    def __init__(self):
        super().__init__()
        self.url = ""
        self.date = ""  # "" when unknown, otherwise the parsed release date
        self.director = ""
        self.studio = ""
        self.details = ""
        self.front_cover = ""
        self.back_cover = ""
        self.scenes = []  # list of Scene
        self.duration = ""


class Performer:
    """Metadata scraped for a performer page (every field is a string)."""

    def __init__(self):
        self.name = ""
        self.url = ""
        self.aliases = ""
        self.gender = ""
        self.birthdate = ""  # "YYYY-MM-DD" or ""
        self.ethnicity = ""
        self.hair_color = ""
        self.eye_color = ""
        self.height = ""
        self.weight = ""
        self.tattoos = ""
        self.piercings = ""
        self.details = ""
        self.image = ""


def _absolute_without_query(src):
    """Prefix a protocol-relative *src* with ``https:`` and drop its query string."""
    return "https:" + src.split("?")[0]


def parse_scene(scene):
    """Build a :class:`Scene` from one ``<section id="scene...">`` element.

    :param scene: BeautifulSoup tag of the scene section.
    :return: populated Scene; ``thumbnail`` stays "" if the section holds no
        thumbnail at all (the original raised IndexError in that case).
    """
    scene_parsed = Scene()

    # Get scene id
    scene_parsed.scene_id = scene["id"]

    # Get scene Nr (heading text looks like "Scene 3")
    scene_parsed.scene_nr = int(scene.h1.span.text.replace("Scene ", ""))

    # Some thumbnails are loaded together with the page, others are loaded
    # when the scrollbar is used. Start with the visible ones.
    thumb_urls_parsed = [
        _absolute_without_query(img["src"])
        for img in scene.find_all("img")
        if img.get("src")
    ]

    # Thumbnails which are only loaded when the scrollbar is used
    lazy_items = scene.find_all(
        "div", {"class": "dts-collection-item dts-collection-item-scene-thumb"})
    for item in lazy_items:
        placeholder = item.div
        if placeholder is not None and placeholder.get("class") == ["dts-lazy-loading-placeholder"]:
            thumb_urls_parsed.append(
                _absolute_without_query(item["data-scene-thumb-image-url"]))

    # Choose randomly one of the thumbnails (random.choice fails on an empty list)
    if thumb_urls_parsed:
        scene_parsed.thumbnail = random.choice(thumb_urls_parsed)

    # Tags: every detail list except the one listing the stars
    for title_span in scene.find_all("span", {"class": "section-detail-list-item-title"}):
        if "stars" in title_span.text.lower():
            continue
        for node in title_span.parent.contents:
            if node.name == "a":
                scene_parsed.tags.append(node.text)

    # Performers of the current scene
    for wrapper in scene.find_all("span", {"class": "dts-scene-star-wrapper"}):
        scene_parsed.performers.append(wrapper.a.text.strip())

    return scene_parsed


def parse_movie(url):
    """Download a movie page and return it as a :class:`Movie` with its scenes.

    :param url: movie URL on an ``*.com`` AEBN domain.
    :raises ValueError: if no ``.com`` domain can be found in *url*.
    """
    movie = Movie()

    # Domain name and movie path are needed for scraping lazily loaded performers
    match = re.search(r".*[.]com", url)
    if match is None:
        raise ValueError("Cannot determine the site domain from URL: " + repr(url))
    domain_name = match.group()
    movie_path = url.replace(domain_name, "")

    request = requests.get(url)
    soup = BeautifulSoup(request.text, 'html.parser')

    # Title
    movie.title = soup.h1.text

    # URL
    movie.url = url

    # Date; dateparser.parse returns None for unparsable text, keep "" then
    date = soup.find_all("li", "section-detail-list-item-release-date")
    if date:
        date_str = date[0].text.replace("Released: ", "")
        movie.date = dateparser.parse(date_str) or ""

    # Director(s), comma separated
    director_soup = soup.find_all("li", "section-detail-list-item-director")
    if director_soup:
        names = (link.text.strip() for link in director_soup[0].find_all("a"))
        movie.director = ", ".join(name for name in names if name)

    # Studio
    studio = soup.find_all("div", "dts-studio-name-wrapper")
    if studio:
        movie.studio = studio[0].a.text

    # Performers
    for item in soup.find_all("div", "dts-collection-item dts-collection-item-star"):
        if "data-loc" in item.attrs:
            # Performer is loaded on demand, if the scrollbar is used.
            # Data is retrieved via POST request.
            payload = {
                "f": movie_path,
                "fbase": movie_path,
                "starIdRoot": item["data-star-id-root"],
                "imgHeight": item["data-img-height"],
                "useHeadshot": item["data-use-headshot"],
                "useSilhouette": item["data-use-silhouette"],
                "showFavoriteLink": item["data-show-favorite-link"]
            }

            performer_request = requests.post(domain_name + item["data-loc"], params=payload)
            performer_soup = BeautifulSoup(performer_request.text, 'html.parser')
            if performer_soup.a is not None:
                movie.performers.append(performer_soup.a.text.strip())
        else:
            movie.performers.append(item["title"])

    # Tags
    for tag in soup.find_all("div", "dts-collection-item dts-collection-item-category"):
        movie.tags.append(tag.text.strip())

    # Description (optional on the page)
    description = soup.find("div", "dts-section-page-detail-description-body")
    if description is not None:
        movie.details = description.text.strip()

    # Cover images (optional on the page)
    front_cover = soup.find("img", "dts-modal-boxcover-front")
    if front_cover is not None and front_cover.get("src"):
        movie.front_cover = "https:" + front_cover["src"]
    back_cover = soup.find("img", "dts-modal-boxcover-back")
    if back_cover is not None and back_cover.get("src"):
        movie.back_cover = "https:" + back_cover["src"]

    # Scenes
    for scene in soup.find_all("section", id=lambda x: x and x.startswith("scene")):
        parsed_scene = parse_scene(scene)

        # Title, URL, date, director and studio are inherited from movie
        parsed_scene.title = movie.title + title_seperator + "Scene " + str(parsed_scene.scene_nr).zfill(2)
        parsed_scene.movie = movie

        movie.scenes.append(parsed_scene)

        # Add performers and tags from scenes to movie performers and tags
        movie.performers.extend(parsed_scene.performers)
        movie.tags.extend(parsed_scene.tags)

    # Remove duplicated performers and tags, keeping first-seen order
    movie.performers = list(dict.fromkeys(movie.performers))
    movie.tags = list(dict.fromkeys(movie.tags))

    # Movie duration; movie quality shares the same CSS class as the duration
    for entry in soup.find_all("li", "section-detail-list-item-duration"):
        if "Running Time: " in entry.text:
            movie.duration = entry.text.replace("Running Time: ", "")

    return movie


def _parse_birthdate(text):
    """Convert a date such as ``Sept 5, 1980`` to ``YYYY-MM-DD``.

    Returns "" when *text* matches neither the abbreviated nor the full
    month-name format.
    """
    # strptime's %b expects "Sep"; only rewrite the standalone abbreviation so
    # that "September" is not mangled into "Sepember".
    cleaned = re.sub(r"\bSept\b", "Sep", text.strip())
    for fmt in ("%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.datetime.strptime(cleaned, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return ""


def _extract_labelled(details, labels):
    """Cut ``<label><rest of line>`` out of *details* for each of *labels*.

    :return: ``(value, remaining_details)``; *value* is the stripped text that
        followed the last label found, or "" if none of the labels occurred.
    """
    value = ""
    for label in labels:
        match = re.search(re.escape(label) + r"(.*)", details)
        if match:
            raw = match.group(1)
            details = details.replace(label + raw, "")
            value = raw.strip()
    return value, details


def parse_performer(url):
    """Download a performer page and return it as a :class:`Performer`."""
    performer = Performer()

    request = requests.get(url)
    soup = BeautifulSoup(request.text, 'html.parser')

    # Name
    performer.name = soup.h1.text

    for attribute in soup.find_all("span", "section-detail-list-item-title"):
        label = attribute.text
        row_text = attribute.parent.text

        # Gender
        if "Gender" in label:
            performer.gender = row_text.replace("Gender: ", "")

        # Birthdate
        if "Birth Date" in label:
            performer.birthdate = _parse_birthdate(row_text.replace("Birth Date: ", ""))

        # Ethnicity
        if "Ethnicity" in label:
            performer.ethnicity = row_text.replace("Ethnicity: ", "")

        # Hair Color
        if "Hair Color" in label:
            performer.hair_color = row_text.replace("Hair Color: ", "")

        # Eye Color
        if "Eye Color" in label:
            performer.eye_color = row_text.replace("Eye Color: ", "")

        # Height (centimetres)
        if "Height" in label:
            match_height = re.search(r"(\d+) cm", row_text)
            if match_height:
                performer.height = match_height.group(1)

        # Weight (kilograms)
        if "Weight" in label:
            match_weight = re.search(r"(\d+)kg", row_text)
            if match_weight:
                performer.weight = match_weight.group(1)

    # Details
    details = soup.find_all("div", "dts-star-bio")
    if details:
        performer.details = details[0].text

    # Tattoos, piercings and aliases are given inside the details text; move
    # them into their own fields and remove them from the bio.
    performer.tattoos, performer.details = _extract_labelled(
        performer.details, ("Tattoos: ", "Tattoo: "))
    performer.piercings, performer.details = _extract_labelled(
        performer.details, ("Piercings: ", "Non-ear piercings: "))
    performer.aliases, performer.details = _extract_labelled(
        performer.details, ("AKA ", "A.K.A: "))

    # Remove leading/trailing spaces from performer bio
    performer.details = performer.details.strip()

    # Image: keep the URL up to and including ".jpg"
    image = soup.find_all("div", "dts-section-page-detail-main-image-wrapper")
    if image and image[0].img is not None:
        match_image_url = re.search(r"(.*\.jpg)", image[0].img.attrs.get("src", ""))
        if match_image_url:
            performer.image = "https:" + match_image_url.group(1)

    # URL
    performer.url = url

    return performer


def _fetch_data_uri(image_url):
    """Download *image_url* and return it as a base64 JPEG data URI."""
    encoded = base64.b64encode(requests.get(image_url).content).decode('utf-8')
    return "data:image/jpeg;base64," + encoded


def build_stash_scene_json(title, date, director, studio, performers, movie, tags, thumbnail, details="",
                           url=None):
    """Build the scene dict that is printed as JSON for Stash.

    :param date: release date object with ``strftime``; any falsy value
        ("" or None) omits the "date" key.
    :param movie: Movie the scene belongs to (embedded without cover images).
    :param thumbnail: image URL; a falsy value omits the "image" key.
    :param url: URL reported for the scene; defaults to ``movie.url``.
    """
    scene_json = {}
    scene_json["title"] = title
    scene_json["url"] = movie.url if url is None else url
    if date:
        scene_json["date"] = date.strftime("%Y-%m-%d")
    scene_json["director"] = director
    scene_json["studio"] = {"name": studio}
    scene_json["performers"] = [{"name": performer} for performer in performers]
    scene_json["movies"] = [build_stash_movie_json(movie, decode_cover=False)]
    scene_json["tags"] = [{"name": tag} for tag in tags]
    scene_json["details"] = details
    if thumbnail:
        scene_json["image"] = _fetch_data_uri(thumbnail)

    return scene_json


def build_stash_movie_json(movie, decode_cover=True):
    """Build the movie dict that is printed as JSON for Stash.

    :param movie: Movie (or any object with the same attributes).
    :param decode_cover: when true, download the covers and embed them as
        data URIs; covers without a URL are skipped.
    """
    movie_json = {}

    if decode_cover:
        if movie.front_cover:
            movie_json["front_image"] = _fetch_data_uri(movie.front_cover)
        if movie.back_cover:
            movie_json["back_image"] = _fetch_data_uri(movie.back_cover)

    movie_json["name"] = movie.title
    movie_json["duration"] = movie.duration
    # movie.date is "" (or None) when the release date is unknown
    if movie.date:
        movie_json["date"] = movie.date.strftime("%Y-%m-%d")
    movie_json["studio"] = {"name": movie.studio}
    movie_json["director"] = movie.director
    movie_json["url"] = movie.url
    movie_json["synopsis"] = movie.details

    return movie_json


def build_stash_performer_json(performer):
    """Build the performer dict that is printed as JSON for Stash.

    The "images" key is only present when ``performer.image`` is non-empty.
    """
    performer_json = {}
    performer_json["name"] = performer.name
    performer_json["aliases"] = performer.aliases
    performer_json["gender"] = performer.gender
    performer_json["birthdate"] = performer.birthdate
    performer_json["ethnicity"] = performer.ethnicity
    performer_json["hair_color"] = performer.hair_color
    performer_json["eye_color"] = performer.eye_color
    performer_json["height"] = performer.height
    performer_json["weight"] = performer.weight
    performer_json["tattoos"] = performer.tattoos
    performer_json["piercings"] = performer.piercings
    performer_json["url"] = performer.url
    performer_json["details"] = performer.details

    if performer.image != "":
        performer_json["images"] = [_fetch_data_uri(performer.image)]

    return performer_json


def _find_requested_scene(movie, url):
    """Resolve which scene of *movie* the *url* addresses.

    :return: ``(requested, scene)``. *requested* is False when the URL names
        no particular scene; otherwise *scene* is the matching Scene or None
        if the movie has no such scene.
    """
    # Variant 1: URL ends with the scene anchor, e.g. "...#scene-12345"
    match_scene_url = re.search(r".*#(scene-\d*)$", url)
    if match_scene_url:
        wanted_id = match_scene_url.group(1)
        return True, next((s for s in movie.scenes if s.scene_id == wanted_id), None)

    # Variant 2: URL ends with a separator and the scene number, e.g. "...+3".
    # Requiring at least one digit keeps a URL that merely ends in a
    # separator from reaching int("").
    match_scene_nr = re.search(r".*[" + re.escape(seperators) + r"](\d+)$", url)
    if match_scene_nr:
        wanted_nr = int(match_scene_nr.group(1))
        return True, next((s for s in movie.scenes if s.scene_nr == wanted_nr), None)

    return False, None


def main():
    """Read the Stash fragment from stdin, scrape, and print the result as JSON.

    ``sys.argv[1]`` selects the mode: "movie" or "scene"; anything else
    (including no argument) scrapes a performer. Exits with status 1 when
    no URL was supplied or the requested scene does not exist.
    """
    frag = json.loads(sys.stdin.read())
    url = frag.get("url")

    if not url:
        log.error("No URL entered")
        sys.exit(1)

    mode = sys.argv[1] if len(sys.argv) > 1 else "performer"

    if mode in ("movie", "scene"):
        movie = parse_movie(url)
        if mode == "scene":
            # Check if complete movie shall be returned or only one of the movie scenes
            requested, scene = _find_requested_scene(movie, url)
            if not requested:
                ret = build_stash_scene_json(movie.title, movie.date, movie.director, movie.studio,
                                             movie.performers, movie, movie.tags, movie.front_cover,
                                             movie.details, url=url)
            elif scene is None:
                log.error("Scene not found")
                sys.exit(1)
            else:
                ret = build_stash_scene_json(scene.title, movie.date, movie.director, movie.studio,
                                             scene.performers, scene.movie, scene.tags, scene.thumbnail,
                                             url=url)
        else:
            ret = build_stash_movie_json(movie)
    else:
        ret = build_stash_performer_json(parse_performer(url))

    print(json.dumps(ret))


if __name__ == "__main__":
    main()
"Sept" - with: "Sep" - - parseDate: Jan 2, 2006 - Height: - selector: //li[@class='section-detail-list-item-height']/text() - postProcess: - - replace: - - regex: .+\((\d+).+\) - with: $1 - Weight: - selector: //li[@class='section-detail-list-item-weight']/text() - postProcess: - - replace: - - regex: .+\((\d+).+\) - with: $1 - EyeColor: - selector: //li[@class='section-detail-list-item-eye-color']/text() - Ethnicity: - selector: //li[@class='section-detail-list-item-ethnicity']/text() - postProcess: - - map: - White: Caucasian - HairColor: - selector: //li[@class='section-detail-list-item-hair-color']/text() - Details: - selector: //div[@class='dts-section-page-detail-description-body'] - Image: - selector: //div[@class='dts-section-page-detail-main-image-wrapper']/picture/img/@src - postProcess: - - replace: - - regex: ^([^?]+).*$ - with: "https:$1" - sceneScraper: - scene: - Title: //h1[@class="dts-section-page-heading-title"]|//div[@class="dts-section-page-heading-title"]/h1 - Date: - selector: //li[@class="section-detail-list-item-release-date"]/text() - postProcess: - - replace: - - regex: "Sept" - with: "Sep" - - parseDate: Jan 2, 2006 - Details: - selector: //div[@class="dts-section-page-detail-description-body"]//text() - Performers: - Name: //div[@class="dts-star-name-overlay"]/text() - Tags: - Name: //span[@class="dts-image-display-name"]//text() - Image: - selector: //picture[@class="dts-movie-boxcover-front"]/img/@src - postProcess: - - replace: - - regex: ^([^?]+).*$ - with: "https:$1" - movieScraper: - movie: - Name: //h1[@class="dts-section-page-heading-title"]|//div[@class="dts-section-page-heading-title"]/h1 - Director: - selector: //li[@class='section-detail-list-item-director']//span//a - concat: ", " - Duration: //li[@class='section-detail-list-item-duration'][contains(span,"Running Time")]/text() - Date: - selector: //li[@class="section-detail-list-item-release-date"]/text() - postProcess: - - replace: - - regex: "Sept" - with: "Sep" - - 
parseDate: Jan 2, 2006 - Synopsis: //div[@class="dts-section-page-detail-description-body"]//text() - Studio: - Name: //div[@class='dts-studio-name-wrapper']/a/text() - FrontImage: - selector: //picture[@class="dts-movie-boxcover-front"]/img/@src - postProcess: - - replace: - - regex: ^([^?]+).*$ - with: "https:$1" - BackImage: - selector: //picture[@class="dts-movie-boxcover-back"]/img/@src - postProcess: - - replace: - - regex: ^([^?]+).*$ - with: "https:$1" -# Last Updated July 06, 2022 + action: script + script: + - python3 + - AEBN_dev_v02.py + - performer +# Last Updated March 12, 2023 From 5db4721a9b0b740cf39afdaa4cd65fc2a910e2dd Mon Sep 17 00:00:00 2001 From: SirCumAlot1988 Date: Thu, 16 Mar 2023 10:46:42 +0100 Subject: [PATCH 5/6] -Added functionality to scrape performer details and/or images during scene scraping -Improved handling of tattoos/piercings/aliases during performer scraping -Added handling of transgender performers --- scrapers/AEBN.py | 264 +++++++++++++++++++++++++--------------------- scrapers/AEBN.yml | 2 +- 2 files changed, 144 insertions(+), 122 deletions(-) diff --git a/scrapers/AEBN.py b/scrapers/AEBN.py index 2527cf198..f7ae5da89 100644 --- a/scrapers/AEBN.py +++ b/scrapers/AEBN.py @@ -5,12 +5,22 @@ import re import datetime -# Seperators to append scene nr to url -seperators = "+.," +# Seperators to append scene nr to url when sending a request +request_seperators = "+.," -# Seperator between movie title and Scene Nr string +# Seperator between movie title and Scene Nr string for auto generated titles title_seperator = ": " +# Settings on how to handle performer scraping during scene scraping +# +# If both are set to false, the performer scraper will not be invoked and only performer name and URL are scraped +# If at least one of them is set to True, the performer scraper will be invoked for each performer of a scene, +# scraping the complete performer details and/or performer images +# +# Note that this will slow down scene scraping 
significantly if set to True +scrape_performer_details = False +scrape_performer_images = False + try: import py_common.log as log except ModuleNotFoundError: @@ -48,7 +58,7 @@ class Scene: def __init__(self): self.title = "" - self.performers = [] + self.performers = {} self.tags = [] self.thumbnail = "" self.scene_nr = "" @@ -86,7 +96,8 @@ def __init__(self): self.details = "" self.image = "" -def parse_scene(scene): +def parse_scene(scene, domain_name): + scene_parsed = Scene() # Get scene id @@ -126,7 +137,8 @@ def parse_scene(scene): performers = scene.findChildren("span", {"class": "dts-scene-star-wrapper"}) for performer in performers: - scene_parsed.performers.append(performer.a.text.strip()) + performer_name = performer.a.text.strip() + scene_parsed.performers[performer_name] = domain_name + performer.a["href"] return scene_parsed @@ -176,6 +188,7 @@ def parse_movie(url): performers = soup.find_all("div", "dts-collection-item dts-collection-item-star") for i in range(len(performers)): + if "data-loc" in performers[i].attrs: # Performer is loaded on demand, if the scrollbar is used. 
Data is retrieved via POST request payload = { @@ -190,9 +203,13 @@ def parse_movie(url): performer_request = requests.post(domain_name + performers[i]["data-loc"], params=payload) performer_soup = BeautifulSoup(performer_request.text, 'html.parser') - movie.performers.append(performer_soup.a.text.strip()) + performer_name = performer_soup.a.text.strip() + performer_url = domain_name + performer_soup.a["href"] else: - movie.performers.append(performers[i]["title"]) + performer_name = performers[i]["title"] + performer_url = domain_name + performers[i].a["href"] + + movie.performers[performer_name] = performer_url # Tags tags = soup.find_all("div", "dts-collection-item dts-collection-item-category") @@ -211,20 +228,19 @@ def parse_movie(url): scenes = soup.find_all("section", id=lambda x: x and x.startswith("scene")) for scene in scenes: - parsed_scene = parse_scene(scene) + parsed_scene = parse_scene(scene, domain_name) - # Title, URL, date, director and studio are inherited from movie - parsed_scene.title = movie.title + title_seperator + "Scene " + str(parsed_scene.scene_nr).zfill(2) + # Add scene title and movie + parsed_scene.title = movie.title + title_seperator + "Scene " + str(parsed_scene.scene_nr).zfill(2) parsed_scene.movie = movie movie.scenes.append(parsed_scene) # Add performers and tags from scenes to movie performers and tags - movie.performers = movie.performers + parsed_scene.performers + movie.performers.update(parsed_scene.performers) movie.tags = movie.tags + parsed_scene.tags - # Remove duplicated performers and tags - movie.performers = list(dict.fromkeys(movie.performers)) + # Remove duplicated tags movie.tags = list(dict.fromkeys(movie.tags)) # Get movie duration @@ -237,7 +253,7 @@ def parse_movie(url): return movie -def parse_performer(url): +def parse_performer(url, scrape_performer_details=True, scrape_performer_image=True): performer = Performer() @@ -247,109 +263,114 @@ def parse_performer(url): # Title performer.name = soup.h1.text - 
attributes = soup.find_all("span", "section-detail-list-item-title") - - for attribute in attributes: - # Gender - if "Gender" in attribute.text: - performer.gender = attribute.parent.text.replace("Gender: ", "") - - # Birthdate - if "Birth Date" in attribute.text: - birthdate_str = attribute.parent.text.replace("Birth Date: ", "") - - # Datetime expects "Sep" instead of "Sept" - birthdate_str = birthdate_str.replace("Sept", "Sep") - - birthdate = datetime.datetime.strptime(birthdate_str, "%b %d, %Y") - performer.birthdate = birthdate.strftime("%Y-%m-%d") - - # Ethnicity - if "Ethnicity" in attribute.text: - performer.ethnicity = attribute.parent.text.replace("Ethnicity: ", "") - - # Hair Color - if "Hair Color" in attribute.text: - performer.hair_color = attribute.parent.text.replace("Hair Color: ", "") - - # Eye Color - if "Eye Color" in attribute.text: - performer.eye_color = attribute.parent.text.replace("Eye Color: ", "") - - # Height - if "Height" in attribute.text: - height_str = attribute.parent.text - match_height = re.search("(\d*) cm", height_str) - if match_height: - performer.height = match_height.group(1) - - # Weight - if "Weight" in attribute.text: - weight_str = attribute.parent.text - match_weight = re.search("(\d*)kg", weight_str) - if match_weight: - performer.weight = match_weight.group(1) - - # Details - details = soup.find_all("div", "dts-star-bio") - - if len(details) > 0: - performer.details = details[0].text - - # Tattoos are given in the details section - if "Tattoos: " in performer.details: - match_tattoos = re.search("Tattoos: (.*)", performer.details) - if match_tattoos: - performer.tattoos = match_tattoos.group(1) - performer.details = performer.details.replace("Tattoos: " + performer.tattoos, "") - - if "Tattoo: " in performer.details: - match_tattoos = re.search("Tattoo: (.*)", performer.details) - if match_tattoos: - performer.tattoos = match_tattoos.group(1) - performer.details = performer.details.replace("Tattoo: " + 
performer.tattoos, "") - - # Piercings are given in the details section - if "Piercings:" in performer.details: - match_piercings = re.search("Piercings: (.*)", performer.details) - if match_piercings: - performer.piercings = match_piercings.group(1) - performer.details = performer.details.replace("Piercings: " + performer.piercings, "") - - if "Non-ear piercings:" in performer.details: - match_piercings = re.search("Non-ear piercings: (.*)", performer.details) - if match_piercings: - performer.piercings = match_piercings.group(1) - performer.details = performer.details.replace("Non-ear piercings: " + performer.piercings, "") - - # Aliases are given in the details section - if "AKA" in performer.details: - match_aliases = re.search("AKA (.*)", performer.details) - if match_aliases: - performer.aliases = match_aliases.group(1) - performer.details = performer.details.replace("AKA " + performer.aliases, "") - - if "A.K.A:" in performer.details: - match_aliases = re.search("A.K.A: (.*)", performer.details) - if match_aliases: - performer.aliases = match_aliases.group(1) - performer.details = performer.details.replace("A.K.A: " + performer.aliases, "") - - # Remove leading/trailing spaces from performer bio - performer.details = performer.details.strip() + # URL + performer.url = url - # Image - image = soup.find_all("div", "dts-section-page-detail-main-image-wrapper") + if scrape_performer_details: + attributes = soup.find_all("span", "section-detail-list-item-title") + + for attribute in attributes: + # Gender + if "Gender" in attribute.text: + performer.gender = attribute.parent.text.replace("Gender: ", "") + + # Handle Transsexual performers + if performer.gender == "TS": performer.gender = "Transgender Female" + + # Birthdate + if "Birth Date" in attribute.text: + birthdate_str = attribute.parent.text.replace("Birth Date: ", "") + + # Datetime expects "Sep" instead of "Sept" + birthdate_str = birthdate_str.replace("Sept", "Sep") + + birthdate = 
datetime.datetime.strptime(birthdate_str, "%b %d, %Y") + performer.birthdate = birthdate.strftime("%Y-%m-%d") + + # Ethnicity + if "Ethnicity" in attribute.text: + performer.ethnicity = attribute.parent.text.replace("Ethnicity: ", "") + + # Hair Color + if "Hair Color" in attribute.text: + performer.hair_color = attribute.parent.text.replace("Hair Color: ", "") + + # Eye Color + if "Eye Color" in attribute.text: + performer.eye_color = attribute.parent.text.replace("Eye Color: ", "") + + # Height + if "Height" in attribute.text: + height_str = attribute.parent.text + match_height = re.search("(\d*) cm", height_str) + if match_height: + performer.height = match_height.group(1) + + # Weight + if "Weight" in attribute.text: + weight_str = attribute.parent.text + match_weight = re.search("(\d*)kg", weight_str) + if match_weight: + performer.weight = match_weight.group(1) + + # Details + details = soup.find_all("div", "dts-star-bio") + + if len(details) > 0: + performer.details = details[0].text + + # Tattoos are given in the details section + tattoo_keywords = ["Tattoos", "Tattoo"] + + for keyword in tattoo_keywords: + if keyword.lower() in performer.details.lower(): + pattern = re.compile(keyword + ":* ([^)\n]*)", re.IGNORECASE) + match_tattoos = re.search(pattern, performer.details) + if match_tattoos: + performer.tattoos = match_tattoos.group(1) + performer.details = pattern.sub("", performer.details) + + # Piercings are given in the details section + piercing_keywords = ["Non-Ear piercings", "Piercings"] + + for keyword in piercing_keywords: + if keyword.lower() in performer.details.lower(): + pattern = re.compile(keyword + ":* ([^)\n]*)", re.IGNORECASE) + match_piercings = re.search(pattern, performer.details) + if match_piercings: + performer.piercings = match_piercings.group(1) + performer.details = pattern.sub("", performer.details) + + # Aliases are given in the details section + alias_keywords = ["A.K.A", "AKA"] + + for keyword in alias_keywords: + if 
keyword.lower() in performer.details.lower(): + pattern = re.compile(keyword + ":* ([^)\n]*)", re.IGNORECASE) + match_aliases = re.search(pattern, performer.details) + if match_aliases: + performer.aliases = match_aliases.group(1) + performer.details = pattern.sub("", performer.details) + + # Remove leading/trailing spaces from performer bio + performer.details = performer.details.strip() + + # Remove "()" which might result from tattoo/piercing/alias replacements above + performer.details = performer.details.replace("()", "") + + # Remove double spaces which might result from tattoo/piercing/alias replacements above + performer.details = performer.details.replace(" ", " ") - if len(image) > 0: - image_url_small = image[0].img.attrs["src"] + # Image + if scrape_performer_image: + image = soup.find_all("div", "dts-section-page-detail-main-image-wrapper") - match_image_url = re.search("(.*\.jpg)", image_url_small) - if match_image_url: - performer.image = "https:" + match_image_url.group(1) + if len(image) > 0: + image_url_small = image[0].img.attrs["src"] - # URL - performer.url = url + match_image_url = re.search("(.*\.jpg)", image_url_small) + if match_image_url: + performer.image = "https:" + match_image_url.group(1) return performer @@ -368,7 +389,13 @@ def build_stash_scene_json(title, date, director, studio, performers, movie, tag json["date"] = date.strftime("%Y-%m-%d") json["director"] = director json["studio"] = {"name": studio} - json["performers"] = [{"name": performer} for performer in performers] + + # Build performer json + json["performers"] = [] + for performer_name in performers.keys(): + parsed_performer = parse_performer(performers[performer_name], scrape_performer_details, scrape_performer_images) + json["performers"].append(build_stash_performer_json(parsed_performer)) + json["movies"] = [build_stash_movie_json(movie, decode_cover=False)] json["tags"] = [{"name": tag} for tag in tags] json["details"] = details @@ -432,11 +459,6 @@ def 
build_stash_performer_json(performer): return json -#Debug -# url = "https://straight.aebn.com/straight/stars/3090/erik-everhard?fmc=1" -# sys.argv.append("performer") -#End Debug - frag = json.loads(sys.stdin.read()) url = frag["url"] @@ -462,7 +484,7 @@ def build_stash_performer_json(performer): log.error("Scene not found") sys.exit() - match_scene_nr = re.search(".*[" + seperators + "](\d*)$", url) + match_scene_nr = re.search(".*[" + request_seperators + "](\d*)$", url) if match_scene_nr: try: scene_nr = int(match_scene_nr.group(1)) diff --git a/scrapers/AEBN.yml b/scrapers/AEBN.yml index a3de10c31..514720b46 100644 --- a/scrapers/AEBN.yml +++ b/scrapers/AEBN.yml @@ -23,4 +23,4 @@ performerByURL: - python3 - AEBN_dev_v02.py - performer -# Last Updated March 12, 2023 +# Last Updated March 16, 2023 From 2f7eb20021ed351a1abe42aa08f94ee4fffcc49a Mon Sep 17 00:00:00 2001 From: SirCumAlot1988 Date: Thu, 16 Mar 2023 12:14:03 +0100 Subject: [PATCH 6/6] Fixed AEBN.yml to include the correct.py file --- scrapers/AEBN.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapers/AEBN.yml b/scrapers/AEBN.yml index 514720b46..ba4500ff4 100644 --- a/scrapers/AEBN.yml +++ b/scrapers/AEBN.yml @@ -1,11 +1,11 @@ -name: AEBN_dev +name: AEBN sceneByURL: - url: - aebn.com action: script script: - python3 - - AEBN_dev_v02.py + - AEBN.py - scene movieByURL: - url: @@ -13,7 +13,7 @@ movieByURL: action: script script: - python3 - - AEBN_dev_v02.py + - AEBN.py - movie performerByURL: - url: @@ -21,6 +21,6 @@ performerByURL: action: script script: - python3 - - AEBN_dev_v02.py + - AEBN.py - performer # Last Updated March 16, 2023