From db57b55657cf66c0f1d37d9025aa8abffdcba79a Mon Sep 17 00:00:00 2001
From: SirCumAlot1988 <rachelaldanafan88@gmail.com>
Date: Fri, 30 Dec 2022 23:54:09 +0100
Subject: [PATCH 1/6] Revision of Boobpedia scraper

Fixes the following problems:
- Performer image no longer scraped due to minor changes in the website
- Most of the metadata no longer scraped due to minor changes in the
  website (Birthdate, Country, Ethnicity, Nationality, Eye Color, Height,
  Weight, Fake Tits, Career Length, Twitter, Instagram)
- Birthdate not scraped properly in some cases
- Hair Color not scraped properly in some cases
- Measurements not scraped properly in some cases
- Gender now defaults to female
- Some cosmetic corrections to career length and details

---
 scrapers/Boobpedia.yml | 71 ++++++++++++++++++++++++++++++++----------
 1 file changed, 54 insertions(+), 17 deletions(-)

diff --git a/scrapers/Boobpedia.yml b/scrapers/Boobpedia.yml
index c5ad462fa..11c955543 100644
--- a/scrapers/Boobpedia.yml
+++ b/scrapers/Boobpedia.yml
@@ -22,19 +22,24 @@ xPathScrapers:
                 with: "https://www.boobpedia.com"
   performerScraper:
     performer:
-      Name: //h1
-      Twitter: //table//tr/td/b/a[text()='Twitter']/@href
-      Instagram: //table//tr/td/b/a[text()='Instagram']/@href
+      Name: //h1     
+      Gender:
+        fixed: "Female"
+      Twitter: //table//tr/td/b[text()='Twitter']/../following-sibling::td//@href
+      Instagram: //table//tr/td/b[text()='Instagram']/../following-sibling::td//@href
       Birthdate:
-        selector: //table//tr/td//b[text()='Born:']/../following-sibling::td/a
+        selector: //table//tr/td//b[text()='Born']/../following-sibling::td/a
         concat: " "
         postProcess:
+          - replace:
+              - regex: (.*\d\d\d\d).*
+                with: $1
           - parseDate: January 2 2006
-      Ethnicity: //table//tr/td/b[text()='Ethnicity:']/../following-sibling::td/a
-      Country: //table//tr/td/b[text()='Nationality:']/../following-sibling::td/a
-      EyeColor: //table//tr/td/b[text()='Eye color:']/../following-sibling::td
+      Ethnicity: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a
+      Country: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a
+      EyeColor: //table//tr/td/b[text()='Eye color']/../following-sibling::td
       Height:
-        selector: //table//tr/td/b[text()='Height:']/../following-sibling::td
+        selector: //table//tr/td/b[text()='Height']/../following-sibling::td
         postProcess:
           - replace:
               - regex: (?:.+\D)?(\d+\.\d+)\Dm.+
@@ -42,28 +47,47 @@ xPathScrapers:
               - regex: \.
                 with: ""
       Weight:
-        selector: //table//tr/td/b[text()='Weight:']/../following-sibling::td
+        selector: //table//tr/td/b[text()='Weight']/../following-sibling::td
         postProcess:
           - replace:
               - regex: (?:.+\D)?(\d+)\Dkg.+
                 with: $1
       Measurements:
-        selector: //table//tr/td/b[text()='Measurements:']/../following-sibling::td|//table//tr/td[contains(b,'cup')]/following-sibling::td
+        selector: //table//tr/td/b[text()='Measurements']/../following-sibling::td|//table//tr/td[contains(b,'cup')]/following-sibling::td
         concat: "|"
         postProcess:
           - replace:
-              - regex: (\d+)-(\d+)-(\d+)[^|]+\|(\S+).+ # get measurements + cup
-                with: $1$4-$2-$3
+              - regex: (\d+)-(\d+)-(\d+)[^|]*\|(\S+).+ # get measurements + cup
+                with: $4-$2-$3
               - regex: \|.+$ # fallback to clear non matching regexes
                 with: ""
-      FakeTits: //table//tr/td/b[text()='Boobs:']/../following-sibling::td/a
-      HairColor: //table//tr/td[contains(b,'Hair')]/following-sibling::td
+              - regex: \[\d*\] # Remove References
+                with: ""
+              - regex: ( in) # Remove Unit Inches
+                with: ""
+      FakeTits: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a
+      HairColor:
+        selector: //table//tr/td[contains(b,'Hair')]/following-sibling::td//text()
+        concat: ", "
+        postProcess:
+          - replace:
+              - regex: (,,)
+                with: ","
+              - regex: ( , )
+                with: " "
       # nbsp; screws up the parsing, so use contains instead
-      CareerLength: //table//tr/td/b[text()[contains(.,'active:')]]/../following-sibling::td
+      CareerLength:
+        selector: //table//tr/td/b[text()[contains(.,'active')]]/../following-sibling::td
+        postProcess:
+          - replace:
+              - regex: (present)
+                with: "Present"
+              - regex: (current)
+                with: "Present"
       Aliases: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td
       Image:
         #selector: //table[@class="infobox"]//img/@src #alterntive image, no need for subScraper but gets lq image
-        selector: //table[@class="infobox"]//a[img[@src]]/@href
+        selector: //table[@class="infobox plainlinks"]//a[img[@src]]/@href
         postProcess:
           - replace:
               - regex: ^
@@ -83,4 +107,17 @@ xPathScrapers:
       Details:
         selector: //div[@class="mw-parser-output"]/p
         concat: "\n\n"
-# Last Updated February 01, 2022
+        postProcess:
+          - replace:
+              # Remove References
+              - regex: \[\d*\]
+                with: ""
+              # Remove <protect> and </protect>, which appears in the details of some performers (e.g. Jenna Jameson)
+              - regex: (<protect>)
+                with: ""
+              - regex: (</protect>)
+                with: ""
+              # Remove triple line breaks
+              - regex: \n\n\n
+                with: "\n"
+# Last Updated December 30, 2022

From a7cdf0f284cd2e0ea44971c427571dd61e230aa0 Mon Sep 17 00:00:00 2001
From: SirCumAlot1988 <rachelaldanafan88@gmail.com>
Date: Wed, 4 Jan 2023 20:31:28 +0100
Subject: [PATCH 2/6] Improvements to Boobpedia scraper

- Added regex for removing references to further fields (Ethnicity, Eye
  Color, Fake Tits, Hair Color, Career Length, Aliases)
- Career Length: maps "Present" and "Current" to an empty string
- Country: maps nationality to country
- Career Length: maps em dash to hyphen
- Fake Tits: maps "Enhanced" to "Fake" and "Natural" to "Natural"

---
 scrapers/Boobpedia.yml | 347 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 337 insertions(+), 10 deletions(-)

diff --git a/scrapers/Boobpedia.yml b/scrapers/Boobpedia.yml
index 11c955543..f1c46ed16 100644
--- a/scrapers/Boobpedia.yml
+++ b/scrapers/Boobpedia.yml
@@ -35,9 +35,18 @@ xPathScrapers:
               - regex: (.*\d\d\d\d).*
                 with: $1
           - parseDate: January 2 2006
-      Ethnicity: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a
-      Country: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a
-      EyeColor: //table//tr/td/b[text()='Eye color']/../following-sibling::td
+      Ethnicity:   
+        selector: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a  
+        postProcess:
+          - replace:
+              - regex: \[\d*\]
+                with: ""
+      EyeColor:
+        selector: //table//tr/td/b[text()='Eye color']/../following-sibling::td
+        postProcess:
+          - replace:
+              - regex: \[\d*\]
+                with: ""
       Height:
         selector: //table//tr/td/b[text()='Height']/../following-sibling::td
         postProcess:
@@ -57,7 +66,7 @@ xPathScrapers:
         concat: "|"
         postProcess:
           - replace:
-              - regex: (\d+)-(\d+)-(\d+)[^|]*\|(\S+).+ # get measurements + cup
+              - regex: (\d+)-(\d+)-(\d+)[^|]*\|(\d+\S+).+ # get measurements + cup
                 with: $4-$2-$3
               - regex: \|.+$ # fallback to clear non matching regexes
                 with: ""
@@ -65,7 +74,15 @@ xPathScrapers:
                 with: ""
               - regex: ( in) # Remove Unit Inches
                 with: ""
-      FakeTits: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a
+      FakeTits: 
+        selector: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a
+        postProcess:
+          - replace:
+              - regex: \[\d*\] # Remove References
+                with: ""
+          - map:
+              "Enhanced": "Fake"
+              "Natural": "Natural"
       HairColor:
         selector: //table//tr/td[contains(b,'Hair')]/following-sibling::td//text()
         concat: ", "
@@ -75,16 +92,27 @@ xPathScrapers:
                 with: ","
               - regex: ( , )
                 with: " "
+              - regex: \[\d*\]
+                with: ""
       # nbsp; screws up the parsing, so use contains instead
       CareerLength:
         selector: //table//tr/td/b[text()[contains(.,'active')]]/../following-sibling::td
         postProcess:
           - replace:
-              - regex: (present)
-                with: "Present"
-              - regex: (current)
-                with: "Present"
-      Aliases: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td
+              - regex: \[\d*\] # Remove References
+                with: ""
+              - regex: (—|–)
+                with: "-"
+              - regex: (\S)-(\S)
+                with: $1 - $2
+              - regex: (Present|present|Current|current)
+                with: ""
+      Aliases:
+        selector: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td
+        postProcess:
+          - replace:
+              - regex: \[\d*\]
+                with: ""
       Image:
         #selector: //table[@class="infobox"]//img/@src #alterntive image, no need for subScraper but gets lq image
         selector: //table[@class="infobox plainlinks"]//a[img[@src]]/@href
@@ -120,4 +148,303 @@ xPathScrapers:
               # Remove triple line breaks
               - regex: \n\n\n
                 with: "\n"
+      Country: 
+        selector: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a
+        postProcess:
+          - map:
+              "Abkhaz": "Abkhazia"
+              "Abkhazian": "Abkhazia"
+              "Afghan": "Afghanistan"
+              "Albanian": "Albania"
+              "Algerian": "Algeria"
+              "American Samoan": "American Samoa"
+              "American": "United States of America"
+              "Andorran": "Andorra"
+              "Angolan": "Angola"
+              "Anguillan": "Anguilla"
+              "Antarctic": "Antarctica"
+              "Antiguan": "Antigua and Barbuda"
+              "Argentine": "Argentina"
+              "Argentinian": "Argentina"
+              "Armenian": "Armenia"
+              "Aruban": "Aruba"
+              "Australian": "Australia"
+              "Austrian": "Austria"
+              "Azerbaijani": "Azerbaijan"
+              "Azeri": "Azerbaijan"
+              "Bahamian": "Bahamas"
+              "Bahraini": "Bahrain"
+              "Bangladeshi": "Bangladesh"
+              "Barbadian": "Barbados"
+              "Barbudan": "Antigua and Barbuda"
+              "Basotho": "Lesotho"
+              "Belarusian": "Belarus"
+              "Belgian": "Belgium"
+              "Belizean": "Belize"
+              "Beninese": "Benin"
+              "Beninois": "Benin"
+              "Bermudan": "Bermuda"
+              "Bermudian": "Bermuda"
+              "Bhutanese": "Bhutan"
+              "BIOT": "British Indian Ocean Territory"
+              "Bissau-Guinean": "Guinea-Bissau"
+              "Bolivian": "Bolivia"
+              "Bonaire": "Bonaire"
+              "Bonairean": "Bonaire"
+              "Bosnian": "Bosnia and Herzegovina"
+              "Botswanan": "Botswana"
+              "Bouvet Island": "Bouvet Island"
+              "Brazilian": "Brazil"
+              "British Virgin Island": "Virgin Islands British"
+              "British": "United Kingdom"
+              "Bruneian": "Brunei"
+              "Bulgarian": "Bulgaria"
+              "Burkinabé": "Burkina Faso"
+              "Burmese": "Burma"
+              "Burundian": "Burundi"
+              "Cabo Verdean": "Cabo Verde"
+              "Cambodian": "Cambodia"
+              "Cameroonian": "Cameroon"
+              "Canadian": "Canada"
+              "Cantonese": "Hong Kong"
+              "Caymanian": "Cayman Islands"
+              "Central African": "Central African Republic"
+              "Chadian": "Chad"
+              "Channel Island": "Guernsey"
+              #Channel Island: "Jersey"
+              "Chilean": "Chile"
+              "Chinese": "China"
+              "Christmas Island": "Christmas Island"
+              "Cocos Island": "Cocos (Keeling) Islands"
+              "Colombian": "Colombia"
+              "Comoran": "Comoros"
+              "Comorian": "Comoros"
+              "Congolese": "Congo"
+              "Cook Island": "Cook Islands"
+              "Costa Rican": "Costa Rica"
+              "Croatian": "Croatia"
+              "Cuban": "Cuba"
+              "Curaçaoan": "Curaçao"
+              "Cypriot": "Cyprus"
+              "Czech": "Czech Republic"
+              "Danish": "Denmark"
+              "Djiboutian": "Djibouti"
+              "Dominican": "Dominica"
+              "Dutch": "Netherlands"
+              "Ecuadorian": "Ecuador"
+              "Egyptian": "Egypt"
+              "Emirati": "United Arab Emirates"
+              "Emiri": "United Arab Emirates"
+              "Emirian": "United Arab Emirates"
+              "English people": "England"
+              "English": "England"
+              "Equatoguinean": "Equatorial Guinea"
+              "Equatorial Guinean": "Equatorial Guinea"
+              "Eritrean": "Eritrea"
+              "Estonian": "Estonia"
+              "Ethiopian": "Ethiopia"
+              "European": "European Union"
+              "Falkland Island": "Falkland Islands"
+              "Faroese": "Faroe Islands"
+              "Fijian": "Fiji"
+              "Filipino": "Philippines"
+              "Finnish": "Finland"
+              "Formosan": "Taiwan"
+              "French Guianese": "French Guiana"
+              "French Polynesian": "French Polynesia"
+              "French Southern Territories": "French Southern Territories"
+              "French": "France"
+              "Futunan": "Wallis and Futuna"
+              "Gabonese": "Gabon"
+              "Gambian": "Gambia"
+              "Georgian": "Georgia"
+              "German": "Germany"
+              "Ghanaian": "Ghana"
+              "Gibraltar": "Gibraltar"
+              "Greek": "Greece"
+              "Greenlandic": "Greenland"
+              "Grenadian": "Grenada"
+              "Guadeloupe": "Guadeloupe"
+              "Guamanian": "Guam"
+              "Guatemalan": "Guatemala"
+              "Guinean": "Guinea"
+              "Guyanese": "Guyana"
+              "Haitian": "Haiti"
+              "Heard Island": "Heard Island and McDonald Islands"
+              "Hellenic": "Greece"
+              "Herzegovinian": "Bosnia and Herzegovina"
+              "Honduran": "Honduras"
+              "Hong Kong": "Hong Kong"
+              "Hong Konger": "Hong Kong"
+              "Hungarian": "Hungary"
+              "Icelandic": "Iceland"
+              "Indian": "India"
+              "Indonesian": "Indonesia"
+              "Iranian": "Iran"
+              "Iraqi": "Iraq"
+              "Irish": "Ireland"
+              "Israeli": "Israel"
+              "Israelite": "Israel"
+              "Italian": "Italy"
+              "Ivorian": "Ivory Coast"
+              "Jamaican": "Jamaica"
+              "Jan Mayen": "Jan Mayen"
+              "Japanese": "Japan"
+              "Jordanian": "Jordan"
+              "Kazakh": "Kazakhstan"
+              "Kazakhstani": "Kazakhstan"
+              "Kenyan": "Kenya"
+              "Kirghiz": "Kyrgyzstan"
+              "Kirgiz": "Kyrgyzstan"
+              "Kiribati": "Kiribati"
+              "Korean": "South Korea"
+              "Kosovan": "Kosovo"
+              "Kosovar": "Kosovo"
+              "Kuwaiti": "Kuwait"
+              "Kyrgyz": "Kyrgyzstan"
+              "Kyrgyzstani": "Kyrgyzstan"
+              "Lao": "Lao People's Democratic Republic"
+              "Laotian": "Lao People's Democratic Republic"
+              "Latvian": "Latvia"
+              "Lebanese": "Lebanon"
+              "Lettish": "Latvia"
+              "Liberian": "Liberia"
+              "Libyan": "Libya"
+              "Liechtensteiner": "Liechtenstein"
+              "Lithuanian": "Lithuania"
+              "Luxembourg": "Luxembourg"
+              "Luxembourgish": "Luxembourg"
+              "Macanese": "Macau"
+              "Macedonian": "North Macedonia"
+              "Magyar": "Hungary"
+              "Mahoran": "Mayotte"
+              "Malagasy": "Madagascar"
+              "Malawian": "Malawi"
+              "Malaysian": "Malaysia"
+              "Maldivian": "Maldives"
+              "Malian": "Mali"
+              "Malinese": "Mali"
+              "Maltese": "Malta"
+              "Manx": "Isle of Man"
+              "Marshallese": "Marshall Islands"
+              "Martinican": "Martinique"
+              "Martiniquais": "Martinique"
+              "Mauritanian": "Mauritania"
+              "Mauritian": "Mauritius"
+              "McDonald Islands": "Heard Island and McDonald Islands"
+              "Mexican": "Mexico"
+              "Moldovan": "Moldova"
+              "Monacan": "Monaco"
+              "Mongolian": "Mongolia"
+              "Montenegrin": "Montenegro"
+              "Montserratian": "Montserrat"
+              "Monégasque": "Monaco"
+              "Moroccan": "Morocco"
+              "Motswana": "Botswana"
+              "Mozambican": "Mozambique"
+              "Myanma": "Myanmar"
+              "Namibian": "Namibia"
+              "Nauruan": "Nauru"
+              "Nepalese": "Nepal"
+              "Nepali": "Nepal"
+              "Netherlandic": "Netherlands"
+              "New Caledonian": "New Caledonia"
+              "New Zealand": "New Zealand"
+              "Ni-Vanuatu": "Vanuatu"
+              "Nicaraguan": "Nicaragua"
+              "Nigerian": "Nigeria"
+              "Nigerien": "Niger"
+              "Niuean": "Niue"
+              "Norfolk Island": "Norfolk Island"
+              "Northern Irish": "Northern Ireland"
+              "Northern Marianan": "Northern Mariana Islands"
+              "Norwegian": "Norway"
+              "Omani": "Oman"
+              "Pakistani": "Pakistan"
+              "Palauan": "Palau"
+              "Palestinian": "Palestine"
+              "Panamanian": "Panama"
+              "Papua New Guinean": "Papua New Guinea"
+              "Papuan": "Papua New Guinea"
+              "Paraguayan": "Paraguay"
+              "Persian": "Iran"
+              "Peruvian": "Peru"
+              "Philippine": "Philippines"
+              "Pitcairn Island": "Pitcairn Islands"
+              "Polish": "Poland"
+              "Portuguese": "Portugal"
+              "Puerto Rican": "Puerto Rico"
+              "Qatari": "Qatar"
+              "Romanian": "Romania"
+              "Russian": "Russia"
+              "Rwandan": "Rwanda"
+              "Saba": "Saba"
+              "Saban": "Saba"
+              "Sahraouian": "Western Sahara"
+              "Sahrawi": "Western Sahara"
+              "Sahrawian": "Western Sahara"
+              "Salvadoran": "El Salvador"
+              "Sammarinese": "San Marino"
+              "Samoan": "Samoa"
+              "Saudi Arabian": "Saudi Arabia"
+              "Saudi": "Saudi Arabia"
+              "Scottish": "Scotland"
+              "Senegalese": "Senegal"
+              "Serbian": "Serbia"
+              "Seychellois": "Seychelles"
+              "Sierra Leonean": "Sierra Leone"
+              "Singapore": "Singapore"
+              "Singaporean": "Singapore"
+              "Slovak": "Slovakia"
+              "Slovene": "Slovenia"
+              "Slovenian": "Slovenia"
+              "Solomon Island": "Solomon Islands"
+              "Somali": "Somalia"
+              "Somalilander": "Somaliland"
+              "South African": "South Africa"
+              "South Georgia Island": "South Georgia and the South Sandwich Islands"
+              "South Ossetian": "South Ossetia"
+              "South Sandwich Island": "South Georgia and the South Sandwich Islands"
+              "South Sudanese": "South Sudan"
+              "Spanish": "Spain"
+              "Sri Lankan": "Sri Lanka"
+              "Sudanese": "Sudan"
+              "Surinamese": "Suriname"
+              "Svalbard resident": "Svalbard"
+              "Swati": "Eswatini"
+              "Swazi": "Eswatini"
+              "Swedish": "Sweden"
+              "Swiss": "Switzerland"
+              "Syrian": "Syrian Arab Republic"
+              "Taiwanese": "Taiwan"
+              "Tajikistani": "Tajikistan"
+              "Tanzanian": "Tanzania"
+              "Thai": "Thailand"
+              "Timorese": "Timor-Leste"
+              "Tobagonian": "Trinidad and Tobago"
+              "Togolese": "Togo"
+              "Tokelauan": "Tokelau"
+              "Tongan": "Tonga"
+              "Trinidadian": "Trinidad and Tobago"
+              "Tunisian": "Tunisia"
+              "Turkish": "Turkey"
+              "Turkmen": "Turkmenistan"
+              "Turks and Caicos Island": "Turks and Caicos Islands"
+              "Tuvaluan": "Tuvalu"
+              "Ugandan": "Uganda"
+              "Ukrainian": "Ukraine"
+              "Uruguayan": "Uruguay"
+              "Uzbek": "Uzbekistan"
+              "Uzbekistani": "Uzbekistan"
+              "Vanuatuan": "Vanuatu"
+              "Vatican": "Vatican City State"
+              "Venezuelan": "Venezuela"
+              "Vietnamese": "Vietnam"
+              "Wallis and Futuna": "Wallis and Futuna"
+              "Wallisian": "Wallis and Futuna"
+              "Welsh": "Wales"
+              "Yemeni": "Yemen"
+              "Zambian": "Zambia"
+              "Zimbabwean": "Zimbabwe"
+              "Åland Island": "Åland Islands"
 # Last Updated December 30, 2022

From a21b3d6293a8517b4c41d36dc8b894a3f9c5be3b Mon Sep 17 00:00:00 2001
From: bnkai <48220860+bnkai@users.noreply.github.com>
Date: Wed, 18 Jan 2023 18:13:24 +0200
Subject: [PATCH 3/6] Update Boobpedia.yml

- format
- removed fixed Gender as not all performers were female
- fixed twitter/instagram selectors
- tweaked a couple of regexes
---
 scrapers/Boobpedia.yml | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/scrapers/Boobpedia.yml b/scrapers/Boobpedia.yml
index f1c46ed16..b17f0bf09 100644
--- a/scrapers/Boobpedia.yml
+++ b/scrapers/Boobpedia.yml
@@ -22,11 +22,9 @@ xPathScrapers:
                 with: "https://www.boobpedia.com"
   performerScraper:
     performer:
-      Name: //h1     
-      Gender:
-        fixed: "Female"
-      Twitter: //table//tr/td/b[text()='Twitter']/../following-sibling::td//@href
-      Instagram: //table//tr/td/b[text()='Instagram']/../following-sibling::td//@href
+      Name: //h1
+      Twitter: //table//tr/td/a[b[text()='Twitter']]/@href
+      Instagram: //table//tr/td/a[b[text()='Instagram']]/@href
       Birthdate:
         selector: //table//tr/td//b[text()='Born']/../following-sibling::td/a
         concat: " "
@@ -35,8 +33,8 @@ xPathScrapers:
               - regex: (.*\d\d\d\d).*
                 with: $1
           - parseDate: January 2 2006
-      Ethnicity:   
-        selector: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a  
+      Ethnicity:
+        selector: //table//tr/td/b[text()='Ethnicity']/../following-sibling::td/a
         postProcess:
           - replace:
               - regex: \[\d*\]
@@ -74,7 +72,7 @@ xPathScrapers:
                 with: ""
               - regex: ( in) # Remove Unit Inches
                 with: ""
-      FakeTits: 
+      FakeTits:
         selector: //table//tr/td/b[text()='Boobs']/../following-sibling::td/a
         postProcess:
           - replace:
@@ -105,7 +103,7 @@ xPathScrapers:
                 with: "-"
               - regex: (\S)-(\S)
                 with: $1 - $2
-              - regex: (Present|present|Current|current)
+              - regex: (?i)(present|current)
                 with: ""
       Aliases:
         selector: //table//tr/td/b[text()[contains(.,'known')]]/../following-sibling::td
@@ -141,14 +139,12 @@ xPathScrapers:
               - regex: \[\d*\]
                 with: ""
               # Remove <protect> and </protect>, which appears in the details of some performers (e.g. Jenna Jameson)
-              - regex: (<protect>)
-                with: ""
-              - regex: (</protect>)
+              - regex: </?protect>
                 with: ""
               # Remove triple line breaks
               - regex: \n\n\n
                 with: "\n"
-      Country: 
+      Country:
         selector: //table//tr/td/b[text()='Nationality']/../following-sibling::td/a
         postProcess:
           - map:

From 2acc4ee8ad4795d5dafe4310f640ad6b17b3de40 Mon Sep 17 00:00:00 2001
From: SirCumAlot1988 <rachelaldanafan88@gmail.com>
Date: Sun, 12 Mar 2023 22:55:29 +0100
Subject: [PATCH 4/6] Revised AEBN scraper:

-Implemented ability to scrape movie scenes
-Fixed movie performers not scraped properly
-Added functionality to scrape performer tattoos, piercings and aliases
---
 scrapers/AEBN.py  | 491 ++++++++++++++++++++++++++++++++++++++++++++++
 scrapers/AEBN.yml | 134 +++----------
 2 files changed, 513 insertions(+), 112 deletions(-)
 create mode 100644 scrapers/AEBN.py

diff --git a/scrapers/AEBN.py b/scrapers/AEBN.py
new file mode 100644
index 000000000..2527cf198
--- /dev/null
+++ b/scrapers/AEBN.py
@@ -0,0 +1,491 @@
+import sys
+import random
+import json
+import base64
+import re
+import datetime
+
+# Separator characters used to append the scene number to a URL (identifier spelling kept as-is)
+seperators = "+.,"
+
+# Separator placed between the movie title and the "Scene NN" suffix (see the title built in parse_movie)
+title_seperator = ": "
+
+try:  # py_common is a folder from the community scrapers repo (see the message below)
+    import py_common.log as log
+except ModuleNotFoundError:
+    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
+    sys.exit()  # NOTE(review): no argument means exit status 0 even though a dependency is missing
+
+try:  # requests is used for the HTTP GET/POST calls in parse_movie
+    import requests
+except ModuleNotFoundError:
+    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)",
+          file=sys.stderr)
+    print(
+        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests",
+        file=sys.stderr)
+    sys.exit()  # NOTE(review): exit status 0, same as above
+
+try:  # BeautifulSoup parses the fetched HTML pages
+    from bs4 import BeautifulSoup
+except ModuleNotFoundError:
+    print("You need to install the BeautifulSoup module. (https://pypi.org/project/beautifulsoup4/)", file=sys.stderr)
+    print(
+        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install beautifulsoup4",
+        file=sys.stderr)
+    sys.exit()  # NOTE(review): exit status 0, same as above
+
+try:
+    import dateparser
+except ModuleNotFoundError:
+    print("You need to install the dateparser module. (https://pypi.org/project/dateparser/", file=sys.stderr)
+    print(
+        "If you have pip (normally installed with python), run this command in a terminal (cmd): pip install dateparser",
+        file=sys.stderr)
+    sys.exit()
+
+class Scene:  # Plain data holder for one scraped scene; every field starts out empty
+    def __init__(self):
+        self.title = ""
+        self.performers = []  # performer names as strings
+        self.tags = []  # tag names as strings
+        self.thumbnail = ""  # thumbnail image URL
+        self.scene_nr = ""  # parse_scene replaces this with an int
+        self.scene_id = ""  # value of the scene element's "id" attribute
+        self.movie = None  # parse_movie points this at the owning Movie
+
+class Movie(Scene):  # Movie-level data; inherits title, performers, tags etc. from Scene
+    def __init__(self):
+        super().__init__()  # start from the empty Scene fields
+        self.url = ""
+        self.date = ""  # parse_movie stores the return value of dateparser.parse() here
+        self.director = ""  # director names joined with ", "
+        self.studio = ""
+        self.details = ""
+        self.front_cover = ""  # front box-cover image URL
+        self.back_cover = ""  # back box-cover image URL
+        self.scenes = []  # Scene objects appended by parse_movie
+        self.duration = ""  # "Running Time: " entry text with that label removed
+
+class Performer():  # Plain data holder for performer attributes; all fields default to ""
+    def __init__(self):
+        self.name = ""
+        self.url = ""
+        self.aliases = ""
+        self.gender = ""
+        self.birthdate = ""
+        self.ethnicity = ""
+        self.hair_color = ""
+        self.eye_color = ""
+        self.height = ""
+        self.weight = ""
+        self.tattoos = ""
+        self.piercings = ""
+        self.url = ""  # NOTE(review): repeats the url assignment above; redundant but harmless
+        self.details = ""
+        self.image = ""
+
+def parse_scene(scene):  # scene: one element from parse_movie's find_all("section", ...); returns a filled Scene
+    scene_parsed = Scene()
+
+    # The element's "id" attribute is stored as the scene id
+    scene_parsed.scene_id = scene["id"]
+
+    # Scene number: the heading's span text with the "Scene " prefix removed, converted to int
+    scene_parsed.scene_nr = int(scene.h1.span.text.replace("Scene ", ""))
+
+    # Some thumbnails are loaded together with the page, others only once the scrollbar is used.
+    # First collect the thumbnails already present as <img> tags
+    thumb_urls = scene.findChildren("img")
+    thumb_urls_parsed = []
+
+    for thumb_url in thumb_urls:
+        thumb_urls_parsed.append("https:" + thumb_url["src"].split("?")[0])  # drop the query string
+
+    # Then collect the thumbnails that are only loaded when the scrollbar is used
+    thumb_urls = scene.findChildren("div", {"class": "dts-collection-item dts-collection-item-scene-thumb"})
+
+    for thumb_url in thumb_urls:
+        if thumb_url.div["class"] == ["dts-lazy-loading-placeholder"]:  # only items still showing the placeholder
+            thumb_urls_parsed.append("https:" + thumb_url["data-scene-thumb-image-url"].split("?")[0])
+
+    # Pick one of the collected thumbnails at random
+    scene_parsed.thumbnail = random.choice(thumb_urls_parsed)  # NOTE(review): raises IndexError if the list is empty
+
+    # Collect tags: every titled detail list except the one whose title mentions "stars"
+    tag_groups = [child for child in scene.findChildren("span", {"class": "section-detail-list-item-title"}) if
+                  "stars" not in child.text.lower()]
+
+    for tag_group in tag_groups:
+        for tag in tag_group.parent.contents:  # siblings of the title span
+            if tag.name == "a":  # only link elements are taken as tags
+                scene_parsed.tags.append(tag.text)
+
+    # Collect the performer names listed for this scene
+    performers = scene.findChildren("span", {"class": "dts-scene-star-wrapper"})
+
+    for performer in performers:
+        scene_parsed.performers.append(performer.a.text.strip())
+
+    return scene_parsed
+
+
+def parse_movie(url):
+
+    movie = Movie()
+
+    # Get domain name and movie path (necessary for scraping of performer names)
+    match = re.search(".*[.]com", url)
+    domain_name = match.group()
+    movie_path = url.replace(domain_name, "")
+
+    request = requests.get(url)
+    soup = BeautifulSoup(request.text, 'html.parser')
+
+    # Title
+    movie.title = soup.h1.text
+
+    # URL
+    movie.url = url
+
+    # Date
+    date = soup.find_all("li", "section-detail-list-item-release-date")
+    if len(date) > 0:
+        date_str = date[0].text.replace("Released: ", "")
+        movie.date = dateparser.parse(date_str)
+
+    # Director
+    director_soup = soup.find_all("li", "section-detail-list-item-director")
+    if len(director_soup) > 0:
+        directors = director_soup[0].find_all("a")
+
+        for director in directors:
+            movie.director = movie.director + director.text.strip() + ", "
+
+        if movie.director[-2:] == ", ":
+            movie.director = movie.director[:-2]
+
+    # Studio
+    studio = soup.find_all("div", "dts-studio-name-wrapper")
+
+    if len(studio) > 0:
+        movie.studio = studio[0].a.text
+
+    # Performers
+    performers = soup.find_all("div", "dts-collection-item dts-collection-item-star")
+
+    for i in range(len(performers)):
+        if "data-loc" in performers[i].attrs:
+            # Performer is loaded on demand, if the scrollbar is used. Data is retrieved via POST request
+            payload = {
+                "f": movie_path,
+                "fbase": movie_path,
+                "starIdRoot": performers[i]["data-star-id-root"],
+                "imgHeight": performers[i]["data-img-height"],
+                "useHeadshot": performers[i]["data-use-headshot"],
+                "useSilhouette": performers[i]["data-use-silhouette"],
+                "showFavoriteLink": performers[i]["data-show-favorite-link"]
+            }
+
+            performer_request = requests.post(domain_name + performers[i]["data-loc"], params=payload)
+            performer_soup = BeautifulSoup(performer_request.text, 'html.parser')
+            movie.performers.append(performer_soup.a.text.strip())
+        else:
+            movie.performers.append(performers[i]["title"])
+
+    # Tags
+    tags = soup.find_all("div", "dts-collection-item dts-collection-item-category")
+
+    for tag in tags:
+        movie.tags.append(tag.text.strip())
+
+    # Description
+    movie.details = soup.find_all("div", "dts-section-page-detail-description-body")[0].text.strip()
+
+    # Cover Images
+    movie.front_cover = "https:" + soup.find_all("img", "dts-modal-boxcover-front")[0]["src"]
+    movie.back_cover = "https:" + soup.find_all("img", "dts-modal-boxcover-back")[0]["src"]
+
+    # Scenes
+    scenes = soup.find_all("section", id=lambda x: x and x.startswith("scene"))
+
+    for scene in scenes:
+        parsed_scene = parse_scene(scene)
+
+        # Title, URL, date, director and studio are inherited from movie
+        parsed_scene.title = movie.title + title_seperator +  "Scene " + str(parsed_scene.scene_nr).zfill(2)
+        parsed_scene.movie = movie
+
+        movie.scenes.append(parsed_scene)
+
+        # Add performers and tags from scenes to movie performers and tags
+        movie.performers = movie.performers + parsed_scene.performers
+        movie.tags = movie.tags + parsed_scene.tags
+
+    # Remove duplicated performers and tags
+    movie.performers = list(dict.fromkeys(movie.performers))
+    movie.tags = list(dict.fromkeys(movie.tags))
+
+    # Get movie duration
+    # For some reason movie quality shares the same class as movie duration
+    entries = soup.find_all("li", "section-detail-list-item-duration")
+
+    for entry in entries:
+        if "Running Time: " in entry.text:
+            movie.duration = entry.text.replace("Running Time: ", "")
+
+    return movie
+
+def parse_performer(url):
+
+    performer = Performer()
+
+    request = requests.get(url)
+    soup = BeautifulSoup(request.text, 'html.parser')
+
+    # Title
+    performer.name = soup.h1.text
+
+    attributes = soup.find_all("span", "section-detail-list-item-title")
+
+    for attribute in attributes:
+        # Gender
+        if "Gender" in attribute.text:
+            performer.gender = attribute.parent.text.replace("Gender: ", "")
+
+        # Birthdate
+        if "Birth Date" in attribute.text:
+            birthdate_str = attribute.parent.text.replace("Birth Date: ", "")
+
+            # Datetime expects "Sep" instead of "Sept"
+            birthdate_str = birthdate_str.replace("Sept", "Sep")
+
+            birthdate = datetime.datetime.strptime(birthdate_str, "%b %d, %Y")
+            performer.birthdate = birthdate.strftime("%Y-%m-%d")
+
+        # Ethnicity
+        if "Ethnicity" in attribute.text:
+            performer.ethnicity = attribute.parent.text.replace("Ethnicity: ", "")
+
+        # Hair Color
+        if "Hair Color" in attribute.text:
+            performer.hair_color = attribute.parent.text.replace("Hair Color: ", "")
+
+        # Eye Color
+        if "Eye Color" in attribute.text:
+            performer.eye_color = attribute.parent.text.replace("Eye Color: ", "")
+
+        # Height
+        if "Height" in attribute.text:
+            height_str = attribute.parent.text
+            match_height = re.search("(\d*) cm", height_str)
+            if match_height:
+                performer.height = match_height.group(1)
+
+        # Weight
+        if "Weight" in attribute.text:
+            weight_str = attribute.parent.text
+            match_weight = re.search("(\d*)kg", weight_str)
+            if match_weight:
+                performer.weight = match_weight.group(1)
+
+    # Details
+    details = soup.find_all("div", "dts-star-bio")
+
+    if len(details) > 0:
+        performer.details = details[0].text
+
+    # Tattoos are given in the details section
+    if "Tattoos: " in performer.details:
+        match_tattoos = re.search("Tattoos: (.*)", performer.details)
+        if match_tattoos:
+            performer.tattoos = match_tattoos.group(1)
+            performer.details = performer.details.replace("Tattoos: " + performer.tattoos, "")
+
+    if "Tattoo: " in performer.details:
+        match_tattoos = re.search("Tattoo: (.*)", performer.details)
+        if match_tattoos:
+            performer.tattoos = match_tattoos.group(1)
+            performer.details = performer.details.replace("Tattoo: " + performer.tattoos, "")
+
+    # Piercings are given in the details section
+    if "Piercings:" in performer.details:
+        match_piercings = re.search("Piercings: (.*)", performer.details)
+        if match_piercings:
+            performer.piercings = match_piercings.group(1)
+            performer.details = performer.details.replace("Piercings: " + performer.piercings, "")
+
+    if "Non-ear piercings:" in performer.details:
+        match_piercings = re.search("Non-ear piercings: (.*)", performer.details)
+        if match_piercings:
+            performer.piercings = match_piercings.group(1)
+            performer.details = performer.details.replace("Non-ear piercings: " + performer.piercings, "")
+
+    # Aliases are given in the details section
+    if "AKA" in performer.details:
+        match_aliases = re.search("AKA (.*)", performer.details)
+        if match_aliases:
+            performer.aliases = match_aliases.group(1)
+            performer.details = performer.details.replace("AKA " + performer.aliases, "")
+
+    if "A.K.A:" in performer.details:
+        match_aliases = re.search("A.K.A: (.*)", performer.details)
+        if match_aliases:
+            performer.aliases = match_aliases.group(1)
+            performer.details = performer.details.replace("A.K.A: " + performer.aliases, "")
+
+    # Remove leading/trailing spaces from performer bio
+    performer.details = performer.details.strip()
+
+    # Image
+    image = soup.find_all("div", "dts-section-page-detail-main-image-wrapper")
+
+    if len(image) > 0:
+        image_url_small = image[0].img.attrs["src"]
+
+        match_image_url = re.search("(.*\.jpg)", image_url_small)
+        if match_image_url:
+            performer.image = "https:" + match_image_url.group(1)
+
+    # URL
+    performer.url = url
+
+    return performer
+
+def build_stash_scene_json(title, date, director, studio, performers, movie, tags, thumbnail, details = ""):
+    """Assemble the scene dict for Stash; reads the module-level `url` and downloads `thumbnail`."""
+    # Download the thumbnail and base64-encode it so it can be embedded as a data URI
+    img = requests.get(thumbnail).content
+    b64img = base64.b64encode(img)
+    utf8img = b64img.decode('utf-8')
+
+    # Build JSON for Stash API
+    json = {}
+    json["title"] = title
+    json["url"] = url
+    if date:  # skip "" (no date scraped) as well as None (dateparser could not parse the string)
+        json["date"] = date.strftime("%Y-%m-%d")
+    json["director"] = director
+    json["studio"] = {"name": studio}
+    json["performers"] = [{"name": performer} for performer in performers]
+    json["movies"] = [build_stash_movie_json(movie, decode_cover=False)]
+    json["tags"] = [{"name": tag} for tag in tags]
+    json["details"] = details
+    json["image"] = "data:image/jpeg;base64," + utf8img
+
+    return json
+
+def build_stash_movie_json(movie, decode_cover=True):
+    """Assemble the movie dict for Stash; cover images are downloaded only when decode_cover is true."""
+    json = {}
+
+    if decode_cover:
+        # Download the front cover and embed it as a base64 data URI
+        front_img = requests.get(movie.front_cover).content
+        front_b64img = base64.b64encode(front_img)
+        front_utf8img = front_b64img.decode('utf-8')
+        json["front_image"] = "data:image/jpeg;base64," + front_utf8img
+
+        # Download the back cover and embed it as a base64 data URI
+        back_img = requests.get(movie.back_cover).content
+        back_b64img = base64.b64encode(back_img)
+        back_utf8img = back_b64img.decode('utf-8')
+        json["back_image"] = "data:image/jpeg;base64," + back_utf8img
+
+    # Build JSON for Stash API
+    json["name"] = movie.title
+    json["duration"] = movie.duration
+    if movie.date:  # skip "" (no date scraped) as well as None (dateparser could not parse the string)
+        json["date"] = movie.date.strftime("%Y-%m-%d")
+    json["studio"] = {"name": movie.studio}
+    json["director"] = movie.director
+    json["url"] = movie.url
+    json["synopsis"] = movie.details
+
+    return json
+
+def build_stash_performer_json(performer):
+    """Turn a parsed performer into the dict sent to Stash, embedding the portrait when one was scraped."""
+    # Each of these attributes is copied verbatim under a JSON key of the same name
+    copied_attributes = (
+        "name",
+        "aliases",
+        "gender",
+        "birthdate",
+        "ethnicity",
+        "hair_color",
+        "eye_color",
+        "height",
+        "weight",
+        "tattoos",
+        "piercings",
+        "url",
+        "details",
+    )
+    json = {key: getattr(performer, key) for key in copied_attributes}
+
+    # An empty image URL means no portrait was scraped, so nothing is downloaded
+    if performer.image != "":
+        encoded = base64.b64encode(requests.get(performer.image).content).decode('utf-8')
+        json["images"] = ["data:image/jpeg;base64," + encoded]
+
+    return json
+
+#Debug
+# url = "https://straight.aebn.com/straight/stars/3090/erik-everhard?fmc=1"
+# sys.argv.append("performer")
+#End Debug
+
+frag = json.loads(sys.stdin.read())
+url = frag["url"]
+
+if not frag['url']:
+    log.error("No URL entered")
+    sys.exit(1)
+
+if len(sys.argv) > 1:
+    if sys.argv[1] == "movie" or sys.argv[1] == "scene":
+        movie = parse_movie(url)
+        if sys.argv[1] == "scene":
+            # A "#scene-<id>" fragment or a trailing "<separator><nr>" picks one scene; otherwise the whole movie is used
+            match_scene_url = re.search(".*#(scene-\d*)$", url)
+            if match_scene_url:
+                scene_id = match_scene_url.group(1)
+                scene_found = False
+                for scene in movie.scenes:
+                    if scene.scene_id == scene_id:
+                        ret = build_stash_scene_json(scene.title, movie.date, movie.director, movie.studio,
+                                                     scene.performers, scene.movie, scene.tags, scene.thumbnail)
+                        scene_found = True
+                if not scene_found:
+                    log.error("Scene not found")
+                    sys.exit()
+
+            match_scene_nr = re.search(".*[" + seperators + "](\d*)$", url)
+            if match_scene_nr:
+                try:
+                    scene_nr = int(match_scene_nr.group(1))
+                except ValueError:
+                    log.error("Scene Nr must be Integer")
+                    sys.exit(1)  # scene_nr is unbound here (URL ended in a bare separator); stop before the NameError
+                scene_found = False
+                for scene in movie.scenes:
+                    if scene.scene_nr == scene_nr:
+                        ret = build_stash_scene_json(scene.title, movie.date, movie.director, movie.studio,
+                                                     scene.performers, scene.movie, scene.tags, scene.thumbnail)
+                        scene_found = True
+                if not scene_found:
+                    log.error("Scene not found")
+                    sys.exit()
+
+            if not match_scene_url and not match_scene_nr:
+                ret = build_stash_scene_json(movie.title, movie.date, movie.director, movie.studio, movie.performers,
+                                             movie, movie.tags, movie.front_cover, movie.details)
+        else:
+            ret = build_stash_movie_json(movie)
+    else:
+        performer = parse_performer(url)
+        ret = build_stash_performer_json(performer)
+
+print(json.dumps(ret))
diff --git a/scrapers/AEBN.yml b/scrapers/AEBN.yml
index 72c27368f..a3de10c31 100644
--- a/scrapers/AEBN.yml
+++ b/scrapers/AEBN.yml
@@ -1,116 +1,26 @@
-name: "AEBN"
-performerByURL:
-  - action: scrapeXPath
-    url:
-      - aebn.com
-    scraper: performerScraper
+name: AEBN_dev
 sceneByURL:
-  - action: scrapeXPath
-    url:
+  - url:
       - aebn.com
-    scraper: sceneScraper
+    action: script
+    script:
+      - python3
+      - AEBN_dev_v02.py
+      - scene
 movieByURL:
-  - action: scrapeXPath
-    url:
+  - url:
+      - aebn.com
+    action: script
+    script:
+      - python3
+      - AEBN_dev_v02.py
+      - movie
+performerByURL:
+  - url:
       - aebn.com
-    scraper: movieScraper
-xPathScrapers:
-  performerScraper:
-    performer:
-      Name: //div[@class='dts-section-page-heading-title']/h1
-      Gender:
-        selector: //li[@class="section-detail-list-item-gender"]/text()
-        postProcess:
-          - map:
-              TS: transgender_female
-      URL: //link[@rel="canonical"]/@href
-      Birthdate:
-        selector: //div[@class='section-detail dts-list-attributes']/ul/li[contains(.,"Birth Date")]
-        postProcess:
-          - replace:
-              - regex: .+:\s(.+)
-                with: $1
-              - regex: "Sept"
-                with: "Sep"
-          - parseDate: Jan 2, 2006
-      Height:
-        selector: //li[@class='section-detail-list-item-height']/text()
-        postProcess:
-          - replace:
-              - regex: .+\((\d+).+\)
-                with: $1
-      Weight:
-        selector: //li[@class='section-detail-list-item-weight']/text()
-        postProcess:
-          - replace:
-              - regex: .+\((\d+).+\)
-                with: $1
-      EyeColor: 
-        selector: //li[@class='section-detail-list-item-eye-color']/text()
-      Ethnicity:
-        selector: //li[@class='section-detail-list-item-ethnicity']/text()
-        postProcess:
-          - map:
-              White: Caucasian
-      HairColor:
-        selector: //li[@class='section-detail-list-item-hair-color']/text()
-      Details:
-        selector: //div[@class='dts-section-page-detail-description-body']  
-      Image: 
-        selector: //div[@class='dts-section-page-detail-main-image-wrapper']/picture/img/@src
-        postProcess:
-          - replace:
-              - regex: ^([^?]+).*$
-                with: "https:$1"
-  sceneScraper:
-    scene:
-      Title: //h1[@class="dts-section-page-heading-title"]|//div[@class="dts-section-page-heading-title"]/h1
-      Date:
-        selector: //li[@class="section-detail-list-item-release-date"]/text()
-        postProcess:
-          - replace:
-              - regex: "Sept"
-                with: "Sep"
-          - parseDate: Jan 2, 2006
-      Details:
-        selector: //div[@class="dts-section-page-detail-description-body"]//text()
-      Performers:
-        Name: //div[@class="dts-star-name-overlay"]/text()
-      Tags:
-        Name: //span[@class="dts-image-display-name"]//text()
-      Image:
-        selector: //picture[@class="dts-movie-boxcover-front"]/img/@src
-        postProcess:
-          - replace:
-              - regex: ^([^?]+).*$
-                with: "https:$1" 
-  movieScraper:
-    movie:
-      Name: //h1[@class="dts-section-page-heading-title"]|//div[@class="dts-section-page-heading-title"]/h1
-      Director:
-        selector: //li[@class='section-detail-list-item-director']//span//a
-        concat: ", "
-      Duration: //li[@class='section-detail-list-item-duration'][contains(span,"Running Time")]/text()
-      Date:
-        selector: //li[@class="section-detail-list-item-release-date"]/text()
-        postProcess:
-          - replace:
-              - regex: "Sept"
-                with: "Sep"
-          - parseDate: Jan 2, 2006
-      Synopsis: //div[@class="dts-section-page-detail-description-body"]//text()
-      Studio:
-        Name: //div[@class='dts-studio-name-wrapper']/a/text()
-      FrontImage:
-        selector: //picture[@class="dts-movie-boxcover-front"]/img/@src
-        postProcess:
-          - replace:
-              - regex: ^([^?]+).*$
-                with: "https:$1"
-      BackImage:
-        selector: //picture[@class="dts-movie-boxcover-back"]/img/@src
-        postProcess:
-          - replace:
-              - regex: ^([^?]+).*$
-                with: "https:$1"
-# Last Updated July 06, 2022
+    action: script
+    script:
+      - python3
+      - AEBN_dev_v02.py
+      - performer
+# Last Updated March 12, 2023

From 5db4721a9b0b740cf39afdaa4cd65fc2a910e2dd Mon Sep 17 00:00:00 2001
From: SirCumAlot1988 <rachelaldanafan88@gmail.com>
Date: Thu, 16 Mar 2023 10:46:42 +0100
Subject: [PATCH 5/6] -Added functionality to scrape performer details and/or
 images during scene scraping -Improved handling of tattoos/piercings/aliases
 during performer scraping -Added handling of transgender performers

---
 scrapers/AEBN.py  | 264 +++++++++++++++++++++++++---------------------
 scrapers/AEBN.yml |   2 +-
 2 files changed, 144 insertions(+), 122 deletions(-)

diff --git a/scrapers/AEBN.py b/scrapers/AEBN.py
index 2527cf198..f7ae5da89 100644
--- a/scrapers/AEBN.py
+++ b/scrapers/AEBN.py
@@ -5,12 +5,22 @@
 import re
 import datetime
 
-# Seperators to append scene nr to url
-seperators = "+.,"
+# Separators to append scene nr to url when sending a request
+request_seperators = "+.,"
 
-# Seperator between movie title and Scene Nr string
+# Separator between movie title and Scene Nr string for auto-generated titles
 title_seperator = ": "
 
+# Settings on how to handle performer scraping during scene scraping
+#
+# If both are set to False, the performer scraper will not be invoked and only performer name and URL are scraped
+# If at least one of them is set to True, the performer scraper will be invoked for each performer of a scene,
+# scraping the complete performer details and/or performer images
+#
+# Note that this will slow down scene scraping significantly if set to True
+scrape_performer_details = False
+scrape_performer_images = False
+
 try:
     import py_common.log as log
 except ModuleNotFoundError:
@@ -48,7 +58,7 @@
 class Scene:
     def __init__(self):
         self.title = ""
-        self.performers = []
+        self.performers = {}
         self.tags = []
         self.thumbnail = ""
         self.scene_nr = ""
@@ -86,7 +96,8 @@ def __init__(self):
         self.details = ""
         self.image = ""
 
-def parse_scene(scene):
+def parse_scene(scene, domain_name):
+
     scene_parsed = Scene()
 
     # Get scene id
@@ -126,7 +137,8 @@ def parse_scene(scene):
     performers = scene.findChildren("span", {"class": "dts-scene-star-wrapper"})
 
     for performer in performers:
-        scene_parsed.performers.append(performer.a.text.strip())
+        performer_name = performer.a.text.strip()
+        scene_parsed.performers[performer_name] = domain_name + performer.a["href"]
 
     return scene_parsed
 
@@ -176,6 +188,7 @@ def parse_movie(url):
     performers = soup.find_all("div", "dts-collection-item dts-collection-item-star")
 
     for i in range(len(performers)):
+
         if "data-loc" in performers[i].attrs:
             # Performer is loaded on demand, if the scrollbar is used. Data is retrieved via POST request
             payload = {
@@ -190,9 +203,13 @@ def parse_movie(url):
 
             performer_request = requests.post(domain_name + performers[i]["data-loc"], params=payload)
             performer_soup = BeautifulSoup(performer_request.text, 'html.parser')
-            movie.performers.append(performer_soup.a.text.strip())
+            performer_name = performer_soup.a.text.strip()
+            performer_url = domain_name + performer_soup.a["href"]
         else:
-            movie.performers.append(performers[i]["title"])
+            performer_name = performers[i]["title"]
+            performer_url = domain_name + performers[i].a["href"]
+
+        movie.performers[performer_name] = performer_url
 
     # Tags
     tags = soup.find_all("div", "dts-collection-item dts-collection-item-category")
@@ -211,20 +228,19 @@ def parse_movie(url):
     scenes = soup.find_all("section", id=lambda x: x and x.startswith("scene"))
 
     for scene in scenes:
-        parsed_scene = parse_scene(scene)
+        parsed_scene = parse_scene(scene, domain_name)
 
-        # Title, URL, date, director and studio are inherited from movie
-        parsed_scene.title = movie.title + title_seperator +  "Scene " + str(parsed_scene.scene_nr).zfill(2)
+        # Add scene title and movie
+        parsed_scene.title = movie.title + title_seperator + "Scene " + str(parsed_scene.scene_nr).zfill(2)
         parsed_scene.movie = movie
 
         movie.scenes.append(parsed_scene)
 
         # Add performers and tags from scenes to movie performers and tags
-        movie.performers = movie.performers + parsed_scene.performers
+        movie.performers.update(parsed_scene.performers)
         movie.tags = movie.tags + parsed_scene.tags
 
-    # Remove duplicated performers and tags
-    movie.performers = list(dict.fromkeys(movie.performers))
+    # Remove duplicated tags
     movie.tags = list(dict.fromkeys(movie.tags))
 
     # Get movie duration
@@ -237,7 +253,7 @@ def parse_movie(url):
 
     return movie
 
-def parse_performer(url):
+def parse_performer(url, scrape_performer_details=True, scrape_performer_image=True):
 
     performer = Performer()
 
@@ -247,109 +263,114 @@ def parse_performer(url):
     # Title
     performer.name = soup.h1.text
 
-    attributes = soup.find_all("span", "section-detail-list-item-title")
-
-    for attribute in attributes:
-        # Gender
-        if "Gender" in attribute.text:
-            performer.gender = attribute.parent.text.replace("Gender: ", "")
-
-        # Birthdate
-        if "Birth Date" in attribute.text:
-            birthdate_str = attribute.parent.text.replace("Birth Date: ", "")
-
-            # Datetime expects "Sep" instead of "Sept"
-            birthdate_str = birthdate_str.replace("Sept", "Sep")
-
-            birthdate = datetime.datetime.strptime(birthdate_str, "%b %d, %Y")
-            performer.birthdate = birthdate.strftime("%Y-%m-%d")
-
-        # Ethnicity
-        if "Ethnicity" in attribute.text:
-            performer.ethnicity = attribute.parent.text.replace("Ethnicity: ", "")
-
-        # Hair Color
-        if "Hair Color" in attribute.text:
-            performer.hair_color = attribute.parent.text.replace("Hair Color: ", "")
-
-        # Eye Color
-        if "Eye Color" in attribute.text:
-            performer.eye_color = attribute.parent.text.replace("Eye Color: ", "")
-
-        # Height
-        if "Height" in attribute.text:
-            height_str = attribute.parent.text
-            match_height = re.search("(\d*) cm", height_str)
-            if match_height:
-                performer.height = match_height.group(1)
-
-        # Weight
-        if "Weight" in attribute.text:
-            weight_str = attribute.parent.text
-            match_weight = re.search("(\d*)kg", weight_str)
-            if match_weight:
-                performer.weight = match_weight.group(1)
-
-    # Details
-    details = soup.find_all("div", "dts-star-bio")
-
-    if len(details) > 0:
-        performer.details = details[0].text
-
-    # Tattoos are given in the details section
-    if "Tattoos: " in performer.details:
-        match_tattoos = re.search("Tattoos: (.*)", performer.details)
-        if match_tattoos:
-            performer.tattoos = match_tattoos.group(1)
-            performer.details = performer.details.replace("Tattoos: " + performer.tattoos, "")
-
-    if "Tattoo: " in performer.details:
-        match_tattoos = re.search("Tattoo: (.*)", performer.details)
-        if match_tattoos:
-            performer.tattoos = match_tattoos.group(1)
-            performer.details = performer.details.replace("Tattoo: " + performer.tattoos, "")
-
-    # Piercings are given in the details section
-    if "Piercings:" in performer.details:
-        match_piercings = re.search("Piercings: (.*)", performer.details)
-        if match_piercings:
-            performer.piercings = match_piercings.group(1)
-            performer.details = performer.details.replace("Piercings: " + performer.piercings, "")
-
-    if "Non-ear piercings:" in performer.details:
-        match_piercings = re.search("Non-ear piercings: (.*)", performer.details)
-        if match_piercings:
-            performer.piercings = match_piercings.group(1)
-            performer.details = performer.details.replace("Non-ear piercings: " + performer.piercings, "")
-
-    # Aliases are given in the details section
-    if "AKA" in performer.details:
-        match_aliases = re.search("AKA (.*)", performer.details)
-        if match_aliases:
-            performer.aliases = match_aliases.group(1)
-            performer.details = performer.details.replace("AKA " + performer.aliases, "")
-
-    if "A.K.A:" in performer.details:
-        match_aliases = re.search("A.K.A: (.*)", performer.details)
-        if match_aliases:
-            performer.aliases = match_aliases.group(1)
-            performer.details = performer.details.replace("A.K.A: " + performer.aliases, "")
-
-    # Remove leading/trailing spaces from performer bio
-    performer.details = performer.details.strip()
+    # URL
+    performer.url = url
 
-    # Image
-    image = soup.find_all("div", "dts-section-page-detail-main-image-wrapper")
+    if scrape_performer_details:
+        attributes = soup.find_all("span", "section-detail-list-item-title")
+
+        for attribute in attributes:
+            # Gender
+            if "Gender" in attribute.text:
+                performer.gender = attribute.parent.text.replace("Gender: ", "")
+
+                # Handle Transsexual performers
+                if performer.gender == "TS": performer.gender = "Transgender Female"
+
+            # Birthdate
+            if "Birth Date" in attribute.text:
+                birthdate_str = attribute.parent.text.replace("Birth Date: ", "")
+
+                # Datetime expects "Sep" instead of "Sept"
+                birthdate_str = birthdate_str.replace("Sept", "Sep")
+
+                birthdate = datetime.datetime.strptime(birthdate_str, "%b %d, %Y")
+                performer.birthdate = birthdate.strftime("%Y-%m-%d")
+
+            # Ethnicity
+            if "Ethnicity" in attribute.text:
+                performer.ethnicity = attribute.parent.text.replace("Ethnicity: ", "")
+
+            # Hair Color
+            if "Hair Color" in attribute.text:
+                performer.hair_color = attribute.parent.text.replace("Hair Color: ", "")
+
+            # Eye Color
+            if "Eye Color" in attribute.text:
+                performer.eye_color = attribute.parent.text.replace("Eye Color: ", "")
+
+            # Height
+            if "Height" in attribute.text:
+                height_str = attribute.parent.text
+                match_height = re.search("(\d*) cm", height_str)
+                if match_height:
+                    performer.height = match_height.group(1)
+
+            # Weight
+            if "Weight" in attribute.text:
+                weight_str = attribute.parent.text
+                match_weight = re.search("(\d*)kg", weight_str)
+                if match_weight:
+                    performer.weight = match_weight.group(1)
+
+        # Details
+        details = soup.find_all("div", "dts-star-bio")
+
+        if len(details) > 0:
+            performer.details = details[0].text
+
+        # Tattoos are given in the details section
+        tattoo_keywords = ["Tattoos", "Tattoo"]
+
+        for keyword in tattoo_keywords:
+            if keyword.lower() in performer.details.lower():
+                pattern = re.compile(keyword + ":* ([^)\n]*)", re.IGNORECASE)
+                match_tattoos = re.search(pattern, performer.details)
+                if match_tattoos:
+                    performer.tattoos = match_tattoos.group(1)
+                    performer.details = pattern.sub("", performer.details)
+
+        # Piercings are given in the details section
+        piercing_keywords = ["Non-Ear piercings", "Piercings"]
+
+        for keyword in piercing_keywords:
+            if keyword.lower() in performer.details.lower():
+                pattern = re.compile(keyword + ":* ([^)\n]*)", re.IGNORECASE)
+                match_piercings = re.search(pattern, performer.details)
+                if match_piercings:
+                    performer.piercings = match_piercings.group(1)
+                    performer.details = pattern.sub("", performer.details)
+
+        # Aliases are given in the details section
+        alias_keywords = ["A.K.A", "AKA"]
+
+        for keyword in alias_keywords:
+            if keyword.lower() in performer.details.lower():
+                pattern = re.compile(keyword + ":* ([^)\n]*)", re.IGNORECASE)
+                match_aliases = re.search(pattern, performer.details)
+                if match_aliases:
+                    performer.aliases = match_aliases.group(1)
+                    performer.details = pattern.sub("", performer.details)
+
+        # Remove leading/trailing spaces from performer bio
+        performer.details = performer.details.strip()
+
+        # Remove "()" which might result from tattoo/piercing/alias replacements above
+        performer.details = performer.details.replace("()", "")
+
+        # Remove double spaces which might result from tattoo/piercing/alias replacements above
+        performer.details = performer.details.replace("  ", " ")
 
-    if len(image) > 0:
-        image_url_small = image[0].img.attrs["src"]
+    # Image
+    if scrape_performer_image:
+        image = soup.find_all("div", "dts-section-page-detail-main-image-wrapper")
 
-        match_image_url = re.search("(.*\.jpg)", image_url_small)
-        if match_image_url:
-            performer.image = "https:" + match_image_url.group(1)
+        if len(image) > 0:
+            image_url_small = image[0].img.attrs["src"]
 
-    # URL
-    performer.url = url
+            match_image_url = re.search("(.*\.jpg)", image_url_small)
+            if match_image_url:
+                performer.image = "https:" + match_image_url.group(1)
 
     return performer
 
@@ -368,7 +389,13 @@ def build_stash_scene_json(title, date, director, studio, performers, movie, tag
         json["date"] = date.strftime("%Y-%m-%d")
     json["director"] = director
     json["studio"] = {"name": studio}
-    json["performers"] = [{"name": performer} for performer in performers]
+
+    # Build performer json
+    json["performers"] = []
+    for performer_name in performers.keys():
+        parsed_performer = parse_performer(performers[performer_name], scrape_performer_details, scrape_performer_images)
+        json["performers"].append(build_stash_performer_json(parsed_performer))
+
     json["movies"] = [build_stash_movie_json(movie, decode_cover=False)]
     json["tags"] = [{"name": tag} for tag in tags]
     json["details"] = details
@@ -432,11 +459,6 @@ def build_stash_performer_json(performer):
 
     return json
 
-#Debug
-# url = "https://straight.aebn.com/straight/stars/3090/erik-everhard?fmc=1"
-# sys.argv.append("performer")
-#End Debug
-
 frag = json.loads(sys.stdin.read())
 url = frag["url"]
 
@@ -462,7 +484,7 @@ def build_stash_performer_json(performer):
                     log.error("Scene not found")
                     sys.exit()
 
-            match_scene_nr = re.search(".*[" + seperators + "](\d*)$", url)
+            match_scene_nr = re.search(".*[" + request_seperators + "](\d*)$", url)
             if match_scene_nr:
                 try:
                     scene_nr = int(match_scene_nr.group(1))
diff --git a/scrapers/AEBN.yml b/scrapers/AEBN.yml
index a3de10c31..514720b46 100644
--- a/scrapers/AEBN.yml
+++ b/scrapers/AEBN.yml
@@ -23,4 +23,4 @@ performerByURL:
       - python3
       - AEBN_dev_v02.py
       - performer
-# Last Updated March 12, 2023
+# Last Updated March 16, 2023

From 2f7eb20021ed351a1abe42aa08f94ee4fffcc49a Mon Sep 17 00:00:00 2001
From: SirCumAlot1988 <rachelaldanafan88@gmail.com>
Date: Thu, 16 Mar 2023 12:14:03 +0100
Subject: [PATCH 6/6] Fixed AEBN.yml to include the correct .py file

---
 scrapers/AEBN.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scrapers/AEBN.yml b/scrapers/AEBN.yml
index 514720b46..ba4500ff4 100644
--- a/scrapers/AEBN.yml
+++ b/scrapers/AEBN.yml
@@ -1,11 +1,11 @@
-name: AEBN_dev
+name: AEBN
 sceneByURL:
   - url:
       - aebn.com
     action: script
     script:
       - python3
-      - AEBN_dev_v02.py
+      - AEBN.py
       - scene
 movieByURL:
   - url:
@@ -13,7 +13,7 @@ movieByURL:
     action: script
     script:
       - python3
-      - AEBN_dev_v02.py
+      - AEBN.py
       - movie
 performerByURL:
   - url:
@@ -21,6 +21,6 @@ performerByURL:
     action: script
     script:
       - python3
-      - AEBN_dev_v02.py
+      - AEBN.py
       - performer
 # Last Updated March 16, 2023