From ed1908f189544a29ba798db80691d01b1b255b17 Mon Sep 17 00:00:00 2001 From: Michael Brandt Date: Sat, 16 Nov 2024 23:27:34 -0700 Subject: [PATCH] WIP collection iteration with selenium --- data_ingest/pyproject.toml | 1 + .../beethoven_collections_view.html | 6619 +++++++++++++++++ data_ingest/utils/imslp_scraping.py | 89 + 3 files changed, 6709 insertions(+) create mode 100644 data_ingest/tests/scrape_responses/beethoven_collections_view.html create mode 100644 data_ingest/utils/imslp_scraping.py diff --git a/data_ingest/pyproject.toml b/data_ingest/pyproject.toml index 6bae1d1..d69f35a 100644 --- a/data_ingest/pyproject.toml +++ b/data_ingest/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "imslp>=0.2.3", "pytest>=8.3.3", "ruff>=0.7.4", + "selenium>=4.26.1", ] [tool.pytest.ini_options] pythonpath = ["."] diff --git a/data_ingest/tests/scrape_responses/beethoven_collections_view.html b/data_ingest/tests/scrape_responses/beethoven_collections_view.html new file mode 100644 index 0000000..48fbbd3 --- /dev/null +++ b/data_ingest/tests/scrape_responses/beethoven_collections_view.html @@ -0,0 +1,6619 @@ + + + + Category:Beethoven, Ludwig van - IMSLP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+
+
+
+
+ + +

+ Category:Beethoven, Ludwig van +

+
+
+
+
+
+
+ + Ludwig van Beethoven + +
+
+ + Signature + +
+
+
+

+ + Ludwig van Beethoven + +

+ (16 December 1770 — 26 March 1827) +
+ +

+ + External links + +

+ +

+ + See also + +

+ +

+ + Miscellaneous information + +

+ +
+
+
+
+
+
+ + + + +
+
+ +
+

+ Compositions (359) +

+
+
+

+ Compositions by: Beethoven, Ludwig van +

+

+ The following + + 200 + + pages are in this category, out of + + 359 + + total. + + + + + 🔀 + + + 📻 + + +

+
+ (no previous) ( + + next 159 + + ) +
+
+ + + + + + + +
+

+ A +

+ +

+ B +

+ +

+ C +

+ +

+ D +

+ +

+ E +

+ +

+ F +

+ +

+ G +

+ +

+ H +

+ +
+

+ I +

+ +

+ J +

+ +

+ K +

+ +

+ L +

+ +

+ M +

+ +

+ N +

+ +

+ O +

+ +

+ P +

+ +
+
+
+ (no previous) ( + + next 159 + + ) +
+
+ + +
+
+
+

+ Collaborations (1) +

+
+
+

+ Collaborations with: Beethoven, Ludwig van +

+

+ The following + + 1 + + pages are in this category, out of + + 1 + + total. + + +

+
+ + + + + + +
+

+ J +

+ +
+
+
+ + +
+
+
+

+ Pasticcios (1) +

+
+
+

+ Pasticcios by or with: Beethoven, Ludwig van +

+

+ The following + + 1 + + pages are in this category, out of + + 1 + + total. + + +

+
+ + + + + + +
+

+ M +

+ +
+
+
+ + +
+
+
+

+ Collections (223) +

+
+
+

+ Collections by or with: Beethoven, Ludwig van +

+

+ The following + + 200 + + pages are in this category, out of + + 223 + + total. + + + + + 🔀 + + + 📻 + + +

+
+ (no previous) ( + + next 23 + + ) +
+
+ + + + + + + +
+

+

+ +

+ A +

+ +

+ B +

+ +

+ C +

+ +

+ D +

+ +

+ E +

+ +

+ F +

+ +

+ G +

+ +

+ H +

+ +

+ I +

+ +

+ K +

+ +

+ L +

+ +

+ M +

+ +
+

+ M cont. +

+ +

+ N +

+ +

+ O +

+ +

+ P +

+ +

+ Q +

+ +

+ R +

+ +

+ S +

+ +

+ T +

+ +
+
+
+ (no previous) ( + + next 23 + + ) +
+
+ + +
+
+
+

+ As Arranger (10) +

+
+
+

+ Arrangements by: Beethoven, Ludwig van +

+

+ The following + + 10 + + pages are in this category, out of + + 10 + + total. + + + + + 🔀 + + + 📻 + + +

+ +
+ + +
+
+
+

+ As Copyist (1) +

+
+
+

+ Works copied by: Beethoven, Ludwig van +

+

+ The following + + 1 + + pages are in this category, out of + + 1 + + total. + + +

+ +
+ + +
+
+
+

+ As Dedicatee (18) +

+
+
+

+ Works dedicated to: Beethoven, Ludwig van +

+

+ The following + + 18 + + pages are in this category, out of + + 18 + + total. + + + + + 🔀 + + + 📻 + + +

+ +
+ + +
+
+
+

+ Books (4) +

+
+
+

+ Books by: Beethoven, Ludwig van +

+

+ The following + + 4 + + pages are in this category, out of + + 4 + + total. + + + + + 🔀 + + +

+ +
+ + +
+
+
+
+ +
+
+
+
+ +
+ +
+
+
+ +
+ +
"""Scraping helpers for IMSLP composer category pages."""
import time
from typing import List
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup, Tag
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


def get_collection_page(composer_name: str) -> BeautifulSoup:
    """Fetch and parse the IMSLP category page for *composer_name*.

    Args:
        composer_name: Composer name as it appears in the category title,
            e.g. ``"Beethoven, Ludwig van"``.

    Returns:
        The response body parsed with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    base_url = "https://imslp.org/wiki/Category:"
    # safe="," keeps commas literal; every other character outside the
    # unreserved set (spaces, slashes, ...) is percent-encoded.
    encoded_name = quote(composer_name, safe=",")

    full_url = f"{base_url}{encoded_name}"
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(full_url, timeout=30)
    response.raise_for_status()

    return BeautifulSoup(response.text, "html.parser")


def _extract_collection_links(driver) -> List[str]:
    """Click the 'Collections' link on the loaded page and collect its hrefs.

    Best-effort: any error raised while locating, clicking or parsing is
    printed and whatever was collected so far is returned.
    """
    collection_objects: List[str] = []
    try:
        collection_link = driver.find_element(
            By.XPATH, "//a[contains(text(), 'Collections')]"
        )

        # Bring the link into view before clicking it.
        ActionChains(driver).move_to_element(collection_link).perform()

        print(f"COLLECTION LINK: {collection_link}")
        # The link's URL fragment is used as the id of the container element
        # to look up after the click. get_attribute may return None, so fall
        # back to an empty string rather than handing None to urlparse.
        link_href = collection_link.get_attribute("href") or ""
        collection_container_id = urlparse(link_href).fragment

        print(collection_container_id)
        collection_link.click()

        # Give the page a moment to render the content shown by the click.
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        container = (
            soup.find(id=collection_container_id)
            if collection_container_id
            else None
        )
        if isinstance(container, Tag):
            for anchor in container.find_all("a", class_="categorypagelink"):
                href = anchor.get("href")
                if href:
                    collection_objects.append(href)
        else:
            print("not found")
        print(collection_objects)
    except Exception as e:  # deliberate best-effort boundary: report, don't crash
        print(f"Error: {e}")

    return collection_objects


def get_composer_collection_objects(base_url: str) -> List[str]:
    """Return the collection link hrefs found on a composer category page.

    Opens *base_url* in headless Chrome, clicks the link whose text contains
    "Collections", and gathers the ``href`` of every ``a.categorypagelink``
    inside the element whose id equals that link's URL fragment.

    Args:
        base_url: URL of the composer category page to open.

    Returns:
        The ``href`` attribute values as they appear in the page source;
        empty if the container is not found or scraping fails.
    """
    options = Options()
    options.add_argument("--headless")  # no browser UI
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        driver.get(base_url)
        return _extract_collection_links(driver)
    finally:
        # Always shut the browser down, even when navigation itself fails,
        # so no Chrome/chromedriver process is left behind.
        driver.quit()


if __name__ == "__main__":
    # Manual debugging run; guarded so that importing this module does not
    # start a browser. The scraped links are printed by the function itself.
    #
    # Alternative entry point using plain requests:
    #   soup = get_collection_page("Beethoven, Ludwig van")
    url = "https://imslp.org/wiki/Category:Beethoven,_Ludwig_van"
    collection_objects = get_composer_collection_objects(url)