lint and cleanup

mbrandt00 · Nov 17, 2024 · d7401c7 · d7401c7
1 parent ed1908f
commit d7401c7
Show file tree

Hide file tree

Showing 7 changed files with 270 additions and 171 deletions.
diff --git a/data_ingest/tests/test_imslp_collection_scraper.py b/data_ingest/tests/test_imslp_collection_scraper.py
@@ -0,0 +1,13 @@
+import pytest
+from bs4 import BeautifulSoup
+
+from utils import get_composer_collection_objects
+
+
+def test_getcomposer_collection_objects():
+    with open("tests/scrape_responses/beethoven_collections_view.html", "r") as file:
+        html_content = file.read()
+        soup = BeautifulSoup(html_content, "html.parser")
+        collection = get_composer_collection_objects(soup)
+
+    assert collection is True
diff --git a/data_ingest/tests/test_parse_movements.py b/data_ingest/tests/test_parse_movements.py
@@ -1,4 +1,3 @@
-import pytest
 from bs4 import BeautifulSoup
 
 from utils import parse_movements
@@ -9,38 +8,44 @@ def test_parse_piece_sections():
         html_content = file.read()
         soup = BeautifulSoup(html_content, "html.parser")
         result = parse_movements(soup)
-        assert isinstance( result, list)
+        assert isinstance(result, list)
         # key signature
-        assert result[0]['key_signature'] == 'c'
-        assert result[1]['key_signature'] == 'bflatminor'
+        assert result[0]["key_signature"] == "c"
+        assert result[1]["key_signature"] == "bflatminor"
         # number
-        assert result[0]['number'] == 1
-        assert result[1]['number'] == 2
+        assert result[0]["number"] == 1
+        assert result[1]["number"] == 2
         # clean name without number
-        assert result[0]['name'] == 'Allegro vivace'
-        assert result[1]['name'] == 'Allegretto'
+        assert result[0]["name"] == "Allegro vivace"
+        assert result[1]["name"] == "Allegretto"
 
         # url
         print(result)
-        assert result[0]['download_url'] == 'https://imslp.org/wiki/Special:ImagefromIndex/309270'
-        assert result[1]['download_url'] == 'https://imslp.org/wiki/Special:ImagefromIndex/309271'
+        assert (
+            result[0]["download_url"]
+            == "https://imslp.org/wiki/Special:ImagefromIndex/309270"
+        )
+        assert (
+            result[1]["download_url"]
+            == "https://imslp.org/wiki/Special:ImagefromIndex/309271"
+        )
+
 
 def test_parse_piece_movements():
-    with open('tests/scrape_responses/chopin_cello_sonata.html') as file:
+    with open("tests/scrape_responses/chopin_cello_sonata.html") as file:
         html_content = file.read()
         soup = BeautifulSoup(html_content, "html.parser")
         result = parse_movements(soup)
-        assert result[0]['key_signature'] == 'gminor'
-        assert result[1]['key_signature'] == 'dminor'
+        assert result[0]["key_signature"] == "gminor"
+        assert result[1]["key_signature"] == "dminor"
         # number
-        assert result[0]['number'] == 1
-        assert result[1]['number'] == 2
+        assert result[0]["number"] == 1
+        assert result[1]["number"] == 2
         # clean name without number
-        assert result[0]['name'] == 'Allegro moderato'
-        assert result[1]['name'] == 'Scherzo'
+        assert result[0]["name"] == "Allegro moderato"
+        assert result[1]["name"] == "Scherzo"
 
         # url
         print(result)
-        assert result[0]['download_url'] is None
-        assert result[1]['download_url'] is None
-
+        assert result[0]["download_url"] is None
+        assert result[1]["download_url"] is None
diff --git a/data_ingest/utils.py b/data_ingest/utils.py
diff --git a/data_ingest/utils/__init__.py b/data_ingest/utils/__init__.py
@@ -1,5 +1,4 @@
+from .imslp_scraping import get_composer_collection_objects
 from .movements import parse_movements
 
-__all__ = [
-    'parse_movements',
-]
+__all__ = ["parse_movements", "get_composer_collection_objects"]
diff --git a/data_ingest/utils/imslp_scraping.py b/data_ingest/utils/imslp_scraping.py
@@ -15,24 +15,23 @@
 def get_collection_page(composer_name: str) -> BeautifulSoup:
     base_url = "https://imslp.org/wiki/Category:"
     # Encode the composer name correctly using quote
-    encoded_name = quote(composer_name, safe=",")  # This will encode the comma and leave it as '%2C'
-
+    encoded_name = quote(
+        composer_name, safe=","
+    )  # safe="," keeps the comma literal (not '%2C'); spaces and other reserved characters are percent-encoded
+
     full_url = f"{base_url}{encoded_name}"
     response = requests.get(full_url)
     response.raise_for_status()  # Ensure successful request
-    
-    return BeautifulSoup(response.text, 'html.parser')
+
+    return BeautifulSoup(response.text, "html.parser")
 
 
 def get_composer_collection_objects(base_url: str) -> List[str]:
     options = Options()
-    options.add_argument('--headless')  # Run in headless mode (no browser UI)
-    driver = webdriver.Chrome( 
-    service=Service(
-        ChromeDriverManager().install()
-    ),
-    options=options
-)
+    options.add_argument("--headless")  # Run in headless mode (no browser UI)
+    driver = webdriver.Chrome(
+        service=Service(ChromeDriverManager().install()), options=options
+    )
 
     # Open the URL directly in Selenium
     driver.get(base_url)
@@ -48,8 +47,9 @@ def get_composer_collection_objects(base_url: str) -> List[str]:
 
         print(f"COLLECTION LINK: {collection_link}")
         # Click the collection link
-        collection_container_id = urlparse(collection_link.get_attribute('href')).fragment
-
+        collection_container_id = urlparse(
+            collection_link.get_attribute("href")
+        ).fragment
 
         print(collection_container_id)
         collection_link.click()
@@ -58,16 +58,16 @@ def get_composer_collection_objects(base_url: str) -> List[str]:
         time.sleep(1)
 
         page_source = driver.page_source
-        soup = BeautifulSoup(page_source, 'html.parser')
+        soup = BeautifulSoup(page_source, "html.parser")
         links = soup.find(id=collection_container_id)
         if isinstance(links, Tag):
-            links = links.find_all('a', class_ = "categorypagelink")
+            links = links.find_all("a", class_="categorypagelink")
             for link in links:
-                href = link.get('href')
-                if href: 
+                href = link.get("href")
+                if href:
                     collection_objects.append(href)
-        else: 
-            print('not found')
+        else:
+            print("not found")
         print(collection_objects)
 
     except Exception as e:
@@ -81,9 +81,7 @@ def get_composer_collection_objects(base_url: str) -> List[str]:
 # # Example usage:
 # composer_name = "Beethoven, Ludwig van"  # Replace with the composer you're looking for
 # soup = get_collection_page(composer_name)
-url = 'https://imslp.org/wiki/Category:Beethoven,_Ludwig_van'
+url = "https://imslp.org/wiki/Category:Beethoven,_Ludwig_van"
 collection_objects = get_composer_collection_objects(url)
 
 # Print out the collection objects for debugging
-
-