Skip to content

Commit

Permalink
lint and cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
mbrandt00 committed Nov 17, 2024
1 parent ed1908f commit d7401c7
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 171 deletions.
13 changes: 13 additions & 0 deletions data_ingest/tests/test_imslp_collection_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pytest
from bs4 import BeautifulSoup

from utils import get_composer_collection_objects


# Feeds a saved copy of the Beethoven collections page (parsed with
# BeautifulSoup) to get_composer_collection_objects and checks the result.
# NOTE(review): imslp_scraping.py in this same commit declares
# get_composer_collection_objects(base_url: str) -> List[str] and hands its
# argument to driver.get(). This test passes a BeautifulSoup object instead of
# a URL string and expects the literal True, which a list can never be.
# Looks like a placeholder - confirm the intended contract before relying on it.
def test_getcomposer_collection_objects():
with open("tests/scrape_responses/beethoven_collections_view.html", "r") as file:
html_content = file.read()
soup = BeautifulSoup(html_content, "html.parser")
collection = get_composer_collection_objects(soup)

# Identity check against True: only passes if the function returns the bool True.
assert collection is True
45 changes: 25 additions & 20 deletions data_ingest/tests/test_parse_movements.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pytest
from bs4 import BeautifulSoup

from utils import parse_movements
Expand All @@ -9,38 +8,44 @@ def test_parse_piece_sections():
html_content = file.read()
soup = BeautifulSoup(html_content, "html.parser")
result = parse_movements(soup)
assert isinstance( result, list)
assert isinstance(result, list)
# key signature
assert result[0]['key_signature'] == 'c'
assert result[1]['key_signature'] == 'bflatminor'
assert result[0]["key_signature"] == "c"
assert result[1]["key_signature"] == "bflatminor"
# number
assert result[0]['number'] == 1
assert result[1]['number'] == 2
assert result[0]["number"] == 1
assert result[1]["number"] == 2
# clean name without number
assert result[0]['name'] == 'Allegro vivace'
assert result[1]['name'] == 'Allegretto'
assert result[0]["name"] == "Allegro vivace"
assert result[1]["name"] == "Allegretto"

# url
print(result)
assert result[0]['download_url'] == 'https://imslp.org/wiki/Special:ImagefromIndex/309270'
assert result[1]['download_url'] == 'https://imslp.org/wiki/Special:ImagefromIndex/309271'
assert (
result[0]["download_url"]
== "https://imslp.org/wiki/Special:ImagefromIndex/309270"
)
assert (
result[1]["download_url"]
== "https://imslp.org/wiki/Special:ImagefromIndex/309271"
)


# Parses the saved Chopin cello sonata page and checks the first two entries
# returned by parse_movements: key signature, movement number, cleaned name,
# and download URL.
def test_parse_piece_movements():
with open('tests/scrape_responses/chopin_cello_sonata.html') as file:
with open("tests/scrape_responses/chopin_cello_sonata.html") as file:
html_content = file.read()
soup = BeautifulSoup(html_content, "html.parser")
result = parse_movements(soup)
# key signature
assert result[0]['key_signature'] == 'gminor'
assert result[1]['key_signature'] == 'dminor'
assert result[0]["key_signature"] == "gminor"
assert result[1]["key_signature"] == "dminor"
# number
assert result[0]['number'] == 1
assert result[1]['number'] == 2
assert result[0]["number"] == 1
assert result[1]["number"] == 2
# clean name without number
assert result[0]['name'] == 'Allegro moderato'
assert result[1]['name'] == 'Scherzo'
assert result[0]["name"] == "Allegro moderato"
assert result[1]["name"] == "Scherzo"

# url: for this fixture neither movement is expected to carry a download link
print(result)
assert result[0]['download_url'] is None
assert result[1]['download_url'] is None

assert result[0]["download_url"] is None
assert result[1]["download_url"] is None
101 changes: 0 additions & 101 deletions data_ingest/utils.py

This file was deleted.

5 changes: 2 additions & 3 deletions data_ingest/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .imslp_scraping import get_composer_collection_objects
from .movements import parse_movements

__all__ = [
'parse_movements',
]
__all__ = ["parse_movements", "get_composer_collection_objects"]
42 changes: 20 additions & 22 deletions data_ingest/utils/imslp_scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,23 @@
# Fetch the IMSLP category page for `composer_name` and return it parsed with
# BeautifulSoup's "html.parser". Raises requests.HTTPError (via
# raise_for_status) when the response status is 4xx/5xx.
def get_collection_page(composer_name: str) -> BeautifulSoup:
base_url = "https://imslp.org/wiki/Category:"
# Percent-encode the composer name so it can be appended to the category URL
encoded_name = quote(composer_name, safe=",") # safe="," keeps the comma literal (it is NOT turned into '%2C'); spaces, "/" and other reserved characters are percent-encoded

encoded_name = quote(
composer_name, safe=","
) # safe="," keeps the comma literal (it is NOT turned into '%2C'); spaces, "/" and other reserved characters are percent-encoded

full_url = f"{base_url}{encoded_name}"
response = requests.get(full_url)
response.raise_for_status() # Raise on 4xx/5xx instead of parsing an error page
return BeautifulSoup(response.text, 'html.parser')

return BeautifulSoup(response.text, "html.parser")


def get_composer_collection_objects(base_url: str) -> List[str]:
options = Options()
options.add_argument('--headless') # Run in headless mode (no browser UI)
driver = webdriver.Chrome(
service=Service(
ChromeDriverManager().install()
),
options=options
)
options.add_argument("--headless") # Run in headless mode (no browser UI)
driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()), options=options
)

# Open the URL directly in Selenium
driver.get(base_url)
Expand All @@ -48,8 +47,9 @@ def get_composer_collection_objects(base_url: str) -> List[str]:

print(f"COLLECTION LINK: {collection_link}")
# Click the collection link
collection_container_id = urlparse(collection_link.get_attribute('href')).fragment

collection_container_id = urlparse(
collection_link.get_attribute("href")
).fragment

print(collection_container_id)
collection_link.click()
Expand All @@ -58,16 +58,16 @@ def get_composer_collection_objects(base_url: str) -> List[str]:
time.sleep(1)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
soup = BeautifulSoup(page_source, "html.parser")
links = soup.find(id=collection_container_id)
if isinstance(links, Tag):
links = links.find_all('a', class_ = "categorypagelink")
links = links.find_all("a", class_="categorypagelink")
for link in links:
href = link.get('href')
if href:
href = link.get("href")
if href:
collection_objects.append(href)
else:
print('not found')
else:
print("not found")
print(collection_objects)

except Exception as e:
Expand All @@ -81,9 +81,7 @@ def get_composer_collection_objects(base_url: str) -> List[str]:
# Example usage of get_collection_page (kept for reference):
# composer_name = "Beethoven, Ludwig van"  # Replace with the composer you're looking for
# soup = get_collection_page(composer_name)

# Category page used by the manual run below.
url = "https://imslp.org/wiki/Category:Beethoven,_Ludwig_van"

# Only scrape when this file is executed directly. utils/__init__.py imports
# this module, so an unguarded call here would start a headless Chrome session
# and hit imslp.org every time `utils` is imported (for example by the tests).
if __name__ == "__main__":
    collection_objects = get_composer_collection_objects(url)

# Print out the collection objects for debugging


Loading

0 comments on commit d7401c7

Please sign in to comment.