Skip to content

Commit

Permalink
add backoff and retry mechanisms to fetch_imslp
Browse the repository at this point in the history
  • Loading branch information
mbrandt00 committed Jan 16, 2025
1 parent 4bba8b7 commit 166ff43
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 24 deletions.
74 changes: 53 additions & 21 deletions data_ingest/utils/fetch_imslp.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import logging
import os
import time

import polars as pl
from imslp_scraping import get_all_composer_pieces, get_composer_url
Expand All @@ -26,8 +27,6 @@
)
logger = logging.getLogger(__name__)

# short_composers = ["Scriabin, Aleksandr"]

composers = [
"Bach, Johann Sebastian",
"Mozart, Wolfgang Amadeus",
Expand Down Expand Up @@ -108,30 +107,63 @@
"Chopin, Frédéric",
"Scriabin, Aleksandr",
]

pieces = []
for composer in composers:
logger.info("Starting import for %s", composer)
url = get_composer_url(composer)
data = get_all_composer_pieces(url)
for piece_url in data:
if len(pieces) % 100 == 0:
logger.info("Processed %s pieces", len(pieces))
try:
piece = create_piece(url=piece_url)
except ValueError as e:
logger.error("Error processing %s: %s", piece_url, e)
continue
pieces.append(piece)
try:
url = get_composer_url(composer)
data = get_all_composer_pieces(url)

for piece_url in data:
if len(pieces) % 100 == 0:
logger.info("Processed %s pieces", len(pieces))

max_retries = 3
retry_delay = 15

for attempt in range(max_retries):
try:
piece = create_piece(url=piece_url)
if piece:
pieces.append(piece)
break
except ValueError as e:
logger.error("Error processing %s: %s", piece_url, e)
if attempt < max_retries - 1: # Don't sleep on the last attempt
logger.info(f"Retrying in {retry_delay} seconds...")
time.sleep(retry_delay)
continue
except Exception as e:
logger.error(f"Unexpected error processing {piece_url}: {str(e)}")
if attempt < max_retries - 1:
logger.info(f"Retrying in {retry_delay} seconds...")
time.sleep(retry_delay)
continue

except Exception as e:
logger.error(f"Error processing composer {composer}: {str(e)}")
continue # Move to next composer if there's an error

if not pieces:
logger.error("No pieces were collected. Exiting without saving.")
exit(1)

pieces_dict = []
for piece in pieces:
piece_dict = vars(piece).copy()
piece_dict["movements"] = json.dumps([vars(m) for m in piece.movements])
pieces_dict.append(piece_dict)
if piece is not None: # Add null check
piece_dict = vars(piece).copy()
piece_dict["movements"] = json.dumps([vars(m) for m in piece.movements])
pieces_dict.append(piece_dict)

df = pl.DataFrame(pieces_dict, strict=False, infer_schema_length=1000)
if pieces_dict:
df = pl.DataFrame(pieces_dict, strict=False, infer_schema_length=1000)
current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"full_df_{current_datetime}.parquet"
df.write_parquet(output_file)
logger.info("Data saved to %s", output_file)
os.system("pmset sleepnow")
else:
logger.error("No valid pieces to save")

current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"full_df_{current_datetime}.parquet"
df.write_parquet(output_file)
logger.info("Data saved to %s", output_file)
os.system("pmset sleepnow")
32 changes: 29 additions & 3 deletions data_ingest/utils/pieces.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, TypedDict

Expand All @@ -6,7 +7,25 @@
from helpers import (convert_empty_vals_to_none, parse_key_signature,
standardize_dict_keys)
from movements import Movement, parse_movements
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

logger = logging.getLogger(__name__)


def create_session_with_retries():
    """Return a requests ``Session`` with automatic retry on transient errors.

    The transport adapter retries a request up to 5 times when the server
    responds with 500/502/503/504, but only for idempotent methods
    (GET/HEAD/OPTIONS). Delays between attempts grow exponentially per
    urllib3's backoff formula with ``backoff_factor=1`` (roughly doubling
    each retry; the exact first-retry delay depends on the urllib3 version).
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"],
    )
    http_adapter = HTTPAdapter(max_retries=retry_policy)
    session = Session()
    # Attach the same retrying adapter to both schemes.
    for scheme_prefix in ("http://", "https://"):
        session.mount(scheme_prefix, http_adapter)
    return session

@dataclass
class Piece:
Expand Down Expand Up @@ -97,10 +116,17 @@ def create_piece(
) -> Optional[Piece]:
if not data and not url:
raise ValueError("No data or url argument found")

if url:
response = requests.get(url, timeout=10)
data = BeautifulSoup(response.text, "html.parser")
try:
session = create_session_with_retries()
response = session.get(url, timeout=10)
response.raise_for_status()
data = BeautifulSoup(response.text, "html.parser")
except (requests.exceptions.SSLError, requests.exceptions.RequestException) as e:
logging.error(f"Request failed for {url}: {str(e)}")
return None


if not data:
raise ValueError("Beautiful soup object could not be initialized")
Expand Down

0 comments on commit 166ff43

Please sign in to comment.