diff --git a/data_ingest/utils/fetch_imslp.py b/data_ingest/utils/fetch_imslp.py
index cdb06cf..3595e81 100644
--- a/data_ingest/utils/fetch_imslp.py
+++ b/data_ingest/utils/fetch_imslp.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+import time
 
 import polars as pl
 from imslp_scraping import get_all_composer_pieces, get_composer_url
@@ -26,8 +27,6 @@
 )
 logger = logging.getLogger(__name__)
 
-# short_composers = ["Scriabin, Aleksandr"]
-
 composers = [
     "Bach, Johann Sebastian",
     "Mozart, Wolfgang Amadeus",
@@ -108,30 +107,63 @@
     "Chopin, Frédéric",
     "Scriabin, Aleksandr",
 ]
+
 pieces = []
 for composer in composers:
     logger.info("Starting import for %s", composer)
-    url = get_composer_url(composer)
-    data = get_all_composer_pieces(url)
-    for piece_url in data:
-        if len(pieces) % 100 == 0:
-            logger.info("Processed %s pieces", len(pieces))
-        try:
-            piece = create_piece(url=piece_url)
-        except ValueError as e:
-            logger.error("Error processing %s: %s", piece_url, e)
-            continue
-        pieces.append(piece)
+    try:
+        url = get_composer_url(composer)
+        data = get_all_composer_pieces(url)
+
+        for piece_url in data:
+            if len(pieces) % 100 == 0:
+                logger.info("Processed %s pieces", len(pieces))
+
+            max_retries = 3
+            retry_delay = 15
+
+            for attempt in range(max_retries):
+                try:
+                    piece = create_piece(url=piece_url)
+                    if piece:
+                        pieces.append(piece)
+                    break
+                except ValueError as e:
+                    logger.error("Error processing %s: %s", piece_url, e)
+                    if attempt < max_retries - 1:  # Don't sleep on the last attempt
+                        logger.info(f"Retrying in {retry_delay} seconds...")
+                        time.sleep(retry_delay)
+                    continue
+                except Exception as e:
+                    logger.error(f"Unexpected error processing {piece_url}: {str(e)}")
+                    if attempt < max_retries - 1:
+                        logger.info(f"Retrying in {retry_delay} seconds...")
+                        time.sleep(retry_delay)
+                    continue
+
+    except Exception as e:
+        logger.error(f"Error processing composer {composer}: {str(e)}")
+        continue  # Move to next composer if there's an error
+
+if not pieces:
+    logger.error("No pieces were collected. Exiting without saving.")
+    exit(1)
 
 pieces_dict = []
 for piece in pieces:
-    piece_dict = vars(piece).copy()
-    piece_dict["movements"] = json.dumps([vars(m) for m in piece.movements])
-    pieces_dict.append(piece_dict)
+    if piece is not None:  # Add null check
+        piece_dict = vars(piece).copy()
+        piece_dict["movements"] = json.dumps([vars(m) for m in piece.movements])
+        pieces_dict.append(piece_dict)
 
-df = pl.DataFrame(pieces_dict, strict=False, infer_schema_length=1000)
+if pieces_dict:
+    df = pl.DataFrame(pieces_dict, strict=False, infer_schema_length=1000)
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = f"full_df_{current_datetime}.parquet"
+    df.write_parquet(output_file)
+    logger.info("Data saved to %s", output_file)
+    os.system("pmset sleepnow")
+else:
+    logger.error("No valid pieces to save")
 
-current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-output_file = f"full_df_{current_datetime}.parquet"
-df.write_parquet(output_file)
-logger.info("Data saved to %s", output_file)
+os.system("pmset sleepnow")
diff --git a/data_ingest/utils/pieces.py b/data_ingest/utils/pieces.py
index 9e4a803..d8ba4b0 100644
--- a/data_ingest/utils/pieces.py
+++ b/data_ingest/utils/pieces.py
@@ -1,3 +1,4 @@
+import logging
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Tuple, TypedDict
 
@@ -6,7 +7,25 @@
 from helpers import (convert_empty_vals_to_none, parse_key_signature,
                      standardize_dict_keys)
 from movements import Movement, parse_movements
+from requests import Session
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
+logger = logging.getLogger(__name__)
+
+
+def create_session_with_retries():
+    session = Session()
+    retries = Retry(
+        total=5,  # total number of retries
+        backoff_factor=1,  # wait 1, 2, 4, 8, 16 seconds between retries
+        status_forcelist=[500, 502, 503, 504],  # retry on these HTTP status codes
+        allowed_methods=["GET", "HEAD", "OPTIONS"]  # only retry on these methods
+    )
+    adapter = HTTPAdapter(max_retries=retries)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
 
 @dataclass
 class Piece:
@@ -97,10 +116,17 @@ def create_piece(
 ) -> Optional[Piece]:
     if not data and not url:
         raise ValueError("No data or url argument found")
-
+    
     if url:
-        response = requests.get(url, timeout=10)
-        data = BeautifulSoup(response.text, "html.parser")
+        try:
+            session = create_session_with_retries()
+            response = session.get(url, timeout=10)
+            response.raise_for_status()
+            data = BeautifulSoup(response.text, "html.parser")
+        except (requests.exceptions.SSLError, requests.exceptions.RequestException) as e:
+            logging.error(f"Request failed for {url}: {str(e)}")
+            return None
+
 
     if not data:
         raise ValueError("Beautiful soup object could not be initialized")