From 2d999e934577f126a622c6ffb722b84ca895c6ea Mon Sep 17 00:00:00 2001 From: Maximilian Zirps Date: Mon, 23 Dec 2024 12:27:44 +0100 Subject: [PATCH] make it async --- backend/src/api.py | 104 --------------------------- backend/src/api/SingletonAiohttp.py | 24 +++++++ backend/src/api/genius.py | 106 ++++++++++++++++++++++++++++ backend/src/generate_data.py | 105 ++++++++++++--------------- backend/src/main.py | 14 ++-- backend/src/test.py | 48 ++++++++----- 6 files changed, 216 insertions(+), 185 deletions(-) delete mode 100644 backend/src/api.py create mode 100644 backend/src/api/SingletonAiohttp.py create mode 100644 backend/src/api/genius.py diff --git a/backend/src/api.py b/backend/src/api.py deleted file mode 100644 index bde8313..0000000 --- a/backend/src/api.py +++ /dev/null @@ -1,104 +0,0 @@ - -from models import Song -from bs4 import BeautifulSoup -import urllib.parse -import requests -import logging -logger = logging.getLogger('uvicorn.error') - -class API_Client(): - - def __init__(self, token: str): - self.API_ENDPOINT = "https://api.genius.com" - self.TOKEN = token - if not self.TOKEN: - raise Exception("No token specified. Change .env/.env.local") - - - def search(self,query: str) -> list[Song]: - url = f"{self.API_ENDPOINT}/search?{urllib.parse.urlencode({'q': query})}" - headers = { - "Authorization": f"Bearer {self.TOKEN}", - "Accept": "application/json" - } - response = requests.get(url, headers=headers) - - if response.status_code == 200: - try: - data = response.json() - songs_data = data.get("response", {}).get("hits", []) - - songs = [ - Song( - id=song["result"]["id"], - title=song["result"]["title"], - artist=song["result"]["primary_artist"]["name"] - ) - for song in songs_data - ] - - return songs - except ValueError: - raise ValueError("Response is not valid JSON:", response.text) - elif response.status_code == 404: - try: - data = response.json() - if "meta" in data: - print(f"Error {data['meta']['status']}: {data['meta']['message']}") - else: - raise Exception("Error 404: Resource not found") - except ValueError: - raise ValueError("Error 404: Resource not found (Non-JSON Response)", response.text) - else: - logging.error(f"Request failed with status code {response.status_code}") - try: - raise Exception("Response:", response.json()) - except ValueError: - raise ValueError("Response (Non-JSON):", response.text) - - - - def get_lyrics(self, song_id: int) -> str: - url = f"{self.API_ENDPOINT}/songs/{song_id}" - headers = { - "Authorization": f"Bearer {self.TOKEN}", - "Accept": "application/json" - } - response = requests.get(url, headers=headers) - - if response.status_code == 200: - try: - data = response.json() - song_url = data.get("response", {}).get("song", {}).get("url", "") - return self.scrape_lyrics(song_url) - except ValueError: - raise ValueError("Response is not valid JSON:", response.text) - elif response.status_code == 404: - try: - data = response.json() - if "meta" in data: - print(f"Error {data['meta']['status']}: {data['meta']['message']}") - else: - raise Exception("Error 404: Resource not found") - except ValueError: - raise ValueError("Error 404: Resource not found (Non-JSON Response)", response.text) - else: - logging.error(f"Request failed with status code {response.status_code}") - try: - raise Exception("Response:", response.json()) - except ValueError: - raise ValueError("Response (Non-JSON):", response.text) - - def scrape_lyrics(self, song_url) -> str: - song_page = requests.get(song_url) - - soup = BeautifulSoup(song_page.text, "html.parser") - - # Extract lyrics - lyrics_div = soup.find("div", class_="lyrics") # Older Genius pages - if not lyrics_div: - # Newer Genius pages use `data-lyrics-container` - lyrics_div = soup.find_all("div", attrs={"data-lyrics-container": "true"}) - - lyrics = "\n".join([line.get_text() for line in lyrics_div]) - return lyrics \ No newline at end of file diff --git a/backend/src/api/SingletonAiohttp.py b/backend/src/api/SingletonAiohttp.py new file mode 100644 index 0000000..20c6663 --- /dev/null +++ b/backend/src/api/SingletonAiohttp.py @@ -0,0 +1,24 @@ +# https://github.com/raphaelauv/fastAPI-aiohttp-example/blob/master/src/fastAPI_aiohttp/fastAPI.py + +from typing import Optional +import aiohttp + + +class SingletonAiohttp: + aiohttp_client: Optional[aiohttp.ClientSession] = None + + @classmethod + def get_aiohttp_client(cls, base_url:str=None,token:str=None) -> aiohttp.ClientSession: + if cls.aiohttp_client is None: + headers = {} + if token: + headers["Authorization"] = f"Bearer {token}" + cls.aiohttp_client = aiohttp.ClientSession(headers=headers, base_url=base_url, connector=aiohttp.TCPConnector(limit_per_host=5)) + + return cls.aiohttp_client + + @classmethod + async def close_aiohttp_client(cls) -> None: + if cls.aiohttp_client: + await cls.aiohttp_client.close() + cls.aiohttp_client = None \ No newline at end of file diff --git a/backend/src/api/genius.py b/backend/src/api/genius.py new file mode 100644 index 0000000..b3a9889 --- /dev/null +++ b/backend/src/api/genius.py @@ -0,0 +1,106 @@ +from fuzzywuzzy import fuzz +from api.SingletonAiohttp import SingletonAiohttp +from models import Song +from bs4 import BeautifulSoup +import urllib.parse +import logging +logger = logging.getLogger('uvicorn.error') + +class API_Client(): + + def __init__(self, token: str): + self.api_endpoint = "https://api.genius.com" + if not token: + raise Exception("No token specified. Change .env/.env.local") + self.session = SingletonAiohttp.get_aiohttp_client(token=token) + + async def close(self): + await SingletonAiohttp.close_aiohttp_client() + + async def search(self, query: str) -> list[Song]: + async with self.session.get(f"{self.api_endpoint}/search?{urllib.parse.urlencode({'q': query})}") as response: + if response.status == 200: + try: + data = await response.json() + songs_data = data.get("response", {}).get("hits", []) + + songs = [ + Song( + id=song["result"]["id"], + title=song["result"]["title"], + artist=song["result"]["primary_artist"]["name"] + ) + for song in songs_data + ] + + return songs + except ValueError: + raise ValueError("Response is not valid JSON:", await response.text()) + elif response.status == 404: + try: + data = await response.json() + if "meta" in data: + print(f"Error {data['meta']['status']}: {data['meta']['message']}") + else: + raise Exception("Error 404: Resource not found") + except ValueError: + raise ValueError("Error 404: Resource not found (Non-JSON Response)", await response.text()) + else: + logging.error(f"Request failed with status code {response.status}") + try: + raise Exception("Response:", await response.text()) + except ValueError: + raise ValueError("Response (Non-JSON):", await response.text()) + + + + + async def get_lyrics(self, song_id: int) -> str: + async with self.session.get(f"{self.api_endpoint}/songs/{song_id}") as response: + if response.status == 200: + try: + data = await response.json() + song_url = data.get("response", {}).get("song", {}).get("url", "") + return await self.scrape_lyrics(song_url) + except ValueError: + raise ValueError("Response is not valid JSON:", await response.text()) + elif response.status == 404: + try: + data = await response.text() + if "meta" in data: + print(f"Error {data['meta']['status']}: {data['meta']['message']}") + else: + raise Exception("Error 404: Resource not found") + except ValueError: + raise ValueError("Error 404: Resource not found (Non-JSON Response)", await response.text()) + else: + logging.error(f"Request failed with status code {response.status}") + try: + raise Exception("Response:", await response.text()) + except ValueError: + raise ValueError("Response (Non-JSON):", await response.text()) + + async def scrape_lyrics(self, song_url) -> str: + async with self.session.get(song_url) as response: + + soup = BeautifulSoup(await response.text(), "html.parser") + + # Extract lyrics + lyrics_div = soup.find("div", class_="lyrics") # Older Genius pages + if not lyrics_div: + # Newer Genius pages use `data-lyrics-container` + lyrics_div = soup.find_all("div", attrs={"data-lyrics-container": "true"}) + + lyrics = "\n".join([line.get_text() for line in lyrics_div]) + return lyrics + + + def get_best_match(self, search_results, query: str): + best_match = None + highest_score = 0 + for result in search_results: + score = fuzz.ratio(result.artist.lower() + " " + result.title.lower(), query.lower()) + if score > highest_score: + highest_score = score + best_match = result + return best_match \ No newline at end of file diff --git a/backend/src/generate_data.py b/backend/src/generate_data.py index 2b2c5f5..8060f37 100644 --- a/backend/src/generate_data.py +++ b/backend/src/generate_data.py @@ -1,70 +1,59 @@ +import asyncio import os - -import api +from api.genius import API_Client from env import load_env -from model import AttachmentStyleProbabilities, TextGenerationModel -import csv import json -from fuzzywuzzy import fuzz -from tqdm import tqdm +from tqdm.asyncio import tqdm_asyncio load_env() +async def add_id_to_song(api_client: API_Client, song: dict) -> dict: + query = f"{song['artist']} - {song['title']}" + search_results = await api_client.search(query) + found_song = api_client.get_best_match(search_results, query) + if not found_song: + print(f"Song not found: {song}") + return song + song_dict = { + "query": query, + "id": found_song.id, + "artist": found_song.artist, + "title": found_song.title + } + if "attachment_style" in song: + song_dict["attachment_style"] = song["attachment_style"].lower() + return song_dict -def load_existing_data(path_to_data: str) -> list: - """ - Load existing data from a CSV file. The CSV file should have the following columns: - artist, title,attachment_style - """ - data = [] - with open(path_to_data, mode='r', newline='') as file: - reader = csv.DictReader(file) - for row in reader: - data.append({"artist":row['artist'], "title":row['title'],"attachment_style": row['attachment_style']}) - return data -def generate_data(song_query: str) -> list[AttachmentStyleProbabilities]: +async def query_song(api_client:API_Client, song_query: str) -> dict: + search_results = await api_client.search(song_query) + found_song = api_client.get_best_match(search_results, song_query) + if not found_song: + print(f"Song not found: {song_query}") + song_dict = { + "id": found_song.id, + "artist": found_song.artist, + "title": found_song.title, + } + return song_dict + - return [] - -if __name__ == "__main__": - global api_client - token = os.getenv("TOKEN") - api_client = api.API_Client(token) - data = load_existing_data("./data/train.csv") - songs_with_ids = [] - for song in tqdm(data, desc="Processing songs"): - search_results = api_client.search(f"{song['artist']} - {song['title']}") - if search_results: - found_song = None - best_match = None - highest_score = 0 - for result in search_results: - artist_score = fuzz.ratio(result.artist.lower(), song['artist'].lower()) - title_score = fuzz.ratio(result.title.lower(), song['title'].lower()) - score = artist_score + title_score - if score > highest_score: - highest_score = score - best_match = result - if best_match: - found_song = best_match - if not found_song: - print(song) - continue - song_dict = { - "id": found_song.id, - "artist": found_song.artist, - "title": found_song.title, - "attachment_style": song['attachment_style'].lower(), - } - songs_with_ids.append(song_dict) - +async def main(): + token = os.getenv("TOKEN") + genius = API_Client(token) + with open("./data/songs.json", "r") as json_file: + songs = json.load(json_file) + + songs_with_ids = await tqdm_asyncio.gather(*[add_id_to_song(genius, song) for song in songs]) + with open("./data/songs_with_ids.json", "w") as json_file: json.dump(songs_with_ids, json_file, indent=4) - - - #model = TextGenerationModel() - song_queries = [] - #output = [generate_data(song_query) for song_query in song_queries] - #output += load_existing_data("../data/train.csv") \ No newline at end of file + + await genius.close() + + + +if __name__ == "__main__": + asyncio.run(main()) + \ No newline at end of file diff --git a/backend/src/main.py b/backend/src/main.py index bd9f728..0c903f4 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -1,7 +1,9 @@ from collections import Counter from typing import List, Optional from urllib.request import Request -import api + +from fastapi.concurrency import asynccontextmanager +import api.genius as genius from src.env import load_env from model import TextGenerationModel from fastapi import FastAPI, Query, HTTPException @@ -42,15 +44,19 @@ async def universal_exception_handler(request: Request, exc: Exception): content={"message": "An unexpected error occurred"} ) -@app.on_event("startup") -async def init_model(): +@asynccontextmanager +async def lifespan(app: FastAPI): global model global api_client model_id = os.getenv("MODEL_ID") logger.info(f"Using model {model_id}") model = TextGenerationModel(model_id) token = os.getenv("TOKEN") - api_client = api.API_Client(token) + api_client = genius.API_Client(token) + + yield + + await api_client.close() class LyricsRequest(BaseModel): lyrics: str diff --git a/backend/src/test.py b/backend/src/test.py index d1f613a..89fdc8e 100644 --- a/backend/src/test.py +++ b/backend/src/test.py @@ -1,43 +1,53 @@ +import asyncio import os - -import api +from tqdm.asyncio import tqdm_asyncio +from api.genius import API_Client from env import load_env -from model import AttachmentStyleProbabilities, TextGenerationModel -import csv +from model import TextGenerationModel import json -from fuzzywuzzy import fuzz from tqdm import tqdm import numpy as np load_env() -if __name__ == "__main__": - global api_client + + + +async def main(): token = os.getenv("TOKEN") - api_client = api.API_Client(token) - - with open("./data/test_output.json", "r") as json_file: - predictions = json.load(json_file) - - if predictions: - correct = np.sum([song["predicted"] == song["attachment_style"] for song in predictions]) - print(f"Accuracy: {correct/len(predictions)}") + genius = API_Client(token) + if os.path.exists("./data/test_output.json"): + print("Loading existing data") + with open("./data/test_output.json", "r") as json_file: + predictions = json.load(json_file) + correct = np.sum([song["predicted"] == song["attachment_style"] for song in predictions]) + print(f"Accuracy: {correct/len(predictions)}") else: - + print("Fetching new data") with open("./data/train.json", "r") as json_file: songs = json.load(json_file) model = TextGenerationModel() - for song in tqdm(songs, desc="Processing songs"): - lyrics = api_client.get_lyrics(song["id"]) + async def get_lyrics_and_classify(genius, song: dict) -> dict: + lyrics = await genius.get_lyrics(song["id"]) attachment_style = model.classify_attachment_style(lyrics) if attachment_style: song["predicted"] = max(attachment_style, key=attachment_style.get) else: song["predicted"] = "unknown" + return song + + songs = await tqdm_asyncio.gather(*[get_lyrics_and_classify(genius, song) for song in songs]) with open("./data/test_output.json", "w") as json_file: - json.dump(songs, json_file, indent=4) \ No newline at end of file + json.dump(songs, json_file, indent=4) + + await genius.close() + + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file