make it async

maxzirps · Dec 23, 2024 · 2d999e9 · 2d999e9
1 parent 484c348
commit 2d999e9
Show file tree

Hide file tree

Showing 6 changed files with 216 additions and 185 deletions.
diff --git a/backend/src/api.py b/backend/src/api.py
diff --git a/backend/src/api/SingletonAiohttp.py b/backend/src/api/SingletonAiohttp.py
@@ -0,0 +1,24 @@
+# https://github.com/raphaelauv/fastAPI-aiohttp-example/blob/master/src/fastAPI_aiohttp/fastAPI.py
+
+from typing import  Optional
+import aiohttp
+
+
+class SingletonAiohttp:
+    """Process-wide holder for one shared ``aiohttp.ClientSession``.
+
+    The session is created lazily by ``get_aiohttp_client`` and stored on the
+    class, so every caller receives the same object until
+    ``close_aiohttp_client`` resets it.
+    """
+
+    # The shared session; ``None`` until first requested and again after closing.
+    aiohttp_client: Optional[aiohttp.ClientSession] = None
+
+    @classmethod
+    def get_aiohttp_client(cls, base_url: Optional[str] = None, token: Optional[str] = None) -> aiohttp.ClientSession:
+        """Return the shared session, creating it when none is usable.
+
+        Args:
+            base_url: Optional base URL handed to ``aiohttp.ClientSession``.
+            token: Optional bearer token; when given it is sent as an
+                ``Authorization`` header by the session.
+
+        Both arguments are only used when a new session is created; a call
+        that finds a live session gets it back unchanged.
+
+        Returns:
+            The shared, open ``aiohttp.ClientSession``.
+        """
+        # A session that was closed elsewhere (e.g. via ``async with session``)
+        # can no longer send requests, so treat it like a missing one.
+        if cls.aiohttp_client is None or cls.aiohttp_client.closed:
+            headers = {}
+            if token:
+                headers["Authorization"] = f"Bearer {token}"
+            cls.aiohttp_client = aiohttp.ClientSession(
+                headers=headers,
+                base_url=base_url,
+                # At most five simultaneous connections to the same host.
+                connector=aiohttp.TCPConnector(limit_per_host=5),
+            )
+
+        return cls.aiohttp_client
+
+    @classmethod
+    async def close_aiohttp_client(cls) -> None:
+        """Close the shared session, if there is one, and forget it."""
+        if cls.aiohttp_client is not None:
+            await cls.aiohttp_client.close()
+            cls.aiohttp_client = None
diff --git a/backend/src/api/genius.py b/backend/src/api/genius.py
@@ -0,0 +1,106 @@
+from fuzzywuzzy import fuzz
+from api.SingletonAiohttp import SingletonAiohttp
+from models import Song
+from bs4 import BeautifulSoup
+import urllib.parse
+import logging
+logger = logging.getLogger('uvicorn.error')
+
+class API_Client():
+    """Asynchronous client for the Genius API plus a lyrics page scraper.
+
+    All requests go through the session returned by
+    ``SingletonAiohttp.get_aiohttp_client``, which is requested with the
+    Genius token as bearer authorization.
+    """
+
+    def __init__(self, token: str):
+        """Store the API endpoint and obtain the shared HTTP session.
+
+        Args:
+            token: Genius API access token.
+
+        Raises:
+            Exception: If ``token`` is empty or ``None``.
+        """
+        self.api_endpoint = "https://api.genius.com"
+        if not token:
+            raise Exception("No token specified. Change .env/.env.local")
+        self.session = SingletonAiohttp.get_aiohttp_client(token=token)
+
+    async def close(self):
+        """Close the shared HTTP session."""
+        await SingletonAiohttp.close_aiohttp_client()
+
+    async def _read_json(self, response):
+        """Decode the body of a successful response as JSON.
+
+        Raises:
+            ValueError: If the body cannot be decoded; the raw text is
+                attached as the second exception argument.
+        """
+        try:
+            return await response.json()
+        except ValueError as err:
+            raise ValueError("Response is not valid JSON:", await response.text()) from err
+
+    async def _handle_error_response(self, response) -> None:
+        """Log or raise for a response whose status is not 200.
+
+        Returns normally only for a 404 whose JSON body carries a ``meta``
+        entry (the message is logged); the caller then returns its empty
+        result. Every other case raises.
+
+        Raises:
+            ValueError: On a 404 whose body is not JSON.
+            Exception: On a 404 without ``meta`` and on any other status.
+        """
+        if response.status == 404:
+            try:
+                data = await response.json()
+            except ValueError as err:
+                raise ValueError(
+                    "Error 404: Resource not found (Non-JSON Response)", await response.text()
+                ) from err
+            if "meta" in data:
+                logger.error(f"Error {data['meta']['status']}: {data['meta']['message']}")
+                return
+            raise Exception("Error 404: Resource not found")
+
+        logger.error(f"Request failed with status code {response.status}")
+        raise Exception("Response:", await response.text())
+
+    async def search(self, query: str) -> list[Song]:
+        """Search Genius for ``query``.
+
+        Args:
+            query: Free-text search string; it is URL-encoded as the ``q``
+                parameter.
+
+        Returns:
+            One ``Song`` per hit in the response, or an empty list when the
+            API answers 404 with a ``meta`` payload.
+
+        Raises:
+            ValueError: If a 200 or 404 body is not valid JSON.
+            Exception: On a 404 without ``meta`` or any other non-200 status.
+        """
+        url = f"{self.api_endpoint}/search?{urllib.parse.urlencode({'q': query})}"
+        async with self.session.get(url) as response:
+            if response.status != 200:
+                await self._handle_error_response(response)
+                # Keep the declared return type so callers can iterate safely.
+                return []
+            data = await self._read_json(response)
+
+        hits = data.get("response", {}).get("hits", [])
+        return [
+            Song(
+                id=hit["result"]["id"],
+                title=hit["result"]["title"],
+                artist=hit["result"]["primary_artist"]["name"],
+            )
+            for hit in hits
+        ]
+
+    async def get_lyrics(self, song_id: int) -> str:
+        """Fetch the song record for ``song_id`` and scrape its lyrics page.
+
+        Args:
+            song_id: Genius song id.
+
+        Returns:
+            The scraped lyrics, or an empty string when the API answers 404
+            with a ``meta`` payload.
+
+        Raises:
+            ValueError: If a 200 or 404 body is not valid JSON.
+            Exception: On a 404 without ``meta`` or any other non-200 status.
+        """
+        async with self.session.get(f"{self.api_endpoint}/songs/{song_id}") as response:
+            if response.status != 200:
+                await self._handle_error_response(response)
+                return ""
+            data = await self._read_json(response)
+
+        # Scraping happens outside the JSON handling so that its own errors
+        # are not reported as "not valid JSON".
+        song_url = data.get("response", {}).get("song", {}).get("url", "")
+        return await self.scrape_lyrics(song_url)
+
+    async def scrape_lyrics(self, song_url: str) -> str:
+        """Download ``song_url`` and extract the lyrics text from its HTML.
+
+        Args:
+            song_url: URL of the song page to scrape.
+
+        Returns:
+            The text of every lyrics container joined with newlines; an empty
+            string when the page has no lyrics container.
+        """
+        async with self.session.get(song_url) as response:
+            soup = BeautifulSoup(await response.text(), "html.parser")
+
+        # Older Genius pages: a single <div class="lyrics">.
+        legacy_div = soup.find("div", class_="lyrics")
+        if legacy_div is not None:
+            # ``find`` returns one tag; wrap it so it is handled as one
+            # container instead of being iterated child by child.
+            containers = [legacy_div]
+        else:
+            # Newer Genius pages use `data-lyrics-container`.
+            containers = soup.find_all("div", attrs={"data-lyrics-container": "true"})
+
+        return "\n".join(container.get_text() for container in containers)
+
+    def get_best_match(self, search_results, query: str):
+        """Pick the search result that is most similar to ``query``.
+
+        Each result is scored with ``fuzz.ratio`` between its lower-cased
+        ``"<artist> <title>"`` and the lower-cased query.
+
+        Args:
+            search_results: Iterable of objects with ``artist`` and ``title``.
+            query: The search string the results were obtained for.
+
+        Returns:
+            The first result with the highest score, or ``None`` when there
+            are no results or none scores above zero.
+        """
+        wanted = query.lower()
+        best_match = None
+        highest_score = 0
+        for result in search_results:
+            score = fuzz.ratio(f"{result.artist.lower()} {result.title.lower()}", wanted)
+            if score > highest_score:
+                highest_score = score
+                best_match = result
+        return best_match
diff --git a/backend/src/generate_data.py b/backend/src/generate_data.py
@@ -1,70 +1,59 @@
+import asyncio
 import os
-
-import api
+from api.genius import API_Client
 from env import load_env
-from model import AttachmentStyleProbabilities, TextGenerationModel
-import csv
 import json
-from fuzzywuzzy import fuzz
-from tqdm import tqdm
+from tqdm.asyncio import tqdm_asyncio
 
+# NOTE(review): presumably populates the environment that main() reads TOKEN from — confirm in env.load_env.
 load_env()
 
+async def add_id_to_song(api_client: API_Client, song: dict) -> dict:
+    """Look up ``song`` on Genius and return a record carrying its id.
+
+    Args:
+        api_client: Client used for the search and for picking the best hit.
+        song: Mapping with ``artist`` and ``title`` keys and, optionally,
+            ``attachment_style``.
+
+    Returns:
+        A new dict with ``query``, ``id``, ``artist`` and ``title`` taken from
+        the best match (plus the lower-cased ``attachment_style`` when the
+        input has one), or ``song`` itself when nothing matched.
+    """
+    query = f"{song['artist']} - {song['title']}"
+    candidates = await api_client.search(query)
+    match = api_client.get_best_match(candidates, query)
+    if match:
+        enriched = {
+            "query": query,
+            "id": match.id,
+            "artist": match.artist,
+            "title": match.title
+        }
+        if "attachment_style" in song:
+            enriched["attachment_style"] = song["attachment_style"].lower()
+        return enriched
+
+    # No usable hit: report it and hand the input back untouched.
+    print(f"Song not found: {song}")
+    return song
 
-def load_existing_data(path_to_data: str) -> list:
-    """
-    Load existing data from a CSV file. The CSV file should have the following columns:
-    artist, title,attachment_style
-    """
-    data = []
-    with open(path_to_data, mode='r', newline='') as file:
-        reader = csv.DictReader(file)
-        for row in reader:
-            data.append({"artist":row['artist'], "title":row['title'],"attachment_style": row['attachment_style']})
-    return data
 
-def generate_data(song_query: str) -> list[AttachmentStyleProbabilities]:
+async def query_song(api_client:API_Client, song_query: str) -> dict:
+    """Search Genius for ``song_query`` and describe the best match.
+
+    Args:
+        api_client: Client used for the search and for picking the best hit.
+        song_query: Free-text query, e.g. ``"artist - title"``.
+
+    Returns:
+        A dict with the ``id``, ``artist`` and ``title`` of the best match,
+        or an empty dict when nothing matched.
+    """
+    search_results = await api_client.search(song_query)
+    found_song = api_client.get_best_match(search_results, song_query)
+    if not found_song:
+        print(f"Song not found: {song_query}")
+        # Stop here: there is no match to read id/artist/title from.
+        return {}
+    song_dict = {
+        "id": found_song.id,
+        "artist": found_song.artist,
+        "title": found_song.title,
+    }
+    return song_dict
+
 
-    return []
-
-if __name__ == "__main__":
-    global api_client
-    token = os.getenv("TOKEN")
-    api_client = api.API_Client(token)
-    data = load_existing_data("./data/train.csv")
-    songs_with_ids = []
 
-    for song in tqdm(data, desc="Processing songs"):
-        search_results = api_client.search(f"{song['artist']} -  {song['title']}")
-        if search_results:
-            found_song = None
-            best_match = None
-            highest_score = 0
-            for result in search_results:
-                artist_score = fuzz.ratio(result.artist.lower(), song['artist'].lower())
-                title_score = fuzz.ratio(result.title.lower(), song['title'].lower())
-                score = artist_score + title_score
-                if score > highest_score:
-                    highest_score = score
-                    best_match = result
-            if best_match:
-                found_song = best_match
-            if not found_song:
-                print(song)
-                continue
-            song_dict = {
-                "id": found_song.id,
-                "artist": found_song.artist,
-                "title": found_song.title,
-                "attachment_style": song['attachment_style'].lower(),
-            }
-            songs_with_ids.append(song_dict)
-
+async def main():
+    """Resolve Genius ids for every song in ``./data/songs.json``.
+
+    Reads the song list, looks all songs up concurrently with a progress bar,
+    and writes the results to ``./data/songs_with_ids.json``. The Genius
+    client is closed afterwards, also when one of the steps fails.
+    """
+    token = os.getenv("TOKEN")
+    genius = API_Client(token)
+    try:
+        with open("./data/songs.json", "r") as json_file:
+            songs = json.load(json_file)
+
+        songs_with_ids = await tqdm_asyncio.gather(*[add_id_to_song(genius, song) for song in songs])
+
+        with open("./data/songs_with_ids.json", "w") as json_file:
+            json.dump(songs_with_ids, json_file, indent=4)
+    finally:
+        # Always release the HTTP session, even if a lookup or file step raised.
+        await genius.close()
+
+
+
+# Run the async entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())
+
diff --git a/backend/src/main.py b/backend/src/main.py
@@ -1,7 +1,9 @@
 from collections import Counter
 from typing import List, Optional
 from urllib.request import Request
-import api
+
+from fastapi.concurrency import asynccontextmanager
+import api.genius as genius
 from src.env import load_env
 from model import TextGenerationModel
 from fastapi import FastAPI, Query, HTTPException
@@ -42,15 +44,19 @@ async def universal_exception_handler(request: Request, exc: Exception):
         content={"message": "An unexpected error occurred"}
     )
 
-@app.on_event("startup")
-async def init_model():
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Create the model and the Genius client before ``yield``; close the client after.
+
+    Both objects are stored in the module globals ``model`` and
+    ``api_client``. ``MODEL_ID`` and ``TOKEN`` are read from the environment.
+    """
+    # NOTE(review): `app` already exists above this point, so confirm this
+    # context manager is actually registered (e.g. FastAPI(lifespan=lifespan));
+    # the registration is not visible in this hunk.
     global model
     global api_client
     model_id = os.getenv("MODEL_ID")
     logger.info(f"Using model {model_id}")
     model = TextGenerationModel(model_id)
     token = os.getenv("TOKEN")
-    api_client = api.API_Client(token)
+    api_client = genius.API_Client(token)
+
+    # Code above the yield is the setup part; code below runs once the
+    # context manager is resumed.
+    yield
+
+    await api_client.close()
 
 class LyricsRequest(BaseModel):
     lyrics: str