-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
216 additions
and
185 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# https://github.com/raphaelauv/fastAPI-aiohttp-example/blob/master/src/fastAPI_aiohttp/fastAPI.py | ||
|
||
from typing import Optional | ||
import aiohttp | ||
|
||
|
||
class SingletonAiohttp:
    """Process-wide holder for one shared ``aiohttp.ClientSession``."""

    # The shared session; ``None`` until first requested and again after
    # ``close_aiohttp_client`` has run.
    aiohttp_client: Optional[aiohttp.ClientSession] = None

    @classmethod
    def get_aiohttp_client(
        cls,
        base_url: Optional[str] = None,
        token: Optional[str] = None,
    ) -> aiohttp.ClientSession:
        """Return the shared session, creating it when none is usable.

        ``base_url`` and ``token`` are only applied when a session is created;
        while an open session exists they are ignored and that session is
        returned unchanged.

        Args:
            base_url: Optional base URL handed to ``aiohttp.ClientSession``.
            token: Optional token; when given, the new session sends it as an
                ``Authorization: Bearer <token>`` default header.

        Returns:
            The shared, open ``aiohttp.ClientSession``.
        """
        # Also replace a session that was closed behind our back, otherwise
        # every later request would fail on the stale, closed session.
        if cls.aiohttp_client is None or cls.aiohttp_client.closed:
            headers = {}
            if token:
                headers["Authorization"] = f"Bearer {token}"
            cls.aiohttp_client = aiohttp.ClientSession(
                headers=headers,
                base_url=base_url,
                connector=aiohttp.TCPConnector(limit_per_host=5),
            )

        return cls.aiohttp_client

    @classmethod
    async def close_aiohttp_client(cls) -> None:
        """Close the shared session, if any, and forget it."""
        # Drop the reference before awaiting so no other task can be handed a
        # session that is in the middle of closing.
        session, cls.aiohttp_client = cls.aiohttp_client, None
        if session is not None and not session.closed:
            await session.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
from fuzzywuzzy import fuzz | ||
from api.SingletonAiohttp import SingletonAiohttp | ||
from models import Song | ||
from bs4 import BeautifulSoup | ||
import urllib.parse | ||
import logging | ||
logger = logging.getLogger('uvicorn.error') | ||
|
||
class API_Client:
    """Small asynchronous client for the Genius API.

    Requests go through the shared session obtained from ``SingletonAiohttp``.
    The client can search for songs, resolve a song id to its page URL and
    scrape the lyrics from that page.
    """

    def __init__(self, token: str):
        """Store the API endpoint and obtain the shared HTTP session.

        Args:
            token: Genius API access token, passed on to
                ``SingletonAiohttp.get_aiohttp_client``.

        Raises:
            ValueError: If ``token`` is empty or ``None``.
        """
        self.api_endpoint = "https://api.genius.com"
        if not token:
            raise ValueError("No token specified. Change .env/.env.local")
        self.session = SingletonAiohttp.get_aiohttp_client(token=token)

    async def close(self):
        """Close the shared HTTP session."""
        await SingletonAiohttp.close_aiohttp_client()

    @staticmethod
    async def _decode_json(response, error_message: str) -> dict:
        """Decode ``response`` as a JSON object or raise ``ValueError``."""
        try:
            # content_type=None disables aiohttp's Content-Type check, so a
            # non-JSON body surfaces as a ValueError (json.JSONDecodeError)
            # that is handled here instead of an aiohttp-specific error.
            payload = await response.json(content_type=None)
        except ValueError as err:
            raise ValueError(error_message, await response.text()) from err
        if not isinstance(payload, dict):
            raise ValueError(error_message, await response.text())
        return payload

    async def _get_json(self, url: str) -> dict:
        """GET ``url`` and return the decoded JSON object.

        A 404 answer that carries a JSON ``meta`` error object is logged and
        reported as an empty dict, so callers can treat it as "no data".

        Raises:
            ValueError: If a 200 or 404 body is not a JSON object.
            Exception: On a 404 without ``meta`` or any other non-200 status.
        """
        async with self.session.get(url) as response:
            if response.status == 200:
                return await self._decode_json(
                    response, "Response is not valid JSON:"
                )

            if response.status == 404:
                data = await self._decode_json(
                    response, "Error 404: Resource not found (Non-JSON Response)"
                )
                if "meta" not in data:
                    raise Exception("Error 404: Resource not found")
                meta = data["meta"] if isinstance(data["meta"], dict) else {}
                logger.error(
                    "Error %s: %s", meta.get("status"), meta.get("message")
                )
                return {}

            logger.error("Request failed with status code %s", response.status)
            raise Exception("Response:", await response.text())

    async def search(self, query: str) -> list[Song]:
        """Search Genius for ``query``.

        Args:
            query: Free-text search string.

        Returns:
            One ``Song`` per hit in the response; an empty list when there are
            no hits or the API answered 404 with an error object.

        Raises:
            ValueError: If the response body is not a JSON object.
            Exception: On any other failed request.
        """
        url = f"{self.api_endpoint}/search?{urllib.parse.urlencode({'q': query})}"
        data = await self._get_json(url)
        hits = data.get("response", {}).get("hits", [])
        return [
            Song(
                id=hit["result"]["id"],
                title=hit["result"]["title"],
                artist=hit["result"]["primary_artist"]["name"],
            )
            for hit in hits
        ]

    async def get_lyrics(self, song_id: int) -> str:
        """Return the lyrics of the song with the given Genius id.

        Args:
            song_id: Genius song id.

        Returns:
            The scraped lyrics, or an empty string when the API reports the
            song as not found or the response carries no page URL.

        Raises:
            ValueError: If the response body is not a JSON object.
            Exception: On any other failed request.
        """
        data = await self._get_json(f"{self.api_endpoint}/songs/{song_id}")
        song_url = data.get("response", {}).get("song", {}).get("url", "")
        if not song_url:
            # Requesting an empty URL would only fail with an obscure error.
            logger.error("No page URL available for song id %s", song_id)
            return ""
        return await self.scrape_lyrics(song_url)

    async def scrape_lyrics(self, song_url) -> str:
        """Download the song page at ``song_url`` and extract the lyrics text.

        Returns:
            The text of all lyrics containers joined by newlines; an empty
            string when the page has no recognisable lyrics container.
        """
        # NOTE(review): the shared session is created with a default
        # Authorization header, so the API token is presumably sent with this
        # page request as well - confirm that this is intended.
        async with self.session.get(song_url) as response:
            if response.status != 200:
                logger.warning(
                    "Lyrics page %s answered with status %s",
                    song_url,
                    response.status,
                )
            html = await response.text()

        soup = BeautifulSoup(html, "html.parser")

        # Older Genius pages keep the lyrics in a single <div class="lyrics">.
        legacy_div = soup.find("div", class_="lyrics")
        if legacy_div is not None:
            # Wrap the single tag: iterating a Tag yields its child nodes,
            # not a list of lyrics containers.
            containers = [legacy_div]
        else:
            # Newer Genius pages use `data-lyrics-container`.
            containers = soup.find_all(
                "div", attrs={"data-lyrics-container": "true"}
            )

        return "\n".join(container.get_text() for container in containers)

    def get_best_match(self, search_results, query: str):
        """Pick the search result whose "artist title" is closest to ``query``.

        Similarity is ``fuzz.ratio`` on the lower-cased strings.

        Returns:
            The first result with the highest score above zero, or ``None``
            when there are no results or none scores above zero.
        """
        if not search_results:
            return None

        wanted = query.lower()
        best_match = None
        highest_score = 0
        for result in search_results:
            candidate = result.artist.lower() + " " + result.title.lower()
            score = fuzz.ratio(candidate, wanted)
            if score > highest_score:
                highest_score = score
                best_match = result
        return best_match
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,70 +1,59 @@ | ||
import asyncio | ||
import os | ||
|
||
import api | ||
from api.genius import API_Client | ||
from env import load_env | ||
from model import AttachmentStyleProbabilities, TextGenerationModel | ||
import csv | ||
import json | ||
from fuzzywuzzy import fuzz | ||
from tqdm import tqdm | ||
from tqdm.asyncio import tqdm_asyncio | ||
|
||
load_env() | ||
|
||
async def add_id_to_song(api_client: API_Client, song: dict) -> dict:
    """Look a song up through ``api_client`` and return it with its id.

    Args:
        api_client: Client providing ``search`` and ``get_best_match``.
        song: Dict with ``artist`` and ``title`` keys and, optionally,
            ``attachment_style``.

    Returns:
        A new dict with ``query``, ``id``, ``artist`` and ``title`` taken from
        the best match (plus the lower-cased ``attachment_style`` when the
        input has one), or the unchanged input ``song`` when nothing matched.
    """
    query = f"{song['artist']} - {song['title']}"
    match = api_client.get_best_match(await api_client.search(query), query)

    if not match:
        print(f"Song not found: {song}")
        return song

    enriched = {
        "query": query,
        "id": match.id,
        "artist": match.artist,
        "title": match.title,
    }
    # Carry the label over only when the input actually has one.
    if "attachment_style" in song:
        enriched["attachment_style"] = song["attachment_style"].lower()
    return enriched
|
||
def load_existing_data(path_to_data: str) -> list:
    """Read labelled songs from a CSV file.

    The file needs a header row with at least the columns ``artist``,
    ``title`` and ``attachment_style``; any other columns are ignored.

    Args:
        path_to_data: Path of the CSV file.

    Returns:
        One dict per data row, in file order, holding exactly those three
        keys.
    """
    wanted_columns = ("artist", "title", "attachment_style")
    with open(path_to_data, "r", newline="") as handle:
        return [
            {column: row[column] for column in wanted_columns}
            for row in csv.DictReader(handle)
        ]
|
||
def generate_data(song_query: str) -> list[AttachmentStyleProbabilities]:
    """Placeholder for generating data for ``song_query``.

    Currently always returns an empty list.
    """
    return []


async def query_song(api_client: API_Client, song_query: str) -> dict:
    """Search for ``song_query`` and describe the best-matching song.

    Args:
        api_client: Client providing ``search`` and ``get_best_match``.
        song_query: Free-text query for the song.

    Returns:
        A dict with the ``id``, ``artist`` and ``title`` of the best match, or
        an empty dict when nothing matched.
    """
    search_results = await api_client.search(song_query)
    found_song = api_client.get_best_match(search_results, song_query)
    if not found_song:
        print(f"Song not found: {song_query}")
        # Stop here: there is no match to read id/artist/title from.
        return {}
    return {
        "id": found_song.id,
        "artist": found_song.artist,
        "title": found_song.title,
    }
|
||
if __name__ == "__main__":
    # Script entry: look every song of the training CSV up through the API
    # client and collect the best fuzzy match together with its label.
    token = os.getenv("TOKEN")
    api_client = api.API_Client(token)
    data = load_existing_data("./data/train.csv")
    songs_with_ids = []

    for song in tqdm(data, desc="Processing songs"):
        search_results = api_client.search(f"{song['artist']} - {song['title']}")
        if not search_results:
            continue

        # Score each candidate by artist similarity plus title similarity and
        # keep the first one with the highest score above zero.
        found_song = None
        top_score = 0
        for candidate in search_results:
            combined = (
                fuzz.ratio(candidate.artist.lower(), song['artist'].lower())
                + fuzz.ratio(candidate.title.lower(), song['title'].lower())
            )
            if combined > top_score:
                top_score, found_song = combined, candidate

        if not found_song:
            print(song)
            continue

        songs_with_ids.append({
            "id": found_song.id,
            "artist": found_song.artist,
            "title": found_song.title,
            "attachment_style": song['attachment_style'].lower(),
        })
|
||
async def main():
    """Attach Genius ids to the songs in ``./data/songs.json``.

    Reads the song list, runs ``add_id_to_song`` for every song through
    ``tqdm_asyncio.gather`` and writes the results to
    ``./data/songs_with_ids.json``. The client is closed even when one of
    these steps fails.
    """
    token = os.getenv("TOKEN")
    genius = API_Client(token)
    try:
        with open("./data/songs.json", "r", encoding="utf-8") as json_file:
            songs = json.load(json_file)

        songs_with_ids = await tqdm_asyncio.gather(
            *(add_id_to_song(genius, song) for song in songs)
        )

        with open("./data/songs_with_ids.json", "w", encoding="utf-8") as json_file:
            json.dump(songs_with_ids, json_file, indent=4)

        # TODO: run generate_data for additional song queries with a
        # TextGenerationModel and merge the output with
        # load_existing_data("../data/train.csv").
    finally:
        # Always release the shared HTTP session, also on errors above.
        await genius.close()
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the async pipeline only when executed as a script.
    asyncio.run(main())
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.