Skip to content

Commit

Permalink
a lot of improvements to matching
Browse files Browse the repository at this point in the history
  • Loading branch information
xnetcat committed Nov 6, 2022
1 parent 182a4ce commit 8fcdf0f
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 30 deletions.
126 changes: 96 additions & 30 deletions spotdl/providers/audio/ytmusic.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def search(self, song: Song) -> Optional[str]:
)

if len(isrc_results) == 1:
isrc_result = self.order_results([isrc_results[0]], song)
isrc_result = self.order_results([isrc_results[0]], song, True)
if len(isrc_result) == 1:
isrc_link, isrc_score = isrc_result.popitem()

Expand Down Expand Up @@ -168,14 +168,15 @@ def get_results(self, search_term: str, **kwargs) -> List[Dict[str, Any]]:
return simplified_results

def order_results(
self, results: List[Dict[str, Any]], song: Song
self, results: List[Dict[str, Any]], song: Song, is_isrc: bool = False
) -> Dict[str, Any]:
"""
Filter results based on the song's metadata.
### Arguments
- results: The results to filter.
- song: The song to filter by.
- is_isrc: Whether the results are from an isrc search.
### Returns
- A dict of filtered results.
Expand All @@ -194,9 +195,16 @@ def order_results(

# DEBUG CODE
# print(f"#############################")
# print(f"song.name: {song.name}")
# print(f"song.album_name: {song.album_name}")
# print(f"song.artist: {song.artist}")
# print(f"song.artists: {song.artists}")
# print(f"song.isrc: {song.isrc}")
# print(f"song.duration: {song.duration}")
# print(f"slug_song_name: {slug_song_name}")
# print(f"slug_song_album_name: {slug_song_album_name}")
# print(f"slug_song_main_artist: {slug_song_main_artist}")
# print(f"slug_song_artists: {slug_song_artists}")
# print(f"slug_song_title: {slug_song_title}")
# print(f"slug_song_duration: {song.duration}")
# print(f"slug_song_artists: {slug_song_artists}")
Expand Down Expand Up @@ -224,6 +232,7 @@ def order_results(
# print(f"result link: {result['link']}")
# print(f"result type: {result['type']}")
# print(f"result duration: {result['duration']}")
# print(f"result artists_list: {result['artists_list']}")
# print(f"slug_result_name: {slug_result_name}")
# print(f"slug_result_artists: {slug_result_artists}")
# print(f"slug_result_album: {slug_result_album}")
Expand All @@ -237,18 +246,37 @@ def order_results(
main_artist_match = fuzz.ratio(
slug_song_main_artist, slugify(result["artists_list"][0])
)
# print(f"main_artist_match: {main_artist_match}")
# print(f"? main_artist_match: {main_artist_match}")

artist_match_number = main_artist_match
if len(song.artists) > 1:
# match the song's artists with the result's artists
artists_match = fuzz.ratio(slug_song_artists, slug_result_artists)
# print(f"artists_match: {artists_match}")

if len(song.artists) == len(result["artists_list"]):
artists_match = fuzz.ratio(slug_song_artists, slug_result_artists)
# print(f"exact artists_match: {artists_match}")
else:
# Sort list1
artist1_list = list(map(slugify, song.artists))
artist1_list.sort()

# Sort list2
artist2_list = list(map(slugify, result["artists_list"]))
artist2_list.sort()

# Zip two sorted lists
zipped_lists = list(zip(song.artists, result["artists_list"]))

artist_match = 0.0
for artist1, artist2 in zipped_lists:
artist_match += fuzz.ratio(slugify(artist1), slugify(artist2))

artists_match = artist_match / len(zipped_lists)

artist_match_number += artists_match

artist_match = artist_match_number / (2 if len(song.artists) > 1 else 1)
# print("first artist_match: ", artist_match)
# print("? first artist_match: ", artist_match)

# additional checks for results that are not songs
if artist_match <= 50 and result["type"] != "song":
Expand All @@ -263,6 +291,23 @@ def order_results(
artist_match = channel_name_match
# print("? second artist_match: ", artist_match)

# If artist match is still too low,
# we fallback to matching all song artist names
# with the result's title
if artist_match <= 50:
artist_title_match = 0
for artist in song.artists:
slug_artist = slugify(artist).replace("-", "")
if slug_artist in slug_result_name.replace("-", ""):
artist_title_match += 1

artist_title_match = (artist_title_match / len(song.artists)) * 100
# print(f"? artist_title_match: {artist_title_match}")

if artist_title_match > artist_match:
artist_match = artist_title_match
# print("? third artist_match: ", artist_match)

# additional checks for results that are songs
if artist_match < 70 and result["type"] == "song":
# Check if the song name is very similar to the result name
Expand Down Expand Up @@ -303,7 +348,7 @@ def order_results(
artist_match += 15 if len(song.artists[1:]) <= 2 else 10
# print("? other artist artist_match: ", artist_match)

# print("final artist_match: ", artist_match)
# print("? final artist_match: ", artist_match)

# skip results with artist match lower than 70%
if artist_match < 70:
Expand All @@ -321,19 +366,29 @@ def order_results(
# but not in the result name
# if it is, we add the artist to the result name
for artist in song.artists:
slug_song_artist = slugify(artist)
if slug_song_artist in test_str2 and not slug_song_artist in test_str1:
slug_song_artist = slugify(artist).replace("-", "")
if slug_song_artist in test_str2.replace(
"-", ""
) and not slug_song_artist in test_str1.replace("-", ""):
test_str1 += f"-{slug_song_artist}"

# same thing for for song name
for artist in song.artists:
slug_result_artist = slugify(artist)
if (
slug_result_artist in test_str1
and not slug_result_artist in test_str2
):
slug_result_artist = slugify(artist).replace("-", "")
if slug_result_artist in test_str1.replace(
"-", ""
) and not slug_result_artist in test_str2.replace("-", ""):
test_str2 += f"-{slug_result_artist}"

test_str1 = test_str1.split("-")
test_str2 = test_str2.split("-")

test_str1.sort()
test_str2.sort()

test_str1 = "-".join(test_str1)
test_str2 = "-".join(test_str2)

# print(f"test_str1: {test_str1}")
# print(f"test_str2: {test_str2}")

Expand Down Expand Up @@ -368,33 +423,44 @@ def order_results(
non_match_value = (delta**2) / song.duration * 100
time_match = 100 - non_match_value

# drop results with time match lower than 50%
# print(f"time_match: {time_match}")
if time_match < 45:
# print("! time_match < 45 - skipping")
continue
# print(f"? time_match: {time_match}")

# Calculate total match
average_match = (artist_match + name_match + time_match) / 3
average_match = (artist_match + name_match) / 2

# print(f"album_match: {album_match}")
# print(f"time_match: {time_match}")
# print(f"average_match: {average_match}")
# print(f"? album_match: {album_match}")
# print(f"? time_match: {time_match}")
# print(f"? average_match (only artist and name): {average_match}")

if (
result["type"] == "song"
and slug_result_album
and fuzz.partial_ratio(
slug_song_album_name, slug_result_name, score_cutoff=95
slug_song_album_name, slug_result_name, score_cutoff=85
)
and slug_result_album == slug_song_album_name
):
# If the result album name is similar to the song album name
# and the result album name is the same as the song album name
# we add album match to the average match
average_match = (
artist_match + album_match + name_match + time_match
) / 4
# and average match is higher than 80%
# we add album match to the total match
average_match = average_match + album_match / 2

# print(f"? average_match with album_match: {average_match}")

if time_match < 50 and average_match < 85:
# If the time match is lower than 50% and the average match is lower than 85%
# we skip the result
# print("! time_match < 50 and average_match < 85 - skipping")
continue
elif time_match < 50:
# If the time match is lower than 50% but the average match is higher than 85%
# we add time match to the average match

# if the result is an isrc result we don't add time match
if not is_isrc:
average_match = (average_match + time_match) / 2
# print(f"? average_match with time_match, not isrc: {average_match}")

# print(f"? final average_match: {average_match}")

# the results along with the avg Match
links_with_match_value[result["link"]] = average_match
Expand Down
12 changes: 12 additions & 0 deletions tests/test_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,18 @@
"https://open.spotify.com/track/2Ikdgh3J5vCRmnCL3Xcrtv",
"https://music.youtube.com/watch?v=sJpzMSHKUqI",
),
(
"https://open.spotify.com/track/4uOHYc6dCVLcNdQBRUlA0G",
"https://www.youtube.com/watch?v=Mb3tyjibXCg",
),
(
# this song is bugged on ytmusic for some reason
# it doesn't show up in search results
# so we can only find the lyrics version of it
# which is fine but not ideal
"https://open.spotify.com/track/1zi7xx7UVEFkmKfv06H8x0",
"https://www.youtube.com/watch?v=ki0Ocze98U8",
),
],
)
def test_ytmusic_matching(monkeypatch, query, expected):
Expand Down

0 comments on commit 8fcdf0f

Please sign in to comment.