Skip to content

Commit

Permalink
feat: Always expand URLs, to avoid losing some links (#70)
Browse files Browse the repository at this point in the history
Co-authored-by: Daniel Martin Gonzalez <[email protected]>
  • Loading branch information
hectorzin and danimart1991 authored Nov 8, 2024
1 parent 21fcd54 commit 0d5a461
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 25 deletions.
19 changes: 3 additions & 16 deletions botaffiumeiro.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,9 @@
load_configuration,
)

SHORT_URL_DOMAINS = ["amzn.to", "amzn.eu", "s.click.aliexpress.com", "bit.ly", "tinyurl.com"]
DOMAIN_PATTERNS = {
"aliexpress": ALIEXPRESS_PATTERN,
}
# "aliexpress_short_url_pattern": r"https?://s\.click\.aliexpress\.com/e/[\w\d_]+",

logging.basicConfig(
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
Expand Down Expand Up @@ -87,14 +85,6 @@ def extract_embedded_url(query_params):
return embedded_domains


def is_short_url(url: str) -> bool:
"""
Checks if the given URL belongs to a known short URL domain.
"""
parsed_url = urlparse(url)
return parsed_url.netloc in SHORT_URL_DOMAINS


def extract_domains_from_message(message_text: str) -> Tuple[set, str]:
"""
Extracts domains from a message using domain patterns and searches for embedded URLs.
Expand All @@ -115,12 +105,9 @@ def extract_domains_from_message(message_text: str) -> Tuple[set, str]:

for url in urls_in_message:
# Always expand the URL, whether or not it looks shortened, so that no link is missed
if is_short_url(url):
expanded_url = expand_shortened_url(url)
# Replace the short URL with the expanded URL in the message text
message_text = message_text.replace(url, expanded_url)
else:
expanded_url = url
expanded_url = expand_shortened_url(url)
# Replace the original URL with its expanded form in the message text
message_text = message_text.replace(url, expanded_url)

# Now extract the domain from the expanded URL
parsed_url = urlparse(expanded_url)
Expand Down
20 changes: 11 additions & 9 deletions tests/test_botaffiumeiro.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ def test_mixed_full_and_shortened_urls(self, mock_expand):

# Simulate the expansion of the shortened URLs
mock_expand.side_effect = [
"https://www.amazon.com/dp/product123", # Long URL link, expands as itself
"https://www.amazon.com/dp/product456", # Expanded URL for amzn.to
"https://www.aliexpress.com/item/1005001234567890.html", # Expanded URL for aliexpress shortened link
]
Expand All @@ -378,6 +379,7 @@ def test_mixed_full_and_shortened_urls(self, mock_expand):
domains, modified_message = extract_domains_from_message(message_text)

# Check that the expand_shortened_url function was called with each of the three URLs (the long one and both shortened ones)
mock_expand.assert_any_call("https://www.amazon.com/dp/product123")
mock_expand.assert_any_call("https://amzn.to/abc123")
mock_expand.assert_any_call("https://s.click.aliexpress.com/e/buyproduct")

Expand Down Expand Up @@ -501,24 +503,24 @@ def test_extract_domains_with_long_urls(self):
"""
Test: Extract domains from long Amazon and AliExpress URLs.
"""
# Texto con URLs largas ya expandidas
# Text with long URLs already expanded
message_text = (
"Check out this Amazon deal: https://www.amazon.com/dp/B08XYZ123 "
"and this AliExpress: https://www.aliexpress.com/item/12345.html"
"and this AliExpress: https://es.aliexpress.com/item/12345.html" ## Use a localized URL because always expanding can turn a generic URL into a region-specific one
)

# Llama a la función que procesa el mensaje
# Call the function that processes the message
domains, modified_message = extract_domains_from_message(message_text)

# Verifica que los dominios correctos fueron extraídos
self.assertIn("amazon.com", domains) # Debería encontrar amazon.com
self.assertIn("aliexpress.com", domains) # Debería encontrar aliexpress.com
# Verify that the correct domains were extracted
self.assertIn("amazon.com", domains) # Should find amazon.com
self.assertIn("aliexpress.com", domains) # Should find aliexpress.com

# Verifica que las URLs completas estén presentes en el mensaje modificado
# Verify that the full URLs are present in the modified message
self.assertIn("https://www.amazon.com/dp/B08XYZ123", modified_message)
self.assertIn("https://www.aliexpress.com/item/12345.html", modified_message)
self.assertIn("aliexpress.com/item/12345.html", modified_message) # Should find aliexpress.com (not checking exact subdomain, as it may expand to different regions)

# Asegúrate de que no hubo modificaciones innecesarias
# Ensure there were no unnecessary modifications
self.assertEqual(message_text, modified_message)


Expand Down

0 comments on commit 0d5a461

Please sign in to comment.