Skip to content

Commit

Permalink
(fix) change amazon scrape title approach
Browse files Browse the repository at this point in the history
  • Loading branch information
tphoney committed May 20, 2024
1 parent f043bbc commit 58e4c7d
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 45 deletions.
5 changes: 3 additions & 2 deletions TODO
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
- allow amazon tv search for individual series
- allow amazon tv search for newer series
- music, a-ha/ash doesn't match as an artist, why?
- move language filtering out of plex search,should only happens in web tv & movie
- move newer show out of amazon and cinema-paradiso, move to web page
- move language filtering out of plex search, should only happen in web tv & movie
- when scraping movies, do we stop at the first best match ?

## done

Expand All @@ -35,3 +35,4 @@
- parallelise cinema-paradiso tv search
- for movies/tc dont refresh plex list every time, unless necessary
- parallelise amazon search tv/movie
- move newer show out of amazon and cinema-paradiso, move to web page
97 changes: 55 additions & 42 deletions amazon/amazon.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ var (
)

func SearchAmazonMoviesInParallel(plexMovies []types.PlexMovie, language string) (searchResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(plexMovies))
semaphore := make(chan struct{}, types.ConcurrencyLimit)

Expand All @@ -44,10 +45,12 @@ func SearchAmazonMoviesInParallel(plexMovies []types.PlexMovie, language string)
numberMoviesProcessed++
}
numberMoviesProcessed = 0 // job is done
fmt.Println("amazon movies found:", len(searchResults))
return searchResults
}

func SearchAmazonTVInParallel(plexTVShows []types.PlexTVShow, language string) (searchResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(plexTVShows))
semaphore := make(chan struct{}, types.ConcurrencyLimit)

Expand All @@ -66,6 +69,7 @@ func SearchAmazonTVInParallel(plexTVShows []types.PlexTVShow, language string) (
numberTVProcessed++
}
numberTVProcessed = 0 // job is done
fmt.Println("amazon TV shows found:", len(searchResults))
return searchResults
}

Expand All @@ -79,52 +83,54 @@ func GetTVJobProgress() int {

func ScrapeTitlesParallel(searchResults []types.SearchResults) (scrapedResults []types.SearchResults) {
numberMoviesProcessed = 0

ch := make(chan types.SearchResults, len(searchResults))
semaphore := make(chan struct{}, types.ConcurrencyLimit)
for i := range searchResults {
// check if the search result is a movie
if len(searchResults[i].MovieSearchResults) > 0 {
ch := make(chan *types.MovieSearchResult, len(searchResults[i].MovieSearchResults))
semaphore := make(chan struct{}, types.ConcurrencyLimit)
for j := range searchResults[i].MovieSearchResults {
go func(j int) {
semaphore <- struct{}{}
defer func() { <-semaphore }()
scrapeTitle(&searchResults[i].MovieSearchResults[j], searchResults[i].PlexMovie.DateAdded, ch)
}(j)
}
movieResults := make([]types.MovieSearchResult, 0, len(searchResults[i].MovieSearchResults))
for range searchResults[i].MovieSearchResults {
result := <-ch
movieResults = append(movieResults, *result)
}
fmt.Println("Scraped", len(movieResults), "titles for", searchResults[i].PlexMovie.Title)
searchResults[i].MovieSearchResults = movieResults
}
scrapedResults = append(scrapedResults, searchResults[i])
go func(i int) {
semaphore <- struct{}{}
defer func() { <-semaphore }()
scrapeTitles(&searchResults[i], ch)
}(i)
}

scrapedResults = make([]types.SearchResults, 0, len(searchResults))
for range searchResults {
result := <-ch
scrapedResults = append(scrapedResults, result)
numberMoviesProcessed++
}
numberMoviesProcessed = 0
fmt.Println("amazon Movie titles scraped:", len(scrapedResults))
return scrapedResults
}

func scrapeTitle(movie *types.MovieSearchResult, dateAdded time.Time, ch chan<- *types.MovieSearchResult) {
rawData, err := makeRequest(movie.URL, "")
if err != nil {
fmt.Println("scrapeTitle: Error making request:", err)
ch <- movie
return
}
// Find the release date
movie.ReleaseDate = time.Time{} // default to zero time
r := regexp.MustCompile(`<a class="grey noline" alt=".*">(.*?)</a></span>`)
match := r.FindStringSubmatch(rawData)
if match != nil {
stringDate := match[1]
movie.ReleaseDate, _ = time.Parse("Jan 02, 2006", stringDate)
}
if movie.ReleaseDate.After(dateAdded) {
movie.NewRelease = true
func scrapeTitles(searchResult *types.SearchResults, ch chan<- types.SearchResults) {
dateAdded := searchResult.PlexMovie.DateAdded
for i := range searchResult.MovieSearchResults {
// this is to limit the number of requests
if !searchResult.MovieSearchResults[i].BestMatch {
continue
}
rawData, err := makeRequest(searchResult.MovieSearchResults[i].URL, "")
if err != nil {
fmt.Println("scrapeTitle: Error making request:", err)
ch <- *searchResult
return
}
// Find the release date
searchResult.MovieSearchResults[i].ReleaseDate = time.Time{} // default to zero time
r := regexp.MustCompile(`<a class="grey noline" alt=".*">(.*?)</a></span>`)
match := r.FindStringSubmatch(rawData)
if match != nil {
stringDate := match[1]
searchResult.MovieSearchResults[i].ReleaseDate, _ = time.Parse("Jan 02, 2006", stringDate)
}
if searchResult.MovieSearchResults[i].ReleaseDate.After(dateAdded) {
searchResult.MovieSearchResults[i].NewRelease = true
}
break // only scrape the first best match
}
ch <- movie
ch <- *searchResult
}

func searchAmazonMovie(plexMovie types.PlexMovie, language string, movieSearchResult chan<- types.SearchResults) {
Expand Down Expand Up @@ -262,24 +268,31 @@ func makeRequest(inputURL, language string) (response string, err error) {
}

if err != nil {
fmt.Println("Error creating request:", err)
fmt.Println("makeRequest: error creating request:", err)
return response, err
}

client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error sending request:", err)
fmt.Println("makeRequest: error sending request:", err)
return response, err
}

defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Println("Error reading response body:", err)
fmt.Println("makeRequest: error reading response body:", err)
return response, err
}

// check for a 200 status code
if resp.StatusCode != http.StatusOK {
fmt.Println("amazon: status code not OK, probably rate limited:", resp.StatusCode)
return response, fmt.Errorf("amazon: status code not OK: %d", resp.StatusCode)
}

rawResponse := string(body)
return rawResponse, nil
}
26 changes: 26 additions & 0 deletions amazon/amazon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,29 @@ func TestSearchAmazonTV(t *testing.T) {
}
fmt.Println(result)
}

// TestScrapeTitlesParallel feeds a single search result, whose only candidate
// is flagged as the best match, through ScrapeTitlesParallel and verifies that
// a result comes back with a non-zero release date.
//
// NOTE(review): the scrape path requests the candidate URL through makeRequest,
// so this test appears to need network access to the listed site — confirm
// before running it in an offline CI environment.
func TestScrapeTitlesParallel(t *testing.T) {
	result := ScrapeTitlesParallel([]types.SearchResults{
		{
			PlexMovie: types.PlexMovie{
				Title: "napoleon dynamite",
				Year:  "2001",
			},
			MovieSearchResults: []types.MovieSearchResult{
				{
					FoundTitle: "Napoleon Dynamite",
					URL:        "https://www.blu-ray.com/movies/Napoleon-Dynamite-Blu-ray/2535/",
					BestMatch:  true,
				},
			},
		},
	})

	// Fatalf (not Errorf): the checks below index into result, so continuing
	// after an empty result would panic instead of reporting a clean failure.
	if len(result) == 0 {
		t.Fatalf("Expected search results, but got none")
	}
	if len(result[0].MovieSearchResults) == 0 {
		t.Fatalf("Expected movie search results, but got none")
	}
	if result[0].MovieSearchResults[0].ReleaseDate.IsZero() {
		t.Errorf("Expected release date, but got none")
	}
	fmt.Println(result)
}
2 changes: 1 addition & 1 deletion web/tv/tv.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (c TVConfig) ProcessHTML(w http.ResponseWriter, r *http.Request) {
}
filters = newFilters
//nolint: gocritic
plexTV = plexTV[:10]
// plexTV = plexTV[:10]
//lint: gocritic

tvJobRunning = true
Expand Down

0 comments on commit 58e4c7d

Please sign in to comment.