Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parallelise amazon #27

Merged
merged 5 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions TODO
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

## features

- new release for amazon tv series
- allow amazon tv search for individual series
- new release for cinema-paradiso tv / movie

## bugs

- allow amazon tv search for individual series
- allow amazon tv search for newer series
- music, a-ha/ash doesn't match as an artist, why?
- move language filtering out of plex search,should only happens in web tv & movie
- move language filtering out of plex search, should only happen in web tv & movie web pages
- when scraping movies, do we stop at the first best match ?

## done

Expand All @@ -33,3 +36,5 @@
- parallelise cinema-paradiso movie search 6m20 to 2m25
- parallelise cinema-paradiso tv search
- for movies/tv don't refresh plex list every time, unless necessary
- parallelise amazon search tv/movie
- move newer show out of amazon and cinema-paradiso, move to web page
311 changes: 175 additions & 136 deletions amazon/amazon.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,183 +16,178 @@ import (
)

const (
amazonURL = "https://www.blu-ray.com/movies/search.php?keyword="
amazonURL = "https://www.blu-ray.com/movies/search.php?keyword="
LanguageGerman = "german"
)

func ScrapeTitles(searchResults *types.SearchResults) (scrapedResults []types.MovieSearchResult) {
var results, lookups []types.MovieSearchResult
for _, searchResult := range searchResults.MovieSearchResults {
if !searchResult.BestMatch {
results = append(results, searchResult)
} else {
lookups = append(lookups, searchResult)
}
}
// Package-level progress counters. They are bumped once per result received
// by the collecting loops of the parallel jobs and set back to 0 when a job
// starts and again when it finishes.
//
// NOTE(review): these are plain ints written without synchronisation; if the
// progress getters are called from a different goroutine than the job, that
// is a data race — consider sync/atomic. Confirm against the callers.
var (
	// numberMoviesProcessed is the number of movie results collected so far
	// by the running movie search or title-scrape job.
	// NOTE(review): SearchAmazonTVInParallel also resets this counter on
	// entry instead of numberTVProcessed — looks unintended, confirm.
	numberMoviesProcessed int

	// numberTVProcessed is the number of TV results collected so far by the
	// running TV search job.
	numberTVProcessed int
)

if len(lookups) > 0 {
ch := make(chan *types.MovieSearchResult, len(lookups))
// Limit number of concurrent requests
semaphore := make(chan struct{}, types.ConcurrencyLimit)
for i := range lookups {
go func() {
semaphore <- struct{}{}
defer func() { <-semaphore }()
scrapeTitle(&lookups[i], searchResults.PlexMovie.DateAdded, ch)
}()
}
func SearchAmazonMoviesInParallel(plexMovies []types.PlexMovie, language string) (searchResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(plexMovies))
semaphore := make(chan struct{}, types.ConcurrencyLimit)

for i := range plexMovies {
go func(i int) {
semaphore <- struct{}{}
defer func() { <-semaphore }()
searchAmazonMovie(plexMovies[i], language, ch)
}(i)
}

for i := 0; i < len(lookups); i++ {
lookup := <-ch
results = append(results, *lookup)
}
searchResults = make([]types.SearchResults, 0, len(plexMovies))
for range plexMovies {
result := <-ch
searchResults = append(searchResults, result)
numberMoviesProcessed++
}
return results
numberMoviesProcessed = 0 // job is done
fmt.Println("amazon movies found:", len(searchResults))
return searchResults
}

func scrapeTitle(movie *types.MovieSearchResult, dateAdded time.Time, ch chan<- *types.MovieSearchResult) {
req, err := http.NewRequestWithContext(context.Background(), "GET", movie.URL, bytes.NewBuffer([]byte{}))
movie.ReleaseDate = time.Time{}
if err != nil {
fmt.Println("Error creating request:", err)
ch <- movie
return
func SearchAmazonTVInParallel(plexTVShows []types.PlexTVShow, language string) (searchResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(plexTVShows))
semaphore := make(chan struct{}, types.ConcurrencyLimit)

for i := range plexTVShows {
go func(i int) {
semaphore <- struct{}{}
defer func() { <-semaphore }()
searchAmazonTV(&plexTVShows[i], language, ch)
}(i)
}

req.Header.Set("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error sending request:", err)
ch <- movie
return
searchResults = make([]types.SearchResults, 0, len(plexTVShows))
for range plexTVShows {
result := <-ch
searchResults = append(searchResults, result)
numberTVProcessed++
}
numberTVProcessed = 0 // job is done
fmt.Println("amazon TV shows found:", len(searchResults))
return searchResults
}

defer resp.Body.Close()
// GetMovieJobProgress returns the current value of the package-level
// numberMoviesProcessed counter. The counter is incremented for every movie
// result collected and reset to 0 once a job completes, so 0 means either
// that nothing has been collected yet or that no job is in flight.
func GetMovieJobProgress() int {
	processed := numberMoviesProcessed
	return processed
}

body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Println("Error reading response body:", err)
ch <- movie
return
// GetTVJobProgress returns the current value of the package-level
// numberTVProcessed counter. The counter is incremented for every TV result
// collected and reset to 0 once the job completes, so 0 means either that
// nothing has been collected yet or that no job is in flight.
func GetTVJobProgress() int {
	processed := numberTVProcessed
	return processed
}

func ScrapeTitlesParallel(searchResults []types.SearchResults) (scrapedResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(searchResults))
semaphore := make(chan struct{}, types.ConcurrencyLimit)
for i := range searchResults {
go func(i int) {
semaphore <- struct{}{}
defer func() { <-semaphore }()
scrapeTitles(&searchResults[i], ch)
}(i)
}
rawData := string(body)
movie.ReleaseDate = findTitleDetails(rawData)
if movie.ReleaseDate.After(dateAdded) {
movie.NewRelease = true

scrapedResults = make([]types.SearchResults, 0, len(searchResults))
for range searchResults {
result := <-ch
scrapedResults = append(scrapedResults, result)
numberMoviesProcessed++
}
ch <- movie
numberMoviesProcessed = 0
fmt.Println("amazon Movie titles scraped:", len(scrapedResults))
return scrapedResults
}

func findTitleDetails(response string) (releaseDate time.Time) {
r := regexp.MustCompile(`<a class="grey noline" alt=".*">(.*?)</a></span>`)

match := r.FindStringSubmatch(response)
if match != nil {
stringDate := match[1]
var err error
releaseDate, err = time.Parse("Jan 02, 2006", stringDate)
func scrapeTitles(searchResult *types.SearchResults, ch chan<- types.SearchResults) {
dateAdded := searchResult.PlexMovie.DateAdded
for i := range searchResult.MovieSearchResults {
// this is to limit the number of requests
if !searchResult.MovieSearchResults[i].BestMatch {
continue
}
rawData, err := makeRequest(searchResult.MovieSearchResults[i].URL, "")
if err != nil {
releaseDate = time.Time{}
fmt.Println("scrapeTitle: Error making request:", err)
ch <- *searchResult
return
}
// Find the release date
searchResult.MovieSearchResults[i].ReleaseDate = time.Time{} // default to zero time
r := regexp.MustCompile(`<a class="grey noline" alt=".*">(.*?)</a></span>`)
match := r.FindStringSubmatch(rawData)
if match != nil {
stringDate := match[1]
searchResult.MovieSearchResults[i].ReleaseDate, _ = time.Parse("Jan 02, 2006", stringDate)
}
if searchResult.MovieSearchResults[i].ReleaseDate.After(dateAdded) {
searchResult.MovieSearchResults[i].NewRelease = true
}
} else {
releaseDate = time.Time{}
}

return releaseDate
ch <- *searchResult
}

func SearchAmazonMovie(plexMovie types.PlexMovie, filter string) (movieSearchResult types.SearchResults, err error) {
func searchAmazonMovie(plexMovie types.PlexMovie, language string, movieSearchResult chan<- types.SearchResults) {
result := types.SearchResults{}
result.PlexMovie = plexMovie
result.SearchURL = ""

urlEncodedTitle := url.QueryEscape(plexMovie.Title)
amazonURL := amazonURL + urlEncodedTitle
if filter != "" {
amazonURL += filter
// this searches for the movie in a language
switch language {
case LanguageGerman:
amazonURL += "&audio=" + language
default:
// do nothing
}
amazonURL += "&submit=Search&action=search"
req, err := http.NewRequestWithContext(context.Background(), "GET", amazonURL, bytes.NewBuffer([]byte{}))

movieSearchResult.PlexMovie = plexMovie
movieSearchResult.SearchURL = amazonURL

req.Header.Set("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
country := "uk"
if strings.Contains(filter, "german") {
country = "de"
}
req.Header.Set("Cookie", fmt.Sprintf("country=%s;", country))
if err != nil {
fmt.Println("Error creating request:", err)
return movieSearchResult, err
}

client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error sending request:", err)
return movieSearchResult, err
}

defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
rawData, err := makeRequest(amazonURL, language)
if err != nil {
fmt.Println("Error reading response body:", err)
return movieSearchResult, err
fmt.Println("searchAmazonMovie: Error making request:", err)
movieSearchResult <- result
return
}
rawData := string(body)

moviesFound, _ := findTitlesInResponse(rawData, true)
movieSearchResult.MovieSearchResults = moviesFound
movieSearchResult = utils.MarkBestMatch(&movieSearchResult)
return movieSearchResult, nil
result.MovieSearchResults = moviesFound
result = utils.MarkBestMatch(&result)
movieSearchResult <- result
}

func SearchAmazonTV(plexTVShow *types.PlexTVShow, filter string) (tvSearchResult types.SearchResults, err error) {
func searchAmazonTV(plexTVShow *types.PlexTVShow, language string, tvSearchResult chan<- types.SearchResults) {
result := types.SearchResults{}
result.PlexTVShow = *plexTVShow
result.SearchURL = amazonURL

urlEncodedTitle := url.QueryEscape(fmt.Sprintf("%s complete series", plexTVShow.Title)) // complete series
amazonURL := amazonURL + urlEncodedTitle
if filter != "" {
amazonURL += filter
// this searches for the movie in a language
switch language {
case LanguageGerman:
amazonURL += "&audio=" + language
default:
// do nothing
}
amazonURL += "&submit=Search&action=search"
req, err := http.NewRequestWithContext(context.Background(), "GET", amazonURL, bytes.NewBuffer([]byte{}))

tvSearchResult.PlexTVShow = *plexTVShow
tvSearchResult.SearchURL = amazonURL

req.Header.Set("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
country := "uk"
if strings.Contains(filter, "german") {
country = "de"
}
req.Header.Set("Cookie", fmt.Sprintf("country=%s;", country))
if err != nil {
fmt.Println("Error creating request:", err)
return tvSearchResult, err
}

client := &http.Client{}
resp, err := client.Do(req)
rawData, err := makeRequest(amazonURL, language)
if err != nil {
fmt.Println("Error sending request:", err)
return tvSearchResult, err
}

defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Println("Error reading response body:", err)
return tvSearchResult, err
fmt.Println("searchAmazonTV: Error making request:", err)
tvSearchResult <- result
return
}
rawData := string(body)

_, titlesFound := findTitlesInResponse(rawData, false)
tvSearchResult.TVSearchResults = titlesFound
tvSearchResult = utils.MarkBestMatch(&tvSearchResult)
return tvSearchResult, nil
result.TVSearchResults = titlesFound
result = utils.MarkBestMatch(&result)
tvSearchResult <- result
}

func findTitlesInResponse(response string, movie bool) (movieResults []types.MovieSearchResult, tvResults []types.TVSearchResult) {
Expand Down Expand Up @@ -256,3 +251,47 @@ func findTitlesInResponse(response string, movie bool) (movieResults []types.Mov

return movieResults, tvResults
}

// makeRequest performs an HTTP GET against inputURL and returns the response
// body as a string.
//
// The request is sent with a desktop-browser User-Agent and a "country"
// cookie that pins the region of the results: "de" when language is
// LanguageGerman, "uk" for every other value (including the empty string).
//
// A non-nil error is returned when the request cannot be built (for example
// a malformed URL), when sending it fails or times out, when the body cannot
// be read, or when the server answers with anything other than 200 OK.
// response is always "" when err is non-nil.
func makeRequest(inputURL, language string) (response string, err error) {
	// Upper bound for the whole exchange, so that one stalled connection
	// cannot keep a caller (and its concurrency slot) blocked forever.
	const requestTimeout = 30 * time.Second

	req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, inputURL, bytes.NewBuffer([]byte{}))
	// The error must be checked before req is touched: req is nil when the
	// URL cannot be parsed, and setting headers on it would panic.
	if err != nil {
		fmt.Println("makeRequest: error creating request:", err)
		return "", err
	}

	req.Header.Set("User-Agent",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

	// this forces results from a specific amazon region
	switch language {
	case LanguageGerman:
		req.Header.Set("Cookie", "country=de;")
	default:
		req.Header.Set("Cookie", "country=uk;")
	}

	client := &http.Client{Timeout: requestTimeout}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("makeRequest: error sending request:", err)
		return "", err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("makeRequest: error reading response body:", err)
		return "", err
	}

	// Anything other than 200 is treated as a failure; in practice this is
	// most often the site throttling the scraper.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("amazon: status code not OK, probably rate limited:", resp.StatusCode)
		return "", fmt.Errorf("amazon: status code not OK: %d", resp.StatusCode)
	}

	return string(body), nil
}
Loading
Loading