Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(feat) allow scraping of cinemaparadiso movies #31

Merged
merged 1 commit into from
May 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions TODO
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- new release for amazon tv series
- allow amazon tv search for individual series
- new release for cinema-paradiso tv / movie
- improve cinema-paradiso movie scrape: many search results point to the same page, so scraping each one wastes processing

## bugs

Expand Down
175 changes: 106 additions & 69 deletions cinemaparadiso/cinemaparadiso.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ var (

// nolint: dupl, nolintlint
func GetCinemaParadisoMoviesInParallel(plexMovies []types.PlexMovie) (searchResults []types.SearchResults) {
numberMoviesProcessed = 0
ch := make(chan types.SearchResults, len(plexMovies))
semaphore := make(chan struct{}, types.ConcurrencyLimit)

Expand All @@ -49,6 +50,28 @@ func GetCinemaParadisoMoviesInParallel(plexMovies []types.PlexMovie) (searchResu
return searchResults
}

// ScrapeMovieTitlesParallel runs scrapeMovieTitle for every entry of
// searchResults in its own goroutine and gathers the results.
//
// At most types.ConcurrencyLimit scrapes run at the same time. Results are
// appended in the order they arrive on the channel, so the returned slice is
// not guaranteed to follow the input order. The package-level
// numberMoviesProcessed counter is reset at the start, incremented once per
// received result, and reset again once the job has finished.
func ScrapeMovieTitlesParallel(searchResults []types.SearchResults) []types.SearchResults {
	total := len(searchResults)
	numberMoviesProcessed = 0

	// Buffered to the full job size so no worker ever blocks on its send.
	scraped := make(chan types.SearchResults, total)
	slots := make(chan struct{}, types.ConcurrencyLimit)

	for idx := range searchResults {
		entry := &searchResults[idx]
		go func() {
			// Take a slot before doing any work; hand it back when done.
			slots <- struct{}{}
			defer func() { <-slots }()
			scrapeMovieTitle(entry, scraped)
		}()
	}

	detailed := make([]types.SearchResults, 0, total)
	for received := 0; received < total; received++ {
		detailed = append(detailed, <-scraped)
		numberMoviesProcessed++
	}

	numberMoviesProcessed = 0 // job is done
	return detailed
}

// nolint: dupl, nolintlint
func GetCinemaParadisoTVInParallel(plexTVShows []types.PlexTVShow) (searchResults []types.SearchResults) {
ch := make(chan types.SearchResults, len(plexTVShows))
Expand Down Expand Up @@ -85,7 +108,7 @@ func searchCinemaParadisoMovie(plexMovie *types.PlexMovie, movieSearchResult cha
result.PlexMovie = *plexMovie
urlEncodedTitle := url.QueryEscape(plexMovie.Title)
result.SearchURL = cinemaparadisoSearchURL + "?form-search-field=" + urlEncodedTitle
rawData, err := makeSearchRequest(urlEncodedTitle)
rawData, err := makeRequest(result.SearchURL, http.MethodPost, fmt.Sprintf("form-search-field=%s", urlEncodedTitle))
if err != nil {
fmt.Println("Error making web request:", err)
movieSearchResult <- result
Expand All @@ -98,12 +121,53 @@ func searchCinemaParadisoMovie(plexMovie *types.PlexMovie, movieSearchResult cha
movieSearchResult <- result
}

// movieFormatReleaseRegex captures, for each format section of a movie detail
// page, the numeric format id and the release date, e.g.
// <dt>Release Date:</dt><dd>29/07/2013</dd>. It is compiled once at package
// scope instead of once per scraped movie.
var movieFormatReleaseRegex = regexp.MustCompile(`<section id="format-(.*?)".*?Release Date:<\/dt><dd>(.*?)<\/dd>`)

// scrapeMovieTitle fetches the detail page of every best-match entry in
// result.MovieSearchResults and fills in its release information.
//
// For each best match, ReleaseDate is set to the release date found for the
// entry's Format (the zero time when the page lists none or the date cannot be
// parsed), and NewRelease is set when that date is after
// result.PlexMovie.DateAdded. Entries that share a URL are served from a
// per-call cache so the same page is only downloaded once.
//
// The (possibly partially updated) result is sent on movieSearchResult exactly
// once: either after all entries were processed or as soon as a request fails.
func scrapeMovieTitle(result *types.SearchResults, movieSearchResult chan<- types.SearchResults) {
	// Release dates per disc format, keyed by detail page URL.
	pages := make(map[string]map[string]time.Time)

	for i := range result.MovieSearchResults {
		movie := &result.MovieSearchResults[i]
		if !movie.BestMatch {
			continue
		}

		discReleases, cached := pages[movie.URL]
		if !cached {
			rawData, err := makeRequest(movie.URL, http.MethodGet, "")
			if err != nil {
				fmt.Println("Error making web request:", err)
				movieSearchResult <- *result
				return
			}
			discReleases = parseMovieDiscReleases(rawData)
			pages[movie.URL] = discReleases
		}

		// A format missing from the map yields the zero time.
		movie.ReleaseDate = discReleases[movie.Format]
		// check if the release date is after the date the movie was added to plex
		if movie.ReleaseDate.After(result.PlexMovie.DateAdded) {
			movie.NewRelease = true
		}
	}
	movieSearchResult <- *result
}

// parseMovieDiscReleases extracts the release date of each known disc format
// (section id 1 = DVD, 3 = Blu-ray, 14 = 4K) from a movie detail page.
// Sections with an unknown id or an unparsable date are skipped.
func parseMovieDiscReleases(rawData string) map[string]time.Time {
	discReleases := make(map[string]time.Time)
	// The pattern matches once per format section (DVD, Blu-ray, 4K, ...).
	for _, match := range movieFormatReleaseRegex.FindAllStringSubmatch(rawData, -1) {
		released, err := time.Parse("02/01/2006", match[2])
		if err != nil {
			// Leave the format without a date rather than storing a bogus one.
			continue
		}
		switch match[1] {
		case "1":
			discReleases[types.DiskDVD] = released
		case "3":
			discReleases[types.DiskBluray] = released
		case "14":
			discReleases[types.Disk4K] = released
		}
	}
	return discReleases
}

func searchCinemaParadisoTV(plexTVShow *types.PlexTVShow, tvSearchResult chan<- types.SearchResults) {
result := types.SearchResults{}
urlEncodedTitle := url.QueryEscape(plexTVShow.Title)
result.PlexTVShow = *plexTVShow
result.SearchURL = cinemaparadisoSearchURL + "?form-search-field=" + urlEncodedTitle
rawData, err := makeSearchRequest(urlEncodedTitle)
rawData, err := makeRequest(result.SearchURL, http.MethodPost, fmt.Sprintf("form-search-field=%s", urlEncodedTitle))
if err != nil {
fmt.Println("searchCinemaParadisoTV: Error making web request:", err)
tvSearchResult <- result
Expand All @@ -124,62 +188,15 @@ func searchCinemaParadisoTV(plexTVShow *types.PlexTVShow, tvSearchResult chan<-

func findTVSeriesInfo(seriesURL string) (tvSeries []types.TVSeasonResult, err error) {
// make a request to the url
req, err := http.NewRequestWithContext(context.Background(), "GET", seriesURL, bytes.NewBuffer([]byte{}))
if err != nil {
fmt.Println("Error creating request:", err)
return tvSeries, err
}

client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
fmt.Println("Error sending request:", err)
return tvSeries, err
}

defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
rawData, err := makeRequest(seriesURL, http.MethodGet, "")
if err != nil {
fmt.Println("Error reading response body:", err)
fmt.Println("findTVSeriesInfo: Error making web request:", err)
return tvSeries, err
}
rawData := string(body)
// write the raw data to a file
// os.WriteFile("series.html", body, 0644)
tvSeries = findTVSeriesInResponse(rawData)
return tvSeries, nil
}

// makeSearchRequest POSTs urlEncodedTitle as the "form-search-field" form
// value to cinemaparadisoSearchURL and returns the response body as a string.
//
// urlEncodedTitle must already be URL-encoded; it is inserted into the form
// body verbatim. On any failure the error is printed and returned together
// with an empty response.
func makeSearchRequest(urlEncodedTitle string) (rawResponse string, err error) {
	rawQuery := []byte(fmt.Sprintf("form-search-field=%s", urlEncodedTitle))
	req, err := http.NewRequestWithContext(context.Background(), "POST", cinemaparadisoSearchURL, bytes.NewBuffer(rawQuery))
	// Check the error before touching req: it is nil when creation fails.
	if err != nil {
		fmt.Println("Error creating request:", err)
		return rawResponse, err
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded") // Assuming form data

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("Error sending request:", err)
		return rawResponse, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Error reading response body:", err)
		return rawResponse, err
	}
	// write the raw data to a file
	// os.WriteFile("search.html", body, 0644)
	return string(body), nil
}

func findTVSeriesInResponse(response string) (tvSeries []types.TVSeasonResult) {
// look for the series in the response
r := regexp.MustCompile(`<li data-filmId="(\d*)">`)
Expand Down Expand Up @@ -209,28 +226,12 @@ func findTVSeriesInResponse(response string) (tvSeries []types.TVSeasonResult) {
}

func makeSeriesRequest(tv types.TVSeasonResult) (types.TVSeasonResult, error) {
content := []byte(fmt.Sprintf("FilmID=%s", tv.URL))
req, err := http.NewRequestWithContext(context.Background(), "POST", cinemaparadisoSeriesURL, bytes.NewBuffer(content))
rawData, err := makeRequest(cinemaparadisoSeriesURL, http.MethodPost, fmt.Sprintf("FilmID=%s", tv.URL))
if err != nil {
return tv, fmt.Errorf("makeSeriesRequest: error creating request: %w", err)
return tv, fmt.Errorf("makeSeriesRequest: error making request: %w", err)
}

client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return tv, fmt.Errorf("makeSeriesRequest: error sending request: %w", err)
}

defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
if err != nil {
return tv, fmt.Errorf("makeSeriesRequest: error reading response body: %w", err)
}
rawData := string(body)
// write the raw data to a file
r := regexp.MustCompile(`{.."Media..":.."(.*?)",.."ReleaseDate..":.."(.*?)"}`)

// Find all matches
matches := r.FindAllStringSubmatch(rawData, -1)
for _, match := range matches {
Expand Down Expand Up @@ -303,6 +304,42 @@ func findTitlesInResponse(response string, movie bool) (movieResults []types.Mov
return movieResults, tvResults
}

// makeRequest performs a single HTTP request and returns the response body as
// a string.
//
// urlEncodedTitle is, despite its name, the full URL the request is sent to.
// method must be http.MethodPost or http.MethodGet; any other value is
// rejected with an error. content is the request body for POST requests and is
// ignored for GET requests. The form Content-Type header is only set for POST
// bodies that contain "form-search-field".
//
// Errors from creating, sending or reading the request are printed and
// returned together with an empty response.
func makeRequest(urlEncodedTitle, method, content string) (rawResponse string, err error) {
	var req *http.Request
	switch method {
	case http.MethodPost:
		req, err = http.NewRequestWithContext(context.Background(), http.MethodPost, urlEncodedTitle, bytes.NewBufferString(content))
	case http.MethodGet:
		req, err = http.NewRequestWithContext(context.Background(), http.MethodGet, urlEncodedTitle, http.NoBody)
	default:
		// Without this guard req would stay nil and client.Do would panic.
		return rawResponse, fmt.Errorf("makeRequest: unsupported HTTP method %q", method)
	}

	if err != nil {
		fmt.Println("Error creating request:", err)
		return rawResponse, err
	}

	// Only touch req after the error check: it is nil when creation fails.
	if method == http.MethodPost && strings.Contains(content, "form-search-field") {
		req.Header.Set("Content-Type", "application/x-www-form-urlencoded") // Assuming form data
	}

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("Error sending request:", err)
		return rawResponse, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Error reading response body:", err)
		return rawResponse, err
	}
	// write the raw data to a file
	// os.WriteFile("search.html", body, 0644)
	return string(body), nil
}

func extractDiscFormats(movieEntry string) []string {
ulStartIndex := strings.Index(movieEntry, `<ul class="media-types">`) + len(`<ul class="media-types">`)
ulEndIndex := strings.Index(movieEntry[ulStartIndex:], "</ul>") + ulStartIndex
Expand Down
28 changes: 28 additions & 0 deletions cinemaparadiso/cinemaparadiso_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,31 @@ func TestSearchCinemaParadisoMovies(t *testing.T) {
t.Errorf("Expected searchurl, but got none")
}
}

// TestScrapeMovieTitlesParallel checks that scraping a single best-match
// Blu-ray entry returns one result with a populated release date.
// The entry points at a real cinemaparadiso.co.uk URL, so the test performs a
// live HTTP request.
func TestScrapeMovieTitlesParallel(t *testing.T) {
	searchResults := []types.SearchResults{
		{
			PlexMovie: types.PlexMovie{
				Title: "Elf",
				Year:  "2021",
			},
			MovieSearchResults: []types.MovieSearchResult{
				{
					URL:       "https://www.cinemaparadiso.co.uk/rentals/elf-10167.html",
					Format:    "Blu-ray",
					Year:      "2003",
					BestMatch: true,
				},
			},
		},
	}

	detailedSearchResults := ScrapeMovieTitlesParallel(searchResults)

	// Stop here on a mismatch: the index below would panic on an empty slice.
	if len(detailedSearchResults) != len(searchResults) {
		t.Fatalf("Expected %d detailed search results, but got %d", len(searchResults), len(detailedSearchResults))
	}
	// we should have a release date
	if detailedSearchResults[0].MovieSearchResults[0].ReleaseDate.IsZero() {
		t.Errorf("Expected release date, but got none")
	}
}
5 changes: 4 additions & 1 deletion web/movies/movies.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func (c MoviesConfig) ProcessHTML(w http.ResponseWriter, r *http.Request) {
// filter plex movies based on preferences, eg. only movies with a certain resolution
filteredPlexMovies := plex.FilterPlexMovies(plexMovies, plexFilters)
//nolint: gocritic
// filteredPlexMovies = filteredPlexMovies[:100]
// filteredPlexMovies = filteredPlexMovies[:50]
//lint: gocritic
jobRunning = true
numberOfMoviesProcessed = 0
Expand All @@ -83,6 +83,9 @@ func (c MoviesConfig) ProcessHTML(w http.ResponseWriter, r *http.Request) {
startTime := time.Now()
if lookup == "cinemaParadiso" {
searchResults = cinemaparadiso.GetCinemaParadisoMoviesInParallel(filteredPlexMovies)
if lookupFilters.NewerVersion {
searchResults = cinemaparadiso.ScrapeMovieTitlesParallel(searchResults)
}
} else {
searchResults = amazon.SearchAmazonMoviesInParallel(filteredPlexMovies, lookupFilters.AudioLanguage, c.Config.AmazonRegion)
// if we are filtering by newer version, we need to search again
Expand Down
Loading