From 49cf9f195c3609cedae575c3ab632d0118447fb6 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 24 Mar 2024 15:52:06 +0100 Subject: [PATCH] remove old orf crawler --- .../mServer/crawler/sender/orf/JsonUtils.java | 51 ---- .../crawler/sender/orf/OrfConstants.java | 35 --- .../crawler/sender/orf/OrfCrawler.java | 104 ------- .../crawler/sender/orf/OrfEpisodeInfoDTO.java | 39 --- .../crawler/sender/orf/OrfVideoInfoDTO.java | 50 ---- .../orf/json/OrfMoreEpisodesDeserializer.java | 26 -- .../orf/parser/OrfMoreEpisodesParser.java | 27 -- .../orf/parser/OrfPlaylistDeserializer.java | 104 ------- .../parser/OrfVideoDetailDeserializer.java | 149 ---------- .../crawler/sender/orf/tasks/OrfDayTask.java | 54 ---- .../sender/orf/tasks/OrfFilmDetailTask.java | 272 ------------------ .../crawler/sender/orf/tasks/OrfHelper.java | 63 ---- .../orf/tasks/OrfHistoryOverviewTask.java | 45 --- .../sender/orf/tasks/OrfHistoryTopicTask.java | 39 --- .../sender/orf/tasks/OrfLetterPageTask.java | 57 ---- .../crawler/sender/orf/tasks/OrfTaskBase.java | 110 ------- 16 files changed, 1225 deletions(-) delete mode 100644 src/main/java/mServer/crawler/sender/orf/JsonUtils.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfConstants.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfCrawler.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java diff --git a/src/main/java/mServer/crawler/sender/orf/JsonUtils.java b/src/main/java/mServer/crawler/sender/orf/JsonUtils.java deleted file mode 100644 index 989425f03..000000000 --- a/src/main/java/mServer/crawler/sender/orf/JsonUtils.java +++ /dev/null @@ -1,51 +0,0 @@ -package mServer.crawler.sender.orf; - -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import java.util.Optional; - -public final class JsonUtils { - - private JsonUtils() { - super(); - } - - /** - * Gets the value of an attribute - * - * @param aJsonObject the object - * @param aAttributeName the name of the attribute - * @return the value of the attribute, if it exists, else Optional.empty - */ - public static Optional getAttributeAsString(final JsonObject aJsonObject, - final String aAttributeName) { - if (aJsonObject.has(aAttributeName)) { - final JsonElement aElement = aJsonObject.get(aAttributeName); - if (!aElement.isJsonNull()) { - return Optional.of(aElement.getAsString()); - } - } - - return Optional.empty(); - } - - /** - * Checks if the {@link JsonObject} has all given elements and if no element - * is null. - * - * @param aJsonObject The object to check. - * @param aElementIds The elements which it should has. - * @return true when the object has all given elements and if no element is - * null. - */ - public static boolean hasElements(final JsonObject aJsonObject, - final String... aElementIds) { - for (final String elementId : aElementIds) { - if (!aJsonObject.has(elementId) || aJsonObject.get(elementId).isJsonNull()) { - return false; - } - } - - return true; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfConstants.java b/src/main/java/mServer/crawler/sender/orf/OrfConstants.java deleted file mode 100644 index 3b95c98cf..000000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfConstants.java +++ /dev/null @@ -1,35 +0,0 @@ -package mServer.crawler.sender.orf; - -public final class OrfConstants { - - public static final String URL_BASE = "https://tvthek.orf.at"; - - /** - * URL für die Sendungen eines Tages Muss am Ende noch um das Datum dd.MM.yyyy ergänzt werden - */ - public static final String URL_DAY = URL_BASE + "/schedule/"; - - /** - * Basis-URL für Übersichtsseite nach Buchstaben Muss am Ende noch um Buchstabe bzw. 0 ergänzt - * werden - */ - public static final String URL_SHOW_LETTER_PAGE = URL_BASE + "/profiles/letter/"; - - /** - * URL für erste Übersichtsseite nach Buchstaben - */ - public static final String URL_SHOW_LETTER_PAGE_A = URL_SHOW_LETTER_PAGE + "A"; - - /** - * URL für verpasste Sendungen eines Tages Muss am Ende noch um Datum ergänzt werden im Format - * DD.MM.YYYY - */ - public static final String URL_DATE = URL_BASE + "/schedule/"; - - /** - * URL für Übersichtsseite des Archivs - */ - public static final String URL_ARCHIVE = URL_BASE + "/archive"; - - private OrfConstants() {} -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfCrawler.java b/src/main/java/mServer/crawler/sender/orf/OrfCrawler.java deleted file mode 100644 index b4a039a83..000000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfCrawler.java +++ /dev/null @@ -1,104 +0,0 @@ -package mServer.crawler.sender.orf; - -import de.mediathekview.mlib.Const; -import de.mediathekview.mlib.daten.DatenFilm; -import de.mediathekview.mlib.tool.Log; -import mServer.crawler.CrawlerTool; -import mServer.crawler.FilmeSuchen; -import mServer.crawler.sender.MediathekCrawler; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.orf.tasks.*; - -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.time.temporal.ChronoUnit; -import java.util.Set; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.RecursiveTask; - -public class OrfCrawler extends MediathekCrawler { - - public static final String SENDERNAME = Const.ORF; - - public OrfCrawler(FilmeSuchen ssearch, int startPrio) { - super(ssearch, SENDERNAME, 0, 1, startPrio); - } - - private Set getDaysEntries() throws InterruptedException, ExecutionException { - final OrfDayTask dayTask = new OrfDayTask(this, getDayUrls()); - final Set shows = forkJoinPool.submit(dayTask).get(); - - Log.sysLog("ORF: Anzahl Sendungen aus Verpasst: " + shows.size()); - - return shows; - } - - private ConcurrentLinkedQueue getDayUrls() { - final int maximumDaysForSendungVerpasstSection = 8; - final int maximumDaysForSendungVerpasstSectionFuture = 0; - - final ConcurrentLinkedQueue urls = new ConcurrentLinkedQueue<>(); - for (int i = 0; i < maximumDaysForSendungVerpasstSection - + maximumDaysForSendungVerpasstSectionFuture; i++) { - urls.add(new CrawlerUrlDTO(OrfConstants.URL_DAY + LocalDateTime.now() - .plus(maximumDaysForSendungVerpasstSectionFuture, ChronoUnit.DAYS) - .minus(i, ChronoUnit.DAYS).format(DateTimeFormatter.ofPattern("dd.MM.yyyy")))); - } - - return urls; - } - - - private Set getArchiveEntries() throws InterruptedException, ExecutionException { - final OrfHistoryOverviewTask historyTask = new OrfHistoryOverviewTask(this); - final ConcurrentLinkedQueue topics = forkJoinPool.submit(historyTask).get(); - - final OrfHistoryTopicTask topicTask = new OrfHistoryTopicTask(this, topics); - final Set shows = forkJoinPool.submit(topicTask).get(); - - Log.sysLog("ORF: Anzahl Sendungen History: " + shows.size()); - - return shows; - } - - private ConcurrentLinkedQueue getLetterEntries() throws InterruptedException, ExecutionException { - final OrfLetterPageTask letterTask = new OrfLetterPageTask(); - final ConcurrentLinkedQueue shows = forkJoinPool.submit(letterTask).get(); - - Log.sysLog("ORF: Anzahl Sendungen nach Buchstaben: " + shows.size()); - - return shows; - } - - @Override - protected RecursiveTask> createCrawlerTask() { - - boolean processMoreEpisodes = false; - - final ConcurrentLinkedQueue shows = new ConcurrentLinkedQueue<>(); - try { - - if (CrawlerTool.loadLongMax()) { - shows.addAll(getLetterEntries()); - shows.addAll(getArchiveEntries()); - processMoreEpisodes = true; - } else { - getDaysEntries().forEach(show -> { - if (!shows.contains(show)) { - shows.add(show); - } - }); - } - - } catch (InterruptedException | ExecutionException exception) { - Log.errorLog(56146546, exception); - } - Log.sysLog("ORF Anzahl: " + shows.size()); - - meldungAddMax(shows.size()); - - return new OrfFilmDetailTask(this, shows, processMoreEpisodes); - } - -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java b/src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java deleted file mode 100644 index 95947a87d..000000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java +++ /dev/null @@ -1,39 +0,0 @@ -package mServer.crawler.sender.orf; - -import java.time.Duration; -import java.util.Optional; - -public class OrfEpisodeInfoDTO { - - private final OrfVideoInfoDTO videoInfo; - private final Optional description; - private final Optional duration; - private final Optional title; - - public OrfEpisodeInfoDTO(final OrfVideoInfoDTO aVideoInfo, - final Optional aTitle, - final Optional aDescription, - final Optional aDuration - ) { - title = aTitle; - description = aDescription; - duration = aDuration; - videoInfo = aVideoInfo; - } - - public OrfVideoInfoDTO getVideoInfo() { - return videoInfo; - } - - public Optional getDescription() { - return description; - } - - public Optional getDuration() { - return duration; - } - - public Optional getTitle() { - return title; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java b/src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java deleted file mode 100644 index b64f4c07a..000000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java +++ /dev/null @@ -1,50 +0,0 @@ -package mServer.crawler.sender.orf; - -import java.util.EnumMap; -import java.util.Map; -import mServer.crawler.sender.base.Qualities; - -public class OrfVideoInfoDTO { - - public static final String FILTER_JUGENDSCHUTZ = ".*/Jugendschutz[0-9][0-9][0-9][0-9]b[0-9][0-9][0-9][0-9]_.*"; - private final Map videoUrls; - private String subtitleUrl; - - public OrfVideoInfoDTO() { - videoUrls = new EnumMap<>(Qualities.class); - } - - public boolean hasVideoUrls() { - return !videoUrls.isEmpty(); - } - - public Qualities getDefaultQuality() { - if (videoUrls.containsKey(Qualities.NORMAL)) { - return Qualities.NORMAL; - } - return videoUrls.keySet().iterator().next(); - } - - public String getDefaultVideoUrl() { - return videoUrls.get(getDefaultQuality()); - } - - public String getSubtitleUrl() { - return subtitleUrl; - } - - public Map getVideoUrls() { - return videoUrls; - } - - public String put(final Qualities key, final String value) { - if (value == null || value.matches(FILTER_JUGENDSCHUTZ)) { - return ""; - } - return videoUrls.put(key, value); - } - - public void setSubtitleUrl(final String subtitleUrl) { - this.subtitleUrl = subtitleUrl; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java b/src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java deleted file mode 100644 index 4785fee83..000000000 --- a/src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java +++ /dev/null @@ -1,26 +0,0 @@ -package mServer.crawler.sender.orf.json; - -import com.google.gson.JsonDeserializationContext; -import com.google.gson.JsonDeserializer; -import com.google.gson.JsonElement; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.base.JsonUtils; -import mServer.crawler.sender.base.UrlUtils; -import mServer.crawler.sender.orf.OrfConstants; - -import java.lang.reflect.Type; -import java.util.Optional; - -public class OrfMoreEpisodesDeserializer implements JsonDeserializer { - - private static final String ATTRIBUTE_URL = "url"; - - @Override - public CrawlerUrlDTO deserialize( - JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) { - - final Optional url = - JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL); - return url.map(s -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(s, OrfConstants.URL_BASE))).orElse(null); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java b/src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java deleted file mode 100644 index b262a36dd..000000000 --- a/src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java +++ /dev/null @@ -1,27 +0,0 @@ -package mServer.crawler.sender.orf.parser; - - -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; - -import java.util.ArrayList; -import java.util.List; - -public class OrfMoreEpisodesParser { - private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link"; - private static final String ATTRIBUTE_HREF = "href"; - - public List parse(final Document document, final String topic) { - final List result = new ArrayList<>(); - - document - .select(EPISODES_SELECTOR) - .forEach( - episode -> { - final String url = episode.attr(ATTRIBUTE_HREF); - result.add(new TopicUrlDTO(topic, url)); - }); - - return result; - } -} \ No newline at end of file diff --git a/src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java b/src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java deleted file mode 100644 index 36b0cadaa..000000000 --- a/src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java +++ /dev/null @@ -1,104 +0,0 @@ -package mServer.crawler.sender.orf.parser; - -import com.google.gson.JsonArray; -import com.google.gson.JsonDeserializationContext; -import com.google.gson.JsonDeserializer; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import java.lang.reflect.Type; -import java.time.Duration; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import mServer.crawler.sender.orf.JsonUtils; -import mServer.crawler.sender.orf.OrfEpisodeInfoDTO; -import mServer.crawler.sender.orf.OrfVideoInfoDTO; - -public class OrfPlaylistDeserializer implements JsonDeserializer> { - - private static final String ELEMENT_GAPLESS_VIDEO = "gapless_video"; - private static final String ELEMENT_PLAYLIST = "playlist"; - private static final String ELEMENT_VIDEOS = "videos"; - - private static final String ATTRIBUTE_TITLE = "title"; - private static final String ATTRIBUTE_DESCRIPTION = "description"; - private static final String ATTRIBUTE_DURATION = "duration"; - private static final String ATTRIBUTE_DURATION_IN_SECONDS = "duration_in_seconds"; - - @Override - public List deserialize(JsonElement aJsonElement, Type aType, JsonDeserializationContext aContext) { - - List episodes = new ArrayList<>(); - - if (!aJsonElement.getAsJsonObject().has(ELEMENT_PLAYLIST)) { - return episodes; - } - - JsonObject playlistObject = aJsonElement.getAsJsonObject().get(ELEMENT_PLAYLIST).getAsJsonObject(); - if (JsonUtils.hasElements(playlistObject, ELEMENT_GAPLESS_VIDEO)) { - parseGaplessVideo(episodes, playlistObject); - } - - parseVideos(episodes, playlistObject); - - return episodes; - } - - private void parseGaplessVideo(List aEpisodes, JsonObject aPlaylistObject) { - - final Optional title = JsonUtils.getAttributeAsString(aPlaylistObject, ATTRIBUTE_TITLE); - final Optional duration = parseDurationInSeconds(aPlaylistObject); - - final Optional videoInfoOptional = parseUrls(aPlaylistObject.getAsJsonObject(ELEMENT_GAPLESS_VIDEO)); - - if (videoInfoOptional.isPresent()) { - OrfEpisodeInfoDTO episode = new OrfEpisodeInfoDTO(videoInfoOptional.get(), title, Optional.empty(), duration); - aEpisodes.add(episode); - } - } - - private void parseVideos(List aEpisodes, JsonObject aPlaylistObject) { - JsonArray videosArray = aPlaylistObject.getAsJsonObject().get(ELEMENT_VIDEOS).getAsJsonArray(); - - for (JsonElement videoElement : videosArray) { - JsonObject videoObject = videoElement.getAsJsonObject(); - final Optional title = JsonUtils.getAttributeAsString(videoObject, ATTRIBUTE_TITLE); - final Optional description = JsonUtils.getAttributeAsString(videoObject, ATTRIBUTE_DESCRIPTION); - final Optional duration = parseDuration(videoObject); - - final Optional videoInfoOptional = parseUrls(videoObject); - - if (videoInfoOptional.isPresent()) { - OrfEpisodeInfoDTO episode = new OrfEpisodeInfoDTO(videoInfoOptional.get(), title, description, duration); - aEpisodes.add(episode); - } - } - } - - private Optional parseUrls(final JsonObject aVideoObject) { - - OrfVideoDetailDeserializer deserializer = new OrfVideoDetailDeserializer(); - return deserializer.deserializeVideoObject(aVideoObject); - } - - private static Optional parseDuration(final JsonObject aVideoObject) { - if (aVideoObject.has(ATTRIBUTE_DURATION)) { - Long durationValue = aVideoObject.get(ATTRIBUTE_DURATION).getAsLong(); - - // Duration ist in Millisekunden angegeben, diese interessieren aber nicht - return Optional.of(Duration.ofSeconds(durationValue / 1000)); - } - - return Optional.empty(); - } - - private static Optional parseDurationInSeconds(final JsonObject aVideoObject) { - if (aVideoObject.has(ATTRIBUTE_DURATION_IN_SECONDS)) { - Double durationValue = aVideoObject.get(ATTRIBUTE_DURATION_IN_SECONDS).getAsDouble(); - - return Optional.of(Duration.ofSeconds(durationValue.longValue())); - } - - return Optional.empty(); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java b/src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java deleted file mode 100644 index 846aedb83..000000000 --- a/src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java +++ /dev/null @@ -1,149 +0,0 @@ -package mServer.crawler.sender.orf.parser; - -import java.lang.reflect.Type; -import java.util.Optional; -import com.google.gson.JsonDeserializationContext; -import com.google.gson.JsonDeserializer; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParseException; -import de.mediathekview.mlib.tool.Log; -import mServer.crawler.sender.base.Qualities; -import mServer.crawler.sender.orf.OrfVideoInfoDTO; - -public class OrfVideoDetailDeserializer implements JsonDeserializer> { - - private static final String WRONG_HTTPS_URL_PART = ".apa."; - private static final String RIGHT_HTTPS_URL_PART = ".sf.apa."; - private static final String ELEMENT_PLAYLIST = "playlist"; - private static final String ELEMENT_VIDEOS = "videos"; - private static final String ELEMENT_SUBTITLES = "subtitles"; - private static final String ELEMENT_SOURCES = "sources"; - - private static final String ATTRIBUTE_DELIVERY = "delivery"; - private static final String ATTRIBUTE_PROTOCOL = "protocol"; - private static final String ATTRIBUTE_QUALITY = "quality"; - private static final String ATTRIBUTE_SRC = "src"; - private static final String ATTRIBUTE_TYPE = "type"; - - private static final String RELEVANT_DELIVERY1 = "progressive"; - private static final String RELEVANT_DELIVERY2 = "hls"; - private static final String RELEVANT_PROTOCOL = "http"; - private static final String RELEVANT_SUBTITLE_TYPE = "ttml"; - private static final String RELEVANT_VIDEO_TYPE1 = "video/mp4"; - private static final String RELEVANT_VIDEO_TYPE2 = "application/x-mpegURL"; - - private static String fixHttpsURL(final String url) { - if (url.contains(RIGHT_HTTPS_URL_PART)) { - return url; - } - return url.replace(WRONG_HTTPS_URL_PART, RIGHT_HTTPS_URL_PART); - } - - private static Optional getQuality(final String aQuality) { - switch (aQuality) { - case "Q1A": - return Optional.empty(); - case "Q4A": - return Optional.of(Qualities.SMALL); - case "Q6A": - return Optional.of(Qualities.NORMAL); - case "Q8C": - return Optional.of(Qualities.HD); - case "Q0A": - // QXA/QXB(DRM): another m3u8 has to be loaded which is often geoblocked - case "QXA": - case "QXADRM": - case "QXB": - case "QXBDRM": - case "Q8A": - return Optional.empty(); - default: - Log.sysLog("ORF: unknown quality: " + aQuality); - } - return Optional.empty(); - } - - private static void parseSubtitles(final JsonElement aSubtitlesElement, - final OrfVideoInfoDTO dto) { - if (aSubtitlesElement.isJsonArray()) { - aSubtitlesElement.getAsJsonArray().forEach(subtitleElement -> { - final JsonObject subtitleObject = subtitleElement.getAsJsonObject(); - if (subtitleObject.has(ATTRIBUTE_SRC) && subtitleObject.has(ATTRIBUTE_TYPE)) { - final String type = subtitleObject.get(ATTRIBUTE_TYPE).getAsString(); - - if (type.equalsIgnoreCase(RELEVANT_SUBTITLE_TYPE)) { - final String url = fixHttpsURL(subtitleObject.get(ATTRIBUTE_SRC).getAsString()); - dto.setSubtitleUrl(url); - } - } - }); - } - } - - private static void parseVideo(final JsonElement aVideoElement, final OrfVideoInfoDTO dto) { - if (aVideoElement.isJsonArray()) { - aVideoElement.getAsJsonArray().forEach(videoElement -> { - final JsonObject videoObject = videoElement.getAsJsonObject(); - if (videoObject.has(ATTRIBUTE_PROTOCOL) && videoObject.has(ATTRIBUTE_QUALITY) - && videoObject.has(ATTRIBUTE_SRC) && videoObject.has(ATTRIBUTE_TYPE)) { - final String type = videoObject.get(ATTRIBUTE_TYPE).getAsString(); - final String protocol = videoObject.get(ATTRIBUTE_PROTOCOL).getAsString(); - final String delivery = videoObject.get(ATTRIBUTE_DELIVERY).getAsString(); - - if (isVideoRelevant(type, protocol, delivery)) { - final String quality = videoObject.get(ATTRIBUTE_QUALITY).getAsString(); - final String url = fixHttpsURL(videoObject.get(ATTRIBUTE_SRC).getAsString()); - - final Optional resolution = getQuality(quality); - if (resolution.isPresent()) { - dto.put(resolution.get(), url); - } - } - } - }); - } - } - - private static boolean isVideoRelevant(String type, String protocol, String delivery) { - return (type.equalsIgnoreCase(RELEVANT_VIDEO_TYPE1) || type.equalsIgnoreCase(RELEVANT_VIDEO_TYPE2)) - && protocol.equalsIgnoreCase(RELEVANT_PROTOCOL) - && (delivery.equalsIgnoreCase(RELEVANT_DELIVERY1) || delivery.equalsIgnoreCase(RELEVANT_DELIVERY2)); - } - - @Override - public Optional deserialize(final JsonElement aJsonElement, final Type aType, - final JsonDeserializationContext aContext) throws JsonParseException { - - final JsonObject jsonObject = aJsonElement.getAsJsonObject(); - if (jsonObject.has(ELEMENT_PLAYLIST)) { - final JsonObject playlistObject = jsonObject.get(ELEMENT_PLAYLIST).getAsJsonObject(); - if (playlistObject.has(ELEMENT_VIDEOS)) { - final JsonObject videoObject - = playlistObject.get(ELEMENT_VIDEOS).getAsJsonArray().get(0).getAsJsonObject(); - - return deserializeVideoObject(videoObject); - } - } - - return Optional.empty(); - } - - public Optional deserializeVideoObject(final JsonObject aVideoObject) { - final OrfVideoInfoDTO dto = new OrfVideoInfoDTO(); - - if (aVideoObject.has(ELEMENT_SOURCES)) { - parseVideo(aVideoObject.get(ELEMENT_SOURCES), dto); - } - - if (aVideoObject.has(ELEMENT_SUBTITLES)) { - parseSubtitles(aVideoObject.get(ELEMENT_SUBTITLES), dto); - } - - if (dto.hasVideoUrls()) { - return Optional.of(dto); - } - - return Optional.empty(); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java deleted file mode 100644 index 67fb3ded4..000000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java +++ /dev/null @@ -1,54 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.base.AbstractUrlTask; -import java.util.concurrent.ConcurrentLinkedQueue; -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -public class OrfDayTask extends OrfTaskBase { - - private static final String ITEM_SELECTOR = "article a"; - private static final String TITLE_SELECTOR1 = ".item-title"; - private static final String TITLE_SELECTOR2 = ".teaser-title"; - private static final String ATTRIBUTE_HREF = "href"; - - public OrfDayTask(final MediathekReader aCrawler, - final ConcurrentLinkedQueue aUrlToCrawlDTOs) { - super(aCrawler, aUrlToCrawlDTOs); - } - - @Override - protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) { - Elements elements = aDocument.select(ITEM_SELECTOR); - elements.forEach( - item -> { - Element titleElement = getTitleElement(item); - if (titleElement != null) { - String theme = OrfHelper.parseTheme(titleElement.text()); - String url = item.attr(ATTRIBUTE_HREF); - - TopicUrlDTO dto = new TopicUrlDTO(theme, url); - taskResults.add(dto); - } - }); - - ORF_LOGGER.trace(String.format("%s: Anzahl Filme: %d", aUrlDTO.getUrl(), taskResults.size())); - } - - private Element getTitleElement(Element item) { - Element titleElement = item.selectFirst(TITLE_SELECTOR1); - if (titleElement == null) { - titleElement = item.selectFirst(TITLE_SELECTOR2); - } - return titleElement; - } - - @Override - protected AbstractUrlTask createNewOwnInstance(ConcurrentLinkedQueue aURLsToCrawl) { - return new OrfDayTask(crawler, aURLsToCrawl); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java deleted file mode 100644 index 2210bd922..000000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java +++ /dev/null @@ -1,272 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.base.*; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import com.google.gson.reflect.TypeToken; -import de.mediathekview.mlib.daten.DatenFilm; -import de.mediathekview.mlib.tool.Log; - -import java.io.IOException; -import java.lang.reflect.Type; -import java.time.Duration; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeParseException; -import java.time.temporal.ChronoUnit; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.concurrent.ConcurrentLinkedQueue; -import mServer.crawler.CrawlerTool; -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.orf.OrfEpisodeInfoDTO; -import mServer.crawler.sender.orf.OrfVideoInfoDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; -import mServer.crawler.sender.orf.json.OrfMoreEpisodesDeserializer; -import mServer.crawler.sender.orf.parser.OrfMoreEpisodesParser; -import mServer.crawler.sender.orf.parser.OrfPlaylistDeserializer; -import org.apache.commons.lang3.StringUtils; -import org.jsoup.nodes.Document; - -public class OrfFilmDetailTask extends OrfTaskBase { - - private static final String TITLE_SELECTOR = ".description-container .description-title"; - private static final String VIDEO_META_DATA_SELECTOR = ".video-meta-data"; - private static final String TIME_SELECTOR = VIDEO_META_DATA_SELECTOR + " time"; - private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration"; - private static final String DESCRIPTION_SELECTOR = ".description-container .description-text"; - private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist"; - private static final String MORE_EPISODES_SELECTOR = "div.more-episodes"; - - private static final String ATTRIBUTE_DATETIME = "datetime"; - private static final String ATTRIBUTE_DATA_JSB = "data-jsb"; - - private static final String PREFIX_AUDIO_DESCRIPTION = "AD |"; - - private static final DateTimeFormatter DATE_TIME_FORMATTER - = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); - - private static final DateTimeFormatter DATE_FORMAT - = DateTimeFormatter.ofPattern("dd.MM.yyyy"); - private static final DateTimeFormatter TIME_FORMAT - = DateTimeFormatter.ofPattern("HH:mm:ss"); - - private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken() {}.getType(); - private static final Type LIST_EPISODEINFO_TYPE_TOKEN = new TypeToken>() { - }.getType(); - - private final boolean processMoreEpisodes; - private final transient JsoupConnection jsoupConnection; - - public OrfFilmDetailTask(final MediathekReader aCrawler, - final ConcurrentLinkedQueue aUrlToCrawlDTOs, boolean processMoreEpisodes) { - super(aCrawler, aUrlToCrawlDTOs); - this.processMoreEpisodes = processMoreEpisodes; - jsoupConnection = new JsoupConnection(); - } - - @Override - protected void processDocument(TopicUrlDTO aUrlDTO, Document aDocument) { - final Optional title = HtmlDocumentUtils.getElementString(TITLE_SELECTOR, aDocument); - final Optional time = parseDate(aDocument); - final Optional duration = parseDuration(aDocument); - final Optional description = HtmlDocumentUtils.getElementString(DESCRIPTION_SELECTOR, aDocument); - - final List episodes = parseEpisodes(aDocument); - - for (int i = 0; i < episodes.size(); i++) { - OrfEpisodeInfoDTO episode = episodes.get(i); - if (i == 0) { - createFilm(aUrlDTO, episode.getVideoInfo(), title, description, time, duration); - } else { - createFilm(aUrlDTO, episode.getVideoInfo(), episode.getTitle(), episode.getDescription(), time, episode.getDuration()); - } - } - - if (processMoreEpisodes) { - final List topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDTO.getTopic()); - topicUrlDTOS.remove(aUrlDTO); - processMoreEpisodes(topicUrlDTOS); - } - - ORF_LOGGER.trace(String.format("%s - %s: Anzahl Filme: %d", aUrlDTO.getTopic(), aUrlDTO.getUrl(), taskResults.size())); - } - - @Override - protected AbstractUrlTask createNewOwnInstance(ConcurrentLinkedQueue aURLsToCrawl) { - return createNewOwnInstance(aURLsToCrawl, processMoreEpisodes); - } - - private AbstractUrlTask createNewOwnInstance(final ConcurrentLinkedQueue urlsToCrawl, boolean processMoreEpisodes) { - return new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes); - } - - private void createFilm(final TopicUrlDTO aUrlDTO, - final OrfVideoInfoDTO aVideoInfo, - final Optional aTitle, - final Optional aDescription, - final Optional aTime, - final Optional aDuration) { - - if (aTitle.isPresent()) { - boolean isAudioDescription = aUrlDTO.getTopic().startsWith(PREFIX_AUDIO_DESCRIPTION); - - LocalDateTime time = aTime.orElse(LocalDateTime.now()); - - String datum = time.format(DATE_FORMAT); - String zeit = time.format(TIME_FORMAT); - String url = aVideoInfo.getDefaultVideoUrl(); - - final DatenFilm film = new DatenFilm(crawler.getSendername(), - isAudioDescription - ? trimAudioDescriptionPrefix(aUrlDTO.getTopic()) - : aUrlDTO.getTopic(), - aUrlDTO.getUrl(), - isAudioDescription - ? trimAudioDescriptionPrefix(aTitle.get()) + " (Audiodeskription)" - : aTitle.get(), - url, - "", - datum, - zeit, - aDuration.orElse(Duration.ZERO).getSeconds(), - aDescription.orElse("")); - - if (StringUtils.isNotBlank(aVideoInfo.getSubtitleUrl())) { - CrawlerTool.addUrlSubtitle(film, aVideoInfo.getSubtitleUrl()); - } - - addUrls(film, aVideoInfo.getVideoUrls()); - - taskResults.add(film); - } else { - Log.sysLog("OrfFilmDetailTask: no title or video found for url " + aUrlDTO.getUrl()); - } - } - - private String trimAudioDescriptionPrefix(String text) { - return text.substring(PREFIX_AUDIO_DESCRIPTION.length()); - } - - private void addUrls(final DatenFilm aFilm, final Map aVideoUrls) { - - if (aVideoUrls.containsKey(Qualities.HD)) { - CrawlerTool.addUrlHd(aFilm, aVideoUrls.get(Qualities.HD)); - } - if (aVideoUrls.containsKey(Qualities.SMALL)) { - CrawlerTool.addUrlKlein(aFilm, aVideoUrls.get(Qualities.SMALL)); - } - } - - private List parseEpisodes(Document aDocument) { - Optional json = HtmlDocumentUtils.getElementAttributeString(VIDEO_SELECTOR, ATTRIBUTE_DATA_JSB, aDocument); - - if (json.isPresent()) { - - final Gson gson = new GsonBuilder().registerTypeAdapter(LIST_EPISODEINFO_TYPE_TOKEN, - new OrfPlaylistDeserializer()).create(); - - return gson.fromJson(json.get(), LIST_EPISODEINFO_TYPE_TOKEN); - } - - return new ArrayList<>(); - } - - private static Optional parseDate(Document aDocument) { - Optional date = HtmlDocumentUtils.getElementAttributeString(TIME_SELECTOR, ATTRIBUTE_DATETIME, aDocument); - if (date.isPresent()) { - String dateValue = date.get().replace("CET", " ").replace("CEST", " "); - try { - LocalDateTime localDate = LocalDateTime.parse(dateValue, DATE_TIME_FORMATTER); - return Optional.of(localDate); - } catch (DateTimeParseException e) { - Log.sysLog("OrfFilmDetailTask: unknown date format: " + date.get()); - } - } - - return Optional.empty(); - } - - private static Optional parseDuration(Document aDocument) { - Optional duration = HtmlDocumentUtils.getElementString(DURATION_SELECTOR, aDocument); - if (!duration.isPresent()) { - return Optional.empty(); - } - - Optional unit = determineChronoUnit(duration.get()); - if (!unit.isPresent()) { - Log.sysLog("OrfFilmDetailTask: unknown duration type: " + duration.get()); - return Optional.empty(); - } - - String[] parts = duration.get().split(" ")[0].trim().split(":"); - if (parts.length != 2) { - Log.sysLog("OrfFilmDetailTask: unknown duration part count: " + duration.get()); - return Optional.empty(); - } - - ChronoUnit unitValue = unit.get(); - if (unitValue == ChronoUnit.SECONDS || unitValue == ChronoUnit.MINUTES) { - return Optional.of( - Duration.ofMinutes(Long.parseLong(parts[0])) - .plusSeconds(Long.parseLong(parts[1])) - ); - } - if (unitValue == ChronoUnit.HOURS) { - return Optional.of( - Duration.ofHours(Long.parseLong(parts[0])) - .plusMinutes(Long.parseLong(parts[1])) - ); - } - - return Optional.empty(); - } - - private static Optional determineChronoUnit(String aDuration) { - if (aDuration.contains("Min.")) { - return Optional.of(ChronoUnit.MINUTES); - } - if (aDuration.contains("Std.")) { - return Optional.of(ChronoUnit.HOURS); - } - if (aDuration.contains("Sek.")) { - return Optional.of(ChronoUnit.SECONDS); - } - - return Optional.empty(); - } - - private List parseMoreEpisodes(final Document document, final String topic) { - final Optional json = HtmlDocumentUtils.getElementAttributeString(MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document); - if (json.isPresent()) { - final Gson gson = - new GsonBuilder() - .registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer()) - .create(); - - CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN); - if (moreEpisodesUrl != null) { - try { - final Document moreEpisodesDocument = jsoupConnection.getDocument(moreEpisodesUrl.getUrl()); - OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser(); - return parser.parse(moreEpisodesDocument, topic); - } catch (IOException e) { - Log.errorLog(237462889, String.format("OrfFilmDetailTask: loading more episodes url %s failed.", moreEpisodesUrl.getUrl())); - } - } - } - - return new ArrayList<>(); - } - - private void processMoreEpisodes(final List moreFilms) { - if (moreFilms != null && !moreFilms.isEmpty()) { - final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(moreFilms); - final OrfFilmDetailTask task = (OrfFilmDetailTask) createNewOwnInstance(queue, false); - task.fork(); - taskResults.addAll(task.join()); - } - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java deleted file mode 100644 index 2a7f3f187..000000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java +++ /dev/null @@ -1,63 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import java.util.ArrayList; -import java.util.List; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import mServer.crawler.sender.orf.OrfConstants; - -/** - * Helper methods for ORF tasks - */ -public class OrfHelper { - - private static final String LETTER_URL_SELECTOR = "li.letter-item > a"; - private static final String ATTRIBUTE_HREF = "href"; - private static final String ATTRIBUTE_TITLE = "title"; - - private OrfHelper() { - } - - public static String parseTheme(final Element aItem) { - String theme = aItem.attr(ATTRIBUTE_TITLE); - return parseTheme(theme); - } - - public static String parseTheme(final String theme) { - final String result = theme.replaceAll("[0-9]{1,2}:[0-9][0-9]$", "").trim(); - // Thema steht vor Doppelpunkt - // Ausnahmen - // - ZIB-Sendungen mit Uhrzeit - // - DokEins-Sendungen - // - Ungarisches Magazin - int index = result.indexOf(':'); - if (index > 0 - && !result.startsWith("ZIB") - && !result.startsWith("DOKeins") - && !result.contains("Ungarisches Magazin")) { - return result.substring(0, index).trim(); - } - return result; - } - - /** - * determines the links to the letter pages - * - * @param aDocument the html document with letter links - * @return list with urls - */ - public static List parseLetterLinks(Document aDocument) { - final List results = new ArrayList<>(); - - Elements links = aDocument.select(LETTER_URL_SELECTOR); - links.forEach(element -> { - if (element.hasAttr(ATTRIBUTE_HREF)) { - String subpage = element.attr(ATTRIBUTE_HREF); - results.add(OrfConstants.URL_BASE + subpage); - } - }); - - return results; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java deleted file mode 100644 index c0a5de6e2..000000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java +++ /dev/null @@ -1,45 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.JsoupConnection; -import mServer.crawler.sender.orf.OrfConstants; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; - -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentLinkedQueue; - -public class OrfHistoryOverviewTask implements Callable> { - - private static final String ATTRIBUTE_HREF = "href"; - private static final String ATTRIBUTE_TITLE = "title"; - private static final String TOPIC_URL_SELECTOR = "section.has-4-in-row article > a"; - - private final MediathekReader crawler; - private final JsoupConnection jsoupConnection; - - public OrfHistoryOverviewTask( - final MediathekReader aCrawler) { - crawler = aCrawler; - jsoupConnection = new JsoupConnection(); - } - - @Override - public ConcurrentLinkedQueue call() throws Exception { - final ConcurrentLinkedQueue results = new ConcurrentLinkedQueue<>(); - - // URLs für Seiten parsen - final Document document = jsoupConnection.getDocument(OrfConstants.URL_ARCHIVE); - - final Elements topics = document.select(TOPIC_URL_SELECTOR); - topics.forEach( - topicElement -> { - final String url = topicElement.attr(ATTRIBUTE_HREF); - final String topic = topicElement.attr(ATTRIBUTE_TITLE); - results.add(new TopicUrlDTO(topic, url)); - }); - - return results; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java deleted file mode 100644 index 7dda27be1..000000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java +++ /dev/null @@ -1,39 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.AbstractDocumentTask; -import mServer.crawler.sender.base.AbstractRecursivConverterTask; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; - -import java.util.concurrent.ConcurrentLinkedQueue; - -public class OrfHistoryTopicTask extends AbstractDocumentTask { - - private static final String ATTRIBUTE_HREF = "href"; - private static final String SHOW_URL_SELECTOR = "article > a"; - - public OrfHistoryTopicTask( - final MediathekReader crawler, - final ConcurrentLinkedQueue urlToCrawlDTOs - ) { - super(crawler, urlToCrawlDTOs); - } - - @Override - protected AbstractRecursivConverterTask createNewOwnInstance( - final ConcurrentLinkedQueue aElementsToProcess) { - return new OrfHistoryTopicTask(crawler, aElementsToProcess); - } - - @Override - protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocument) { - aDocument - .select(SHOW_URL_SELECTOR) - .forEach( - showElement -> { - final String url = showElement.attr(ATTRIBUTE_HREF); - taskResults.add(new TopicUrlDTO(aUrlDto.getTopic(), url)); - }); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java deleted file mode 100644 index 6d52b090b..000000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java +++ /dev/null @@ -1,57 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentLinkedQueue; -import mServer.crawler.sender.orf.OrfConstants; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; - -public class OrfLetterPageTask implements Callable> { - - private static final Logger LOG = LogManager.getLogger(OrfLetterPageTask.class); - - private static final String SHOW_URL_SELECTOR = "article > a"; - - @Override - public ConcurrentLinkedQueue call() throws Exception { - final ConcurrentLinkedQueue results = new ConcurrentLinkedQueue<>(); - - // URLs für Seiten parsen - final Document document = Jsoup.connect(OrfConstants.URL_SHOW_LETTER_PAGE_A).get(); - List overviewLinks = OrfHelper.parseLetterLinks(document); - - // Sendungen für die einzelnen Seiten pro Buchstabe ermitteln - overviewLinks.forEach(url -> { - try { - Document subpageDocument = Jsoup.connect(url).get(); - results.addAll(parseOverviewPage(subpageDocument)); - } catch (IOException ex) { - LOG.fatal("OrfLetterPageTask: error parsing url " + url, ex); - } - }); - - return results; - } - - private ConcurrentLinkedQueue parseOverviewPage(Document aDocument) { - final ConcurrentLinkedQueue results = new ConcurrentLinkedQueue<>(); - - Elements links = aDocument.select(SHOW_URL_SELECTOR); - links.forEach(element -> { - if (element.hasAttr("href")) { - String link = element.attr("href"); - String theme = OrfHelper.parseTheme(element); - - results.add(new TopicUrlDTO(theme, link)); - } - }); - - return results; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java deleted file mode 100644 index 3ac1b8928..000000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java +++ /dev/null @@ -1,110 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import de.mediathekview.mlib.Config; -import de.mediathekview.mlib.tool.Log; -import java.io.IOException; -import java.net.SocketException; -import java.net.SocketTimeoutException; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.TimeUnit; -import mServer.crawler.FilmeSuchen; -import mServer.crawler.RunSender; -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.AbstractUrlTask; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.jsoup.HttpStatusException; -import org.jsoup.Jsoup; -import org.jsoup.Connection.Response; -import org.jsoup.nodes.Document; - -public abstract class OrfTaskBase - extends AbstractUrlTask { - - private static final long serialVersionUID = -4124779055395250987L; - private static final String LOAD_DOCUMENT_HTTPERROR - = "Some HTTP error happened while crawl the %s page \"%s\"."; - - private static final int MAX_TIMEOUT = (int) TimeUnit.SECONDS.toMillis(300); - - protected static final Logger ORF_LOGGER = LogManager.getLogger("OrfLogger"); - - public OrfTaskBase(final MediathekReader aCrawler, - final ConcurrentLinkedQueue aUrlToCrawlDTOs) { - super(aCrawler, aUrlToCrawlDTOs); - } - - /** - * In this method you have to use the JSOUP {@link Document} to create a - * object of the return type {@link T}. Add the results to - * {@link AbstractUrlTask#taskResults}. - * - * @param aUrlDTO A DTO containing at least the URL of the given document. - * @param aDocument The JSOUP {@link Document}. - */ - protected abstract void processDocument(final D aUrlDTO, final Document aDocument); - - @Override - protected void processElement(final D aUrlDTO) { - if (Config.getStop()) { - return; - } - - boolean retry = false; - int timeout = (int) TimeUnit.SECONDS.toMillis(120); - - do { - try { - retry = false; - - final Document document = loadDocument(aUrlDTO, timeout); - processDocument(aUrlDTO, document); - } catch (final HttpStatusException httpStatusError) { - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLER); - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLVERSUCHE); - ORF_LOGGER.trace(httpStatusError); - Log.sysLog(String.format(LOAD_DOCUMENT_HTTPERROR, crawler.getSendername(), aUrlDTO.getUrl())); - - Log.errorLog(96459855, - crawler.getSendername() + ": crawlerDocumentLoadError: " + aUrlDTO.getUrl() + ", " + httpStatusError.getStatusCode()); - } catch (final SocketException | SocketTimeoutException socketException) { - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLVERSUCHE); - ORF_LOGGER.trace(socketException); - retry = true; - timeout *= 2; - try { - Thread.sleep(5000); - } catch (InterruptedException ignored) { - // just try again - } - } catch (final Exception exception) { - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLER); - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLVERSUCHE); - Log.errorLog(96459856, exception); - ORF_LOGGER.trace(exception); - } - } while (retry && timeout <= MAX_TIMEOUT); - } - - private Document loadDocument(final D aUrlDTO, int timeout) throws IOException { - long start = System.currentTimeMillis(); - // maxBodySize(0)=unlimited - // necessary for ORF documents which are larger than the default size - Response response = Jsoup.connect(aUrlDTO.getUrl()) - .timeout(timeout) - .maxBodySize(0).execute(); - - long end = System.currentTimeMillis(); - - ORF_LOGGER.trace(String.format("%s: %d - loaded in %d ms", aUrlDTO.getUrl(), response.statusCode(), end - start)); - traceRequest(); - - final Document document = response.parse(); - - end = System.currentTimeMillis(); - ORF_LOGGER.trace(String.format("%s: %d - parsed in %d ms", aUrlDTO.getUrl(), response.statusCode(), end - start)); - - return document; - } -}