diff --git a/MServer-Config.yaml b/MServer-Config.yaml
index d6bc5192d..85cb9940f 100644
--- a/MServer-Config.yaml
+++ b/MServer-Config.yaml
@@ -17,12 +17,22 @@ maximumRequestsPerSecond: 999.0
# If set only these Sender will be crawled all other will be ignored.
senderIncluded:
- #- MDR
- #- NDR
+ #- ARD
+ #- ARTE_DE
+ #- ARTE_FR
+ #- ARTE_EN
+ #- ARTE_PL
+ #- ARTE_IT
+ #- ARTE_ES
+ #- 3SAT
+ #- FUNK
#- KIKA
- - DW
- #- BR
+ #- DW
+ #- ORF
#- PHOENIX
+ #- SRF
+ - SR
+ #- ZDF
# If set the server will be awake after the crawler run and restarts the run after the given amount.
#schedules:
@@ -111,14 +121,14 @@ checkImportListUrlTimeoutInSec: 1800
#### Default crawler configurations ####
# The maximum amount of URLs to be processed per task.
-maximumUrlsPerTask: 50
+maximumUrlsPerTask: 10
# The maximum duration in minutes a crawler may run.
maximumCrawlDurationInMinutes: 120
# Enables the topics search
# maximumSubpages limits the depth of the topics search
-topicsSearchEnabled: false
+topicsSearchEnabled: true
# The maximum amount of sub pages to be crawled.
# Example: If a Sendung overview side has 10 pages with videos for this Sendung and
@@ -164,14 +174,14 @@ senderConfigurations:
KIKA:
maximumSubpages: 2
maximumRequestsPerSecond: 8.0
- SR:
- maximumRequestsPerSecond: 2.0
ZDF:
maximumRequestsPerSecond: 10.0
FUNK:
maximumUrlsPerTask: 99
DW:
maximumSubpages: 0
+ SR:
+ maximumSubpages: 5
# configure string variables
crawlerApiParams:
diff --git a/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java b/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java
index 8c03195f4..6ff257725 100644
--- a/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java
+++ b/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java
@@ -1,5 +1,6 @@
package de.mediathekview.mserver.base.webaccess;
+import okhttp3.ConnectionPool;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
@@ -22,12 +23,13 @@ public class JsoupConnection {
private static final String FILE_TYPE_M3U8 = "m3u8";
protected OkHttpClient client;
- public JsoupConnection(final int timeout) {
+ public JsoupConnection(final int timeout, final int threadPoolSize) { // timeout is applied in seconds to connect, read and the whole call
client =
new OkHttpClient.Builder()
.connectTimeout(timeout, TimeUnit.SECONDS)
.readTimeout(timeout, TimeUnit.SECONDS)
.callTimeout(timeout, TimeUnit.SECONDS)
+ .connectionPool(new ConnectionPool(threadPoolSize, 5L, TimeUnit.MINUTES)) // threadPoolSize is used as the pool's idle-connection limit (5 minute keep-alive), not as a thread count
.build();
}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java
index fc9705df5..ffdd4d853 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java
@@ -1,6 +1,5 @@
package de.mediathekview.mserver.crawler.arte.tasks;
-import com.google.common.util.concurrent.RateLimiter;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
@@ -19,7 +18,6 @@
public abstract class ArteTaskBase extends AbstractRestTask {
private static final Logger LOG = LogManager.getLogger(ArteTaskBase.class);
- private static RateLimiter limiter = null;
private final transient GsonBuilder gsonBuilder;
protected ArteTaskBase(
@@ -106,11 +104,6 @@ private Response executeRequest(final WebTarget aTarget) {
if (authKey.isPresent()) {
request = request.header(HEADER_AUTHORIZATION, authKey.get());
}
-
- if (limiter == null) {
- limiter = RateLimiter.create(crawler.getCrawlerConfig().getMaximumRequestsPerSecond());
- }
- limiter.acquire();
return request
.header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP)
.header(HEADER_ACCEPT, APPLICATION_JSON)
diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java
index 226a122ad..b9faa3455 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java
@@ -15,6 +15,8 @@
import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Document;
+import com.google.common.util.concurrent.RateLimiter;
+
import java.io.IOException;
import java.time.Duration;
import java.time.LocalDateTime;
@@ -41,6 +43,7 @@ public abstract class AbstractCrawler implements Callable> {
protected Set films;
private LocalDateTime startTime;
protected JsoupConnection jsoupConnection;
+ protected RateLimiter rateLimiter;
protected AbstractCrawler(
final ForkJoinPool aForkJoinPool,
@@ -58,8 +61,11 @@ protected AbstractCrawler(
runtimeConfig = rootConfig.getConfig();
crawlerConfig = rootConfig.getSenderConfig(getSender());
- jsoupConnection = new JsoupConnection(crawlerConfig.getSocketTimeoutInSeconds());
-
+ jsoupConnection = new JsoupConnection(
+ rootConfig.getSenderConfig(getSender()).getSocketTimeoutInSeconds(),
+ runtimeConfig.getMaximumCpuThreads());
+ rateLimiter = RateLimiter.create(rootConfig.getSenderConfig(getSender()).getMaximumRequestsPerSecond());
+
films = ConcurrentHashMap.newKeySet();
}
@@ -137,6 +143,14 @@ public JsoupConnection getConnection() {
public void setConnection(JsoupConnection connection) {
jsoupConnection = connection;
}
+
+ public RateLimiter getRateLimiter() { // limiter the request helpers of this crawler acquire a permit from
+ return rateLimiter;
+ }
+
+ public void setRateLimiter(RateLimiter rateLimiter) { // replaces the limiter returned by getRateLimiter() from now on
+ this.rateLimiter = rateLimiter;
+ }
/**
* Request an url and receive the body as String
@@ -145,6 +159,7 @@ public void setConnection(JsoupConnection connection) {
* @throws IOException
*/
public String requestBodyAsString(String url) throws IOException {
+ getRateLimiter().acquire(); // block until this crawler's rate limiter grants a permit
return getConnection().requestBodyAsString(url);
}
@@ -155,6 +170,7 @@ public String requestBodyAsString(String url) throws IOException {
* @throws IOException
*/
public Document requestBodyAsHtmlDocument(String url) throws IOException {
+ getRateLimiter().acquire(); // block until this crawler's rate limiter grants a permit
return getConnection().requestBodyAsHtmlDocument(url);
}
@@ -165,6 +181,7 @@ public Document requestBodyAsHtmlDocument(String url) throws IOException {
* @throws IOException
*/
public Document requestBodyAsXmlDocument(String url) throws IOException {
+ getRateLimiter().acquire(); // block until this crawler's rate limiter grants a permit
return getConnection().requestBodyAsXmlDocument(url);
}
@@ -176,6 +193,8 @@ public Document requestBodyAsXmlDocument(String url) throws IOException {
* @return size of the response in KB or -1 in case we could not determine the size.
*/
public long determineFileSizeInKB(String url) {
+ getRateLimiter().acquire(); // block until this crawler's rate limiter grants a permit
- return getConnection().determineFileSize(url) / 1024;
+ final long sizeInBytes = getConnection().determineFileSize(url);
+ return sizeInBytes < 0 ? -1 : sizeInBytes / 1024; // a negative (unknown) size must stay -1 as documented; -1 / 1024 would yield 0
}
@@ -185,6 +203,7 @@ public long determineFileSizeInKB(String url) {
* @return return true if the request was successfully processed by the server
*/
public boolean requestUrlExists(String url) {
+ getRateLimiter().acquire(); // block until this crawler's rate limiter grants a permit
return getConnection().requestUrlExists(url);
}
/**
diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java
index 80cd14fa0..327e25fab 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java
@@ -1,6 +1,5 @@
package de.mediathekview.mserver.crawler.basic;
-import com.google.common.util.concurrent.RateLimiter;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
@@ -23,7 +22,6 @@ public abstract class AbstractJsonRestTask
protected static final String ENCODING_GZIP = "gzip";
private static final long serialVersionUID = -1090560363478964885L;
protected final transient GsonBuilder gsonBuilder;
- private static RateLimiter limiter = null;
protected AbstractJsonRestTask(
final AbstractCrawler crawler,
@@ -63,10 +61,6 @@ protected void processRestTarget(final D aDTO, final WebTarget aTarget) {
}
protected Response createResponse(final Builder request, final D aDTO) {
- if (limiter == null) {
- limiter = RateLimiter.create(crawler.getCrawlerConfig().getMaximumRequestsPerSecond());
- }
- limiter.acquire();
request.header(ACCEPT_CHARSET, StandardCharsets.UTF_8);
return request.header(ACCEPT_ENCODING, ENCODING_GZIP).header("User-Agent", "Mozilla").get();
}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java
index 6c3ac4248..c48ca5664 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java
@@ -72,6 +72,7 @@ protected void processElement(final D aDTO) {
* @return the {@link WebTarget} to access the url.
*/
protected WebTarget createWebTarget(final String aUrl) {
+ crawler.getRateLimiter().acquire(); // throttles target creation, not request execution - NOTE(review): confirm every target leads to exactly one request
return client.target(aUrl);
}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java
index 43b40f0fb..7cc07c7c7 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java
@@ -1,10 +1,8 @@
package de.mediathekview.mserver.crawler.dw;
-import com.google.common.util.concurrent.RateLimiter;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
-import de.mediathekview.mlib.daten.Sender;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.AbstractRestTask;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
@@ -23,9 +21,6 @@
@SuppressWarnings("serial")
public abstract class DWTaskBase extends AbstractRestTask {
private static final Logger LOG = LogManager.getLogger(DWTaskBase.class);
-
- private static RateLimiter limiter = null;
-
private final transient GsonBuilder gsonBuilder;
protected DWTaskBase(
@@ -78,10 +73,6 @@ private Response executeRequest(final WebTarget aTarget) {
request.header(
ZdfConstants.HEADER_AUTHENTIFICATION, AUTHORIZATION_BEARER + authKey.get());
}
- if (limiter == null) {
- limiter = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.DW).getMaximumRequestsPerSecond());
- }
- limiter.acquire();
return request.header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP).get();
}
}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java b/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java
index 085ce3841..6aef6089b 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java
@@ -117,8 +117,14 @@ protected void postProcessing(KikaApiVideoInfoDto aResponseObj, KikaApiFilmDto a
aFilm.setUrls(getVideoUrls(aResponseObj, aDTO));
aFilm.addAllSubtitleUrls(getSubtitle(aResponseObj, aDTO));
//
- taskResults.add(aFilm);
- crawler.incrementAndGetActualCount();
+
+
+ if (!taskResults.add(aFilm)) {
+ LOG.debug("Rejected duplicate {}",aFilm);
+ crawler.incrementAndGetErrorCount();
+ } else {
+ crawler.incrementAndGetActualCount();
+ }
crawler.updateProgress();
}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java
index baca014cf..3181c0d74 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java
@@ -10,6 +10,7 @@
import de.mediathekview.mserver.crawler.ard.json.ArdVideoInfoDto;
import de.mediathekview.mserver.crawler.ard.json.ArdVideoInfoJsonDeserializer;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
import de.mediathekview.mserver.crawler.basic.AbstractUrlTask;
import de.mediathekview.mserver.crawler.sr.SrTopicUrlDTO;
import org.apache.commons.lang3.StringUtils;
@@ -27,7 +28,7 @@
import java.time.format.DateTimeParseException;
import java.util.*;
-public class SrFilmDetailTask extends SrRateLimitedDocumentTask {
+public class SrFilmDetailTask extends AbstractDocumentTask {
private static final org.apache.logging.log4j.Logger LOG =
LogManager.getLogger(SrFilmDetailTask.class);
@@ -156,8 +157,12 @@ protected void processDocument(final SrTopicUrlDTO aUrlDTO, final Document aDocu
addUrls(film, videoInfo.getVideoUrls());
- taskResults.add(film);
- crawler.incrementAndGetActualCount();
+ if (taskResults.add(film)) {
+ crawler.incrementAndGetActualCount();
+ } else {
+ crawler.incrementAndGetErrorCount();
+ LOG.error("Rejected duplicate {}", film);
+ }
crawler.updateProgress();
} else {
LOG.error("SrFilmDetailTask: no title or video found for url {}", aUrlDTO.getUrl());
diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java
deleted file mode 100644
index 7ef0414e3..000000000
--- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java
+++ /dev/null
@@ -1,33 +0,0 @@
-package de.mediathekview.mserver.crawler.sr.tasks;
-
-import com.google.common.util.concurrent.RateLimiter;
-import de.mediathekview.mlib.daten.Sender;
-import de.mediathekview.mserver.base.config.MServerConfigManager;
-import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
-import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
-import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
-
-import java.util.Queue;
-
-public abstract class SrRateLimitedDocumentTask
- extends AbstractDocumentTask {
-
- private static final long serialVersionUID = -4077182368484515410L;
-
- private static RateLimiter LIMITER = null;
-
- SrRateLimitedDocumentTask(
- final AbstractCrawler crawler,
- final Queue urlToCrawlDTOs) {
- super(crawler, urlToCrawlDTOs);
- }
-
- @Override
- protected void processElement(final D urlDTO) {
- if (LIMITER== null) {
- LIMITER = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.SR).getMaximumRequestsPerSecond());
- }
- LIMITER.acquire();
- super.processElement(urlDTO);
- }
-}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java
index afaf0fbac..4ab22a597 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java
@@ -2,6 +2,7 @@
import de.mediathekview.mserver.base.HtmlConsts;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
import de.mediathekview.mserver.crawler.basic.AbstractUrlTask;
import de.mediathekview.mserver.crawler.sr.SrConstants;
import de.mediathekview.mserver.crawler.sr.SrTopicUrlDTO;
@@ -15,7 +16,7 @@
import java.util.concurrent.ConcurrentLinkedQueue;
public class SrTopicArchivePageTask
- extends SrRateLimitedDocumentTask {
+ extends AbstractDocumentTask {
private static final String NEXT_PAGE_SELECTOR = "div.pagination__item > a[title*=weiter]";
private static final String SHOW_SELECTOR = "h3.teaser__text__header";
diff --git a/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java
index e2318e46b..9b8b021a8 100644
--- a/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java
+++ b/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java
@@ -1,9 +1,7 @@
package de.mediathekview.mserver.crawler.zdf.tasks;
-import com.google.common.util.concurrent.RateLimiter;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
-import de.mediathekview.mlib.daten.Sender;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.AbstractRestTask;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
@@ -20,9 +18,6 @@
public abstract class ZdfTaskBase extends AbstractRestTask {
private static final Logger LOG = LogManager.getLogger(ZdfTaskBase.class);
-
- private static RateLimiter limiter = null;
-
private final GsonBuilder gsonBuilder;
protected ZdfTaskBase(
@@ -73,11 +68,6 @@ private Response executeRequest(final WebTarget aTarget) {
request.header(
ZdfConstants.HEADER_AUTHENTIFICATION, AUTHORIZATION_BEARER + authKey.get());
}
- if (limiter == null) {
- limiter = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.ZDF).getMaximumRequestsPerSecond());
- }
-
- limiter.acquire();
return request.header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP).get();
}
}
diff --git a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java
index 4ef1bd46d..837306d25 100644
--- a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java
+++ b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java
@@ -144,7 +144,7 @@ public static Collection