From cb06d86b93de9ec213344911a6826a21c2d2258b Mon Sep 17 00:00:00 2001 From: Julia Damerow Date: Fri, 16 Feb 2018 14:27:22 -0700 Subject: [PATCH] Story/geco 90 (#15) * [GECO-90] send error status for failed OCR runs and make tesseract timeout configurable * [GECO-90] fixed issue with image filename in response * [GECO-90] change tesseract timeout to 600 sec --- cassiopeia/pom.xml | 3 +- .../core/service/impl/KafkaRequestSender.java | 52 ++++++++++++------- .../core/service/impl/OCRManager.java | 23 +++++--- .../core/service/impl/RequestInfo.java | 23 +++++++- .../core/service/impl/RequestResender.java | 3 +- .../src/main/resources/config.properties | 2 +- 6 files changed, 74 insertions(+), 32 deletions(-) diff --git a/cassiopeia/pom.xml b/cassiopeia/pom.xml index 48f38b8..39f0e72 100644 --- a/cassiopeia/pom.xml +++ b/cassiopeia/pom.xml @@ -12,7 +12,7 @@ 1.7.5 4.3.1.RELEASE 4.1.3.RELEASE - 0.5 + 0.6 0.4.2 0.2 @@ -35,6 +35,7 @@ false + 600 diff --git a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/KafkaRequestSender.java b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/KafkaRequestSender.java index 965d0ce..87f2b80 100644 --- a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/KafkaRequestSender.java +++ b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/KafkaRequestSender.java @@ -30,59 +30,73 @@ public class KafkaRequestSender implements IKafkaRequestSender { @Autowired private IPropertiesManager propertyManager; - + @Autowired private IRequestFactory requestFactory; - + @Autowired private IRequestProducer requestProducer; @Autowired private ISystemMessageHandler messageHandler; - + @PostConstruct public void init() { requestFactory.config(CompletedOCRRequest.class); } - /* (non-Javadoc) - * @see edu.asu.diging.gilesecosystem.cassiopeia.core.service.impl.IKafkaRequestSender#sendRequest(java.lang.String, java.lang.String, java.lang.String, java.lang.String, edu.asu.diging.gilesecosystem.cassiopeia.core.service.impl.RequestInfo) + /* + * (non-Javadoc) + * + * @see edu.asu.diging.gilesecosystem.cassiopeia.core.service.impl. + * IKafkaRequestSender#sendRequest(java.lang.String, java.lang.String, + * java.lang.String, java.lang.String, + * edu.asu.diging.gilesecosystem.cassiopeia.core.service.impl.RequestInfo) */ @Override public void sendRequest(String requestId, String documentId, RequestInfo info) { String restEndpoint = propertyManager.getProperty(Properties.BASE_URL); if (restEndpoint.endsWith("/")) { - restEndpoint = restEndpoint.substring(0, restEndpoint.length()-1); + restEndpoint = restEndpoint.substring(0, restEndpoint.length() - 1); + } + + String fileEndpoint = null; + + if (info.getStatus() == RequestStatus.COMPLETE) { + fileEndpoint = restEndpoint + DownloadFileController.GET_FILE_URL + .replace(DownloadFileController.REQUEST_ID_PLACEHOLDER, requestId) + .replace(DownloadFileController.DOCUMENT_ID_PLACEHOLDER, documentId) + .replace(DownloadFileController.FILENAME_PLACEHOLDER, + info.getFilename()); } - - String fileEndpoint = restEndpoint + DownloadFileController.GET_FILE_URL - .replace(DownloadFileController.REQUEST_ID_PLACEHOLDER, requestId) - .replace(DownloadFileController.DOCUMENT_ID_PLACEHOLDER, documentId) - .replace(DownloadFileController.FILENAME_PLACEHOLDER, info.getFilename()); - + ICompletedOCRRequest completedRequest = null; try { - completedRequest = requestFactory.createRequest(requestId, info.getUploadId()); + completedRequest = requestFactory.createRequest(requestId, + info.getUploadId()); } catch (InstantiationException | IllegalAccessException e) { - messageHandler.handleMessage("Could not create request.", e, MessageType.ERROR); + messageHandler.handleMessage("Could not create request.", e, + MessageType.ERROR); // this should never happen if used correctly } - + completedRequest.setDocumentId(documentId); completedRequest.setDownloadPath(info.getPath()); completedRequest.setSize(info.getSize()); completedRequest.setDownloadUrl(fileEndpoint); completedRequest.setFilename(info.getImageFilename()); completedRequest.setFileId(info.getFileId()); - completedRequest.setStatus(RequestStatus.COMPLETE); + completedRequest.setStatus(info.getStatus()); + completedRequest.setErrorMsg(info.getErrorMsg()); completedRequest.setOcrDate(OffsetDateTime.now(ZoneId.of("UTC")).toString()); completedRequest.setTextFilename(info.getFilename()); - + try { - requestProducer.sendRequest(completedRequest, propertyManager.getProperty(Properties.KAFKA_TOPIC_OCR_COMPLETE)); + requestProducer.sendRequest(completedRequest, + propertyManager.getProperty(Properties.KAFKA_TOPIC_OCR_COMPLETE)); } catch (MessageCreationException e) { messageHandler.handleMessage("Could not send message.", e, MessageType.ERROR); } } - + } diff --git a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/OCRManager.java b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/OCRManager.java index d86ea67..c130f32 100644 --- a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/OCRManager.java +++ b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/OCRManager.java @@ -33,6 +33,7 @@ import edu.asu.diging.gilesecosystem.cassiopeia.core.service.IKafkaRequestSender; import edu.asu.diging.gilesecosystem.cassiopeia.core.service.IOCRManager; import edu.asu.diging.gilesecosystem.requests.IOCRRequest; +import edu.asu.diging.gilesecosystem.requests.RequestStatus; import edu.asu.diging.gilesecosystem.septemberutil.properties.MessageType; import edu.asu.diging.gilesecosystem.septemberutil.service.ISystemMessageHandler; import edu.asu.diging.gilesecosystem.util.files.IFileStorageManager; @@ -82,18 +83,25 @@ public void processOCRRequest(IOCRRequest request) { Metadata metadata = new Metadata(); BodyContentHandler handler = new BodyContentHandler(); - String ocrResult = null; + RequestInfo info = null; try (InputStream stream = new ByteArrayInputStream(image)) { ocrParser.parse(stream, handler, metadata, parseContext); - ocrResult = handler.toString(); + String ocrResult = handler.toString(); + info = saveTextToFile(request.getRequestId(), request.getDocumentId(), ocrResult, request.getFilename(), ".txt"); + info.setUploadId(request.getUploadId()); + info.setFileId(request.getFileId()); + info.setStatus(RequestStatus.COMPLETE); + info.setImageFilename(request.getFilename()); } catch (SAXException | TikaException | IOException e) { messageHandler.handleMessage("Error during ocr.", e, MessageType.ERROR); - // FIXME: send to monitoring app + info = new RequestInfo(null, 0, null); + info.setUploadId(request.getUploadId()); + info.setFileId(request.getFileId()); + info.setStatus(RequestStatus.FAILED); + info.setErrorMsg(e.getMessage()); + info.setImageFilename(request.getFilename()); } - RequestInfo info = saveTextToFile(request.getRequestId(), request.getDocumentId(), ocrResult, request.getFilename(), ".txt"); - info.setUploadId(request.getUploadId()); - info.setFileId(request.getFileId()); kafkaRequestSender.sendRequest(request.getRequestId(), request.getDocumentId(), info); } @@ -127,7 +135,6 @@ protected RequestInfo saveTextToFile(String requestId, if (!fileExtentions.startsWith(".")) { fileExtentions = "." + fileExtentions; } - String imageFilename = filename; filename = filename + fileExtentions; String filePath = docFolder + File.separator + filename; @@ -151,6 +158,6 @@ protected RequestInfo saveTextToFile(String requestId, } String relativePath = storageManager.getFileFolderPathInBaseFolder(requestId, documentId, null); - return new RequestInfo(relativePath + File.separator + filename, fileObject.length(), imageFilename, filename); + return new RequestInfo(relativePath + File.separator + filename, fileObject.length(), filename); } } diff --git a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestInfo.java b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestInfo.java index a32f92a..2e1733a 100644 --- a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestInfo.java +++ b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestInfo.java @@ -1,5 +1,7 @@ package edu.asu.diging.gilesecosystem.cassiopeia.core.service.impl; +import edu.asu.diging.gilesecosystem.requests.RequestStatus; + public class RequestInfo { private String path; @@ -8,12 +10,13 @@ public class RequestInfo { private String imageFilename; private String uploadId; private String fileId; + private RequestStatus status; + private String errorMsg; - public RequestInfo(String path, long size, String imageFilename, String filename) { + public RequestInfo(String path, long size, String filename) { this.path = path; this.size = size; this.filename = filename; - this.imageFilename = imageFilename; } public String getPath() { @@ -63,6 +66,22 @@ public String getFilename() { public void setFilename(String filename) { this.filename = filename; } + + public RequestStatus getStatus() { + return status; + } + + public void setStatus(RequestStatus status) { + this.status = status; + } + + public String getErrorMsg() { + return errorMsg; + } + + public void setErrorMsg(String errorMsg) { + this.errorMsg = errorMsg; + } } diff --git a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestResender.java b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestResender.java index 2ad9342..7e65c14 100644 --- a/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestResender.java +++ b/cassiopeia/src/main/java/edu/asu/diging/gilesecosystem/cassiopeia/core/service/impl/RequestResender.java @@ -56,7 +56,8 @@ public boolean accept(File dir, String name) { String imageFilename = textFile.getName(); imageFilename = imageFilename.substring(0, imageFilename.lastIndexOf(".")); - RequestInfo info = new RequestInfo(relativePath + File.separator + textFile.getName(), textFile.length(), imageFilename, textFile.getName()); + RequestInfo info = new RequestInfo(relativePath + File.separator + textFile.getName(), textFile.length(), textFile.getName()); + info.setImageFilename(imageFilename); kafkaRequestSender.sendRequest(requestId, docId, info); requestCounter++; } diff --git a/cassiopeia/src/main/resources/config.properties b/cassiopeia/src/main/resources/config.properties index 3a9b67e..84ee2b8 100644 --- a/cassiopeia/src/main/resources/config.properties +++ b/cassiopeia/src/main/resources/config.properties @@ -16,7 +16,7 @@ tesseract_bin_folder=${tesseract.bin} tesseract_data_folder=${tesseract.data} tesseract_create_hocr=${tesseract.create.hocr} # how much time to give to tesseract before timing out (in sec) -tesseract_timeout=240 +tesseract_timeout=${tesseract.timeout} #OCR Type ocr_plainText=Plain Text