Extract more standard metadata from binary files (elastic#78754)

Until now, we have been extracting only a small number of fields from the binary files sent to the ingest attachment plugin: * `content`, * `title`, * `author`, * `keywords`, * `date`, * `content_type`, * `content_length`, * `language`. Tika has a list of more standard properties which can be extracted: * `modified`, * `format`, * `identifier`, * `contributor`, * `coverage`, * `modifier`, * `creator_tool`, * `publisher`, * `relation`, * `rights`, * `source`, * `type`, * `description`, * `print_date`, * `metadata_date`, * `latitude`, * `longitude`, * `altitude`, * `rating`, * `comments`. This commit exposes those new fields. Related to elastic#22339. Co-authored-by: Keith Massey <[email protected]>
weizijun · Nov 23, 2021 · 564ff9d · 564ff9d
1 parent 480fd6a
commit 564ff9d
Show file tree

Hide file tree

Showing 5 changed files with 197 additions and 43 deletions.
diff --git a/docs/plugins/ingest-attachment.asciidoc b/docs/plugins/ingest-attachment.asciidoc
@@ -98,6 +98,40 @@ The document's `attachment` object contains extracted properties for the file:
 NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
       to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.
 
+[[ingest-attachment-fields]]
+==== Exported fields
+
+The fields which might be extracted from a document are:
+
+* `content`,
+* `title`,
+* `author`,
+* `keywords`,
+* `date`,
+* `content_type`,
+* `content_length`,
+* `language`,
+* `modified`,
+* `format`,
+* `identifier`,
+* `contributor`,
+* `coverage`,
+* `modifier`,
+* `creator_tool`,
+* `publisher`,
+* `relation`,
+* `rights`,
+* `source`,
+* `type`,
+* `description`,
+* `print_date`,
+* `metadata_date`,
+* `latitude`,
+* `longitude`,
+* `altitude`,
+* `rating`,
+* `comments`
+
 To extract only certain `attachment` fields, specify the `properties` array:
 
 [source,console]

diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle
@@ -86,6 +86,12 @@ tasks.named("forbiddenPatterns").configure {
   exclude '**/text-cjk-*.txt'
 }
 
+tasks.named("yamlRestTestV7CompatTransform").configure { task ->
+  // v8 returns 2 new tika metadata fields (modifier and modified): the attachment has 8 entries instead of 6
+  task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .doc file")
+  task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .docx file")
+}
+
 tasks.named("thirdPartyAudit").configure {
   ignoreMissingClasses()
 }

diff --git a/...est-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/...est-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java
@@ -11,6 +11,7 @@
 import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.language.LanguageIdentifier;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.Strings;
@@ -132,40 +133,11 @@ public IngestDocument execute(IngestDocument ingestDocument) {
             additionalFields.put(Property.LANGUAGE.toLowerCase(), language);
         }
 
-        if (properties.contains(Property.DATE)) {
-            String createdDate = metadata.get(TikaCoreProperties.CREATED);
-            if (createdDate != null) {
-                additionalFields.put(Property.DATE.toLowerCase(), createdDate);
-            }
-        }
-
-        if (properties.contains(Property.TITLE)) {
-            String title = metadata.get(TikaCoreProperties.TITLE);
-            if (Strings.hasLength(title)) {
-                additionalFields.put(Property.TITLE.toLowerCase(), title);
-            }
-        }
-
-        if (properties.contains(Property.AUTHOR)) {
-            String author = metadata.get("Author");
-            if (Strings.hasLength(author)) {
-                additionalFields.put(Property.AUTHOR.toLowerCase(), author);
-            }
-        }
-
-        if (properties.contains(Property.KEYWORDS)) {
-            String keywords = metadata.get("Keywords");
-            if (Strings.hasLength(keywords)) {
-                additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords);
-            }
-        }
-
-        if (properties.contains(Property.CONTENT_TYPE)) {
-            String contentType = metadata.get(Metadata.CONTENT_TYPE);
-            if (Strings.hasLength(contentType)) {
-                additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType);
-            }
-        }
+        addAdditionalField(additionalFields, Property.DATE, metadata.get(TikaCoreProperties.CREATED));
+        addAdditionalField(additionalFields, Property.TITLE, metadata.get(TikaCoreProperties.TITLE));
+        addAdditionalField(additionalFields, Property.AUTHOR, metadata.get("Author"));
+        addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get("Keywords"));
+        addAdditionalField(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));
 
         if (properties.contains(Property.CONTENT_LENGTH)) {
             String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
@@ -178,6 +150,30 @@ public IngestDocument execute(IngestDocument ingestDocument) {
             additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
         }
 
+        addAdditionalField(additionalFields, Property.AUTHOR, metadata.get(TikaCoreProperties.CREATOR));
+        addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get(Office.KEYWORDS));
+
+        addAdditionalField(additionalFields, Property.MODIFIED, metadata.get(TikaCoreProperties.MODIFIED));
+        addAdditionalField(additionalFields, Property.FORMAT, metadata.get(TikaCoreProperties.FORMAT));
+        addAdditionalField(additionalFields, Property.IDENTIFIER, metadata.get(TikaCoreProperties.IDENTIFIER));
+        addAdditionalField(additionalFields, Property.CONTRIBUTOR, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+        addAdditionalField(additionalFields, Property.COVERAGE, metadata.get(TikaCoreProperties.COVERAGE));
+        addAdditionalField(additionalFields, Property.MODIFIER, metadata.get(TikaCoreProperties.MODIFIER));
+        addAdditionalField(additionalFields, Property.CREATOR_TOOL, metadata.get(TikaCoreProperties.CREATOR_TOOL));
+        addAdditionalField(additionalFields, Property.PUBLISHER, metadata.get(TikaCoreProperties.PUBLISHER));
+        addAdditionalField(additionalFields, Property.RELATION, metadata.get(TikaCoreProperties.RELATION));
+        addAdditionalField(additionalFields, Property.RIGHTS, metadata.get(TikaCoreProperties.RIGHTS));
+        addAdditionalField(additionalFields, Property.SOURCE, metadata.get(TikaCoreProperties.SOURCE));
+        addAdditionalField(additionalFields, Property.TYPE, metadata.get(TikaCoreProperties.TYPE));
+        addAdditionalField(additionalFields, Property.DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION));
+        addAdditionalField(additionalFields, Property.PRINT_DATE, metadata.get(TikaCoreProperties.PRINT_DATE));
+        addAdditionalField(additionalFields, Property.METADATA_DATE, metadata.get(TikaCoreProperties.METADATA_DATE));
+        addAdditionalField(additionalFields, Property.LATITUDE, metadata.get(TikaCoreProperties.LATITUDE));
+        addAdditionalField(additionalFields, Property.LONGITUDE, metadata.get(TikaCoreProperties.LONGITUDE));
+        addAdditionalField(additionalFields, Property.ALTITUDE, metadata.get(TikaCoreProperties.ALTITUDE));
+        addAdditionalField(additionalFields, Property.RATING, metadata.get(TikaCoreProperties.RATING));
+        addAdditionalField(additionalFields, Property.COMMENTS, metadata.get(TikaCoreProperties.COMMENTS));
+
         ingestDocument.setFieldValue(targetField, additionalFields);
 
         if (removeBinary) {
@@ -186,6 +182,18 @@ public IngestDocument execute(IngestDocument ingestDocument) {
         return ingestDocument;
     }
 
+    /**
+     * Add an additional field if the property is one of the configured properties and the value is not null or empty
+     * @param additionalFields  map of extracted fields to add to, keyed by the lowercase property name
+     * @param property          property to add
+     * @param value             value to add
+     */
+    private void addAdditionalField(Map<String, Object> additionalFields, Property property, String value) {
+        if (properties.contains(property) && Strings.hasLength(value)) {
+            additionalFields.put(property.toLowerCase(), value);
+        }
+    }
+
     @Override
     public String getType() {
         return TYPE;
@@ -270,7 +278,27 @@ enum Property {
         DATE,
         CONTENT_TYPE,
         CONTENT_LENGTH,
-        LANGUAGE;
+        LANGUAGE,
+        MODIFIED,
+        FORMAT,
+        IDENTIFIER,
+        CONTRIBUTOR,
+        COVERAGE,
+        MODIFIER,
+        CREATOR_TOOL,
+        PUBLISHER,
+        RELATION,
+        RIGHTS,
+        SOURCE,
+        TYPE,
+        DESCRIPTION,
+        PRINT_DATE,
+        METADATA_DATE,
+        LATITUDE,
+        LONGITUDE,
+        ALTITUDE,
+        RATING,
+        COMMENTS;
 
         public static Property parse(String value) {
             return valueOf(value.toUpperCase(Locale.ROOT));

diff --git a/...ttachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/...ttachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
@@ -68,9 +68,20 @@ public void testEnglishTextDocument() throws Exception {
     }
 
     public void testHtmlDocumentWithRandomFields() throws Exception {
-        // date is not present in the html doc
+        // some metadata are not present in the html doc
+        // "date", "metadata_date", "comments", "modified", "modifier", "print_date", "relation", "creator_tool", "altitude"
+        // "identifier", "longitude", "publisher", "description", "latitude", "format", "source", "coverage"
+        // "rating", "type", "contributor", "rights"
+        // we are only trying with content, title, author, keywords, content_type and content_length.
         ArrayList<AttachmentProcessor.Property> fieldsList = new ArrayList<>(
-            EnumSet.complementOf(EnumSet.of(AttachmentProcessor.Property.DATE))
+            EnumSet.of(
+                AttachmentProcessor.Property.CONTENT,
+                AttachmentProcessor.Property.TITLE,
+                AttachmentProcessor.Property.AUTHOR,
+                AttachmentProcessor.Property.KEYWORDS,
+                AttachmentProcessor.Property.CONTENT_TYPE,
+                AttachmentProcessor.Property.CONTENT_LENGTH
+            )
         );
         Set<AttachmentProcessor.Property> selectedProperties = new HashSet<>();
 
@@ -128,7 +139,20 @@ public void testEmptyTextDocument() throws Exception {
     public void testWordDocument() throws Exception {
         Map<String, Object> attachmentData = parseDocument("issue-104.docx", processor);
 
-        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
+        assertThat(
+            attachmentData.keySet(),
+            containsInAnyOrder(
+                "content",
+                "language",
+                "date",
+                "author",
+                "content_type",
+                "content_length",
+                "modifier",
+                "modified",
+                "publisher"
+            )
+        );
         assertThat(attachmentData.get("content"), is(notNullValue()));
         assertThat(attachmentData.get("language"), is("en"));
         assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z"));
@@ -138,12 +162,28 @@ public void testWordDocument() throws Exception {
             attachmentData.get("content_type").toString(),
             is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
         );
+        assertThat(attachmentData.get("modifier").toString(), is("Luka Lampret"));
+        assertThat(attachmentData.get("modified").toString(), is("2015-02-20T11:36:00Z"));
+        assertThat(attachmentData.get("publisher").toString(), is("JDI"));
     }
 
     public void testWordDocumentWithVisioSchema() throws Exception {
         Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);
 
-        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
+        assertThat(
+            attachmentData.keySet(),
+            containsInAnyOrder(
+                "content",
+                "language",
+                "date",
+                "author",
+                "content_type",
+                "content_length",
+                "modifier",
+                "modified",
+                "print_date"
+            )
+        );
         assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
         assertThat(attachmentData.get("language"), is("en"));
         assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
@@ -153,18 +193,37 @@ public void testWordDocumentWithVisioSchema() throws Exception {
             attachmentData.get("content_type").toString(),
             is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
         );
+        assertThat(attachmentData.get("modifier").toString(), is("Chris Dufour"));
+        assertThat(attachmentData.get("modified").toString(), is("2016-12-04T16:58:00Z"));
+        assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
     }
 
     public void testLegacyWordDocumentWithVisioSchema() throws Exception {
         Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);
 
-        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
+        assertThat(
+            attachmentData.keySet(),
+            containsInAnyOrder(
+                "content",
+                "language",
+                "date",
+                "author",
+                "content_type",
+                "content_length",
+                "modifier",
+                "modified",
+                "print_date"
+            )
+        );
         assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
         assertThat(attachmentData.get("language"), is("en"));
         assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
         assertThat(attachmentData.get("author"), is(notNullValue()));
         assertThat(attachmentData.get("content_length"), is(notNullValue()));
         assertThat(attachmentData.get("content_type").toString(), is("application/msword"));
+        assertThat(attachmentData.get("modifier").toString(), is("David Pilato"));
+        assertThat(attachmentData.get("modified").toString(), is("2016-12-16T15:04:00Z"));
+        assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
     }
 
     public void testPdf() throws Exception {
@@ -217,9 +276,26 @@ public void testEpubDocument() throws Exception {
 
         assertThat(
             attachmentData.keySet(),
-            containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", "date", "keywords")
+            containsInAnyOrder(
+                "language",
+                "content",
+                "author",
+                "title",
+                "content_type",
+                "content_length",
+                "date",
+                "keywords",
+                "identifier",
+                "contributor",
+                "publisher",
+                "description"
+            )
         );
         assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip"));
+        assertThat(attachmentData.get("identifier").toString(), is("1234567890"));
+        assertThat(attachmentData.get("contributor").toString(), is("no-one"));
+        assertThat(attachmentData.get("publisher").toString(), is("Apache"));
+        assertThat(attachmentData.get("description").toString(), is("This is an ePub test publication for Tika."));
     }
 
     // no real detection, just rudimentary

diff --git a/...nt/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml b/...nt/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml
@@ -1,5 +1,8 @@
 ---
 "Test ingest attachment processor with .doc file":
+  - skip:
+      version: " - 7.99.99"
+      reason: "new fields added in 8.0.0"
   - do:
       ingest.put_pipeline:
         id: "my_pipeline"
@@ -27,17 +30,22 @@
       get:
         index: test
         id: 1
-  - length: { _source.attachment: 6 }
+  - length: { _source.attachment: 8 }
   - match: { _source.attachment.content: "Test elasticsearch" }
   - match: { _source.attachment.language: "et" }
   - match: { _source.attachment.author: "David Pilato" }
   - match: { _source.attachment.date: "2016-03-10T08:25:00Z" }
   - match: { _source.attachment.content_length: 19 }
   - match: { _source.attachment.content_type: "application/msword" }
+  - match: { _source.attachment.modifier: "David Pilato" }
+  - match: { _source.attachment.modified: "2016-03-10T08:25:00Z" }
 
 
 ---
 "Test ingest attachment processor with .docx file":
+  - skip:
+      version: " - 7.99.99"
+      reason: "new fields added in 8.0.0"
   - do:
       ingest.put_pipeline:
         id: "my_pipeline"
@@ -65,10 +73,12 @@
       get:
         index: test
         id: 1
-  - length: { _source.attachment: 6 }
+  - length: { _source.attachment: 8 }
   - match: { _source.attachment.content: "Test elasticsearch" }
   - match: { _source.attachment.language: "et" }
   - match: { _source.attachment.author: "David Pilato" }
   - match: { _source.attachment.date: "2016-03-10T08:24:00Z" }
   - match: { _source.attachment.content_length: 19 }
   - match: { _source.attachment.content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" }
+  - match: { _source.attachment.modifier: "David Pilato" }
+  - match: { _source.attachment.modified: "2016-03-10T08:24:00Z" }