Skip to content

Commit

Permalink
Extract more standard metadata from binary files (elastic#78754)
Browse files Browse the repository at this point in the history
Until now, we have extracted only a small number of fields from the binary files sent to the ingest attachment plugin:

* `content`,
* `title`,
* `author`,
* `keywords`,
* `date`,
* `content_type`,
* `content_length`,
* `language`.

Tika has a list of more standard properties which can be extracted:

* `modified`,
* `format`,
* `identifier`,
* `contributor`,
* `coverage`,
* `modifier`,
* `creator_tool`,
* `publisher`,
* `relation`,
* `rights`,
* `source`,
* `type`,
* `description`,
* `print_date`,
* `metadata_date`,
* `latitude`,
* `longitude`,
* `altitude`,
* `rating`,
* `comments`

This commit exposes those new fields.

Related to elastic#22339.

Co-authored-by: Keith Massey <[email protected]>
  • Loading branch information
dadoonet and masseyke authored Nov 23, 2021
1 parent 480fd6a commit 564ff9d
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 43 deletions.
34 changes: 34 additions & 0 deletions docs/plugins/ingest-attachment.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,40 @@ The document's `attachment` object contains extracted properties for the file:
NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.

[[ingest-attachment-fields]]
==== Exported fields

The fields which might be extracted from a document are:

* `content`,
* `title`,
* `author`,
* `keywords`,
* `date`,
* `content_type`,
* `content_length`,
* `language`,
* `modified`,
* `format`,
* `identifier`,
* `contributor`,
* `coverage`,
* `modifier`,
* `creator_tool`,
* `publisher`,
* `relation`,
* `rights`,
* `source`,
* `type`,
* `description`,
* `print_date`,
* `metadata_date`,
* `latitude`,
* `longitude`,
* `altitude`,
* `rating`,
* `comments`

To extract only certain `attachment` fields, specify the `properties` array:

[source,console]
Expand Down
6 changes: 6 additions & 0 deletions plugins/ingest-attachment/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ tasks.named("forbiddenPatterns").configure {
exclude '**/text-cjk-*.txt'
}

tasks.named("yamlRestTestV7CompatTransform").configure { task ->
// 2 new tika metadata fields are returned in v8
task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .doc file")
task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .docx file")
}

tasks.named("thirdPartyAudit").configure {
ignoreMissingClasses()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.Strings;
Expand Down Expand Up @@ -132,40 +133,11 @@ public IngestDocument execute(IngestDocument ingestDocument) {
additionalFields.put(Property.LANGUAGE.toLowerCase(), language);
}

if (properties.contains(Property.DATE)) {
String createdDate = metadata.get(TikaCoreProperties.CREATED);
if (createdDate != null) {
additionalFields.put(Property.DATE.toLowerCase(), createdDate);
}
}

if (properties.contains(Property.TITLE)) {
String title = metadata.get(TikaCoreProperties.TITLE);
if (Strings.hasLength(title)) {
additionalFields.put(Property.TITLE.toLowerCase(), title);
}
}

if (properties.contains(Property.AUTHOR)) {
String author = metadata.get("Author");
if (Strings.hasLength(author)) {
additionalFields.put(Property.AUTHOR.toLowerCase(), author);
}
}

if (properties.contains(Property.KEYWORDS)) {
String keywords = metadata.get("Keywords");
if (Strings.hasLength(keywords)) {
additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords);
}
}

if (properties.contains(Property.CONTENT_TYPE)) {
String contentType = metadata.get(Metadata.CONTENT_TYPE);
if (Strings.hasLength(contentType)) {
additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType);
}
}
addAdditionalField(additionalFields, Property.DATE, metadata.get(TikaCoreProperties.CREATED));
addAdditionalField(additionalFields, Property.TITLE, metadata.get(TikaCoreProperties.TITLE));
addAdditionalField(additionalFields, Property.AUTHOR, metadata.get("Author"));
addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get("Keywords"));
addAdditionalField(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));

if (properties.contains(Property.CONTENT_LENGTH)) {
String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
Expand All @@ -178,6 +150,30 @@ public IngestDocument execute(IngestDocument ingestDocument) {
additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
}

addAdditionalField(additionalFields, Property.AUTHOR, metadata.get(TikaCoreProperties.CREATOR));
addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get(Office.KEYWORDS));

addAdditionalField(additionalFields, Property.MODIFIED, metadata.get(TikaCoreProperties.MODIFIED));
addAdditionalField(additionalFields, Property.FORMAT, metadata.get(TikaCoreProperties.FORMAT));
addAdditionalField(additionalFields, Property.IDENTIFIER, metadata.get(TikaCoreProperties.IDENTIFIER));
addAdditionalField(additionalFields, Property.CONTRIBUTOR, metadata.get(TikaCoreProperties.CONTRIBUTOR));
addAdditionalField(additionalFields, Property.COVERAGE, metadata.get(TikaCoreProperties.COVERAGE));
addAdditionalField(additionalFields, Property.MODIFIER, metadata.get(TikaCoreProperties.MODIFIER));
addAdditionalField(additionalFields, Property.CREATOR_TOOL, metadata.get(TikaCoreProperties.CREATOR_TOOL));
addAdditionalField(additionalFields, Property.PUBLISHER, metadata.get(TikaCoreProperties.PUBLISHER));
addAdditionalField(additionalFields, Property.RELATION, metadata.get(TikaCoreProperties.RELATION));
addAdditionalField(additionalFields, Property.RIGHTS, metadata.get(TikaCoreProperties.RIGHTS));
addAdditionalField(additionalFields, Property.SOURCE, metadata.get(TikaCoreProperties.SOURCE));
addAdditionalField(additionalFields, Property.TYPE, metadata.get(TikaCoreProperties.TYPE));
addAdditionalField(additionalFields, Property.DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION));
addAdditionalField(additionalFields, Property.PRINT_DATE, metadata.get(TikaCoreProperties.PRINT_DATE));
addAdditionalField(additionalFields, Property.METADATA_DATE, metadata.get(TikaCoreProperties.METADATA_DATE));
addAdditionalField(additionalFields, Property.LATITUDE, metadata.get(TikaCoreProperties.LATITUDE));
addAdditionalField(additionalFields, Property.LONGITUDE, metadata.get(TikaCoreProperties.LONGITUDE));
addAdditionalField(additionalFields, Property.ALTITUDE, metadata.get(TikaCoreProperties.ALTITUDE));
addAdditionalField(additionalFields, Property.RATING, metadata.get(TikaCoreProperties.RATING));
addAdditionalField(additionalFields, Property.COMMENTS, metadata.get(TikaCoreProperties.COMMENTS));

ingestDocument.setFieldValue(targetField, additionalFields);

if (removeBinary) {
Expand All @@ -186,6 +182,18 @@ public IngestDocument execute(IngestDocument ingestDocument) {
return ingestDocument;
}

/**
* Add an additional field if not null or empty
* @param additionalFields additional fields
* @param property property to add
* @param value value to add
*/
private <T> void addAdditionalField(Map<String, Object> additionalFields, Property property, String value) {
if (properties.contains(property) && Strings.hasLength(value)) {
additionalFields.put(property.toLowerCase(), value);
}
}

@Override
public String getType() {
return TYPE;
Expand Down Expand Up @@ -270,7 +278,27 @@ enum Property {
DATE,
CONTENT_TYPE,
CONTENT_LENGTH,
LANGUAGE;
LANGUAGE,
MODIFIED,
FORMAT,
IDENTIFIER,
CONTRIBUTOR,
COVERAGE,
MODIFIER,
CREATOR_TOOL,
PUBLISHER,
RELATION,
RIGHTS,
SOURCE,
TYPE,
DESCRIPTION,
PRINT_DATE,
METADATA_DATE,
LATITUDE,
LONGITUDE,
ALTITUDE,
RATING,
COMMENTS;

public static Property parse(String value) {
return valueOf(value.toUpperCase(Locale.ROOT));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,20 @@ public void testEnglishTextDocument() throws Exception {
}

public void testHtmlDocumentWithRandomFields() throws Exception {
// date is not present in the html doc
// some metadata are not present in the html doc
// "date", "metadata_date", "comments", "modified", "modifier", "print_date", "relation", "creator_tool", "altitude"
// "identifier", "longitude", "publisher", "description", "latitude", "format", "source", "coverage"
// "rating", "type", "contributor", "rights"
// we are only trying with content, title, author, keywords, content_type and content_length.
ArrayList<AttachmentProcessor.Property> fieldsList = new ArrayList<>(
EnumSet.complementOf(EnumSet.of(AttachmentProcessor.Property.DATE))
EnumSet.of(
AttachmentProcessor.Property.CONTENT,
AttachmentProcessor.Property.TITLE,
AttachmentProcessor.Property.AUTHOR,
AttachmentProcessor.Property.KEYWORDS,
AttachmentProcessor.Property.CONTENT_TYPE,
AttachmentProcessor.Property.CONTENT_LENGTH
)
);
Set<AttachmentProcessor.Property> selectedProperties = new HashSet<>();

Expand Down Expand Up @@ -128,7 +139,20 @@ public void testEmptyTextDocument() throws Exception {
public void testWordDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("issue-104.docx", processor);

assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
assertThat(
attachmentData.keySet(),
containsInAnyOrder(
"content",
"language",
"date",
"author",
"content_type",
"content_length",
"modifier",
"modified",
"publisher"
)
);
assertThat(attachmentData.get("content"), is(notNullValue()));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z"));
Expand All @@ -138,12 +162,28 @@ public void testWordDocument() throws Exception {
attachmentData.get("content_type").toString(),
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
);
assertThat(attachmentData.get("modifier").toString(), is("Luka Lampret"));
assertThat(attachmentData.get("modified").toString(), is("2015-02-20T11:36:00Z"));
assertThat(attachmentData.get("publisher").toString(), is("JDI"));
}

public void testWordDocumentWithVisioSchema() throws Exception {
Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);

assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
assertThat(
attachmentData.keySet(),
containsInAnyOrder(
"content",
"language",
"date",
"author",
"content_type",
"content_length",
"modifier",
"modified",
"print_date"
)
);
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
Expand All @@ -153,18 +193,37 @@ public void testWordDocumentWithVisioSchema() throws Exception {
attachmentData.get("content_type").toString(),
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
);
assertThat(attachmentData.get("modifier").toString(), is("Chris Dufour"));
assertThat(attachmentData.get("modified").toString(), is("2016-12-04T16:58:00Z"));
assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
}

public void testLegacyWordDocumentWithVisioSchema() throws Exception {
Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);

assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
assertThat(
attachmentData.keySet(),
containsInAnyOrder(
"content",
"language",
"date",
"author",
"content_type",
"content_length",
"modifier",
"modified",
"print_date"
)
);
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
assertThat(attachmentData.get("author"), is(notNullValue()));
assertThat(attachmentData.get("content_length"), is(notNullValue()));
assertThat(attachmentData.get("content_type").toString(), is("application/msword"));
assertThat(attachmentData.get("modifier").toString(), is("David Pilato"));
assertThat(attachmentData.get("modified").toString(), is("2016-12-16T15:04:00Z"));
assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
}

public void testPdf() throws Exception {
Expand Down Expand Up @@ -217,9 +276,26 @@ public void testEpubDocument() throws Exception {

assertThat(
attachmentData.keySet(),
containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", "date", "keywords")
containsInAnyOrder(
"language",
"content",
"author",
"title",
"content_type",
"content_length",
"date",
"keywords",
"identifier",
"contributor",
"publisher",
"description"
)
);
assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip"));
assertThat(attachmentData.get("identifier").toString(), is("1234567890"));
assertThat(attachmentData.get("contributor").toString(), is("no-one"));
assertThat(attachmentData.get("publisher").toString(), is("Apache"));
assertThat(attachmentData.get("description").toString(), is("This is an ePub test publication for Tika."));
}

// no real detection, just rudimentary
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
---
"Test ingest attachment processor with .doc file":
- skip:
version: " - 7.99.99"
reason: "new fields added in 8.0.0"
- do:
ingest.put_pipeline:
id: "my_pipeline"
Expand Down Expand Up @@ -27,17 +30,22 @@
get:
index: test
id: 1
- length: { _source.attachment: 6 }
- length: { _source.attachment: 8 }
- match: { _source.attachment.content: "Test elasticsearch" }
- match: { _source.attachment.language: "et" }
- match: { _source.attachment.author: "David Pilato" }
- match: { _source.attachment.date: "2016-03-10T08:25:00Z" }
- match: { _source.attachment.content_length: 19 }
- match: { _source.attachment.content_type: "application/msword" }
- match: { _source.attachment.modifier: "David Pilato" }
- match: { _source.attachment.modified: "2016-03-10T08:25:00Z" }


---
"Test ingest attachment processor with .docx file":
- skip:
version: " - 7.99.99"
reason: "new fields added in 8.0.0"
- do:
ingest.put_pipeline:
id: "my_pipeline"
Expand Down Expand Up @@ -65,10 +73,12 @@
get:
index: test
id: 1
- length: { _source.attachment: 6 }
- length: { _source.attachment: 8 }
- match: { _source.attachment.content: "Test elasticsearch" }
- match: { _source.attachment.language: "et" }
- match: { _source.attachment.author: "David Pilato" }
- match: { _source.attachment.date: "2016-03-10T08:24:00Z" }
- match: { _source.attachment.content_length: 19 }
- match: { _source.attachment.content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" }
- match: { _source.attachment.modifier: "David Pilato" }
- match: { _source.attachment.modified: "2016-03-10T08:24:00Z" }

0 comments on commit 564ff9d

Please sign in to comment.