Skip to content

Commit

Permalink
feat: hash duplicates with cli option digestAlgorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
bamthomas committed Oct 2, 2023
1 parent 355f527 commit 8a47f82
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.icij.datashare.PropertiesProvider;
import org.icij.datashare.com.Message;
import org.icij.datashare.com.Publisher;
import org.icij.datashare.text.Hasher;
import org.icij.datashare.text.Language;
import org.icij.datashare.text.indexing.LanguageGuesser;
import org.icij.extract.document.TikaDocument;
Expand Down Expand Up @@ -48,6 +49,7 @@ public class ElasticsearchSpewer extends Spewer implements Serializable {
private final Publisher publisher;
private final LanguageGuesser languageGuesser;
private final int maxContentLength;
private final Hasher digestAlgorithm;
private String indexName;

@Inject
Expand All @@ -59,6 +61,7 @@ public ElasticsearchSpewer(final RestHighLevelClient client, LanguageGuesser lan
this.publisher = publisher;
this.esCfg = new ElasticsearchConfiguration(propertiesProvider);
this.maxContentLength = getMaxContentLength(propertiesProvider);
this.digestAlgorithm = getDigestAlgorithm(propertiesProvider);
logger.info("spewer defined with {}", esCfg);
}

Expand Down Expand Up @@ -91,7 +94,7 @@ private IndexRequest prepareRequest(final TikaDocument document, final TikaDocum
Map<String, Object> jsonDocument = getDocumentMap(document);

if (parent == null && isDuplicate(document.getId())) {
IndexRequest indexRequest = new IndexRequest(indexName).id(Entity.DEFAULT_DIGESTER.hash(document.getPath()));
IndexRequest indexRequest = new IndexRequest(indexName).id(digestAlgorithm.hash(document.getPath()));
indexRequest.source(getDuplicateMap(document));
indexRequest.setRefreshPolicy(esCfg.refreshPolicy);
return indexRequest;
Expand Down Expand Up @@ -199,4 +202,9 @@ public static String normalize(String input) {
/**
 * Resolves the maximum content length from the {@code maxContentLength} property.
 *
 * <p>The property value is handed to {@link HumanReadableSize#parse}; when the
 * property is not set, the literal {@code "-1"} is parsed instead (presumably
 * meaning "no limit" — confirm against the callers). The parsed size is capped at
 * {@link Integer#MAX_VALUE} before being narrowed to an {@code int}.
 *
 * @param propertiesProvider source of the {@code maxContentLength} property
 * @return the parsed size, never greater than {@link Integer#MAX_VALUE}
 */
int getMaxContentLength(PropertiesProvider propertiesProvider) {
    final String configuredSize = propertiesProvider.get("maxContentLength").orElse("-1");
    // Cap before the cast so a size above Integer.MAX_VALUE does not wrap around.
    return (int) Math.min(HumanReadableSize.parse(configuredSize), Integer.MAX_VALUE);
}

/**
 * Resolves the hasher used for duplicate ids from the {@code digestAlgorithm} property.
 *
 * <p>When the property is not set, the name of {@code Entity.DEFAULT_DIGESTER} is
 * parsed instead. If {@link Hasher#parse} yields no result for the resolved name,
 * {@code Entity.DEFAULT_DIGESTER} itself is returned.
 *
 * @param propertiesProvider source of the {@code digestAlgorithm} property
 * @return the parsed hasher, or {@code Entity.DEFAULT_DIGESTER} as a fallback
 */
private Hasher getDigestAlgorithm(PropertiesProvider propertiesProvider) {
    final Hasher fallback = Entity.DEFAULT_DIGESTER;
    final String algorithmName = propertiesProvider.get("digestAlgorithm").orElse(fallback.name());
    return Hasher.parse(algorithmName).orElse(fallback);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import org.icij.datashare.com.Publisher;
import org.icij.datashare.test.ElasticsearchRule;
import org.icij.datashare.text.Document;
import org.icij.datashare.text.Duplicate;
import org.icij.datashare.text.Hasher;
import org.icij.datashare.text.Language;
import org.icij.datashare.text.Project;
import org.icij.extract.document.DocumentFactory;
Expand Down Expand Up @@ -342,25 +342,29 @@ public void test_extract_id_should_be_equal_to_datashare_id() throws IOException

@Test
public void test_duplicate_file() throws Exception {
Options<String> from = Options.from(new HashMap<>() {{
put("digestAlgorithm", Document.DEFAULT_DIGESTER.toString());
HashMap<String, String> digestProperties = new HashMap<>() {{
put("digestAlgorithm", "SHA-256");
put("digestProjectName", "project");
}});
}};
ElasticsearchSpewer spewer256 = new ElasticsearchSpewer(es.client,
text -> Language.ENGLISH, new FieldNames(), publisher, new PropertiesProvider(digestProperties)).withRefresh(IMMEDIATE).withIndex("test-datashare");
Options<String> from = Options.from(digestProperties);
DocumentFactory tikaFactory = new DocumentFactory().configure(from);
Extractor extractor = new Extractor(tikaFactory).configure(from);

final TikaDocument document = extractor.extract(get(Objects.requireNonNull(getClass().getResource("/docs/doc.txt")).getPath()));
final TikaDocument document2 = extractor.extract(get(Objects.requireNonNull(getClass().getResource("/docs/doc-duplicate.txt")).getPath()));

spewer.write(document);
spewer.write(document2);
spewer256.write(document);
spewer256.write(document2);

GetResponse actualDocument = es.client.get(new GetRequest(TEST_INDEX, document.getId()),RequestOptions.DEFAULT);
GetResponse actualDocument2 = es.client.get(new GetRequest(TEST_INDEX, new Duplicate(document2.getPath(), document.getId()).getId()), RequestOptions.DEFAULT);
GetResponse actualDocument2 = es.client.get(new GetRequest(TEST_INDEX, Hasher.SHA_256.hash(document2.getPath())), RequestOptions.DEFAULT);
assertThat(actualDocument.isExists()).isTrue();
assertThat(actualDocument.getSourceAsMap()).includes(entry("type", "Document"));
assertThat(actualDocument2.isExists()).isTrue();
assertThat(actualDocument2.getSourceAsMap()).includes(entry("type", "Duplicate"));
assertThat(actualDocument2.getId().length()).isEqualTo(Hasher.SHA_256.digestLength);
}

@Test
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>

<datashare-api.version>12.0.0</datashare-api.version>
<datashare-api.version>12.1.0</datashare-api.version>

<datashare-cli.version>${project.version}</datashare-cli.version>
<datashare-mitie.version>${project.version}</datashare-mitie.version>
Expand Down

0 comments on commit 8a47f82

Please sign in to comment.