Skip to content

Commit

Permalink
Merge pull request #1055 from Norconex/feature/CU-86888fmx3/crawler-s…
Browse files Browse the repository at this point in the history
…tability

Javadoc updates.
  • Loading branch information
essiembre authored Sep 2, 2024
2 parents 29aa3f8 + f124363 commit 26a8360
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 107 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,42 +33,29 @@
* Implementors should offer this XML configuration usage:
* </p>
*
* {@nx.xml #usage
* <metadataChecksummer class="(subclass)">
* keep="[false|true]"
* toField="(optional metadata field to store the checksum)"
* onSet="[append|prepend|replace|optional]" />
* }
* <p>
* <code>toField</code> is ignored unless the <code>keep</code>
* attribute is set to <code>true</code>.
* </p>
*/
@Data
@Accessors(chain = true)
// NOTE(review): the field Javadoc below carries @param/@return tags, which
// are not valid on fields; presumably they are meant for the Lombok-generated
// accessors, hence the suppression -- confirm before removing either.
@SuppressWarnings("javadoc")
public class BaseChecksummerConfig {

/**
* Whether to keep the metadata checksum value as a new metadata field.
* @param keep <code>true</code> to keep the checksum
* @return <code>true</code> to keep the checksum
*/
private boolean keep;

/**
* The metadata field to use to store the checksum value.
* A default field name may be set by checksummer implementations.
* Only applicable if {@link #isKeep()} returns {@code true}.
* @param toField the metadata field name
* @return metadata field name
*/
private String toField;

/**
* The property setter to use when a value is set.
* @param onSet property setter
* @return property setter
*/
private PropertySetter onSet;
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,11 @@
* </p>
* <p>
* You have the option to keep the checksum as a document metadata field.
* When {@link Md5DocumentChecksummerConfig#setKeep(boolean)} is
* <code>true</code>, the checksum will be
* stored in the target field name specified. If you do not specify any,
* it is stored under the metadata field name
* {@value CrawlDocMetadata#CHECKSUM_METADATA}.
* </p>
*
* <p>
Expand All @@ -54,38 +55,11 @@
* will ignore the content while specifying none will only use the content.
* </p>
*
* {@nx.xml.usage
* <documentChecksummer
* class="com.norconex.crawler.core.checksum.impl.MD5DocumentChecksummer"
* combineFieldsAndContent="[false|true]"
* keep="[false|true]"
* toField="(optional metadata field to store the checksum)">
*
* <fieldMatcher {@nx.include com.norconex.commons.lang.text.TextMatcher#matchAttributes}>
* (expression matching fields used to create the checksum)
* </fieldMatcher>
* </documentChecksummer>
* }
* <p>
* <code>toField</code> is ignored unless the <code>keep</code>
* attribute is set to <code>true</code>.
* </p>
*
* {@nx.xml.example
* <documentChecksummer class="Md5DocumentChecksummer" />
* }
*
* <p>
* The above example uses the document body (default) to make the checksum.
* </p>
*
* <p>
* <b>Since 2.0.0</b>, a self-closing
* <code>&lt;documentChecksummer/&gt;</code> tag without any attributes
* is used to disable checksum generation.
* </p>
*/
@SuppressWarnings("javadoc")
@Data
public class Md5DocumentChecksummer
extends AbstractDocumentChecksummer<Md5DocumentChecksummerConfig> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,78 +15,20 @@
package com.norconex.crawler.core.doc.operations.checksum.impl;

import com.norconex.commons.lang.text.TextMatcher;
import com.norconex.crawler.core.doc.CrawlDocMetadata;
import com.norconex.crawler.core.doc.operations.checksum.BaseChecksummerConfig;
import com.norconex.crawler.core.doc.operations.checksum.DocumentChecksummer;

import lombok.Data;
import lombok.experimental.Accessors;

/**
* <p>Implementation of {@link DocumentChecksummer} which
* returns a MD5 checksum value of the extracted document content unless
* one or more given source fields are specified, in which case the MD5
* checksum value is constructed from those fields. This checksum is normally
* performed right after the document has been imported.
* </p>
* <p>
* You have the option to keep the checksum as a document metadata field.
* When {@link #setKeep(boolean)} is <code>true</code>, the checksum will be
* stored in the target field name specified. If you do not specify any,
* it stores it under the metadata field name
* {@link CrawlDocMetadata#CHECKSUM_METADATA}.
* </p>
*
* <p>
* <b>Since 1.9.0</b>, it is possible to use a combination of document content
* and fields to create the checksum by setting
* <code>combineFieldsAndContent</code> to <code>true</code>.
* If you combine fields and content but you don't define a field matcher,
* it will be the equivalent of adding all fields.
* If you do not combine the two, specifying a field matcher
* will ignore the content while specifying none will only use the content.
* </p>
*
* {@nx.xml.usage
* <documentChecksummer
* class="com.norconex.crawler.core.checksum.impl.MD5DocumentChecksummer"
* combineFieldsAndContent="[false|true]"
* keep="[false|true]"
* toField="(optional metadata field to store the checksum)">
*
* <fieldMatcher {@nx.include com.norconex.commons.lang.text.TextMatcher#matchAttributes}>
* (expression matching fields used to create the checksum)
* </fieldMatcher>
* </documentChecksummer>
* }
* <p>
* <code>toField</code> is ignored unless the <code>keep</code>
* attribute is set to <code>true</code>.
* </p>
*
* {@nx.xml.example
* <documentChecksummer class="Md5DocumentChecksummer" />
* }
*
* <p>
* The above example uses the document body (default) to make the checksum.
* </p>
*
* <p>
* <b>Since 2.0.0</b>, a self-closing
* <code>&lt;documentChecksummer/&gt;</code> tag without any attributes
* is used to disable checksum generation.
* </p>
* Configuration for {@link Md5DocumentChecksummer}.
*/
@SuppressWarnings("javadoc")
@Data
@Accessors(chain = true)
public class Md5DocumentChecksummerConfig extends BaseChecksummerConfig {

/**
* The field matcher.
* @param fieldMatcher field matcher
* @return field matcher
* Matcher of one or more fields to use to make up the checksum.
*/
private final TextMatcher fieldMatcher = new TextMatcher();

Expand All @@ -98,10 +40,6 @@ public Md5DocumentChecksummerConfig setFieldMatcher(

/**
* Whether we are combining the fields and content checksums.
* @param combineFieldsAndContent <code>true</code> if combining fields
* and content checksums
* @return <code>true</code> if combining fields and content checksums
*/
private boolean combineFieldsAndContent;

}

0 comments on commit 26a8360

Please sign in to comment.