Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use soft-update to maintain document history #29458

Merged
merged 7 commits into from
Apr 12, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions server/src/main/java/org/elasticsearch/common/lucene/Lucene.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.document.LatLonDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
Expand Down Expand Up @@ -96,6 +97,8 @@ public class Lucene {
assert annotation == null : "DocValuesFormat " + LATEST_DOC_VALUES_FORMAT + " is deprecated" ;
}

public static final String SOFT_DELETE_FIELD = "__soft_delete";

public static final NamedAnalyzer STANDARD_ANALYZER = new NamedAnalyzer("_standard", AnalyzerScope.GLOBAL, new StandardAnalyzer());
public static final NamedAnalyzer KEYWORD_ANALYZER = new NamedAnalyzer("_keyword", AnalyzerScope.GLOBAL, new KeywordAnalyzer());

Expand Down Expand Up @@ -829,4 +832,11 @@ public int length() {
}
};
}

/**
* Returns a numeric docvalues which can be used to soft-delete documents.
*/
public static NumericDocValuesField getSoftDeleteDVMarker() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

call this newSoftDeletesField?

return new NumericDocValuesField(SOFT_DELETE_FIELD, 1);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
IndexSettings.MAX_REGEX_LENGTH_SETTING,
ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE_SETTING,
IndexSettings.INDEX_GC_DELETES_SETTING,
IndexSettings.INDEX_SOFT_DELETES_SETTING,
IndicesRequestCache.INDEX_CACHE_REQUEST_ENABLED_SETTING,
UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING,
EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING,
Expand Down
15 changes: 15 additions & 0 deletions server/src/main/java/org/elasticsearch/index/IndexSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,12 @@ public final class IndexSettings {
public static final Setting<TimeValue> INDEX_GC_DELETES_SETTING =
Setting.timeSetting("index.gc_deletes", DEFAULT_GC_DELETES, new TimeValue(-1, TimeUnit.MILLISECONDS), Property.Dynamic,
Property.IndexScope);

/**
* Specifies if the index should use soft-delete instead of hard-delete for update/delete operations.
*/
public static final Setting<Boolean> INDEX_SOFT_DELETES_SETTING = Setting.boolSetting("index.soft_deletes", true, Property.IndexScope);

/**
* The maximum number of refresh listeners allows on this shard.
*/
Expand Down Expand Up @@ -296,6 +302,7 @@ public final class IndexSettings {
private final IndexSortConfig indexSortConfig;
private final IndexScopedSettings scopedSettings;
private long gcDeletesInMillis = DEFAULT_GC_DELETES.millis();
private final boolean softDeleteEnabled;
private volatile boolean warmerEnabled;
private volatile int maxResultWindow;
private volatile int maxInnerResultWindow;
Expand Down Expand Up @@ -412,6 +419,7 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
generationThresholdSize = scopedSettings.get(INDEX_TRANSLOG_GENERATION_THRESHOLD_SIZE_SETTING);
mergeSchedulerConfig = new MergeSchedulerConfig(this);
gcDeletesInMillis = scopedSettings.get(INDEX_GC_DELETES_SETTING).getMillis();
softDeleteEnabled = scopedSettings.get(INDEX_SOFT_DELETES_SETTING);
warmerEnabled = scopedSettings.get(INDEX_WARMER_ENABLED_SETTING);
maxResultWindow = scopedSettings.get(MAX_RESULT_WINDOW_SETTING);
maxInnerResultWindow = scopedSettings.get(MAX_INNER_RESULT_WINDOW_SETTING);
Expand Down Expand Up @@ -868,4 +876,11 @@ public boolean isExplicitRefresh() {
* Returns the time that an index shard becomes search idle unless it's accessed in between
*/
public TimeValue getSearchIdleAfter() { return searchIdleAfter; }

/**
* Returns <code>true</code> if soft-delete is enabled.
*/
public boolean isSoftDeleteEnabled() {
return getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1) && softDeleteEnabled;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe resolve this in the ctor?

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ public final class CommitStats implements Streamable, ToXContentFragment {
private String id; // lucene commit id in base 64;
private int numDocs;

public CommitStats(SegmentInfos segmentInfos) {
public CommitStats(SegmentInfos segmentInfos, int numDocs) {
// clone the map to protect against concurrent changes
userData = MapBuilder.<String, String>newMapBuilder().putAll(segmentInfos.getUserData()).immutableMap();
// lucene calls the current generation, last generation.
generation = segmentInfos.getLastGeneration();
id = Base64.getEncoder().encodeToString(segmentInfos.getId());
numDocs = Lucene.getNumDocs(segmentInfos);
this.numDocs = numDocs;
}

private CommitStats() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,9 @@ protected final void ensureOpen() {

/** get commits stats for the last commit */
public CommitStats commitStats() {
return new CommitStats(getLastCommittedSegmentInfos());
try (Engine.Searcher searcher = acquireSearcher("commit_stats", Engine.SearcherScope.INTERNAL)) {
return new CommitStats(getLastCommittedSegmentInfos(), searcher.reader().numDocs());
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SoftDeletesRetentionMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ReferenceManager;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
Expand Down Expand Up @@ -68,7 +70,6 @@
import org.elasticsearch.index.seqno.LocalCheckpointTracker;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.ElasticsearchMergePolicy;
import org.elasticsearch.index.shard.IndexingStats;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.index.translog.TranslogConfig;
Expand Down Expand Up @@ -141,6 +142,7 @@ public class InternalEngine extends Engine {
private final CounterMetric numDocDeletes = new CounterMetric();
private final CounterMetric numDocAppends = new CounterMetric();
private final CounterMetric numDocUpdates = new CounterMetric();
private final boolean softDeleteEnabled;

/**
* How many bytes we are currently moving to disk, via either IndexWriter.flush or refresh. IndexingMemoryController polls this
Expand All @@ -165,6 +167,7 @@ public InternalEngine(EngineConfig engineConfig) {
maxUnsafeAutoIdTimestamp.set(Long.MAX_VALUE);
}
this.uidField = engineConfig.getIndexSettings().isSingleType() ? IdFieldMapper.NAME : UidFieldMapper.NAME;
this.softDeleteEnabled = engineConfig.getIndexSettings().isSoftDeleteEnabled();
final TranslogDeletionPolicy translogDeletionPolicy = new TranslogDeletionPolicy(
engineConfig.getIndexSettings().getTranslogRetentionSize().getBytes(),
engineConfig.getIndexSettings().getTranslogRetentionAge().getMillis()
Expand Down Expand Up @@ -772,6 +775,7 @@ public IndexResult index(Index index) throws IOException {
} else if (plan.indexIntoLucene) {
indexResult = indexIntoLucene(index, plan);
} else {
// TODO: We need to index stale documents to have a full history in Lucene.
indexResult = new IndexResult(
plan.versionForIndexing, plan.seqNoForIndexing, plan.currentNotFoundOrDeleted);
}
Expand Down Expand Up @@ -1070,10 +1074,18 @@ private boolean assertDocDoesNotExist(final Index index, final boolean allowDele
}

private void updateDocs(final Term uid, final List<ParseContext.Document> docs, final IndexWriter indexWriter) throws IOException {
if (docs.size() > 1) {
indexWriter.updateDocuments(uid, docs);
if (softDeleteEnabled) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in the delete case we should also use indexWriter.updateDocValue(uid, Lucene.getSoftDeleteDVMarker()); instead of IndexWriter#deleteDocuments and then override this in the test to maybe throw an exception.

if (docs.size() > 1) {
indexWriter.softUpdateDocuments(uid, docs, Lucene.getSoftDeleteDVMarker());
} else {
indexWriter.softUpdateDocument(uid, docs.get(0), Lucene.getSoftDeleteDVMarker());
}
} else {
indexWriter.updateDocument(uid, docs.get(0));
if (docs.size() > 1) {
indexWriter.updateDocuments(uid, docs);
} else {
indexWriter.updateDocument(uid, docs.get(0));
}
}
numDocUpdates.inc(docs.size());
}
Expand Down Expand Up @@ -1918,6 +1930,7 @@ IndexWriter createWriter(Directory directory, IndexWriterConfig iwc) throws IOEx
return new IndexWriter(directory, iwc);
}


private IndexWriterConfig getIndexWriterConfig() {
final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
iwc.setCommitOnClose(false); // we by default don't commit on close
Expand All @@ -1931,11 +1944,15 @@ private IndexWriterConfig getIndexWriterConfig() {
}
iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
iwc.setMergeScheduler(mergeScheduler);
MergePolicy mergePolicy = config().getMergePolicy();
// Give us the opportunity to upgrade old segments while performing
// background merges
mergePolicy = new ElasticsearchMergePolicy(mergePolicy);
iwc.setMergePolicy(mergePolicy);
MergePolicy mergePolicy = config().getMergePolicy();
if (softDeleteEnabled) {
iwc.setSoftDeletesField(Lucene.SOFT_DELETE_FIELD);
// TODO: soft-delete retention policy
mergePolicy = new SoftDeletesRetentionMergePolicy(Lucene.SOFT_DELETE_FIELD, () -> new MatchAllDocsQuery(), mergePolicy);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't need this here since it's match all. Lets just omit it until we use it

}
iwc.setMergePolicy(new ElasticsearchMergePolicy(mergePolicy));
iwc.setSimilarity(engineConfig.getSimilarity());
iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().getMbFrac());
iwc.setCodec(engineConfig.getCodec());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package org.elasticsearch.index.shard;

import org.apache.logging.log4j.Logger;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeTrigger;
Expand Down Expand Up @@ -144,6 +145,11 @@ public boolean useCompoundFile(SegmentInfos segments, SegmentCommitInfo newSegme
return delegate.useCompoundFile(segments, newSegment, writer);
}

@Override
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of doing this I think we should extend MergePolicyWrapper in master and 6.x and then this is handled upstream

public boolean keepFullyDeletedSegment(CodecReader reader) throws IOException {
return delegate.keepFullyDeletedSegment(reader);
}

/**
* When <code>upgrade</code> is true, running a force merge will upgrade any segments written
* with older versions. This will apply to the next call to
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,6 @@ public RecoveryDiff recoveryDiff(MetadataSnapshot recoveryTargetSnapshot) {
}
final String segmentId = IndexFileNames.parseSegmentName(meta.name());
final String extension = IndexFileNames.getExtension(meta.name());
assert FIELD_INFOS_FILE_EXTENSION.equals(extension) == false || IndexFileNames.stripExtension(IndexFileNames.stripSegmentName(meta.name())).isEmpty() : "FieldInfos are generational but updateable DV are not supported in elasticsearch";
if (IndexFileNames.SEGMENTS.equals(segmentId) || DEL_FILE_EXTENSION.equals(extension) || LIV_FILE_EXTENSION.equals(extension)) {
// only treat del files as per-commit files fnm files are generational but only for upgradable DV
perCommitStoreFiles.add(meta);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterDirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
Expand Down Expand Up @@ -119,6 +121,7 @@
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.index.translog.TranslogConfig;
import org.elasticsearch.indices.breaker.NoneCircuitBreakerService;
import org.elasticsearch.test.IndexSettingsModule;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers;

Expand Down Expand Up @@ -504,7 +507,7 @@ public void testSegmentsWithMergeFlag() throws Exception {

if (flush) {
// we should have had just 1 merge, so last generation should be exact
assertEquals(gen2, store.readLastCommittedSegmentsInfo().getLastGeneration());
assertEquals(gen2 + 1, store.readLastCommittedSegmentsInfo().getLastGeneration());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm why does this need to change?

}
}
}
Expand Down Expand Up @@ -2711,6 +2714,12 @@ private void maybeThrowFailure() throws IOException {
}
}

@Override
public long softUpdateDocument(Term term, Iterable<? extends IndexableField> doc, Field... softDeletes) throws IOException {
maybeThrowFailure();
return super.softUpdateDocument(term, doc, softDeletes);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

++

}

@Override
public long deleteDocuments(Term... terms) throws IOException {
maybeThrowFailure();
Expand Down Expand Up @@ -4537,6 +4546,41 @@ public void testTrimUnsafeCommits() throws Exception {
}
}

/**
* A simple test checks that the document history is maintained when a document is updated.
*/
public void testUpdateMaintainDocumentHistory() throws Exception {
final IndexMetaData indexMetaData = IndexMetaData.builder("test")
.settings(settings(Version.CURRENT).put(IndexSettings.INDEX_SOFT_DELETES_SETTING.getKey(), true))
.numberOfShards(1).numberOfReplicas(1).build();
final IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(indexMetaData);
final String docId = "doc-id";
final int versions = between(2, 20);
try (Store store = createStore();
InternalEngine engine = createEngine(indexSettings, store, createTempDir(), new LogDocMergePolicy())) {
final ParsedDocument doc = testParsedDocument(docId, null, testDocument(), new BytesArray("{}"), null);
for (int version = 1; version <= versions; version++) {
Engine.IndexResult indexResult = engine.index(indexForDoc(doc));
assertThat(indexResult.getFailure(), nullValue());
if (randomBoolean()) {
engine.flush();
}
}
engine.refresh("test");
try (Searcher searcher = engine.acquireSearcher("test", Engine.SearcherScope.INTERNAL)) {
assertThat(searcher.reader().numDocs(), equalTo(1));
assertThat(searcher.reader().numDeletedDocs(), equalTo(versions - 1));
try (Engine.GetResult get = engine.get(new Engine.Get(true, false, doc.type(), doc.id(), newUid(doc)), engine::acquireSearcher)) {
assertThat((int) get.version(), equalTo(versions));
}
// Use a reader which includes soft-deleted documents to verify history.
final UnfilteredReader unfilteredReader = new UnfilteredReader(searcher.getDirectoryReader());
assertThat(unfilteredReader.numDocs(), equalTo(versions));
assertThat(unfilteredReader.numDeletedDocs(), equalTo(0));
}
}
}

private static void trimUnsafeCommits(EngineConfig config) throws IOException {
final Store store = config.getStore();
final TranslogConfig translogConfig = config.getTranslogConfig();
Expand All @@ -4555,4 +4599,50 @@ void assertLuceneOperations(InternalEngine engine, long expectedAppends, long ex
assertThat(message, engine.getNumDocUpdates(), equalTo(expectedUpdates));
assertThat(message, engine.getNumDocDeletes(), equalTo(expectedDeletes));
}

/**
* A reader that does not exclude soft-deleted documents.
*/
static final class UnfilteredReader extends FilterDirectoryReader {
static final class UnfilteredSubReaderWrapper extends SubReaderWrapper {
@Override
public LeafReader wrap(LeafReader in) {
return new FilterLeafReader(in) {
@Override
public CacheHelper getCoreCacheHelper() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this can return in.getCoreCacheHelper();

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can just move this entire thing to a util method in Lucene.java then we can reuse it in other places WDYT? @martijnvg you need this too somewhere right?

Copy link
Member Author

@dnhatn dnhatn Apr 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed the test and this wrapper in this PR, and will add it back to Lucene.java later. I prefer to keep this change as a cut-over.

return null;
}

@Override
public CacheHelper getReaderCacheHelper() {
return null;
}

@Override
public int numDocs() {
return maxDoc();
}

@Override
public Bits getLiveDocs() {
return null;
}
};
}
}

UnfilteredReader(DirectoryReader in) throws IOException {
super(in, new UnfilteredSubReaderWrapper());
}

@Override
protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
return new UnfilteredReader(in);
}

@Override
public CacheHelper getReaderCacheHelper() {
return null; // we are modifying live docs.
}
}
}