diff --git a/x-pack/plugin/old-lucene-versions/build.gradle b/x-pack/plugin/old-lucene-versions/build.gradle index 6e9e38c1f190f..22ab9d7bf24ce 100644 --- a/x-pack/plugin/old-lucene-versions/build.gradle +++ b/x-pack/plugin/old-lucene-versions/build.gradle @@ -13,6 +13,4 @@ dependencies { compileOnly project(path: xpackModule('core')) } -test.enabled = false - addQaCheckDependencies() diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/OldLuceneVersions.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/OldLuceneVersions.java index 74b8620cc7af9..40b021f9ea9d8 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/OldLuceneVersions.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/OldLuceneVersions.java @@ -71,6 +71,8 @@ private static void convertToNewFormat(IndexShard indexShard) { private static SegmentInfos convertToNewerLuceneVersion(OldSegmentInfos oldSegmentInfos) { final SegmentInfos segmentInfos = new SegmentInfos(org.apache.lucene.util.Version.LATEST.major); + segmentInfos.version = oldSegmentInfos.version; + segmentInfos.counter = oldSegmentInfos.counter; segmentInfos.setNextWriteGeneration(oldSegmentInfos.getGeneration() + 1); final Map map = new HashMap<>(oldSegmentInfos.getUserData()); if (map.containsKey(Engine.HISTORY_UUID_KEY) == false) { @@ -85,21 +87,21 @@ private static SegmentInfos convertToNewerLuceneVersion(OldSegmentInfos oldSegme if (map.containsKey(Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID) == false) { map.put(Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, "-1"); } - segmentInfos.setUserData(map, true); + segmentInfos.setUserData(map, false); for (SegmentCommitInfo infoPerCommit : oldSegmentInfos.asList()) { final SegmentInfo newInfo = BWCCodec.wrap(infoPerCommit.info); - - segmentInfos.add( - new SegmentCommitInfo( - newInfo, - infoPerCommit.getDelCount(), - infoPerCommit.getSoftDelCount(), - infoPerCommit.getDelGen(), - infoPerCommit.getFieldInfosGen(), - infoPerCommit.getDocValuesGen(), - infoPerCommit.getId() - ) + final SegmentCommitInfo commitInfo = new SegmentCommitInfo( + newInfo, + infoPerCommit.getDelCount(), + infoPerCommit.getSoftDelCount(), + infoPerCommit.getDelGen(), + infoPerCommit.getFieldInfosGen(), + infoPerCommit.getDocValuesGen(), + infoPerCommit.getId() ); + commitInfo.setDocValuesUpdatesFiles(infoPerCommit.getDocValuesUpdatesFiles()); + commitInfo.setFieldInfosFiles(infoPerCommit.getFieldInfosFiles()); + segmentInfos.add(commitInfo); } return segmentInfos; } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java index ee31589514736..b350f6a62404f 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/BWCCodec.java @@ -9,7 +9,6 @@ import org.apache.lucene.backward_codecs.lucene70.Lucene70Codec; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; @@ -20,7 +19,6 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import 
org.apache.lucene.codecs.TermVectorsFormat; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; @@ -31,7 +29,6 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; -import org.elasticsearch.index.mapper.SeqNoFieldMapper; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene70.BWCLucene70Codec; import java.io.IOException; @@ -60,11 +57,6 @@ public NormsFormat normsFormat() { throw new UnsupportedOperationException(); } - @Override - public DocValuesFormat docValuesFormat() { - throw new UnsupportedOperationException(); - } - @Override public TermVectorsFormat termVectorsFormat() { throw new UnsupportedOperationException(); @@ -166,14 +158,10 @@ public void write(Directory directory, SegmentInfo segmentInfo, String segmentSu }; } - // mark all fields as having no postings, no doc values, and no points. + // mark all fields as having no postings, no term vectors, no norms, no payloads, no points, and no vectors. private static FieldInfos filterFields(FieldInfos fieldInfos) { List fieldInfoCopy = new ArrayList<>(fieldInfos.size()); for (FieldInfo fieldInfo : fieldInfos) { - // omit sequence number field so that it doesn't interfere with peer recovery - if (fieldInfo.name.equals(SeqNoFieldMapper.NAME)) { - continue; - } fieldInfoCopy.add( new FieldInfo( fieldInfo.name, @@ -182,8 +170,8 @@ private static FieldInfos filterFields(FieldInfos fieldInfos) { false, false, IndexOptions.NONE, - DocValuesType.NONE, - -1, + fieldInfo.getDocValuesType(), + fieldInfo.getDocValuesGen(), fieldInfo.attributes(), 0, 0, diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyBinaryDocValues.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyBinaryDocValues.java new file mode 100644 index 0000000000000..a7b1724fa24f0 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyBinaryDocValues.java @@ -0,0 +1,42 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.util.BytesRef; + +/** + * A per-document byte[] + * + * @deprecated Use {@link BinaryDocValues} instead. + */ +@Deprecated +public abstract class LegacyBinaryDocValues { + + /** Sole constructor. (For invocation by subclass + * constructors, typically implicit.) 
*/ + protected LegacyBinaryDocValues() {} + + /** Lookup the value for document. The returned {@link BytesRef} may be + * re-used across calls to {@link #get(int)} so make sure to + * {@link BytesRef#deepCopyOf(BytesRef) copy it} if you want to keep it + * around. */ + public abstract BytesRef get(int docID); +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyBinaryDocValuesWrapper.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyBinaryDocValuesWrapper.java new file mode 100644 index 0000000000000..8dbacf889172e --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyBinaryDocValuesWrapper.java @@ -0,0 +1,93 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** + * Wraps a {@link LegacyBinaryDocValues} into a {@link BinaryDocValues}. + * + * @deprecated Implement {@link BinaryDocValues} directly. 
+ */ +@Deprecated +public final class LegacyBinaryDocValuesWrapper extends BinaryDocValues { + private final Bits docsWithField; + private final LegacyBinaryDocValues values; + private final int maxDoc; + private int docID = -1; + + public LegacyBinaryDocValuesWrapper(Bits docsWithField, LegacyBinaryDocValues values) { + this.docsWithField = docsWithField; + this.values = values; + this.maxDoc = docsWithField.length(); + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + docID++; + while (docID < maxDoc) { + if (docsWithField.get(docID)) { + return docID; + } + docID++; + } + docID = NO_MORE_DOCS; + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + if (target < docID) { + throw new IllegalArgumentException("cannot advance backwards: docID=" + docID + " target=" + target); + } + if (target == NO_MORE_DOCS) { + this.docID = NO_MORE_DOCS; + } else { + this.docID = target - 1; + nextDoc(); + } + return docID; + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + return docsWithField.get(target); + } + + @Override + public long cost() { + return 0; + } + + @Override + public BytesRef binaryValue() { + return values.get(docID); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyDocValuesIterables.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyDocValuesIterables.java new file mode 100644 index 0000000000000..5a9b1bb252308 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyDocValuesIterables.java @@ -0,0 +1,539 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; +import java.util.Iterator; + +/** Bridge helper methods for legacy codecs to map sorted doc values to iterables. */ + +public class LegacyDocValuesIterables { + + private LegacyDocValuesIterables() { + // no + } + + /** Converts {@link SortedDocValues} into an {@code Iterable<BytesRef>} for all the values. 
+ * + * @deprecated Consume {@link SortedDocValues} instead. */ + @Deprecated + public static Iterable valuesIterable(final SortedDocValues values) { + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + private int nextOrd; + + @Override + public boolean hasNext() { + return nextOrd < values.getValueCount(); + } + + @Override + public BytesRef next() { + try { + return values.lookupOrd(nextOrd++); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + }; + } + + /** Converts {@link SortedSetDocValues} into an {@code Iterable<BytesRef>} for all the values. + * + * @deprecated Consume {@link SortedSetDocValues} instead. */ + @Deprecated + public static Iterable valuesIterable(final SortedSetDocValues values) { + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + private long nextOrd; + + @Override + public boolean hasNext() { + return nextOrd < values.getValueCount(); + } + + @Override + public BytesRef next() { + try { + return values.lookupOrd(nextOrd++); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + }; + } + + /** Converts {@link SortedDocValues} into the ord for each document as an {@code Iterable<Number>}. + * + * @deprecated Consume {@link SortedDocValues} instead. */ + @Deprecated + public static Iterable sortedOrdIterable(final DocValuesProducer valuesProducer, FieldInfo fieldInfo, int maxDoc) { + return new Iterable() { + @Override + public Iterator iterator() { + + final SortedDocValues values; + try { + values = valuesProducer.getSorted(fieldInfo); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private int nextDocID; + + @Override + public boolean hasNext() { + return nextDocID < maxDoc; + } + + @Override + public Number next() { + try { + if (nextDocID > values.docID()) { + values.nextDoc(); + } + int result; + if (nextDocID == values.docID()) { + result = values.ordValue(); + } else { + result = -1; + } + nextDocID++; + return result; + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + }; + } + }; + } + + /** Converts number-of-ords per document from {@link SortedSetDocValues} into {@code Iterable<Number>}. + * + * @deprecated Consume {@link SortedSetDocValues} instead. */ + @Deprecated + public static Iterable sortedSetOrdCountIterable( + final DocValuesProducer valuesProducer, + final FieldInfo fieldInfo, + final int maxDoc + ) { + + return new Iterable() { + + @Override + public Iterator iterator() { + + final SortedSetDocValues values; + try { + values = valuesProducer.getSortedSet(fieldInfo); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private int nextDocID; + private int ordCount; + + @Override + public boolean hasNext() { + return nextDocID < maxDoc; + } + + @Override + public Number next() { + try { + if (nextDocID > values.docID()) { + if (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + ordCount = 0; + while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) { + ordCount++; + } + } + } + int result; + if (nextDocID == values.docID()) { + result = ordCount; + } else { + result = 0; + } + nextDocID++; + return result; + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + }; + } + }; + } + + /** Converts all concatenated ords (in docID order) from {@link SortedSetDocValues} into {@code Iterable<Number>}. + * + * @deprecated Consume {@link SortedSetDocValues} instead. 
*/ + @Deprecated + public static Iterable sortedSetOrdsIterable(final DocValuesProducer valuesProducer, final FieldInfo fieldInfo) { + + return new Iterable() { + + @Override + public Iterator iterator() { + + final SortedSetDocValues values; + try { + values = valuesProducer.getSortedSet(fieldInfo); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private boolean nextIsSet; + private long nextOrd; + + private void setNext() { + try { + if (nextIsSet == false) { + if (values.docID() == -1) { + values.nextDoc(); + } + while (true) { + if (values.docID() == DocIdSetIterator.NO_MORE_DOCS) { + nextOrd = -1; + break; + } + nextOrd = values.nextOrd(); + if (nextOrd != -1) { + break; + } + values.nextDoc(); + } + nextIsSet = true; + } + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + + @Override + public boolean hasNext() { + setNext(); + return nextOrd != -1; + } + + @Override + public Number next() { + setNext(); + assert nextOrd != -1; + nextIsSet = false; + return nextOrd; + } + }; + } + }; + } + + /** Converts number-of-values per document from {@link SortedNumericDocValues} into {@code Iterable<Number>}. + * + * @deprecated Consume {@link SortedDocValues} instead. */ + @Deprecated + public static Iterable sortedNumericToDocCount(final DocValuesProducer valuesProducer, final FieldInfo fieldInfo, int maxDoc) { + return new Iterable() { + + @Override + public Iterator iterator() { + + final SortedNumericDocValues values; + try { + values = valuesProducer.getSortedNumeric(fieldInfo); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private int nextDocID; + + @Override + public boolean hasNext() { + return nextDocID < maxDoc; + } + + @Override + public Number next() { + try { + if (nextDocID > values.docID()) { + values.nextDoc(); + } + int result; + if (nextDocID == values.docID()) { + result = values.docValueCount(); + } else { + result = 0; + } + nextDocID++; + return result; + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + }; + } + }; + } + + /** Converts all concatenated values (in docID order) from {@link SortedNumericDocValues} into {@code Iterable<Number>}. + * + * @deprecated Consume {@link SortedDocValues} instead. */ + @Deprecated + public static Iterable sortedNumericToValues(final DocValuesProducer valuesProducer, final FieldInfo fieldInfo) { + return new Iterable() { + + @Override + public Iterator iterator() { + + final SortedNumericDocValues values; + try { + values = valuesProducer.getSortedNumeric(fieldInfo); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private boolean nextIsSet; + private int nextCount; + private int upto; + private long nextValue; + + private void setNext() { + try { + if (nextIsSet == false) { + if (upto == nextCount) { + values.nextDoc(); + if (values.docID() == DocIdSetIterator.NO_MORE_DOCS) { + nextCount = 0; + nextIsSet = false; + return; + } else { + nextCount = values.docValueCount(); + } + upto = 0; + } + nextValue = values.nextValue(); + upto++; + nextIsSet = true; + } + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + + @Override + public boolean hasNext() { + setNext(); + return nextCount != 0; + } + + @Override + public Number next() { + setNext(); + assert nextCount != 0; + nextIsSet = false; + return nextValue; + } + }; + } + }; + } + + /** Converts norms into {@code Iterable<Number>}. 
+ * + * @deprecated Consume {@link NumericDocValues} instead. */ + @Deprecated + public static Iterable normsIterable(final FieldInfo field, final NormsProducer normsProducer, final int maxDoc) { + + return new Iterable() { + + @Override + public Iterator iterator() { + + final NumericDocValues values; + try { + values = normsProducer.getNorms(field); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private int docIDUpto = -1; + + @Override + public boolean hasNext() { + return docIDUpto + 1 < maxDoc; + } + + @Override + public Number next() { + docIDUpto++; + if (docIDUpto > values.docID()) { + try { + values.nextDoc(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + Number result; + if (docIDUpto == values.docID()) { + try { + result = values.longValue(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } else { + // Unlike NumericDocValues, norms used to return 0 for missing values: + result = 0; + } + return result; + } + }; + } + }; + } + + /** Converts values from {@link BinaryDocValues} into {@code Iterable<BytesRef>}. + * + * @deprecated Consume {@link BinaryDocValues} instead. */ + @Deprecated + public static Iterable binaryIterable(final FieldInfo field, final DocValuesProducer valuesProducer, final int maxDoc) { + return new Iterable() { + @Override + public Iterator iterator() { + + final BinaryDocValues values; + try { + values = valuesProducer.getBinary(field); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private int docIDUpto = -1; + + @Override + public boolean hasNext() { + return docIDUpto + 1 < maxDoc; + } + + @Override + public BytesRef next() { + docIDUpto++; + if (docIDUpto > values.docID()) { + try { + values.nextDoc(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + BytesRef result; + if (docIDUpto == values.docID()) { + try { + result = values.binaryValue(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } else { + result = null; + } + return result; + } + }; + } + }; + } + + /** Converts values from {@link NumericDocValues} into {@code Iterable<Number>}. + * + * @deprecated Consume {@link NumericDocValues} instead. 
*/ + @Deprecated + public static Iterable numericIterable(final FieldInfo field, final DocValuesProducer valuesProducer, final int maxDoc) { + return new Iterable() { + @Override + public Iterator iterator() { + + final NumericDocValues values; + try { + values = valuesProducer.getNumeric(field); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + + return new Iterator() { + private int docIDUpto = -1; + + @Override + public boolean hasNext() { + return docIDUpto + 1 < maxDoc; + } + + @Override + public Number next() { + docIDUpto++; + if (docIDUpto > values.docID()) { + try { + values.nextDoc(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + Number result; + if (docIDUpto == values.docID()) { + try { + result = values.longValue(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } else { + result = null; + } + return result; + } + }; + } + }; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyNumericDocValues.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyNumericDocValues.java new file mode 100644 index 0000000000000..252bc152ffdd2 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyNumericDocValues.java @@ -0,0 +1,42 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.NumericDocValues; + +/** + * A per-document numeric value. + * + * @deprecated Use {@link NumericDocValues} instead. + */ +@Deprecated +public abstract class LegacyNumericDocValues { + + /** Sole constructor. (For invocation by subclass + * constructors, typically implicit.) */ + protected LegacyNumericDocValues() {} + + /** + * Returns the numeric value for the specified document ID. + * @param docID document ID to lookup + * @return numeric value + */ + public abstract long get(int docID); +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyNumericDocValuesWrapper.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyNumericDocValuesWrapper.java new file mode 100644 index 0000000000000..f677c7b39bac7 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacyNumericDocValuesWrapper.java @@ -0,0 +1,99 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.util.Bits; + +import java.io.IOException; + +/** + * Wraps a {@link LegacyNumericDocValues} into a {@link NumericDocValues}. + * + * @deprecated Implement {@link NumericDocValues} directly. + */ +@Deprecated +public final class LegacyNumericDocValuesWrapper extends NumericDocValues { + private final Bits docsWithField; + private final LegacyNumericDocValues values; + private final int maxDoc; + private int docID = -1; + private long value; + + public LegacyNumericDocValuesWrapper(Bits docsWithField, LegacyNumericDocValues values) { + this.docsWithField = docsWithField; + this.values = values; + this.maxDoc = docsWithField.length(); + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + docID++; + while (docID < maxDoc) { + value = values.get(docID); + if (value != 0 || docsWithField.get(docID)) { + return docID; + } + docID++; + } + docID = NO_MORE_DOCS; + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + assert target >= docID : "target=" + target + " docID=" + docID; + if (target == NO_MORE_DOCS) { + this.docID = NO_MORE_DOCS; + } else { + this.docID = target - 1; + nextDoc(); + } + return docID; + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + value = values.get(docID); + return value != 0 || docsWithField.get(docID); + } + + @Override + public long cost() { + // TODO + return 0; + } + + @Override + public long longValue() { + return value; + } + + @Override + public String toString() { + return "LegacyNumericDocValuesWrapper(" + values + ")"; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedDocValues.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedDocValues.java new file mode 100644 index 0000000000000..5effaa9284540 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedDocValues.java @@ -0,0 +1,114 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; + +/** + * A per-document byte[] with presorted values. + *
<p>
+ * Per-Document values in a SortedDocValues are deduplicated, dereferenced, + * and sorted into a dictionary of unique values. A pointer to the + * dictionary value (ordinal) can be retrieved for each document. Ordinals + * are dense and in increasing sorted order. + * + * @deprecated Use {@link SortedDocValues} instead. + */ +@Deprecated +public abstract class LegacySortedDocValues extends LegacyBinaryDocValues { + + /** Sole constructor. (For invocation by subclass + * constructors, typically implicit.) */ + protected LegacySortedDocValues() {} + + /** + * Returns the ordinal for the specified docID. + * @param docID document ID to lookup + * @return ordinal for the document: this is dense, starts at 0, then + * increments by 1 for the next value in sorted order. Note that + * missing values are indicated by -1. + */ + public abstract int getOrd(int docID); + + /** Retrieves the value for the specified ordinal. The returned + * {@link BytesRef} may be re-used across calls to {@link #lookupOrd(int)} + * so make sure to {@link BytesRef#deepCopyOf(BytesRef) copy it} if you want + * to keep it around. + * @param ord ordinal to lookup (must be >= 0 and < {@link #getValueCount()}) + * @see #getOrd(int) + */ + public abstract BytesRef lookupOrd(int ord); + + /** + * Returns the number of unique values. + * @return number of unique values in this SortedDocValues. This is + * also equivalent to one plus the maximum ordinal. + */ + public abstract int getValueCount(); + + private final BytesRef empty = new BytesRef(); + + @Override + public BytesRef get(int docID) { + int ord = getOrd(docID); + if (ord == -1) { + return empty; + } else { + return lookupOrd(ord); + } + } + + /** If {@code key} exists, returns its ordinal, else + * returns {@code -insertionPoint-1}, like {@code + * Arrays.binarySearch}. + * + * @param key Key to look up + **/ + public int lookupTerm(BytesRef key) { + int low = 0; + int high = getValueCount() - 1; + + while (low <= high) { + int mid = (low + high) >>> 1; + final BytesRef term = lookupOrd(mid); + int cmp = term.compareTo(key); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; // key found + } + } + + return -(low + 1); // key not found. + } + + /** + * Returns a {@link TermsEnum} over the values. + * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}. + */ + public TermsEnum termsEnum() { + throw new UnsupportedOperationException(); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedDocValuesWrapper.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedDocValuesWrapper.java new file mode 100644 index 0000000000000..592ae3846885a --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedDocValuesWrapper.java @@ -0,0 +1,104 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** + * Wraps a {@link LegacySortedDocValues} into a {@link SortedDocValues}. + * + * @deprecated Implement {@link SortedDocValues} directly. + */ +@Deprecated +public final class LegacySortedDocValuesWrapper extends SortedDocValues { + private final LegacySortedDocValues values; + private final int maxDoc; + private int docID = -1; + private int ord; + + public LegacySortedDocValuesWrapper(LegacySortedDocValues values, int maxDoc) { + this.values = values; + this.maxDoc = maxDoc; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + assert docID != NO_MORE_DOCS; + docID++; + while (docID < maxDoc) { + ord = values.getOrd(docID); + if (ord != -1) { + return docID; + } + docID++; + } + docID = NO_MORE_DOCS; + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + if (target < docID) { + throw new IllegalArgumentException("cannot advance backwards: docID=" + docID + " target=" + target); + } + if (target >= maxDoc) { + this.docID = NO_MORE_DOCS; + } else { + this.docID = target - 1; + nextDoc(); + } + return docID; + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + ord = values.getOrd(docID); + return ord != -1; + } + + @Override + public long cost() { + return 0; + } + + @Override + public int ordValue() { + return ord; + } + + @Override + public BytesRef lookupOrd(int ord) { + return values.lookupOrd(ord); + } + + @Override + public int getValueCount() { + return values.getValueCount(); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedNumericDocValues.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedNumericDocValues.java new file mode 100644 index 0000000000000..fd528e8a6cddf --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedNumericDocValues.java @@ -0,0 +1,53 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.SortedNumericDocValues; + +/** + * A list of per-document numeric values, sorted + * according to {@link Long#compare(long, long)}. + * + * @deprecated Use {@link SortedNumericDocValues} instead. + */ +@Deprecated +public abstract class LegacySortedNumericDocValues { + + /** Sole constructor. (For invocation by subclass + * constructors, typically implicit.) */ + protected LegacySortedNumericDocValues() {} + + /** + * Positions to the specified document + */ + public abstract void setDocument(int doc); + + /** + * Retrieve the value for the current document at the specified index. + * An index ranges from {@code 0} to {@code count()-1}. + */ + public abstract long valueAt(int index); + + /** + * Retrieves the count of values for the current document. + * This may be zero if a document has no values. + */ + public abstract int count(); +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedNumericDocValuesWrapper.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedNumericDocValuesWrapper.java new file mode 100644 index 0000000000000..af6b38daf3dbc --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedNumericDocValuesWrapper.java @@ -0,0 +1,102 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.SortedNumericDocValues; + +import java.io.IOException; + +/** + * Wraps a {@link LegacySortedNumericDocValues} into a {@link SortedNumericDocValues}. + * + * @deprecated Implement {@link SortedNumericDocValues} directly. 
+ */ +@Deprecated +public final class LegacySortedNumericDocValuesWrapper extends SortedNumericDocValues { + private final LegacySortedNumericDocValues values; + private final int maxDoc; + private int docID = -1; + private int upto; + + public LegacySortedNumericDocValuesWrapper(LegacySortedNumericDocValues values, int maxDoc) { + this.values = values; + this.maxDoc = maxDoc; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + assert docID != NO_MORE_DOCS; + while (true) { + docID++; + if (docID == maxDoc) { + docID = NO_MORE_DOCS; + break; + } + values.setDocument(docID); + if (values.count() != 0) { + break; + } + } + upto = 0; + return docID; + } + + @Override + public int advance(int target) { + if (target < docID) { + throw new IllegalArgumentException("cannot advance backwards: docID=" + docID + " target=" + target); + } + if (target >= maxDoc) { + docID = NO_MORE_DOCS; + } else { + docID = target - 1; + nextDoc(); + } + return docID; + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + values.setDocument(docID); + upto = 0; + return values.count() != 0; + } + + @Override + public long cost() { + return 0; + } + + @Override + public long nextValue() { + return values.valueAt(upto++); + } + + @Override + public int docValueCount() { + return values.count(); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedSetDocValues.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedSetDocValues.java new file mode 100644 index 0000000000000..3af500bcf166e --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedSetDocValues.java @@ -0,0 +1,115 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** + * A per-document set of presorted byte[] values. + *
<p>
+ * Per-Document values in a SortedDocValues are deduplicated, dereferenced, + * and sorted into a dictionary of unique values. A pointer to the + * dictionary value (ordinal) can be retrieved for each document. Ordinals + * are dense and in increasing sorted order. + * + * @deprecated Use {@link SortedSetDocValues} instead. + */ +@Deprecated +public abstract class LegacySortedSetDocValues { + + /** Sole constructor. (For invocation by subclass + * constructors, typically implicit.) */ + protected LegacySortedSetDocValues() {} + + /** When returned by {@link #nextOrd()} it means there are no more + * ordinals for the document. + */ + public static final long NO_MORE_ORDS = -1; + + /** + * Returns the next ordinal for the current document (previously + * set by {@link #setDocument(int)}. + * @return next ordinal for the document, or {@link #NO_MORE_ORDS}. + * ordinals are dense, start at 0, then increment by 1 for + * the next value in sorted order. + */ + public abstract long nextOrd(); + + /** + * Sets iteration to the specified docID + * @param docID document ID + */ + public abstract void setDocument(int docID); + + /** Retrieves the value for the specified ordinal. The returned + * {@link BytesRef} may be re-used across calls to lookupOrd so make sure to + * {@link BytesRef#deepCopyOf(BytesRef) copy it} if you want to keep it + * around. + * @param ord ordinal to lookup + * @see #nextOrd + */ + public abstract BytesRef lookupOrd(long ord); + + /** + * Returns the number of unique values. + * @return number of unique values in this SortedDocValues. This is + * also equivalent to one plus the maximum ordinal. + */ + public abstract long getValueCount(); + + /** If {@code key} exists, returns its ordinal, else + * returns {@code -insertionPoint-1}, like {@code + * Arrays.binarySearch}. + * + * @param key Key to look up + **/ + public long lookupTerm(BytesRef key) { + long low = 0; + long high = getValueCount() - 1; + + while (low <= high) { + long mid = (low + high) >>> 1; + final BytesRef term = lookupOrd(mid); + int cmp = term.compareTo(key); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; // key found + } + } + + return -(low + 1); // key not found. + } + + /** + * Returns a {@link TermsEnum} over the values. + * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}. + */ + public TermsEnum termsEnum() throws IOException { + throw new UnsupportedOperationException(); + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedSetDocValuesWrapper.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedSetDocValuesWrapper.java new file mode 100644 index 0000000000000..272929346448d --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/index/LegacySortedSetDocValuesWrapper.java @@ -0,0 +1,115 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.index; + +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +/** + * Wraps a {@link LegacySortedSetDocValues} into a {@link SortedSetDocValues}. + * + * @deprecated Implement {@link SortedSetDocValues} directly. + */ +@Deprecated +public final class LegacySortedSetDocValuesWrapper extends SortedSetDocValues { + private final LegacySortedSetDocValues values; + private final int maxDoc; + private int docID = -1; + private long ord; + + public LegacySortedSetDocValuesWrapper(LegacySortedSetDocValues values, int maxDoc) { + this.values = values; + this.maxDoc = maxDoc; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + assert docID != NO_MORE_DOCS; + docID++; + while (docID < maxDoc) { + values.setDocument(docID); + ord = values.nextOrd(); + if (ord != NO_MORE_ORDS) { + return docID; + } + docID++; + } + docID = NO_MORE_DOCS; + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + if (target < docID) { + throw new IllegalArgumentException("cannot advance backwards: docID=" + docID + " target=" + target); + } + if (target >= maxDoc) { + this.docID = NO_MORE_DOCS; + } else { + this.docID = target - 1; + nextDoc(); + } + return docID; + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + values.setDocument(docID); + ord = values.nextOrd(); + return ord != NO_MORE_ORDS; + } + + @Override + public long cost() { + return 0; + } + + @Override + public long nextOrd() { + long result = ord; + if (result != NO_MORE_ORDS) { + ord = values.nextOrd(); + } + return result; + } + + @Override + public BytesRef lookupOrd(long ord) { + return values.lookupOrd((int) ord); + } + + @Override + public long getValueCount() { + return values.getValueCount(); + } + + @Override + public String toString() { + return "LegacySortedSetDocValuesWrapper(" + values + ")"; + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java new file mode 100644 index 0000000000000..50e5cde04ead3 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/LegacyStringHelper.java @@ -0,0 +1,72 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene54; + +import org.apache.lucene.util.BytesRef; + +/** + * Legacy methods for manipulating strings. + * + * @lucene.internal + * @deprecated This is only used for backwards compatibility codecs (they + * don't work with the Java9-based replacement methods). + */ +@Deprecated +abstract class LegacyStringHelper { + + /** + * Compares two {@link BytesRef}, element by element, and returns the + * number of elements common to both arrays (from the start of each). + * + * @param left The first {@link BytesRef} to compare + * @param right The second {@link BytesRef} to compare + * @return The number of common elements (from the start of each). + */ + public static int bytesDifference(BytesRef left, BytesRef right) { + int len = left.length < right.length ? left.length : right.length; + final byte[] bytesLeft = left.bytes; + final int offLeft = left.offset; + byte[] bytesRight = right.bytes; + final int offRight = right.offset; + for (int i = 0; i < len; i++) + if (bytesLeft[i + offLeft] != bytesRight[i + offRight]) return i; + return len; + } + + /** + * Returns the length of {@code currentTerm} needed for use as a sort key. + * so that {@link BytesRef#compareTo(BytesRef)} still returns the same result. + * This method assumes currentTerm comes after priorTerm. + */ + public static int sortKeyLength(final BytesRef priorTerm, final BytesRef currentTerm) { + final int currentTermOffset = currentTerm.offset; + final int priorTermOffset = priorTerm.offset; + final int limit = Math.min(priorTerm.length, currentTerm.length); + for (int i = 0; i < limit; i++) { + if (priorTerm.bytes[priorTermOffset + i] != currentTerm.bytes[currentTermOffset + i]) { + return i + 1; + } + } + return Math.min(1 + priorTerm.length, currentTerm.length); + } + + private LegacyStringHelper() {} + +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesConsumer.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesConsumer.java new file mode 100644 index 0000000000000..fe8d24262f056 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesConsumer.java @@ -0,0 +1,842 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. + */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene54; + +import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicWriter; +import org.apache.lucene.backward_codecs.packed.LegacyDirectWriter; +import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.LongsRef; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.xpack.lucene.bwc.codecs.index.LegacyDocValuesIterables; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.stream.StreamSupport; + +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.ALL_LIVE; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.ALL_MISSING; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.BINARY_FIXED_UNCOMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.BINARY_PREFIX_COMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.BINARY_VARIABLE_UNCOMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.CONST_COMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.DELTA_COMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.GCD_COMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.INTERVAL_COUNT; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.INTERVAL_MASK; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.INTERVAL_SHIFT; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.MONOTONIC_BLOCK_SIZE; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.MONOTONIC_COMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.REVERSE_INTERVAL_COUNT; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.REVERSE_INTERVAL_MASK; +import static 
org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.SORTED_SET_TABLE; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.SORTED_SINGLE_VALUED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.SORTED_WITH_ADDRESSES; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.SPARSE_COMPRESSED; +import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat.TABLE_COMPRESSED; + +/** writer for {@link Lucene54DocValuesFormat} */ +final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Closeable { + + enum NumberType { + /** Dense ordinals */ + ORDINAL, + /** Random long values */ + VALUE; + } + + IndexOutput data, meta; + final int maxDoc; + + /** expert: Creates a new writer */ + Lucene54DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) + throws IOException { + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = EndiannessReverserUtil.createOutput(state.directory, dataName, state.context); + CodecUtil.writeIndexHeader( + data, + dataCodec, + Lucene54DocValuesFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = EndiannessReverserUtil.createOutput(state.directory, metaName, state.context); + CodecUtil.writeIndexHeader( + meta, + metaCodec, + Lucene54DocValuesFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + maxDoc = state.segmentInfo.maxDoc(); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { + addNumericField(field, LegacyDocValuesIterables.numericIterable(field, valuesProducer, maxDoc), NumberType.VALUE); + } + + void addNumericField(FieldInfo field, Iterable values, NumberType numberType) throws IOException { + long count = 0; + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + long gcd = 0; + long missingCount = 0; + long zeroCount = 0; + // TODO: more efficient? + HashSet uniqueValues = null; + long missingOrdCount = 0; + if (numberType == NumberType.VALUE) { + uniqueValues = new HashSet<>(); + + for (Number nv : values) { + final long v; + if (nv == null) { + v = 0; + missingCount++; + zeroCount++; + } else { + v = nv.longValue(); + if (v == 0) { + zeroCount++; + } + } + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. 
Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (count != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minValue); + } + } + + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + + if (uniqueValues != null) { + if (uniqueValues.add(v)) { + if (uniqueValues.size() > 256) { + uniqueValues = null; + } + } + } + + ++count; + } + } else { + for (Number nv : values) { + long v = nv.longValue(); + if (v == -1L) { + missingOrdCount++; + } + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + ++count; + } + } + + final long delta = maxValue - minValue; + final int deltaBitsRequired = LegacyDirectWriter.unsignedBitsRequired(delta); + final int tableBitsRequired = uniqueValues == null ? Integer.MAX_VALUE : LegacyDirectWriter.bitsRequired(uniqueValues.size() - 1); + + final boolean sparse; // 1% of docs or less have a value + switch (numberType) { + case VALUE: + sparse = (double) missingCount / count >= 0.99; + break; + case ORDINAL: + sparse = (double) missingOrdCount / count >= 0.99; + break; + default: + throw new AssertionError(); + } + + final int format; + if (uniqueValues != null + && count <= Integer.MAX_VALUE + && (uniqueValues.size() == 1 || (uniqueValues.size() == 2 && missingCount > 0 && zeroCount == missingCount))) { + // either one unique value C or two unique values: "missing" and C + format = CONST_COMPRESSED; + } else if (sparse && count >= 1024) { + // require at least 1024 docs to avoid flipping back and forth when doing NRT search + format = SPARSE_COMPRESSED; + } else if (uniqueValues != null && tableBitsRequired < deltaBitsRequired) { + format = TABLE_COMPRESSED; + } else if (gcd != 0 && gcd != 1) { + final long gcdDelta = (maxValue - minValue) / gcd; + final long gcdBitsRequired = LegacyDirectWriter.unsignedBitsRequired(gcdDelta); + format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED; + } else { + format = DELTA_COMPRESSED; + } + meta.writeVInt(field.number); + meta.writeByte(Lucene54DocValuesFormat.NUMERIC); + meta.writeVInt(format); + if (format == SPARSE_COMPRESSED) { + meta.writeLong(data.getFilePointer()); + final long numDocsWithValue; + switch (numberType) { + case VALUE: + numDocsWithValue = count - missingCount; + break; + case ORDINAL: + numDocsWithValue = count - missingOrdCount; + break; + default: + throw new AssertionError(); + } + final long maxDoc = writeSparseMissingBitset(values, numberType, numDocsWithValue); + assert maxDoc == count; + } else if (missingCount == 0) { + meta.writeLong(ALL_LIVE); + } else if (missingCount == count) { + meta.writeLong(ALL_MISSING); + } else { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } + meta.writeLong(data.getFilePointer()); + meta.writeVLong(count); + + switch (format) { + case CONST_COMPRESSED: + // write the constant (nonzero value in the n=2 case, singleton value otherwise) + meta.writeLong(minValue < 0 ? Collections.min(uniqueValues) : Collections.max(uniqueValues)); + break; + case GCD_COMPRESSED: + meta.writeLong(minValue); + meta.writeLong(gcd); + final long maxDelta = (maxValue - minValue) / gcd; + final int bits = LegacyDirectWriter.unsignedBitsRequired(maxDelta); + meta.writeVInt(bits); + final LegacyDirectWriter quotientWriter = LegacyDirectWriter.getInstance(data, count, bits); + for (Number nv : values) { + long value = nv == null ? 
0 : nv.longValue();
+                    quotientWriter.add((value - minValue) / gcd);
+                }
+                quotientWriter.finish();
+                break;
+            case DELTA_COMPRESSED:
+                final long minDelta = delta < 0 ? 0 : minValue;
+                meta.writeLong(minDelta);
+                meta.writeVInt(deltaBitsRequired);
+                final LegacyDirectWriter writer = LegacyDirectWriter.getInstance(data, count, deltaBitsRequired);
+                for (Number nv : values) {
+                    long v = nv == null ? 0 : nv.longValue();
+                    writer.add(v - minDelta);
+                }
+                writer.finish();
+                break;
+            case TABLE_COMPRESSED:
+                final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
+                Arrays.sort(decode);
+                final HashMap<Long, Integer> encode = new HashMap<>();
+                meta.writeVInt(decode.length);
+                for (int i = 0; i < decode.length; i++) {
+                    meta.writeLong(decode[i]);
+                    encode.put(decode[i], i);
+                }
+                meta.writeVInt(tableBitsRequired);
+                final LegacyDirectWriter ordsWriter = LegacyDirectWriter.getInstance(data, count, tableBitsRequired);
+                for (Number nv : values) {
+                    ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue()));
+                }
+                ordsWriter.finish();
+                break;
+            case SPARSE_COMPRESSED:
+                final Iterable<Number> filteredMissingValues;
+                switch (numberType) {
+                    case VALUE:
+                        meta.writeByte((byte) 0);
+                        filteredMissingValues = new Iterable<Number>() {
+                            @Override
+                            public Iterator<Number> iterator() {
+                                return StreamSupport.stream(values.spliterator(), false).filter(value -> value != null).iterator();
+                            }
+                        };
+                        break;
+                    case ORDINAL:
+                        meta.writeByte((byte) 1);
+                        filteredMissingValues = new Iterable<Number>() {
+                            @Override
+                            public Iterator<Number> iterator() {
+                                return StreamSupport.stream(values.spliterator(), false)
+                                    .filter(value -> value.longValue() != -1L)
+                                    .iterator();
+                            }
+                        };
+                        break;
+                    default:
+                        throw new AssertionError();
+                }
+                // Write non-missing values as a numeric field
+                addNumericField(field, filteredMissingValues, numberType);
+                break;
+            default:
+                throw new AssertionError();
+        }
+        meta.writeLong(data.getFilePointer());
+    }
+
+    // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
+    // but this is very simple, and algorithms only check this for values of 0 anyway (doesn't slow down normal decode)
+    void writeMissingBitset(Iterable<?> values) throws IOException {
+        byte bits = 0;
+        int count = 0;
+        for (Object v : values) {
+            if (count == 8) {
+                data.writeByte(bits);
+                count = 0;
+                bits = 0;
+            }
+            if (v != null) {
+                bits |= 1 << (count & 7);
+            }
+            count++;
+        }
+        if (count > 0) {
+            data.writeByte(bits);
+        }
+    }
+
+    long writeSparseMissingBitset(Iterable<Number> values, NumberType numberType, long numDocsWithValue) throws IOException {
+        meta.writeVLong(numDocsWithValue);
+
+        // Write doc IDs that have a value
+        meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
+        final LegacyDirectMonotonicWriter docIdsWriter = LegacyDirectMonotonicWriter.getInstance(
+            meta,
+            data,
+            numDocsWithValue,
+            DIRECT_MONOTONIC_BLOCK_SHIFT
+        );
+        long docID = 0;
+        for (Number nv : values) {
+            switch (numberType) {
+                case VALUE:
+                    if (nv != null) {
+                        docIdsWriter.add(docID);
+                    }
+                    break;
+                case ORDINAL:
+                    if (nv.longValue() != -1L) {
+                        docIdsWriter.add(docID);
+                    }
+                    break;
+                default:
+                    throw new AssertionError();
+            }
+            docID++;
+        }
+        docIdsWriter.finish();
+        return docID;
+    }
+
+    @Override
+    public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+        addBinaryField(field, LegacyDocValuesIterables.binaryIterable(field, valuesProducer, maxDoc));
+    }
+
+    private void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
+        // write the byte[] data
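+        // (layout note, from the code below: values are concatenated in document order; a missing value
+        // contributes zero bytes, and presence is recorded separately via the missing bitset.)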
meta.writeVInt(field.number); + meta.writeByte(Lucene54DocValuesFormat.BINARY); + int minLength = Integer.MAX_VALUE; + int maxLength = Integer.MIN_VALUE; + final long startFP = data.getFilePointer(); + long count = 0; + long missingCount = 0; + for (BytesRef v : values) { + final int length; + if (v == null) { + length = 0; + missingCount++; + } else { + length = v.length; + } + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } + count++; + } + meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED); + if (missingCount == 0) { + meta.writeLong(ALL_LIVE); + } else if (missingCount == count) { + meta.writeLong(ALL_MISSING); + } else { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } + meta.writeVInt(minLength); + meta.writeVInt(maxLength); + meta.writeVLong(count); + meta.writeLong(startFP); + + // if minLength == maxLength, it's a fixed-length byte[], we are done (the addresses are implicit) + // otherwise, we need to record the length fields... + if (minLength != maxLength) { + meta.writeLong(data.getFilePointer()); + meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT); + + final LegacyDirectMonotonicWriter writer = LegacyDirectMonotonicWriter.getInstance( + meta, + data, + count + 1, + DIRECT_MONOTONIC_BLOCK_SHIFT + ); + long addr = 0; + writer.add(addr); + for (BytesRef v : values) { + if (v != null) { + addr += v.length; + } + writer.add(addr); + } + writer.finish(); + meta.writeLong(data.getFilePointer()); + } + } + + /** expert: writes a value dictionary for a sorted/sortedset field */ + private void addTermsDict(FieldInfo field, final Iterable values) throws IOException { + // first check if it's a "fixed-length" terms dict, and compressibility if so + int minLength = Integer.MAX_VALUE; + int maxLength = Integer.MIN_VALUE; + long numValues = 0; + BytesRefBuilder previousValue = new BytesRefBuilder(); + long prefixSum = 0; // only valid for fixed-width data, as we have a choice there + for (BytesRef v : values) { + minLength = Math.min(minLength, v.length); + maxLength = Math.max(maxLength, v.length); + if (minLength == maxLength) { + int termPosition = (int) (numValues & INTERVAL_MASK); + if (termPosition == 0) { + // first term in block, save it away to compare against the last term later + previousValue.copyBytes(v); + } else if (termPosition == INTERVAL_COUNT - 1) { + // last term in block, accumulate shared prefix against first term + prefixSum += LegacyStringHelper.bytesDifference(previousValue.get(), v); + } + } + numValues++; + } + // for fixed width data, look at the avg(shared prefix) before deciding how to encode: + // prefix compression "costs" worst case 2 bytes per term because we must store suffix lengths. + // so if we share at least 3 bytes on average, always compress. 
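+        // (illustrative arithmetic: numValues >> INTERVAL_SHIFT is the number of complete 16-term blocks
+        // sampled above, so e.g. for 1024 fixed-width terms prefixSum <= 3 * 64 = 192 means the sampled
+        // terms share at most 3 prefix bytes on average, and the simpler direct encoding below wins.)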
+ if (minLength == maxLength && prefixSum <= 3 * (numValues >> INTERVAL_SHIFT)) { + // no index needed: not very compressible, direct addressing by mult + addBinaryField(field, values); + } else if (numValues < REVERSE_INTERVAL_COUNT) { + // low cardinality: waste a few KB of ram, but can't really use fancy index etc + addBinaryField(field, values); + } else { + assert numValues > 0; // we don't have to handle the empty case + // header + meta.writeVInt(field.number); + meta.writeByte(Lucene54DocValuesFormat.BINARY); + meta.writeVInt(BINARY_PREFIX_COMPRESSED); + meta.writeLong(-1L); + // now write the bytes: sharing prefixes within a block + final long startFP = data.getFilePointer(); + // currently, we have to store the delta from expected for every 1/nth term + // we could avoid this, but it's not much and less overall RAM than the previous approach! + ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput(); + MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, MONOTONIC_BLOCK_SIZE); + // buffers up 16 terms + ByteBuffersDataOutput bytesBuffer = new ByteBuffersDataOutput(); + // buffers up block header + ByteBuffersDataOutput headerBuffer = new ByteBuffersDataOutput(); + BytesRefBuilder lastTerm = new BytesRefBuilder(); + lastTerm.grow(maxLength); + long count = 0; + int suffixDeltas[] = new int[INTERVAL_COUNT]; + for (BytesRef v : values) { + int termPosition = (int) (count & INTERVAL_MASK); + if (termPosition == 0) { + termAddresses.add(data.getFilePointer() - startFP); + // abs-encode first term + headerBuffer.writeVInt(v.length); + headerBuffer.writeBytes(v.bytes, v.offset, v.length); + lastTerm.copyBytes(v); + } else { + // prefix-code: we only share at most 255 characters, to encode the length as a single + // byte and have random access. Larger terms just get less compression. + int sharedPrefix = Math.min(255, LegacyStringHelper.bytesDifference(lastTerm.get(), v)); + bytesBuffer.writeByte((byte) sharedPrefix); + bytesBuffer.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix); + // we can encode one smaller, because terms are unique. + suffixDeltas[termPosition] = v.length - sharedPrefix - 1; + } + + count++; + // flush block + if ((count & INTERVAL_MASK) == 0) { + flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas); + } + } + // flush trailing crap + int leftover = (int) (count & INTERVAL_MASK); + if (leftover > 0) { + Arrays.fill(suffixDeltas, leftover, suffixDeltas.length, 0); + flushTermsDictBlock(headerBuffer, bytesBuffer, suffixDeltas); + } + final long indexStartFP = data.getFilePointer(); + // write addresses of indexed terms + termAddresses.finish(); + addressBuffer.copyTo(data); + addressBuffer = null; + termAddresses = null; + meta.writeVInt(minLength); + meta.writeVInt(maxLength); + meta.writeVLong(count); + meta.writeLong(startFP); + meta.writeLong(indexStartFP); + meta.writeVInt(PackedInts.VERSION_MONOTONIC_WITHOUT_ZIGZAG); + meta.writeVInt(MONOTONIC_BLOCK_SIZE); + addReverseTermIndex(field, values, maxLength); + } + } + + // writes term dictionary "block" + // first term is absolute encoded as vint length + bytes. + // lengths of subsequent N terms are encoded as either N bytes or N shorts. + // in the double-byte case, the first byte is indicated with -1. + // subsequent terms are encoded as byte suffixLength + bytes. 
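+    // (worked example: for a full 16-term block the header is [vint length][first term bytes] followed by
+    // 15 suffix-delta entries; the sentinel byte 255, i.e. -1, switches those entries from bytes to shorts.)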
+    private void flushTermsDictBlock(ByteBuffersDataOutput headerBuffer, ByteBuffersDataOutput bytesBuffer, int suffixDeltas[])
+        throws IOException {
+        boolean twoByte = false;
+        for (int i = 1; i < suffixDeltas.length; i++) {
+            if (suffixDeltas[i] > 254) {
+                twoByte = true;
+            }
+        }
+        if (twoByte) {
+            headerBuffer.writeByte((byte) 255);
+            for (int i = 1; i < suffixDeltas.length; i++) {
+                headerBuffer.writeShort((short) suffixDeltas[i]);
+            }
+        } else {
+            for (int i = 1; i < suffixDeltas.length; i++) {
+                headerBuffer.writeByte((byte) suffixDeltas[i]);
+            }
+        }
+        headerBuffer.copyTo(data);
+        headerBuffer.reset();
+        bytesBuffer.copyTo(data);
+        bytesBuffer.reset();
+    }
+
+    // writes reverse term index: used for binary searching a term into a range of 64 blocks
+    // for every 64 blocks (1024 terms) we store a term, trimming any suffix unnecessary for comparison
+    // terms are written as a contiguous byte[], but never spanning 2^15 byte boundaries.
+    private void addReverseTermIndex(FieldInfo field, final Iterable<BytesRef> values, int maxLength) throws IOException {
+        long count = 0;
+        BytesRefBuilder priorTerm = new BytesRefBuilder();
+        priorTerm.grow(maxLength);
+        BytesRef indexTerm = new BytesRef();
+        long startFP = data.getFilePointer();
+        PagedBytes pagedBytes = new PagedBytes(15);
+        MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, MONOTONIC_BLOCK_SIZE);
+
+        for (BytesRef b : values) {
+            int termPosition = (int) (count & REVERSE_INTERVAL_MASK);
+            if (termPosition == 0) {
+                int len = LegacyStringHelper.sortKeyLength(priorTerm.get(), b);
+                indexTerm.bytes = b.bytes;
+                indexTerm.offset = b.offset;
+                indexTerm.length = len;
+                addresses.add(pagedBytes.copyUsingLengthPrefix(indexTerm));
+            } else if (termPosition == REVERSE_INTERVAL_MASK) {
+                priorTerm.copyBytes(b);
+            }
+            count++;
+        }
+        addresses.finish();
+        long numBytes = pagedBytes.getPointer();
+        pagedBytes.freeze(true);
+        PagedBytes.PagedBytesDataInput in = pagedBytes.getDataInput();
+        meta.writeLong(startFP);
+        data.writeVLong(numBytes);
+        data.copyBytes(in, numBytes);
+    }
+
+    @Override
+    public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+        meta.writeVInt(field.number);
+        meta.writeByte(Lucene54DocValuesFormat.SORTED);
+        addTermsDict(field, LegacyDocValuesIterables.valuesIterable(valuesProducer.getSorted(field)));
+        addNumericField(field, LegacyDocValuesIterables.sortedOrdIterable(valuesProducer, field, maxDoc), NumberType.ORDINAL);
+    }
+
+    private void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> ords) throws IOException {
+        meta.writeVInt(field.number);
+        meta.writeByte(Lucene54DocValuesFormat.SORTED);
+        addTermsDict(field, values);
+        addNumericField(field, ords, NumberType.ORDINAL);
+    }
+
+    @Override
+    public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException {
+
+        final Iterable<Number> docToValueCount = LegacyDocValuesIterables.sortedNumericToDocCount(valuesProducer, field, maxDoc);
+        final Iterable<Number> values = LegacyDocValuesIterables.sortedNumericToValues(valuesProducer, field);
+
+        meta.writeVInt(field.number);
+        meta.writeByte(Lucene54DocValuesFormat.SORTED_NUMERIC);
+        if (isSingleValued(docToValueCount)) {
+            meta.writeVInt(SORTED_SINGLE_VALUED);
+            // The field is single-valued, we can encode it as NUMERIC
+            addNumericField(field, singletonView(docToValueCount, values, null), NumberType.VALUE);
+        } else {
+            final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values);
+            if (uniqueValueSets != null) {
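+                // few distinct value sets (uniqueValueSets() caps the dictionary at 256 entries):
+                // store each distinct set once and encode every document as an ordinal into that table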
meta.writeVInt(SORTED_SET_TABLE);
+
+                // write the set_id -> values mapping
+                writeDictionary(uniqueValueSets);
+
+                // write the doc -> set_id as a numeric field
+                addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), NumberType.ORDINAL);
+            } else {
+                meta.writeVInt(SORTED_WITH_ADDRESSES);
+                // write the stream of values as a numeric field
+                addNumericField(field, values, NumberType.VALUE);
+                // write the doc -> ord count as an absolute index to the stream
+                addOrdIndex(field, docToValueCount);
+            }
+        }
+    }
+
+    @Override
+    public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
+
+        Iterable<BytesRef> values = LegacyDocValuesIterables.valuesIterable(valuesProducer.getSortedSet(field));
+        Iterable<Number> docToOrdCount = LegacyDocValuesIterables.sortedSetOrdCountIterable(valuesProducer, field, maxDoc);
+        Iterable<Number> ords = LegacyDocValuesIterables.sortedSetOrdsIterable(valuesProducer, field);
+
+        meta.writeVInt(field.number);
+        meta.writeByte(Lucene54DocValuesFormat.SORTED_SET);
+
+        if (isSingleValued(docToOrdCount)) {
+            meta.writeVInt(SORTED_SINGLE_VALUED);
+            // The field is single-valued, we can encode it as SORTED
+            addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
+        } else {
+            final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords);
+            if (uniqueValueSets != null) {
+                meta.writeVInt(SORTED_SET_TABLE);
+
+                // write the set_id -> ords mapping
+                writeDictionary(uniqueValueSets);
+
+                // write the ord -> byte[] as a binary field
+                addTermsDict(field, values);
+
+                // write the doc -> set_id as a numeric field
+                addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), NumberType.ORDINAL);
+            } else {
+                meta.writeVInt(SORTED_WITH_ADDRESSES);
+
+                // write the ord -> byte[] as a binary field
+                addTermsDict(field, values);
+
+                // write the stream of ords as a numeric field
+                // NOTE: we could return an iterator that delta-encodes these within a doc
+                addNumericField(field, ords, NumberType.ORDINAL);
+
+                // write the doc -> ord count as an absolute index to the stream
+                addOrdIndex(field, docToOrdCount);
+            }
+        }
+    }
+
+    private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
+        Set<LongsRef> uniqueValueSet = new HashSet<>();
+        LongsRef docValues = new LongsRef(256);
+
+        Iterator<Number> valueCountIterator = docToValueCount.iterator();
+        Iterator<Number> valueIterator = values.iterator();
+        int totalDictSize = 0;
+        while (valueCountIterator.hasNext()) {
+            docValues.length = valueCountIterator.next().intValue();
+            if (docValues.length > 256) {
+                return null;
+            }
+            for (int i = 0; i < docValues.length; ++i) {
+                docValues.longs[i] = valueIterator.next().longValue();
+            }
+            if (uniqueValueSet.contains(docValues)) {
+                continue;
+            }
+            totalDictSize += docValues.length;
+            if (totalDictSize > 256) {
+                return null;
+            }
+            uniqueValueSet.add(new LongsRef(ArrayUtil.copyOfSubArray(docValues.longs, 0, docValues.length), 0, docValues.length));
+        }
+        assert valueIterator.hasNext() == false;
+        return new TreeSet<>(uniqueValueSet);
+    }
+
+    private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
+        int lengthSum = 0;
+        for (LongsRef longs : uniqueValueSets) {
+            lengthSum += longs.length;
+        }
+
+        meta.writeInt(lengthSum);
+        for (LongsRef valueSet : uniqueValueSets) {
+            for (int i = 0; i < valueSet.length; ++i) {
+                meta.writeLong(valueSet.longs[valueSet.offset + i]);
+            }
+        }
+
+        meta.writeInt(uniqueValueSets.size());
+        for (LongsRef valueSet : uniqueValueSets) {
+            meta.writeInt(valueSet.length);
+        }
+    }
+
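+    // maps each document's set of values to the id of that set in the sorted dictionary; the returned
+    // iterable creates fresh iterators on every pass, matching the order in which writeDictionary() assigned ids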
private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) {
+        final Map<LongsRef, Integer> setIds = new HashMap<>();
+        int i = 0;
+        for (LongsRef set : uniqueValueSets) {
+            setIds.put(set, i++);
+        }
+        assert i == uniqueValueSets.size();
+
+        return new Iterable<Number>() {
+
+            @Override
+            public Iterator<Number> iterator() {
+                final Iterator<Number> valueCountIterator = docToValueCount.iterator();
+                final Iterator<Number> valueIterator = values.iterator();
+                final LongsRef docValues = new LongsRef(256);
+                return new Iterator<Number>() {
+
+                    @Override
+                    public boolean hasNext() {
+                        return valueCountIterator.hasNext();
+                    }
+
+                    @Override
+                    public Number next() {
+                        docValues.length = valueCountIterator.next().intValue();
+                        for (int i = 0; i < docValues.length; ++i) {
+                            docValues.longs[i] = valueIterator.next().longValue();
+                        }
+                        final Integer id = setIds.get(docValues);
+                        assert id != null;
+                        return id;
+                    }
+
+                };
+
+            }
+        };
+    }
+
+    // writes addressing information as MONOTONIC_COMPRESSED integer
+    private void addOrdIndex(FieldInfo field, Iterable<Number> values) throws IOException {
+        meta.writeVInt(field.number);
+        meta.writeByte(Lucene54DocValuesFormat.NUMERIC);
+        meta.writeVInt(MONOTONIC_COMPRESSED);
+        meta.writeLong(-1L);
+        meta.writeLong(data.getFilePointer());
+        meta.writeVLong(maxDoc);
+        meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
+
+        final LegacyDirectMonotonicWriter writer = LegacyDirectMonotonicWriter.getInstance(
+            meta,
+            data,
+            maxDoc + 1,
+            DIRECT_MONOTONIC_BLOCK_SHIFT
+        );
+        long addr = 0;
+        writer.add(addr);
+        for (Number v : values) {
+            addr += v.longValue();
+            writer.add(addr);
+        }
+        writer.finish();
+        meta.writeLong(data.getFilePointer());
+    }
+
+    @Override
+    public void close() throws IOException {
+        boolean success = false;
+        try {
+            if (meta != null) {
+                meta.writeVInt(-1); // write EOF marker
+                CodecUtil.writeFooter(meta); // write checksum
+            }
+            if (data != null) {
+                CodecUtil.writeFooter(data); // write checksum
+            }
+            success = true;
+        } finally {
+            if (success) {
+                IOUtils.close(data, meta);
+            } else {
+                IOUtils.closeWhileHandlingException(data, meta);
+            }
+            meta = data = null;
+        }
+    }
+}
diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesFormat.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesFormat.java
new file mode 100644
index 0000000000000..d428abf67de72
--- /dev/null
+++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesFormat.java
@@ -0,0 +1,119 @@
+/*
+ * @notice
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Modifications copyright (C) 2021 Elasticsearch B.V.
+ */ +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene54; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +import java.io.IOException; + +/** + * Lucene 5.4 DocValues format. + * @deprecated Only for reading old 6.0+ segments + */ +@Deprecated +public final class Lucene54DocValuesFormat extends DocValuesFormat { + + /** Sole Constructor */ + public Lucene54DocValuesFormat() { + super("Lucene54"); + } + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new Lucene54DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + @Override + public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { + return new Lucene54DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + static final String DATA_CODEC = "Lucene54DocValuesData"; + static final String DATA_EXTENSION = "dvd"; + static final String META_CODEC = "Lucene54DocValuesMetadata"; + static final String META_EXTENSION = "dvm"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + // indicates docvalues type + static final byte NUMERIC = 0; + static final byte BINARY = 1; + static final byte SORTED = 2; + static final byte SORTED_SET = 3; + static final byte SORTED_NUMERIC = 4; + + // address terms in blocks of 16 terms + static final int INTERVAL_SHIFT = 4; + static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT; + static final int INTERVAL_MASK = INTERVAL_COUNT - 1; + + // build reverse index from every 1024th term + static final int REVERSE_INTERVAL_SHIFT = 10; + static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT; + static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1; + + // for conversion from reverse index to block + static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT; + static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT; + static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1; + + /** Compressed using packed blocks of ints. */ + static final int DELTA_COMPRESSED = 0; + /** Compressed by computing the GCD. */ + static final int GCD_COMPRESSED = 1; + /** Compressed by giving IDs to unique values. */ + static final int TABLE_COMPRESSED = 2; + /** Compressed with monotonically increasing values */ + static final int MONOTONIC_COMPRESSED = 3; + /** Compressed with constant value (uses only missing bitset) */ + static final int CONST_COMPRESSED = 4; + /** Compressed with sparse arrays. */ + static final int SPARSE_COMPRESSED = 5; + + /** Uncompressed binary, written directly (fixed length). */ + static final int BINARY_FIXED_UNCOMPRESSED = 0; + /** Uncompressed binary, written directly (variable length). */ + static final int BINARY_VARIABLE_UNCOMPRESSED = 1; + /** Compressed binary with shared prefixes */ + static final int BINARY_PREFIX_COMPRESSED = 2; + + /** Standard storage for sorted set values with 1 level of indirection: + * {@code docId -> address -> ord}. */ + static final int SORTED_WITH_ADDRESSES = 0; + /** Single-valued sorted set values, encoded as sorted values, so no level + * of indirection: {@code docId -> ord}. 
*/ + static final int SORTED_SINGLE_VALUED = 1; + /** Compressed giving IDs to unique sets of values: + * {@code docId -> setId -> ords} */ + static final int SORTED_SET_TABLE = 2; + + /** placeholder for missing offset that means there are no missing values */ + static final int ALL_LIVE = -1; + /** placeholder for missing offset that means all values are missing */ + static final int ALL_MISSING = -2; + + // addressing uses 16k blocks + static final int MONOTONIC_BLOCK_SIZE = 16384; + static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16; +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesProducer.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesProducer.java new file mode 100644 index 0000000000000..b266cf2fb750b --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesProducer.java @@ -0,0 +1,1847 @@ +/* + * @notice + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Modifications copyright (C) 2021 Elasticsearch B.V. 
+ */
+package org.elasticsearch.xpack.lucene.bwc.codecs.lucene54;
+
+import org.apache.lucene.backward_codecs.packed.LegacyDirectMonotonicReader;
+import org.apache.lucene.backward_codecs.packed.LegacyDirectReader;
+import org.apache.lucene.backward_codecs.store.EndiannessReverserUtil;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.index.BaseTermsEnum;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.ImpactsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Accountables;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LongValues;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
+import org.elasticsearch.core.internal.io.IOUtils;
+import org.elasticsearch.xpack.lucene.bwc.codecs.index.LegacyBinaryDocValues;
+import org.elasticsearch.xpack.lucene.bwc.codecs.index.LegacyBinaryDocValuesWrapper;
+import org.elasticsearch.xpack.lucene.bwc.codecs.index.LegacySortedSetDocValues;
+import org.elasticsearch.xpack.lucene.bwc.codecs.index.LegacySortedSetDocValuesWrapper;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesConsumer.NumberType.ORDINAL;
+import static org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesConsumer.NumberType.VALUE;
+
+/** reader for {@link Lucene54DocValuesFormat} */
+final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable {
+    private final Map<String, NumericEntry> numerics = new HashMap<>();
+    private final Map<String, BinaryEntry> binaries = new HashMap<>();
+    private final Map<String, SortedSetEntry> sortedSets = new HashMap<>();
+    private final Map<String, SortedSetEntry> sortedNumerics = new HashMap<>();
+    private final Map<String, NumericEntry> ords = new HashMap<>();
+    private final Map<String, NumericEntry> ordIndexes = new HashMap<>();
+    private final int numFields;
+    private final AtomicLong ramBytesUsed;
+    private final IndexInput data;
+    private final int maxDoc;
+
+    // memory-resident structures
+    private final Map<String, MonotonicBlockPackedReader> addressInstances = new HashMap<>();
+    private final Map<String, ReverseTermsIndex> reverseIndexInstances = new HashMap<>();
+    private final Map<String, LegacyDirectMonotonicReader.Meta> directAddressesMeta = new HashMap<>();
+
+    private final boolean merging;
+
+    // clone for merge: when merging we don't do any instances.put()s
+    Lucene54DocValuesProducer(Lucene54DocValuesProducer original) {
+        assert Thread.holdsLock(original);
+        numerics.putAll(original.numerics);
binaries.putAll(original.binaries); + sortedSets.putAll(original.sortedSets); + sortedNumerics.putAll(original.sortedNumerics); + ords.putAll(original.ords); + ordIndexes.putAll(original.ordIndexes); + numFields = original.numFields; + ramBytesUsed = new AtomicLong(original.ramBytesUsed.get()); + data = original.data.clone(); + maxDoc = original.maxDoc; + + addressInstances.putAll(original.addressInstances); + reverseIndexInstances.putAll(original.reverseIndexInstances); + merging = true; + } + + /** expert: instantiates a new reader */ + Lucene54DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) + throws IOException { + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + this.maxDoc = state.segmentInfo.maxDoc(); + merging = false; + ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass())); + + int version = -1; + int numFields = -1; + + // read in the entries from the metadata file. + try (ChecksumIndexInput in = EndiannessReverserUtil.openChecksumInput(state.directory, metaName, state.context)) { + Throwable priorE = null; + try { + version = CodecUtil.checkIndexHeader( + in, + metaCodec, + Lucene54DocValuesFormat.VERSION_START, + Lucene54DocValuesFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + numFields = readFields(in, state.fieldInfos); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(in, priorE); + } + } + + this.numFields = numFields; + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + this.data = EndiannessReverserUtil.openInput(state.directory, dataName, state.context); + boolean success = false; + try { + final int version2 = CodecUtil.checkIndexHeader( + data, + dataCodec, + Lucene54DocValuesFormat.VERSION_START, + Lucene54DocValuesFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix + ); + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch: meta=" + version + ", data=" + version2, data); + } + + // NOTE: data file is too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. 
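+            // (full byte-level verification is deferred to checkIntegrity(), which checksums the whole file)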
+ CodecUtil.retrieveChecksum(data); + + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this.data); + } + } + } + + private void readSortedField(FieldInfo info, IndexInput meta) throws IOException { + // sorted = binary + numeric + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.BINARY) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + BinaryEntry b = readBinaryEntry(info, meta); + binaries.put(info.name, b); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n = readNumericEntry(info, meta); + ords.put(info.name, n); + } + + private void readSortedSetFieldWithAddresses(FieldInfo info, IndexInput meta) throws IOException { + // sortedset = binary + numeric (addresses) + ordIndex + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.BINARY) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + BinaryEntry b = readBinaryEntry(info, meta); + binaries.put(info.name, b); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n1 = readNumericEntry(info, meta); + ords.put(info.name, n1); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n2 = readNumericEntry(info, meta); + ordIndexes.put(info.name, n2); + } + + private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException { + // sortedset table = binary + ordset table + ordset index + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.BINARY) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + + BinaryEntry b = readBinaryEntry(info, meta); + binaries.put(info.name, b); + + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n = readNumericEntry(info, meta); + ords.put(info.name, n); + } + + private int readFields(IndexInput meta, FieldInfos infos) throws IOException { + int numFields = 0; + int fieldNumber = meta.readVInt(); + while (fieldNumber != -1) { + numFields++; + FieldInfo info = infos.fieldInfo(fieldNumber); + if (info == null) 
{ + // trickier to validate more: because we use multiple entries for "composite" types like sortedset, etc. + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); + } + byte type = meta.readByte(); + if (type == Lucene54DocValuesFormat.NUMERIC) { + numerics.put(info.name, readNumericEntry(info, meta)); + } else if (type == Lucene54DocValuesFormat.BINARY) { + BinaryEntry b = readBinaryEntry(info, meta); + binaries.put(info.name, b); + } else if (type == Lucene54DocValuesFormat.SORTED) { + readSortedField(info, meta); + } else if (type == Lucene54DocValuesFormat.SORTED_SET) { + SortedSetEntry ss = readSortedSetEntry(meta); + sortedSets.put(info.name, ss); + if (ss.format == Lucene54DocValuesFormat.SORTED_WITH_ADDRESSES) { + readSortedSetFieldWithAddresses(info, meta); + } else if (ss.format == Lucene54DocValuesFormat.SORTED_SET_TABLE) { + readSortedSetFieldWithTable(info, meta); + } else if (ss.format == Lucene54DocValuesFormat.SORTED_SINGLE_VALUED) { + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.SORTED) { + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); + } + readSortedField(info, meta); + } else { + throw new AssertionError(); + } + } else if (type == Lucene54DocValuesFormat.SORTED_NUMERIC) { + SortedSetEntry ss = readSortedSetEntry(meta); + sortedNumerics.put(info.name, ss); + if (ss.format == Lucene54DocValuesFormat.SORTED_WITH_ADDRESSES) { + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + numerics.put(info.name, readNumericEntry(info, meta)); + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry ordIndex = readNumericEntry(info, meta); + ordIndexes.put(info.name, ordIndex); + } else if (ss.format == Lucene54DocValuesFormat.SORTED_SET_TABLE) { + if (meta.readVInt() != info.number) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + NumericEntry n = readNumericEntry(info, meta); + ords.put(info.name, n); + } else if (ss.format == Lucene54DocValuesFormat.SORTED_SINGLE_VALUED) { + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + if (meta.readByte() != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); + } + numerics.put(info.name, readNumericEntry(info, meta)); + } else { + throw new AssertionError(); + } + } else { + throw new CorruptIndexException("invalid type: " + type, meta); + } + fieldNumber = meta.readVInt(); + } + return numFields; + } + + private NumericEntry readNumericEntry(FieldInfo info, IndexInput meta) throws 
IOException { + NumericEntry entry = new NumericEntry(); + entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); + if (entry.format == Lucene54DocValuesFormat.SPARSE_COMPRESSED) { + // sparse bits need a bit more metadata + entry.numDocsWithValue = meta.readVLong(); + final int blockShift = meta.readVInt(); + entry.monotonicMeta = LegacyDirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue, blockShift); + ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed()); + directAddressesMeta.put(info.name, entry.monotonicMeta); + } + entry.offset = meta.readLong(); + entry.count = meta.readVLong(); + switch (entry.format) { + case Lucene54DocValuesFormat.CONST_COMPRESSED: + entry.minValue = meta.readLong(); + if (entry.count > Integer.MAX_VALUE) { + // currently just a limitation e.g. of bits interface and so on. + throw new CorruptIndexException("illegal CONST_COMPRESSED count: " + entry.count, meta); + } + break; + case Lucene54DocValuesFormat.GCD_COMPRESSED: + entry.minValue = meta.readLong(); + entry.gcd = meta.readLong(); + entry.bitsPerValue = meta.readVInt(); + break; + case Lucene54DocValuesFormat.TABLE_COMPRESSED: + final int uniqueValues = meta.readVInt(); + if (uniqueValues > 256) { + throw new CorruptIndexException( + "TABLE_COMPRESSED cannot have more than 256 distinct values, got=" + uniqueValues, + meta + ); + } + entry.table = new long[uniqueValues]; + for (int i = 0; i < uniqueValues; ++i) { + entry.table[i] = meta.readLong(); + } + ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.table)); + entry.bitsPerValue = meta.readVInt(); + break; + case Lucene54DocValuesFormat.DELTA_COMPRESSED: + entry.minValue = meta.readLong(); + entry.bitsPerValue = meta.readVInt(); + break; + case Lucene54DocValuesFormat.MONOTONIC_COMPRESSED: + final int blockShift = meta.readVInt(); + entry.monotonicMeta = LegacyDirectMonotonicReader.loadMeta(meta, maxDoc + 1, blockShift); + ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed()); + directAddressesMeta.put(info.name, entry.monotonicMeta); + break; + case Lucene54DocValuesFormat.SPARSE_COMPRESSED: + final byte numberType = meta.readByte(); + switch (numberType) { + case 0: + entry.numberType = VALUE; + break; + case 1: + entry.numberType = ORDINAL; + break; + default: + throw new CorruptIndexException("Number type can only be 0 or 1, got=" + numberType, meta); + } + + // now read the numeric entry for non-missing values + final int fieldNumber = meta.readVInt(); + if (fieldNumber != info.number) { + throw new CorruptIndexException("Field numbers mistmatch: " + fieldNumber + " != " + info.number, meta); + } + final int dvFormat = meta.readByte(); + if (dvFormat != Lucene54DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("Formats mistmatch: " + dvFormat + " != " + Lucene54DocValuesFormat.NUMERIC, meta); + } + entry.nonMissingValues = readNumericEntry(info, meta); + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=", meta); + } + entry.endOffset = meta.readLong(); + return entry; + } + + private BinaryEntry readBinaryEntry(FieldInfo info, IndexInput meta) throws IOException { + BinaryEntry entry = new BinaryEntry(); + entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); + entry.minLength = meta.readVInt(); + entry.maxLength = meta.readVInt(); + entry.count = meta.readVLong(); + entry.offset = meta.readLong(); + switch (entry.format) { + case Lucene54DocValuesFormat.BINARY_FIXED_UNCOMPRESSED: + break; + case 
Lucene54DocValuesFormat.BINARY_PREFIX_COMPRESSED: + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + entry.reverseIndexOffset = meta.readLong(); + break; + case Lucene54DocValuesFormat.BINARY_VARIABLE_UNCOMPRESSED: + entry.addressesOffset = meta.readLong(); + final int blockShift = meta.readVInt(); + entry.addressesMeta = LegacyDirectMonotonicReader.loadMeta(meta, entry.count + 1, blockShift); + ramBytesUsed.addAndGet(entry.addressesMeta.ramBytesUsed()); + directAddressesMeta.put(info.name, entry.addressesMeta); + entry.addressesEndOffset = meta.readLong(); + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format, meta); + } + return entry; + } + + SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException { + SortedSetEntry entry = new SortedSetEntry(); + entry.format = meta.readVInt(); + if (entry.format == Lucene54DocValuesFormat.SORTED_SET_TABLE) { + final int totalTableLength = meta.readInt(); + if (totalTableLength > 256) { + throw new CorruptIndexException( + "SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, + meta + ); + } + entry.table = new long[totalTableLength]; + for (int i = 0; i < totalTableLength; ++i) { + entry.table[i] = meta.readLong(); + } + ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.table)); + final int tableSize = meta.readInt(); + if (tableSize > totalTableLength + 1) { // +1 because of the empty set + throw new CorruptIndexException( + "SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + + totalTableLength + + " ords and " + + tableSize + + " sets", + meta + ); + } + entry.tableOffsets = new int[tableSize + 1]; + for (int i = 1; i < entry.tableOffsets.length; ++i) { + entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt(); + } + ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.tableOffsets)); + } else if (entry.format != Lucene54DocValuesFormat.SORTED_SINGLE_VALUED + && entry.format != Lucene54DocValuesFormat.SORTED_WITH_ADDRESSES) { + throw new CorruptIndexException("Unknown format: " + entry.format, meta); + } + return entry; + } + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + NumericEntry entry = numerics.get(field.name); + Bits docsWithField; + + if (entry.format == Lucene54DocValuesFormat.SPARSE_COMPRESSED) { + return getSparseNumericDocValues(entry); + } else { + if (entry.missingOffset == Lucene54DocValuesFormat.ALL_MISSING) { + return DocValues.emptyNumeric(); + } else if (entry.missingOffset == Lucene54DocValuesFormat.ALL_LIVE) { + LongValues values = getNumeric(entry); + return new NumericDocValues() { + private int docID = -1; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + docID++; + if (docID == maxDoc) { + docID = NO_MORE_DOCS; + } + return docID; + } + + @Override + public int advance(int target) { + if (target >= maxDoc) { + docID = NO_MORE_DOCS; + } else { + docID = target; + } + return docID; + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + return true; + } + + @Override + public long cost() { + // TODO + return 0; + } + + @Override + public long longValue() { + return values.get(docID); + } + }; + } else { + docsWithField = getLiveBits(entry.missingOffset, maxDoc); + } + } + final LongValues values = getNumeric(entry); + return new NumericDocValues() { + + int doc = -1; + long value; + + 
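+            // (iteration contract, per advance() below: a doc has a value when its stored long is
+            // non-zero or the missing bitset marks it live; zero-valued live docs rely on the bitset)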
@Override + public long longValue() throws IOException { + return value; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + for (int doc = target; doc < maxDoc; ++doc) { + value = values.get(doc); + if (value != 0 || docsWithField.get(doc)) { + return this.doc = doc; + } + } + return doc = NO_MORE_DOCS; + } + + @Override + public boolean advanceExact(int target) throws IOException { + doc = target; + value = values.get(doc); + return value != 0 || docsWithField.get(doc); + } + + @Override + public long cost() { + return maxDoc; + } + + }; + } + + @Override + public void checkIntegrity() throws IOException { + CodecUtil.checksumEntireFile(data); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(fields=" + numFields + ")"; + } + + LongValues getNumeric(NumericEntry entry) throws IOException { + switch (entry.format) { + case Lucene54DocValuesFormat.CONST_COMPRESSED: { + final long constant = entry.minValue; + final Bits live = getLiveBits(entry.missingOffset, (int) entry.count); + return new LongValues() { + @Override + public long get(long index) { + return live.get((int) index) ? constant : 0; + } + }; + } + case Lucene54DocValuesFormat.DELTA_COMPRESSED: { + RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); + final long delta = entry.minValue; + final LongValues values = LegacyDirectReader.getInstance(slice, entry.bitsPerValue, 0); + return new LongValues() { + @Override + public long get(long id) { + return delta + values.get(id); + } + }; + } + case Lucene54DocValuesFormat.GCD_COMPRESSED: { + RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); + final long min = entry.minValue; + final long mult = entry.gcd; + final LongValues quotientReader = LegacyDirectReader.getInstance(slice, entry.bitsPerValue, 0); + return new LongValues() { + @Override + public long get(long id) { + return min + mult * quotientReader.get(id); + } + }; + } + case Lucene54DocValuesFormat.TABLE_COMPRESSED: { + RandomAccessInput slice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); + final long table[] = entry.table; + final LongValues ords = LegacyDirectReader.getInstance(slice, entry.bitsPerValue, 0); + return new LongValues() { + @Override + public long get(long id) { + return table[(int) ords.get(id)]; + } + }; + } + case Lucene54DocValuesFormat.SPARSE_COMPRESSED: + final SparseNumericDocValues values = getSparseNumericDocValues(entry); + final long missingValue; + switch (entry.numberType) { + case ORDINAL: + missingValue = -1L; + break; + case VALUE: + missingValue = 0L; + break; + default: + throw new AssertionError(); + } + return new SparseNumericDocValuesRandomAccessWrapper(values, missingValue); + default: + throw new AssertionError(); + } + } + + static final class SparseNumericDocValues extends NumericDocValues { + + final int docIDsLength; + final LongValues docIds, values; + + int index, doc; + + SparseNumericDocValues(int docIDsLength, LongValues docIDs, LongValues values) { + this.docIDsLength = docIDsLength; + this.docIds = docIDs; + this.values = values; + reset(); + } + + void reset() { + index = -1; + doc = -1; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + if (index >= docIDsLength - 1) { + index = 
docIDsLength; + return doc = NO_MORE_DOCS; + } + return doc = (int) docIds.get(++index); + } + + @Override + public int advance(int target) throws IOException { + long loIndex = index; + long step = 1; + long hiIndex; + int hiDoc; + + // gallop forward by exponentially growing the interval + // in order to find an interval so that the target doc + // is in ]lo, hi]. Compared to a regular binary search, + // this optimizes the case that the caller performs many + // advance calls by small deltas + do { + hiIndex = index + step; + if (hiIndex >= docIDsLength) { + hiIndex = docIDsLength; + hiDoc = NO_MORE_DOCS; + break; + } + hiDoc = (int) docIds.get(hiIndex); + if (hiDoc >= target) { + break; + } + step <<= 1; + } while (true); + + // now binary search + while (loIndex + 1 < hiIndex) { + final long midIndex = (loIndex + 1 + hiIndex) >>> 1; + final int midDoc = (int) docIds.get(midIndex); + if (midDoc >= target) { + hiIndex = midIndex; + hiDoc = midDoc; + } else { + loIndex = midIndex; + } + } + + index = (int) hiIndex; + return doc = hiDoc; + } + + @Override + public boolean advanceExact(int target) throws IOException { + if (advance(target) == target) { + return true; + } + --index; + doc = target; + return index >= 0 && docIds.get(index) == target; + } + + @Override + public long longValue() { + assert index >= 0; + assert index < docIDsLength; + return values.get(index); + } + + @Override + public long cost() { + return docIDsLength; + } + } + + static class SparseNumericDocValuesRandomAccessWrapper extends LongValues { + + final SparseNumericDocValues values; + final long missingValue; + + SparseNumericDocValuesRandomAccessWrapper(SparseNumericDocValues values, long missingValue) { + this.values = values; + this.missingValue = missingValue; + } + + @Override + public long get(long longIndex) { + final int index = Math.toIntExact(longIndex); + int doc = values.docID(); + if (doc >= index) { + values.reset(); + } + assert values.docID() < index; + try { + doc = values.advance(index); + } catch (IOException e) { + throw new RuntimeException(e); + } + if (doc == index) { + return values.longValue(); + } else { + return missingValue; + } + } + + } + + LegacyBinaryDocValues getLegacyBinary(FieldInfo field) throws IOException { + BinaryEntry bytes = binaries.get(field.name); + switch (bytes.format) { + case Lucene54DocValuesFormat.BINARY_FIXED_UNCOMPRESSED: + return getFixedBinary(field, bytes); + case Lucene54DocValuesFormat.BINARY_VARIABLE_UNCOMPRESSED: + return getVariableBinary(field, bytes); + case Lucene54DocValuesFormat.BINARY_PREFIX_COMPRESSED: + return getCompressedBinary(field, bytes); + default: + throw new AssertionError(); + } + } + + @Override + public BinaryDocValues getBinary(FieldInfo field) throws IOException { + BinaryEntry be = binaries.get(field.name); + return new LegacyBinaryDocValuesWrapper(getLiveBits(be.missingOffset, maxDoc), getLegacyBinary(field)); + } + + private LegacyBinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.slice("fixed-binary", bytes.offset, bytes.count * bytes.maxLength); + + final BytesRef term = new BytesRef(bytes.maxLength); + final byte[] buffer = term.bytes; + final int length = term.length = bytes.maxLength; + + return new LongBinaryDocValues() { + @Override + public BytesRef get(long id) { + try { + data.seek(id * length); + data.readBytes(buffer, 0, buffer.length); + return term; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + 
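+    // variable-length case: a monotonic address table (decoded below with LegacyDirectMonotonicReader)
+    // maps value id -> start offset, so value id spans bytes [addr(id), addr(id + 1)) of the data slice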
private LegacyBinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final RandomAccessInput addressesData = this.data.randomAccessSlice( + bytes.addressesOffset, + bytes.addressesEndOffset - bytes.addressesOffset + ); + final LongValues addresses = LegacyDirectMonotonicReader.getInstance(bytes.addressesMeta, addressesData); + + final IndexInput data = this.data.slice("var-binary", bytes.offset, bytes.addressesOffset - bytes.offset); + final BytesRef term = new BytesRef(Math.max(0, bytes.maxLength)); + final byte buffer[] = term.bytes; + + return new LongBinaryDocValues() { + @Override + public BytesRef get(long id) { + long startAddress = addresses.get(id); + long endAddress = addresses.get(id + 1); + int length = (int) (endAddress - startAddress); + try { + data.seek(startAddress); + data.readBytes(buffer, 0, length); + term.length = length; + return term; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + /** returns an address instance for prefix-compressed binary values. */ + private synchronized MonotonicBlockPackedReader getIntervalInstance(FieldInfo field, BinaryEntry bytes) throws IOException { + MonotonicBlockPackedReader addresses = addressInstances.get(field.name); + if (addresses == null) { + data.seek(bytes.addressesOffset); + final long size = (bytes.count + Lucene54DocValuesFormat.INTERVAL_MASK) >>> Lucene54DocValuesFormat.INTERVAL_SHIFT; + addresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size); + if (merging == false) { + addressInstances.put(field.name, addresses); + ramBytesUsed.addAndGet(addresses.ramBytesUsed() + Integer.BYTES); + } + } + return addresses; + } + + /** returns a reverse lookup instance for prefix-compressed binary values. 
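+     * One index term is stored per 1024 terms (REVERSE_INTERVAL_COUNT); the instance is cached per field
+     * unless this producer is a merge-time clone.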
*/ + private synchronized ReverseTermsIndex getReverseIndexInstance(FieldInfo field, BinaryEntry bytes) throws IOException { + ReverseTermsIndex index = reverseIndexInstances.get(field.name); + if (index == null) { + index = new ReverseTermsIndex(); + data.seek(bytes.reverseIndexOffset); + long size = (bytes.count + Lucene54DocValuesFormat.REVERSE_INTERVAL_MASK) >>> Lucene54DocValuesFormat.REVERSE_INTERVAL_SHIFT; + index.termAddresses = MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, size); + long dataSize = data.readVLong(); + PagedBytes pagedBytes = new PagedBytes(15); + pagedBytes.copy(data, dataSize); + index.terms = pagedBytes.freeze(true); + if (merging == false) { + reverseIndexInstances.put(field.name, index); + ramBytesUsed.addAndGet(index.ramBytesUsed()); + } + } + return index; + } + + private LegacyBinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final MonotonicBlockPackedReader addresses = getIntervalInstance(field, bytes); + final ReverseTermsIndex index = getReverseIndexInstance(field, bytes); + assert addresses.size() > 0; // we don't have to handle empty case + IndexInput slice = data.slice("terms", bytes.offset, bytes.addressesOffset - bytes.offset); + return new CompressedBinaryDocValues(bytes, addresses, index, slice); + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + final int valueCount = (int) binaries.get(field.name).count; + final LegacyBinaryDocValues binary = getLegacyBinary(field); + NumericEntry entry = ords.get(field.name); + final LongValues ordinals = getNumeric(entry); + if (entry.format == Lucene54DocValuesFormat.SPARSE_COMPRESSED) { + final SparseNumericDocValues sparseValues = ((SparseNumericDocValuesRandomAccessWrapper) ordinals).values; + return new SortedDocValues() { + + @Override + public int ordValue() { + return (int) sparseValues.longValue(); + } + + @Override + public BytesRef lookupOrd(int ord) { + return binary.get(ord); + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public int docID() { + return sparseValues.docID(); + } + + @Override + public int nextDoc() throws IOException { + return sparseValues.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return sparseValues.advance(target); + } + + @Override + public boolean advanceExact(int target) throws IOException { + return sparseValues.advanceExact(target); + } + + @Override + public long cost() { + return sparseValues.cost(); + } + + }; + } + return new SortedDocValues() { + private int docID = -1; + private int ord; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + assert docID != NO_MORE_DOCS; + while (true) { + docID++; + if (docID == maxDoc) { + docID = NO_MORE_DOCS; + break; + } + ord = (int) ordinals.get(docID); + if (ord != -1) { + break; + } + } + return docID; + } + + @Override + public int advance(int target) throws IOException { + if (target >= maxDoc) { + docID = NO_MORE_DOCS; + return docID; + } else { + docID = target - 1; + return nextDoc(); + } + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + ord = (int) ordinals.get(target); + return ord != -1; + } + + @Override + public int ordValue() { + return ord; + } + + @Override + public long cost() { + // TODO + return 0; + } + + @Override + public BytesRef lookupOrd(int ord) { + return binary.get(ord); + } + + 
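// (ordinals come straight from the sparse numeric stream; lookupOrd resolves an ordinal to its term bytes through the shared binary dictionary) +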
@Override + public int getValueCount() { + return valueCount; + } + + @Override + public int lookupTerm(BytesRef key) throws IOException { + if (binary instanceof CompressedBinaryDocValues) { + return (int) ((CompressedBinaryDocValues) binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() throws IOException { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues) binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }; + } + + /** returns an address instance for sortedset ordinal lists */ + private LongValues getOrdIndexInstance(FieldInfo field, NumericEntry entry) throws IOException { + RandomAccessInput data = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset); + return LegacyDirectMonotonicReader.getInstance(entry.monotonicMeta, data); + } + + @Override + public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { + SortedSetEntry ss = sortedNumerics.get(field.name); + if (ss.format == Lucene54DocValuesFormat.SORTED_SINGLE_VALUED) { + NumericEntry numericEntry = numerics.get(field.name); + final LongValues values = getNumeric(numericEntry); + if (numericEntry.format == Lucene54DocValuesFormat.SPARSE_COMPRESSED) { + SparseNumericDocValues sparseValues = ((SparseNumericDocValuesRandomAccessWrapper) values).values; + return new SortedNumericDocValues() { + + @Override + public long nextValue() throws IOException { + return sparseValues.longValue(); + } + + @Override + public int docValueCount() { + return 1; + } + + @Override + public int docID() { + return sparseValues.docID(); + } + + @Override + public int nextDoc() throws IOException { + return sparseValues.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + return sparseValues.advance(target); + } + + @Override + public boolean advanceExact(int target) throws IOException { + return sparseValues.advanceExact(target); + } + + @Override + public long cost() { + return sparseValues.cost(); + } + + }; + } + final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc); + return new SortedNumericDocValues() { + int docID = -1; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + while (true) { + docID++; + if (docID == maxDoc) { + docID = NO_MORE_DOCS; + break; + } + + if (docsWithField.get(docID)) { + // TODO: use .nextSetBit here, at least!! 
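+ // (a bit-set style nextSetBit could jump directly to the next doc that has a value instead of probing one docID at a time)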
+ break; + } + } + return docID; + } + + @Override + public int advance(int target) { + if (target >= maxDoc) { + docID = NO_MORE_DOCS; + return docID; + } else { + docID = target - 1; + return nextDoc(); + } + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + return docsWithField.get(docID); + } + + @Override + public long cost() { + // TODO + return 0; + } + + @Override + public int docValueCount() { + return 1; + } + + @Override + public long nextValue() { + return values.get(docID); + } + }; + } else if (ss.format == Lucene54DocValuesFormat.SORTED_WITH_ADDRESSES) { + NumericEntry numericEntry = numerics.get(field.name); + final LongValues values = getNumeric(numericEntry); + final LongValues ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); + + return new SortedNumericDocValues() { + long startOffset; + long endOffset; + int docID = -1; + long upto; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + while (true) { + docID++; + if (docID == maxDoc) { + docID = NO_MORE_DOCS; + return docID; + } + startOffset = ordIndex.get(docID); + endOffset = ordIndex.get(docID + 1L); + if (endOffset > startOffset) { + break; + } + } + upto = startOffset; + return docID; + } + + @Override + public int advance(int target) { + if (target >= maxDoc) { + docID = NO_MORE_DOCS; + return docID; + } else { + docID = target - 1; + return nextDoc(); + } + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + startOffset = ordIndex.get(docID); + endOffset = ordIndex.get(docID + 1L); + upto = startOffset; + return endOffset > startOffset; + } + + @Override + public long cost() { + // TODO + return 0; + } + + @Override + public int docValueCount() { + return (int) (endOffset - startOffset); + } + + @Override + public long nextValue() { + return values.get(upto++); + } + }; + } else if (ss.format == Lucene54DocValuesFormat.SORTED_SET_TABLE) { + NumericEntry entry = ords.get(field.name); + final LongValues ordinals = getNumeric(entry); + + final long[] table = ss.table; + final int[] offsets = ss.tableOffsets; + return new SortedNumericDocValues() { + int startOffset; + int endOffset; + int docID = -1; + int upto; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() { + while (true) { + docID++; + if (docID == maxDoc) { + docID = NO_MORE_DOCS; + return docID; + } + int ord = (int) ordinals.get(docID); + startOffset = offsets[ord]; + endOffset = offsets[ord + 1]; + if (endOffset > startOffset) { + break; + } + } + upto = startOffset; + return docID; + } + + @Override + public int advance(int target) { + if (target >= maxDoc) { + docID = NO_MORE_DOCS; + return docID; + } else { + docID = target - 1; + return nextDoc(); + } + } + + @Override + public boolean advanceExact(int target) throws IOException { + docID = target; + int ord = (int) ordinals.get(docID); + startOffset = offsets[ord]; + endOffset = offsets[ord + 1]; + upto = startOffset; + return endOffset > startOffset; + } + + @Override + public long cost() { + // TODO + return 0; + } + + @Override + public int docValueCount() { + return endOffset - startOffset; + } + + @Override + public long nextValue() { + return table[upto++]; + } + }; + } else { + throw new AssertionError(); + } + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + SortedSetEntry ss = sortedSets.get(field.name); + switch (ss.format) { + case 
Lucene54DocValuesFormat.SORTED_SINGLE_VALUED: + return DocValues.singleton(getSorted(field)); + case Lucene54DocValuesFormat.SORTED_WITH_ADDRESSES: + return getSortedSetWithAddresses(field); + case Lucene54DocValuesFormat.SORTED_SET_TABLE: + return getSortedSetTable(field, ss); + default: + throw new AssertionError(); + } + } + + private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException { + final long valueCount = binaries.get(field.name).count; + // we keep the byte[]s and list of ords on disk, these could be large + final LongBinaryDocValues binary = (LongBinaryDocValues) getLegacyBinary(field); + final LongValues ordinals = getNumeric(ords.get(field.name)); + // but the addresses to the ord stream are in RAM + final LongValues ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); + + return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() { + long startOffset; + long offset; + long endOffset; + + @Override + public long nextOrd() { + if (offset == endOffset) { + return NO_MORE_ORDS; + } else { + long ord = ordinals.get(offset); + offset++; + return ord; + } + } + + @Override + public void setDocument(int docID) { + startOffset = offset = ordIndex.get(docID); + endOffset = ordIndex.get(docID + 1L); + } + + @Override + public BytesRef lookupOrd(long ord) { + return binary.get(ord); + } + + @Override + public long getValueCount() { + return valueCount; + } + + @Override + public long lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues) binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() throws IOException { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues) binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }, maxDoc); + } + + private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException { + final long valueCount = binaries.get(field.name).count; + final LongBinaryDocValues binary = (LongBinaryDocValues) getLegacyBinary(field); + final NumericEntry ordinalsEntry = ords.get(field.name); + final LongValues ordinals = getNumeric(ordinalsEntry); + + final long[] table = ss.table; + final int[] offsets = ss.tableOffsets; + + return new LegacySortedSetDocValuesWrapper(new LegacySortedSetDocValues() { + + int offset, startOffset, endOffset; + + @Override + public void setDocument(int docID) { + final int ord = (int) ordinals.get(docID); + offset = startOffset = offsets[ord]; + endOffset = offsets[ord + 1]; + } + + @Override + public long nextOrd() { + if (offset == endOffset) { + return NO_MORE_ORDS; + } else { + return table[offset++]; + } + } + + @Override + public BytesRef lookupOrd(long ord) { + return binary.get(ord); + } + + @Override + public long getValueCount() { + return valueCount; + } + + @Override + public long lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues) binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() throws IOException { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues) binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }, maxDoc); + } + + private Bits getLiveBits(final long offset, final int count) throws IOException { + if (offset == Lucene54DocValuesFormat.ALL_MISSING) { + return new 
Bits.MatchNoBits(count); + } else if (offset == Lucene54DocValuesFormat.ALL_LIVE) { + return new Bits.MatchAllBits(count); + } else { + int length = (int) ((count + 7L) >>> 3); + final RandomAccessInput in = data.randomAccessSlice(offset, length); + return new Bits() { + @Override + public boolean get(int index) { + try { + return (in.readByte(index >> 3) & (1 << (index & 7))) != 0; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public int length() { + return count; + } + }; + } + } + + private SparseNumericDocValues getSparseNumericDocValues(NumericEntry entry) throws IOException { + final RandomAccessInput docIdsData = this.data.randomAccessSlice(entry.missingOffset, entry.offset - entry.missingOffset); + final LongValues docIDs = LegacyDirectMonotonicReader.getInstance(entry.monotonicMeta, docIdsData); + final LongValues values = getNumeric(entry.nonMissingValues); // cannot be sparse + return new SparseNumericDocValues(Math.toIntExact(entry.numDocsWithValue), docIDs, values); + } + + @Override + public synchronized DocValuesProducer getMergeInstance() { + return new Lucene54DocValuesProducer(this); + } + + @Override + public void close() throws IOException { + data.close(); + } + + /** metadata entry for a numeric docvalues field */ + static class NumericEntry { + private NumericEntry() {} + + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ + long missingOffset; + /** offset to the actual numeric values */ + public long offset; + /** end offset to the actual numeric values */ + public long endOffset; + /** bits per value used to pack the numeric values */ + public int bitsPerValue; + + int format; + /** count of values written */ + public long count; + + /** monotonic meta */ + public LegacyDirectMonotonicReader.Meta monotonicMeta; + + long minValue; + long gcd; + long table[]; + + /** for sparse compression */ + long numDocsWithValue; + NumericEntry nonMissingValues; + Lucene54DocValuesConsumer.NumberType numberType; + } + + /** metadata entry for a binary docvalues field */ + static class BinaryEntry { + private BinaryEntry() {} + + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ + long missingOffset; + /** offset to the actual binary values */ + long offset; + + int format; + /** count of values written */ + public long count; + int minLength; + int maxLength; + /** offset to the addressing data that maps a value to its slice of the byte[] */ + public long addressesOffset, addressesEndOffset; + /** meta data for addresses */ + public LegacyDirectMonotonicReader.Meta addressesMeta; + /** offset to the reverse index */ + public long reverseIndexOffset; + /** packed ints version used to encode addressing information */ + public int packedIntsVersion; + /** packed ints blocksize */ + public int blockSize; + } + + /** metadata entry for a sorted-set docvalues field */ + static class SortedSetEntry { + private SortedSetEntry() {} + + int format; + + long[] table; + int[] tableOffsets; + } + + // internally we compose complex dv (sorted/sortedset) from other ones + abstract static class LongBinaryDocValues extends LegacyBinaryDocValues { + @Override + public final BytesRef get(int docID) { + return get((long) docID); + } + + abstract BytesRef get(long id); + } + + // used for reverse lookup to a small range of blocks + static class ReverseTermsIndex implements Accountable { + public MonotonicBlockPackedReader termAddresses; + public PagedBytes.Reader 
terms; + + @Override + public long ramBytesUsed() { + return termAddresses.ramBytesUsed() + terms.ramBytesUsed(); + } + + @Override + public Collection getChildResources() { + List resources = new ArrayList<>(); + resources.add(Accountables.namedAccountable("term bytes", terms)); + resources.add(Accountables.namedAccountable("term addresses", termAddresses)); + return Collections.unmodifiableList(resources); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(size=" + termAddresses.size() + ")"; + } + } + + // in the compressed case, we add a few additional operations for + // more efficient reverse lookup and enumeration + static final class CompressedBinaryDocValues extends LongBinaryDocValues { + final long numValues; + final long numIndexValues; + final int maxTermLength; + final MonotonicBlockPackedReader addresses; + final IndexInput data; + final CompressedBinaryTermsEnum termsEnum; + final PagedBytes.Reader reverseTerms; + final MonotonicBlockPackedReader reverseAddresses; + final long numReverseIndexValues; + + CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, ReverseTermsIndex index, IndexInput data) + throws IOException { + this.maxTermLength = bytes.maxLength; + this.numValues = bytes.count; + this.addresses = addresses; + this.numIndexValues = addresses.size(); + this.data = data; + this.reverseTerms = index.terms; + this.reverseAddresses = index.termAddresses; + this.numReverseIndexValues = reverseAddresses.size(); + this.termsEnum = getTermsEnum(data); + } + + @Override + public BytesRef get(long id) { + try { + termsEnum.seekExact(id); + return termsEnum.term(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + long lookupTerm(BytesRef key) { + try { + switch (termsEnum.seekCeil(key)) { + case FOUND: + return termsEnum.ord(); + case NOT_FOUND: + return -termsEnum.ord() - 1; + default: + return -numValues - 1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + TermsEnum getTermsEnum() throws IOException { + return getTermsEnum(data.clone()); + } + + private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException { + return new CompressedBinaryTermsEnum(input); + } + + class CompressedBinaryTermsEnum extends BaseTermsEnum { + private long currentOrd = -1; + // offset to the start of the current block + private long currentBlockStart; + private final IndexInput input; + // delta from currentBlockStart to start of each term + private final int offsets[] = new int[Lucene54DocValuesFormat.INTERVAL_COUNT]; + private final byte buffer[] = new byte[2 * Lucene54DocValuesFormat.INTERVAL_COUNT - 1]; + + private final BytesRef term = new BytesRef(maxTermLength); + private final BytesRef firstTerm = new BytesRef(maxTermLength); + private final BytesRef scratch = new BytesRef(); + + CompressedBinaryTermsEnum(IndexInput input) throws IOException { + this.input = input; + input.seek(0); + } + + private void readHeader() throws IOException { + firstTerm.length = input.readVInt(); + input.readBytes(firstTerm.bytes, 0, firstTerm.length); + input.readBytes(buffer, 0, Lucene54DocValuesFormat.INTERVAL_COUNT - 1); + if (buffer[0] == -1) { + readShortAddresses(); + } else { + readByteAddresses(); + } + currentBlockStart = input.getFilePointer(); + } + + // read single byte addresses: each is delta - 2 + // (shared prefix byte and length > 0 are both implicit) + private void readByteAddresses() throws IOException { + int addr = 0; + for (int i = 1; i < 
offsets.length; i++) { + addr += 2 + (buffer[i - 1] & 0xFF); + offsets[i] = addr; + } + } + + // read double byte addresses: each is delta - 2 + // (shared prefix byte and length > 0 are both implicit) + private void readShortAddresses() throws IOException { + input.readBytes(buffer, Lucene54DocValuesFormat.INTERVAL_COUNT - 1, Lucene54DocValuesFormat.INTERVAL_COUNT); + int addr = 0; + for (int i = 1; i < offsets.length; i++) { + int x = i << 1; + addr += 2 + ((buffer[x - 1] << 8) | (buffer[x] & 0xFF)); + offsets[i] = addr; + } + } + + // set term to the first term + private void readFirstTerm() throws IOException { + term.length = firstTerm.length; + System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, term.length); + } + + // read term at offset, delta encoded from first term + private void readTerm(int offset) throws IOException { + int start = input.readByte() & 0xFF; + System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, start); + int suffix = offsets[offset] - offsets[offset - 1] - 1; + input.readBytes(term.bytes, start, suffix); + term.length = start + suffix; + } + + @Override + public BytesRef next() throws IOException { + currentOrd++; + if (currentOrd >= numValues) { + return null; + } else { + int offset = (int) (currentOrd & Lucene54DocValuesFormat.INTERVAL_MASK); + if (offset == 0) { + // switch to next block + readHeader(); + readFirstTerm(); + } else { + readTerm(offset); + } + return term; + } + } + + // binary search reverse index to find smaller + // range of blocks to search + long binarySearchIndex(BytesRef text) throws IOException { + long low = 0; + long high = numReverseIndexValues - 1; + while (low <= high) { + long mid = (low + high) >>> 1; + reverseTerms.fill(scratch, reverseAddresses.get(mid)); + int cmp = scratch.compareTo(text); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + return high; + } + + // binary search against first term in block range + // to find term's block + long binarySearchBlock(BytesRef text, long low, long high) throws IOException { + while (low <= high) { + long mid = (low + high) >>> 1; + input.seek(addresses.get(mid)); + term.length = input.readVInt(); + input.readBytes(term.bytes, 0, term.length); + int cmp = term.compareTo(text); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; + } + } + return high; + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + // locate block: narrow to block range with index, then search blocks + final long block; + long indexPos = binarySearchIndex(text); + if (indexPos < 0) { + block = 0; + } else { + long low = indexPos << Lucene54DocValuesFormat.BLOCK_INTERVAL_SHIFT; + long high = Math.min(numIndexValues - 1, low + Lucene54DocValuesFormat.BLOCK_INTERVAL_MASK); + block = Math.max(low, binarySearchBlock(text, low, high)); + } + + // position before block, then scan to term. 
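+ // (currentOrd is primed to one before the block's first ordinal, so the next() loop reads the block header and then walks term by term until it meets or passes the target)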
+ input.seek(addresses.get(block)); + currentOrd = (block << Lucene54DocValuesFormat.INTERVAL_SHIFT) - 1; + + while (next() != null) { + int cmp = term.compareTo(text); + if (cmp == 0) { + return SeekStatus.FOUND; + } else if (cmp > 0) { + return SeekStatus.NOT_FOUND; + } + } + return SeekStatus.END; + } + + @Override + public void seekExact(long ord) throws IOException { + long block = ord >>> Lucene54DocValuesFormat.INTERVAL_SHIFT; + if (block != currentOrd >>> Lucene54DocValuesFormat.INTERVAL_SHIFT) { + // switch to different block + input.seek(addresses.get(block)); + readHeader(); + } + + currentOrd = ord; + + int offset = (int) (ord & Lucene54DocValuesFormat.INTERVAL_MASK); + if (offset == 0) { + readFirstTerm(); + } else { + input.seek(currentBlockStart + offsets[offset - 1]); + readTerm(offset); + } + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public long ord() throws IOException { + return currentOrd; + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + return -1; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + } + } +} diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java index 3599b97ec16f2..43a24574297c3 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene60/Lucene60Codec.java @@ -24,12 +24,15 @@ import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat; import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat; import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec; import org.elasticsearch.xpack.lucene.bwc.codecs.lucene50.Lucene50SegmentInfoFormat; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat; import java.util.Objects; @@ -44,8 +47,14 @@ public class Lucene60Codec extends BWCCodec { private final SegmentInfoFormat segmentInfosFormat = wrap(new Lucene50SegmentInfoFormat()); private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat(); private final CompoundFormat compoundFormat = new Lucene50CompoundFormat(); - private final StoredFieldsFormat storedFieldsFormat; + private final DocValuesFormat defaultDocValuesFormat = new Lucene54DocValuesFormat(); + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDocValuesFormat; + } + }; /** * Instantiates a new codec. 
@@ -89,4 +98,10 @@ public final LiveDocsFormat liveDocsFormat() { public final CompoundFormat compoundFormat() { return compoundFormat; } + + @Override + public DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java index 804128dfe97ae..2f805a4881744 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene62/Lucene62Codec.java @@ -24,11 +24,14 @@ import org.apache.lucene.backward_codecs.lucene50.Lucene50StoredFieldsFormat; import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat; import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec; +import org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat; import java.util.Objects; @@ -44,6 +47,13 @@ public class Lucene62Codec extends BWCCodec { private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat(); private final CompoundFormat compoundFormat = new Lucene50CompoundFormat(); private final StoredFieldsFormat storedFieldsFormat; + private final DocValuesFormat defaultDocValuesFormat = new Lucene54DocValuesFormat(); + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDocValuesFormat; + } + }; public Lucene62Codec() { this(Lucene50StoredFieldsFormat.Mode.BEST_SPEED); @@ -78,4 +88,9 @@ public final LiveDocsFormat liveDocsFormat() { public final CompoundFormat compoundFormat() { return compoundFormat; } + + @Override + public DocValuesFormat docValuesFormat() { + return docValuesFormat; + } } diff --git a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java index a69622928518c..bc9fa098476c1 100644 --- a/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java +++ b/x-pack/plugin/old-lucene-versions/src/main/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene70/BWCLucene70Codec.java @@ -15,10 +15,12 @@ import org.apache.lucene.backward_codecs.lucene60.Lucene60FieldInfosFormat; import org.apache.lucene.backward_codecs.lucene70.Lucene70SegmentInfoFormat; import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec; public class BWCLucene70Codec extends BWCCodec { @@ -28,6 
+30,13 @@ public class BWCLucene70Codec extends BWCCodec { private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat(); private final CompoundFormat compoundFormat = new Lucene50CompoundFormat(); private final StoredFieldsFormat storedFieldsFormat; + private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene70"); + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + }; public BWCLucene70Codec() { super("BWCLucene70Codec"); @@ -58,4 +67,9 @@ public LiveDocsFormat liveDocsFormat() { public CompoundFormat compoundFormat() { return compoundFormat; } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } } diff --git a/x-pack/plugin/old-lucene-versions/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/x-pack/plugin/old-lucene-versions/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat new file mode 100644 index 0000000000000..2d46b4bca3d0c --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.elasticsearch.xpack.lucene.bwc.codecs.lucene54.Lucene54DocValuesFormat diff --git a/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesFormatTests.java b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesFormatTests.java new file mode 100644 index 0000000000000..db3393898fa75 --- /dev/null +++ b/x-pack/plugin/old-lucene-versions/src/test/java/org/elasticsearch/xpack/lucene/bwc/codecs/lucene54/Lucene54DocValuesFormatTests.java @@ -0,0 +1,22 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.lucene.bwc.codecs.lucene54; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BaseDocValuesFormatTestCase; +import org.apache.lucene.util.TestUtil; + +public class Lucene54DocValuesFormatTests extends BaseDocValuesFormatTestCase { + + private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene54DocValuesFormat()); + + @Override + protected Codec getCodec() { + return codec; + } +} diff --git a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java index 9385e92af6d38..3bddc60b36449 100644 --- a/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java +++ b/x-pack/qa/repository-old-versions/src/test/java/org/elasticsearch/oldrepos/OldRepositoryAccessIT.java @@ -25,10 +25,12 @@ import org.elasticsearch.client.RestClient; import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.client.indices.CloseIndexRequest; +import org.elasticsearch.client.indices.PutMappingRequest; import org.elasticsearch.client.searchable_snapshots.MountSnapshotRequest; import org.elasticsearch.cluster.SnapshotsInProgress; import org.elasticsearch.cluster.health.ClusterHealthStatus; import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.SecureString; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.ThreadContext; @@ -37,14 +39,20 @@ import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.sort.SortBuilders; +import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.snapshots.SnapshotInfo; import org.elasticsearch.snapshots.SnapshotState; import org.elasticsearch.test.hamcrest.ElasticsearchAssertions; import org.elasticsearch.test.rest.ESRestTestCase; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentFactory; +import org.elasticsearch.xcontent.json.JsonXContent; import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -90,7 +98,8 @@ public void runTest(boolean sourceOnlyRepository) throws IOException { ); int oldEsPort = Integer.parseInt(System.getProperty("tests.es.port")); - int numDocs = 5; + int numDocs = 10; + int extraDocs = 1; final Set expectedIds = new HashSet<>(); try ( RestHighLevelClient client = highLevelClient(adminClient()); @@ -99,12 +108,23 @@ public void runTest(boolean sourceOnlyRepository) throws IOException { try { Request createIndex = new Request("PUT", "/test"); int numberOfShards = randomIntBetween(1, 3); - createIndex.setJsonEntity(""" - {"settings":{"number_of_shards": %s}} - """.formatted(numberOfShards)); + + XContentBuilder settingsBuilder = XContentFactory.jsonBuilder().startObject().startObject("settings"); + settingsBuilder.field("index.number_of_shards", numberOfShards); + + // 6.5.0 started using soft-deletes, but it was only enabled by default on 7.0 + if (oldVersion.onOrAfter(Version.fromString("6.5.0")) + && oldVersion.before(Version.fromString("7.0.0")) + && randomBoolean()) { + settingsBuilder.field("index.soft_deletes.enabled", true); + } + + 
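// (randomizing the setting means snapshots are taken both with and without soft-deletes on those versions, so the deletes issued below end up represented either way in the snapshotted segments) +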
settingsBuilder.endObject().endObject(); + + createIndex.setJsonEntity(Strings.toString(settingsBuilder)); assertOK(oldEs.performRequest(createIndex)); - for (int i = 0; i < numDocs; i++) { + for (int i = 0; i < numDocs + extraDocs; i++) { String id = "testdoc" + i; expectedIds.add(id); Request doc = new Request("PUT", "/test/doc/" + id); @@ -113,6 +133,14 @@ public void runTest(boolean sourceOnlyRepository) throws IOException { assertOK(oldEs.performRequest(doc)); } + for (int i = 0; i < extraDocs; i++) { + String id = randomFrom(expectedIds); + expectedIds.remove(id); + Request doc = new Request("DELETE", "/test/doc/" + id); + doc.addParameter("refresh", "true"); + oldEs.performRequest(doc); + } + // register repo on old ES and take snapshot Request createRepoRequest = new Request("PUT", "/_snapshot/testrepo"); createRepoRequest.setJsonEntity(sourceOnlyRepository ? """ @@ -190,7 +218,7 @@ public void runTest(boolean sourceOnlyRepository) throws IOException { if (Build.CURRENT.isSnapshot()) { // restore / mount and check whether searches work - restoreMountAndVerify(numDocs, expectedIds, client, numberOfShards); + restoreMountAndVerify(numDocs, expectedIds, client, numberOfShards, sourceOnlyRepository); // close indices assertTrue( @@ -208,7 +236,7 @@ public void runTest(boolean sourceOnlyRepository) throws IOException { ); // restore / mount again - restoreMountAndVerify(numDocs, expectedIds, client, numberOfShards); + restoreMountAndVerify(numDocs, expectedIds, client, numberOfShards, sourceOnlyRepository); } } finally { IOUtils.closeWhileHandlingException( @@ -233,8 +261,13 @@ private static String sourceForDoc(int i) { } @SuppressWarnings("removal") - private void restoreMountAndVerify(int numDocs, Set expectedIds, RestHighLevelClient client, int numberOfShards) - throws IOException { + private void restoreMountAndVerify( + int numDocs, + Set expectedIds, + RestHighLevelClient client, + int numberOfShards, + boolean sourceOnlyRepository + ) throws IOException { // restore index RestoreSnapshotResponse restoreSnapshotResponse = client.snapshot() .restore( @@ -259,7 +292,7 @@ private void restoreMountAndVerify(int numDocs, Set expectedIds, RestHig ); // run a search against the index - assertDocs("restored_test", numDocs, expectedIds, client); + assertDocs("restored_test", numDocs, expectedIds, client, sourceOnlyRepository); // mount as full copy searchable snapshot RestoreSnapshotResponse mountSnapshotResponse = client.searchableSnapshots() @@ -285,7 +318,7 @@ private void restoreMountAndVerify(int numDocs, Set expectedIds, RestHig ); // run a search against the index - assertDocs("mounted_full_copy_test", numDocs, expectedIds, client); + assertDocs("mounted_full_copy_test", numDocs, expectedIds, client, sourceOnlyRepository); // mount as shared cache searchable snapshot mountSnapshotResponse = client.searchableSnapshots() @@ -300,11 +333,12 @@ private void restoreMountAndVerify(int numDocs, Set expectedIds, RestHig assertEquals(numberOfShards, mountSnapshotResponse.getRestoreInfo().successfulShards()); // run a search against the index - assertDocs("mounted_shared_cache_test", numDocs, expectedIds, client); + assertDocs("mounted_shared_cache_test", numDocs, expectedIds, client, sourceOnlyRepository); } @SuppressWarnings("removal") - private void assertDocs(String index, int numDocs, Set expectedIds, RestHighLevelClient client) throws IOException { + private void assertDocs(String index, int numDocs, Set expectedIds, RestHighLevelClient client, boolean sourceOnlyRepository) + throws 
IOException { // run a search against the index SearchResponse searchResponse = client.search(new SearchRequest(index), RequestOptions.DEFAULT); logger.info(searchResponse); @@ -318,21 +352,55 @@ private void assertDocs(String index, int numDocs, Set expectedIds, Rest assertTrue(Arrays.stream(searchResponse.getHits().getHits()).allMatch(SearchHit::hasSource)); // check that correct _source present for each document for (SearchHit h : searchResponse.getHits().getHits()) { - assertEquals(sourceForDoc(Integer.parseInt(h.getId().substring("testdoc".length()))), h.getSourceAsString()); + assertEquals(sourceForDoc(getIdAsNumeric(h.getId())), h.getSourceAsString()); } + String id = randomFrom(expectedIds); + int num = getIdAsNumeric(id); // run a search using runtime fields against the index searchResponse = client.search( new SearchRequest(index).source( SearchSourceBuilder.searchSource() - .query(QueryBuilders.matchQuery("val", 2)) + .query(QueryBuilders.matchQuery("val", num)) .runtimeMappings(Map.of("val", Map.of("type", "long"))) ), RequestOptions.DEFAULT ); logger.info(searchResponse); assertEquals(1, searchResponse.getHits().getTotalHits().value); - assertEquals("testdoc2", searchResponse.getHits().getHits()[0].getId()); - assertEquals(sourceForDoc(2), searchResponse.getHits().getHits()[0].getSourceAsString()); + assertEquals(id, searchResponse.getHits().getHits()[0].getId()); + assertEquals(sourceForDoc(num), searchResponse.getHits().getHits()[0].getSourceAsString()); + + if (sourceOnlyRepository == false) { + // check that doc values can be accessed by (reverse) sorting on numeric val field + // first add mapping for field (this will be done automatically in the future) + XContentBuilder mappingBuilder = JsonXContent.contentBuilder(); + mappingBuilder.startObject().startObject("properties").startObject("val"); + mappingBuilder.field("type", "long"); + mappingBuilder.endObject().endObject().endObject(); + assertTrue( + client.indices().putMapping(new PutMappingRequest(index).source(mappingBuilder), RequestOptions.DEFAULT).isAcknowledged() + ); + + // search using reverse sort on val + searchResponse = client.search( + new SearchRequest(index).source( + SearchSourceBuilder.searchSource() + .query(QueryBuilders.matchAllQuery()) + .sort(SortBuilders.fieldSort("val").order(SortOrder.DESC)) + ), + RequestOptions.DEFAULT + ); + logger.info(searchResponse); + // check sort order + assertEquals( + expectedIds.stream().sorted(Comparator.comparingInt(this::getIdAsNumeric).reversed()).collect(Collectors.toList()), + Arrays.stream(searchResponse.getHits().getHits()).map(SearchHit::getId).collect(Collectors.toList()) + ); + } + } + + private int getIdAsNumeric(String id) { + return Integer.parseInt(id.substring("testdoc".length())); } }
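The ordering assertion above sorts the expected IDs by their numeric suffix, descending, and compares that list against the hits returned under the doc-values sort on val. A small standalone illustration of the comparator (the IDs are made up), showing why numeric rather than lexicographic order is required, since "testdoc10" would otherwise sort after "testdoc1" but before "testdoc3":

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class SortOrderCheck {
    // same extraction as getIdAsNumeric in the test above
    static int idAsNumeric(String id) {
        return Integer.parseInt(id.substring("testdoc".length()));
    }

    public static void main(String[] args) {
        List<String> expected = Stream.of("testdoc3", "testdoc10", "testdoc7")
            .sorted(Comparator.comparingInt(SortOrderCheck::idAsNumeric).reversed())
            .collect(Collectors.toList());
        System.out.println(expected); // [testdoc10, testdoc7, testdoc3], numeric descending
    }
}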