From 07f1d1556851a0119ba86a22f7c311c57170cc82 Mon Sep 17 00:00:00 2001 From: Matthew Davis Date: Sun, 28 May 2023 11:10:08 -0400 Subject: [PATCH] add snappy compression for stored BSON documents and metadata by default. enable it to be turned off via an index flag in create/update index (#109) --- .../server/config/ServerIndexConfig.java | 4 ++++ .../io/zulia/client/command/UpdateIndex.java | 16 +++++++++++++++ .../client/config/ClientIndexConfig.java | 20 +++++++++++++++++-- zulia-common/src/main/proto/zulia_base.proto | 1 + zulia-common/src/main/proto/zulia_index.proto | 6 ++++++ zulia-server/build.gradle.kts | 2 +- .../index/DocumentScoredDocLeafHandler.java | 7 +++++++ .../server/index/ShardDocumentIndexer.java | 10 +++++++--- .../zulia/server/index/ZuliaIndexManager.java | 4 ++++ .../io/zulia/server/index/ZuliaShard.java | 13 ++++++++++-- .../io/zulia/server/test/node/IndexTest.java | 9 +++++++++ .../zulia/server/test/node/StartStopTest.java | 2 ++ 12 files changed, 86 insertions(+), 8 deletions(-) diff --git a/zulia-analyzer/src/main/java/io/zulia/server/config/ServerIndexConfig.java b/zulia-analyzer/src/main/java/io/zulia/server/config/ServerIndexConfig.java index 5b74099d..5133a5a8 100644 --- a/zulia-analyzer/src/main/java/io/zulia/server/config/ServerIndexConfig.java +++ b/zulia-analyzer/src/main/java/io/zulia/server/config/ServerIndexConfig.java @@ -211,6 +211,10 @@ public int getRAMBufferMB() { return indexSettings.getRamBufferMB(); } + public boolean isCompressionEnabled() { + return !indexSettings.getDisableCompression(); + } + public Set getMatchingFields(String field) { return getMatchingIndexFields(field, true); } diff --git a/zulia-client/src/main/java/io/zulia/client/command/UpdateIndex.java b/zulia-client/src/main/java/io/zulia/client/command/UpdateIndex.java index a9d51ad7..fc44f554 100644 --- a/zulia-client/src/main/java/io/zulia/client/command/UpdateIndex.java +++ b/zulia-client/src/main/java/io/zulia/client/command/UpdateIndex.java @@ -46,6 +46,8 @@ public class UpdateIndex extends SimpleCommand analyzerSettingsList = Collections.emptyList(); @@ -311,6 +313,15 @@ public UpdateIndex setRamBufferMB(Integer ramBufferMB) { return this; } + public Boolean getDisableCompression() { + return disableCompression; + } + + public UpdateIndex setDisableCompression(Boolean disableCompression) { + this.disableCompression = disableCompression; + return this; + } + public Integer getNumberOfReplicas() { return numberOfReplicas; } @@ -482,6 +493,11 @@ public UpdateIndexRequest getRequest() { updateIndexSettings.setRamBufferMB(ramBufferMB); } + if (disableCompression != null) { + updateIndexSettings.setSetDisableCompression(true); + updateIndexSettings.setDisableCompression(disableCompression); + } + updateIndexSettings.setMetaUpdateOperation(metaDataOperation); if (!metadata.isEmpty()) { updateIndexSettings.setMetadata(ZuliaUtil.mongoDocumentToByteString(metadata)); diff --git a/zulia-client/src/main/java/io/zulia/client/config/ClientIndexConfig.java b/zulia-client/src/main/java/io/zulia/client/config/ClientIndexConfig.java index 386245fc..28581158 100644 --- a/zulia-client/src/main/java/io/zulia/client/config/ClientIndexConfig.java +++ b/zulia-client/src/main/java/io/zulia/client/config/ClientIndexConfig.java @@ -38,6 +38,8 @@ public class ClientIndexConfig { private Integer ramBufferMB; private Integer numberOfReplicas; + private Boolean disableCompression; + private TreeMap fieldMap; private TreeMap analyzerSettingsMap; @@ -102,6 +104,11 @@ public Integer getNumberOfShards() { return numberOfShards; } + public ClientIndexConfig setNumberOfShards(Integer numberOfShards) { + this.numberOfShards = numberOfShards; + return this; + } + public Integer getRamBufferMB() { return ramBufferMB; } @@ -111,8 +118,12 @@ public ClientIndexConfig setRamBufferMB(Integer ramBufferMB) { return this; } - public ClientIndexConfig setNumberOfShards(Integer numberOfShards) { - this.numberOfShards = numberOfShards; + public Boolean getDisableCompression() { + return disableCompression; + } + + public ClientIndexConfig setDisableCompression(Boolean disableCompression) { + this.disableCompression = disableCompression; return this; } @@ -337,6 +348,10 @@ public IndexSettings getIndexSettings() { isb.setRamBufferMB(ramBufferMB); } + if (disableCompression != null) { + isb.setDisableCompression(disableCompression); + } + if (meta != null) { isb.setMeta(ZuliaUtil.mongoDocumentToByteString(meta)); } @@ -394,6 +409,7 @@ public void configure(IndexSettings indexSettings) { this.indexWeight = indexSettings.getIndexWeight(); this.ramBufferMB = indexSettings.getRamBufferMB(); + this.disableCompression = indexSettings.getDisableCompression(); this.meta = ZuliaUtil.byteStringToMongoDocument(indexSettings.getMeta()); diff --git a/zulia-common/src/main/proto/zulia_base.proto b/zulia-common/src/main/proto/zulia_base.proto index 2ef77fc5..eaee471b 100644 --- a/zulia-common/src/main/proto/zulia_base.proto +++ b/zulia-common/src/main/proto/zulia_base.proto @@ -7,6 +7,7 @@ message IdInfo { uint64 timestamp = 2; uint32 majorVersion = 3; uint32 minorVersion = 4; + bool compressedDoc = 5; } enum MasterSlaveSettings { diff --git a/zulia-common/src/main/proto/zulia_index.proto b/zulia-common/src/main/proto/zulia_index.proto index 0f740862..19311ff9 100644 --- a/zulia-common/src/main/proto/zulia_index.proto +++ b/zulia-common/src/main/proto/zulia_index.proto @@ -59,6 +59,8 @@ message IndexSettings { repeated FieldMapping fieldMapping = 21; + bool disableCompression = 22; + } @@ -117,6 +119,10 @@ message UpdateIndexSettings { repeated FieldMapping fieldMapping = 29; Operation fieldMappingOperation = 30; // keyed by alias + + bool setDisableCompression = 31; + bool disableCompression = 32; + } diff --git a/zulia-server/build.gradle.kts b/zulia-server/build.gradle.kts index 96dc1ad6..09d90769 100644 --- a/zulia-server/build.gradle.kts +++ b/zulia-server/build.gradle.kts @@ -47,7 +47,7 @@ dependencies { implementation("org.mongodb:mongodb-driver-sync:$mongoDriverVersion") implementation("org.apache.commons:commons-compress:1.22") - implementation("org.xerial.snappy:snappy-java:1.1.9.1") + implementation("org.xerial.snappy:snappy-java:1.1.10.0") implementation(platform("software.amazon.awssdk:bom:$amazonVersion")) implementation("software.amazon.awssdk:s3") diff --git a/zulia-server/src/main/java/io/zulia/server/index/DocumentScoredDocLeafHandler.java b/zulia-server/src/main/java/io/zulia/server/index/DocumentScoredDocLeafHandler.java index 7400e80a..3e8d6117 100644 --- a/zulia-server/src/main/java/io/zulia/server/index/DocumentScoredDocLeafHandler.java +++ b/zulia-server/src/main/java/io/zulia/server/index/DocumentScoredDocLeafHandler.java @@ -19,6 +19,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.util.BytesRef; +import org.xerial.snappy.Snappy; import java.io.IOException; import java.util.ArrayList; @@ -119,6 +120,9 @@ protected ZuliaQuery.ScoredResult handleDocument(LeafReaderContext currentLeaf, if (meta) { if (metaDocValues != null && metaDocValues.advanceExact(localDocId)) { byte[] metaBytes = BytesRefUtil.getByteArray(metaDocValues.binaryValue()); + if (idInfo.getCompressedDoc()) { + metaBytes = Snappy.uncompress(metaBytes); + } rdBuilder.setMetadata(ByteString.copyFrom(metaBytes)); } } @@ -126,6 +130,9 @@ protected ZuliaQuery.ScoredResult handleDocument(LeafReaderContext currentLeaf, if (full) { if (fullDocValues != null && fullDocValues.advanceExact(localDocId)) { byte[] docBytes = BytesRefUtil.getByteArray(fullDocValues.binaryValue()); + if (idInfo.getCompressedDoc()) { + docBytes = Snappy.uncompress(docBytes); + } rdBuilder.setDocument(ByteString.copyFrom(docBytes)); if (needsHighlight || needsAnalysis || needsDocFiltering) { diff --git a/zulia-server/src/main/java/io/zulia/server/index/ShardDocumentIndexer.java b/zulia-server/src/main/java/io/zulia/server/index/ShardDocumentIndexer.java index 6318f2f2..e3378e90 100644 --- a/zulia-server/src/main/java/io/zulia/server/index/ShardDocumentIndexer.java +++ b/zulia-server/src/main/java/io/zulia/server/index/ShardDocumentIndexer.java @@ -39,6 +39,7 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; +import org.xerial.snappy.Snappy; import java.io.IOException; import java.nio.ByteBuffer; @@ -80,18 +81,21 @@ public Document getIndexDocument(String uniqueId, long timestamp, DocumentContai luceneDocument.add(new SortedSetDocValuesField(idSortField, new BytesRef(uniqueId))); luceneDocument.add(new LongPoint(ZuliaFieldConstants.TIMESTAMP_FIELD, timestamp)); + boolean compressionEnabled = indexConfig.isCompressionEnabled(); ZuliaBase.IdInfo idInfo = ZuliaBase.IdInfo.newBuilder().setId(uniqueId).setTimestamp(timestamp).setMajorVersion(majorVersion) - .setMinorVersion(minorVersion).build(); + .setMinorVersion(minorVersion).setCompressedDoc(compressionEnabled).build(); byte[] idInfoBytes = idInfo.toByteArray(); luceneDocument.add(new BinaryDocValuesField(ZuliaFieldConstants.STORED_ID_FIELD, new BytesRef(idInfoBytes))); if (metadata.hasDocument()) { - luceneDocument.add(new BinaryDocValuesField(ZuliaFieldConstants.STORED_META_FIELD, new BytesRef(metadata.getByteArray()))); + byte[] bytes = compressionEnabled ? Snappy.compress(metadata.getByteArray()) : metadata.getByteArray(); + luceneDocument.add(new BinaryDocValuesField(ZuliaFieldConstants.STORED_META_FIELD, new BytesRef(bytes))); } if (mongoDocument.hasDocument()) { - luceneDocument.add(new BinaryDocValuesField(ZuliaFieldConstants.STORED_DOC_FIELD, new BytesRef(mongoDocument.getByteArray()))); + byte[] bytes = compressionEnabled ? Snappy.compress(mongoDocument.getByteArray()) : mongoDocument.getByteArray(); + luceneDocument.add(new BinaryDocValuesField(ZuliaFieldConstants.STORED_DOC_FIELD, new BytesRef(bytes))); addUserFields(mongoDocument.getDocument(), luceneDocument, taxoWriter); } diff --git a/zulia-server/src/main/java/io/zulia/server/index/ZuliaIndexManager.java b/zulia-server/src/main/java/io/zulia/server/index/ZuliaIndexManager.java index 78626ab3..5a459d5f 100644 --- a/zulia-server/src/main/java/io/zulia/server/index/ZuliaIndexManager.java +++ b/zulia-server/src/main/java/io/zulia/server/index/ZuliaIndexManager.java @@ -553,6 +553,10 @@ public UpdateIndexResponse updateIndex(UpdateIndexRequest request) throws Except existingSettings.setIndexWeight(updateIndexSettings.getIndexWeight()); } + if (updateIndexSettings.getSetDisableCompression()) { + existingSettings.setDisableCompression(updateIndexSettings.getDisableCompression()); + } + Operation metaUpdateOperation = updateIndexSettings.getMetaUpdateOperation(); if (metaUpdateOperation.getEnable()) { Document existingMeta = ZuliaUtil.byteStringToMongoDocument(existingSettings.getMeta()); diff --git a/zulia-server/src/main/java/io/zulia/server/index/ZuliaShard.java b/zulia-server/src/main/java/io/zulia/server/index/ZuliaShard.java index 3bd3b380..42d2fe66 100644 --- a/zulia-server/src/main/java/io/zulia/server/index/ZuliaShard.java +++ b/zulia-server/src/main/java/io/zulia/server/index/ZuliaShard.java @@ -12,6 +12,7 @@ import io.zulia.server.search.ShardQuery; import io.zulia.server.util.BytesRefUtil; import org.apache.lucene.search.Query; +import org.xerial.snappy.Snappy; import java.io.IOException; import java.util.EnumSet; @@ -157,8 +158,16 @@ public void reindex() throws IOException { String uniqueId = idInfo.getId(); - DocumentContainer metadata = new DocumentContainer(d.meta()); - DocumentContainer mongoDocument = new DocumentContainer(d.fullDoc()); + DocumentContainer metadata; + DocumentContainer mongoDocument; + if (idInfo.getCompressedDoc()) { + metadata = new DocumentContainer(d.meta() != null ? Snappy.uncompress(BytesRefUtil.getByteArray(d.meta())) : null); + mongoDocument = new DocumentContainer(d.fullDoc() != null ? Snappy.uncompress(BytesRefUtil.getByteArray(d.fullDoc())) : null); + } + else { + metadata = new DocumentContainer(d.meta()); + mongoDocument = new DocumentContainer(d.fullDoc()); + } if (!trackedIds.contains(uniqueId)) { shardWriteManager.indexDocument(uniqueId, timestamp, mongoDocument, metadata); diff --git a/zulia-server/src/test/java/io/zulia/server/test/node/IndexTest.java b/zulia-server/src/test/java/io/zulia/server/test/node/IndexTest.java index a87c9674..2c8d9481 100644 --- a/zulia-server/src/test/java/io/zulia/server/test/node/IndexTest.java +++ b/zulia-server/src/test/java/io/zulia/server/test/node/IndexTest.java @@ -55,6 +55,7 @@ public void createIndex() throws Exception { .setPinToCache(true)); indexConfig.setIndexName(INDEX_TEST); + indexConfig.setDisableCompression(true); indexConfig.setNumberOfShards(1); zuliaWorkPool.createIndex(indexConfig); @@ -70,6 +71,8 @@ public void createIndex() throws Exception { ClientIndexConfig indexConfigFromServer = zuliaWorkPool.getIndexConfig(INDEX_TEST).getIndexConfig(); + Assertions.assertTrue(indexConfigFromServer.getDisableCompression()); + Assertions.assertEquals(4, indexConfigFromServer.getFieldConfigMap().size()); ZuliaIndex.FieldConfig idFieldConfig = indexConfigFromServer.getFieldConfig("id"); @@ -139,6 +142,7 @@ public void createIndex() throws Exception { new Search(INDEX_TEST).setSearchLabel("searching for cash").addQuery(new ScoredQuery("title:cash")).setPinToCache(true)); indexConfig.addFieldMapping(new FieldMapping("title").addMappedFields("category").includeSelf()); indexConfig.addFieldMapping(new FieldMapping("test").addMappedFields("title", "category")); + indexConfig.setDisableCompression(false); zuliaWorkPool.createIndex(indexConfig); @@ -146,6 +150,9 @@ public void createIndex() throws Exception { { ClientIndexConfig indexConfigFromServer = zuliaWorkPool.getIndexConfig(INDEX_TEST).getIndexConfig(); + + Assertions.assertFalse(indexConfigFromServer.getDisableCompression()); + Assertions.assertEquals(indexConfigFromServer.getFieldConfigMap().size(), 3); List defaultSearchFields = indexConfigFromServer.getDefaultSearchFields(); @@ -163,6 +170,7 @@ public void updateIndex() throws Exception { { UpdateIndex updateIndex = new UpdateIndex(INDEX_TEST); updateIndex.setIndexWeight(4); + updateIndex.setDisableCompression(true); FieldConfigBuilder newField = FieldConfigBuilder.createString("newField").indexAs(DefaultAnalyzers.LC_KEYWORD).sort(); updateIndex.mergeFieldConfig(newField); @@ -176,6 +184,7 @@ public void updateIndex() throws Exception { ClientIndexConfig indexConfigFromServer = zuliaWorkPool.getIndexConfig(INDEX_TEST).getIndexConfig(); Assertions.assertEquals(4, indexConfigFromServer.getIndexWeight()); + Assertions.assertTrue(indexConfigFromServer.getDisableCompression()); Assertions.assertEquals(4, indexConfigFromServer.getFieldConfigMap().size()); ZuliaIndex.FieldConfig newField = indexConfigFromServer.getFieldConfig("newField"); Assertions.assertEquals(1, newField.getSortAsCount()); diff --git a/zulia-server/src/test/java/io/zulia/server/test/node/StartStopTest.java b/zulia-server/src/test/java/io/zulia/server/test/node/StartStopTest.java index 16f61471..e409bc4c 100644 --- a/zulia-server/src/test/java/io/zulia/server/test/node/StartStopTest.java +++ b/zulia-server/src/test/java/io/zulia/server/test/node/StartStopTest.java @@ -74,6 +74,7 @@ public void createIndex() throws Exception { indexConfig.setIndexName(FACET_TEST_INDEX); indexConfig.setNumberOfShards(1); indexConfig.setShardCommitInterval(20); //force some commits + indexConfig.setDisableCompression(true); //optional meta indexConfig.setMeta(new Document().append("createTime", new Date()).append("myLabel", "greatLabel")); @@ -386,6 +387,7 @@ public void reindex() throws Exception { indexConfig.addFieldConfig(FieldConfigBuilder.createBool("testBool").index().facet().sort()); indexConfig.setIndexName(FACET_TEST_INDEX); indexConfig.setNumberOfShards(1); + indexConfig.setDisableCompression(false); // default values, just for clarity zuliaWorkPool.createIndex(indexConfig);