diff --git a/treccovid_semantic_search/README.md b/treccovid_semantic_search/README.md
new file mode 100644
index 00000000..1aecb084
--- /dev/null
+++ b/treccovid_semantic_search/README.md
@@ -0,0 +1,261 @@
+## Trec-Covid Semantic Search workload
+
+This workload uses an OpenSearch pretrained model and the ml-commons plugin to generate vector embeddings. It is based on the neural search tutorial: https://opensearch.org/docs/latest/search-plugins/neural-search-tutorial/
+
+### Dataset
+
+Trec-Covid is a collection of documents about COVID-19.
+- Trec-Covid website: https://ir.nist.gov/covidSubmit/index.html
+- Dataset: https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip
+
+### Example document and query
+```json
+{
+  "_id": "2b73a28n",
+  "title": "Role of endothelin-1 in lung disease",
+  "text": "Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases.....",
+  "metadata": {
+    "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/",
+    "pubmed_id": "11686871"
+  }
+}
+```
+```json
+{
+  "query": {
+    "neural": {
+      "passage_embedding": {
+        "query_text": "what types of rapid testing for Covid-19 have been developed?",
+        "model_id": "LSmIG44BlTi78mODPYgy",
+        "k": 10
+      }
+    }
+  }
+}
+```
+
+### Procedures
+
+#### Index, force-merge and search
+
+This procedure runs all tasks of this workload. It first deletes the current index and model, then indexes the corpus with vector embeddings, force-merges the segments, and finally runs the match-all and semantic search queries.
+
+### Workload tasks
+
+- cluster-settings
+- delete-index
+- delete-ingest-pipeline
+- delete-ml-model
+- register-ml-model
+- deploy-ml-model
+- create-ingest-pipeline
+- create-index
+- check-cluster-health
+- index-append
+- refresh-after-index
+- force-merge
+- refresh-after-force-merge
+- wait-until-merges-finish
+- default
+- semantic-search
+
+### Parameters
+
+This workload allows [specifying the following parameters](#specifying-workload-parameters) using the `--workload-params` option to OpenSearch Benchmark:
+
+* `bulk_size` (default: 100)
+* `bulk_indexing_clients` (default: 1): Number of clients that issue bulk indexing requests.
+* `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested.
+* `number_of_replicas` (default: 0)
+* `number_of_shards` (default: 1)
+* `query_cache_enabled` (default: false)
+* `requests_cache_enabled` (default: false)
+* `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index.
+* `force_merge_max_num_segments` (default: unset): An integer specifying the maximum number of segments the force-merge operation should produce.
+* `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly.
+* `cluster_health` (default: "green"): The minimum required cluster health.
+* `error_level` (default: "non-fatal"): Available for bulk operations only; specifies the ignore-response-error-level.
+* `target_throughput` (default: operation-specific): Target number of requests per second; `""` means no limit.
+* `search_clients`: Number of clients that issue search requests.
+* `model_name` (default: huggingface/sentence-transformers/all-mpnet-base-v2): The OpenSearch-provided pretrained model name.
+* `model_version` (default: 1.0.1): The model version.
+* `model_format` (default: TORCH_SCRIPT): The model format.
+* `dimensions` (default: 768): Vector dimensionality; must match the model (see the mapping sketch after this list).
+* `engine` (default: lucene): The approximate k-NN library to use for indexing and search.
+* `method` (default: hnsw): The k-NN search algorithm.
+* `space_type` (default: l2): The vector space used to calculate the distance between vectors.
+* `k` (default: 10): Number of nearest neighbors to return.
+* `warmup_iterations`: Number of warmup iterations each search client executes.
+* `iterations`: Number of test iterations each search client executes.
+* `num_variable_queries` (default: 0): Number of distinct queries used for the semantic-search task; 0 means a single fixed query, and the maximum value is 50.
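+
+The k-NN parameters above are templated into the `knn_vector` field mapping in `index.json`. For reference only (a sketch of the resulting mapping with all defaults filled in, not an additional file in this change):
+```json
+"passage_embedding": {
+  "type": "knn_vector",
+  "dimension": 768,
+  "method": {
+    "engine": "lucene",
+    "space_type": "l2",
+    "name": "hnsw",
+    "parameters": {}
+  }
+}
+```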
+
+### Specifying Workload Parameters
+
+Example:
+```json
+{
+  "index_settings": {
+    "index.number_of_shards": 1,
+    "index.number_of_replicas": 0
+  },
+  "bulk_indexing_clients": 2,
+  "ingest_percentage": 20,
+  "search_clients": 10,
+  "target_throughput": "",
+  "iterations": 100,
+  "warmup_iterations": 100,
+  "k": 100,
+  "num_variable_queries": 50
+}
+```
+
+Save it as `params.json` and provide it to OpenSearch Benchmark with `--workload-params="/path/to/params.json"`. Overrides for simple parameters can also be specified inline, for example `--workload-params=search_clients:2`.
+
+### Sample command and output
+
+```
+./opensearch-benchmark execute-test --workload=treccovid_semantic_search \
+ --target-hosts=:9200 --pipeline=benchmark-only --workload-params=params.json
+
+ ____ _____ __ ____ __ __
+ / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__
+ / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/
+/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,<
+\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_|
+ /_/
+
+[INFO] [Test Execution ID]: b6117408-73b8-4fc0-ba5d-f324cb3e1844
+[INFO] Executing test with workload [treccovid_semantic_search], test_procedure [index-merge-search] and provision_config_instance ['external'] with version [2.13.0].
+
+Running cluster-settings [100% done]
+Running delete-index [100% done]
+Running delete-ingest-pipeline [100% done]
+Running delete-ml-model [100% done]
+Running register-ml-model [100% done]
+Running deploy-ml-model [100% done]
+Running create-ingest-pipeline [100% done]
+Running create-index [100% done]
+Running check-cluster-health [100% done]
+Running index-append [100% done]
+Running refresh-after-index [100% done]
+Running force-merge [100% done]
+Running refresh-after-force-merge [100% done]
+Running wait-until-merges-finish [100% done]
+Running default [100% done]
+Running semantic-search [100% done]
+
+------------------------------------------------------
+ _______ __ _____
+ / ____(_)___ ____ _/ / / ___/_________ ________
+ / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \
+ / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/
+/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/
+------------------------------------------------------
+
+| Metric | Task | Value | Unit |
+|---------------------------------------------------------------:|-------------------------:|------------:|-------:|
+| Cumulative indexing time of primary shards | | 0.433717 | min |
+| Min cumulative indexing time across primary shards | | 0 | min |
+| Median cumulative indexing time across primary shards | | 0.00015 | min |
+| Max cumulative indexing time across primary shards | | 0.171 | min |
+| Cumulative indexing throttle time of primary shards | | 0 | min |
+| Min cumulative indexing throttle time across primary shards | | 0 | min |
+| Median cumulative indexing throttle time across primary shards | | 0 | min |
+| Max cumulative indexing throttle time across primary shards | | 0 | min |
+| Cumulative merge time of primary shards | | 0.374233 | min |
+| Cumulative merge count of primary shards | | 8 | |
+| Min cumulative merge time across primary shards | | 0 | min |
+| Median cumulative merge time across primary shards | | 0.00055 | min |
+| Max cumulative merge time across primary shards | | 0.345033 | min |
+| Cumulative merge throttle time of primary shards | | 0.33885 | min |
+| Min cumulative merge throttle time across primary shards | | 0 | min |
+| Median cumulative merge throttle time across primary shards | | 0 | min |
+| Max cumulative merge throttle time across primary shards | | 0.33885 | min |
+| Cumulative refresh time of primary shards | | 0.10995 | min |
+| Cumulative refresh count of primary shards | | 162 | |
+| Min cumulative refresh time across primary shards | | 0 | min |
+| Median cumulative refresh time across primary shards | | 0.000783333 | min |
+| Max cumulative refresh time across primary shards | | 0.0343667 | min |
+| Cumulative flush time of primary shards | | 0.00885 | min |
+| Cumulative flush count of primary shards | | 4 | |
+| Min cumulative flush time across primary shards | | 0 | min |
+| Median cumulative flush time across primary shards | | 0 | min |
+| Max cumulative flush time across primary shards | | 0.00885 | min |
+| Total Young Gen GC time | | 0.523 | s |
+| Total Young Gen GC count | | 24 | |
+| Total Old Gen GC time | | 0 | s |
+| Total Old Gen GC count | | 0 | |
+| Store size | | 2.18146 | GB |
+| Translog size | | 0.0721766 | GB |
+| Heap used for segments | | 0 | MB |
+| Heap used for doc values | | 0 | MB |
+| Heap used for terms | | 0 | MB |
+| Heap used for norms | | 0 | MB |
+| Heap used for points | | 0 | MB |
+| Heap used for stored fields | | 0 | MB |
+| Segment count | | 50 | |
+| Min Throughput | index-append | 108.82 | docs/s |
+| Mean Throughput | index-append | 110.47 | docs/s |
+| Median Throughput | index-append | 110.6 | docs/s |
+| Max Throughput | index-append | 111.68 | docs/s |
+| 50th percentile latency | index-append | 3465.01 | ms |
+| 90th percentile latency | index-append | 3588.01 | ms |
+| 100th percentile latency | index-append | 3764.87 | ms |
+| 50th percentile service time | index-append | 3465.01 | ms |
+| 90th percentile service time | index-append | 3588.01 | ms |
+| 100th percentile service time | index-append | 3764.87 | ms |
+| error rate | index-append | 0 | % |
+| Min Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| Mean Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| Median Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| Max Throughput | wait-until-merges-finish | 90.88 | ops/s |
+| 100th percentile latency | wait-until-merges-finish | 10.6818 | ms |
+| 100th percentile service time | wait-until-merges-finish | 10.6818 | ms |
+| error rate | wait-until-merges-finish | 0 | % |
+| Min Throughput | default | 1030.78 | ops/s |
+| Mean Throughput | default | 1030.78 | ops/s |
+| Median Throughput | default | 1030.78 | ops/s |
+| Max Throughput | default | 1030.78 | ops/s |
+| 50th percentile latency | default | 8.11098 | ms |
+| 90th percentile latency | default | 10.5718 | ms |
+| 99th percentile latency | default | 12.5866 | ms |
+| 99.9th percentile latency | default | 13.8164 | ms |
+| 100th percentile latency | default | 14.1444 | ms |
+| 50th percentile service time | default | 8.11098 | ms |
+| 90th percentile service time | default | 10.5718 | ms |
+| 99th percentile service time | default | 12.5866 | ms |
+| 99.9th percentile service time | default | 13.8164 | ms |
+| 100th percentile service time | default | 14.1444 | ms |
+| error rate | default | 0 | % |
+| Min Throughput | semantic-search | 110.75 | ops/s |
+| Mean Throughput | semantic-search | 112.87 | ops/s |
+| Median Throughput | semantic-search | 112.98 | ops/s |
+| Max Throughput | semantic-search | 114.51 | ops/s |
+| 50th percentile latency | semantic-search | 82.0484 | ms |
+| 90th percentile latency | semantic-search | 99.8155 | ms |
+| 99th percentile latency | semantic-search | 125.478 | ms |
+| 99.9th percentile latency | semantic-search | 139.749 | ms |
+| 100th percentile latency | semantic-search | 144.083 | ms |
+| 50th percentile service time | semantic-search | 82.0484 | ms |
+| 90th percentile service time | semantic-search | 99.8155 | ms |
+| 99th percentile service time | semantic-search | 125.478 | ms |
+| 99.9th percentile service time | semantic-search | 139.749 | ms |
+| 100th percentile service time | semantic-search | 144.083 | ms |
+| error rate | semantic-search | 0 | % |
+
+
+---------------------------------
+[INFO] SUCCESS (took 266 seconds)
+```
+
+### License
+
+The data uses the same license as the original dataset.
+```
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+```
+Trec-Covid [1] is part of the COVID-19 Open Research Dataset [2], which is licensed under Apache 2.0.
+[1] https://arxiv.org/pdf/2005.04474v1.pdf
+[2] https://github.com/allenai/cord19/
diff --git a/treccovid_semantic_search/files.txt b/treccovid_semantic_search/files.txt
new file mode 100644
index 00000000..20fb5d6f
--- /dev/null
+++ b/treccovid_semantic_search/files.txt
@@ -0,0 +1,2 @@
+documents.json.bz2
+queries.json.bz2
diff --git a/treccovid_semantic_search/index.json b/treccovid_semantic_search/index.json
new file mode 100644
index 00000000..3ba5b385
--- /dev/null
+++ b/treccovid_semantic_search/index.json
@@ -0,0 +1,36 @@
+{
+  "settings": {
+    {%- if number_of_shards is defined %}
+    "index.number_of_shards": {{number_of_shards}},
+    {%- endif %}
+    {%- if number_of_replicas is defined %}
+    "index.number_of_replicas": {{number_of_replicas}},
+    {%- endif %}
+    "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
+    "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
+    "index.knn": true,
+    "default_pipeline": "nlp-ingest-pipeline"
+  },
+  "mappings": {
+    "properties": {
+      "id": {
+        "type": "text"
+      },
+      "passage_embedding": {
+        "type": "knn_vector",
+        "dimension": {{dimensions | default(768)}},
+        "method": {
+          {%- if engine is defined %}
+          "engine": "{{engine}}",
+          {%- endif %}
+          "space_type": "{{space_type | default('l2')}}",
+          "name": "{{method | default('hnsw')}}",
+          "parameters": {}
+        }
+      },
+      "text": {
+        "type": "text"
+      }
+    }
+  }
+}
diff --git a/treccovid_semantic_search/operations/default.json b/treccovid_semantic_search/operations/default.json
new file mode 100644
index 00000000..ac3a65e7
--- /dev/null
+++ b/treccovid_semantic_search/operations/default.json
@@ -0,0 +1,72 @@
+  {
+    "name": "delete-ingest-pipeline",
+    "operation-type": "delete-pipeline",
+    "id": "nlp-ingest-pipeline"
+  },
+  {
+    "name": "create-ingest-pipeline",
+    "operation-type": "put-pipeline",
+    "param-source": "create-ingest-pipeline",
+    "id": "nlp-ingest-pipeline",
+    "body": {
+      "description": "An NLP ingest pipeline",
+      "processors": [
+        {
+          "text_embedding": {
+            "model_id": "",
+            "field_map": {
+              "text": "passage_embedding"
+            }
+          }
+        }
+      ]
+    }
+  },
+  {
+    "name": "index-append",
+    "operation-type": "bulk",
+    "bulk-size": {{bulk_size | default(100)}},
+    "ingest-percentage": {{ingest_percentage | default(100)}}
+  },
+  {
+    "name": "wait-until-merges-finish",
+    "operation-type": "index-stats",
+    "index": "_all",
+    "condition": {
+      "path": "_all.total.merges.current",
+      "expected-value": 0
+    },
+    "retry-until-success": true,
+    "include-in-reporting": false
+  },
+  {
+    "name": "default",
+    "operation-type": "search",
+    "body": {
+      "query": {
+        "match_all": {}
+      }
+    }
+  },
+  {
+    "name": "semantic-search",
+    "operation-type": "search",
+    "variable-queries": {{num_variable_queries | default(0)}},
+    "param-source": "semantic-search-source",
+    "body": {
+      "_source": {
+        "excludes": [
+          "passage_embedding"
+        ]
+      },
+      "query": {
+        "neural": {
+          "passage_embedding": {
+            "query_text": "what types of rapid testing for Covid-19 have been developed?",
+            "model_id": "",
+            "k": {{k | default(10)}}
+          }
+        }
+      }
+    }
+  }
diff --git a/treccovid_semantic_search/test_procedures/default.json b/treccovid_semantic_search/test_procedures/default.json
new file mode 100644
index 00000000..12c1f675
--- /dev/null
+++ b/treccovid_semantic_search/test_procedures/default.json
@@ -0,0 +1,112 @@
+  {
+    "name": "index-merge-search",
+    "description": "Indexes the corpus with vector embeddings and then runs semantic search queries.",
+    "default": true,
+    "schedule": [
+      {
+        "name": "cluster-settings",
+        "operation": {
+          "operation-type": "put-settings",
+          "body": {
+            "persistent": {
+              "plugins": {
+                "ml_commons": {
+                  "only_run_on_ml_node": "false",
+                  "native_memory_threshold": "99",
+                  "allow_registering_model_via_local_file": "true",
+                  "allow_registering_model_via_url": "true"
+                }
+              }
+            }
+          }
+        }
+      },
+      {
+        "operation": "delete-index"
+      },
+      {
+        "operation": "delete-ingest-pipeline"
+      },
+      {
+        "operation": {
+          "operation-type": "delete-ml-model",
+          "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}"
+        }
+      },
+      {
+        "operation": {
+          "operation-type": "register-ml-model",
+          "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}",
+          "model-version": "{{ model_version | default('1.0.1') }}",
+          "model-format": "{{ model_format | default('TORCH_SCRIPT') }}",
+          "model-config-file": "{{ model_config_file | default('') }}"
+        }
+      },
+      {
+        "operation": "deploy-ml-model"
+      },
+      {
+        "operation": "create-ingest-pipeline"
+      },
+      {
+        "operation": {
+          "operation-type": "create-index",
+          "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} {
+            "index.refresh_interval": "5s",
+            "index.translog.flush_threshold_size": "1g"
+          }{%- endif %}
+        }
+      },
+      {
+        "name": "check-cluster-health",
+        "operation": {
+          "operation-type": "cluster-health",
+          "index": "treccovid",
+          "request-params": {
+            "wait_for_status": "{{cluster_health | default('green')}}",
+            "wait_for_no_relocating_shards": "true"
+          },
+          "retry-until-success": true
+        }
+      },
+      {
+        "operation": "index-append",
+        "warmup-time-period": 60,
+        "clients": {{bulk_indexing_clients | default(1)}},
+        "ignore-response-error-level": "{{error_level | default('non-fatal')}}"
+      },
+      {
+        "name": "refresh-after-index",
+        "operation": "refresh"
+      },
+      {
+        "operation": {
+          "operation-type": "force-merge",
+          "request-timeout": 7200{%- if force_merge_max_num_segments is defined %},
+          "max-num-segments": {{ force_merge_max_num_segments | tojson }}
+          {%- endif %}
+        }
+      },
+      {
+        "name": "refresh-after-force-merge",
+        "operation": "refresh"
+      },
+      {
+        "operation": "wait-until-merges-finish"
+      },
+      {
+        "operation": "default",
+        "warmup-iterations": {{warmup_iterations | default(500) | tojson}},
+        "iterations": {{iterations | default(500) | tojson }},
+        "target-throughput": {{ target_throughput | default(100) | tojson}},
+        "clients": {{ search_clients | default(1) }}
+      },
+      {
+        "operation": "semantic-search",
+        "warmup-iterations": {{warmup_iterations | default(100) | tojson}},
+        "iterations": {{iterations | default(100) | tojson }},
+        "target-throughput": {{ target_throughput | default(10) | tojson}},
+        "clients": {{ search_clients | default(1)}}
+      }
+    ]
+  }
diff --git a/treccovid_semantic_search/workload.json b/treccovid_semantic_search/workload.json
new file mode 100644
index 00000000..761d1d0e
--- /dev/null
+++ b/treccovid_semantic_search/workload.json
@@ -0,0 +1,32 @@
+{% import "benchmark.helpers" as benchmark with context %}
+
+{
+  "version": 2,
+  "description": "Trec-Covid is a collection of documents about COVID-19.",
+  "indices": [
+    {
+      "name": "treccovid",
+      "body": "index.json"
+    }
+  ],
+  "corpora": [
+    {
+      "name": "treccovid",
+      "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
+      "documents": [
+        {
+          "source-file": "documents.json.bz2",
+          "document-count": 129192,
+          "compressed-bytes": 51187469,
+          "uncompressed-bytes": 211980208
+        }
+      ]
+    }
+  ],
+  "operations": [
+    {{ benchmark.collect(parts="operations/*.json") }}
+  ],
+  "test_procedures": [
+    {{ benchmark.collect(parts="test_procedures/*.json") }}
+  ]
+}
diff --git a/treccovid_semantic_search/workload.py b/treccovid_semantic_search/workload.py
new file mode 100644
index 00000000..1eaa0436
--- /dev/null
+++ b/treccovid_semantic_search/workload.py
@@ -0,0 +1,75 @@
+import json
+import os
+import random
+from pathlib import Path
+
+from osbenchmark.workload.loader import Downloader
+from osbenchmark.workload.loader import Decompressor
+
+script_dir = os.path.dirname(os.path.realpath(__file__))
+
+def ingest_pipeline_param_source(workload, params, **kwargs):
+    # model_id.json is written to the working directory when the model is registered.
+    model_id = params['body']['processors'][0]['text_embedding']['model_id']
+    if not model_id:
+        with open('model_id.json') as f:
+            d = json.loads(f.read())
+        model_id = d['model_id']
+        params['body']['processors'][0]['text_embedding']['model_id'] = model_id
+    return params
+
+class QueryParamSource:
+    def __init__(self, workload, params, **kwargs):
+        if len(workload.indices) == 1:
+            index = workload.indices[0].name
+            if len(workload.indices[0].types) == 1:
+                type = workload.indices[0].types[0].name
+            else:
+                type = None
+        else:
+            index = "_all"
+            type = None
+
+        self._params = params
+        self._params['index'] = index
+        self._params['type'] = type
+        self._params['variable-queries'] = params.get("variable-queries", 0)
+        self.infinite = True
+
+        # For variable-query runs, download and decompress the query corpus once.
+        if self._params['variable-queries'] > 0:
+            with open(script_dir + os.sep + 'workload_queries.json', 'r') as f:
+                d = json.loads(f.read())
+            source_file = d['source-file']
+            base_url = d['base-url']
+            compressed_bytes = d['compressed-bytes']
+            uncompressed_bytes = d['uncompressed-bytes']
+            compressed_path = script_dir + os.sep + source_file
+            uncompressed_path = script_dir + os.sep + Path(source_file).stem
+            if not os.path.exists(compressed_path):
+                downloader = Downloader(False, False)
+                downloader.download(base_url, None, compressed_path, compressed_bytes)
+            if not os.path.exists(uncompressed_path):
+                decompressor = Decompressor()
+                decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes)
+
+    def partition(self, partition_index, total_partitions):
+        return self
+
+    def params(self):
+        params = self._params
+        with open('model_id.json', 'r') as f:
+            d = json.loads(f.read())
+        params['body']['query']['neural']['passage_embedding']['model_id'] = d['model_id']
+        count = self._params.get("variable-queries", 0)
+        if count > 0:
+            # Pick a random query from the decompressed queries file.
+            with open(script_dir + os.sep + 'queries.json', 'r') as f:
+                lines = f.read().splitlines()
+            line = random.choice(lines)
+            query_text = json.loads(line)['text']
+            params['body']['query']['neural']['passage_embedding']['query_text'] = query_text
+        return params
+
+def register(registry):
+    registry.register_param_source("semantic-search-source", QueryParamSource)
+    registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source)
diff --git a/treccovid_semantic_search/workload_queries.json b/treccovid_semantic_search/workload_queries.json
new file mode 100644
index 00000000..d445066d
--- /dev/null
+++ b/treccovid_semantic_search/workload_queries.json
@@ -0,0 +1,6 @@
+{
+  "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid",
+  "source-file": "queries.json.bz2",
+  "compressed-bytes": 4310,
+  "uncompressed-bytes": 16552
+}