Support sharegpt dataset in chatqna e2e test (#152)
* Support sharegpt dataset in chatqna e2e test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Change the log level for selected questions

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
joshuayao and pre-commit-ci[bot] authored Oct 10, 2024
1 parent 946c439 commit 028bf63
Showing 6 changed files with 77 additions and 7 deletions.
7 changes: 7 additions & 0 deletions evals/benchmark/README.md
@@ -72,6 +72,7 @@ test_suite_config:
arrival-rate: 1.0 # Request arrival rate
warm_ups: 0 # Number of test requests for warm-ups
run_time: 60m # Total runtime for the test suite
seed: # The seed for all RNGs
user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests
query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
random_prompt: false # Use random prompts if true, fixed prompts if false
@@ -112,4 +113,10 @@ test_cases:
e2e:
run_test: true
service_name: "chatqna-backend-server-svc"
service_list: # Replace with your k8s service names if deployed with k8s,
# or with container names if deployed with Docker; used for metrics collection
# when collect_service_metric is true
- "chatqna-backend-server-svc"
dataset: # Used only when random_prompt=true; leave blank for the default dataset (WebQuestions) or set to "sharegpt"
```
If you'd like to use the sharegpt dataset, download it by following the [guide](https://github.com/lm-sys/FastChat/issues/90#issuecomment-1493250773), merge all downloaded data files into a single file named `sharegpt.json`, and place it in `evals/benchmark/stresscli/dataset`.
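A minimal merge sketch in Python (the part filenames below are placeholders for your downloads; it assumes each downloaded file is a JSON array of conversation records, which is the shape `chatqnabench.py` reads):

```python
# Hypothetical merge helper: concatenates the downloaded ShareGPT parts into
# the single sharegpt.json file expected under stresscli/dataset.
# The part filenames are placeholders -- replace them with your downloads.
import json

parts = ["sg_90k_part1.json", "sg_90k_part2.json"]

merged = []
for part in parts:
    with open(part) as f:
        merged.extend(json.load(f))  # each part is assumed to be a JSON array

with open("evals/benchmark/stresscli/dataset/sharegpt.json", "w") as f:
    json.dump(merged, f)

print(f"Wrote {len(merged)} records to sharegpt.json")
```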
4 changes: 4 additions & 0 deletions evals/benchmark/benchmark.py
@@ -54,6 +54,7 @@ def extract_test_case_data(content):
"service_port": test_suite_config.get("service_port"),
"load_shape": test_suite_config.get("load_shape"),
"query_timeout": test_suite_config.get("query_timeout", 120),
"seed": test_suite_config.get("seed", None),
"all_case_data": {
example: content["test_cases"].get(example, {}) for example in test_suite_config.get("examples", [])
},
@@ -93,6 +94,8 @@ def create_run_yaml_content(service, base_url, bench_target, test_phase, num_que
"bench-target": bench_target,
"service-metric-collect": test_params["collect_service_metric"],
"service-list": service.get("service_list", []),
"dataset": service.get("dataset", "default"),
"seed": test_params.get("seed", None),
"llm-model": test_params["llm_model"],
"deployment-type": test_params["deployment_type"],
"load-shape": test_params["load_shape"],
@@ -307,6 +310,7 @@ def run_benchmark(report=False):
"load_shape": parsed_data["load_shape"],
"query_timeout": parsed_data["query_timeout"],
"warm_ups": parsed_data["warm_ups"],
"seed": parsed_data["seed"],
}
check_test_suite_config(test_suite_config)

12 changes: 10 additions & 2 deletions evals/benchmark/benchmark.yaml
@@ -8,6 +8,7 @@ test_suite_config: # Overall configuration settings for the test suite
service_port: None # Leave as None for k8s, specify for Docker
warm_ups: 0 # Number of test requests for warm-up
run_time: 60m # The max total run time for the test suite
seed: # The seed for all RNGs
user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level
query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
random_prompt: false # Use random prompts if true, fixed prompts if false
@@ -67,8 +68,15 @@ test_cases:
service_list: # Replace with your k8s service names if deployed with k8s,
# or with container names if deployed with Docker; used for metrics collection
# when collect_service_metric is true
- "chatqna-tei"
- "chatqna-teirerank"
- "chatqna-backend-server-svc"
- "chatqna-nginx-svc"
- "dataprep-svc"
- "embedding-dependency-svc"
- "llm-dependency-svc"
- "reranking-dependency-svc"
- "retriever-svc"
- "vector-db"
dataset: # Used only when random_prompt=true; leave blank for the default dataset (WebQuestions) or set to "sharegpt"

codegen:
llm:
18 changes: 16 additions & 2 deletions evals/benchmark/stresscli/commands/load_test.py
@@ -32,6 +32,8 @@
"max-request": 100,
"namespace": "default",
"load-shape": {"name": DEFAULT_LOADSHAPE},
"dataset": "default",
"seed": "none",
}

console_logger = logging.getLogger("opea.eval")
@@ -130,7 +132,10 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
"deployment-type", global_settings.get("deployment-type", locust_defaults["deployment-type"])
)
runspec["namespace"] = run_settings.get("namespace", global_settings.get("namespace", locust_defaults["namespace"]))

runspec["dataset"] = run_settings.get("dataset", global_settings.get("dataset", locust_defaults["dataset"]))
runspec["dataset"] = locust_defaults["dataset"] if runspec["dataset"] is None else runspec["dataset"]
runspec["seed"] = run_settings.get("seed", global_settings.get("seed", locust_defaults["seed"]))
runspec["seed"] = locust_defaults["seed"] if runspec["seed"] is None else runspec["seed"]
runspec["run_name"] = run_settings["name"]

# Specify load shape to adjust user distribution
@@ -193,7 +198,12 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
processes = 10 if concurrent_level > 400 else 5 if concurrent_level > 200 else processes
elif load_shape == "poisson":
if load_shape_params and "arrival-rate" in load_shape_params:
processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 10))
processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 5))
else:
if load_shape_params and "arrival-rate" in load_shape_params:
processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 5))
elif runspec["max_requests"] > 0:
processes = 10 if runspec["max_requests"] > 2000 else 5 if runspec["max_requests"] > 1000 else processes

cmd = [
"locust",
@@ -205,6 +215,10 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
runspec["runtime"],
"--load-shape",
runspec["load-shape"],
"--dataset",
runspec["dataset"],
"--seed",
str(runspec["seed"]),
"--processes",
str(processes),
"--users",
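For reference, a standalone sketch of the resolution order the new `dataset`/`seed` keys follow above: per-run settings override global settings, which override the built-in defaults, and a key that is present but empty (`None`) in the YAML still falls back to the default. The `resolve` helper name is illustrative only, not part of stresscli.

```python
# Illustrative fallback helper mirroring the lookups in run_locust_test();
# "resolve" is a made-up name for this sketch.
locust_defaults = {"dataset": "default", "seed": "none"}

def resolve(key, run_settings, global_settings):
    value = run_settings.get(key, global_settings.get(key, locust_defaults[key]))
    # A key that is present but set to None (left blank in YAML) also falls back.
    return locust_defaults[key] if value is None else value

print(resolve("dataset", {}, {"dataset": None}))        # -> "default"
print(resolve("seed", {"seed": 42}, {}))                # -> 42
print(resolve("dataset", {}, {"dataset": "sharegpt"}))  # -> "sharegpt"
```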
17 changes: 17 additions & 0 deletions evals/benchmark/stresscli/locust/aistress.py
@@ -50,6 +50,20 @@ def _(parser):
default="constant",
help="load shape to adjust conccurency at runtime",
)
parser.add_argument(
"--dataset",
type=str,
env_var="OPEA_EVAL_DATASET",
default="default",
help="dataset",
)
parser.add_argument(
"--seed",
type=str,
env_var="OPEA_EVAL_SEED",
default="none",
help="The seed for all RNGs",
)


reqlist = []
@@ -188,11 +202,14 @@ def on_test_start(environment, **kwargs):
console_logger.info(f"Http timeout : {environment.parsed_options.http_timeout}\n")
console_logger.info(f"Benchmark target : {environment.parsed_options.bench_target}\n")
console_logger.info(f"Load shape : {environment.parsed_options.load_shape}")
console_logger.info(f"Dataset : {environment.parsed_options.dataset}")


@events.init.add_listener
def on_locust_init(environment, **_kwargs):
global bench_package
os.environ["OPEA_EVAL_DATASET"] = environment.parsed_options.dataset
os.environ["OPEA_EVAL_SEED"] = environment.parsed_options.seed
try:
bench_package = __import__(environment.parsed_options.bench_target)
except ImportError:
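The `--dataset`/`--seed` options are forwarded to the bench module through environment variables because the module is imported dynamically and reads them at import time. A self-contained sketch of that handoff, where the `minibench` module written to a temp directory is a stand-in for `chatqnabench.py`:

```python
# Self-contained illustration of the env-var handoff: export the options,
# then import a module whose top-level code reads them (as chatqnabench.py does).
import os
import sys
import tempfile
import textwrap

os.environ["OPEA_EVAL_DATASET"] = "sharegpt"  # normally environment.parsed_options.dataset
os.environ["OPEA_EVAL_SEED"] = "42"           # normally environment.parsed_options.seed

with tempfile.TemporaryDirectory() as d:
    with open(os.path.join(d, "minibench.py"), "w") as f:
        f.write(textwrap.dedent("""\
            import os
            dataset = os.environ["OPEA_EVAL_DATASET"]  # read once, at import time
            seed = os.environ["OPEA_EVAL_SEED"]
        """))
    sys.path.insert(0, d)
    bench_package = __import__("minibench")
    print(bench_package.dataset, bench_package.seed)  # -> sharegpt 42
```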
26 changes: 23 additions & 3 deletions evals/benchmark/stresscli/locust/chatqnabench.py
@@ -9,7 +9,15 @@
import tokenresponse as token

cwd = os.path.dirname(__file__)
filename = f"{cwd}/../dataset/chatqna.json"
dataset = os.environ["OPEA_EVAL_DATASET"]
if dataset == "sharegpt":
filename = f"{cwd}/../dataset/sharegpt.json"
elif dataset == "default":
filename = f"{cwd}/../dataset/chatqna.json"
else:
logging.error(f"Dataset not found: dataset/{dataset}.json.")
exit()

qlist = []
try:
with open(filename) as qfile:
@@ -18,6 +26,10 @@
logging.error(f"Question File open failed: {filename}")
exit()

seed = os.environ["OPEA_EVAL_SEED"]
if seed and seed != "none":
random.seed(seed)


def getUrl():
return "/v1/chatqna"
@@ -26,9 +38,17 @@ def getUrl():
def getReqData():
qlen = len(qlist)
qid = random.randint(0, qlen - 1)
logging.debug(f"Selected question: {qlist[qid]['qText']}")

return {"messages": qlist[qid]["qText"], "max_tokens": 128}
if dataset == "sharegpt":
msg = qlist[qid]["conversations"][0]["value"]
elif dataset == "default":
msg = qlist[qid]["qText"]
else:
msg = qlist[qid]["qText"]

logging.debug(f"Selected question: {msg}")

return {"messages": msg, "max_tokens": 128}


def respStatics(environment, reqData, respData):
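For context, a sketch of the record shapes `getReqData()` expects and the effect of seeding the selection. The sample entries are hypothetical, not taken from the real datasets: ShareGPT-style records carry the prompt in `conversations[0]["value"]`, while the default `chatqna.json` records carry it in `qText`.

```python
# Hypothetical sample records matching the access patterns in getReqData().
import random

sharegpt_like = [
    {"conversations": [{"from": "human", "value": "What is OPEA?"}]},
    {"conversations": [{"from": "human", "value": "Summarize this quarterly report."}]},
]
default_like = [
    {"qText": "What is the total revenue of Nike in 2023?"},
    {"qText": "Who won the FIFA World Cup in 2018?"},
]

random.seed("42")  # the same string seed passed through OPEA_EVAL_SEED
qid = random.randint(0, len(sharegpt_like) - 1)  # reproducible across runs
print(sharegpt_like[qid]["conversations"][0]["value"])
print(default_like[qid]["qText"])
```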
