Support sharegpt dataset in chatqna e2e test (#152)
* Support sharegpt dataset in chatqna e2e test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Change the log level for selected questions

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
joshuayao and pre-commit-ci[bot] authored Oct 10, 2024
1 parent 946c439 commit 028bf63
Showing 6 changed files with 77 additions and 7 deletions.
7 changes: 7 additions & 0 deletions evals/benchmark/README.md
@@ -72,6 +72,7 @@ test_suite_config:
arrival-rate: 1.0 # Request arrival rate
warm_ups: 0 # Number of test requests for warm-ups
run_time: 60m # Total runtime for the test suite
seed: # The seed for all RNGs
user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests
query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
random_prompt: false # Use random prompts if true, fixed prompts if false
@@ -112,4 +113,10 @@ test_cases:
e2e:
run_test: true
service_name: "chatqna-backend-server-svc"
service_list: # Replace with your k8s service names if deployed with k8s,
# or with container names if deployed with Docker; used for metrics collection
# when collect_service_metric is true
- "chatqna-backend-server-svc"
dataset: # Used only when random_prompt=true; leave blank for the default dataset (WebQuestions) or set to "sharegpt"
```
If you'd like to use the sharegpt dataset, download it by following the [guide](https://github.com/lm-sys/FastChat/issues/90#issuecomment-1493250773), merge all downloaded data files into a single file named `sharegpt.json`, and place it in `evals/benchmark/stresscli/dataset`.
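A minimal merge sketch in Python (the part filenames below are placeholders for your downloads; it assumes each downloaded file is a JSON array of conversation records, which is the shape `chatqnabench.py` reads):

```python
# Hypothetical merge helper: concatenates the downloaded ShareGPT parts into
# the single sharegpt.json file expected under stresscli/dataset.
# The part filenames are placeholders -- replace them with your downloads.
import json

parts = ["sg_90k_part1.json", "sg_90k_part2.json"]

merged = []
for part in parts:
    with open(part) as f:
        merged.extend(json.load(f))  # each part is assumed to be a JSON array

with open("evals/benchmark/stresscli/dataset/sharegpt.json", "w") as f:
    json.dump(merged, f)

print(f"Wrote {len(merged)} records to sharegpt.json")
```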
4 changes: 4 additions & 0 deletions evals/benchmark/benchmark.py
@@ -54,6 +54,7 @@ def extract_test_case_data(content):
"service_port": test_suite_config.get("service_port"),
"load_shape": test_suite_config.get("load_shape"),
"query_timeout": test_suite_config.get("query_timeout", 120),
"seed": test_suite_config.get("seed", None),
"all_case_data": {
example: content["test_cases"].get(example, {}) for example in test_suite_config.get("examples", [])
},
@@ -93,6 +94,8 @@ def create_run_yaml_content(service, base_url, bench_target, test_phase, num_que
"bench-target": bench_target,
"service-metric-collect": test_params["collect_service_metric"],
"service-list": service.get("service_list", []),
"dataset": service.get("dataset", "default"),
"seed": test_params.get("seed", None),
"llm-model": test_params["llm_model"],
"deployment-type": test_params["deployment_type"],
"load-shape": test_params["load_shape"],
@@ -307,6 +310,7 @@ def run_benchmark(report=False):
"load_shape": parsed_data["load_shape"],
"query_timeout": parsed_data["query_timeout"],
"warm_ups": parsed_data["warm_ups"],
"seed": parsed_data["seed"],
}
check_test_suite_config(test_suite_config)

12 changes: 10 additions & 2 deletions evals/benchmark/benchmark.yaml
@@ -8,6 +8,7 @@ test_suite_config: # Overall configuration settings for the test suite
service_port: None # Leave as None for k8s, specify for Docker
warm_ups: 0 # Number of test requests for warm-up
run_time: 60m # The max total run time for the test suite
seed: # The seed for all RNGs
user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level
query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
random_prompt: false # Use random prompts if true, fixed prompts if false
@@ -67,8 +68,15 @@ test_cases:
service_list: # Replace with your k8s service names if deployed with k8s,
# or with container names if deployed with Docker; used for metrics collection
# when collect_service_metric is true
- "chatqna-tei"
- "chatqna-teirerank"
- "chatqna-backend-server-svc"
- "chatqna-nginx-svc"
- "dataprep-svc"
- "embedding-dependency-svc"
- "llm-dependency-svc"
- "reranking-dependency-svc"
- "retriever-svc"
- "vector-db"
dataset: # Used only when random_prompt=true; leave blank for the default dataset (WebQuestions) or set to "sharegpt"

codegen:
llm:
18 changes: 16 additions & 2 deletions evals/benchmark/stresscli/commands/load_test.py
@@ -32,6 +32,8 @@
"max-request": 100,
"namespace": "default",
"load-shape": {"name": DEFAULT_LOADSHAPE},
"dataset": "default",
"seed": "none",
}

console_logger = logging.getLogger("opea.eval")
@@ -130,7 +132,10 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
"deployment-type", global_settings.get("deployment-type", locust_defaults["deployment-type"])
)
runspec["namespace"] = run_settings.get("namespace", global_settings.get("namespace", locust_defaults["namespace"]))

runspec["dataset"] = run_settings.get("dataset", global_settings.get("dataset", locust_defaults["dataset"]))
runspec["dataset"] = locust_defaults["dataset"] if runspec["dataset"] is None else runspec["dataset"]
runspec["seed"] = run_settings.get("seed", global_settings.get("seed", locust_defaults["seed"]))
runspec["seed"] = locust_defaults["seed"] if runspec["seed"] is None else runspec["seed"]
runspec["run_name"] = run_settings["name"]

# Specify load shape to adjust user distribution
@@ -193,7 +198,12 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
processes = 10 if concurrent_level > 400 else 5 if concurrent_level > 200 else processes
elif load_shape == "poisson":
if load_shape_params and "arrival-rate" in load_shape_params:
processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 10))
processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 5))
else:
if load_shape_params and "arrival-rate" in load_shape_params:
processes = max(2, math.ceil(int(load_shape_params["arrival-rate"]) / 5))
elif runspec["max_requests"] > 0:
processes = 10 if runspec["max_requests"] > 2000 else 5 if runspec["max_requests"] > 1000 else processes

cmd = [
"locust",
@@ -205,6 +215,10 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
runspec["runtime"],
"--load-shape",
runspec["load-shape"],
"--dataset",
runspec["dataset"],
"--seed",
str(runspec["seed"]),
"--processes",
str(processes),
"--users",
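For reference, a standalone sketch of the resolution order the new `dataset`/`seed` keys follow above: per-run settings override global settings, which override the built-in defaults, and a key that is present but empty (`None`) in the YAML still falls back to the default. The `resolve` helper name is illustrative only, not part of stresscli.

```python
# Illustrative fallback helper mirroring the lookups in run_locust_test();
# "resolve" is a made-up name for this sketch.
locust_defaults = {"dataset": "default", "seed": "none"}

def resolve(key, run_settings, global_settings):
    value = run_settings.get(key, global_settings.get(key, locust_defaults[key]))
    # A key that is present but set to None (left blank in YAML) also falls back.
    return locust_defaults[key] if value is None else value

print(resolve("dataset", {}, {"dataset": None}))        # -> "default"
print(resolve("seed", {"seed": 42}, {}))                # -> 42
print(resolve("dataset", {}, {"dataset": "sharegpt"}))  # -> "sharegpt"
```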
17 changes: 17 additions & 0 deletions evals/benchmark/stresscli/locust/aistress.py
@@ -50,6 +50,20 @@ def _(parser):
default="constant",
help="load shape to adjust conccurency at runtime",
)
parser.add_argument(
"--dataset",
type=str,
env_var="OPEA_EVAL_DATASET",
default="default",
help="dataset",
)
parser.add_argument(
"--seed",
type=str,
env_var="OPEA_EVAL_SEED",
default="none",
help="The seed for all RNGs",
)


reqlist = []
@@ -188,11 +202,14 @@ def on_test_start(environment, **kwargs):
console_logger.info(f"Http timeout : {environment.parsed_options.http_timeout}\n")
console_logger.info(f"Benchmark target : {environment.parsed_options.bench_target}\n")
console_logger.info(f"Load shape : {environment.parsed_options.load_shape}")
console_logger.info(f"Dataset : {environment.parsed_options.dataset}")


@events.init.add_listener
def on_locust_init(environment, **_kwargs):
global bench_package
os.environ["OPEA_EVAL_DATASET"] = environment.parsed_options.dataset
os.environ["OPEA_EVAL_SEED"] = environment.parsed_options.seed
try:
bench_package = __import__(environment.parsed_options.bench_target)
except ImportError:
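The `--dataset`/`--seed` options are forwarded to the bench module through environment variables because the module is imported dynamically and reads them at import time. A self-contained sketch of that handoff, where the `minibench` module written to a temp directory is a stand-in for `chatqnabench.py`:

```python
# Self-contained illustration of the env-var handoff: export the options,
# then import a module whose top-level code reads them (as chatqnabench.py does).
import os
import sys
import tempfile
import textwrap

os.environ["OPEA_EVAL_DATASET"] = "sharegpt"  # normally environment.parsed_options.dataset
os.environ["OPEA_EVAL_SEED"] = "42"           # normally environment.parsed_options.seed

with tempfile.TemporaryDirectory() as d:
    with open(os.path.join(d, "minibench.py"), "w") as f:
        f.write(textwrap.dedent("""\
            import os
            dataset = os.environ["OPEA_EVAL_DATASET"]  # read once, at import time
            seed = os.environ["OPEA_EVAL_SEED"]
        """))
    sys.path.insert(0, d)
    bench_package = __import__("minibench")
    print(bench_package.dataset, bench_package.seed)  # -> sharegpt 42
```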
26 changes: 23 additions & 3 deletions evals/benchmark/stresscli/locust/chatqnabench.py
@@ -9,7 +9,15 @@
import tokenresponse as token

cwd = os.path.dirname(__file__)
filename = f"{cwd}/../dataset/chatqna.json"
dataset = os.environ["OPEA_EVAL_DATASET"]
if dataset == "sharegpt":
filename = f"{cwd}/../dataset/sharegpt.json"
elif dataset == "default":
filename = f"{cwd}/../dataset/chatqna.json"
else:
logging.error(f"Dataset not found: dataset/{dataset}.json.")
exit()

qlist = []
try:
with open(filename) as qfile:
@@ -18,6 +26,10 @@
logging.error(f"Question File open failed: {filename}")
exit()

seed = os.environ["OPEA_EVAL_SEED"]
if seed and seed != "none":
random.seed(seed)


def getUrl():
return "/v1/chatqna"
@@ -26,9 +38,17 @@ def getUrl():
def getReqData():
qlen = len(qlist)
qid = random.randint(0, qlen - 1)
logging.debug(f"Selected question: {qlist[qid]['qText']}")

return {"messages": qlist[qid]["qText"], "max_tokens": 128}
if dataset == "sharegpt":
msg = qlist[qid]["conversations"][0]["value"]
elif dataset == "default":
msg = qlist[qid]["qText"]
else:
msg = qlist[qid]["qText"]

logging.debug(f"Selected question: {msg}")

return {"messages": msg, "max_tokens": 128}


def respStatics(environment, reqData, respData):
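For context, a sketch of the record shapes `getReqData()` expects and the effect of seeding the selection. The sample entries are hypothetical, not taken from the real datasets: ShareGPT-style records carry the prompt in `conversations[0]["value"]`, while the default `chatqna.json` records carry it in `qText`.

```python
# Hypothetical sample records matching the access patterns in getReqData().
import random

sharegpt_like = [
    {"conversations": [{"from": "human", "value": "What is OPEA?"}]},
    {"conversations": [{"from": "human", "value": "Summarize this quarterly report."}]},
]
default_like = [
    {"qText": "What is the total revenue of Nike in 2023?"},
    {"qText": "Who won the FIFA World Cup in 2018?"},
]

random.seed("42")  # the same string seed passed through OPEA_EVAL_SEED
qid = random.randint(0, len(sharegpt_like) - 1)  # reproducible across runs
print(sharegpt_like[qid]["conversations"][0]["value"])
print(default_like[qid]["qText"])
```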
