diff --git a/evals/benchmark/stresscli/locust/codegenbench.py b/evals/benchmark/stresscli/locust/codegenbench.py
index bdf1fefa..3a34b57c 100644
--- a/evals/benchmark/stresscli/locust/codegenbench.py
+++ b/evals/benchmark/stresscli/locust/codegenbench.py
@@ -20,7 +20,7 @@
 
 
 def getUrl():
-    return "/v1/chatqna"
+    return "/v1/codegen"
 
 
 def getReqData():
diff --git a/evals/evaluation/bigcode_evaluation_harness/accuracy.py b/evals/evaluation/bigcode_evaluation_harness/accuracy.py
index 31132726..49bc5d8e 100644
--- a/evals/evaluation/bigcode_evaluation_harness/accuracy.py
+++ b/evals/evaluation/bigcode_evaluation_harness/accuracy.py
@@ -22,6 +22,8 @@
 from bigcode_eval.tasks import ALL_TASKS
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
 
+from evals.evaluation.bigcode_evaluation_harness.api_evaluator import APIEvaluator
+
 
 def pattern_match(patterns, source_list):
     """Returns a list containing all values of the source_list that
@@ -68,6 +70,13 @@
         evaluator = Evaluator(accelerator, None, None, args)
         for task in task_names:
             results[task] = evaluator.evaluate(task)
+    elif args.codegen_url:
+        # here we generate code using an OPEA codegen API
+        if accelerator.is_main_process:
+            print("OPEA codegen API generation mode")
+        evaluator = APIEvaluator(accelerator, args.model, None, args)
+        for task in task_names:
+            results[task] = evaluator.evaluate(task)
     else:
         # here we generate code and save it (evaluation is optional but True by default)
         dict_precisions = {
diff --git a/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
new file mode 100644
index 00000000..e6078764
--- /dev/null
+++ b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
@@ -0,0 +1,106 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import inspect
+import json
+import warnings
+
+import aiohttp
+from bigcode_eval import tasks
+from bigcode_eval.evaluator import Evaluator
+
+
+class APIEvaluator(Evaluator):
+    def generate_text(self, task_name, intermediate_generations=None):
+        task = tasks.get_task(task_name, self.args)
+        dataset = task.get_dataset()
+        # if args.limit is None, use all samples
+        # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
+        n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
+        print(n_tasks)
+        # when args.limit is None
+        # adjust n_tasks by args.limit_start to prevent out of bounds issues
+        if not self.args.limit:
+            n_tasks -= self.args.limit_start
+        references = [
+            task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start + n_tasks)
+        ]
+
+        if self.args.check_references:
+            if "get_solution" in inspect.signature(task.get_reference).parameters:
+                solutions = [
+                    [task.get_reference(dataset[i], get_solution=True)]
+                    for i in range(self.args.limit_start, self.args.limit_start + n_tasks)
+                ]
+            else:
+                solutions = [[ref] for ref in references]
+            return solutions, references
+
+        if intermediate_generations:
+            curr_generations = [gen for gen in intermediate_generations if gen]
+            n_tasks -= len(curr_generations)
+
+        generations = parallel_generations_by_api(
+            task,
+            dataset,
+            self.accelerator,
+            n_tasks=n_tasks,
+            args=self.args,
+        )
+
+        if len(generations[0]) > self.args.n_samples:
+            generations = [l[: self.args.n_samples] for l in generations]
+            warnings.warn(
+                f"Number of tasks wasn't proportional to number of devices, we removed extra predictions to only keep nsamples={self.args.n_samples}"
+            )
+        return generations, references
+
+
+def parallel_generations_by_api(
+    task,
+    dataset,
+    accelerator,
+    n_tasks,
+    args,
+):
+    if args.load_generations_path:
+        # load generated code
+        with open(args.load_generations_path) as fp:
+            generations = json.load(fp)
+            if accelerator.is_main_process:
+                print(
+                    f"generations loaded, {n_tasks} selected from {len(generations)} with {len(generations[0])} candidates"
+                )
+        return generations[:n_tasks]
+
+    if codegen_url := args.codegen_url:
+        assert "/codegen" in codegen_url, "Only OPEA codegen compatible APIs are supported"
+        import asyncio
+        import os
+
+        import requests
+        from tqdm.asyncio import tqdm
+
+        async def get_res(prompt):
+            headers = {"Content-Type": "application/json"}
+            data = {
+                "messages": prompt,
+                "max_tokens": 2048,
+                "stream": False,
+                "temperature": args.temperature,
+                "top_p": args.top_p,
+                "top_k": args.top_k,
+            }
+            async with aiohttp.ClientSession() as session:
+                async with session.post(codegen_url, json=data, headers=headers, timeout=600) as response:
+                    text = await response.text()
+            return text
+
+        prompts = [task.get_prompt(doc) for doc in dataset]
+        awaitables = [get_res(prompt=prompt) for prompt in prompts]
+        responses = asyncio.run(tqdm.gather(*awaitables))
+        generations = []
+        for i, (prompt, response) in enumerate(zip(prompts, responses)):
+            texts = [prompt + choice["message"]["content"] for choice in json.loads(response)["choices"]]
+            generations.append([task.postprocess_generation(text, i) for text in texts])
+        return generations
diff --git a/evals/evaluation/bigcode_evaluation_harness/arguments.py b/evals/evaluation/bigcode_evaluation_harness/arguments.py
index cec695b8..8683873a 100644
--- a/evals/evaluation/bigcode_evaluation_harness/arguments.py
+++ b/evals/evaluation/bigcode_evaluation_harness/arguments.py
@@ -204,6 +204,11 @@ def setup_parser():
         action="store_true",
         help="Don't run generation but benchmark groundtruth (useful for debugging)",
     )
+    parser.add_argument(
+        "--codegen_url",
+        default=None,
+        help="Base URL of the OPEA CodeGen API.",
+    )
     return parser.parse_args()
 
 
@@ -253,6 +258,7 @@ def __init__(
         check_references=False,
         user_model=None,  # used for pass model object
         tokenizer=None,  # used for pass tokenizer object
+        codegen_url=None,
     ):
         self.prefix = prefix
         self.do_sample = do_sample
@@ -295,3 +301,4 @@ def __init__(
         self.check_references = check_references
         self.user_model = user_model
         self.tokenizer = tokenizer
+        self.codegen_url = codegen_url
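
As a quick sanity check of the request/response contract the new APIEvaluator relies on, the payload built in get_res() can be exercised against a running CodeGen service directly. The sketch below only restates what this patch sends and parses; the endpoint address, port, and sample prompt are assumptions for a local deployment and are not part of the change.

    import asyncio
    import json

    import aiohttp

    # Assumed local OPEA CodeGen endpoint; adjust host/port to your deployment.
    CODEGEN_URL = "http://localhost:7778/v1/codegen"


    async def main():
        # Same fields the patched get_res() sends for each task prompt.
        payload = {
            "messages": "def fibonacci(n):",
            "max_tokens": 128,
            "stream": False,
            "temperature": 0.2,
            "top_p": 0.95,
            "top_k": 40,
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(
                CODEGEN_URL, json=payload, timeout=aiohttp.ClientTimeout(total=600)
            ) as response:
                body = json.loads(await response.text())
        # The evaluator prepends the prompt to each returned completion before post-processing.
        for choice in body["choices"]:
            print(choice["message"]["content"])


    if __name__ == "__main__":
        asyncio.run(main())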