Support bigcode eval for codegen v0.1 (#94)
* Support bigcode eval for codegen v0.1

Signed-off-by: Yao, Qing <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix for bigcode eval UT

Signed-off-by: Yao, Qing <[email protected]>

---------

Signed-off-by: Yao, Qing <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
yao531441 and pre-commit-ci[bot] authored Sep 5, 2024
1 parent 4df6438 commit 02b60b5
Showing 4 changed files with 123 additions and 1 deletion.
2 changes: 1 addition & 1 deletion evals/benchmark/stresscli/locust/codegenbench.py
@@ -20,7 +20,7 @@


 def getUrl():
-    return "/v1/chatqna"
+    return "/v1/codegen"


 def getReqData():
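The benchmark now targets the CodeGen route instead of ChatQnA. A quick way to sanity-check the new route is a one-off request mirroring the payload that api_evaluator.py (added below in this commit) sends; the host, port, and prompt here are illustrative placeholders, not values from this commit:

import requests

# Hypothetical service address; substitute your deployed OPEA CodeGen endpoint.
url = "http://localhost:7778/v1/codegen"
payload = {
    "messages": "def fibonacci(n):",  # illustrative prompt
    "max_tokens": 128,
    "stream": False,
}
resp = requests.post(url, json=payload, timeout=600)
resp.raise_for_status()
# Assumes the OpenAI-style response shape that api_evaluator.py parses below.
print(resp.json()["choices"][0]["message"]["content"])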
9 changes: 9 additions & 0 deletions evals/evaluation/bigcode_evaluation_harness/accuracy.py
@@ -22,6 +22,8 @@
 from bigcode_eval.tasks import ALL_TASKS
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

+from evals.evaluation.bigcode_evaluation_harness.api_evaluator import APIEvaluator
+

 def pattern_match(patterns, source_list):
     """Returns a list containing all values of the source_list that
@@ -68,6 +70,13 @@ def evaluate(args):
         evaluator = Evaluator(accelerator, None, None, args)
         for task in task_names:
             results[task] = evaluator.evaluate(task)
+    elif args.codegen_url:
+        # here we generate code using an OPEA codegen API
+        if accelerator.is_main_process:
+            print("OPEA codegen API generation mode")
+        evaluator = APIEvaluator(accelerator, args.model, None, args)
+        for task in task_names:
+            results[task] = evaluator.evaluate(task)
     else:
         # here we generate code and save it (evaluation is optional but True by default)
         dict_precisions = {
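With this change, evaluate(args) dispatches between three modes: loading pre-generated code, querying an OPEA CodeGen API, or generating locally. A minimal sketch of exercising the new branch through the existing parser; the URL is a placeholder, and flag names other than --codegen_url are assumed to follow the upstream bigcode-evaluation-harness parser:

import sys

from evals.evaluation.bigcode_evaluation_harness.arguments import setup_parser

# Emulate a command line; setup_parser() reads sys.argv directly.
sys.argv = [
    "accuracy.py",
    "--tasks", "humaneval",
    "--codegen_url", "http://localhost:7778/v1/codegen",
]
args = setup_parser()
print(args.codegen_url)  # non-None, so evaluate(args) takes the APIEvaluator branch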
106 changes: 106 additions & 0 deletions evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
@@ -0,0 +1,106 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import inspect
import json
import warnings

import aiohttp
from bigcode_eval import tasks
from bigcode_eval.evaluator import Evaluator


class APIEvaluator(Evaluator):
    """Evaluator that delegates code generation to a remote OPEA codegen endpoint."""

    def generate_text(self, task_name, intermediate_generations=None):
        task = tasks.get_task(task_name, self.args)
        dataset = task.get_dataset()
        # if args.limit is None, use all samples
        # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
        n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
        print(n_tasks)
        # when args.limit is None
        # adjust n_tasks by args.limit_start to prevent out of bounds issues
        if not self.args.limit:
            n_tasks -= self.args.limit_start
        references = [
            task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start + n_tasks)
        ]

        if self.args.check_references:
            if "get_solution" in inspect.signature(task.get_reference).parameters:
                solutions = [
                    [task.get_reference(dataset[i], get_solution=True)]
                    for i in range(self.args.limit_start, self.args.limit_start + n_tasks)
                ]
            else:
                solutions = [[ref] for ref in references]
            return solutions, references

        if intermediate_generations:
            curr_generations = [gen for gen in intermediate_generations if gen]
            n_tasks -= len(curr_generations)

        # generate remotely through the OPEA codegen API instead of a local model
        generations = parallel_generations_by_api(
            task,
            dataset,
            self.accelerator,
            n_tasks=n_tasks,
            args=self.args,
        )

        if len(generations[0]) > self.args.n_samples:
            generations = [l[: self.args.n_samples] for l in generations]
            warnings.warn(
                f"Number of tasks wasn't proportional to number of devices, we removed extra predictions to only keep nsamples={self.args.n_samples}"
            )
        return generations, references


def parallel_generations_by_api(
    task,
    dataset,
    accelerator,
    n_tasks,
    args,
):
    if args.load_generations_path:
        # load generated code
        with open(args.load_generations_path) as fp:
            generations = json.load(fp)
        if accelerator.is_main_process:
            print(
                f"generations loaded, {n_tasks} selected from {len(generations)} with {len(generations[0])} candidates"
            )
        return generations[:n_tasks]

    if codegen_url := args.codegen_url:
        assert "/codegen" in codegen_url, "Only OPEA codegen compatible APIs are supported"
        import asyncio
        import os

        import requests
        from tqdm.asyncio import tqdm

        # POST a single prompt to the codegen endpoint and return the raw response body
        async def get_res(prompt):
            headers = {"Content-Type": "application/json"}
            data = {
                "messages": prompt,
                "max_tokens": 2048,
                "stream": False,
                "temperature": args.temperature,
                "top_p": args.top_p,
                "top_k": args.top_k,
            }
            async with aiohttp.ClientSession() as session:
                async with session.post(codegen_url, json=data, headers=headers, timeout=600) as response:
                    text = await response.text()
                    return text

        prompts = [task.get_prompt(doc) for doc in dataset]
        awaitables = [get_res(prompt=prompt) for prompt in prompts]
        responses = asyncio.run(tqdm.gather(*awaitables))
        generations = []
        for i, (prompt, response) in enumerate(zip(prompts, responses)):
            texts = [prompt + choice["message"]["content"] for choice in json.loads(response)["choices"]]
            generations.append([task.postprocess_generation(text, i) for text in texts])
        return generations
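parallel_generations_by_api assumes the endpoint returns an OpenAI-style chat-completion body and prepends the prompt to each completion before task postprocessing. A self-contained illustration of that parsing step; the response body here is fabricated for demonstration:

import json

# Fabricated response matching the shape the code above expects.
response = json.dumps({"choices": [{"message": {"content": "    return n * 2\n"}}]})

prompt = "def double(n):\n"
texts = [prompt + choice["message"]["content"] for choice in json.loads(response)["choices"]]
print(texts[0])
# def double(n):
#     return n * 2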
7 changes: 7 additions & 0 deletions evals/evaluation/bigcode_evaluation_harness/arguments.py
@@ -204,6 +204,11 @@ def setup_parser():
         action="store_true",
         help="Don't run generation but benchmark groundtruth (useful for debugging)",
     )
+    parser.add_argument(
+        "--codegen_url",
+        default=None,
+        help="Base URL of the OPEA CodeGen API.",
+    )
     return parser.parse_args()


@@ -253,6 +258,7 @@ def __init__(
         check_references=False,
         user_model=None,  # used to pass a model object
         tokenizer=None,  # used to pass a tokenizer object
+        codegen_url=None,
     ):
         self.prefix = prefix
         self.do_sample = do_sample
@@ -295,3 +301,4 @@ def __init__(
         self.check_references = check_references
         self.user_model = user_model
         self.tokenizer = tokenizer
+        self.codegen_url = codegen_url
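The new field defaults to None, so existing configurations are unaffected. A minimal sketch of setting it programmatically, assuming the __init__ shown above belongs to the harness's EvalArguments class (the diff shows only its constructor) and that all other parameters keep their defaults:

from evals.evaluation.bigcode_evaluation_harness.arguments import EvalArguments

# codegen_url is the only field set; None (the default) keeps local
# generation, while a URL routes evaluation to the OPEA CodeGen API.
args = EvalArguments(codegen_url="http://localhost:7778/v1/codegen")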
