feat(general learning LM) | make the general learning lm (#26)
* Update vscode settings.json
* Add details to JSONEvaluationHelper result output
* Add GenericLMFunctionOptimizer for general learning LM
* Add tests for GenericLMFunctionOptimizer
* Bump version to 0.0.10
Showing 5 changed files with 272 additions and 2 deletions.
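For orientation, here is a minimal sketch of how the new GenericLMFunctionOptimizer is meant to be wired up, mirroring the tests added in this commit (assumes an OpenAI API key in the environment; the prompt, model name, and question/answer data are illustrative, and a real run would use disjoint train/validation/test splits):

import instructor
from openai import OpenAI
from pydantic import BaseModel

from zenbase.core.managers import ZenbaseTracer
from zenbase.predefined.generic_lm_function.optimizer import GenericLMFunctionOptimizer


class Question(BaseModel):
    question: str


class Answer(BaseModel):
    answer: str


# Illustrative dataset; each item uses the "inputs"/"outputs" dict shape the optimizer expects.
data = [
    {"inputs": {"question": "What is the capital of France?"}, "outputs": {"answer": "Paris"}},
    {"inputs": {"question": "What is the capital of Germany?"}, "outputs": {"answer": "Berlin"}},
    {"inputs": {"question": "What is the capital of Italy?"}, "outputs": {"answer": "Rome"}},
]

optimizer = GenericLMFunctionOptimizer(
    instructor_client=instructor.from_openai(OpenAI()),
    prompt="You are a helpful assistant. Answer the user's question concisely.",
    input_model=Question,
    output_model=Answer,
    model="gpt-3.5-turbo",
    zenbase_tracer=ZenbaseTracer(),
    training_set=data,
    validation_set=data,  # illustrative only; use separate splits in practice
    test_set=data,
)

result = optimizer.optimize()
answer = result.best_function(Question(question="What is the capital of Spain?"))
print(answer.answer)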
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "zenbase"
-version = "0.0.8"
+version = "0.0.10"
 description = "LLMs made Zen"
 authors = [{ name = "Cyrus Nouroozi", email = "[email protected]" }]
 dependencies = [
py/src/zenbase/predefined/generic_lm_function/optimizer.py (110 additions, 0 deletions)
from dataclasses import dataclass, field
from typing import Any, Callable, List, NamedTuple, Type

from instructor.client import Instructor
from pydantic import BaseModel

from zenbase.adaptors.json.adaptor import JSONAdaptor
from zenbase.core.managers import ZenbaseTracer
from zenbase.optim.metric.labeled_few_shot import LabeledFewShot
from zenbase.optim.metric.types import CandidateEvalResult
from zenbase.types import LMDemo, LMFunction


@dataclass
class GenericLMFunctionOptimizer:
    class Result(NamedTuple):
        best_function: LMFunction
        candidate_results: list[CandidateEvalResult]
        best_candidate_result: CandidateEvalResult | None

    instructor_client: Instructor
    prompt: str
    input_model: Type[BaseModel]
    output_model: Type[BaseModel]
    model: str
    zenbase_tracer: ZenbaseTracer
    training_set: List[dict]
    validation_set: List[dict]
    test_set: List[dict]
    custom_evaluator: Callable[[Any, dict], dict] = field(default=None)
    shots: int = 5
    samples: int = 10

    lm_function: LMFunction = field(init=False)
    training_set_demos: List[LMDemo] = field(init=False)
    validation_set_demos: List[LMDemo] = field(init=False)
    test_set_demos: List[LMDemo] = field(init=False)
    best_evaluation: CandidateEvalResult | None = field(default=None)
    base_evaluation: CandidateEvalResult | None = field(default=None)

    def __post_init__(self):
        self.lm_function = self._generate_lm_function()
        self.training_set_demos = self._convert_dataset_to_demos(self.training_set)
        self.validation_set_demos = self._convert_dataset_to_demos(self.validation_set)
        self.test_set_demos = self._convert_dataset_to_demos(self.test_set)

    def _generate_lm_function(self) -> LMFunction:
        @self.zenbase_tracer.trace_function
        def generic_function(request):
            messages = [
                {"role": "system", "content": self.prompt},
                {"role": "user", "content": str(request.inputs)},
            ]
            return self.instructor_client.chat.completions.create(
                model=self.model, response_model=self.output_model, messages=messages
            )

        return generic_function

    def _convert_dataset_to_demos(self, dataset: List[dict]) -> List[LMDemo]:
        return [LMDemo(inputs=item["inputs"], outputs=item["outputs"]) for item in dataset]

    def optimize(self) -> Result:
        evaluator = self.custom_evaluator or self._create_default_evaluator()
        test_evaluator = self._create_test_evaluator(evaluator)

        # Perform base evaluation
        self.base_evaluation = self._perform_base_evaluation(test_evaluator)

        optimizer = LabeledFewShot(demoset=self.training_set_demos, shots=self.shots)
        optimizer_result = optimizer.perform(
            self.lm_function,
            evaluator=JSONAdaptor.metric_evaluator(
                data=self.validation_set_demos,
                eval_function=evaluator,
            ),
            samples=self.samples,
            rounds=1,
        )

        # Evaluate best function
        self.best_evaluation = self._evaluate_best_function(test_evaluator, optimizer_result)

        return self.Result(
            best_function=optimizer_result.best_function,
            candidate_results=optimizer_result.candidate_results,
            best_candidate_result=optimizer_result.best_candidate_result,
        )

    def _create_default_evaluator(self):
        def evaluator(output: BaseModel, ideal_output: dict) -> dict:
            return {
                "passed": int(output.dict() == ideal_output),
            }

        return evaluator

    def _create_test_evaluator(self, evaluator):
        return JSONAdaptor.metric_evaluator(
            data=self.test_set_demos,
            eval_function=evaluator,
        )

    def _perform_base_evaluation(self, test_evaluator):
        """Perform the base evaluation of the LM function."""
        return test_evaluator(self.lm_function)

    def _evaluate_best_function(self, test_evaluator, optimizer_result):
        """Evaluate the best function from the optimization result."""
        return test_evaluator(optimizer_result.best_function)
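Worth noting about the evaluator contract: the default evaluator above only counts a prediction as passing when the structured output matches the ideal output dict exactly. The tests below suggest that an evaluator is any callable taking (output, ideal_output) and returning a dict of metrics that includes a "passed" flag, with any extra keys surfaced in the per-example evaluation details. A hypothetical, more lenient evaluator could therefore look like this (sketch only; the case and whitespace normalization is an assumption, not part of the library):

def lenient_evaluator(output: BaseModel, ideal_output: dict) -> dict:
    # Hypothetical: compare the "answer" field case-insensitively instead of requiring
    # an exact dict match, and report the strict exact-match result as an extra metric.
    predicted = str(output.dict().get("answer", "")).strip().lower()
    expected = str(ideal_output.get("answer", "")).strip().lower()
    return {
        "passed": int(predicted == expected),
        "exact_match": int(output.dict() == ideal_output),
    }

Such a function would be supplied through the custom_evaluator field, as exercised by test_generic_optimizer_custom_evaluator in the test file below.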
py/tests/predefined/test_generic_lm_function_optimizer.py (152 additions, 0 deletions)
import instructor
import pytest
from instructor import Instructor
from openai import OpenAI
from pydantic import BaseModel

from zenbase.core.managers import ZenbaseTracer
from zenbase.predefined.generic_lm_function.optimizer import GenericLMFunctionOptimizer


class InputModel(BaseModel):
    question: str


class OutputModel(BaseModel):
    answer: str


@pytest.fixture(scope="module")
def openai_client() -> OpenAI:
    return OpenAI()


@pytest.fixture(scope="module")
def instructor_client(openai_client: OpenAI) -> Instructor:
    return instructor.from_openai(openai_client)


@pytest.fixture(scope="module")
def zenbase_tracer() -> ZenbaseTracer:
    return ZenbaseTracer()


@pytest.fixture
def generic_optimizer(instructor_client, zenbase_tracer):
    training_set = [
        {"inputs": {"question": "What is the capital of France?"}, "outputs": {"answer": "Paris"}},
        {"inputs": {"question": "Who wrote Romeo and Juliet?"}, "outputs": {"answer": "William Shakespeare"}},
        {"inputs": {"question": "What is the largest planet in our solar system?"}, "outputs": {"answer": "Jupiter"}},
        {"inputs": {"question": "Who painted the Mona Lisa?"}, "outputs": {"answer": "Leonardo da Vinci"}},
        {"inputs": {"question": "What is the chemical symbol for gold?"}, "outputs": {"answer": "Au"}},
    ]

    return GenericLMFunctionOptimizer(
        instructor_client=instructor_client,
        prompt="You are a helpful assistant. Answer the user's question concisely.",
        input_model=InputModel,
        output_model=OutputModel,
        model="gpt-3.5-turbo",
        zenbase_tracer=zenbase_tracer,
        training_set=training_set,
        validation_set=[
            {"inputs": {"question": "What is the capital of Italy?"}, "outputs": {"answer": "Rome"}},
            {"inputs": {"question": "What is the capital of France?"}, "outputs": {"answer": "Paris"}},
            {"inputs": {"question": "What is the capital of Germany?"}, "outputs": {"answer": "Berlin"}},
        ],
        test_set=[
            {"inputs": {"question": "Who invented the telephone?"}, "outputs": {"answer": "Alexander Graham Bell"}},
{"inputs": {"question": "Who is CEO of microsoft?"}, "outputs": {"answer": "Bill Gates"}}, | ||
{"inputs": {"question": "Who is founder of Facebook?"}, "outputs": {"answer": "Mark Zuckerberg"}}, | ||
        ],
        shots=len(training_set),  # Set shots to the number of training examples
    )


@pytest.mark.helpers
def test_generic_optimizer_optimize(generic_optimizer):
    result = generic_optimizer.optimize()
    assert result is not None
    assert isinstance(result, GenericLMFunctionOptimizer.Result)
    assert result.best_function is not None
    assert callable(result.best_function)
    assert isinstance(result.candidate_results, list)
    assert result.best_candidate_result is not None

    # Check base evaluation
    assert generic_optimizer.base_evaluation is not None

    # Check best evaluation
    assert generic_optimizer.best_evaluation is not None

    # Test the best function
    test_input = InputModel(question="What is the capital of Italy?")
    output = result.best_function(test_input)
    assert isinstance(output, OutputModel)
    assert isinstance(output.answer, str)
    assert output.answer.strip().lower() == "rome"


@pytest.mark.helpers
def test_generic_optimizer_evaluations(generic_optimizer):
    result = generic_optimizer.optimize()

    # Check that base and best evaluations exist
    assert generic_optimizer.base_evaluation is not None
    assert generic_optimizer.best_evaluation is not None

    # Check that best evaluation is at least as good as base evaluation
    assert generic_optimizer.best_evaluation.evals["score"] >= generic_optimizer.base_evaluation.evals["score"]

    # Additional checks to ensure the structure of the result
    assert isinstance(result, GenericLMFunctionOptimizer.Result)
    assert result.best_function is not None
    assert isinstance(result.candidate_results, list)
    assert result.best_candidate_result is not None

    # Check that the best candidate result matches the best evaluation
    assert result.best_candidate_result == generic_optimizer.best_evaluation


@pytest.mark.helpers
def test_generic_optimizer_custom_evaluator(instructor_client, zenbase_tracer):
    def custom_evaluator(output: OutputModel, ideal_output: dict) -> dict:
        return {"passed": int(output.answer.lower() == ideal_output["answer"].lower()), "length": len(output.answer)}

    training_set = [
        {"inputs": {"question": "What is 2+2?"}, "outputs": {"answer": "4"}},
        {"inputs": {"question": "What is the capital of France?"}, "outputs": {"answer": "Paris"}},
        {"inputs": {"question": "Who wrote Romeo and Juliet?"}, "outputs": {"answer": "William Shakespeare"}},
        {"inputs": {"question": "What is the largest planet in our solar system?"}, "outputs": {"answer": "Jupiter"}},
        {"inputs": {"question": "Who painted the Mona Lisa?"}, "outputs": {"answer": "Leonardo da Vinci"}},
    ]

    optimizer = GenericLMFunctionOptimizer(
        instructor_client=instructor_client,
        prompt="You are a helpful assistant. Answer the user's question concisely.",
        input_model=InputModel,
        output_model=OutputModel,
        model="gpt-3.5-turbo",
        zenbase_tracer=zenbase_tracer,
        training_set=training_set,
        validation_set=[{"inputs": {"question": "What is 3+3?"}, "outputs": {"answer": "6"}}],
        test_set=[{"inputs": {"question": "What is 4+4?"}, "outputs": {"answer": "8"}}],
        custom_evaluator=custom_evaluator,
        shots=len(training_set),  # Set shots to the number of training examples
    )

    result = optimizer.optimize()
    assert result is not None
    assert isinstance(result, GenericLMFunctionOptimizer.Result)
    assert "length" in optimizer.best_evaluation.individual_evals[0].details

    # Test the custom evaluator
    test_input = InputModel(question="What is 5+5?")
    output = result.best_function(test_input)
    assert isinstance(output, OutputModel)
    assert isinstance(output.answer, str)

    # Manually apply the custom evaluator
    eval_result = custom_evaluator(output, {"answer": "10"})
    assert "passed" in eval_result
    assert "length" in eval_result