feat(general learning LM) | make the general learning lm (#26)
* Update vscode settings.json

* Add details to JSONEvaluationHelper result output

* Add GenericLMFunctionOptimizer for general learning LM

* Add tests for GenericLMFunctionOptimizer

* Bump version to 0.0.10
ammirsm authored Jul 28, 2024
1 parent f58c626 commit 13ee053
Showing 5 changed files with 272 additions and 2 deletions.
9 changes: 8 additions & 1 deletion .vscode/settings.json
@@ -11,5 +11,12 @@
         "py/.pytest_cache/": true
     },
     "python.defaultInterpreterPath": "py/.venv/bin/python",
-    "python.analysis.extraPaths": ["./py/src"]
+    "python.analysis.extraPaths": [
+        "./py/src"
+    ],
+    "python.testing.pytestArgs": [
+        "py"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
 }
2 changes: 1 addition & 1 deletion py/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "zenbase"
-version = "0.0.8"
+version = "0.0.10"
 description = "LLMs made Zen"
 authors = [{ name = "Cyrus Nouroozi", email = "[email protected]" }]
 dependencies = [
1 change: 1 addition & 0 deletions py/src/zenbase/adaptors/json/evaluation_helper.py
@@ -89,6 +89,7 @@ def run_and_evaluate(demo: LMDemo):
                     passed=result["passed"],
                     response=response,
                     demo=demo,
+                    details=result,
                 )
             )

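The new `details` field threads the evaluator's full return dict onto each individual evaluation, so callers can inspect custom metrics beyond `passed`. A minimal sketch of reading it, where `candidate_result` stands for a `CandidateEvalResult` produced by a run of the optimizer introduced in the next file; the fragment is illustrative and not part of the commit, and the attribute access mirrors the new test file:

    # candidate_result: a CandidateEvalResult from an evaluator run (assumed, see the optimizer below)
    for individual in candidate_result.individual_evals:
        # `details` is the raw dict returned by the eval function,
        # e.g. {"passed": 1, "length": 5} with the custom evaluator used in the tests
        print(individual.passed, individual.details)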
110 changes: 110 additions & 0 deletions py/src/zenbase/predefined/generic_lm_function/optimizer.py
@@ -0,0 +1,110 @@
from dataclasses import dataclass, field
from typing import Any, Callable, List, NamedTuple, Type

from instructor.client import Instructor
from pydantic import BaseModel

from zenbase.adaptors.json.adaptor import JSONAdaptor
from zenbase.core.managers import ZenbaseTracer
from zenbase.optim.metric.labeled_few_shot import LabeledFewShot
from zenbase.optim.metric.types import CandidateEvalResult
from zenbase.types import LMDemo, LMFunction


@dataclass
class GenericLMFunctionOptimizer:
    class Result(NamedTuple):
        best_function: LMFunction
        candidate_results: list[CandidateEvalResult]
        best_candidate_result: CandidateEvalResult | None

    instructor_client: Instructor
    prompt: str
    input_model: Type[BaseModel]
    output_model: Type[BaseModel]
    model: str
    zenbase_tracer: ZenbaseTracer
    training_set: List[dict]
    validation_set: List[dict]
    test_set: List[dict]
    custom_evaluator: Callable[[Any, dict], dict] = field(default=None)
    shots: int = 5
    samples: int = 10

    lm_function: LMFunction = field(init=False)
    training_set_demos: List[LMDemo] = field(init=False)
    validation_set_demos: List[LMDemo] = field(init=False)
    test_set_demos: List[LMDemo] = field(init=False)
    best_evaluation: CandidateEvalResult | None = field(default=None)
    base_evaluation: CandidateEvalResult | None = field(default=None)

    def __post_init__(self):
        self.lm_function = self._generate_lm_function()
        self.training_set_demos = self._convert_dataset_to_demos(self.training_set)
        self.validation_set_demos = self._convert_dataset_to_demos(self.validation_set)
        self.test_set_demos = self._convert_dataset_to_demos(self.test_set)

    def _generate_lm_function(self) -> LMFunction:
        @self.zenbase_tracer.trace_function
        def generic_function(request):
            messages = [
                {"role": "system", "content": self.prompt},
                {"role": "user", "content": str(request.inputs)},
            ]
            return self.instructor_client.chat.completions.create(
                model=self.model, response_model=self.output_model, messages=messages
            )

        return generic_function

    def _convert_dataset_to_demos(self, dataset: List[dict]) -> List[LMDemo]:
        return [LMDemo(inputs=item["inputs"], outputs=item["outputs"]) for item in dataset]

    def optimize(self) -> Result:
        evaluator = self.custom_evaluator or self._create_default_evaluator()
        test_evaluator = self._create_test_evaluator(evaluator)

        # Perform base evaluation
        self.base_evaluation = self._perform_base_evaluation(test_evaluator)

        optimizer = LabeledFewShot(demoset=self.training_set_demos, shots=self.shots)
        optimizer_result = optimizer.perform(
            self.lm_function,
            evaluator=JSONAdaptor.metric_evaluator(
                data=self.validation_set_demos,
                eval_function=evaluator,
            ),
            samples=self.samples,
            rounds=1,
        )

        # Evaluate best function
        self.best_evaluation = self._evaluate_best_function(test_evaluator, optimizer_result)

        return self.Result(
            best_function=optimizer_result.best_function,
            candidate_results=optimizer_result.candidate_results,
            best_candidate_result=optimizer_result.best_candidate_result,
        )

    def _create_default_evaluator(self):
        def evaluator(output: BaseModel, ideal_output: dict) -> dict:
            return {
                "passed": int(output.dict() == ideal_output),
            }

        return evaluator

    def _create_test_evaluator(self, evaluator):
        return JSONAdaptor.metric_evaluator(
            data=self.test_set_demos,
            eval_function=evaluator,
        )

    def _perform_base_evaluation(self, test_evaluator):
        """Perform the base evaluation of the LM function."""
        return test_evaluator(self.lm_function)

    def _evaluate_best_function(self, test_evaluator, optimizer_result):
        """Evaluate the best function from the optimization result."""
        return test_evaluator(optimizer_result.best_function)
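For orientation, a minimal usage sketch of the new optimizer. It is not part of the commit: the `Question`/`Answer` schemas, prompt, and one-item dataset are illustrative, and it assumes an OpenAI API key is available to the instructor client; the test file below exercises the same API more thoroughly.

    import instructor
    from openai import OpenAI
    from pydantic import BaseModel

    from zenbase.core.managers import ZenbaseTracer
    from zenbase.predefined.generic_lm_function.optimizer import GenericLMFunctionOptimizer


    class Question(BaseModel):  # illustrative input schema
        question: str


    class Answer(BaseModel):  # illustrative output schema
        answer: str


    # Toy dataset in the {"inputs": ..., "outputs": ...} shape the optimizer expects
    data = [{"inputs": {"question": "What is 2+2?"}, "outputs": {"answer": "4"}}]

    optimizer = GenericLMFunctionOptimizer(
        instructor_client=instructor.from_openai(OpenAI()),
        prompt="You are a helpful assistant. Answer the user's question concisely.",
        input_model=Question,
        output_model=Answer,
        model="gpt-3.5-turbo",
        zenbase_tracer=ZenbaseTracer(),
        training_set=data,
        validation_set=data,
        test_set=data,
        shots=1,
    )

    # optimize() runs a base evaluation on the test set, then LabeledFewShot over the training demos
    result = optimizer.optimize()
    answer = result.best_function(Question(question="What is 3+3?"))  # returns an Answer instance

Note that the default evaluator only counts a run as passed when the structured output exactly equals the expected `outputs` dict; pass `custom_evaluator` for anything fuzzier, as the last test below does.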
152 changes: 152 additions & 0 deletions py/tests/predefined/test_generic_lm_function_optimizer.py
@@ -0,0 +1,152 @@
import instructor
import pytest
from instructor import Instructor
from openai import OpenAI
from pydantic import BaseModel

from zenbase.core.managers import ZenbaseTracer
from zenbase.predefined.generic_lm_function.optimizer import GenericLMFunctionOptimizer


class InputModel(BaseModel):
    question: str


class OutputModel(BaseModel):
    answer: str


@pytest.fixture(scope="module")
def openai_client() -> OpenAI:
    return OpenAI()


@pytest.fixture(scope="module")
def instructor_client(openai_client: OpenAI) -> Instructor:
    return instructor.from_openai(openai_client)


@pytest.fixture(scope="module")
def zenbase_tracer() -> ZenbaseTracer:
    return ZenbaseTracer()


@pytest.fixture
def generic_optimizer(instructor_client, zenbase_tracer):
    training_set = [
        {"inputs": {"question": "What is the capital of France?"}, "outputs": {"answer": "Paris"}},
        {"inputs": {"question": "Who wrote Romeo and Juliet?"}, "outputs": {"answer": "William Shakespeare"}},
        {"inputs": {"question": "What is the largest planet in our solar system?"}, "outputs": {"answer": "Jupiter"}},
        {"inputs": {"question": "Who painted the Mona Lisa?"}, "outputs": {"answer": "Leonardo da Vinci"}},
        {"inputs": {"question": "What is the chemical symbol for gold?"}, "outputs": {"answer": "Au"}},
    ]

    return GenericLMFunctionOptimizer(
        instructor_client=instructor_client,
        prompt="You are a helpful assistant. Answer the user's question concisely.",
        input_model=InputModel,
        output_model=OutputModel,
        model="gpt-3.5-turbo",
        zenbase_tracer=zenbase_tracer,
        training_set=training_set,
        validation_set=[
            {"inputs": {"question": "What is the capital of Italy?"}, "outputs": {"answer": "Rome"}},
            {"inputs": {"question": "What is the capital of France?"}, "outputs": {"answer": "Paris"}},
            {"inputs": {"question": "What is the capital of Germany?"}, "outputs": {"answer": "Berlin"}},
        ],
        test_set=[
            {"inputs": {"question": "Who invented the telephone?"}, "outputs": {"answer": "Alexander Graham Bell"}},
            {"inputs": {"question": "Who is CEO of microsoft?"}, "outputs": {"answer": "Bill Gates"}},
            {"inputs": {"question": "Who is founder of Facebook?"}, "outputs": {"answer": "Mark Zuckerberg"}},
        ],
        shots=len(training_set),  # Set shots to the number of training examples
    )


@pytest.mark.helpers
def test_generic_optimizer_optimize(generic_optimizer):
    result = generic_optimizer.optimize()
    assert result is not None
    assert isinstance(result, GenericLMFunctionOptimizer.Result)
    assert result.best_function is not None
    assert callable(result.best_function)
    assert isinstance(result.candidate_results, list)
    assert result.best_candidate_result is not None

    # Check base evaluation
    assert generic_optimizer.base_evaluation is not None

    # Check best evaluation
    assert generic_optimizer.best_evaluation is not None

    # Test the best function
    test_input = InputModel(question="What is the capital of Italy?")
    output = result.best_function(test_input)
    assert isinstance(output, OutputModel)
    assert isinstance(output.answer, str)
    assert output.answer.strip().lower() == "rome"


@pytest.mark.helpers
def test_generic_optimizer_evaluations(generic_optimizer):
    result = generic_optimizer.optimize()

    # Check that base and best evaluations exist
    assert generic_optimizer.base_evaluation is not None
    assert generic_optimizer.best_evaluation is not None

    # Check that the best evaluation is at least as good as the base evaluation
    assert generic_optimizer.best_evaluation.evals["score"] >= generic_optimizer.base_evaluation.evals["score"]

    # Additional checks to ensure the structure of the result
    assert isinstance(result, GenericLMFunctionOptimizer.Result)
    assert result.best_function is not None
    assert isinstance(result.candidate_results, list)
    assert result.best_candidate_result is not None

    # Check that the best candidate result matches the best evaluation
    assert result.best_candidate_result == generic_optimizer.best_evaluation


@pytest.mark.helpers
def test_generic_optimizer_custom_evaluator(instructor_client, zenbase_tracer):
    def custom_evaluator(output: OutputModel, ideal_output: dict) -> dict:
        return {"passed": int(output.answer.lower() == ideal_output["answer"].lower()), "length": len(output.answer)}

    training_set = [
        {"inputs": {"question": "What is 2+2?"}, "outputs": {"answer": "4"}},
        {"inputs": {"question": "What is the capital of France?"}, "outputs": {"answer": "Paris"}},
        {"inputs": {"question": "Who wrote Romeo and Juliet?"}, "outputs": {"answer": "William Shakespeare"}},
        {"inputs": {"question": "What is the largest planet in our solar system?"}, "outputs": {"answer": "Jupiter"}},
        {"inputs": {"question": "Who painted the Mona Lisa?"}, "outputs": {"answer": "Leonardo da Vinci"}},
    ]

    optimizer = GenericLMFunctionOptimizer(
        instructor_client=instructor_client,
        prompt="You are a helpful assistant. Answer the user's question concisely.",
        input_model=InputModel,
        output_model=OutputModel,
        model="gpt-3.5-turbo",
        zenbase_tracer=zenbase_tracer,
        training_set=training_set,
        validation_set=[{"inputs": {"question": "What is 3+3?"}, "outputs": {"answer": "6"}}],
        test_set=[{"inputs": {"question": "What is 4+4?"}, "outputs": {"answer": "8"}}],
        custom_evaluator=custom_evaluator,
        shots=len(training_set),  # Set shots to the number of training examples
    )

    result = optimizer.optimize()
    assert result is not None
    assert isinstance(result, GenericLMFunctionOptimizer.Result)
    assert "length" in optimizer.best_evaluation.individual_evals[0].details

    # Test the custom evaluator
    test_input = InputModel(question="What is 5+5?")
    output = result.best_function(test_input)
    assert isinstance(output, OutputModel)
    assert isinstance(output.answer, str)

    # Manually apply the custom evaluator
    eval_result = custom_evaluator(output, {"answer": "10"})
    assert "passed" in eval_result
    assert "length" in eval_result
