From 99eb34727a1daf7889c18d808de244686fd8fa9b Mon Sep 17 00:00:00 2001 From: ssbushi <66321939+ssbushi@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:01:36 +0000 Subject: [PATCH] feat(evals): Support file based evals in "runNewEvaluation" #1579 --- genkit-tools/cli/src/commands/eval-flow.ts | 2 +- genkit-tools/common/src/eval/evaluate.ts | 52 +++++++++++++--------- genkit-tools/common/src/types/apis.ts | 5 ++- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/genkit-tools/cli/src/commands/eval-flow.ts b/genkit-tools/cli/src/commands/eval-flow.ts index f3f58765b..8225b58a8 100644 --- a/genkit-tools/cli/src/commands/eval-flow.ts +++ b/genkit-tools/cli/src/commands/eval-flow.ts @@ -130,7 +130,7 @@ export const evalFlow = new Command('eval:flow') const evalDataset = await runInference({ manager, actionRef, - evalFlowInput, + evalInferenceInput: evalFlowInput, auth: options.auth, }); diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index 5cdaa0faa..1055a4636 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -66,25 +66,38 @@ export async function runNewEvaluation( manager: RuntimeManager, request: RunNewEvaluationRequest ): Promise { - const { datasetId, actionRef, evaluators } = request; - const datasetStore = await getDatasetStore(); - logger.info(`Fetching dataset ${datasetId}...`); - const dataset = await datasetStore.getDataset(datasetId); - const datasetMetadatas = await datasetStore.listDatasets(); - const targetDatasetMetadata = datasetMetadatas.find( - (d) => d.datasetId === datasetId - ); - const datasetVersion = targetDatasetMetadata?.version; + const { dataSource, actionRef, evaluators } = request; + const { datasetId, data } = dataSource; + if (!datasetId && !data) { + throw new Error(`Either 'data' or 'datasetId' must be provided`); + } + + let evalInferenceInput: EvalInferenceInput; + let metadata = {}; + if (datasetId) { + const datasetStore = await getDatasetStore(); + logger.info(`Fetching dataset ${datasetId}...`); + const dataset = await datasetStore.getDataset(datasetId); + if (dataset.length === 0) { + throw new Error(`Dataset ${datasetId} is empty`); + } + evalInferenceInput = EvalInferenceInputSchema.parse(dataset); - if (dataset.length === 0) { - throw new Error(`Dataset ${datasetId} is empty`); + const datasetMetadatas = await datasetStore.listDatasets(); + const targetDatasetMetadata = datasetMetadatas.find( + (d) => d.datasetId === datasetId + ); + const datasetVersion = targetDatasetMetadata?.version; + metadata = { datasetId, datasetVersion }; + } else { + evalInferenceInput = data!; } logger.info('Running inference...'); const evalDataset = await runInference({ manager, actionRef, - evalFlowInput: EvalInferenceInputSchema.parse(dataset), + evalInferenceInput, auth: request.options?.auth, actionConfig: request.options?.actionConfig, }); @@ -98,9 +111,8 @@ export async function runNewEvaluation( evaluatorActions, evalDataset, augments: { + ...metadata, actionRef, - datasetId, - datasetVersion, actionConfig: request.options?.actionConfig, }, }); @@ -111,11 +123,11 @@ export async function runNewEvaluation( export async function runInference(params: { manager: RuntimeManager; actionRef: string; - evalFlowInput: EvalInferenceInput; + evalInferenceInput: EvalInferenceInput; auth?: string; actionConfig?: any; }): Promise { - const { manager, actionRef, evalFlowInput, auth, actionConfig } = params; + const { manager, actionRef, evalInferenceInput, auth, actionConfig } = params; if (!isSupportedActionRef(actionRef)) { throw new Error('Inference is only supported on flows and models'); } @@ -123,7 +135,7 @@ export async function runInference(params: { const evalDataset: EvalInput[] = await bulkRunAction({ manager, actionRef, - evalFlowInput, + evalInferenceInput, auth, actionConfig, }); @@ -210,13 +222,13 @@ export async function getMatchingEvaluatorActions( async function bulkRunAction(params: { manager: RuntimeManager; actionRef: string; - evalFlowInput: EvalInferenceInput; + evalInferenceInput: EvalInferenceInput; auth?: string; actionConfig?: any; }): Promise { - const { manager, actionRef, evalFlowInput, auth, actionConfig } = params; + const { manager, actionRef, evalInferenceInput, auth, actionConfig } = params; const isModelAction = actionRef.startsWith('/model'); - let testCases: TestCase[] = evalFlowInput.map((c) => ({ + let testCases: TestCase[] = evalInferenceInput.map((c) => ({ input: c.input, reference: c.reference, testCaseId: c.testCaseId ?? generateTestCaseId(), diff --git a/genkit-tools/common/src/types/apis.ts b/genkit-tools/common/src/types/apis.ts index bd300cc73..430296929 100644 --- a/genkit-tools/common/src/types/apis.ts +++ b/genkit-tools/common/src/types/apis.ts @@ -131,7 +131,10 @@ export const UpdateDatasetRequestSchema = z.object({ export type UpdateDatasetRequest = z.infer; export const RunNewEvaluationRequestSchema = z.object({ - datasetId: z.string(), + dataSource: z.object({ + datasetId: z.string().optional(), + data: EvalInferenceInputSchema.optional(), + }), actionRef: z.string(), evaluators: z.array(z.string()).optional(), options: z