diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/README.md b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/README.md index f8d10c679e622..5b16b7ba76abc 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/README.md +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/README.md @@ -2,7 +2,7 @@ ## Overview -This tool is developed for our team working on the Elastic Observability platform, specifically focusing on evaluating the Observability AI Assistant. It simplifies scripting and evaluating various scenarios with the Large Language Model (LLM) integration. +This tool is developed for our team working on the Elastic Observability platform, specifically focusing on evaluating the Observability AI Assistant. It simplifies scripting and evaluating various scenarios with Large Language Model (LLM) integrations. ## Setup requirements @@ -12,26 +12,40 @@ This tool is developed for our team working on the Elastic Observability platfor ## Running evaluations -Run the tool using: - -`$ node x-pack/solutions/observability/plugins/observability_solution/observability_ai_assistant_app/scripts/evaluation/index.js` - -This will evaluate all existing scenarios, and write the evaluation results to the terminal. - ### Configuration -#### Kibana and Elasticsearch - -By default, the tool will look for a Kibana instance running locally (at `http://localhost:5601`, which is the default address for running Kibana in development mode). It will also attempt to read the Kibana config file for the Elasticsearch address & credentials. If you want to override these settings, use `--kibana` and `--es`. Only basic auth is supported, e.g. `--kibana http://username:password@localhost:5601`. 
If you want to use a specific space, use `--spaceId` +#### To run the evaluation using a local Elasticsearch and Kibana instance: -#### Connector +- Run Elasticsearch locally: `yarn es snapshot --license trial` +- Start Kibana (Default address for Kibana in dev mode: `http://localhost:5601`) +- Run this command to start evaluating: +`$ node x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/index.js` -Use `--connectorId` to specify a `.gen-ai` or `.bedrock` connector to use. If none are given, it will prompt you to select a connector based on the ones that are available. If only a single supported connector is found, it will be used without prompting. - -#### Persisting conversations - -By default, completed conversations are not persisted. If you do want to persist them, for instance for reviewing purposes, set the `--persist` flag to store them. This will also generate a clickable link in the output of the evaluation that takes you to the conversation. - -If you want to clear conversations on startup, use the `--clear` flag. This only works when `--persist` is enabled. If `--spaceId` is set, only conversations for the current space will be cleared. +This will evaluate all existing scenarios, and write the evaluation results to the terminal. -When storing conversations, the name of the scenario is used as a title. Set the `--autoTitle` flag to have the LLM generate a title for you. 
+#### To run the evaluation using a hosted deployment: +- Add the credentials of Elasticsearch to `kibana.dev.yml` as follows: +``` +elasticsearch.hosts: https://<hosted-url>:<port> +elasticsearch.username: <username> +elasticsearch.password: <password> +elasticsearch.ssl.verificationMode: none +elasticsearch.ignoreVersionMismatch: true +``` +- Start Kibana +- Run this command to start evaluating: `node x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/index.js --kibana http://<username>:<password>@localhost:5601` + +By default, the script uses the Elasticsearch credentials specified in `kibana.dev.yml`. To override them, use the `--es` flag when running the evaluation script: +E.g.: `node x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/index.js --kibana http://<username>:<password>@localhost:5601 --es https://<username>:<password>@<hosted-url>:<port>` + +The `--kibana` and `--es` flags override the default credentials. Only basic auth is supported. + +## Other (optional) configuration flags +- `--connectorId` - Specify a generative AI connector to use. If none is given, it will prompt you to select a connector based on the ones that are available. If only a single supported connector is found, it will be used without prompting. +- `--evaluateWith`: The connector ID to evaluate with. Leave empty to use the same connector; use "other" to get a selection menu. +- `--spaceId` - Specify the space ID if you want to use a specific space. +- `--persist` - By default, completed conversations are not persisted. If you want to persist them, for instance for reviewing purposes, include this flag when running the evaluation script. This will also generate a clickable link in the output of the evaluation that takes you to the conversation in Kibana. +- `--clear` - If you want to clear conversations on startup, include this flag when running the evaluation script. This only works when `--persist` is enabled. 
If `--spaceId` is set, only conversations for the current space will be cleared +- `--autoTitle`: When storing conversations, the name of the scenario is used as a title. Set this flag to have the LLM generate a title for you. This only works when `--persist` is enabled. +- `--files`: A file or list of files containing the scenarios to evaluate. Defaults to all. +- `--grep`: A string or regex to filter scenarios by. diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/evaluation.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/evaluation.ts index a01b276c37bdf..9b20402952583 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/evaluation.ts +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/evaluation.ts @@ -37,6 +37,8 @@ function runEvaluations() { kibana: argv.kibana, }); + log.info(`Elasticsearch URL: ${serviceUrls.esUrl}`); + const kibanaClient = new KibanaClient(log, serviceUrls.kibanaUrl, argv.spaceId); const esClient = new Client({ node: serviceUrls.esUrl, @@ -100,7 +102,7 @@ function runEvaluations() { evaluationConnectorId: evaluationConnector.id!, persist: argv.persist, suite: mocha.suite, - scopes: ['all'], + scopes: ['observability'], }); const header: string[][] = [ diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/kibana_client.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/kibana_client.ts index 69f6715da2dbe..ef4d3988679fa 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/kibana_client.ts +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/kibana_client.ts @@ -26,7 +26,7 @@ import { Message, MessageRole } from '@kbn/observability-ai-assistant-plugin/com import { 
streamIntoObservable } from '@kbn/observability-ai-assistant-plugin/server'; import { ToolingLog } from '@kbn/tooling-log'; import axios, { AxiosInstance, AxiosResponse, isAxiosError } from 'axios'; -import { isArray, omit, pick, remove } from 'lodash'; +import { omit, pick, remove } from 'lodash'; import pRetry from 'p-retry'; import { concatMap, @@ -59,13 +59,14 @@ interface Options { screenContexts?: ObservabilityAIAssistantScreenContext[]; } -type CompleteFunction = ( - ...args: - | [StringOrMessageList] - | [StringOrMessageList, Options] - | [string | undefined, StringOrMessageList] - | [string | undefined, StringOrMessageList, Options] -) => Promise<{ +interface CompleteFunctionParams { + messages: StringOrMessageList; + conversationId?: string; + options?: Options; + scope?: AssistantScope; +} + +type CompleteFunction = (params: CompleteFunctionParams) => Promise<{ conversationId?: string; messages: InnerMessage[]; errors: ChatCompletionErrorEvent[]; @@ -74,7 +75,6 @@ type CompleteFunction = ( export interface ChatClient { chat: (message: StringOrMessageList) => Promise; complete: CompleteFunction; - evaluate: ( {}: { conversationId?: string; messages: InnerMessage[]; errors: ChatCompletionErrorEvent[] }, criteria: string[] @@ -124,10 +124,10 @@ export class KibanaClient { return this.axios({ method, url, - data: data || {}, + ...(method.toLowerCase() === 'delete' && !data ? 
{} : { data: data || {} }), headers: { 'kbn-xsrf': 'true', - 'x-elastic-internal-origin': 'foo', + 'x-elastic-internal-origin': 'Kibana', }, }).catch((error) => { if (isAxiosError(error)) { @@ -148,7 +148,7 @@ export class KibanaClient { } async installKnowledgeBase() { - this.log.debug('Checking to see whether knowledge base is installed'); + this.log.info('Checking whether the knowledge base is installed'); const { data: { ready }, @@ -157,7 +157,7 @@ export class KibanaClient { }); if (ready) { - this.log.info('Knowledge base is installed'); + this.log.success('Knowledge base is already installed'); return; } @@ -176,7 +176,7 @@ export class KibanaClient { { retries: 10 } ); - this.log.info('Knowledge base installed'); + this.log.success('Knowledge base installed'); } async createSpaceIfNeeded() { @@ -184,7 +184,7 @@ export class KibanaClient { return; } - this.log.debug(`Checking if space ${this.spaceId} exists`); + this.log.info(`Checking if space ${this.spaceId} exists`); const spaceExistsResponse = await this.callKibana<{ id?: string; @@ -204,7 +204,7 @@ export class KibanaClient { }); if (spaceExistsResponse.data.id) { - this.log.debug(`Space id ${this.spaceId} found`); + this.log.success(`Space id ${this.spaceId} found`); return; } @@ -223,7 +223,7 @@ export class KibanaClient { ); if (spaceCreatedResponse.status === 200) { - this.log.info(`Created space ${this.spaceId}`); + this.log.success(`Created space ${this.spaceId}`); } else { throw new Error( `Error creating space: ${spaceCreatedResponse.status} - ${spaceCreatedResponse.data}` @@ -231,6 +231,18 @@ export class KibanaClient { } } + getMessages(message: string | Array): Array { + if (typeof message === 'string') { + return [ + { + content: message, + role: MessageRole.User, + }, + ]; + } + return message; + } + createChatClient({ connectorId, evaluationConnectorId, @@ -244,22 +256,11 @@ export class KibanaClient { suite?: Mocha.Suite; scopes: AssistantScope[]; }): ChatClient { - function 
getMessages(message: string | Array): Array { - if (typeof message === 'string') { - return [ - { - content: message, - role: MessageRole.User, - }, - ]; - } - return message; - } - const that = this; let currentTitle: string = ''; let firstSuiteName: string = ''; + let currentScopes = scopes; if (suite) { suite.beforeEach(function () { @@ -362,7 +363,7 @@ export class KibanaClient { that.log.info('Chat', name); const chat$ = defer(() => { - that.log.debug(`Calling chat API`); + that.log.info('Calling the /chat API'); const params: ObservabilityAIAssistantAPIClientRequestParamsOf<'POST /internal/observability_ai_assistant/chat'>['params']['body'] = { name, @@ -370,7 +371,7 @@ export class KibanaClient { connectorId: connectorIdOverride || connectorId, functions: functions.map((fn) => pick(fn, 'name', 'description', 'parameters')), functionCall, - scopes, + scopes: currentScopes, }; return that.axios.post( @@ -378,7 +379,11 @@ export class KibanaClient { pathname: '/internal/observability_ai_assistant/chat', }), params, - { responseType: 'stream', timeout: NaN } + { + responseType: 'stream', + timeout: NaN, + headers: { 'x-elastic-internal-origin': 'Kibana' }, + } ); }).pipe( switchMap((response) => streamIntoObservable(response.data)), @@ -400,54 +405,33 @@ export class KibanaClient { return { chat: async (message) => { const messages = [ - ...getMessages(message).map((msg) => ({ + ...this.getMessages(message).map((msg) => ({ message: msg, '@timestamp': new Date().toISOString(), })), ]; return chat('chat', { messages, functions: [] }); }, - complete: async (...args) => { - that.log.info(`Complete`); - let messagesArg: StringOrMessageList | undefined; - let conversationId: string | undefined; - let options: Options = {}; - - function isMessageList(arg: any): arg is StringOrMessageList { - return isArray(arg) || typeof arg === 'string'; - } + complete: async ({ + messages: messagesArg, + conversationId, + options = {}, + scope: newScope, + }: CompleteFunctionParams) 
=> { + that.log.info('Calling complete'); - // | [StringOrMessageList] - // | [StringOrMessageList, Options] - // | [string, StringOrMessageList] - // | [string, StringOrMessageList, Options] - if (args.length === 1) { - messagesArg = args[0]; - } else if (args.length === 2 && !isMessageList(args[1])) { - messagesArg = args[0]; - options = args[1]; - } else if ( - args.length === 2 && - (typeof args[0] === 'string' || typeof args[0] === 'undefined') && - isMessageList(args[1]) - ) { - conversationId = args[0]; - messagesArg = args[1]; - } else if (args.length === 3) { - conversationId = args[0]; - messagesArg = args[1]; - options = args[2]; - } + // set scope + currentScopes = [newScope || 'observability']; const messages = [ - ...getMessages(messagesArg!).map((msg) => ({ + ...this.getMessages(messagesArg!).map((msg) => ({ message: msg, '@timestamp': new Date().toISOString(), })), ]; const stream$ = defer(() => { - that.log.debug(`Calling /chat/complete API`); + that.log.info(`Calling /chat/complete API`); return from( that.axios.post( that.getUrl({ @@ -460,9 +444,13 @@ export class KibanaClient { connectorId, persist, title: currentTitle, - scopes, + scopes: currentScopes, }, - { responseType: 'stream', timeout: NaN } + { + responseType: 'stream', + timeout: NaN, + headers: { 'x-elastic-internal-origin': 'Kibana' }, + } ) ); }).pipe( @@ -615,7 +603,7 @@ export class KibanaClient { }) .concat({ score: errors.length === 0 ? 1 : 0, - criterion: 'The conversation encountered errors', + criterion: 'The conversation did not encounter any errors', reasoning: errors.length ? 
`The following errors occurred: ${errors.map((error) => error.error.message)}` : 'No errors occurred', diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/alerts/index.spec.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/alerts/index.spec.ts index 8e466bbdbea4f..c56b65f2dff29 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/alerts/index.spec.ts +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/alerts/index.spec.ts @@ -18,7 +18,7 @@ import { customThresholdAIAssistantLogCount, } from '../../alert_templates/templates'; -describe('alert function', () => { +describe('Alert function', () => { const ruleIds: any[] = []; before(async () => { @@ -31,12 +31,19 @@ describe('alert function', () => { ruleIds.push(responseApmRule.data.id); logger.info('Creating dataview'); - - await kibanaClient.callKibana( - 'post', - { pathname: '/api/content_management/rpc/create' }, - customThresholdAIAssistantLogCount.dataViewParams - ); + try { + await kibanaClient.callKibana( + 'post', + { pathname: '/api/content_management/rpc/create' }, + customThresholdAIAssistantLogCount.dataViewParams + ); + } catch (error) { + if (error?.status === 409) { + logger.info('Data view already exists, skipping creation'); + } else { + throw error; + } + } logger.info('Creating logs rule'); const responseLogsRule = await kibanaClient.callKibana( @@ -47,13 +54,11 @@ describe('alert function', () => { ruleIds.push(responseLogsRule.data.id); logger.debug('Cleaning APM indices'); - await synthtraceEsClients.apmSynthtraceEsClient.clean(); const myServiceInstance = apm.service('my-service', 'production', 'go').instance('my-instance'); logger.debug('Indexing synthtrace data'); - await synthtraceEsClients.apmSynthtraceEsClient.index( timerange(moment().subtract(15, 'minutes'), 
moment()) .interval('1m') @@ -78,7 +83,6 @@ describe('alert function', () => { ); logger.debug('Triggering a rule run'); - await Promise.all( ruleIds.map((ruleId) => kibanaClient.callKibana('post', { @@ -95,9 +99,9 @@ describe('alert function', () => { }); it('summary of active alerts', async () => { - const conversation = await chatClient.complete( - 'Are there any active alerts over the last 4 hours?' - ); + const conversation = await chatClient.complete({ + messages: 'Are there any active alerts over the last 4 hours?', + }); const result = await chatClient.evaluate(conversation, [ 'Correctly uses the `alerts` function to fetch data for the current time range', @@ -109,17 +113,17 @@ describe('alert function', () => { }); it('filtered alerts', async () => { - let conversation = await chatClient.complete( - 'Do I have any active threshold alerts related to the AI Assistant?' - ); + let conversation = await chatClient.complete({ + messages: 'Do I have any active threshold alerts related to the AI Assistant?', + }); - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ content: 'Do I have any alerts on the service my-service?', role: MessageRole.User, - }) - ); + }), + }); const result = await chatClient.evaluate(conversation, [ 'Uses the get_alerts_dataset_info function', @@ -143,7 +147,7 @@ describe('alert function', () => { 'post', { pathname: `/api/content_management/rpc/delete` }, { - contentTypeId: 'index-pattern', + contentTypeId: customThresholdAIAssistantLogCount.dataViewParams.contentTypeId, id: customThresholdAIAssistantLogCount.dataViewParams.options.id, options: { force: true }, version: 1, diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/apm/index.spec.ts 
b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/apm/index.spec.ts index 361d436f8ecec..e2c295a229c2a 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/apm/index.spec.ts +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/apm/index.spec.ts @@ -15,8 +15,9 @@ import { MessageRole } from '@kbn/observability-ai-assistant-plugin/common'; import { chatClient, kibanaClient, synthtraceEsClients } from '../../services'; import { apmErrorCountAIAssistant } from '../../alert_templates/templates'; -describe('apm', () => { +describe('APM', () => { const ruleIds: any[] = []; + before(async () => { const responseApmRule = await kibanaClient.callKibana( 'post', @@ -92,9 +93,10 @@ describe('apm', () => { }); it('service throughput', async () => { - const conversation = await chatClient.complete( - 'What is the average throughput per minute for the ai-assistant-service service over the past 4 hours?' - ); + const conversation = await chatClient.complete({ + messages: + 'What is the average throughput per minute for the ai-assistant-service service over the past 4 hours?', + }); const result = await chatClient.evaluate(conversation, [ 'Uses the get_apm_dataset_info function to get information about the APM data streams', @@ -109,9 +111,9 @@ describe('apm', () => { }); it('service dependencies', async () => { - const conversation = await chatClient.complete( - 'What are the downstream dependencies of the ai-assistant-service-front service?' 
- ); + const conversation = await chatClient.complete({ + messages: 'What are the downstream dependencies of the ai-assistant-service-front service?', + }); const result = await chatClient.evaluate(conversation, [ 'Uses the get_apm_downstream_dependencies function with the `service.name` parameter being "ai-assistant-service-front"', @@ -122,34 +124,34 @@ describe('apm', () => { }); it('services in environment', async () => { - let conversation = await chatClient.complete( - 'What are the active services in the environment "test"?' - ); + let conversation = await chatClient.complete({ + messages: 'What are the active services in the environment "test"?', + }); - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ content: 'What is the average error rate per service over the past 4 hours?', role: MessageRole.User, - }) - ); + }), + }); - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ content: 'What are the top 2 most frequent errors in the services in the test environment in the last hour?', role: MessageRole.User, - }) - ); + }), + }); - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ content: 'Are there any alert for those services?', role: MessageRole.User, - }) - ); + }), + }); const result = await chatClient.evaluate(conversation, [ 'Responds with the active services in the environment "test"', diff --git 
a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/documentation/index.spec.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/documentation/index.spec.ts new file mode 100644 index 0000000000000..b91c66c02a742 --- /dev/null +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/documentation/index.spec.ts @@ -0,0 +1,109 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/// + +import expect from '@kbn/expect'; +import { + InstallationStatusResponse, + PerformInstallResponse, + UninstallResponse, +} from '@kbn/product-doc-base-plugin/common/http_api/installation'; +import { RETRIEVE_DOCUMENTATION_NAME } from '../../../../server/functions/documentation'; +import { chatClient, kibanaClient, logger } from '../../services'; + +const ELASTIC_DOCS_INSTALLATION_STATUS_API_PATH = '/internal/product_doc_base/status'; +const ELASTIC_DOCS_INSTALL_ALL_API_PATH = '/internal/product_doc_base/install'; +const ELASTIC_DOCS_UNINSTALL_ALL_API_PATH = '/internal/product_doc_base/uninstall'; + +describe('Retrieve documentation function', () => { + before(async () => { + let statusResponse = await kibanaClient.callKibana('get', { + pathname: ELASTIC_DOCS_INSTALLATION_STATUS_API_PATH, + }); + + if (statusResponse.data.overall === 'installed') { + logger.success('Elastic documentation is already installed'); + } else { + logger.info('Installing Elastic documentation'); + const installResponse = await kibanaClient.callKibana('post', { + pathname: ELASTIC_DOCS_INSTALL_ALL_API_PATH, + }); + + if (!installResponse.data.installed) { + logger.error('Could not install Elastic documentation'); + throw new Error('Documentation did not 
install successfully before running tests.'); + } + + statusResponse = await kibanaClient.callKibana('get', { + pathname: ELASTIC_DOCS_INSTALLATION_STATUS_API_PATH, + }); + + if (statusResponse.data.overall !== 'installed') { + throw new Error('Documentation is not fully installed, cannot proceed with tests.'); + } else { + logger.success('Installed Elastic documentation'); + } + } + }); + + it('retrieves Elasticsearch documentation', async () => { + const prompt = 'How can I configure HTTPS in Elasticsearch?'; + const conversation = await chatClient.complete({ messages: prompt }); + + const result = await chatClient.evaluate(conversation, [ + `Uses the ${RETRIEVE_DOCUMENTATION_NAME} function before answering the question about the Elastic stack`, + 'The assistant provides guidance on configuring HTTPS for Elasticsearch based on the retrieved documentation', + 'Does not hallucinate steps without first calling the retrieve_elastic_doc function', + 'Mentions Elasticsearch and HTTPS configuration steps consistent with the documentation', + ]); + + expect(result.passed).to.be(true); + }); + + it('retrieves Kibana documentation', async () => { + const prompt = 'What is Kibana Lens and how do I create a bar chart visualization with it?'; + const conversation = await chatClient.complete({ messages: prompt }); + + const result = await chatClient.evaluate(conversation, [ + `Uses the ${RETRIEVE_DOCUMENTATION_NAME} function before answering the question about Kibana`, + 'Accurately explains what Kibana Lens is and provides doc-based steps for creating a bar chart visualization', + `Does not invent unsupported instructions, answers should reference what's found in the Kibana docs`, + ]); + + expect(result.passed).to.be(true); + }); + + it('retrieves Observability documentation', async () => { + const prompt = + 'How can I set up APM instrumentation for my Node.js service in Elastic Observability?'; + const conversation = await chatClient.complete({ messages: prompt }); + + 
const result = await chatClient.evaluate(conversation, [ + `Uses the ${RETRIEVE_DOCUMENTATION_NAME} function before answering the question about Observability`, + 'Provides instructions based on the Observability docs for setting up APM instrumentation in a Node.js service', + 'Mentions steps like installing the APM agent, configuring it with the service name and APM Server URL, etc., as per the docs', + 'Does not provide hallucinated steps, should align with actual Observability documentation', + ]); + + expect(result.passed).to.be(true); + }); + + after(async () => { + // Uninstall all installed documentation + logger.info('Uninstalling Elastic documentation'); + const uninstallResponse = await kibanaClient.callKibana('post', { + pathname: ELASTIC_DOCS_UNINSTALL_ALL_API_PATH, + }); + + if (uninstallResponse.data.success) { + logger.success('Uninstalled Elastic documentation'); + } else { + logger.error('Could not uninstall Elastic documentation'); + } + }); +}); diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/elasticsearch/index.spec.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/elasticsearch/index.spec.ts index 898489290cc10..722043dc29e3e 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/elasticsearch/index.spec.ts +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/elasticsearch/index.spec.ts @@ -11,12 +11,15 @@ import expect from '@kbn/expect'; import { MessageRole } from '@kbn/observability-ai-assistant-plugin/common'; import { chatClient, esClient } from '../../services'; -describe('elasticsearch functions', () => { +describe('Elasticsearch functions', () => { describe('health', () => { it('returns the cluster health state', async () => { - const conversation = await chatClient.complete( - 'Can you tell me what the state of 
my Elasticsearch cluster is?' - ); + const conversation = await chatClient.complete({ + messages: 'Can you tell me what the state of my Elasticsearch cluster is?', + // using 'all' for elasticsearch scenarios enables the LLM correctly pick + // elasticsearch functions when querying for data + scope: 'all', + }); const result = await chatClient.evaluate(conversation, [ 'Calls the Elasticsearch function with method: GET and path: _cluster/health', @@ -58,7 +61,10 @@ describe('elasticsearch functions', () => { }); it('returns the count of docs in the KB', async () => { - const conversation = await chatClient.complete('How many documents are in the index kb?'); + const conversation = await chatClient.complete({ + messages: 'How many documents are in the index kb?', + scope: 'all', + }); const result = await chatClient.evaluate(conversation, [ 'Calls the `elasticsearch` function OR the `query` function', @@ -69,15 +75,19 @@ describe('elasticsearch functions', () => { }); it('returns store and refresh stats of an index', async () => { - let conversation = await chatClient.complete('What are the store stats of the index kb?'); + let conversation = await chatClient.complete({ + messages: 'What are the store stats of the index kb?', + scope: 'all', + }); - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ content: 'What are the the refresh stats of the index?', role: MessageRole.User, - }) - ); + }), + scope: 'all', + }); const result = await chatClient.evaluate(conversation, [ 'Calls the Elasticsearch function with method: kb/_stats/store', @@ -98,25 +108,29 @@ describe('elasticsearch functions', () => { describe('assistant created index', () => { it('creates index, adds documents and deletes index', async () => { - let conversation = await chatClient.complete( - 'Create a new index 
called testing_ai_assistant what will have two documents, one for the test_suite alerts with message "This test is for alerts" and another one for the test_suite esql with the message "This test is for esql"' - ); + let conversation = await chatClient.complete({ + messages: + 'Create a new index called testing_ai_assistant that will have two documents, one for the test_suite alerts with message "This test is for alerts" and another one for the test_suite esql with the message "This test is for esql"', + scope: 'all', + }); - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ content: 'What are the fields types for the index testing_ai_assistant?', role: MessageRole.User, - }) - ); + }), + scope: 'all', + }); - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ content: 'Delete the testing_ai_assistant index', role: MessageRole.User, - }) - ); + }), + scope: 'all', + }); const result = await chatClient.evaluate(conversation, [ 'Calls the Elasticsearch function to create the index testing_ai_assistant and add the documents to it', @@ -129,9 +143,13 @@ describe('elasticsearch functions', () => { }); }); }); + describe('other', () => { it('returns clusters license', async () => { - const conversation = await chatClient.complete('What is my clusters license?'); + const conversation = await chatClient.complete({ + messages: 'What is my clusters license?', + scope: 'all', + }); const result = await chatClient.evaluate(conversation, [ 'Calls the Elasticsearch function', diff --git 
a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/esql/index.spec.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/esql/index.spec.ts index 37699876f6165..23b0af988035c 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/esql/index.spec.ts +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/esql/index.spec.ts @@ -23,7 +23,7 @@ async function evaluateEsqlQuery({ execute?: boolean; criteria?: string[]; }): Promise { - const conversation = await chatClient.complete(question); + const conversation = await chatClient.complete({ messages: question, scope: 'all' }); const evaluation = await chatClient.evaluate(conversation, [ ...(expected @@ -33,8 +33,8 @@ async function evaluateEsqlQuery({ ] : []), ...(execute - ? [`The query successfully executed without an error`] - : [`The query was not executed, it was only explained`]), + ? ['The query successfully executed without an error'] + : ['The query was not executed, it was only explained']), ...criteria, ]); @@ -73,6 +73,7 @@ describe('ES|QL query generation', () => { }, }, }); + await esClient.index({ index: 'packetbeat-8.11.3', document: { @@ -348,6 +349,7 @@ describe('ES|QL query generation', () => { execute: false, }); }); + it('prod_web length', async () => { await evaluateEsqlQuery({ question: `can you convert this SPL query to ESQL? index=prod_web | eval length=len(message) | eval k255=if((length>255),1,0) | eval k2=if((length>2048),1,0) | eval k4=if((length>4096),1,0) |eval k16=if((length>16384),1,0) | stats count, sum(k255), sum(k2),sum(k4),sum(k16), sum(length)`, @@ -360,6 +362,7 @@ describe('ES|QL query generation', () => { execute: false, }); }); + it('prod_web filter message and host', async () => { await evaluateEsqlQuery({ question: `can you convert this SPL query to ESQL? 
index=prod_web NOT "Connection reset" NOT "[acm-app] created a ThreadLocal" sourcetype!=prod_urlf_east_logs sourcetype!=prod_urlf_west_logs host!="dbs-tools-*" NOT "Public] in context with path [/global] " host!="*dev*" host!="*qa*" host!="*uat*"`, diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts index 9e27ff94f3111..e047a02f5c582 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/kb/index.spec.ts @@ -9,43 +9,128 @@ import expect from '@kbn/expect'; import { MessageRole } from '@kbn/observability-ai-assistant-plugin/common'; -import { chatClient, esClient } from '../../services'; - -describe('kb functions', () => { - it('summarizes and recalls information', async () => { - let conversation = await chatClient.complete( - 'Remember that this cluster is used to test the AI Assistant using the Observability AI Evaluation Framework' - ); - - conversation = await chatClient.complete( - conversation.conversationId!, - conversation.messages.concat({ - content: 'What is this cluster used for?', - role: MessageRole.User, - }) - ); - - const result = await chatClient.evaluate(conversation, [ - 'Calls the summarize function', - 'Effectively summarizes and remembers that this cluster is used to test the AI Assistant using the Observability AI Evaluation Framework', - 'Calls the "context" function to respond to What is this cluster used for?', - 'Effectively recalls that this cluster is used to test the AI Assistant using the Observability AI Evaluation Framework', - ]); - - expect(result.passed).to.be(true); - }); +import { chatClient, esClient, kibanaClient } from '../../services'; + +const KB_INDEX = 
'.kibana-observability-ai-assistant-kb-*'; + +describe('Knowledge base', () => { + describe('kb functions', () => { + it('summarizes and recalls information', async () => { + let conversation = await chatClient.complete({ + messages: + 'Remember that this cluster is used to test the AI Assistant using the Observability AI Evaluation Framework', + }); + + conversation = await chatClient.complete({ + conversationId: conversation.conversationId!, + messages: conversation.messages.concat({ + content: 'What is this cluster used for?', + role: MessageRole.User, + }), + }); + + const result = await chatClient.evaluate(conversation, [ + 'Calls the summarize function', + 'Effectively summarizes and remembers that this cluster is used to test the AI Assistant using the Observability AI Evaluation Framework', + 'Calls the "context" function to respond to What is this cluster used for?', + 'Effectively recalls that this cluster is used to test the AI Assistant using the Observability AI Evaluation Framework', + ]); + + expect(result.passed).to.be(true); + }); - after(async () => { - await esClient.deleteByQuery({ - index: '.kibana-observability-ai-assistant-kb-*', - ignore_unavailable: true, - query: { - match: { - text: { - query: '*Observability AI Evaluation Framework*', + after(async () => { + await esClient.deleteByQuery({ + index: KB_INDEX, + ignore_unavailable: true, + query: { + match: { + text: { + query: '*Observability AI Evaluation Framework*', + }, }, }, + }); + }); + }); + + describe('kb retrieval', () => { + const testDocs = [ + { + id: 'doc_invention_1', + title: 'Quantum Revectorization Engine', + text: 'The Quantum Revectorization Engine (QRE), invented by Dr. Eliana Stone at Acme Labs in 2023, uses advanced quantum fields to reorder the subatomic structure of materials, effectively reconfiguring matter at a fundamental level. 
Its main achievement was to transform ordinary silicon wafers into superconductive materials without traditional cooling methods.', }, + { + id: 'doc_invention_2', + title: 'Constraints of QRE', + text: 'Current constraints on the Quantum Revectorization Engine technology limit its revectorization radius to approximately 2 nanometers. Additionally, the energy required to maintain the quantum fields is extraordinarily high, necessitating specialized fusion reactors to sustain the process.', + }, + ]; + + before(async () => { + await kibanaClient.installKnowledgeBase(); + try { + await esClient.deleteByQuery({ + index: KB_INDEX, + ignore_unavailable: true, + query: { match_all: {} }, + refresh: true, + }); + } catch (error) { + // ignore error + } + + // Insert the test documents into KB + await kibanaClient.callKibana( + 'post', + { pathname: '/internal/observability_ai_assistant/kb/entries/import' }, + { + entries: testDocs, + } + ); + }); + + it('retrieves inventor and purpose of the QRE', async () => { + const prompt = 'Who invented the Quantum Revectorization Engine and what does it do?'; + const conversation = await chatClient.complete({ messages: prompt }); + + const result = await chatClient.evaluate(conversation, [ + 'Uses KB retrieval function to find information about the Quantum Revectorization Engine', + 'Correctly identifies Dr. 
Eliana Stone at Acme Labs in 2023 as the inventor', + 'Accurately describes that it reorders the subatomic structure of materials and can transform silicon wafers into superconductive materials', + 'Does not invent unrelated or hallucinated details not present in the KB', + ]); + + expect(result.passed).to.be(true); + }); + + it('retrieves constraints and energy requirements of the QRE', async () => { + const prompt = + 'What is the approximate revectorization radius of the QRE and what kind of reactor is required to power it?'; + const conversation = await chatClient.complete({ messages: prompt }); + + const result = await chatClient.evaluate(conversation, [ + 'Uses KB retrieval function to find the correct document about QRE constraints', + 'Mentions the 2 nanometer limit on the revectorization radius', + 'Mentions that specialized fusion reactors are needed', + 'Does not mention information unrelated to constraints or energy (i.e., does not mention the inventor or silicon wafer transformation from doc-invention-1)', + ]); + + expect(result.passed).to.be(true); + }); + + after(async () => { + await esClient.deleteByQuery({ + index: KB_INDEX, + ignore_unavailable: true, + query: { + match: { + text: 'Quantum Revectorization Engine', + }, + }, + refresh: true, + }); }); }); }); diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/tsconfig.json b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/tsconfig.json index 212a36a502441..763bf6659170e 100644 --- a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/tsconfig.json +++ b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/tsconfig.json @@ -82,6 +82,7 @@ "@kbn/product-doc-common", "@kbn/charts-theme", "@kbn/ai-assistant-icon", + "@kbn/product-doc-base-plugin", ], "exclude": [ "target/**/*"