From 0b74f62a338fe1f6b281a019e71279f0d2ffb81f Mon Sep 17 00:00:00 2001
From: Pierre Gayvallet
Date: Tue, 17 Dec 2024 16:13:17 +0100
Subject: [PATCH] [inference] Add cancelation support for chatComplete and output (#203108)

## Summary

Fix https://github.com/elastic/kibana/issues/200757

Add cancellation support for `chatComplete` and `output`, based on an abort signal.

### Examples

#### response mode

```ts
import { isInferenceRequestAbortedError } from '@kbn/inference-common';

const abortController = new AbortController();

try {
  const chatResponse = await inferenceClient.chatComplete({
    connectorId: 'some-gen-ai-connector',
    abortSignal: abortController.signal,
    messages: [{ role: MessageRole.User, content: 'Do something' }],
  });
} catch (e) {
  if (isInferenceRequestAbortedError(e)) {
    // request was aborted, do something
  } else {
    // was another error, do something else
  }
}

// elsewhere, before the request completes:
abortController.abort();
```

#### stream mode

```ts
import { isInferenceRequestAbortedError } from '@kbn/inference-common';

const abortController = new AbortController();

const events$ = inferenceClient.chatComplete({
  stream: true,
  connectorId: 'some-gen-ai-connector',
  abortSignal: abortController.signal,
  messages: [{ role: MessageRole.User, content: 'Do something' }],
});

events$.subscribe({
  next: (event) => {
    // do something
  },
  error: (err) => {
    if (isInferenceRequestAbortedError(err)) {
      // request was aborted, do something
    } else {
      // was another error, do something else
    }
  },
});

abortController.abort();
```
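#### cancelling on a timeout

The signal is a standard `AbortSignal`, so it can come from any source. As an illustration (this wiring is not part of the change itself), aborting the call after a fixed delay:

```ts
import { isInferenceRequestAbortedError } from '@kbn/inference-common';

const abortController = new AbortController();
// illustrative delay: abort the LLM call if it takes more than 30 seconds
const timeoutId = setTimeout(() => abortController.abort(), 30_000);

try {
  const chatResponse = await inferenceClient.chatComplete({
    connectorId: 'some-gen-ai-connector',
    abortSignal: abortController.signal,
    messages: [{ role: MessageRole.User, content: 'Do something' }],
  });
} catch (e) {
  if (isInferenceRequestAbortedError(e)) {
    // the timeout fired and the request was aborted
  } else {
    // was another error, do something else
  }
} finally {
  clearTimeout(timeoutId);
}
```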
---
 .github/CODEOWNERS | 2 +-
 .../shared/ai-infra/inference-common/index.ts | 3 +
 .../inference-common/src/chat_complete/api.ts | 4 +
 .../ai-infra/inference-common/src/errors.ts | 44 ++++
 .../inference-common/src/output/api.ts | 5 +-
 .../plugins/shared/inference/README.md | 69 +++++
 .../common/output/create_output_api.test.ts | 22 ++
 .../common/output/create_output_api.ts | 3 +
 .../bedrock/bedrock_claude_adapter.test.ts | 19 ++
 .../bedrock/bedrock_claude_adapter.ts | 3 +-
 .../adapters/gemini/gemini_adapter.test.ts | 19 ++
 .../adapters/gemini/gemini_adapter.ts | 3 +-
 .../adapters/openai/openai_adapter.test.ts | 20 ++
 .../adapters/openai/openai_adapter.ts | 12 +-
 .../server/chat_complete/api.test.mocks.ts | 26 ++
 .../server/chat_complete/api.test.ts | 237 ++++++++++++++++++
 .../inference/server/chat_complete/api.ts | 35 +--
 .../inference/server/chat_complete/types.ts | 1 +
 .../utils/handle_cancellation.test.ts | 53 ++++
 .../utils/handle_cancellation.ts | 39 +++
 .../server/chat_complete/utils/index.ts | 3 +-
 .../chat_complete/utils/inference_executor.ts | 23 +-
 .../inference/server/routes/chat_complete.ts | 4 +
 .../inference/server/test_utils/index.ts | 11 +
 .../server/test_utils/inference_connector.ts | 19 ++
 .../test_utils/inference_connector_adapter.ts | 14 ++
 .../server/test_utils/inference_executor.ts | 19 ++
 27 files changed, 688 insertions(+), 24 deletions(-)
 create mode 100644 x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.mocks.ts
 create mode 100644 x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.ts
 create mode 100644 x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.test.ts
 create mode 100644 x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.ts
 create mode 100644 x-pack/platform/plugins/shared/inference/server/test_utils/index.ts
 create mode 100644 x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector.ts
 create mode 100644 x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector_adapter.ts
 create mode 100644 x-pack/platform/plugins/shared/inference/server/test_utils/inference_executor.ts

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d9def6481fc90..627579d513e9f 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1832,7 +1832,7 @@ packages/kbn-monaco/src/esql @elastic/kibana-esql
 #CC# /x-pack/plugins/global_search_providers/ @elastic/kibana-core
 
 # AppEx AI Infra
-/x-pack/plugins/inference @elastic/appex-ai-infra @elastic/obs-ai-assistant @elastic/security-generative-ai
+/x-pack/platform/plugins/shared/inference @elastic/appex-ai-infra @elastic/obs-ai-assistant @elastic/security-generative-ai
 /x-pack/test/functional_gen_ai/inference @elastic/appex-ai-infra
 
 # AppEx Platform Services Security
diff --git a/x-pack/platform/packages/shared/ai-infra/inference-common/index.ts b/x-pack/platform/packages/shared/ai-infra/inference-common/index.ts
index 603192fb96db4..134b0f02811fe 100644
--- a/x-pack/platform/packages/shared/ai-infra/inference-common/index.ts
+++ b/x-pack/platform/packages/shared/ai-infra/inference-common/index.ts
@@ -84,11 +84,14 @@ export {
   type InferenceTaskErrorEvent,
   type InferenceTaskInternalError,
   type InferenceTaskRequestError,
+  type InferenceTaskAbortedError,
   createInferenceInternalError,
   createInferenceRequestError,
+  createInferenceRequestAbortedError,
   isInferenceError,
   isInferenceInternalError,
   isInferenceRequestError,
+  isInferenceRequestAbortedError,
 } from './src/errors';
 
 export { truncateList } from './src/truncate_list';
diff --git a/x-pack/platform/packages/shared/ai-infra/inference-common/src/chat_complete/api.ts b/x-pack/platform/packages/shared/ai-infra/inference-common/src/chat_complete/api.ts
index cb91f4e53e8ae..4e29d5f7dad06 100644
--- a/x-pack/platform/packages/shared/ai-infra/inference-common/src/chat_complete/api.ts
+++ b/x-pack/platform/packages/shared/ai-infra/inference-common/src/chat_complete/api.ts
@@ -93,6 +93,10 @@ export type ChatCompleteOptions<
    * Function calling mode, defaults to "native".
    */
   functionCalling?: FunctionCallingMode;
+  /**
+   * Optional signal that can be used to forcefully abort the request.
+   */
+  abortSignal?: AbortSignal;
 } & TToolOptions;
 
 /**
diff --git a/x-pack/platform/packages/shared/ai-infra/inference-common/src/errors.ts b/x-pack/platform/packages/shared/ai-infra/inference-common/src/errors.ts
index 5a99adc4321d9..472ed50e231f5 100644
--- a/x-pack/platform/packages/shared/ai-infra/inference-common/src/errors.ts
+++ b/x-pack/platform/packages/shared/ai-infra/inference-common/src/errors.ts
@@ -13,6 +13,7 @@ import { InferenceTaskEventBase, InferenceTaskEventType } from './inference_task
 export enum InferenceTaskErrorCode {
   internalError = 'internalError',
   requestError = 'requestError',
+  abortedError = 'requestAborted',
 }
 
 /**
@@ -46,16 +47,37 @@ export type InferenceTaskErrorEvent = InferenceTaskEventBase >;
+/**
+ * Inference error thrown when the request was considered invalid.
+ *
+ * Some examples of reasons for invalid requests would be:
+ * - no connector matching the provided connectorId
+ * - invalid connector type for the provided connectorId
+ */
 export type InferenceTaskRequestError = InferenceTaskError<
   InferenceTaskErrorCode.requestError,
   { status: number }
 >;
 
+/**
+ * Inference error thrown when the request was aborted.
+ *
+ * Request abortion occurs when providing an abort signal and firing it
+ * before the call to the LLM completes.
+ */
+export type InferenceTaskAbortedError = InferenceTaskError<
+  InferenceTaskErrorCode.abortedError,
+  { status: number }
+>;
+
 export function createInferenceInternalError(
   message = 'An internal error occurred',
   meta?: Record<string, any>
@@ -72,16 +94,38 @@ export function createInferenceRequestError(
   });
 }
 
+export function createInferenceRequestAbortedError(): InferenceTaskAbortedError {
+  return new InferenceTaskError(InferenceTaskErrorCode.abortedError, 'Request was aborted', {
+    status: 499,
+  });
+}
+
+/**
+ * Check if the given error is an {@link InferenceTaskError}
+ */
 export function isInferenceError(
   error: unknown
 ): error is InferenceTaskError<string, Record<string, any> | undefined> {
   return error instanceof InferenceTaskError;
 }
 
+/**
+ * Check if the given error is an {@link InferenceTaskInternalError}
+ */
 export function isInferenceInternalError(error: unknown): error is InferenceTaskInternalError {
   return isInferenceError(error) && error.code === InferenceTaskErrorCode.internalError;
 }
 
+/**
+ * Check if the given error is an {@link InferenceTaskRequestError}
+ */
 export function isInferenceRequestError(error: unknown): error is InferenceTaskRequestError {
   return isInferenceError(error) && error.code === InferenceTaskErrorCode.requestError;
 }
+
+/**
+ * Check if the given error is an {@link InferenceTaskAbortedError}
+ */
+export function isInferenceRequestAbortedError(error: unknown): error is InferenceTaskAbortedError {
+  return isInferenceError(error) && error.code === InferenceTaskErrorCode.abortedError;
+}
diff --git a/x-pack/platform/packages/shared/ai-infra/inference-common/src/output/api.ts b/x-pack/platform/packages/shared/ai-infra/inference-common/src/output/api.ts
index cd90394cd67d3..3ae4a6a07ee29 100644
--- a/x-pack/platform/packages/shared/ai-infra/inference-common/src/output/api.ts
+++ b/x-pack/platform/packages/shared/ai-infra/inference-common/src/output/api.ts
@@ -96,7 +96,10 @@ export interface OutputOptions<
    * Defaults to false.
    */
   stream?: TStream;
-
+  /**
+   * Optional signal that can be used to forcefully abort the request.
+   */
+  abortSignal?: AbortSignal;
   /**
    * Optional configuration for retrying the call if an error occurs.
    */
diff --git a/x-pack/platform/plugins/shared/inference/README.md b/x-pack/platform/plugins/shared/inference/README.md
index bba5b4cdcfc27..a52e589a9deab 100644
--- a/x-pack/platform/plugins/shared/inference/README.md
+++ b/x-pack/platform/plugins/shared/inference/README.md
@@ -221,6 +221,75 @@ const toolCall = toolCalls[0];
 // process the tool call and eventually continue the conversation with the LLM
 ```
 
+#### Request cancellation
+
+Requests can be canceled by passing an abort signal when calling the API. Firing the signal
+before the request completes aborts the request, and the API call will throw an error.
+
+```ts
+const abortController = new AbortController();
+
+const chatResponse = await inferenceClient.chatComplete({
+  connectorId: 'some-gen-ai-connector',
+  abortSignal: abortController.signal,
+  messages: [{ role: MessageRole.User, content: 'Do something' }],
+});
+
+// from elsewhere / before the request completes and the promise resolves:
+
+abortController.abort();
+```
+
+The `isInferenceRequestAbortedError` helper function, exposed from `@kbn/inference-common`, can be used to easily identify those errors:
+
+```ts
+import { isInferenceRequestAbortedError } from '@kbn/inference-common';
+
+const abortController = new AbortController();
+
+try {
+  const chatResponse = await inferenceClient.chatComplete({
+    connectorId: 'some-gen-ai-connector',
+    abortSignal: abortController.signal,
+    messages: [{ role: MessageRole.User, content: 'Do something' }],
+  });
+} catch (e) {
+  if (isInferenceRequestAbortedError(e)) {
+    // request was aborted, do something
+  } else {
+    // was another error, do something else
+  }
+}
+```
+
+The approach is very similar for stream mode:
+
+```ts
+import { isInferenceRequestAbortedError } from '@kbn/inference-common';
+
+const abortController = new AbortController();
+const events$ = inferenceClient.chatComplete({
+  stream: true,
+  connectorId: 'some-gen-ai-connector',
+  abortSignal: abortController.signal,
+  messages: [{ role: MessageRole.User, content: 'Do something' }],
+});
+
+events$.subscribe({
+  next: (event) => {
+    // do something
+  },
+  error: (err) => {
+    if (isInferenceRequestAbortedError(err)) {
+      // request was aborted, do something
+    } else {
+      // was another error, do something else
+    }
+  },
+});
+
+abortController.abort();
+```
+
 ### `output` API
 
 `output` is a wrapper around the `chatComplete` API that is catered towards a specific use case: having the LLM output a structured response, based on a schema.
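Since this change threads the same `abortSignal` option through `OutputOptions` and `createOutputApi`, cancellation works the same way for `output`; a minimal sketch (the task id and input are illustrative):

```ts
import { isInferenceRequestAbortedError } from '@kbn/inference-common';

const abortController = new AbortController();

try {
  const response = await inferenceClient.output({
    id: 'my_task', // illustrative task id
    connectorId: 'some-gen-ai-connector',
    abortSignal: abortController.signal,
    input: 'Do something',
  });
} catch (e) {
  if (isInferenceRequestAbortedError(e)) {
    // request was aborted, do something
  }
}

// elsewhere, before the call completes:
abortController.abort();
```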
diff --git a/x-pack/platform/plugins/shared/inference/common/output/create_output_api.test.ts b/x-pack/platform/plugins/shared/inference/common/output/create_output_api.test.ts
index c65720aae2e48..d29f88009f8e5 100644
--- a/x-pack/platform/plugins/shared/inference/common/output/create_output_api.test.ts
+++ b/x-pack/platform/plugins/shared/inference/common/output/create_output_api.test.ts
@@ -196,4 +196,26 @@ describe('createOutputApi', () => {
       ).toThrowError('Retry options are not supported in streaming mode');
     });
   });
+
+  it('propagates the abort signal when provided', async () => {
+    chatComplete.mockResolvedValue(Promise.resolve({ content: 'content', toolCalls: [] }));
+
+    const output = createOutputApi(chatComplete);
+
+    const abortController = new AbortController();
+
+    await output({
+      id: 'id',
+      connectorId: '.my-connector',
+      input: 'input message',
+      abortSignal: abortController.signal,
+    });
+
+    expect(chatComplete).toHaveBeenCalledTimes(1);
+    expect(chatComplete).toHaveBeenCalledWith(
+      expect.objectContaining({
+        abortSignal: abortController.signal,
+      })
+    );
+  });
 });
diff --git a/x-pack/platform/plugins/shared/inference/common/output/create_output_api.ts b/x-pack/platform/plugins/shared/inference/common/output/create_output_api.ts
index 3e65cb283dd45..7cd7e9cad144e 100644
--- a/x-pack/platform/plugins/shared/inference/common/output/create_output_api.ts
+++ b/x-pack/platform/plugins/shared/inference/common/output/create_output_api.ts
@@ -34,6 +34,7 @@ export function createOutputApi(chatCompleteApi: ChatCompleteAPI) {
     previousMessages,
     functionCalling,
     stream,
+    abortSignal,
     retry,
   }: DefaultOutputOptions): OutputCompositeResponse {
     if (stream && retry !== undefined) {
@@ -52,6 +53,7 @@
       connectorId,
       stream,
       functionCalling,
+      abortSignal,
       system,
       messages,
       ...(schema
@@ -113,6 +115,7 @@
         input,
         schema,
         system,
+        abortSignal,
         previousMessages: messages.concat(
           {
             role: MessageRole.Assistant as const,
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.test.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.test.ts
index 565727b7f57fe..c6114c3b09e95 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.test.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.test.ts
@@ -325,5 +325,24 @@ describe('bedrockClaudeAdapter', () => {
       expect(tools).toEqual([]);
       expect(system).toEqual(addNoToolUsageDirective('some system instruction'));
     });
+
+    it('propagates the abort signal when provided', () => {
+      const abortController = new AbortController();
+
+      bedrockClaudeAdapter.chatComplete({
+        logger,
+        executor: executorMock,
+        messages: [{ role: MessageRole.User, content: 'question' }],
+        abortSignal: abortController.signal,
+      });
+
+      expect(executorMock.invoke).toHaveBeenCalledTimes(1);
+      expect(executorMock.invoke).toHaveBeenCalledWith({
+        subAction: 'invokeStream',
+        subActionParams: expect.objectContaining({
+          signal: abortController.signal,
+        }),
+      });
+    });
   });
 });
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.ts
index e73d9c9344c98..e34605a4c96ad 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/bedrock/bedrock_claude_adapter.ts
@@ -26,7 +26,7 @@ import { processCompletionChunks } from './process_completion_chunks';
 import { addNoToolUsageDirective } from './prompts';
 
 export const bedrockClaudeAdapter: InferenceConnectorAdapter = {
-  chatComplete: ({ executor, system, messages, toolChoice, tools }) => {
+  chatComplete: ({ executor, system, messages, toolChoice, tools, abortSignal }) => {
     const noToolUsage = toolChoice === ToolChoiceType.none;
 
     const subActionParams = {
@@ -36,6 +36,7 @@
       toolChoice: toolChoiceToBedrock(toolChoice),
       temperature: 0,
       stopSequences: ['\n\nHuman:'],
+      signal: abortSignal,
     };
 
     return from(
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.test.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.test.ts
index 95a46f73d5d1f..5024bd1f4c87e 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.test.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.test.ts
@@ -402,5 +402,24 @@ describe('geminiAdapter', () => {
       expect(tapFn).toHaveBeenCalledWith({ chunk: 1 });
       expect(tapFn).toHaveBeenCalledWith({ chunk: 2 });
     });
+
+    it('propagates the abort signal when provided', () => {
+      const abortController = new AbortController();
+
+      geminiAdapter.chatComplete({
+        logger,
+        executor: executorMock,
+        messages: [{ role: MessageRole.User, content: 'question' }],
+        abortSignal: abortController.signal,
+      });
+
+      expect(executorMock.invoke).toHaveBeenCalledTimes(1);
+      expect(executorMock.invoke).toHaveBeenCalledWith({
+        subAction: 'invokeStream',
+        subActionParams: expect.objectContaining({
+          signal: abortController.signal,
+        }),
+      });
+    });
   });
 });
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.ts
index 80d0439449066..aa62f7006eac7 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/gemini/gemini_adapter.ts
@@ -22,7 +22,7 @@ import { processVertexStream } from './process_vertex_stream';
 import type { GenerateContentResponseChunk, GeminiMessage, GeminiToolConfig } from './types';
 
 export const geminiAdapter: InferenceConnectorAdapter = {
-  chatComplete: ({ executor, system, messages, toolChoice, tools }) => {
+  chatComplete: ({ executor, system, messages, toolChoice, tools, abortSignal }) => {
     return from(
       executor.invoke({
         subAction: 'invokeStream',
@@ -32,6 +32,7 @@
           tools: toolsToGemini(tools),
           toolConfig: toolChoiceToConfig(toolChoice),
           temperature: 0,
+          signal: abortSignal,
           stopSequences: ['\n\nHuman:'],
         },
       })
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.test.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.test.ts
index 48544f1bb0fb1..9b7fbc388024f 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.test.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.test.ts
@@ -77,6 +77,7 @@ describe('openAIAdapter', () => {
       };
     });
   });
+
   it('correctly formats messages ', () => {
     openAIAdapter.chatComplete({
       ...defaultArgs,
@@ -254,6 +255,25 @@
       expect(getRequest().stream).toBe(true);
       expect(getRequest().body.stream).toBe(true);
     });
+
+    it('propagates the abort signal when provided', () => {
+      const abortController = new AbortController();
+
+      openAIAdapter.chatComplete({
+        logger,
+        executor: executorMock,
+        messages: [{ role: MessageRole.User, content: 'question' }],
+        abortSignal: abortController.signal,
+      });
+
+      expect(executorMock.invoke).toHaveBeenCalledTimes(1);
+      expect(executorMock.invoke).toHaveBeenCalledWith({
+        subAction: 'stream',
+        subActionParams: expect.objectContaining({
+          signal: abortController.signal,
+        }),
+      });
+    });
   });
 
   describe('when handling the response', () => {
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.ts
index 49b6bb5142023..0529820b1bfbf 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/adapters/openai/openai_adapter.ts
@@ -43,7 +43,16 @@ import {
 } from '../../simulated_function_calling';
 
 export const openAIAdapter: InferenceConnectorAdapter = {
-  chatComplete: ({ executor, system, messages, toolChoice, tools, functionCalling, logger }) => {
+  chatComplete: ({
+    executor,
+    system,
+    messages,
+    toolChoice,
+    tools,
+    functionCalling,
+    logger,
+    abortSignal,
+  }) => {
     const stream = true;
     const simulatedFunctionCalling = functionCalling === 'simulated';
 
@@ -73,6 +82,7 @@
         subAction: 'stream',
         subActionParams: {
           body: JSON.stringify(request),
+          signal: abortSignal,
           stream,
         },
       })
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.mocks.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.mocks.ts
new file mode 100644
index 0000000000000..e3248b79af406
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.mocks.ts
@@ -0,0 +1,26 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+export const getInferenceAdapterMock = jest.fn();
+
+jest.doMock('./adapters', () => {
+  const actual = jest.requireActual('./adapters');
+  return {
+    ...actual,
+    getInferenceAdapter: getInferenceAdapterMock,
+  };
+});
+
+export const getInferenceExecutorMock = jest.fn();
+
+jest.doMock('./utils', () => {
+  const actual = jest.requireActual('./utils');
+  return {
+    ...actual,
+    getInferenceExecutor: getInferenceExecutorMock,
+  };
+});
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.ts
new file mode 100644
index 0000000000000..7d557ec512fcc
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/api.test.ts
@@ -0,0 +1,237 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { getInferenceExecutorMock, getInferenceAdapterMock } from './api.test.mocks';
+
+import { of, Subject, isObservable, toArray, firstValueFrom } from 'rxjs';
+import { loggerMock, type MockedLogger } from '@kbn/logging-mocks';
+import { httpServerMock } from '@kbn/core/server/mocks';
+import { actionsMock } from '@kbn/actions-plugin/server/mocks';
+import {
+  type ChatCompleteAPI,
+  type ChatCompletionChunkEvent,
+  MessageRole,
+} from '@kbn/inference-common';
+import {
+  createInferenceConnectorAdapterMock,
+  createInferenceConnectorMock,
+  createInferenceExecutorMock,
+  chunkEvent,
+} from '../test_utils';
+import { createChatCompleteApi } from './api';
+
+describe('createChatCompleteApi', () => {
+  let request: ReturnType<typeof httpServerMock.createKibanaRequest>;
+  let logger: MockedLogger;
+  let actions: ReturnType<typeof actionsMock.createStart>;
+  let inferenceAdapter: ReturnType<typeof createInferenceConnectorAdapterMock>;
+  let inferenceConnector: ReturnType<typeof createInferenceConnectorMock>;
+  let inferenceExecutor: ReturnType<typeof createInferenceExecutorMock>;
+
+  let chatComplete: ChatCompleteAPI;
+
+  beforeEach(() => {
+    request = httpServerMock.createKibanaRequest();
+    logger = loggerMock.create();
+    actions = actionsMock.createStart();
+
+    chatComplete = createChatCompleteApi({ request, actions, logger });
+
+    inferenceAdapter = createInferenceConnectorAdapterMock();
+    inferenceAdapter.chatComplete.mockReturnValue(of(chunkEvent('chunk-1')));
+    getInferenceAdapterMock.mockReturnValue(inferenceAdapter);
+
+    inferenceConnector = createInferenceConnectorMock();
+
+    inferenceExecutor = createInferenceExecutorMock({ connector: inferenceConnector });
+    getInferenceExecutorMock.mockResolvedValue(inferenceExecutor);
+  });
+
+  afterEach(() => {
+    getInferenceExecutorMock.mockReset();
+    getInferenceAdapterMock.mockReset();
+  });
+
+  it('calls `getInferenceExecutor` with the right parameters', async () => {
+    await chatComplete({
+      connectorId: 'connectorId',
+      messages: [{ role: MessageRole.User, content: 'question' }],
+    });
+
+    expect(getInferenceExecutorMock).toHaveBeenCalledTimes(1);
+    expect(getInferenceExecutorMock).toHaveBeenCalledWith({
+      connectorId: 'connectorId',
+      request,
+      actions,
+    });
+  });
+
+  it('calls `getInferenceAdapter` with the right parameters', async () => {
+    await chatComplete({
+      connectorId: 'connectorId',
+      messages: [{ role: MessageRole.User, content: 'question' }],
+    });
+
+    expect(getInferenceAdapterMock).toHaveBeenCalledTimes(1);
+    expect(getInferenceAdapterMock).toHaveBeenCalledWith(inferenceConnector.type);
+  });
+
+  it('calls `inferenceAdapter.chatComplete` with the right parameters', async () => {
+    await chatComplete({
+      connectorId: 'connectorId',
+      messages: [{ role: MessageRole.User, content: 'question' }],
+    });
+
+    expect(inferenceAdapter.chatComplete).toHaveBeenCalledTimes(1);
+    expect(inferenceAdapter.chatComplete).toHaveBeenCalledWith({
+      messages: [{ role: MessageRole.User, content: 'question' }],
+      executor: inferenceExecutor,
+      logger,
+    });
+  });
+
+  it('throws if the connector is not compatible', async () => {
+    getInferenceAdapterMock.mockReturnValue(undefined);
+
+    await expect(
+      chatComplete({
+        connectorId: 'connectorId',
+        messages: [{ role: MessageRole.User, content: 'question' }],
+      })
+    ).rejects.toThrowErrorMatchingInlineSnapshot(`"Adapter for type .gen-ai not implemented"`);
+  });
+
+  describe('response mode', () => {
+    it('returns a promise resolving with the response', async () => {
+      inferenceAdapter.chatComplete.mockReturnValue(
+        of(chunkEvent('chunk-1'), chunkEvent('chunk-2'))
+      );
+
+      const response = await chatComplete({
+        connectorId: 'connectorId',
+        messages: [{ role: MessageRole.User, content: 'question' }],
+      });
+
+      expect(response).toEqual({
+        content: 'chunk-1chunk-2',
+        toolCalls: [],
+      });
+    });
+
+    describe('request cancellation', () => {
+      it('passes the abortSignal down to `inferenceAdapter.chatComplete`', async () => {
+        const abortController = new AbortController();
+
+        await chatComplete({
+          connectorId: 'connectorId',
+          messages: [{ role: MessageRole.User, content: 'question' }],
+          abortSignal: abortController.signal,
+        });
+
+        expect(inferenceAdapter.chatComplete).toHaveBeenCalledTimes(1);
+        expect(inferenceAdapter.chatComplete).toHaveBeenCalledWith({
+          messages: [{ role: MessageRole.User, content: 'question' }],
+          executor: inferenceExecutor,
+          abortSignal: abortController.signal,
+          logger,
+        });
+      });
+
+      it('throws an error when the signal is triggered', async () => {
+        const abortController = new AbortController();
+
+        const subject = new Subject<ChatCompletionChunkEvent>();
+        inferenceAdapter.chatComplete.mockReturnValue(subject.asObservable());
+
+        subject.next(chunkEvent('chunk-1'));
+
+        let caughtError: any;
+
+        const promise = chatComplete({
+          connectorId: 'connectorId',
+          messages: [{ role: MessageRole.User, content: 'question' }],
+          abortSignal: abortController.signal,
+        }).catch((err) => {
+          caughtError = err;
+        });
+
+        abortController.abort();
+
+        await promise;
+
+        expect(caughtError).toBeInstanceOf(Error);
+        expect(caughtError.message).toContain('Request was aborted');
+      });
+    });
+  });
+
+  describe('stream mode', () => {
+    it('returns an observable of events', async () => {
+      inferenceAdapter.chatComplete.mockReturnValue(
+        of(chunkEvent('chunk-1'), chunkEvent('chunk-2'))
+      );
+
+      const events$ = chatComplete({
+        stream: true,
+        connectorId: 'connectorId',
+        messages: [{ role: MessageRole.User, content: 'question' }],
+      });
+
+      expect(isObservable(events$)).toBe(true);
+
+      const events = await firstValueFrom(events$.pipe(toArray()));
+      expect(events).toEqual([
+        {
+          content: 'chunk-1',
+          tool_calls: [],
+          type: 'chatCompletionChunk',
+        },
+        {
+          content: 'chunk-2',
+          tool_calls: [],
+          type: 'chatCompletionChunk',
+        },
+        {
+          content: 'chunk-1chunk-2',
+          toolCalls: [],
+          type: 'chatCompletionMessage',
+        },
+      ]);
+    });
+
+    describe('request cancellation', () => {
+      it('throws an error when the signal is triggered', async () => {
+        const abortController = new AbortController();
+
+        const subject = new Subject<ChatCompletionChunkEvent>();
+        inferenceAdapter.chatComplete.mockReturnValue(subject.asObservable());
+
+        subject.next(chunkEvent('chunk-1'));
+
+        let caughtError: any;
+
+        const events$ = chatComplete({
+          stream: true,
+          connectorId: 'connectorId',
+          messages: [{ role: MessageRole.User, content: 'question' }],
+          abortSignal: abortController.signal,
+        });
+
+        events$.subscribe({
+          error: (err: any) => {
+            caughtError = err;
+          },
+        });
+
+        abortController.abort();
+
+        expect(caughtError).toBeInstanceOf(Error);
+        expect(caughtError.message).toContain('Request was aborted');
+      });
+    });
+  });
+});
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/api.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/api.ts
index e58c94759e165..0e58c255bd608 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/api.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/api.ts
@@ -6,7 +6,7 @@
  */
 
 import { last, omit } from 'lodash';
-import { defer, switchMap, throwError } from 'rxjs';
+import { defer, switchMap, throwError, identity } from 'rxjs';
 import type { Logger } from '@kbn/logging';
 import type { KibanaRequest } from '@kbn/core-http-server';
 import {
@@ -17,9 +17,13 @@ import {
   ChatCompleteOptions,
 } from '@kbn/inference-common';
 import type { PluginStartContract as ActionsPluginStart } from '@kbn/actions-plugin/server';
-import { getConnectorById } from '../util/get_connector_by_id';
 import { getInferenceAdapter } from './adapters';
-import { createInferenceExecutor, chunksIntoMessage, streamToResponse } from './utils';
+import {
+  getInferenceExecutor,
+  chunksIntoMessage,
+  streamToResponse,
+  handleCancellation,
+} from './utils';
 
 interface CreateChatCompleteApiOptions {
   request: KibanaRequest;
@@ -37,18 +41,16 @@ export function createChatCompleteApi({ request, actions, logger }: CreateChatCo
     system,
     functionCalling,
     stream,
+    abortSignal,
   }: ChatCompleteOptions<ToolOptions, boolean>): ChatCompleteCompositeResponse<
     ToolOptions,
     boolean
   > => {
-    const obs$ = defer(async () => {
-      const actionsClient = await actions.getActionsClientWithRequest(request);
-      const connector = await getConnectorById({ connectorId, actionsClient });
-      const executor = createInferenceExecutor({ actionsClient, connector });
-      return { executor, connector };
+    const inference$ = defer(async () => {
+      return await getInferenceExecutor({ connectorId, request, actions });
     }).pipe(
-      switchMap(({ executor, connector }) => {
-        const connectorType = connector.type;
+      switchMap((executor) => {
+        const connectorType = executor.getConnector().type;
         const inferenceAdapter = getInferenceAdapter(connectorType);
 
         const messagesWithoutData = messages.map((message) => omit(message, 'data'));
@@ -80,21 +82,20 @@
           tools,
           logger,
           functionCalling,
+          abortSignal,
         });
       }),
      chunksIntoMessage({
-        toolOptions: {
-          toolChoice,
-          tools,
-        },
+        toolOptions: { toolChoice, tools },
         logger,
      }),
+      abortSignal ? handleCancellation(abortSignal) : identity
     );
 
     if (stream) {
-      return obs$;
+      return inference$;
     } else {
-      return streamToResponse(obs$);
+      return streamToResponse(inference$);
     }
   };
 }
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/types.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/types.ts
index 64cc542ff6119..498afb9a2a17f 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/types.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/types.ts
@@ -29,6 +29,7 @@ export interface InferenceConnectorAdapter {
       messages: Message[];
       system?: string;
       functionCalling?: FunctionCallingMode;
+      abortSignal?: AbortSignal;
       logger: Logger;
     } & ToolOptions
   ) => Observable;
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.test.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.test.ts
new file mode 100644
index 0000000000000..7fd464a7051c6
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.test.ts
@@ -0,0 +1,53 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { of, Subject, toArray, firstValueFrom } from 'rxjs';
+import { InferenceTaskError, InferenceTaskErrorCode } from '@kbn/inference-common';
+import { handleCancellation } from './handle_cancellation';
+
+describe('handleCancellation', () => {
+  it('mirrors the source when the abort signal is not triggered', async () => {
+    const abortController = new AbortController();
+
+    const source$ = of(1, 2, 3);
+
+    const output$ = source$.pipe(handleCancellation(abortController.signal));
+
+    const events = await firstValueFrom(output$.pipe(toArray()));
+    expect(events).toEqual([1, 2, 3]);
+  });
+
+  it('causes the observable to error when the signal fires', () => {
+    const abortController = new AbortController();
+
+    const source$ = new Subject<number>();
+
+    const output$ = source$.pipe(handleCancellation(abortController.signal));
+
+    let thrownError: any;
+    const values: number[] = [];
+
+    output$.subscribe({
+      next: (value) => {
+        values.push(value);
+      },
+      error: (err) => {
+        thrownError = err;
+      },
+    });
+
+    source$.next(1);
+    source$.next(2);
+    abortController.abort();
+    source$.next(3);
+
+    expect(values).toEqual([1, 2]);
+    expect(thrownError).toBeInstanceOf(InferenceTaskError);
+    expect(thrownError.code).toBe(InferenceTaskErrorCode.abortedError);
+    expect(thrownError.message).toContain('Request was aborted');
+  });
+});
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.ts
new file mode 100644
index 0000000000000..640172b150e4e
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/handle_cancellation.ts
@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { OperatorFunction, Observable, Subject, takeUntil } from 'rxjs';
+import { createInferenceRequestAbortedError } from '@kbn/inference-common';
+
+export function handleCancellation<T>(abortSignal: AbortSignal): OperatorFunction<T, T> {
+  return (source$) => {
+    const stop$ = new Subject<void>();
+    if (abortSignal.aborted) {
+      stop$.next();
+    }
+    abortSignal.addEventListener('abort', () => {
+      stop$.next();
+    });
+
+    return new Observable<T>((subscriber) => {
+      return source$.pipe(takeUntil(stop$)).subscribe({
+        next: (value) => {
+          subscriber.next(value);
+        },
+        error: (err) => {
+          subscriber.error(err);
+        },
+        complete: () => {
+          if (abortSignal.aborted) {
+            subscriber.error(createInferenceRequestAbortedError());
+          } else {
+            subscriber.complete();
+          }
+        },
+      });
+    });
+  };
+}
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/index.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/index.ts
index d3dc2010cba3a..4314a554589dd 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/index.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/index.ts
@@ -6,10 +6,11 @@
  */
 
 export {
-  createInferenceExecutor,
+  getInferenceExecutor,
   type InferenceInvokeOptions,
   type InferenceInvokeResult,
   type InferenceExecutor,
 } from './inference_executor';
 export { chunksIntoMessage } from './chunks_into_message';
 export { streamToResponse } from './stream_to_response';
+export { handleCancellation } from './handle_cancellation';
diff --git a/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/inference_executor.ts b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/inference_executor.ts
index 736beb82aa685..c461e6b6cdfb7 100644
--- a/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/inference_executor.ts
+++ b/x-pack/platform/plugins/shared/inference/server/chat_complete/utils/inference_executor.ts
@@ -5,9 +5,14 @@
  * 2.0.
  */
 
+import type { KibanaRequest } from '@kbn/core-http-server';
 import type { ActionTypeExecutorResult } from '@kbn/actions-plugin/common';
-import type { ActionsClient } from '@kbn/actions-plugin/server';
+import type {
+  ActionsClient,
+  PluginStartContract as ActionsPluginStart,
+} from '@kbn/actions-plugin/server';
 import type { InferenceConnector } from '../../../common/connectors';
+import { getConnectorById } from '../../util/get_connector_by_id';
 
 export interface InferenceInvokeOptions {
   subAction: string;
@@ -22,6 +27,7 @@ export type InferenceInvokeResult<Data = unknown> = ActionTypeExecutorResult<Da
 
 export interface InferenceExecutor {
+  getConnector: () => InferenceConnector;
   invoke(params: InferenceInvokeOptions): Promise<InferenceInvokeResult>;
 }
@@ -33,6 +39,7 @@ export const createInferenceExecutor = ({
   actionsClient: ActionsClient;
 }): InferenceExecutor => {
   return {
+    getConnector: () => connector,
     async invoke({ subAction, subActionParams }): Promise<InferenceInvokeResult> {
       return await actionsClient.execute({
         actionId: connector.connectorId,
@@ -44,3 +51,17 @@
       }
     },
   };
 };
+
+export const getInferenceExecutor = async ({
+  connectorId,
+  actions,
+  request,
+}: {
+  connectorId: string;
+  actions: ActionsPluginStart;
+  request: KibanaRequest;
+}) => {
+  const actionsClient = await actions.getActionsClientWithRequest(request);
+  const connector = await getConnectorById({ connectorId, actionsClient });
+  return createInferenceExecutor({ actionsClient, connector });
+};
diff --git a/x-pack/platform/plugins/shared/inference/server/routes/chat_complete.ts b/x-pack/platform/plugins/shared/inference/server/routes/chat_complete.ts
index 84e3dd57cded1..06ca5381cd83a 100644
--- a/x-pack/platform/plugins/shared/inference/server/routes/chat_complete.ts
+++ b/x-pack/platform/plugins/shared/inference/server/routes/chat_complete.ts
@@ -109,6 +109,9 @@ export function registerChatCompleteRoute({
         .getStartServices()
         .then(([coreStart, pluginsStart]) => pluginsStart.actions);
 
+      const abortController = new AbortController();
+      request.events.aborted$.subscribe(() => abortController.abort());
+
       const client = createInferenceClient({ request, actions, logger });
 
       const { connectorId, messages, system, toolChoice, tools, functionCalling } = request.body;
@@ -121,6 +124,7 @@
         tools,
         functionCalling,
         stream,
+        abortSignal: abortController.signal,
       });
   }
diff --git a/x-pack/platform/plugins/shared/inference/server/test_utils/index.ts b/x-pack/platform/plugins/shared/inference/server/test_utils/index.ts
new file mode 100644
index 0000000000000..2eafe20bfdcaf
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/test_utils/index.ts
@@ -0,0 +1,11 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+export { chunkEvent, tokensEvent, messageEvent } from './chat_complete_events';
+export { createInferenceConnectorMock } from './inference_connector';
+export { createInferenceConnectorAdapterMock } from './inference_connector_adapter';
+export { createInferenceExecutorMock } from './inference_executor';
diff --git a/x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector.ts b/x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector.ts
new file mode 100644
index 0000000000000..af7f35115325d
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector.ts
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { InferenceConnector, InferenceConnectorType } from '../../common/connectors';
+
+export const createInferenceConnectorMock = (
+  parts: Partial<InferenceConnector> = {}
+): InferenceConnector => {
+  return {
+    type: InferenceConnectorType.OpenAI,
+    name: 'Inference connector',
+    connectorId: 'connector-id',
+    ...parts,
+  };
+};
diff --git a/x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector_adapter.ts b/x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector_adapter.ts
new file mode 100644
index 0000000000000..9e2c4516f4f1c
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/test_utils/inference_connector_adapter.ts
@@ -0,0 +1,14 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { InferenceConnectorAdapter } from '../chat_complete/types';
+
+export const createInferenceConnectorAdapterMock = (): jest.Mocked<InferenceConnectorAdapter> => {
+  return {
+    chatComplete: jest.fn(),
+  };
+};
diff --git a/x-pack/platform/plugins/shared/inference/server/test_utils/inference_executor.ts b/x-pack/platform/plugins/shared/inference/server/test_utils/inference_executor.ts
new file mode 100644
index 0000000000000..64b5100a9db3d
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/test_utils/inference_executor.ts
@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { InferenceConnector } from '../../common/connectors';
+import { InferenceExecutor } from '../chat_complete/utils';
+import { createInferenceConnectorMock } from './inference_connector';
+
+export const createInferenceExecutorMock = ({
+  connector = createInferenceConnectorMock(),
+}: { connector?: InferenceConnector } = {}): jest.Mocked<InferenceExecutor> => {
+  return {
+    getConnector: jest.fn().mockReturnValue(connector),
+    invoke: jest.fn(),
+  };
+};