// grafana/public/app/features/dashboard/components/GenAI/llms/openai.ts

/**
* OpenAI API client.
*
* This module contains functions used to make requests to the OpenAI API via
* the Grafana LLM app plugin. That plugin must be installed, enabled and configured
* in order for these functions to work.
*
* The {@link enabled} function can be used to check if the plugin is enabled and configured.
*/
import { pipe, Observable, UnaryFunction } from 'rxjs';
import { filter, map, scan, takeWhile, tap } from 'rxjs/operators';
import {
isLiveChannelMessageEvent,
LiveChannelAddress,
LiveChannelMessageEvent,
LiveChannelScope,
} from '@grafana/data';
import { getBackendSrv, getGrafanaLiveSrv, logDebug } from '@grafana/runtime';
import { LLM_PLUGIN_ID, LLM_PLUGIN_ROUTE, setLLMPluginVersion } from './constants';
import { LLMAppSettings } from './types';
const OPENAI_CHAT_COMPLETIONS_PATH = 'openai/v1/chat/completions';
/** The role of a message's author. */
export type Role = 'system' | 'user' | 'assistant' | 'function';
/** A message in a conversation. */
export interface Message {
/** The role of the message's author. */
role: Role;
/** The contents of the message. content is required for all messages, and may be null for assistant messages with function calls. */
content: string;
/**
* The name of the author of this message.
*
* This is required if role is 'function', and it should be the name of the function whose response is in the content.
*
* May contain a-z, A-Z, 0-9, and underscores, with a maximum length of 64 characters.
*/
name?: string;
/**
* The name and arguments of a function that should be called, as generated by the model.
*/
function_call?: Object;
}
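/*
 * Illustrative example (not part of the original module): a short conversation,
 * including an assistant function call and the function's reply. The function
 * name and content strings are assumptions, shown only to demonstrate the shape
 * of the `Message` type.
 *
 * const messages: Message[] = [
 *   { role: 'system', content: 'You are a helpful assistant.' },
 *   { role: 'user', content: 'What is the weather in Oslo?' },
 *   { role: 'assistant', content: '', function_call: { name: 'get_weather', arguments: '{"city":"Oslo"}' } },
 *   { role: 'function', name: 'get_weather', content: '{"temperature_c": 12}' },
 * ];
 */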
/** A function the model may generate JSON inputs for. */
export interface Function {
/**
* The name of the function to be called.
*
* Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.
*/
name: string;
/**
* A description of what the function does, used by the model to choose when and how to call the function.
*/
description?: string;
/**
* The parameters the functions accepts, described as a JSON Schema object. See the OpenAI guide for examples, and the JSON Schema reference for documentation about the format.
*
* To describe a function that accepts no parameters, provide the value {"type": "object", "properties": {}}.
*/
parameters: Object;
}
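/*
 * Illustrative example (not part of the original module): a `Function` definition
 * whose `parameters` field is a JSON Schema object, plus the documented shape for
 * a function that accepts no parameters. The names and schema are assumptions.
 *
 * const getWeather: Function = {
 *   name: 'get_weather',
 *   description: 'Get the current weather for a city.',
 *   parameters: {
 *     type: 'object',
 *     properties: { city: { type: 'string', description: 'The city to look up.' } },
 *     required: ['city'],
 *   },
 * };
 * const noParams: Function = { name: 'ping', parameters: { type: 'object', properties: {} } };
 */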
export interface ChatCompletionsRequest {
/**
* ID of the model to use.
*
* See the model endpoint compatibility table for details on which models work with the Chat Completions API.
*/
model: string;
/** A list of messages comprising the conversation so far. */
messages: Message[];
/** A list of functions the model may generate JSON inputs for. */
functions?: Function[];
/**
* Controls how the model responds to function calls.
*
* "none" means the model does not call a function, and responds to the end-user.
* "auto" means the model can pick between an end-user or calling a function.
* Specifying a particular function via {"name": "my_function"} forces the model to call that function.
*
* "none" is the default when no functions are present. "auto" is the default if functions are present.
*/
function_call?: 'none' | 'auto' | { name: string };
/**
* What sampling temperature to use, between 0 and 2.
* Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
*
* We generally recommend altering this or top_p but not both.
*/
temperature?: number;
/**
* An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass.
* So 0.1 means only the tokens comprising the top 10% probability mass are considered.
*
* We generally recommend altering this or temperature but not both.
*/
top_p?: number;
/**
* How many chat completion choices to generate for each input message.
*/
n?: number;
/**
* Up to 4 sequences where the API will stop generating further tokens.
*/
stop?: string | string[];
/**
* The maximum number of tokens to generate in the chat completion.
*
* The total length of input tokens and generated tokens is limited by the model's context length.
*/
max_tokens?: number;
/**
* Number between -2.0 and 2.0.
*
* Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
*/
presence_penalty?: number;
/**
* Number between -2.0 and 2.0.
*
* Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
*/
frequency_penalty?: number;
/**
* Modify the likelihood of specified tokens appearing in the completion.
*
* Accepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100.
* Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,
* but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban
* or exclusive selection of the relevant token.
*/
logit_bias?: { [key: string]: number };
/**
* A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
*/
user?: string;
}
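/*
 * Illustrative example (not part of the original module): a minimal request body
 * using a handful of the fields documented above. The model name, prompt, and
 * sampling settings are assumptions chosen only to show where each field goes.
 *
 * const request: ChatCompletionsRequest = {
 *   model: 'gpt-3.5-turbo',
 *   messages: [
 *     { role: 'system', content: 'You are a helpful assistant.' },
 *     { role: 'user', content: 'Suggest a title for this dashboard.' },
 *   ],
 *   temperature: 0.2,
 *   max_tokens: 50,
 * };
 */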
/** A completion object from an OpenAI model. */
export interface Choice {
/** The message object generated by the model. */
message: Message;
/**
* The reason the model stopped generating text.
*
* This may be one of:
* - stop: API returned complete message, or a message terminated by one of the stop sequences provided via the stop parameter
* - length: incomplete model output due to max_tokens parameter or token limit
* - function_call: the model decided to call a function
* - content_filter: omitted content due to a flag from the API's content filters
* - null: API response still in progress or incomplete
*/
finish_reason: string;
/** The index of the completion in the list of choices. */
index: number;
}
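/*
 * Illustrative example (not part of the original module): branching on the
 * documented `finish_reason` values of a choice. Only the value names come from
 * the docs above; the handling itself is an assumption.
 *
 * function describeFinish(choice: Choice): string {
 *   switch (choice.finish_reason) {
 *     case 'stop':
 *       return 'complete message';
 *     case 'length':
 *       return 'truncated by max_tokens or the context length';
 *     case 'function_call':
 *       return 'the model wants to call a function';
 *     case 'content_filter':
 *       return 'content was filtered';
 *     default:
 *       return 'response still in progress or incomplete';
 *   }
 * }
 */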
/** The usage statistics for a request to the OpenAI API. */
export interface Usage {
/** The number of tokens in the prompt. */
prompt_tokens: number;
/** The number of tokens in the completion. */
completion_tokens: number;
/** The total number of tokens. */
total_tokens: number;
}
/** The error response from the Grafana LLM app when trying to call the chat completions API. */
interface ChatCompletionsErrorResponse {
/** The error message. */
error: string;
}
/** A response from the OpenAI Chat Completions API. */
export interface ChatCompletionsResponse<T = Choice> {
/** The ID of the request. */
id: string;
/** The type of object returned (e.g. 'chat.completion'). */
object: string;
/** The timestamp of the request, as a UNIX timestamp. */
created: number;
/** The name of the model used to generate the response. */
model: string;
/** A list of completion objects (only one, unless `n > 1` in the request). */
choices: T[];
/** The number of tokens used to generate the replies, counting prompt, completion, and total. */
usage: Usage;
}
/** A content message returned from the model. */
export interface ContentMessage {
/** The content of the message. */
content: string;
}
/** A message returned from the model indicating that it is done. */
export interface DoneMessage {
done: boolean;
}
/** A function call message returned from the model. */
export interface FunctionCallMessage {
/** The name of the function to call. */
name: string;
/** JSON string for the arguments to the function call. */
arguments: string;
}
/**
* A delta returned from a stream of chat completion responses.
*
* In practice this will be either a content message or a function call;
* done messages are filtered out by the `streamChatCompletions` function.
*/
export type ChatCompletionsDelta = ContentMessage | FunctionCallMessage | DoneMessage;
/** A chunk included in a chat completion response. */
export interface ChatCompletionsChunk {
/** The delta since the previous chunk. */
delta: ChatCompletionsDelta;
}
/** Return true if the message is a 'content' message. */
export function isContentMessage(message: unknown): message is ContentMessage {
return typeof message === 'object' && message !== null && 'content' in message;
}
/** Return true if the message is a 'done' message. */
export function isDoneMessage(message: unknown): message is DoneMessage {
return typeof message === 'object' && message !== null && 'done' in message;
}
/** Return true if the response is an error response. */
export function isErrorResponse(response: unknown): response is ChatCompletionsErrorResponse {
return typeof response === 'object' && response !== null && 'error' in response;
}
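/*
 * Illustrative example (not part of the original module): narrowing a streamed
 * delta with the type guards above. `handleChunk` is a hypothetical helper that
 * would receive one emission from `streamChatCompletions`.
 *
 * function handleChunk(chunk: ChatCompletionsResponse<ChatCompletionsChunk>): void {
 *   const delta = chunk.choices[0].delta;
 *   if (isDoneMessage(delta)) {
 *     return; // the stream has finished
 *   }
 *   if (isContentMessage(delta)) {
 *     console.log('token:', delta.content);
 *   }
 * }
 */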
/**
* An rxjs operator that extracts the content messages from a stream of chat completion responses.
*
* @returns An observable that emits the content messages. Each emission will be a string containing the
* token emitted by the model.
* @example <caption>Example of reading all tokens in a stream.</caption>
* const stream = streamChatCompletions({ model: 'gpt-3.5-turbo', messages: [
* { role: 'system', content: 'You are a great bot.' },
* { role: 'user', content: 'Hello, bot.' },
* ]}).pipe(extractContent());
* stream.subscribe({ next: console.log, error: console.error });
* // Output:
* // ['Hello', '! ', 'How ', 'are ', 'you', '?']
*/
export function extractContent(): UnaryFunction<
Observable<ChatCompletionsResponse<ChatCompletionsChunk>>,
Observable<string>
> {
return pipe(
filter((response: ChatCompletionsResponse<ChatCompletionsChunk>) => isContentMessage(response.choices[0].delta)),
// The type assertion is needed here because the type predicate above doesn't seem to propagate.
map(
(response: ChatCompletionsResponse<ChatCompletionsChunk>) => (response.choices[0].delta as ContentMessage).content
)
);
}
/**
* An rxjs operator that accumulates the content messages from a stream of chat completion responses.
*
* @returns An observable that emits the accumulated content messages. Each emission will be a string containing the
* content of all messages received so far.
* @example
* const stream = streamChatCompletions({ model: 'gpt-3.5-turbo', messages: [
* { role: 'system', content: 'You are a great bot.' },
* { role: 'user', content: 'Hello, bot.' },
* ]}).pipe(accumulateContent());
* stream.subscribe({ next: console.log, error: console.error });
* // Output:
* // ['Hello', 'Hello! ', 'Hello! How ', 'Hello! How are ', 'Hello! How are you', 'Hello! How are you?']
*/
export function accumulateContent(): UnaryFunction<
Observable<ChatCompletionsResponse<ChatCompletionsChunk>>,
Observable<string>
> {
return pipe(
extractContent(),
scan((acc, curr) => acc + curr, '')
);
}
/**
* Make a request to OpenAI's chat-completions API via the Grafana LLM plugin proxy.
*/
export async function chatCompletions(request: ChatCompletionsRequest): Promise<ChatCompletionsResponse> {
const response = await getBackendSrv().post<ChatCompletionsResponse>(
'/api/plugins/grafana-llm-app/resources/openai/v1/chat/completions',
request,
{
headers: { 'Content-Type': 'application/json' },
}
);
return response;
}
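/*
 * Illustrative usage sketch (not part of the original module): awaiting a single,
 * non-streaming completion and reading the first choice. The model and prompt are
 * assumptions.
 *
 * const response = await chatCompletions({
 *   model: 'gpt-3.5-turbo',
 *   messages: [
 *     { role: 'system', content: 'You are a helpful assistant.' },
 *     { role: 'user', content: 'Say hello.' },
 *   ],
 * });
 * console.log(response.choices[0].message.content);
 */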
/**
* Make a streaming request to OpenAI's chat-completions API via the Grafana LLM plugin proxy.
*
* A stream of tokens will be returned as an `Observable<string>`. Use the `extractContent` operator to
* filter the stream to only content messages, or the `accumulateContent` operator to obtain a stream of
* accumulated content messages.
*
* The 'done' message will not be emitted; the stream will simply end when this message is encountered.
*
* @example <caption>Example of reading all tokens in a stream.</caption>
* const stream = streamChatCompletions({ model: 'gpt-3.5-turbo', messages: [
* { role: 'system', content: 'You are a great bot.' },
* { role: 'user', content: 'Hello, bot.' },
* ]}).pipe(extractContent());
* stream.subscribe({ next: console.log, error: console.error });
* // Output:
* // ['Hello', '! ', 'How ', 'are ', 'you', '?']
*
* @example <caption>Example of accumulating tokens in a stream.</caption>
* const stream = streamChatCompletions({ model: 'gpt-3.5-turbo', messages: [
* { role: 'system', content: 'You are a great bot.' },
* { role: 'user', content: 'Hello, bot.' },
* ]}).pipe(accumulateContent());
* stream.subscribe({ next: console.log, error: console.error });
* // Output:
* // ['Hello', 'Hello! ', 'Hello! How ', 'Hello! How are ', 'Hello! How are you', 'Hello! How are you?']
*/
export function streamChatCompletions(
request: ChatCompletionsRequest
): Observable<ChatCompletionsResponse<ChatCompletionsChunk>> {
const channel: LiveChannelAddress = {
scope: LiveChannelScope.Plugin,
namespace: LLM_PLUGIN_ID,
path: OPENAI_CHAT_COMPLETIONS_PATH + '/' + self.crypto.randomUUID(),
data: request,
};
const messages = getGrafanaLiveSrv()
.getStream(channel)
.pipe(filter((event) => isLiveChannelMessageEvent(event))) as Observable<
LiveChannelMessageEvent<ChatCompletionsResponse<ChatCompletionsChunk>>
>;
return messages.pipe(
tap((event) => {
if (isErrorResponse(event.message)) {
throw new Error(event.message.error);
}
}),
takeWhile((event) => isErrorResponse(event.message) || !isDoneMessage(event.message.choices[0].delta)),
map((event) => event.message)
);
}
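/*
 * Illustrative usage sketch (not part of the original module): subscribing to the
 * stream with explicit error and completion handling. `request` stands in for a
 * `ChatCompletionsRequest` like the one sketched earlier. Because the 'done'
 * message is excluded by `takeWhile`, `complete` fires once the model finishes.
 *
 * const subscription = streamChatCompletions(request)
 *   .pipe(accumulateContent())
 *   .subscribe({
 *     next: (text) => console.log(text),
 *     error: (err) => console.error(err),
 *     complete: () => console.log('stream finished'),
 *   });
 * // Call subscription.unsubscribe() to stop listening early.
 */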
let loggedWarning = false;
/** Check if the OpenAI API is enabled via the LLM plugin. */
export const enabled = async () => {
try {
const settings: LLMAppSettings = await getBackendSrv().get(`${LLM_PLUGIN_ROUTE}/settings`, undefined, undefined, {
showSuccessAlert: false,
showErrorAlert: false,
});
setLLMPluginVersion(settings.info.version);
return settings.enabled ?? false;
} catch (e) {
if (!loggedWarning) {
logDebug(String(e));
logDebug(
'Failed to check if OpenAI is enabled. This is expected if the Grafana LLM plugin is not installed, and the above error can be ignored.'
);
loggedWarning = true;
}
return false;
}
};
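/*
 * Illustrative usage sketch (not part of the original module): gating a request on
 * the plugin being installed, enabled and configured, as the module docs describe.
 * The prompt and fallback behaviour are assumptions.
 *
 * if (await enabled()) {
 *   const reply = await chatCompletions({
 *     model: 'gpt-3.5-turbo',
 *     messages: [{ role: 'user', content: 'Hello, bot.' }],
 *   });
 *   console.log(reply.choices[0].message.content);
 * } else {
 *   console.log('Grafana LLM plugin is not available; skipping OpenAI request.');
 * }
 */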