Add latest changes from gitlab-org/gitlab@master

This commit is contained in:
GitLab Bot 2024-07-11 18:26:32 +00:00
parent 0a251aa742
commit 5d7f5cb05d
122 changed files with 1330 additions and 6902 deletions

View File

@ -5,7 +5,6 @@ import UsersSelect from '~/users_select';
export default class IssuableContext {
constructor(currentUser) {
this.userSelect = new UsersSelect(currentUser);
this.reviewersSelect = new UsersSelect(currentUser, '.js-reviewer-search');
this.reviewersSelect.dropdowns.forEach((glDropdownInstance) => {
@ -34,9 +33,6 @@ export default class IssuableContext {
$('.issuable-sidebar .inline-update').on('change', 'select', function onClickSelect() {
return $(this).submit();
});
$('.issuable-sidebar .inline-update').on('change', '.js-assignee', function onClickAssignee() {
return $(this).submit();
});
$(document)
.off('click', '.issuable-sidebar .dropdown-content a')
.on('click', '.issuable-sidebar .dropdown-content a', (e) => e.preventDefault());

View File

@ -12,7 +12,6 @@ import resolvedStatusMixin from '~/batch_comments/mixins/resolved_status';
import { createAlert } from '~/alert';
import { TYPE_ISSUE } from '~/issues/constants';
import { __, sprintf } from '~/locale';
import eventHub from '~/sidebar/event_hub';
import UserAccessRoleBadge from '~/vue_shared/components/user_access_role_badge.vue';
import glFeatureFlagsMixin from '~/vue_shared/mixins/gl_feature_flags_mixin';
import { splitCamelCase } from '~/lib/utils/text_utility';
@ -151,9 +150,6 @@ export default {
showDeleteAction() {
return this.canDelete && !this.canReportAsAbuse && !this.noteUrl;
},
isAuthoredByCurrentUser() {
return this.authorId === this.currentUserId;
},
currentUserId() {
return this.getUserDataByProp('id');
},
@ -165,9 +161,6 @@ export default {
? __('Unassign from commenting user')
: __('Assign to commenting user');
},
sidebarAction() {
return this.isUserAssigned ? 'sidebar.addAssignee' : 'sidebar.removeAssignee';
},
targetType() {
return this.getNoteableData.targetType;
},
@ -228,8 +221,6 @@ export default {
},
handleAssigneeUpdate(assignees) {
this.$emit('updateAssignees', assignees);
eventHub.$emit(this.sidebarAction, this.author);
eventHub.$emit('sidebar.saveAssignees');
},
assignUser() {
let { assignees } = this;

View File

@ -158,21 +158,26 @@ export default {
};
},
successRatio() {
const { successfulPipelines, failedPipelines } = this.counts;
const { successfulPipelines, totalPipelines } = this.counts;
const successfulCount = successfulPipelines?.count;
const failedCount = failedPipelines?.count;
const ratio = (successfulCount / (successfulCount + failedCount)) * 100;
const totalCount = totalPipelines?.count || 0;
return failedCount === 0 ? 100 : ratio;
return totalCount === 0 ? 100 : (successfulCount / totalCount) * 100;
},
failureRatio() {
const { failedPipelines, totalPipelines } = this.counts;
const failedCount = failedPipelines?.count;
const totalCount = totalPipelines?.count || 0;
return totalCount === 0 ? 0 : (failedCount / totalCount) * 100;
},
formattedCounts() {
const { totalPipelines, successfulPipelines, failedPipelines } = this.counts;
const { totalPipelines } = this.counts;
return {
total: totalPipelines?.count,
success: successfulPipelines?.count,
failed: failedPipelines?.count,
successRatio: this.successRatio,
failureRatio: this.failureRatio,
};
},
areaCharts() {

View File

@ -4,7 +4,7 @@ import { GlSingleStat } from '@gitlab/ui/dist/charts';
import { SUPPORTED_FORMATS, getFormatter } from '~/lib/utils/unit_format';
import { s__, formatNumber } from '~/locale';
const defaultPrecision = 2;
const defaultPrecision = 0;
export default {
components: {
@ -38,21 +38,16 @@ export default {
value: formatNumber(this.counts.total),
},
{
label: s__('PipelineCharts|Success ratio'),
label: s__('PipelineCharts|Failure rate'),
identifier: 'failure-ratio',
value: formatPercent(this.counts.failureRatio, defaultPrecision),
link: this.failedPipelinesLink,
},
{
label: s__('PipelineCharts|Success rate'),
identifier: 'success-ratio',
value: formatPercent(this.counts.successRatio, defaultPrecision),
},
{
label: s__('PipelineCharts|Successful pipelines'),
identifier: 'successful-pipelines',
value: formatNumber(this.counts.success),
},
{
label: s__('PipelineCharts|Failed pipelines'),
identifier: 'failed-pipelines',
value: formatNumber(this.counts.failed),
link: this.failedPipelinesLink,
},
];
},
},
@ -74,9 +69,13 @@ export default {
:should-animate="true"
use-delimiters
/>
<gl-link v-if="shouldDisplayLink(statistic)" class="gl-p-2" :href="statistic.link">{{
s__('Pipeline|See details')
}}</gl-link>
<gl-link
v-if="shouldDisplayLink(statistic)"
class="gl-p-2"
:href="statistic.link"
data-event-tracking="click_view_all_link_in_pipeline_analytics"
>{{ s__('Pipeline|View all') }}</gl-link
>
</div>
</div>
</template>

View File

@ -1,149 +0,0 @@
<script>
import { createAlert } from '~/alert';
import { TYPE_ISSUE } from '~/issues/constants';
import { __ } from '~/locale';
import glFeatureFlagsMixin from '~/vue_shared/mixins/gl_feature_flags_mixin';
import eventHub from '../../event_hub';
import Store from '../../stores/sidebar_store';
import AssigneeTitle from './assignee_title.vue';
import Assignees from './assignees.vue';
import AssigneesRealtime from './assignees_realtime.vue';
export default {
name: 'SidebarAssignees',
components: {
AssigneeTitle,
Assignees,
AssigneesRealtime,
},
mixins: [glFeatureFlagsMixin()],
props: {
mediator: {
type: Object,
required: true,
},
field: {
type: String,
required: true,
},
issuableType: {
type: String,
required: false,
default: TYPE_ISSUE,
},
issuableIid: {
type: String,
required: true,
},
projectPath: {
type: String,
required: true,
},
issuableId: {
type: Number,
required: true,
},
assigneeAvailabilityStatus: {
type: Object,
required: false,
default: () => ({}),
},
},
data() {
return {
store: new Store(),
loading: false,
};
},
computed: {
shouldEnableRealtime() {
// Note: Realtime is only available on issues right now, future support for MR will be built later.
return this.issuableType === TYPE_ISSUE;
},
queryVariables() {
return {
iid: this.issuableIid,
fullPath: this.projectPath,
};
},
relativeUrlRoot() {
return gon.relative_url_root ?? '';
},
},
created() {
this.removeAssignee = this.store.removeAssignee.bind(this.store);
this.addAssignee = this.store.addAssignee.bind(this.store);
this.removeAllAssignees = this.store.removeAllAssignees.bind(this.store);
// Get events from deprecatedJQueryDropdown
eventHub.$on('sidebar.removeAssignee', this.removeAssignee);
eventHub.$on('sidebar.addAssignee', this.addAssignee);
eventHub.$on('sidebar.removeAllAssignees', this.removeAllAssignees);
eventHub.$on('sidebar.saveAssignees', this.saveAssignees);
},
beforeDestroy() {
eventHub.$off('sidebar.removeAssignee', this.removeAssignee);
eventHub.$off('sidebar.addAssignee', this.addAssignee);
eventHub.$off('sidebar.removeAllAssignees', this.removeAllAssignees);
eventHub.$off('sidebar.saveAssignees', this.saveAssignees);
},
methods: {
assignSelf() {
// Notify gl dropdown that we are now assigning to current user
this.$el.parentElement.dispatchEvent(new Event('assignYourself'));
this.mediator.assignYourself();
this.saveAssignees();
},
saveAssignees() {
this.loading = true;
this.mediator
.saveAssignees(this.field)
.then(() => {
this.loading = false;
this.store.resetChanging();
})
.catch(() => {
this.loading = false;
return createAlert({
message: __('Error occurred when saving assignees'),
});
});
},
exposeAvailabilityStatus(users) {
return users.map(({ username, ...rest }) => ({
...rest,
username,
availability: this.assigneeAvailabilityStatus[username] || '',
}));
},
},
};
</script>
<template>
<div>
<assignees-realtime
v-if="shouldEnableRealtime"
:issuable-type="issuableType"
:issuable-id="issuableId"
:query-variables="queryVariables"
:mediator="mediator"
/>
<assignee-title
:number-of-assignees="store.assignees.length"
:loading="loading || store.isFetching.assignees"
:editable="store.editable"
:changing="store.changing"
/>
<assignees
v-if="!store.isFetching.assignees"
:root-path="relativeUrlRoot"
:users="exposeAvailabilityStatus(store.assignees)"
:editable="store.editable"
:issuable-type="issuableType"
@assign-self="assignSelf"
/>
</div>
</template>

View File

@ -8,7 +8,6 @@ import {
isInDesignPage,
isInIncidentPage,
isInIssuePage,
isInMRPage,
parseBoolean,
} from '~/lib/utils/common_utils';
import { __ } from '~/locale';
@ -16,7 +15,6 @@ import { apolloProvider } from '~/graphql_shared/issuable_client';
import Translate from '~/vue_shared/translate';
import UserSelect from '~/vue_shared/components/user_select/user_select.vue';
import CollapsedAssigneeList from './components/assignees/collapsed_assignee_list.vue';
import SidebarAssignees from './components/assignees/sidebar_assignees.vue';
import SidebarAssigneesWidget from './components/assignees/sidebar_assignees_widget.vue';
import SidebarConfidentialityWidget from './components/confidential/sidebar_confidentiality_widget.vue';
import CopyEmailToClipboard from './components/copy/copy_email_to_clipboard.vue';
@ -84,51 +82,6 @@ function mountSidebarTodoWidget() {
});
}
function getSidebarAssigneeAvailabilityData() {
const sidebarAssigneeEl = document.querySelectorAll('.js-sidebar-assignee-data input');
return Array.from(sidebarAssigneeEl)
.map((el) => el.dataset)
.reduce(
(acc, { username, availability = '' }) => ({
...acc,
[username]: availability,
}),
{},
);
}
function mountSidebarAssigneesDeprecated(mediator) {
const el = document.querySelector('.js-sidebar-assignees-root');
if (!el) {
return null;
}
const { id, iid, fullPath } = getSidebarOptions();
const assigneeAvailabilityStatus = getSidebarAssigneeAvailabilityData();
return new Vue({
el,
name: 'SidebarAssigneesRoot',
apolloProvider,
render: (createElement) =>
createElement(SidebarAssignees, {
props: {
mediator,
issuableIid: String(iid),
projectPath: fullPath,
field: el.dataset.field,
issuableType:
isInIssuePage() || isInIncidentPage() || isInDesignPage()
? TYPE_ISSUE
: TYPE_MERGE_REQUEST,
issuableId: id,
assigneeAvailabilityStatus,
},
}),
});
}
function mountSidebarAssigneesWidget() {
const el = document.querySelector('.js-sidebar-assignees-root');
@ -172,12 +125,6 @@ function mountSidebarAssigneesWidget() {
},
}),
});
const assigneeDropdown = document.querySelector('.js-sidebar-assignee-dropdown');
if (assigneeDropdown) {
trackShowInviteMemberLink(assigneeDropdown);
}
}
function mountSidebarReviewers(mediator) {
@ -801,15 +748,9 @@ export function mountAssigneesDropdown() {
});
}
const isAssigneesWidgetShown = isInIssuePage() || isInDesignPage() || isInMRPage();
export function mountSidebar(mediator, store) {
mountSidebarTodoWidget();
if (isAssigneesWidgetShown) {
mountSidebarAssigneesWidget();
} else {
mountSidebarAssigneesDeprecated(mediator);
}
mountSidebarAssigneesWidget();
mountSidebarReviewers(mediator);
mountSidebarCrmContacts();
mountSidebarLabelsWidget();

View File

@ -57,7 +57,7 @@ class Projects::MergeRequests::DraftsController < Projects::MergeRequests::Appli
end
def publish
result = DraftNotes::PublishService.new(merge_request, current_user).execute(draft_note(allow_nil: true))
result = DraftNotes::PublishService.new(merge_request, current_user).execute(draft: draft_note(allow_nil: true))
if create_note_params[:note]
::Notes::CreateService.new(@project, current_user, create_note_params).execute

View File

@ -206,13 +206,6 @@ module IssuablesHelper
finder.class.scalar_params.any? { |p| params[p].present? }
end
def assignee_sidebar_data(assignee, merge_request: nil)
{ avatar_url: assignee.avatar_url, name: assignee.name, username: assignee.username }.tap do |data|
data[:can_merge] = merge_request.can_be_merged_by?(assignee) if merge_request
data[:availability] = assignee.status.availability if assignee.association(:status).loaded? && assignee.status&.availability
end
end
def issuable_squash_option?(issuable, project)
if issuable.persisted?
issuable.squash

View File

@ -20,7 +20,8 @@ module HasUserType
service_account: 13,
llm_bot: 14,
placeholder: 15,
duo_code_review_bot: 16
duo_code_review_bot: 16,
import_user: 17
}.with_indifferent_access.freeze
BOT_USER_TYPES = %w[

View File

@ -0,0 +1,12 @@
# frozen_string_literal: true
module Import
class NamespaceImportUser < ApplicationRecord
self.table_name = 'namespace_import_users'
belongs_to :import_user, class_name: 'User', foreign_key: :user_id, inverse_of: :namespace_import_user
belongs_to :namespace
validates :namespace_id, :user_id, presence: true
end
end
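
A brief, hedged sketch (not part of the diff) of how the new record is meant to be reached through the associations added elsewhere in this commit: `Namespace#import_user` (`has_one ... through: :namespace_import_user`) and the `Project#import_user` delegate. The lookup id and variable names are placeholders.

```ruby
# Illustrative only; assumes `namespace` is a root namespace with an import user.
# `some_namespace_id` is a placeholder.
namespace = Namespace.find(some_namespace_id)

# Import::NamespaceImportUser links exactly one import user to one namespace;
# the unique indexes added by the migrations below enforce the 1:1 mapping.
import_user = namespace.import_user   # has_one :import_user, through: :namespace_import_user

# Project delegates :import_user to :root_ancestor, so projects under the root
# namespace resolve to the same namespace-level import user.
project = namespace.projects.first
project&.import_user == import_user   # => true when a project exists
```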

View File

@ -111,6 +111,8 @@ class Namespace < ApplicationRecord
has_many :jira_connect_subscriptions, class_name: 'JiraConnectSubscription', foreign_key: :namespace_id, inverse_of: :namespace
has_many :import_source_users, class_name: 'Import::SourceUser', foreign_key: :namespace_id, inverse_of: :namespace
has_one :namespace_import_user, class_name: 'Import::NamespaceImportUser', foreign_key: :namespace_id, inverse_of: :namespace
has_one :import_user, class_name: 'User', through: :namespace_import_user, foreign_key: :user_id
validates :owner, presence: true, if: ->(n) { n.owner_required? }
validates :name,

View File

@ -532,6 +532,7 @@ class Project < ApplicationRecord
delegate :name, to: :owner, allow_nil: true, prefix: true
delegate :jira_dvcs_server_last_sync_at, to: :feature_usage
delegate :last_pipeline, to: :commit, allow_nil: true
delegate :import_user, to: :root_ancestor
with_options to: :team do
delegate :members, prefix: true

View File

@ -268,6 +268,7 @@ class User < MainClusterwide::ApplicationRecord
has_many :created_custom_emoji, class_name: 'CustomEmoji', inverse_of: :creator
has_many :bulk_imports
has_one :namespace_import_user, class_name: 'Import::NamespaceImportUser', inverse_of: :import_user
has_many :custom_attributes, class_name: 'UserCustomAttribute'
has_one :trusted_with_spam_attribute, -> { UserCustomAttribute.trusted_with_spam }, class_name: 'UserCustomAttribute'
@ -2263,7 +2264,7 @@ class User < MainClusterwide::ApplicationRecord
end
def terms_accepted?
return true if project_bot? || service_account? || security_policy_bot?
return true if project_bot? || service_account? || security_policy_bot? || import_user?
if Feature.enabled?(:enforce_acceptance_of_changed_terms)
!!ApplicationSetting::Term.latest&.accepted_by_user?(self)

View File

@ -51,6 +51,10 @@ class BasePolicy < DeclarativePolicy::Base
with_options scope: :user, score: 0
condition(:placeholder_user) { @user.try(:placeholder?) || false }
desc "Import user"
with_options scope: :user, score: 0
condition(:import_user) { @user.try(:import_user?) || false }
desc "User email is unconfirmed or user account is locked"
with_options scope: :user, score: 0
condition(:inactive) { @user&.confirmation_required_on_sign_in? || @user&.access_locked? }
@ -91,6 +95,7 @@ class BasePolicy < DeclarativePolicy::Base
condition(:is_gitlab_com, score: 0, scope: :global) { ::Gitlab.com? }
rule { placeholder_user }.prevent_all
rule { import_user }.prevent_all
private
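
As a hedged illustration of the rule's effect (not from the diff): an import user should now be denied every ability, mirroring the existing `placeholder_user` rule. `Ability.allowed?` is GitLab's standard policy entry point; the user and project lookups are placeholders.

```ruby
# Illustrative only; import_user_id and project_id are placeholders.
import_user = User.find(import_user_id)   # a user with user_type: :import_user
project     = Project.find(project_id)

import_user.import_user?                              # => true (predicate used above and in User#terms_accepted?)
Ability.allowed?(import_user, :read_project, project) # => false (rule { import_user }.prevent_all)
```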

View File

@ -2,13 +2,15 @@
module DraftNotes
class PublishService < DraftNotes::BaseService
def execute(draft = nil)
return error('Not allowed to create notes') unless can?(current_user, :create_note, merge_request)
def execute(draft: nil, executing_user: nil)
executing_user ||= current_user
return error('Not allowed to create notes') unless can?(executing_user, :create_note, merge_request)
if draft
publish_draft_note(draft)
publish_draft_note(draft, executing_user)
else
publish_draft_notes
publish_draft_notes(executing_user)
merge_request_activity_counter.track_publish_review_action(user: current_user)
end
@ -20,14 +22,14 @@ module DraftNotes
private
def publish_draft_note(draft)
create_note_from_draft(draft)
def publish_draft_note(draft, executing_user)
create_note_from_draft(draft, executing_user)
draft.delete
MergeRequests::ResolvedDiscussionNotificationService.new(project: project, current_user: current_user).execute(merge_request)
end
def publish_draft_notes
def publish_draft_notes(executing_user)
return if draft_notes.blank?
review = Review.create!(author: current_user, merge_request: merge_request, project: project)
@ -36,6 +38,7 @@ module DraftNotes
draft_note.review = review
create_note_from_draft(
draft_note,
executing_user,
skip_capture_diff_note_position: true,
skip_keep_around_commits: true,
skip_merge_status_trigger: true
@ -51,7 +54,7 @@ module DraftNotes
after_publish(review)
end
def create_note_from_draft(draft, skip_capture_diff_note_position: false, skip_keep_around_commits: false, skip_merge_status_trigger: false)
def create_note_from_draft(draft, executing_user, skip_capture_diff_note_position: false, skip_keep_around_commits: false, skip_merge_status_trigger: false)
# Make sure the diff file is unfolded in order to find the correct line
# codes.
draft.diff_file&.unfold_diff_lines(draft.original_position)
@ -59,7 +62,8 @@ module DraftNotes
note_params = draft.publish_params.merge(skip_keep_around_commits: skip_keep_around_commits)
note = Notes::CreateService.new(project, current_user, note_params).execute(
skip_capture_diff_note_position: skip_capture_diff_note_position,
skip_merge_status_trigger: skip_merge_status_trigger
skip_merge_status_trigger: skip_merge_status_trigger,
executing_user: executing_user
)
set_discussion_resolve_status(note, draft)
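
A hedged usage sketch of the new keyword signature, consistent with the controller change earlier in this commit; `merge_request`, `current_user`, `draft`, and `other_user` are placeholders. `executing_user` is optional and falls back to `current_user`.

```ruby
service = DraftNotes::PublishService.new(merge_request, current_user)

# Publish a single draft; the draft is now passed as a keyword argument:
service.execute(draft: draft)

# Publish all drafts, running the :create_note permission check against a
# different user; executing_user is threaded through Notes::CreateService
# and Notes::BuildService below.
service.execute(executing_user: other_user)
```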

View File

@ -2,9 +2,10 @@
module Notes
class BuildService < ::BaseService
def execute
def execute(executing_user: nil)
in_reply_to_discussion_id = params.delete(:in_reply_to_discussion_id)
external_author = params.delete(:external_author)
executing_user ||= current_user
discussion = nil
@ -16,7 +17,7 @@ module Notes
if in_reply_to_discussion_id.present?
discussion = find_discussion(in_reply_to_discussion_id)
return discussion_not_found unless discussion && can?(current_user, :create_note, discussion.noteable)
return discussion_not_found unless discussion && can?(executing_user, :create_note, discussion.noteable)
discussion = discussion.convert_to_discussion! if discussion.can_convert_to_discussion?

View File

@ -4,8 +4,11 @@ module Notes
class CreateService < ::Notes::BaseService
include IncidentManagement::UsageData
def execute(skip_capture_diff_note_position: false, skip_merge_status_trigger: false)
note = Notes::BuildService.new(project, current_user, params.except(:merge_request_diff_head_sha)).execute
def execute(skip_capture_diff_note_position: false, skip_merge_status_trigger: false, executing_user: nil)
note =
Notes::BuildService
.new(project, current_user, params.except(:merge_request_diff_head_sha))
.execute(executing_user: executing_user)
# n+1: https://gitlab.com/gitlab-org/gitlab-foss/issues/37440
note_valid = Gitlab::GitalyClient.allow_n_plus_1_calls do

View File

@ -7,42 +7,3 @@
.title.hide-collapsed.gl-display-flex.gl-justify-content-space-between.gl-align-items-center{ class: 'gl-mb-0!' }
%span.gl-font-bold= s_('Label|Assignee')
= gl_loading_icon(inline: true)
.js-sidebar-assignee-data.selectbox.hide-collapsed
- if assignees.none?
= hidden_field_tag "#{issuable_type}[assignee_ids][]", 0, id: nil
- else
- assignees.each do |assignee|
= hidden_field_tag "#{issuable_type}[assignee_ids][]", assignee.id, id: nil, data: assignee_sidebar_data(assignee, merge_request: @merge_request)
- options = { toggle_class: 'js-user-search js-author-search',
title: _('Select assignees'),
filter: true,
dropdown_class: 'dropdown-menu-user dropdown-menu-selectable dropdown-menu-author',
placeholder: _('Search users'),
data: { first_user: issuable_sidebar.dig(:current_user, :username),
current_user: true,
iid: issuable_sidebar[:iid],
issuable_type: issuable_type,
project_id: issuable_sidebar[:project_id],
author_id: issuable_sidebar[:author_id],
field_name: "#{issuable_type}[assignee_ids][]",
issue_update: issuable_sidebar[:issuable_json_path],
ability_name: issuable_type,
null_user: true,
display: 'static' } }
- title = dropdown_options[:title]
- options[:toggle_class] += ' js-multiselect js-save-user-data'
- data = { field_name: "#{issuable_type}[assignee_ids][]" }
- data[:multi_select] = true
- data['dropdown-title'] = title
- data['dropdown-header'] = dropdown_options[:data][:'dropdown-header']
- data['max-select'] = dropdown_max_select(dropdown_options[:data])
- options[:data].merge!(data)
= render 'shared/issuable/sidebar_user_dropdown',
options: options,
wrapper_class: 'js-sidebar-assignee-dropdown',
track_label: 'edit_assignee',
trigger_source: "#{issuable_type}_assignee_dropdown"

View File

@ -0,0 +1,18 @@
---
description: View all link on Pipeline Analytics failure rate stat
internal_events: true
action: click_view_all_link_in_pipeline_analytics
identifiers:
- project
- namespace
- user
product_group: runner
milestone: '17.2'
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/158957
distributions:
- ce
- ee
tiers:
- free
- premium
- ultimate

View File

@ -6,4 +6,4 @@ rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/470926
milestone: '17.2'
group: group::optimize
type: beta
default_enabled: false
default_enabled: true

View File

@ -1,8 +1,8 @@
---
name: wiki_front_matter
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/27706
rollout_issue_url:
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/435056
milestone: '12.10'
type: development
group: group::knowledge
default_enabled: false
default_enabled: true

View File

@ -5,4 +5,4 @@ rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/428259
milestone: '16.6'
type: development
group: group::knowledge
default_enabled: false
default_enabled: true

View File

@ -0,0 +1,22 @@
---
key_path: redis_hll_counters.count_distinct_user_id_from_click_view_all_link_in_pipeline_analytics_monthly
description: Monthly count of unique users who clicked on the View All link on the Pipeline Analytics page
product_group: runner
performance_indicator_type: []
value_type: number
status: active
milestone: '17.2'
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/158957
time_frame: 28d
data_source: internal_events
data_category: optional
distribution:
- ce
- ee
tier:
- free
- premium
- ultimate
events:
- name: click_view_all_link_in_pipeline_analytics
unique: user.id

View File

@ -0,0 +1,22 @@
---
key_path: redis_hll_counters.count_distinct_user_id_from_click_view_all_link_in_pipeline_analytics_weekly
description: Weekly count of unique users who clicked on the View All link on the Pipeline Analytics page
product_group: runner
performance_indicator_type: []
value_type: number
status: active
milestone: '17.2'
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/158957
time_frame: 7d
data_source: internal_events
data_category: optional
distribution:
- ce
- ee
tier:
- free
- premium
- ultimate
events:
- name: click_view_all_link_in_pipeline_analytics
unique: user.id

View File

@ -0,0 +1,12 @@
---
table_name: namespace_import_users
classes:
- Import::NamespaceImportUser
feature_categories:
- importers
description: Represents a list of import users for namespaces
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/157979
milestone: '17.2'
gitlab_schema: gitlab_main_cell
sharding_key:
namespace_id: namespaces

View File

@ -0,0 +1,19 @@
# frozen_string_literal: true
class AddNamespaceImportUsersTable < Gitlab::Database::Migration[2.2]
milestone '17.2'
def up
create_table :namespace_import_users do |t| # rubocop:disable Migration/EnsureFactoryForTable -- False Positive
t.bigint :user_id, null: false
t.bigint :namespace_id, null: false
t.index :namespace_id, unique: true, name: :index_namespace_import_users_on_namespace_id
t.index :user_id, unique: true, name: :index_namespace_import_users_on_user_id
end
end
def down
drop_table :namespace_import_users
end
end

View File

@ -0,0 +1,17 @@
# frozen_string_literal: true
class AddForeignKeyOnUserToNamespaceImportUsers < Gitlab::Database::Migration[2.2]
milestone '17.2'
disable_ddl_transaction!
def up
add_concurrent_foreign_key :namespace_import_users, :users, column: :user_id, on_delete: :cascade
end
def down
with_lock_retries do
remove_foreign_key :namespace_import_users, column: :user_id
end
end
end

View File

@ -0,0 +1,17 @@
# frozen_string_literal: true
class AddForeignKeyOnNamespaceToNamespaceImportUsers < Gitlab::Database::Migration[2.2]
milestone '17.2'
disable_ddl_transaction!
def up
add_concurrent_foreign_key :namespace_import_users, :namespaces, column: :namespace_id, on_delete: :cascade
end
def down
with_lock_retries do
remove_foreign_key :namespace_import_users, column: :namespace_id
end
end
end

View File

@ -0,0 +1,9 @@
# frozen_string_literal: true
class AddSeatControlToNamespaceSettings < Gitlab::Database::Migration[2.2]
milestone '17.2'
def change
add_column :namespace_settings, :seat_control, :smallint, null: false, default: 0
end
end

View File

@ -0,0 +1 @@
584a0b88e726e9bcb8a270b3f670de7aaa01df882fe5a80b739e7b30ca6bb28f

View File

@ -0,0 +1 @@
43829404f3e2b2b9fc5780da09ef3f73fa91fd5d690c4a1387181b9cb765f9c4

View File

@ -0,0 +1 @@
181a63a57456d990e65c2b7e712d06da4a0d3a336a3d1b9cfcec47b355f3889e

View File

@ -0,0 +1 @@
635b0c8cd5d82026f915d924c0923c21799f48e0e0f452bbfd6c92c5b3d1f168

View File

@ -13227,6 +13227,21 @@ CREATE TABLE namespace_details (
pending_delete boolean DEFAULT false NOT NULL
);
CREATE TABLE namespace_import_users (
id bigint NOT NULL,
user_id bigint NOT NULL,
namespace_id bigint NOT NULL
);
CREATE SEQUENCE namespace_import_users_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE namespace_import_users_id_seq OWNED BY namespace_import_users.id;
CREATE TABLE namespace_ldap_settings (
namespace_id bigint NOT NULL,
created_at timestamp with time zone NOT NULL,
@ -13343,6 +13358,7 @@ CREATE TABLE namespace_settings (
remove_dormant_members boolean DEFAULT false NOT NULL,
remove_dormant_members_period integer DEFAULT 90 NOT NULL,
early_access_program_joined_by_id bigint,
seat_control smallint DEFAULT 0 NOT NULL,
CONSTRAINT check_0ba93c78c7 CHECK ((char_length(default_branch_name) <= 255)),
CONSTRAINT namespace_settings_unique_project_download_limit_alertlist_size CHECK ((cardinality(unique_project_download_limit_alertlist) <= 100)),
CONSTRAINT namespace_settings_unique_project_download_limit_allowlist_size CHECK ((cardinality(unique_project_download_limit_allowlist) <= 100))
@ -21094,6 +21110,8 @@ ALTER TABLE ONLY namespace_bans ALTER COLUMN id SET DEFAULT nextval('namespace_b
ALTER TABLE ONLY namespace_commit_emails ALTER COLUMN id SET DEFAULT nextval('namespace_commit_emails_id_seq'::regclass);
ALTER TABLE ONLY namespace_import_users ALTER COLUMN id SET DEFAULT nextval('namespace_import_users_id_seq'::regclass);
ALTER TABLE ONLY namespace_statistics ALTER COLUMN id SET DEFAULT nextval('namespace_statistics_id_seq'::regclass);
ALTER TABLE ONLY namespaces ALTER COLUMN id SET DEFAULT nextval('namespaces_id_seq'::regclass);
@ -23414,6 +23432,9 @@ ALTER TABLE ONLY namespace_commit_emails
ALTER TABLE ONLY namespace_details
ADD CONSTRAINT namespace_details_pkey PRIMARY KEY (namespace_id);
ALTER TABLE ONLY namespace_import_users
ADD CONSTRAINT namespace_import_users_pkey PRIMARY KEY (id);
ALTER TABLE ONLY namespace_ldap_settings
ADD CONSTRAINT namespace_ldap_settings_pkey PRIMARY KEY (namespace_id);
@ -28036,6 +28057,10 @@ CREATE UNIQUE INDEX index_namespace_commit_emails_on_user_id_and_namespace_id ON
CREATE INDEX index_namespace_details_on_creator_id ON namespace_details USING btree (creator_id);
CREATE UNIQUE INDEX index_namespace_import_users_on_namespace_id ON namespace_import_users USING btree (namespace_id);
CREATE UNIQUE INDEX index_namespace_import_users_on_user_id ON namespace_import_users USING btree (user_id);
CREATE UNIQUE INDEX index_namespace_root_storage_statistics_on_namespace_id ON namespace_root_storage_statistics USING btree (namespace_id);
CREATE UNIQUE INDEX index_namespace_statistics_on_namespace_id ON namespace_statistics USING btree (namespace_id);
@ -32724,6 +32749,9 @@ ALTER TABLE ONLY merge_request_assignment_events
ALTER TABLE ONLY bulk_import_entities
ADD CONSTRAINT fk_a44ff95be5 FOREIGN KEY (parent_id) REFERENCES bulk_import_entities(id) ON DELETE CASCADE;
ALTER TABLE ONLY namespace_import_users
ADD CONSTRAINT fk_a49233ca5d FOREIGN KEY (namespace_id) REFERENCES namespaces(id) ON DELETE CASCADE;
ALTER TABLE ONLY abuse_report_user_mentions
ADD CONSTRAINT fk_a4bd02b7df FOREIGN KEY (note_id) REFERENCES notes(id) ON DELETE CASCADE;
@ -32853,6 +32881,9 @@ ALTER TABLE ONLY issue_assignees
ALTER TABLE ONLY agent_project_authorizations
ADD CONSTRAINT fk_b7fe9b4777 FOREIGN KEY (agent_id) REFERENCES cluster_agents(id) ON DELETE CASCADE;
ALTER TABLE ONLY namespace_import_users
ADD CONSTRAINT fk_b82be3e1f3 FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE;
ALTER TABLE ONLY namespace_commit_emails
ADD CONSTRAINT fk_b8d89d555e FOREIGN KEY (email_id) REFERENCES emails(id) ON DELETE CASCADE;

View File

@ -36424,6 +36424,7 @@ Stage event identifiers.
| <a id="valuestreamstageeventmerge_request_last_build_started"></a>`MERGE_REQUEST_LAST_BUILD_STARTED` | Merge request last build started event. |
| <a id="valuestreamstageeventmerge_request_last_edited"></a>`MERGE_REQUEST_LAST_EDITED` | Merge request last edited event. |
| <a id="valuestreamstageeventmerge_request_merged"></a>`MERGE_REQUEST_MERGED` | Merge request merged event. |
| <a id="valuestreamstageeventmerge_request_reviewer_first_assigned"></a>`MERGE_REQUEST_REVIEWER_FIRST_ASSIGNED` | Merge request reviewer first assigned event. |
| <a id="valuestreamstageeventplan_stage_start"></a>`PLAN_STAGE_START` | Plan stage start event. |
### `VerificationStateEnum`

View File

@ -1,45 +1,11 @@
---
owning-stage: "~devops::ai-powered"
description: 'AI Context Management ADR 001: Keeping AI Context Policy Management close to AI Context Retriever'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ai_context_management/decisions/001_policy_on_the_client/'
remove_date: '2025-07-08'
---
# AI Context Management ADR 001: Keeping AI Context Policy Management close to AI Context Retriever
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ai_context_management/decisions/001_policy_on_the_client/).
## Summary
To manage AI Context effectively and ensure flexible and scalable solutions, AI Context Policy Management will reside in
the same environment as the AI Context Retriever and, as a result, as close to the context-fetching mechanism as possible.
This approach aims to reduce latency and improve user control over the contextual information sent to AI systems.
## Context
The original blueprint outlined the necessity of a flexible AI Context Management system to provide accurate and relevant
AI responses while addressing security and trust concerns. It suggested that AI Context Policy Management should act as
a filtering solution between the context resolver and the context fetcher in the AI Context Retriever. However, the
blueprint did not specify the exact location for the AI Context Policy Management within the system.
During [a sync discussion](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/155707#note_1978675445), it was determined
that placing the AI Context Policy Management close to the AI Context Retriever would provide significant benefits. This decision
aligns with our approach of having shared components, like the AI Gateway and the Duo Chat UI, to ensure consistency and reduce
redundancy across different environments.
## Decision
AI Context Management will happen as close to the user's interaction with Duo features as possible. As a result, the [AI Gateway](https://gitlab.com/gitlab-org/modelops/applied-ml/code-suggestions/ai-assist) will only receive context that is policy-compliant.
Users interact with Duo features in many different environments, including their IDE and the GitLab Web UI. Rather than retrieving the context from this environment and sending it to the AI Gateway for filtering based on the AI Context Policy, this decision states that the AI Context Retriever will filter this content *before* it reaches the AI Gateway.
This decision allows for better security, flexibility and scalability, enabling dynamic user interactions and immediate feedback on context validation.
## Consequences
- *Implementation Complexity*: Users must create, modify, and remove context policies in each environment where they are
interacting with Duo features. This requires multiple implementations to support different environments.
- *Flexibility and Scalability*: Storing AI Context Policy Management close to the AI Context Retriever allows for more flexible
and scalable policy implementations tailored to specific environments, such as IDEs and the Web.
- *Reduced Latency*: Filtering out unwanted context at the earliest possible stage reduces latency and ensures that only
the necessary information is sent to the AI models.
- *User Experience*: This approach facilitates dynamic UX, providing instant feedback to users in case of failed context
validation. Users can manage their supplementary context more effectively through a user-friendly interface.
- *Security*: By managing policies closer to the content retrieving mechanism, sensitive information can be filtered out
locally, enhancing security and user trust.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown. (Deleted image; size before deletion: 249 KiB)

View File

@ -1,290 +1,11 @@
---
status: proposed
creation-date: "2023-06-03"
authors: [ "@dmishunov" ]
coach: "@jessieay"
approvers: ["@pwietchner", "@dmishunov" ]
owning-stage: "~devops::ai-powered"
participating-stages: ["~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ai_context_management/'
remove_date: '2025-07-08'
---
# AI Context Management
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ai_context_management/).
## Glossary
- **AI Context**. In the scope of this technical blueprint, the term "AI Context" refers to supplementary information
provided to the AI system alongside the primary prompts.
- **AI Context Policy**. The "AI Context Policy" is a user-defined and user-managed mechanism allowing precise
control over the content that can be sent to the AI as contextual information. In the context of this blueprint, the
_AI Context Policy_ is suggested as a YAML configuration file.
- **AI Context Policy Management**. Within this blueprint, "Management" encompasses the user-driven processes of
creating, modifying, and removing AI Context Policies according to specific requirements and preferences.
- **Automatic AI Context**. _AI Context_ retrieved automatically based on the active document. _Automatic AI Context_
can be the active document's dependencies (modules, methods, etc., imported into the active document), some
search-based, or other mechanisms over which the user has limited control.
- **Supplementary User Context**: User-defined _AI Context_, such as open tabs in IDEs, local files, and folders, that the user
provides from their local environment to extend the default _AI Context_.
- **AI Context Retriever**: A backend system capable of:
- communicating with _AI Context Policy Management_
- fetching content defined in _Automatic AI Context_ and _Supplementary User Context_ (complete files, definitions,
methods, etc.), based on the _AI Context Policy Management_
- correctly augmenting the user prompt with AI Context before sending it to the LLM. Presumably, this part is already
handled by [AI Gateway](../ai_gateway/index.md).
- **Project Administrator**. In the context of this blueprint, "Project Administrator" means any individual with the
"Edit project settings" permission ("Maintainer" or "Owner" roles, as defined in [Project members permissions](../../../user/permissions.md#project-members-permissions)).
![Illustration of the AI Context architecture](img/architecture.jpg)
## Summary
Correct context can dramatically improve the quality of AI responses. This blueprint aims to accommodate AI Context
seamlessly into our offering by architecting a solution that is ready for this additional context coming from different
AI features.
However, we recognize the importance of security and trust, which automatic solutions do not necessarily provide. To
address any concerns users might have about the content fed into the AI Context, this blueprint suggests providing them
with control and customization options. This way, users can adjust the content according to their preferences and have a
clear understanding of what information is being utilized.
This blueprint proposes a system for managing _AI Context_ at the _Project Administrator_ and individual
user levels. Its goal is to allow _Project Administrator_ to set high-level rules for what content can be included as context for AI
prompts while enabling users to specify _Supplementary User Context_ for their prompts. The global _AI Context Policy_ will use a YAML
configuration file format stored in the same Git repository. The suggested format of the YAML configuration files
is discussed below.
## Motivation
Ensuring the AI has the correct context is crucial for generating accurate and relevant code suggestions or responses.
As the adoption of AI-assisted development grows, it's essential to give organizations and users control over what project
content is sent as context to AI models. Some files or directories may contain sensitive information that should not
be shared. At the same time, users may want to provide additional context for their prompts to get more
relevant suggestions. We need a flexible _AI Context_ management system to handle these cases.
### Goals
### For _Project Administrators_
- Allow _Project Administrators_ to set the default _AI Context Policy_ to control whether content can or cannot be
automatically included in the _AI Context_ when making requests to LLMs
- Allow _Project Administrators_ to specify exceptions to the default _AI Context Policy_
- Provide a UI to manage the default _AI Context Policy_ and its exceptions list easily
### For users
- Allow users to set _Supplementary User Context_ to include as AI context for their prompts
- Provide a UI to manage _Supplementary User Context_ easily
### Non-Goals
- _AI Context Retriever_ architecture - different environments (Web, IDEs) will probably implement their retrievers.
However, the unified public interface of the retrievers should be considered.
- Extremely granular controls like allowing/excluding individual lines of code
- Storing entire file contents from user projects, only paths will be persisted
## Proposal
The proposed architecture consists of 3 main parts:
- _AI Context Retriever_
- _AI Context Policy Management_
- _Supplementary User Context_
There are several different ongoing efforts related to various implementations of _AI Context Retriever_ both
[for Web](https://gitlab.com/groups/gitlab-org/-/epics/14040), and [for IDEs](https://gitlab.com/groups/gitlab-org/editor-extensions/-/epics/55).
Because of that, the architecture for _AI Context Retriever_ is beyond the scope of this blueprint. However, in the
context of this blueprint, it is assumed that:
- _AI Context Retriever_ is capable of automatically retrieving and fetching _Automatic AI Context_ and passing it
on as _AI Context_ to LLM.
- _AI Context Retriever_ can automatically retrieve and fetch _Supplementary User Context_ and pass
it on as _AI Context_ to LLM.
- _AI Context Retriever_ implementation can ensure that any content passed as _AI Context_ to a model
adheres to the global _AI Context Policy_.
- _AI Context Retriever_ can trim the _AI Context_ to meet the contextual window requirement for a
specific LLM used for that or another Duo feature.
### _AI Context Policy Management_ proposal
To implement the _AI Context Policy Management_ system, it is proposed to:
- Introduce the YAML file format for configuring global policies
- In the YAML configuration file, support two `ai_context_policy` types:
- `block`: blocks all content except for the specified `exclude` paths. Excluded files are allowed. (**Default**)
- `allow`: allows all content except for the specified `exclude` paths. Excluded files are blocked.
- `version`: specifies the schema version of the AI context file. Starting with `version: 1`. If omitted, it is treated as the latest version known to the client.
- In the YAML configuration file, support glob patterns to exclude certain paths from the global policy
- Support nested _AI Context Policies_ to provide a more granular control of _AI Context_ in sub-folders. For
example, a policy in `/src/tests` would override a policy in `/src`, which, in its turn, would override a
global _AI Context Policy_ in `/`.
### _Supplementary User Context_ proposal
To implement the _Supplementary User Context_ system, it is proposed to:
- Introduce user-level UI to specify _Supplementary User Context_ for prompts. A particular implementation of the UI could
differ in different environments (IDEs, Web, etc.), but the actual design of these implementations is beyond the scope of
this architecture blueprint
- The user-level UI should communicate to the user what is in the _Supplementary User Context_ at any moment.
- The user-level UI should allow the user to edit the contents of the _Supplementary User Context_.
### Optional steps
- Provide UI for _Project Administrators_ to configure global _AI Context Policy_. [Source Editor](../../../development/fe_guide/source_editor.md)
can be used as the editor for this type of YAML file format, similar to the
[Security Policy Editor](../../../user/application_security/policies/index.md#policy-editor).
- Implement a validation mechanism for _AI Context Policies_ to somehow notify the _Project Administrators_ in case
of the invalid format of the YAML configuration file. It could be a job in CI. But to catch possible issues proactively, it is
also advised to introduce the validation step as part of the
[pre-push static analysis](../../../development/contributing/style_guides.md#pre-push-static-analysis-with-lefthook)
## Design and implementation details
- **YAML Configuration File Format**: The proposed YAML configuration file format for defining the global
_AI Context Policy_ is as follows:
```yaml
ai_context_policy: [allow|block]
exclude:
- glob/**/pattern
```
The `ai_context_policy` section specifies the current policy for this and all underlying folders in a repo.
The `exclude` section specifies the exceptions to the `ai_context_policy`. Technically, it's an inversion of the policy.
For example, if we specify `foo_bar.js` in `exclude`:
- for the `allow` policy, it means that `foo_bar.js` will be blocked
- for the `block` policy, it means that `foo_bar.js` will be allowed (a short evaluation sketch follows this list)
- **User-Level UI for _Supplementary User Context_**: The UI for specifying _Supplementary User Context_ for prompts
can be implemented differently depending on the environment (IDEs, Web, etc.). However, the implementation should
ensure users can provide additional context for their prompts. The specified _Supplementary User Context_ for
each user can be stored as:
- a preference stored in the user profile in GitLab
- **Pros**: Consistent across devices and environments (Web, IDEs, etc.)
- **Cons**: Additional work in the monolith, potentially a lot of new read/writes to a database
- a preference stored in the local IDE/Web storage
- **Pros**: User-centric, local to user environment
- **Cons**: Different implementations for different environments (Web, IDEs, etc.), doesn't survive switching
environment or device
In both cases, the storage should allow the preference to be associated with a particular repository. Factors
like data consistency, performance, and implementation complexity should guide the decision on what type of storage
to use.
- To mitigate potential performance and scalability issues, it would make sense to keep _AI Context Retriever_, and
_AI Context Policy Management_ in the same environment as the feature needing those. It would be
[Language Server](https://gitlab.com/gitlab-org/editor-extensions/gitlab-lsp) for Duo features in IDEs and different
services in the monolith for Duo features on the Web.
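The `exclude` inversion described in the YAML format bullet above can be made concrete with a small, hedged evaluation sketch; the glob matching via `File.fnmatch?` is an illustrative choice, not part of the proposal.

```ruby
# Illustrative only; not the proposed retriever implementation.
policy = {
  'ai_context_policy' => 'allow',    # or 'block'
  'exclude'           => ['secrets/**/*', 'foo_bar.js']
}

def context_allowed?(policy, path)
  excluded = policy['exclude'].any? { |glob| File.fnmatch?(glob, path, File::FNM_PATHNAME) }
  policy['ai_context_policy'] == 'allow' ? !excluded : excluded
end

context_allowed?(policy, 'app/models/user.rb') # => true  (not excluded under an allow policy)
context_allowed?(policy, 'foo_bar.js')         # => false (excluded, so blocked under an allow policy)
```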
### Data flow
Here's the draft of the data flow demonstrating the role of _AI Context_ using the Code Suggestions feature as an example.
```mermaid
sequenceDiagram
participant CS as Code Suggestions
participant CR as AI Context Retriever
participant PM as AI Context Policy Management
participant LLM as Language Model
CS->>CR: Request Code Suggestion
CR->>CR: Retrieve Supplementary User Context list
CR->>CR: Retrieve Automatic AI Context list
CR->>PM: Check AI Context against Policy
PM-->>CR: Return valid AI Context list
CR->>CR: Fetch valid AI Context
CR->>LLM: Send prompt with final AI Context
LLM->>LLM: Generate code suggestions
LLM-->>CS: Return code suggestions
CS->>CS: Present code suggestions to the user
```
If the _AI Context Retriever_ fails to fetch some of the content from the _AI Context_, the prompt is sent with whatever
_AI Context_ was successfully fetched. In the low-probability case that the _AI Context Retriever_ cannot fetch any content, the prompt should be sent out as-is.
## Alternative solutions
### JSON Configuration Files
- **Pros**: Widely used, easier integration with web technologies.
- **Cons**: Less readable compared to YAML for complex configurations.
### Database-Backed Configuration
- **Pros**: Centralized management, dynamic updates.
- **Cons**: Not version controlled.
### Environment Variables
- **Pros**: Simplifies configuration for deployment and scaling.
- **Cons**: Less suitable for complex configurations.
### Policy as Code (without YAML)
- **Pros**: Better control and auditing with versioned code.
- **Cons**: It requires users to write code and us to invent a language for it.
### Policy in `.ai_ignore` and other Git-like files
- **Pros**: Provides a straightforward approach, identical to the `allow` policy with the list of `exclude` suggested in this blueprint
- **Cons**: Supports only the `allow` policy; the processing of this file type still has to be implemented
Based on these alternatives, the YAML file was chosen as the format for this blueprint because it is versioned
in Git and is more versatile than the `.ai_ignore` alternative.
## Suggested iterative implementation plan
Please refer to the [Proposal](#proposal) for a detailed explanation of the items in every iteration.
### Iteration 1
- Introduce the global `.ai-context-policy.yaml` YAML configuration file format and schema for this file type
as part of _AI Context Policy Management_.
- _AI Context Retrievers_ introduce support for _Supplementary User Context_.
- Optional: validation mechanism (like CI job and pre-push static analysis) for `.ai-context-policy.yaml`
**Success criteria for the iteration:** Prompts sent from the Code Suggestions feature in IDEs contain
_AI Context_ only with the open IDE tabs, which adhere to the global _AI Context Policy_ in the root of a repository.
### Iteration 2
- In _AI Context Retrievers_ introduce support for _Automatic AI Context_.
- Connect more features to the _AI Context Management_ system.
**Success criteria for the iteration:** Prompts sent from the Code Suggestions feature in IDEs contain _AI Context_
with items of _Automatic AI Context_, which adhere to the global _AI Context Policy_ in the root of a repository.
### Iteration 3
- Connect all Duo features on the Web and in IDEs to _AI Context Retrievers_ and adhere to the global
_AI Context Policy_.
**Success criteria for the iteration:** All Duo features in all environments send _AI Context_ which adheres to the
global _AI Context Policy_
### Iteration 4
- Support nested `.ai-context-policy.yaml` YAML configuration files.
**Success criteria for the iteration:** _AI Context Policy_ placed into the sub-folders of a repository, override
higher-level policies when sending prompts.
### Iteration 5
- User-level UI for _Supplementary User Context_.
**Success criteria for the iteration:** Users can see and edit the contents of the _Supplementary User Context_ and
the context is shared between all Duo features within the environment (Web, IDEs, etc.)
### Iteration 6
- Optional: UI for configuring the global _AI Context Policy_.
**Success criteria for the iteration:** Users can see and edit the contents of the _AI Context Policies_ in a UI
editor.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,210 +1,11 @@
---
status: proposed
creation-date: "2024-02-16"
authors: [ "@ash2k", "@ntepluhina" ]
coach: "@grzesiek"
approvers: [ "@nagyv-gitlab", "@nmezzopera" ]
owning-stage: "~devops::deploy"
participating-stages: [ "~devops::plan" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/autoflow/'
remove_date: '2025-07-08'
---
<!-- Blueprints often contain forward-looking statements -->
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/autoflow/).
# AutoFlow - workflows for automation
Automation + Workflow = AutoFlow.
## Summary
GitLab offers a single application for the whole DevSecOps cycle, and we aim to become the AllOps platform.
Being a platform means to provide users with tools to solve various problems in the corresponding domain.
There is a huge number of use cases that boil down to letting users automate interactions between the DevSecOps domain
objects.
Automation is key to increasing productivity and reducing cost in most businesses so those use cases are a big deal for
our customers.
We don't provide a comprehensive way to automate processes across the domain at the moment.
GitLab AutoFlow allows users to encode workflows of interactions between DevSecOps domain objects and external systems.
Users are able to share, reuse, and collaborate on workflow blocks.
## Motivation
### Goals
- Let users build workflows that automate tasks in the DevSecOps domain.
- Workflows should be able to interact with GitLab, users (via UI), and with external systems. Interactions are via API
calls and emitting/receiving events.
- Users should be able to share and reuse parts of a workflow on a self-serve basis.
- Workflow definitions should be testable without having to run them in a real environment.
- Security of customer data and GitLab platform should be kept in mind at all times.
- Workflows have to be executed in a durable way. Put differently, the automated show must go on even if a server
crashes or network connectivity is disrupted.
### Example use cases
#### Trivial
- When milestone on an issue is `Backlog`, set label `workflow::backlog`. When it's set to a milestone, set label to
`workflow::ready for dev`.
- When label on an issue is `group::environments`, set labels `devops::deploy` and `section::cd`.
- All what [GitLab Triage](https://gitlab.com/gitlab-org/ruby/gems/gitlab-triage) can do.
#### Interesting
- **Retro bot**: when a milestone expires or is closed, wait for the next Monday. Get a list of members of a group.
For each member open an issue in a project.
In the issue add a form with fields (a new UI component that we don't have now) to enter what went well in this
milestone, what didn't, and praise for the team. If an issue stays open for more than two days, ping the assigned team
member.
Once all opened issues have been closed or a week later, whatever happens first, collect form data from them
and open a new issue with that data aggregated.
Assign to the group's manager, mention all group members.
- **Project compliance**: when project settings change, trigger a "pre-commit" flow that allows for programmatic
validation of the intended changes. Restricting project settings is a common compliance requirement. Today, GitLab
role model does not allow for much customization, and users work around this functionality with code-based automations
like Terraform. An alternative, often requested approach is to restrict project settings at higher levels. Given the
wide variety of project settings, this would likely either have only partial support or would require re-implementing
all the project settings in the compliance settings. Overall, most single use-case solutions will likely have serious
maintenance and scalability issues. Implementing validation as code could provide a simple interface.
#### Sophisticated
- **Deployments**: when a commit is merged into the main branch:
- A build should run.
- On success, certain artifacts should be combined into a release.
- The release then should be rolled out to pre-production environment using a certain deployment method.
- The deployment should be soaked there under synthetic load for 1 day.
- Then promoted to staging.
- After 1 more day in staging, the release should be gradually rolled out to production.
- Production rollout should start with a canary deployment.
- Then it should be scaled up in 10% increments each hour.
- For that deployment, anomaly detection system should be monitoring certain metrics.
- It should stop the rollout if something unusual is detected (we don't have this mechanism yet, but it'd be
great), notify the SRE team.
- If things are "really bad" (i.e. certain metrics breach certain thresholds), create an incident issue and start
rolling the deployment back.
- Keep the incident issue up to date with what's happening with the deployment.
- Get information about the Deployment object (let's assume we are deploying to Kubernetes), events in
the namespace, and Pod logs from the GitLab agent for Kubernetes.
- Feed that into GitLab Duo to get advice on what the problem might be and how to fix it. Post the reply as a comment.
- **Compliance in workflows**: any of the automated workflows, e.g. the one above, can have one or more steps where
a manual interaction from a user is awaited.
- If we let workflows generate UI elements, they could wait for those forms
to be filled, for buttons pushed, etc and take further actions based on user input (or lack of - timeouts).
- We could have a workflow request an approval from the risk management team if the deployment is happening during
a [PCL](https://handbook.gitlab.com/handbook/engineering/infrastructure/change-management/#production-change-lock-pcl).
- Because the process is automated, automation is code that is version-controlled, passing an audit becomes easier. No
chance to forget to follow the process if it's automated and there is no way around it.
- **Access requests**: most (?) of
our [access requests](https://gitlab.com/gitlab-com/team-member-epics/access-requests/)
can probably be automated.
- Team member creates an issue, fills in a form, assigns to their manager, they approve by setting a
label or pressing a special button, automation takes it from there - most systems have APIs that can be used to make
the requested changes.
- Consider how much time is wasted here - people have to wait, people have to do repetitive work.
- Manual actions mean there is a chance of making a mistake while making an important change.
- It's not only us, most of the businesses have a need to automate such processes.
### Related issues
Over the years we've accumulated many issues, epics, ideas, use cases on the topic of automation. Here are some of the
more interesting ones.
---
[Improved Work Item lifecycle management & native automations](https://gitlab.com/groups/gitlab-org/-/epics/364),
[GitLab Automations](https://gitlab.com/groups/gitlab-org/-/epics/218), [Workflows Solution Validation](https://gitlab.com/gitlab-org/gitlab/-/issues/344136).
These look at the problem from the `devops::plan` point of view:
> Customers and prospects frequently lament that there is no way to easily manage end-to-end workflows (Epic, Issue,
> MR...) within GitLab.
>
> Officially requested by 14 distinct accounts and is the third most requested / highest value capability from the Plan
> stage.
See the linked issues from the epics too.
[Configure label to be removed when issue is closed](https://gitlab.com/gitlab-org/gitlab/-/issues/17461) is yet
another example. 283 upvotes.
---
[Automatable DevOps](https://gitlab.com/gitlab-org/gitlab/-/issues/330084) is Mikhail's previous attempt to provide the
automation capability. It inspired lots of thinking and led to this proposal.
---
[Add the ability to define an issue/MR event and an action to take as a result of that event](https://gitlab.com/gitlab-org/gitlab/-/issues/242194).
Customer [quote](https://gitlab.com/gitlab-org/gitlab/-/issues/242194#note_1785436689):
> I'm in agreement. I'm having a hard enough time bringing a development team on board to GitLab, adding manual label
> management to the process when parts of it should be done via automation adds to the challenge.
>
> We don't want to auto-close issues on merge and have defined a QA role to perform that step. The problem I'm working
> on figuring out now is how to automate label management on an issue when the associated MR is closed, while leaving
> the
> Issue open but updating the workflow labels on it automatically.
>
> We're a smallish team and I need to be focused on product development, not how to build GitLab automation scripts.
>
> Having the ability to trigger some events as a part of an MR merging to manage other aspects of the system would be
> extremely helpful.
---
Some use cases from `group::delivery` (from
[this comment](https://gitlab.com/gitlab-org/ci-cd/section-showcases/-/issues/54#note_1663194580)):
- If we have events from when certain files are added/changed in Git for a project, we could use this to automate the
Provisioner in the Runway platform (and deprovision when people want to).
- Automating certain tasks when a new backport request issue is created.
- Automated tasks when we want to start a new monthly release.
- Moving to a "GitLab deployment Engine" that is more powerful than GitLab CI alone. This is perhaps the most
interesting use case to me, but I do wonder how complicated it would be to manage these workflows.
---
Some use cases from the Remote Development team (from
[this comment](https://gitlab.com/gitlab-org/ci-cd/section-showcases/-/issues/54#note_1658464245)):
> A real world example of this is the Remote Development Teams work to implement
> [a standard XP/Scrum style velocity-based process and workflow](https://handbook.gitlab.com/handbook/engineering/development/dev/create/ide/#planning-process-overview)
> in GitLab.
>
> There's
> [multiple limitations in GitLab the product itself](https://gitlab.com/cwoolley-gitlab/gl-velocity-board-extension#why-doesnt-standard-gitlab-support-this)
> which make it difficult to use this common process, and we have to work around them.
>
> To avoid the manual toil of making this process work in GitLab, we would like to automate it. However our efforts to
> set up the
> [several desired automations](https://handbook.gitlab.com/handbook/engineering/development/dev/create/ide/#automations)
> have been limited because of the barriers to implementing and testing them in
> Triage Bot, especially for ones that contain more complex logic, or can't be implemented solely via quick actions.
>
> I believe a tool like GitLab Flow would make it much easier for us and our customers to implement common but
> non-supported processes and workflows such as this on top of GitLab, without having to wait months or years for a
> given feature to be shipped which unblocks the effort.
## Proof of concept, demos
- [Implementation issue](https://gitlab.com/gitlab-org/cluster-integration/gitlab-agent/-/issues/473)
- Video with [conceptual and technical details, first demo](https://www.youtube.com/watch?v=g9HSPV3GKas). It's a long
video, **watch on 1.5x**. [Skip right to the demo](https://youtu.be/g9HSPV3GKas?t=1325) at 22:05.
- [Slides](https://docs.google.com/presentation/d/1doMdiyusAjzHq-hlqHqHr0y4WZN2EiJZHFS_PVrTfJ8/edit?usp=sharing). Please
see speaker notes for links and code.
- Demo project: N/A. See speaker notes for code (as text, not video).
- Implementation MRs:
[kas part](https://gitlab.com/gitlab-org/cluster-integration/gitlab-agent/-/merge_requests/1173), [Rails part](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/136696).
Since the above demo went well, the [GitLab AutoFlow for internal use](https://gitlab.com/groups/gitlab-org/-/epics/12120)
epic was opened. Then we tried to address a concrete use case
in [AutoFlow PoC: configurable automation for issue status updates](https://gitlab.com/groups/gitlab-org/-/epics/12571).
We recorded two more demos as part of that (see the epic for more details):
- [GitLab AutoFlow PoC, iteration 1](https://www.youtube.com/watch?v=2Ntdnv2LY6I)
- [AutoFlow UI for issues triaging (Iteration 3 demo)](https://www.youtube.com/watch?v=bIBWxcJ1YTg&list=PL05JrBw4t0Kqgx_Pzuum5GeyNkMMcf2Bp&index=6)
## Links to related documents
- [Relation of GitLab AutoFlow to GitLab CI](relation_to_ci.md)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,65 +1,11 @@
---
stage: Deploy
group: Environments
info: Relation of GitLab AutoFlow to GitLab CI
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/autoflow/relation_to_ci/'
remove_date: '2025-07-08'
---
# Relation of GitLab AutoFlow to GitLab CI
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/autoflow/relation_to_ci/).
GitLab CI and GitLab AutoFlow are different tools for solving different sets of problems. Here are the differences based
on the
[PoC](https://gitlab.com/groups/gitlab-org/-/epics/12571#note_1759648935) / [demo implementation](https://gitlab.com/gitlab-org/ci-cd/section-showcases/-/issues/54)
of the idea that is based on [Temporal](https://temporal.io/). Technical details and decisions on what technology to use
will be part of separate documents. But, since the question of relation to GitLab CI came up a few times, the following
is documented here to pre-emptively answer the question.
## Conceptual differentiation
- GitLab CI solves the problem of Continuous Integration. Use it to build and test your software.
- GitLab AutoFlow solves the problem of automation in the DevSecOps domain, but not CI.
Use it to automate business processes.
## Task-based differentiation
Use GitLab CI if:
- Need to execute a program/binary/tool, including a (shell) script.
- Need to execute a container.
- Need to perform heavy computations.
- Need lots of RAM to perform an operation.
Use GitLab AutoFlow for:
- Orchestrating complex, cross-project CI pipelines as part of the DevSecOps domain.
- Manipulating DevOps domain object(s) when something happens (or on a schedule) by calling APIs.
- Waiting for an unspecified amount of time (possibly days or even weeks) for async events to take place
before proceeding.
## Implementation differences
Temporal-based GitLab AutoFlow implementation (a minimal sketch follows the two lists below):
- Designed for durable execution. That is, it can safely resume workflow execution after a failure.
- Designed to run for an arbitrarily long time (literally years). That is, it can wait for events and/or timers to "wake up" a
workflow, occupying only disk space in the DB for state storage. No CPU/RAM resources are reserved or used for a
non-executing workflow.
- Not designed to run heavy execution tasks. This is not a limitation of Temporal (as it does not run any code), it's
just that this PoC doesn't give the user a way to run something computationally expensive. You could do computations in
Starlark, but you cannot run an external program.
- Not designed to run containers.
- Activities (the executable units of a workflow) have near-zero execution overhead. Think "function invocation" in an
already running program: no startup cost at all. Activities are literally functions in kas that kas calls when it's told to.
- Not designed (at least not in this PoC) to run untrusted code. However, the Starlark interpreter does not do code generation
and is built in Go, not C, so most typical "interpreter VM" vulnerabilities are simply impossible. This means it's
quite safe to execute untrusted Starlark code. Such code can only interact with the host program/machine via objects
explicitly injected into the script, which we control; it cannot do anything else.
GitLab CI:
- Is not designed for durable execution. If a job fails, it can be manually restarted, but it will run from the start,
not from the particular point where it failed. Whether it is safe to restart a failed job depends on what the job does.
It's far from a 1:1 comparison, but unlike CI jobs, Temporal activities are (and must be) idempotent,
so they are safe to retry automatically.
- Designed as a perfect solution for Continuous Integration.
- Designed to run arbitrary containers and untrusted code.
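To make the durable-execution and long-timer properties above more concrete, here is a minimal, hypothetical sketch using
the Temporal Go SDK. It is not the PoC code: the workflow and activity names (`GradualRolloutWorkflow`, `PromoteRelease`,
`CheckAnomalies`, `ScaleCanary`, `RollBack`) are invented for illustration, and in the actual PoC the activities are
functions inside kas and the workflow logic is written in Starlark rather than Go.

```go
package rollout

import (
	"context"
	"time"

	"go.temporal.io/sdk/temporal"
	"go.temporal.io/sdk/workflow"
)

// GradualRolloutWorkflow soaks a release for a day, promotes it, and then
// ramps it up by 10% every hour, checking anomaly detection before each step.
// The workflow occupies no CPU/RAM while it sleeps; Temporal persists its
// state and resumes it after worker restarts or failures.
func GradualRolloutWorkflow(ctx workflow.Context, release string) error {
	ctx = workflow.WithActivityOptions(ctx, workflow.ActivityOptions{
		StartToCloseTimeout: 5 * time.Minute,
		RetryPolicy:         &temporal.RetryPolicy{MaximumAttempts: 3},
	})

	// Durable timer: "soak" the release for a day before promoting it.
	if err := workflow.Sleep(ctx, 24*time.Hour); err != nil {
		return err
	}
	if err := workflow.ExecuteActivity(ctx, PromoteRelease, release, "production-canary").Get(ctx, nil); err != nil {
		return err
	}

	for percent := 10; percent <= 100; percent += 10 {
		var anomalous bool
		if err := workflow.ExecuteActivity(ctx, CheckAnomalies, release).Get(ctx, &anomalous); err != nil {
			return err
		}
		if anomalous {
			// Roll back and stop the rollout.
			return workflow.ExecuteActivity(ctx, RollBack, release).Get(ctx, nil)
		}
		if err := workflow.ExecuteActivity(ctx, ScaleCanary, release, percent).Get(ctx, nil); err != nil {
			return err
		}
		if err := workflow.Sleep(ctx, time.Hour); err != nil {
			return err
		}
	}
	return nil
}

// Hypothetical activities; in the PoC, activities are functions inside kas.
// Activities must be idempotent so that they are safe to retry automatically.
func PromoteRelease(ctx context.Context, release, env string) error      { return nil }
func CheckAnomalies(ctx context.Context, release string) (bool, error)   { return false, nil }
func ScaleCanary(ctx context.Context, release string, percent int) error { return nil }
func RollBack(ctx context.Context, release string) error                 { return nil }
```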
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,175 +1,11 @@
---
status: proposed
creation-date: "2024-02-07"
authors: [ "@vshumilo" ]
coach: "@vitallium"
approvers: [ "@tgolubeva", "@jameslopez" ]
owning-stage: "~devops::fulfillment"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/cdot_plan_managment/'
remove_date: '2025-07-08'
---
# Automate CustomersDot Plan management
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/cdot_plan_managment/).
## Summary
The [GitLab Customers Portal](https://customers.gitlab.com/) is an independent application, distinct from the GitLab product, designed to empower GitLab customers to manage their accounts and subscriptions, and to conduct tasks such as renewing and purchasing additional seats. More information about the Customers Portal can be found in [the GitLab docs](../../../subscriptions/customers_portal.md). Internally, the application is known as [CustomersDot](https://gitlab.com/gitlab-org/customers-gitlab-com) (also known as CDot).
GitLab uses [Zuora's platform](https://handbook.gitlab.com/handbook/business-technology/enterprise-applications/guides/zuora/) as the SSoT for all product-related information. The [Zuora Product Catalog](https://knowledgecenter.zuora.com/Get_Started/Zuora_quick_start_tutorials/B_Billing/A_The_Zuora_Product_Catalog) represents the full list of revenue-making products and services that are sellable, or have been sold by GitLab, which is core knowledge for CustomersDot decision making. CustomersDot currently has a local cache of the Zuora Product Catalog via the [IronBank](https://github.com/zendesk/iron_bank) gem and [its LocalRecord extension](https://gitlab.com/gitlab-org/customers-gitlab-com/blob/45f5dedbb4fa803d19827472214ea0b5b0ce1861/lib/gem_extensions/iron_bank/local_records.rb#L1).
CustomersDot uses `Plan` as a wrapper class for easy access to all the details about a Plan in the Product Catalog. Given that the name, price, minimum quantity, and other details of a Plan are spread across the `Zuora::ProductRatePlan`, `Zuora::ProductRatePlanCharge`, and `Zuora::ProductRatePlanChargeTier` objects, accessing these details directly can be cumbersome. This class is very useful because it saves us the need to query for all these details. In addition, this class helps with the classification of `Zuora::ProductRatePlan`s based on their tier, deployment type, and other criteria to be used across the app.
CustomersDot's cached Product Catalog is currently synced manually and requires a restart of CustomersDot to be fully refreshed due to limitations in the `Plan` class. Every time a new Product, Product Rate Plan, or Product Rate Plan Charge is updated or added to the Zuora Product Catalog, additional manual effort is required to add it to the `Plan` class and configure it.
The main goal for this design document is to improve the architecture and maintainability of the `Plan` model within CustomersDot, so that when the Product Catalog is updated in Zuora, the update automatically reflects in CustomersDot without requiring app restarts, code changes, or manual intervention.
## Motivation
Current Zuora Product Catalog updates are not automatically picked up by CustomersDot for a couple of reasons:
- CustomersDot's cached Product Catalog sync requires a manual intervention via Rails console and a full refresh requires a server restart due to `Plan` heavily relying on constants and class variables for its `Zuora::ProductRatePlan` classification.
- Every time a new Product / SKU is added to the Zuora Product Catalog, even if the previously described cache is refreshed, code changes in CustomersDot are required to make it available. This is due to the current classification strategy of the `Plan` class: `Zuora::ProductRatePlan` IDs are assigned to constants, those IDs are manually grouped to represent different categories (for example, all plans in the Ultimate tier, or all the add-ons available for self-procurement on GitLab.com), and those categories are then used for decision-making during execution.
As the codebase and the number of products grow, this manual intervention becomes more expensive.
### Goals
The main goals are:
- Make sure the CustomersDot cached Product Catalog is in sync with Zuora at any point in time.
- Automate the Plan management in CustomersDot so it will require no manual intervention for basic Product Catalog updates in Zuora. For example, if a new Product / SKU is added, if a RatePlanCharge is updated, or if a Product is discontinued. For this we need to step away from hardcoding product rate plan ids within CustomersDot and transfer the classification knowledge to the ProductCatalog (by adding CustomersDot metadata to it in the form of custom fields) to be able to resolve these sets dynamically from the `LocalRecord`s on demand.
## Proposal
CustomersDot currently [has a local cache](https://gitlab.com/gitlab-org/customers-gitlab-com/-/merge_requests/1762) of the Zuora's Product Catalog via the [IronBank](https://github.com/zendesk/iron_bank) gem and [its LocalRecord extension](https://gitlab.com/gitlab-org/customers-gitlab-com/blob/45f5dedbb4fa803d19827472214ea0b5b0ce1861/lib/gem_extensions/iron_bank/local_records.rb#L1).
At the moment we refresh this cache manually when we are notified that a new change exists in Zuora that is of interest to CustomersDot:
```mermaid
sequenceDiagram
participant CustomersDot
participant Zuora
Note left of CustomersDot: ProductCatalog refresh is triggered<br/>via Rails console
CustomersDot->>Zuora: GET Product Catalog
Zuora->>CustomersDot: Respond with Product Catalog
CustomersDot->>CustomersDot: Cache copy of Product Catalog in LocalRecord database
Note left of CustomersDot: For future Product Catalog queries<br/>LocalRecords are used.
CustomersDot->>CustomersDot: GET Zuora::Product
CustomersDot->>CustomersDot: GET Zuora::ProductRatePlan
Note right of Zuora: Product information was updated
CustomersDot->>CustomersDot: GET Zuora::ProductRatePlanCharge
CustomersDot->>CustomersDot: GET Zuora::ProductRatePlanChargeTier
Note left of CustomersDot: CustomersDot is unaware of Zuora changes<br/>until next deployment
```
### Iteration 1
Keep Product Catalog in sync with Zuora so at any point in time:
```mermaid
sequenceDiagram
participant CustomersDot
participant Zuora
Note right of Zuora: Product information was updated
Zuora->>CustomersDot: Notification on Product update
CustomersDot->>Zuora: GET Product Catalog
Zuora->>CustomersDot: Respond with Product Catalog
CustomersDot->>CustomersDot: Refresh Product Catalog cache (LocalRecord database)
Note left of CustomersDot: CustomersDot Product Catalog<br/>cache is up to date with Zuora
```
### Iteration 2
Transfer CustomersDot's classification knowledge to the Zuora Product Catalog (by adding CustomersDot metadata to it in the form of custom fields) to be able to resolve `ProductRatePlan`s directly from the `LocalRecord`s on demand.
We are proposing to add these custom fields to the Product Catalog:
```mermaid
---
title: Zuora Product Catalog Proposed Additions
---
erDiagram
"Product" ||--|{ "ProductRatePlan" : "has many"
"ProductRatePlan" ||--|{ "ProductRatePlanCharge" : "has many"
"ProductRatePlanCharge" ||--|{ "ProductRatePlanChargeTier" : "has many"
"Product" {
enum Tier__c
enum DeploymentType__c
}
"ProductRatePlan" {
boolean WebDirect__c
}
```
### Iteration 3
Use Zuora custom metadata (introduced in iteration 2) to resolve `ProductRatePlan`s directly from the Zuora Product Catalog, and remove the `Plan` constants that are preventing the full cache refresh.
## Design and implementation details
### Iteration 1
**(Iteration 1) Product Catalog is in sync with Zuora**
- Add a cron job to refresh the Product Catalog every day, as a first iteration, to add immediate value.
- Create a Finance Systems issue to request:
- New custom event for when custom fields are updated for records from the Zuora Product Catalog.
| Base object | Custom event name |
| ------------------------- | -------------------------------------- |
| Product | CatalogProductUpdate |
| ProductRatePlan | CatalogProductRatePlanUpdate |
| ProductRatePlanCharge | CatalogProductRatePlanChargeUpdate |
| ProductRatePlanChargeTier | CatalogProductRatePlanChargeTierUpdate |
- New callout under the `Billing` component for when records from the Zuora Product Catalog are added, deleted or updated.
- Create a new controller in CustomersDot based on `ActionController::Metal` to avoid redundant middleware, callbacks, and other Rails overhead, and to make this controller as fast as possible.
```ruby
module Zuora
  class WebHooksController < ActionController::Metal
    feature_category :platform

    def create
      # Step 1. Validate and save the incoming webhook payload into the database
      # Step 2. Kick off the SyncProductCatalogJob
      head :ok
    end
  end
end
```
Ensure a debouncing strategy for `SyncProductCatalogJob` in case we get too many Product Catalog update notifications in a short period of time. Initially we can delay its execution for 5 minutes and ensure no new job is enqueued if one is already in the queue.
**(Iteration 2) Transfer CustomersDot's classification knowledge to the Zuora Product Catalog**
_All these changes require a Finance Systems issue._
- Review existing field `Zuora::Product#category` to make sure it is properly set for all Products. Possible values: `[null, "Base Products", "Add On Services", "Miscellaneous Products"]`.
- Add new custom field `Zuora::ProductRatePlan#web_direct` to be a `boolean`
- true: the plan is available for self service
- false: the plan is not available for self service
- Add new custom field `Product#tier` to be an `enum` (multiselect). Possible values: `[premium, ultimate, starter, bronze, silver, gold, free, null]`
- Add new custom field `Product#deployment_type` to be an `enum` (multiselect). Possible values: `[self_managed, dedicated, gitlab_dot_com]`
For each added field, the value in Zuora has to be aligned with the CustomersDot classification given by the current grouping of `Zuora::ProductRatePlan` IDs in the `Plan` class.
NOTE:
There is a [current effort](https://gitlab.com/gitlab-com/business-technology/enterprise-apps/intake/-/issues/44) to add some of these fields to Zuora, so we might be able to reuse them. If we do reuse them, we need to double-check that the value in Zuora and the CustomersDot classification are aligned for each field.
**(Iteration 3) Use this Zuora custom metadata to resolve `ProductRatePlan`s directly from the Zuora Catalog**
- Create scopes to fetch `Zuora::Product`s and `Zuora::ProductRatePlan`s based on the metadata introduced in Iteration 2. Possible scopes:
- `self_managed`
- `dedicated`
- `gitlab_dot_com`
- `base_products`
- `add_ons`
- `web_direct`
- `sales_assisted`
- `ultimate`
- `premium`
- `active` (based on the effective start / end dates)
- Replace the usage of `Plan` constants that represent a collection of records that meet a given classification with a call to a method that loads the same collection from LocalRecords using the implemented scopes, e.g. `ALL_ULTIMATE_SM_PLANS` can be replaced with `Zuora::Product.self_managed.ultimate.flat_map(&:product_rate_plans).map(&:id)`. This step can be done in iterations until all constants are replaced. Depending on how complex each iteration is, we can decide whether a feature flag is required.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -432,7 +432,7 @@ sequenceDiagram
[Spanner](https://cloud.google.com/spanner) will be a new data store introduced into the GitLab Stack, the reasons we are going with Spanner are:
1. It supports Multi-Regional read-write access with far fewer operations when compared to PostgreSQL, helping with our [regional DR](../disaster_recovery/index.md)
1. It supports Multi-Regional read-write access with far fewer operations when compared to PostgreSQL, helping with our [regional DR](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/)
1. The data is read-heavy, not write-heavy.
1. Spanner provides [99.999%](https://cloud.google.com/spanner/sla) SLA when using Multi-Regional deployments.
1. Provides consistency whilst still being globally distributed.
@ -547,7 +547,7 @@ However looking at the [performance documentation](https://cloud.google.com/span
## Disaster Recovery
We must stay in our [Disaster Recovery targets](../disaster_recovery/index.md#dr-implementation-targets) for the Topology Service.
We must stay in our [Disaster Recovery targets](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/#dr-implementation-targets) for the Topology Service.
Ideally, we need smaller windows for recovery because this service is in the critical path.
The service is stateless, which should be much easier to deploy to multiple regions using [runway](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/1206).

View File

@ -1,84 +1,11 @@
---
status: ongoing
creation-date: "2024-01-12"
authors: [ "@grzesiek" ]
coach: "@grzesiek"
approvers: [ "@gabrielengel_gl"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_build_speed/benchmark/'
remove_date: '2025-07-08'
---
# CI Build Speed Benchmarking Framework
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_build_speed/benchmark/).
In order to understand how GitLab CI performs in terms of CI build speed, we
plan to build a CI Build Speed Benchmarking Framework.
## Benchmark
In order to run the benchmark, we will:
1. Install the benchmarking tool.
1. Start the tool.
1. Run scenarios.
1. Report results back to the GitLab data warehouse.
In the first iteration, we will focus on measuring the speed of GitLab CI, GitHub Actions, and CircleCI.
## Principles
There are a few design principles we should abide by:
1. Make it CI-platform agnostic. Can run on any Continuous Integration platform.
1. Do not depend on any specific technology that might not be available on some platforms.
1. Easy installation setup, not requiring many dependencies. Zero-dependency would be ideal.
1. Send results back to GitLab through an HTTP request, unless there is a better way.
1. Read as much data as possible about the environment running a build and send the details in the telemetry.
## Benchmarking: Client Side
The benchmarking tool should be able to measure every step of CI build
execution:
1. Time from build requested to scenario execution started.
1. Monotonic time to execute each of the steps of the scenario.
1. Thread time to execute each of the steps of the scenario.
1. Time required to report results back to GitLab.
Ideally, the tool could collect this data in the
[OpenTelemetry Tracing](https://opentelemetry.io/docs/specs/otel/trace/api/)
format.
### Go-based tool
One of the solutions that could meet the requirements / principles listed
above is a Go-based binary, which would be installed on different CI
platforms using `wget` / `curl` or in another convenient way; a rough sketch follows
the list below. The benefits of using a binary are:
1. Easy installation method, without the need to use containers.
1. Few external dependencies for a statically-linked binary.
1. Many libraries available, for tracing or HTTP / API integrations.
1. Multi-threaded execution mode that broadens benchmarking scope.
1. Expressive language that can make it easier to maintain the scenarios.
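A rough sketch of what such a binary could look like, assuming nothing about the final design: the scenario steps, the
`BENCHMARK_CI_PLATFORM` environment variable, and the ingestion URL below are placeholders, and the real tool would
presumably emit OpenTelemetry traces rather than a hand-rolled JSON payload.

```go
package main

import (
	"bytes"
	"encoding/json"
	"log"
	"net/http"
	"os"
	"time"
)

type stepResult struct {
	Name       string `json:"name"`
	WallTimeNS int64  `json:"wall_time_ns"`
}

// timeStep measures one scenario step using Go's monotonic clock.
func timeStep(name string, fn func() error) stepResult {
	start := time.Now()
	if err := fn(); err != nil {
		log.Printf("step %q failed: %v", name, err)
	}
	return stepResult{Name: name, WallTimeNS: time.Since(start).Nanoseconds()}
}

func main() {
	steps := []struct {
		name string
		fn   func() error
	}{
		{"hello_world", func() error { return nil }},   // placeholder for `echo "Hello World"`
		{"cpu_intensive", func() error { return nil }}, // placeholder for a cryptographic benchmark
	}

	var results []stepResult
	for _, s := range steps {
		results = append(results, timeStep(s.name, s.fn))
	}

	payload, err := json.Marshal(map[string]interface{}{
		"ci_platform": os.Getenv("BENCHMARK_CI_PLATFORM"), // hypothetical environment variable
		"steps":       results,
	})
	if err != nil {
		log.Fatalf("failed to encode results: %v", err)
	}

	// Placeholder ingestion endpoint; the real telemetry destination is not decided yet.
	resp, err := http.Post("https://benchmark.example.com/results", "application/json", bytes.NewReader(payload))
	if err != nil {
		log.Fatalf("failed to report results: %v", err)
	}
	defer resp.Body.Close()
	log.Printf("reported %d steps, got %s", len(results), resp.Status)
}
```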
## Benchmarking: Server Side
### Pipelines scheduler
In order to run the benchmark, a new build / pipeline / job will have to be
started on the continuous integration platform under test. Some platforms support
scheduled pipelines, but this could make it difficult to measure the build
start-up time. One alternative to consider during the implementation is to start
pipelines using API trigger endpoints. Most of the CI platforms support this
way of running pipelines, and we could pass the start-up time / pipeline
creation request time as an argument, which would then be consumed by the
benchmarking tool and forwarded to the data warehouse along with the build
benchmark telemetry (see the sketch below).
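For illustration, a sketch of triggering a benchmark pipeline through the GitLab pipeline trigger API and passing the
request time as a CI/CD variable. The variable name `BENCHMARK_REQUESTED_AT` and the environment variables used here
are hypothetical, and other CI platforms would need their own equivalent.

```go
package main

import (
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"time"
)

func main() {
	projectID := os.Getenv("BENCHMARK_PROJECT_ID")        // hypothetical, for example "12345"
	triggerToken := os.Getenv("BENCHMARK_TRIGGER_TOKEN")  // hypothetical pipeline trigger token

	form := url.Values{}
	form.Set("token", triggerToken)
	form.Set("ref", "main")
	// The benchmarking tool running inside the job can subtract this value
	// from its own start time to estimate the pipeline start-up latency.
	form.Set("variables[BENCHMARK_REQUESTED_AT]", fmt.Sprintf("%d", time.Now().UnixNano()))

	endpoint := fmt.Sprintf("https://gitlab.com/api/v4/projects/%s/trigger/pipeline", projectID)
	resp, err := http.PostForm(endpoint, form)
	if err != nil {
		log.Fatalf("trigger request failed: %v", err)
	}
	defer resp.Body.Close()
	log.Printf("trigger response: %s", resp.Status)
}
```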
### Data warehouse
The server side that receives benchmarking telemetry will eventually need
to forward the data to a data warehouse in which we will be able to visualize
results, such as Kibana or our Observability / Tracing tooling.
Before doing that, it could be advisable to persist the payload in object
storage, just in case we need to migrate historical entries to a different data
warehouse later on.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,73 +1,11 @@
---
status: ongoing
creation-date: "2024-01-12"
authors: [ "@grzesiek" ]
coach: "@grzesiek"
approvers: [ "@gabrielengel_gl"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_build_speed/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_build_speed/).
# CI/CD Build Speed
## Summary
GitLab CI is a Continuous Integration platform which is widely used to run a
variety of jobs, builds, and pipelines. It was [integrated into GitLab in September 2015](https://about.gitlab.com/releases/2015/09/22/gitlab-8-0-released/)
and has become [one of the most beloved CI/CD solutions](https://about.gitlab.com/blog/2017/09/27/gitlab-leader-continuous-integration-forrester-wave/).
Over the years we've added a lot of new features and code to the GitLab CI
platform. In order to retain the "one of the most beloved solutions" status, we
also need to pay attention to making it fast, reliable, and secure. This design
doc describes the path towards the first of these: making GitLab CI fast by
improving CI build speed.
## Goals
1. Establish a CI Speed Benchmark, used to compare GitLab CI to other platforms.
1. Build CI Benchmark Framework to measure the GitLab CI speed over the long term.
1. Describe next steps for improving GitLab CI Build Speed.
## Proposal
### CI Speed Benchmark
First, we plan to build a [CI Speed Benchmark](benchmark.md) solution that
will allow us to run specific scenarios on various CI/CD platforms and ingest
the results into our data warehouse.
This will make it possible to define a baseline of the CI Build Speed for many
different scenarios and track the progress we, and other providers, are making
over time.
The core part of this goal is to define a set of scenarios that will allow us
to build proxy metrics for build speed. For example, we could run the following
scenarios:
1. Time to first byte of build log for `echo "Hello World"` build.
1. Time to result to perform a CPU-intensive cryptographic operation.
1. Time to result to perform a memory-intensive operation on a given amount of bytes.
1. Time to result to build a Linux kernel.
The scenarios should be idempotent and deterministic.
In the first iteration, we will only focus on the total job execution time, and not go into detail e.g. comparing specific startup times.
### CI Benchmark Framework
Once we define scenarios that we want to implement, we should build a
[CI Benchmark Framework](benchmark.md). The framework will be used to run
scenarios in a Continuous Integration environment, and to send the results back
to our data warehouse, for analysis and comparison.
The main principles behind design choices for the framework, are:
1. Make it CI-platform agnostic. Can run on any Continuous Integration platform.
1. Do not depend on any specific technology that might not be available on some platforms.
1. Easy installation setup, not requiring many dependencies. Zero-dependency would be ideal.
1. Send results back to GitLab through an HTTP request, unless there is a better way.
### Improve CI Build Speed
Once we can measure CI Build Speed, improving it becomes possible. We will
define the next steps for improving the speed once we have initial results.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,249 +1,11 @@
---
status: proposed
creation-date: "2024-03-29"
authors: [ "@sean_carroll", "@eduardobonet" ]
coach: "@jessieay"
approvers: [ "@susie.bee", "@m_gill" ]
owning-stage: "~devops::ai-powered"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/custom_models/'
remove_date: '2025-07-08'
---
<!-- Blueprints often contain forward-looking statements -->
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/custom_models/).
# Self-Hosted Model Deployment
This Blueprint describes support for customer self-deployments of Mistral LLMs as a backend for GitLab Duo features, as an alternative to the default Vertex or Anthropic models offered on GitLab Dedicated and .com. This initiative supports both internet connected and air-gapped GitLab deployments.
## Motivation
Self-hosted LLM models allow customers to manage the end-to-end transmission of requests to enterprise-hosted LLM backends for [GitLab Duo features](../../../user/ai_features.md), and keep all requests within their enterprise network. By default, GitLab provides the LLM backends of Google Vertex and Anthropic, hosted externally to GitLab. GitLab Duo feature developers are able to access other LLM choices via the AI Gateway. More details on model and region information can be [found here](https://gitlab.com/groups/gitlab-org/-/epics/13024#current-feature-outline).
### Goals
Self-Managed models serve sophisticated customers capable of managing their own LLM infrastructure. GitLab provides the option to connect supported models to LLM features. Model-specific prompts and GitLab Duo feature support are provided by the self-hosted models feature.
- Choice of LLM models
- Ability to keep all data and request/response logs within their own domain
- Ability to select specific GitLab Duo Features for their users
- Non-reliance on the .com AI Gateway
### Non-Goals
Other features that are goals of the Custom Models group and which may have some future overlap are explicitly out of scope for the current iteration of this blueprint. These include:
- Local Models
- RAG
- Fine Tuning
- GitLab managed hosting of open source models, other than the current supported third party models.
- Bring Your Own API Key (BYOK)
## Proposal
GitLab will provide support for specific LLMs hosted in a customer's infrastructure. The customer will self-host the AI Gateway, and self-host one or more LLMs from a predefined list. Customers will then configure their GitLab instance for specific models by LLM feature. A different model can be chosen for each GitLab Duo feature.
This feature is accessible at the instance-level and is intended for use in GitLab Self-Managed instances.
Self-hosted model deployment is a [GitLab Duo Enterprise Add-on](https://about.gitlab.com/pricing/).
## Design and implementation details
### Component Architecture
```mermaid
graph LR
a1 --> c1
a2 --> b1
b1 --> c1
b3 --> b1
b4 --> b1
c1 --> c2
c2 --> c3
c3 --> d1
d1 --> d2
subgraph "User"
a1[IDE Request]
a2[Web / CLI Request]
end
subgraph "Self-Managed GitLab"
b1[GitLab Duo Feature] <--> b2[Model & Feature-specific<br/>Prompt Retrieval]
b3[GitLab Duo Feature<br/>Configuration]
b4[LLM Serving Config]
end
subgraph "Self-Hosted AI Gateway"
c1[Inbound API interface]
c2[Model routing]
c3[Model API interface]
end
subgraph "Self-Hosted LLM"
d1[LoadBalancer]
d2[GPU-based backend]
end
```
#### Diagram Notes
- **User request**: A GitLab Duo Feature is accessed from one of three possible starting points (Web UI, IDE or Git CLI). The IDE communicates directly with the AI Gateway.
- **LLM Serving Config**: The existence of a customer-hosted model along with its connectivity information is declared in GitLab Rails and exposed to the AI Gateway with an API.
- **GitLab Duo Feature Configuration**: For each supported GitLab Duo feature, a user may select a supported model and the associated prompts are automatically loaded.
- **Prompt Retrieval**: GitLab Rails chooses and processes the correct prompt(s) based on the GitLab Duo Feature and model being used.
- **Model Routing**: The AI Gateway routes the request to the correct external AI endpoint. The current default for GitLab Duo features is either Vertex or Anthropic. If a Self-Managed model is used, the AI Gateway must route to the correct customer-hosted model's endpoint. The customer-hosted model server details are the `LLM Serving Config` and are retrieved from GitLab Rails via an API call. They may be cached in the AI Gateway.
- **Model API interface**: Each model serving has its own endpoint signature. The AI Gateway needs to be able to communicate using the right signature. We will support commonly supported model serving formats such as the OpenAI API spec.
### Configuration
Configuration is set at the GitLab instance level; for each GitLab Duo feature a drop-down list of options will be presented. The following options will be available:
- Self-hosted model 1
- Self-hosted model n
- Feature Inactive
In the initial implementation a single self-hosted model will be supported, but this will be expanded to a number of GitLab-defined models.
### AI Gateway Deployment
Customers will be required to deploy a local instance of the AI Gateway in their own infrastructure. The initial means to do this is via Docker container, as described [in this issue](https://gitlab.com/gitlab-org/gitlab/-/issues/452489).
Self-hosted Runway will be the preferred delivery mechanism for deploying the AI Gateway. Future options, in order of preference are:
- Runway [discussion](https://gitlab.com/gitlab-com/gl-security/security-assurance/fedramp/fedramp-certification/-/issues/452#note_1832261170)
- Kubernetes deployment [issue](https://gitlab.com/gitlab-org/gitlab/-/issues/452490)
- Omnibus packaging [issue](https://gitlab.com/gitlab-org/omnibus-gitlab/-/issues/8467)
It should be noted that deployment by Docker container is a temporary measure only, and will be superseded by the three options listed above.
### Prompt Support
For each supported model and supported GitLab Duo feature, prompts will be developed and evaluated by GitLab. They will be baked into the Rails Monolith source code.
When the standard prompts are migrated into either the AI Gateway or a prompt template repository (direction is to be determined), the prompts supporting self-hosted models will also be migrated.
### Supported LLMs
- [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [Mixtral-8x7B-instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [Mixtral 8x22B](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1)
- [CodeGemma 7B IT](https://huggingface.co/google/codegemma-7b-it)
- [CodeGemma 2B](https://huggingface.co/google/codegemma-2b)
Installation instructions will be added to the Developer documentation. [issue](https://gitlab.com/gitlab-org/gitlab/-/issues/452509)
### GitLab Duo Feature Support
| Feature | Default Model | Mistral 7B | Mixtral-8x7B | Mixtral 8x22B | CodeGemma 7B | CodeGemma 2B |
|---------------------|------------------------------|------------|--------------|----------------|--------------|--------------|
| GitLab Duo Chat | Anthropic Claude-3 | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Code Completion | Vertex AI Codey code-gecko | 🚫 | 🚫 | 🚫 | ✅ | ✅ |
| Code Generation | Anthropic Claude-3 | ✅ | ✅ | ✅ | 🔎 | 🔎 |
| Git Suggestions | Vertex AI Codey codechat-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Discussion Summary | Vertex AI Codey text-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Issue Description Generation | Anthropic Claude-2 | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Test Generation | Anthropic Claude-2 | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| MR template population | Vertex AI Codey text-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Suggested Reviewers | GitLab In-House Model | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Merge request summary | Vertex AI Codey text-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Code review summary | Vertex AI Codey text-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Vulnerability explanation | Vertex AI Codey text-bison Anthropic | 🚫 | 🔎 | 🔎 | 🔎 | 🔎 |
| Vulnerability resolution | Vertex AI Codey code-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Code explanation | Vertex AI Codey codechat-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Root cause analysis | Vertex AI Codey text-bison | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
| Value stream forecasting | GitLab In-House Model | 🔎 | 🔎 | 🔎 | 🔎 | 🔎 |
- The `Suggested Reviewers` and `Value stream forecasting` models are Convolutional Neural Networks (CNNs) developed in-house by GitLab.
- ✅ `Supported` : means the GitLab Duo feature is supported for this model
- 🚫 `Not Supported`: the GitLab Duo feature is not supported for this model
- 🔎 `To be Evaluated`: research is needed to determine if this feature will be supported
- GitLab Duo Chat can also use Vertex AI Codey textembedding-gecko
- Vulnerability explanation can fall back to Claude-2 if performance is degraded
#### RAG / Duo Chat tools
Most of the tools available to Duo Chat behave the same for self-hosted models as
they do in the GitLab-hosted AI Gateway architecture. Below are the exceptions:
##### Duo Documentation search
Duo documentation search performed through the GitLab-managed AI Gateway (`cloud.gitlab.com`) relies on [VertexAI Search](../gitlab_rag/index.md),
which is not available for air-gapped customers. As a replacement, and only for
self-hosted, air-gapped customers, an index of GitLab documentation is provided
within the self-hosted AI Gateway.
This index is an SQLite database that allows for full-text search. An index is
generated for each GitLab version and saved into a generic package registry. The index that matches the customer's GitLab version is then downloaded by the self-hosted AI Gateway.
Using a local index does bring some limitations:
- [BM25 search](https://en.wikipedia.org/wiki/Okapi_BM25) performs worse in the presence of typos, and the performance also depends on how the index was built
- Given that the indexed tokens depend on how the corpus was cleaned (stemming, tokenisation, punctuation), the same text cleaning steps need to be applied to the user query for it to properly match the indexes
- Local search diverges from other already implemented solutions, and creates a split between self-managed and GitLab-hosted instances of the AI Gateway.
Over time, we intend to replace this solution with a self-hosted Elasticsearch/OpenSearch
alternative, but as of now, the percentage of self-hosted customers that have
[Elasticsearch enabled is low](https://gitlab.com/gitlab-org/gitlab/-/issues/438178#current-adoption).
For further discussion, refer to the [proof of concept](https://gitlab.com/gitlab-org/modelops/applied-ml/code-suggestions/ai-assist/-/merge_requests/974).
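For illustration only, this is roughly the kind of BM25-ranked full-text query such an index enables. The sketch is in
Go for consistency with the other sketches in this document, and the `docs` table, its columns, and the index file name
are invented; the actual AI Gateway is written in Python and its schema may differ.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	// FTS5 support may require building with the sqlite_fts5 tag,
	// depending on the driver version.
	_ "github.com/mattn/go-sqlite3"
)

func main() {
	db, err := sql.Open("sqlite3", "gitlab-docs-17.1.sqlite") // hypothetical index file
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// bm25() is an FTS5 auxiliary function; lower values mean better matches,
	// so ordering by it ascending returns the most relevant documents first.
	rows, err := db.Query(
		`SELECT path, title FROM docs WHERE docs MATCH ? ORDER BY bm25(docs) LIMIT 5`,
		"runner autoscaling",
	)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	for rows.Next() {
		var path, title string
		if err := rows.Scan(&path, &title); err != nil {
			log.Fatal(err)
		}
		fmt.Println(path, title)
	}
}
```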
**Index creation**
Index creation and implementation is being worked on as part of [this epic](https://gitlab.com/groups/gitlab-org/-/epics/14282)
**Evaluation**
Evaluation of the local-search is being worked on as part of [this epic](https://gitlab.com/gitlab-org/gitlab/-/issues/468666).
#### LLM-hosting
Customers will self-manage LLM hosting. We provide limited documentation on how
customers can host their own [LLMs](../../../administration/self_hosted_models/install_infrastructure.md).
#### GitLab Duo License Management
The Self-Managed GitLab Rails will self-issue a token (same process as for .com) that the local AI Gateway can verify, to guarantee that cross-service communication is secure. [Details](https://gitlab.com/gitlab-org/gitlab/-/issues/444216)
### System Architectures
At this time, only a single system architecture is supported. See the Out of Scope section for discussion on alternatives.
#### Self-Managed GitLab with self-hosted AI Gateway
This system architecture supports both an internet-connected GitLab and AI Gateway, and can also be run in an air-gapped environment. Customers install a self-managed AI Gateway within their own infrastructure. The long-term vision for such installations is via Runway, but until that is available a Docker-based install will be supported.
Self-Managed customers who deploy a self-managed AI Gateway will only be able to access self-hosted models at this time. Work around [Bring Your Own Key](https://gitlab.com/groups/gitlab-org/-/epics/12973) may change that in the future.
### Development Environment
Engineering documentation will be produced on how to develop this feature, with work in progress on:
- [Include AI Gateway in GDK](https://gitlab.com/gitlab-org/gitlab-development-kit/-/issues/2025)
- [Developer setup for self-hosted models](https://gitlab.com/gitlab-org/gitlab/-/issues/452509)
- [Centralized Evaluation Framework](https://gitlab.com/gitlab-org/modelops/ai-model-validation-and-research/ai-evaluation/prompt-library/-/tree/main)
### Out of scope
- It would be possible to support customer self-hosted models within a customer's infrastructure for dedicated or .com customers, but this is not within scope at this time.
- Support for models other than those listed in the Supported LLMs section above.
- Support for modified models.
#### Out of scope System Architectures
There are no plans to support these system architectures at this time; this could change if there were sufficient customer demand.
##### Self-Managed GitLab with .com AI Gateway
In this out-of-scope architecture a self-managed customer continues to use the .com hosted AI gateway, but points back to self-managed models.
##### .com GitLab with .com AI Gateway
In this out-of-scope architecture .com customers point to self-managed models. This topology might be desired if there were better quality of results for a given feature by a specific model, or if customers could improve response latency by using their own model-serving infrastructure.
##### GitLab Dedicated
Support will not be provided for Dedicated customers to use a self-hosted AI Gateway and self-hosted models. Dedicated customers who use GitLab Duo features can access them via the .com AI Gateway. If there is customer demand for self-managed models for Dedicated customers, this can be considered in the future.
##### Externally hosted models
It is expected that customers will self-host models.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,87 +1,11 @@
---
status: ongoing
creation-date: "2024-01-29"
authors: [ "@jarv" ]
coach:
approvers: [ ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/'
remove_date: '2025-07-08'
---
# Disaster Recovery
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/).
This document is a work in progress and proposes architecture changes for the GitLab.com SaaS.
The goal of these changes is to maintain GitLab.com service continuity in the case of a regional or zonal outage.
- A **zonal recovery** is required when all resources are unavailable in one of the three availability zones in `us-east1` or `us-central1`.
- A **regional recovery** is required when all resources become unavailable in one of the regions critical to operation of GitLab.com, either `us-east1` or `us-central1`.
## Services not included in the current DR strategy for FY24 and FY25
We have limited the scope of DR to services that support primary services (Web, API, Git, Pages, Sidekiq, CI, and Registry).
These services tie directly into our overall [availability score](https://dashboards.gitlab.net/d/general-slas/general3a-slas?orgId=1) (internal link) for GitLab.com.
For example, DR does not include the following:
- AI services including code suggestions
- Error tracking and other observability services like tracing
- CustomersDot, responsible for billing and new subscriptions
- Advanced Search
## DR Implementation Targets
The FY24 targets were:
| | Recovery Time Objective (RTO) | Recovery Point Objective (RPO) |
|--------------|-------------------------------|--------------------------------|
| **Zonal** | 2 hours | 1 hour |
| **Regional** | 96 hours | 2 hours |
The FY25 targets before cell architecture are:
| | Recovery Time Objective (RTO) | Recovery Point Objective (RPO) |
|--------------|-------------------------------|--------------------------------|
| **Zonal** | 0 minutes | 0 minutes |
| **Regional** | 48 hours | 0 minutes |
**Note**: While the RPO values are targets, they cannot be met exactly due to the limitations of regional bucket replication and replication lag of Gitaly and PostgreSQL.
## Current Recovery Time Objective (RTO) and Recovery Point Objective (RPO) for Zonal Recovery
We have not yet simulated a full zonal outage on GitLab.com.
The following are RTO/RPO estimates based on what we have been able to test using the [disaster recovery runbook](https://gitlab.com/gitlab-com/runbooks/-/tree/master/docs/disaster-recovery?ref_type=heads).
It is assumed that each service can be restored in parallel.
A parallel restore is the only way we are able to meet the FY24 RTO target of 2 hours for a zonal recovery.
| Service | RTO | RPO |
| --- | --- | --- |
| PostgreSQL | 1.5 hr | <=5 min |
| Redis [^1] | 0 | 0 |
| Gitaly | 30 min | <=1 hr |
| CI | 30 min | not applicable |
| Load balancing (HAProxy) | 30 min | not applicable |
| Frontend services (Web, API, Git, Pages, Registry) [^2] | 15 min | 0 |
| Monitoring (Prometheus, Thanos, Grafana, Alerting) | 0 | not applicable |
| Operations (Deployments, runbooks, operational tooling, Chef) [^3] | 30 min | 4 hr |
| PackageCloud (distribution of packages for self-managed) | 0 | 0 |
## Current Recovery Time Objective (RTO) and Recovery Point Objective (RPO) for Regional Recovery
Regional recovery requires a complete rebuild of GitLab.com using backups that are stored in multi-region buckets.
The recovery has not yet been validated end-to-end, so we don't know how long the RTO is for a regional failure.
Our target RTO for FY25 is to have a procedure to recover from a regional outage in under 48 hours.
The following are considerations for choosing multi-region buckets over dual-region buckets:
- We operate out of a single region so multi-region storage is only used for disaster recovery.
- Although Google recommends dual-region for disaster recovery, dual-region is [not an available storage type for disk snapshots](https://cloud.google.com/compute/docs/disks/snapshots#selecting_a_storage_location).
- To mitigate the bandwidth limitation of multi-region buckets, we spread the Gitaly VM infrastructure across multiple projects.
## Proposals for Regional and Zonal Recovery
- [Regional](regional.md)
- [Zonal](zonal.md)
---
[^1]: Most of the Redis load is on the primary node, so losing replicas should not cause any service interruption
[^2]: We set maximum replica counts in our Kubernetes clusters servicing front-end traffic; this is done to avoid saturating downstream dependencies. For a zonal failure, a cluster reconfiguration is necessary to increase these maximums.
[^3]: There is a 4 hr RPO for Operations because Chef is a single point of failure in a single availability zone and our restore method uses disk snapshots, taken every 4 hours. While most of our Chef configuration is also stored in Git, some data (like node registrations) is only stored on the server.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,163 +1,11 @@
---
status: ongoing
creation-date: "2024-01-29"
authors: [ "@jarv" ]
coach:
approvers: [ ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/regional/'
remove_date: '2025-07-08'
---
# Regional Recovery
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/regional/).
## Improving the Recovery Time Objective (RTO) and Recovery Point Objective (RPO) for Regional Recovery
The following lists the top challenges that limit our ability to drive `RTO` to 48 hours for a regional recovery.
1. We have a large amount of legacy infrastructure managed using Chef. This configuration has been difficult for us to manage and would require a large amount of manual copying and duplication to create new infrastructure in an alternate region.
1. Operational infrastructure is located in a single region, `us-central1`. A regional failure in this region requires rebuilding the ops infrastructure with only local copies of runbooks and tooling scripts.
1. Observability is hosted in a single region.
1. The infrastructure (`dev.gitlab.org`) that builds Docker images and packages is located in a single region, and is a single point of failure.
1. There is no launch-pad that would allow us to get a head-start on a regional recovery. Our IaC (Infrastructure-as-Code) does not allow us to switch regions for provisioning.
1. We don't have confidence that Google can provide us with the capacity we need in a new region, specifically the large amount of SSD necessary to restore all of our customer Git data.
1. We use [Global DNS](https://cloud.google.com/compute/docs/internal-dns) for internal DNS, making it difficult to use multiple instances with the same name across multiple regions; we also don't incorporate regions into DNS names for our internal endpoints (for example dashboards, logs, etc).
1. If we deploy replicas in another region to reduce RPO we are not yet sure of the latency or cloud spend impacts.
1. We have special/negotiated Quota increases for Compute, Network, and API with the Google Cloud Platform only for a single region, we have to match these quotas in a new region, and keep them in sync.
1. We have not standardized a way to divert traffic at the edge from one region to another.
1. In monitoring and configuration, we have places where we hardcode the region to `us-east1`.
## Regional recovery work-streams
The first step of our regional recovery plan is creating new infrastructure in the recovery region, which involves a large number of manual steps.
To give us a head-start on recovery, we propose a "regional bulkhead" deployment in a new GCP region.
A "regional bulkhead" meets the following requirements:
1. A specific region is allocated.
1. Quotas are set and synced so that we can duplicate all of us-east1 in the new region.
1. Subnets are allocated or reserved in the same VPC that is used for us-east1.
1. Some infrastructure is deployed where it makes sense to lower RTO, while keeping cloud-spend low.
The following are work-streams that can be done mostly in parallel.
The end-goal of the regional recovery is to have a bulkhead that has the basic scaffolding for deployment in the alternate region.
This bulkhead can be used as a launching pad for a full data restore from `us-east1` to the alternate region.
### Select an alternate region
We are going with **`us-central1`**. Discussion for this was done in <https://gitlab.com/gitlab-com/gl-infra/production-engineering/-/issues/25094>
- Dependencies: none
- Teams: Ops
The following are considerations that need to be made when selecting an alternate region for DR:
1. Ensure there is enough capacity to meet compute usage.
1. Network and network latency requirements, if any.
1. Feature parity between regions.
### Deploy Kubernetes clusters supporting front-end services in a new region with deployments
- Dependencies: [External front-end load balancing](#external-front-end-load-balancing)
- Teams: Ops, Foundations, Delivery
GitLab.com has Web, API, Git, Git HTTPs, Git SSH, Pages, and Registry as front-end services.
All of these services are run in 4 Kubernetes clusters deployed in `us-east1`.
These services are either stateless or use multi-region storage buckets for data.
In the case of a failure in `us-east1`, we would need to rebuild these clusters in the alternate region and set them up for deployments.
### Switch from Global to Zonal DNS
- Dependencies: None
- Teams: Gitaly
Gitaly VMs are single points of failure that are deployed in `us-east1`.
The internal DNS naming of the nodes has the following convention:
```plaintext
gitaly-01-stor-gprd.c.gitlab-gitaly-gprd-ccb0.internal
^ name              ^ project
```
By switching to zonal DNS, we can change the internal DNS entries so they have the zone in the DNS name:
```plaintext
gitaly-01-stor-gprd.c.us-east1-b.gitlab-gitaly-gprd-ccb0.internal
^ name              ^ zone     ^ project
```
This allows us to keep the same name when recovering into a new region or zone. For example:
```plaintext
gitaly-01-stor-gprd.c.us-east1-b.gitlab-gitaly-gprd-ccb0.internal
gitaly-01-stor-gprd.c.us-east4-a.gitlab-gitaly-gprd-ccb0.internal
```
For fleets of VMs outside of Kubernetes, these names allow us to have the same node names in the recovery region.
### Gitaly
- Dependencies: [Switch from Global to Zonal DNS](#switch-from-global-to-zonal-dns) (optional, but desired)
- Teams: Gitaly, Ops, Foundations
Restoring the entire Gitaly fleet requires a large number of VMs deployed in the alternate region.
It also requires a lot of bandwidth because restore is based on disk snapshots.
To ensure a successful Gitaly restore, quotas need to be synced with us-east1 and there needs to be end-to-end validation.
### PostgreSQL
- Dependencies: [Improve Chef provisioning time by using preconfigured golden OS images](zonal.md#improve-chef-provisioning-time-by-using-preconfigured-golden-os-images) (optional, but desired), local backups in the standby region (data disk snapshot and `WAL` archiving).
- Teams: Database Reliability, Ops
The configuration for Patroni provisioning only allows a single region per cluster.
There is networking infrastructure, Consul, and load balancers that need to be set up in the alternate region.
We may consider setting up a "cascaded cluster" for the databases to improve recovery time for replication.
### Redis
- Dependencies: [Improve Chef provisioning time by using preconfigured golden OS images](zonal.md#improve-chef-provisioning-time-by-using-preconfigured-golden-os-images) (optional, but desired)
- Teams: Ops
To provision Redis, subnets need to be allocated in the alternate region, with end-to-end validation of the new deployments.
### External front-end load balancing
- Dependencies: HAProxy replacement, most likely [GKE Gateway and Istio](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/1157)
- Teams: Ops, Foundations
External front-end load balancing is necessary to validate the deployment in the alternate region.
This requires both external and internal LBs for all front-end services.
### Monitoring
- Dependencies: [Eliminate X% Chef dependencies in Infra by moving infra away from Chef](zonal.md#eliminate-x-chef-dependencies-in-infra-by-moving-infra-away-from-chef) (migrate Prometheus infra to Kubernetes)
- Teams: Scalability:Observability, Ops, Foundations
Set up an alternate ops Kubernetes cluster in a different region that is scaled down to zero replicas.
### Runners
- Dependencies: [Improve Chef provisioning time by using preconfigured golden OS images](zonal.md#improve-chef-provisioning-time-by-using-preconfigured-golden-os-images) (optional, but desired)
- Teams: Scalability:Practices, Ops, Foundations
Ensure quotas are set in the alternate region and aligned with us-east1 for both runner managers and ephemeral VMs.
Set up and validate the networking configuration, including peering.
### Ops and Packaging
- Dependencies: [Create an HA Chef server configuration to avoid an outage for a single zone failure](zonal.md#create-an-ha-chef-server-configuration-to-avoid-an-outage-for-a-single-zone-failure)
- Teams: Scalability:Practices, Ops, Foundations, Distribution
All image creation and packaging is done on a single VM, and our operations tooling also runs on a single VM.
Both of these are single points of failure that store data locally.
In the case of a regional outage, we would need to rebuild them from snapshots and would lose about 4 hours of data.
The following are options to mitigate this risk:
- Move our packaging jobs to `ops.gitlab.net` so we eliminate `dev.gitlab.org` as a single point of failure.
- Use the Geo feature for `ops.gitlab.net`.
### Regional Recovery Gameday
- Dependencies: Recovery improvements
- Teams: Ops
Following the improvements for regional recovery, a Gameday needs to be executed for end-to-end testing of the procedure.
Once validated, it can be added to our existing [disaster recovery runbook](https://gitlab.com/gitlab-com/runbooks/-/tree/master/docs/disaster-recovery?ref_type=heads).
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,121 +1,11 @@
---
status: ongoing
creation-date: "2024-01-29"
authors: [ "@jarv" ]
coach:
approvers: [ ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/zonal/'
remove_date: '2025-07-08'
---
# Zonal Recovery
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/disaster_recovery/zonal/).
## Improving the Recovery Time Objective (RTO) and Recovery Point Objective (RPO) for Zonal Recovery
The following represent our current DR challenges and are candidates for problems that we should address in this architecture blueprint.
1. Postgres replicas run close to capacity and are scaled manually. New
instances must go through Terraform CI pipelines and Chef configuration.
Over-provisioning to absorb a zone failure would add significant cloud-spend
(see proposal section at the end of the document for details).
1. HAProxy (load balancing) is scaled manually and must go through Terraform CI
pipelines and Chef configuration.
1. CI runner managers are present in 2 availability zones and scaled close to
capacity. New instances must go through Terraform CI pipelines and Chef
configuration.
1. In a zone there are saturation limits, like the number of replicas that need
to be manually adjusted if load is shifted away from a failed availability
zone.
1. Gitaly `RPO` is limited by the frequency of disk snapshots, `RTO` is limited
by the time it takes to provision and configure through Terraform CI
pipelines and Chef configuration.
1. Monitoring infrastructure that collects metrics from Chef managed VMs is
redundant across 2 availability zones and scaled manually. New instances must
go through Terraform CI pipelines and Chef configuration.
1. The Chef server which is responsible for all configuration of Chef managed
VMs is a single point of failure located in `us-central1`. It has a local
Postgres database and files on local disk.
1. The infrastructure (`dev.gitlab.org`) that builds Docker images and packages
is located in a single region, and is a single point of failure.
## Zonal recovery work-streams
Zonal recovery improvements focus on reducing the time it takes to provision fleets that do not automatically scale.
There is already work in progress to completely eliminate statically allocated VMs like HAProxy.
Additionally, efforts can be made to shorten launch and configuration times for fleets that cannot automatically scale, like Gitaly, PostgreSQL, and Redis.
### Over-provision to absorb a single zone failure
- Dependencies: None
- Teams: Ops, Scalability:Practices, Database Reliability
All of our Chef-managed VM fleets run close to capacity and require manual scaling and provisioning using Terraform/Chef.
In the case of a zonal outage, it is necessary to provision more servers through Terraform, which adds to our recovery time objective.
One way to avoid this is to over-provision so we have a full zone's worth of extra capacity.
1. Patroni Main (`n2-highmem-128` 6.5k/month): 3 additional nodes for +20k/month
1. Patroni CI (`n2-highmem-96` 5k/month): 3 additional nodes for +15k/month
1. HAProxy (`t2d-standard-8` 285/month): 20 additional nodes for +5k/month
1. CI Runner managers (`c2-standard-30` 1.3k/month): 60 additional nodes for +78k/month
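Taken together, these estimates add roughly 118k/month (20k + 15k + 5k + 78k) in extra cloud spend to hold a full zone's worth of spare capacity.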
The Kubernetes Horizontal Pod Autoscaler (`HPA`) has a maximum number of pods configured for front-end services.
It is configured to protect downstream dependencies like the database from saturation due to scaling events.
If we allow a zone to scale up rapidly, these limits need to be adjusted or re-evaluated in the context of disaster recovery.
### Remove HAProxy as a load balancing layer
- Dependencies: None
- Teams: Foundations
HAProxy is a fleet of Chef managed VMs that are statically allocated across 3 AZs in `us-east1`.
In the case of a zonal outage we would need to rapidly scale this fleet, adding to our RTO.
In FY24Q4 the Foundations team started working on a proof-of-concept to use
[Istio in non-prod environments](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/1157).
We anticipate in FY25 to have a replacement for HAProxy using Istio and
[GKE Gateway](https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api).
Completing this work reduces the impact on our load balancing layer during zonal outages,
as it eliminates the need to manually scale the HAProxy fleet.
Additionally, we spend around 17k/month on HAProxy nodes, so there may be a
cloud-spend reduction if we are able to reduce this footprint.
### Create an HA Chef server configuration to avoid an outage for a single zone failure
- Dependencies: None
- Teams: Ops
Chef is responsible for configuring VMs that have workloads outside of Kubernetes.
It is a single point of failure that resides in `us-central1-b`.
Data is persisted locally on disk, and we have not yet investigated moving it to a highly available setup.
In the case of a zonal outage of `us-central1-b` the server would need to be rebuilt from snapshot, losing up to 4 hours of data.
### Create an HA Packaging server configuration to avoid an outage for a single zone failure
- Dependencies: None
- Teams: Ops
In the case of a zonal outage of `us-east1-c` the server (like `dev.gitlab.org`) would need to be rebuilt from snapshot, losing up to 4 hours of data.
The additional challenge of this host is that it is a GitLab CE instance, so we would be limited in features.
The best approach here would likely be to move packaging CI pipelines to `ops.gitlab.net`.
### Improve Chef provisioning time by using preconfigured golden OS images
- Dependencies: None
- Teams: Ops
For the [Gitaly fleet upgrade in 2022](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/601) a scheduled CI pipeline was created to build golden OS images.
We can revive this work and start generating images for Gitaly and other VMs to shorten configuration time.
We estimate that using a prebuilt image can reduce our recovery time by about 15 minutes, improving RTO for zonal failures.
### Eliminate X% Chef dependencies in Infra by moving infra away from Chef
- Dependencies: None
- Teams: Ops, Scalability:Observability, Scalability:Practices
Gitaly, Postgres, CI runner managers, HAProxy, Bastion, CustomersDot, Deploy, DB Lab, Prometheus, Redis, SD Exporter, and Console servers are managed by Chef.
To help improve the speed of recoveries, we can move this infrastructure into Kubernetes or Ansible for configuration management.
### Write-ahead-log for Gitaly snapshot restores
- Dependencies: None
- Teams: Gitaly
There is [work planned in FY25Q1](https://gitlab.com/gitlab-com/gitlab-OKRs/-/work_items/5710) that adds a transaction log for Gitaly to reduce RPO.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 95 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

View File

@ -1,685 +1,11 @@
---
status: ongoing
creation-date: "2024-05-17"
authors: [ "@DylanGriffith", "@mikolaj_wawrzyniak"]
coach:
approvers: [ ]
owning-stage: "~devops::create"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/duo_workflow/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/duo_workflow/).
# GitLab Duo Workflow
## Execution Environment
### Executive Summary
The functionality to support Duo Workflow needs to be able to execute arbitrary code,
which effectively means "untrusted" code. This means that workflows cannot just run
like any other service we deploy; specifically, they cannot run inside
the Duo Workflow Service or AI Gateway.
To address this, the Duo Workflow functionality will be composed of 2
separate components:
1. The Duo Workflow Service, which is a Python service we run in our
infrastructure. The Workflow Service is built on top of
[LangGraph](https://github.com/langchain-ai/langgraph).
1. The Duo Workflow Executor, which is a Go binary that communicates via a long
running gRPC connection to the Duo Workflow Service and executes the arbitrary
commands. It will be possible for users to run this locally or in CI pipelines.
In our first release we will support 2 execution modes:
1. Local Executor: runs commands and edits files locally in a
sandboxed Docker container on the developer machine. Developers will be able to
see the files being edited live, and it will be interactive
1. CI Executor: All non-local use-cases of Duo Workflow (for example:
issue/epic based workflows) will be triggered by the GitLab UI and will
create a CI Pipeline to run the Duo Workflow Executor
Our architecture will also support mixed deployments for self-managed such that
some features of Duo Workflow will be available using a cloud-hosted AI
Gateway.
### Detailed plan
We plan on building this feature set with 3 independent components that can be
run in multiple runtimes:
1. The Duo Workflow Web UI. This will be a web UI built into GitLab that manages the
creation and interaction of all workflows. There may be many interaction
points in the GitLab application but there should be a central workflow UI
with reusable components (e.g. Vue components) that could be embedded into
our editor extensions
1. The Duo Workflow Service. This is a Python-based service we deploy with
a gRPC API. The only interface to this is the gRPC interface, which is
called from the Duo Workflow Executor. Internally, this will use LangGraph to
execute the workflows. For reasons why LangGraph was chosen, see [this work item](https://gitlab.com/gitlab-org/gitlab/-/work_items/457958).
The Workflow Service will not have any persisted state but the state of
running workflows will be kept in memory and periodically checkpointed in
GitLab. The Workflow Service is built [in its own codebase](https://gitlab.com/gitlab-org/duo-workflow/duo-workflow-service/)
and will have its own deployment but the codebase
[may be merged with the AI Gateway codebase in the future](https://gitlab.com/gitlab-org/modelops/applied-ml/code-suggestions/ai-assist/-/issues/527)
1. The [Duo Workflow Executor](https://gitlab.com/gitlab-org/duo-workflow/duo-workflow-executor).
This is being written in Go for easy installation
in development containers. This component will run in CI jobs or on a user's
local workstation. In the local workstation it will run sandboxed in a
Docker container with the working directory optionally mounted by the
user for a live pairing experience. It will only be responsible for opening
a gRPC connection to Duo Workflow Service and executing the commands it is
told to.
The following are important constraints of the architecture:
1. All state management for workflows will be inside GitLab.
1. Duo Workflow Service is expected to periodically checkpoint its state in GitLab
1. Duo Workflow Service in-memory state can be dropped/lost at any time so
checkpointing will be the only guaranteed point that can be returned to
1. If a local Duo Workflow Executor drops the connection, the Duo Workflow
Service will checkpoint the state and shut down as soon as it reaches a
point where it is waiting on the executor
1. In order to avoid multiple Duo Workflow Service instances running on the
same workflow, the Duo Workflow Service will always acquire a lock with
GitLab before it starts running. When it suspends, it will release the lock and
similarly there will be a timeout state if it has not checkpointed in the
last 60 seconds. GitLab will not accept checkpoints from a timed out run of
the Duo Workflow Service.
1. Each time a Duo Workflow Service resumes a workflow it gets a new ID and
this is sent when checkpointing so that GitLab can drop/ignore zombie
services running the workflow and inform the zombie service to shutdown.
1. Code is checkpointed by the executor pushing hidden Git refs to the GitLab
instance. This will be happening on the same frequency as other checkpoints.
1. For local execution, Duo Workflows are initiated by the Duo Workflow
Executor directly calling the Duo Workflow Service
1. For workflows triggered via the UI that don't require a Duo Workflow
Executor, GitLab can call the Duo Workflow Service directly
1. All API calls from Duo Workflow Service to GitLab that access private data
or update data will be authenticated on behalf of the user that created the
workflow. Duo Workflow Service should not need privileged access to GitLab.
CI pipelines have been chosen as the hosted runtime option for the Duo Workflow
Executor because they are the only infrastructure we have available today to run
untrusted customer workloads with stability, support, security, abuse
prevention, and a billing model. In the short term, for early customers, we may
rely on the existing compute minutes for CI pipelines, but in the long run we
may want to deploy dedicated runners and introduce a billing model specific to
Duo Workflow.
For many development use cases we expect developers may prefer to run Duo
Workflow Executor locally as it can operate on a locally mounted directory and
allow the user to more easily watch changes as they happen.
### GitLab.com architecture
<img src="diagrams/duo-workflow-architecture-gitlab-com.png" height="600" alt="">
1. Initially we focus on running locally and in CI pipelines with all inputs as
environment variables
1. State stored in GitLab so it can be accessed from the web UI and through IDE
extensions
#### With Local (IDE) execution
```mermaid
sequenceDiagram
participant user as User
participant ide as IDE
participant executor as Duo Workflow Executor
participant gitlab_rails as GitLab Rails
box AI-gateway service
participant duo_workflow_service as Duo Workflow Service
participant ai_gateway as AI Gateway
end
participant llm_provider as LLM Provider
ide->>gitlab_rails: Request AI Gateway JWT using OAuth token or PAT
ide->>executor: start executor with JWT
user->>ide: trigger workflow from IDE
executor->>+duo_workflow_service: Solve this issue (open grpc connection auth'd with AI Gateway JWT)
duo_workflow_service->>gitlab_rails: Request ai_workflow scoped OAuth token using AI Gateway JWT
duo_workflow_service->>gitlab_rails: Create the workflow (auth'd with ai_workflow OAuth token)
duo_workflow_service->>llm_provider: Ask LLM what to do
llm_provider->>duo_workflow_service: Run rails new my_new_app
duo_workflow_service->>executor: execute `rails new my_new_app`
executor->>duo_workflow_service: result `rails new my_new_app`
duo_workflow_service->>gitlab_rails: Save checkpoint
duo_workflow_service->>llm_provider: What's next?
llm_provider->>duo_workflow_service: You're finished
duo_workflow_service->>gitlab_rails: Save checkpoint and mark completed
duo_workflow_service->>gitlab_rails: Revoke ai_workflow scoped OAuth token
deactivate duo_workflow_service
gitlab_rails->>user: Workflow done!
```
#### With Remote (CI pipeline) execution
```mermaid
sequenceDiagram
participant user as User
participant gitlab_rails as GitLab Rails
box CI-Runner
participant executor as Duo Workflow Executor
end
box AI-gateway service
participant duo_workflow_service as Duo Workflow Service
participant ai_gateway as AI Gateway
end
participant llm_provider as LLM Provider
user->>gitlab_rails: trigger workflow from Web UI
gitlab_rails->>executor: start executor (sends AI Gateway JWT with request)
executor->>+duo_workflow_service: Solve this issue (open grpc connection auth'd with AI Gateway JWT)
duo_workflow_service->>gitlab_rails: Request ai_workflow scoped OAuth token using AI Gateway JWT
duo_workflow_service->>gitlab_rails: Create the workflow (auth'd with ai_workflow OAuth token)
duo_workflow_service->>llm_provider: Ask LLM what to do
llm_provider->>duo_workflow_service: Run rails new my_new_app
duo_workflow_service->>executor: execute `rails new my_new_app`
executor->>duo_workflow_service: result `rails new my_new_app`
duo_workflow_service->>gitlab_rails: Save checkpoint
duo_workflow_service->>llm_provider: What's next?
llm_provider->>duo_workflow_service: You're finished
duo_workflow_service->>gitlab_rails: Save checkpoint and mark completed
duo_workflow_service->>gitlab_rails: Revoke ai_workflow scoped OAuth token
deactivate duo_workflow_service
gitlab_rails->>user: Workflow done!
```
### Self-managed architecture
#### With local Duo Workflow Service
When customers are running the Duo Workflow Service locally, the architecture will be very
similar to GitLab.com. This will also allow them to use whatever custom
models they configure in their Duo Workflow Service.
<img src="diagrams/duo-workflow-architecture-self-managed-full.png" height="600" alt="">
#### With cloud Duo Workflow Service
In order to allow self-managed customers to trial and rapidly adopt Duo
Workflow without running all Duo Workflow Service components, this architecture will
support a mixed deployment mode. In this case we assume that the cloud AI
Gateway will not have access to the customer's GitLab instance, but we can make
use of the local executor (on the user's machine or in a CI runner) to proxy
all interactions with GitLab.
<img src="diagrams/duo-workflow-architecture-self-managed-mixed.png" height="600" alt="">
#### Running without the Executor
As described above there are 2 reasons we need the Duo Workflow Executor:
1. We need a sandboxed environment to safely execute arbitrary commands
generated by the AI
1. We need a path to proxy requests to self-managed GitLab instances that may
not be visible to our cloud Duo Workflow Service
But we will have a subset of use cases for Duo Workflow where these conditions
will not apply. Specifically we expect to have "non-code" workflows (e.g.
review this merge request) where we just need to interact between LLM and
GitLab APIs. And if the customer is using GitLab.com or a self-hosted AI
Gateway that has access to their GitLab instance then we can safely run all of
this inside the Duo Workflow Service making API calls to the GitLab instance.
This architecture unlocks a considerable advantage: the cost of running
workloads in CI pipelines is quite high (and indeed wasteful), the effort to get
a local Executor running is also significant, and either would limit the
value of simple non-code workflows if it were a requirement. Additionally,
we get significant scaling advantages if we never have to create a CI
pipeline for these non-code workflows, as starting up a pipeline and
keeping it running for the duration of a workflow is a large overhead.
<img src="diagrams/duo-workflow-without-executor.png" height="600" alt="">
We may choose to support this architecture later but it will depend on the
following design decisions:
1. The workflow should know how to call GitLab directly for several API calls
and especially checkpointing
1. Proxying to GitLab via the Executor should be designed as an optional proxy
where the Duo Workflow Service constructs the full HTTP request, but under
certain configurations, will choose to pass the HTTP request to the executor
instead of calling GitLab directly
1. The Duo Workflow Executor is optional for workflows. The workflow should
only suspend when the workflow depends on an executor and one is not
present. When the workflow suspends it should know to checkpoint in GitLab
before shutting down.
1. The Duo Workflow Service will need to acquire some kind of "lease" from the
GitLab instance while it is running, to prevent the possibility of 2
instances of the same workflow running concurrently and to prevent a crashed
workflow from being indefinitely locked up and unable to be resumed. Leases
should time out, and API calls to checkpoint (or perform other operations) may
be rejected if they come from an instance of the workflow with an expired
lease (see the sketch after this list).
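A minimal sketch of what the lease handling could look like on the Duo Workflow Service side, assuming a hypothetical `gitlab` client that wraps the relevant API calls (the method names, the 60-second window, and the run ID handling mirror the constraints above but are otherwise illustrative):

```python
# Sketch: resume a workflow under a lease, checkpointing as it goes.
# `gitlab` is a hypothetical API client; `steps` is any iterable yielding
# serializable state snapshots (for example from a LangGraph run).
import time
import uuid

LEASE_TIMEOUT_SECONDS = 60  # checkpoints from runs silent longer than this may be rejected


def run_workflow(gitlab, workflow_id: str, steps) -> None:
    run_id = uuid.uuid4().hex                  # each resume gets a new ID
    gitlab.acquire_lease(workflow_id, run_id)  # refused if another live run holds the lease

    last_checkpoint = time.monotonic()
    for state in steps:
        elapsed = time.monotonic() - last_checkpoint
        if elapsed > LEASE_TIMEOUT_SECONDS:
            # GitLab may already treat this run as a zombie; stop rather than fight it.
            raise RuntimeError(f"lease timed out after {elapsed:.0f}s without a checkpoint")
        gitlab.post_checkpoint(workflow_id, run_id, state)  # rejected if run_id is stale
        last_checkpoint = time.monotonic()

    gitlab.release_lease(workflow_id, run_id)
```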
### Data flow
The diagram below shows what happens when the user is triggering workflows from
their IDE using a local executor. The architecture will be similar when
triggering from the GitLab UI using CI pipelines, except that GitLab will start
a CI pipeline to run the Duo Workflow Executor and create the workflow.
```mermaid
sequenceDiagram
participant user as User
participant ide as IDE
participant executor as Duo Workflow Executor
participant gitlab_rails as GitLab Rails
participant duo_workflow_service as Duo Workflow Service
participant llm_provider as LLM Provider
user->>ide: trigger workflow from IDE
ide->>executor: start executor
executor->>+duo_workflow_service: Solve this issue
duo_workflow_service->>gitlab_rails: Create the workflow
duo_workflow_service->>llm_provider: Ask LLM what to do
llm_provider->>duo_workflow_service: Need the file list
duo_workflow_service->>executor: execute `ls`
duo_workflow_service->>gitlab_rails: Save checkpoint
executor->>duo_workflow_service: result `ls`
duo_workflow_service->>llm_provider: What's next?
llm_provider->>duo_workflow_service: Here's a patch
duo_workflow_service->>executor: execute `git apply`
duo_workflow_service->>gitlab_rails: Save checkpoint
duo_workflow_service->>executor: execute `poetry run pytest`
duo_workflow_service->>gitlab_rails: Save checkpoint
executor->>duo_workflow_service: result `poetry run pytest`
duo_workflow_service->>llm_provider: fix the tests
llm_provider->>duo_workflow_service: Here's a patch
duo_workflow_service->>executor: execute `git apply`
duo_workflow_service->>gitlab_rails: Save checkpoint
duo_workflow_service->>executor: execute `poetry run pytest`
executor->>duo_workflow_service: result `poetry run pytest`
duo_workflow_service->>executor: Next step?
executor->>gitlab_rails: Check in & Next step?
gitlab_rails->>executor: Last step!
executor->>duo_workflow_service: Done!
deactivate duo_workflow_service
gitlab_rails->>user: Workflow done!
```
### CI Pipeline architecture
We don't want users to have to configure a specific `.gitlab-ci.yml` in order
to support Duo Workflow. In order to avoid this we'll use the same approach as
[that used by DAST site validations](https://gitlab.com/gitlab-org/gitlab/-/blob/19e0669446f55bd29a8df29174d3b0379b8e22c2/ee/app/services/app_sec/dast/site_validations/runner_service.rb#L11)
which dynamically constructs a pipeline configuration in GitLab and triggers
the pipeline without using any `.gitlab-ci.yml`.
CI pipelines also must be run inside a project. There will be some use cases of
Duo Workflow where there is no appropriate project in which to run the pipeline
(e.g. bootstrapping a new project). For these workflows we will:
1. Initially require the user to have a default Workflow project created. It
can just be any empty project and we'll automatically run the pipeline there.
1. If this proves to be too much setup, we'll automate the creation of a default
Duo Workflow project for the user
1. If the UX is poor over time, we might abstract the user away from the
existence of the project altogether and make this an implementation detail.
This will be considered a last resort because it could be a change with wide
impact on GitLab, as projects are a central part of GitLab.
#### Considerations for CI Runners and Infrastructure
1. Our Duo Workflow rollout may involve substantial increases to our CI runner
usage
1. Duo Workflow will likely involve running long-running CI pipelines that use
very little CPU. Mostly what they will be doing is communicating back
and forth with the LLMs and users over a long-running gRPC connection.
1. Users will expect very low latency for CI Runner startup
1. We should determine if there are ways to have preloaded VMs with our
Docker images running, ready to start a pipeline when a
workflow is triggered
1. We likely want a set of CI Runners that are just for Duo Workflow. This may
mean enabling the runners for a subset of customers or just using appropriate
job labeling/runner matching to only use these runners for Duo Workflow
1. It might be possible to roll out some Duo Workflow features on our existing
runner fleets but we believe there will be enough benefits to invest in
segregating these runners.
### State checkpointing
The Duo Workflow state will be persisted in GitLab-Rails as the Duo Workflow
Service works. There are 2 components to state:
1. The state object being managed by LangGraph. This includes all prompt history
between user and agents and any other metadata created by the LangGraph
graph
1. The working directory where the agent is writing code.
We will have data retention limits on all state. We will use PostgreSQL
partitioning to drop old workflow data after some time, and we will also
drop old Git refs after some time.
We will persist the LangGraph state object to PostgreSQL through APIs in GitLab
as the workflow progresses. The API will use similar LangGraph
conventions to identify all checkpoints with a `thread_ts`, as implemented in
the POC <https://gitlab.com/gitlab-org/gitlab/-/merge_requests/153551>.
For the current working directory, which contains the code the agent has written
so far, we will store the state by pushing hidden Git refs to GitLab for each
checkpoint. Each checkpoint will have an associated ref, and a checkpoint naming
convention (or something stored in PostgreSQL) will allow us to identify the
appropriate Git ref for the state checkpoint.
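As a sketch of what pushing such a checkpoint could look like from the executor side (the `refs/duo_workflow/...` namespace and the use of `thread_ts` in the ref name are illustrative assumptions, not a settled convention):

```python
# Sketch: snapshot the working tree and push it to a hidden checkpoint ref.
# The ref namespace below is an assumption for illustration only.
import subprocess


def push_checkpoint_ref(remote: str, workflow_id: str, thread_ts: str) -> str:
    """Commit the current working tree and push it to a hidden ref on GitLab."""
    subprocess.run(["git", "add", "--all"], check=True)
    subprocess.run(
        ["git", "commit", "--allow-empty", "-m", f"Duo Workflow checkpoint {thread_ts}"],
        check=True,
    )
    ref = f"refs/duo_workflow/{workflow_id}/checkpoints/{thread_ts}"
    subprocess.run(["git", "push", remote, f"HEAD:{ref}"], check=True)
    return ref
```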
Storing in Git has the advantage that we don't need to build any new API for
storing artifacts and it's very easy for the user to access the code by just
checking out that SHA. It also has huge storage savings where a workflow is
working on an existing large project. Ultimately we expect code changes to end up
being pushed to Git anyway, so this is the simplest solution.
Some Duo Workflows do not have an existing project (e.g. bootstrapping a
project). Even those workflows will need to be triggered from some project (as
explained in the section about CI pipelines). As such, we can use the workflow
project as a temporary repository to store the snapshots of code generated by
the workflow.
Consideration should also be made to clean up Git refs over time after some
workflow expiration period.
### Authentication
Duo Workflow requires several authentication flows.
In this section, each connection that requires authentication is listed and the
authentication mechanism is discussed.
<img src="diagrams/duo_workflow_auth.png" height="600" alt="">
#### Local Duo Workflow Executor -> Duo Workflow Service
When a Duo Workflow starts, the Duo Workflow Executor must connect to the Duo Workflow Service.
To authenticate this connection:
1. The IDE will use the OAuth token or Personal Access Token (PAT) that the user
generated while setting up the GitLab editor extension.
1. The IDE uses that token to authenticate a request to a GitLab Rails API
endpoint to obtain a short-lived user- and system-scoped JWT.
1. When the GitLab Rails instance receives this request, it loads its
instance-scoped JWT (synced daily from CustomersDot) and contacts the AI
gateway to swap this instance token for the above-mentioned user-scoped token
(also cryptographically signed)
1. GitLab Rails returns this JWT to the IDE.
1. The IDE passes on this JWT to the local Duo Workflow Executor component.
1. The Duo Workflow Executor uses this JWT to authenticate the Duo Workflow
Service gRPC connection.
This flow mimics the
[token flow that allows IDEs to connect direct to the AI Gateway](https://gitlab.com/groups/gitlab-org/-/epics/13252).
#### CI Duo Workflow Executor -> Duo Workflow Service
When a Duo Workflow is executed by a CI Runner, the Duo Workflow Executor must
connect to the Duo Workflow Service.
A CI Pipeline is created by GitLab, so there is no need to query a GitLab Rails
API endpoint to obtain a short-lived user- and system-scoped JWT. Instead, in
the process of creating the CI pipeline, GitLab Rails will:
1. Generate the user-scoped JWT.
1. Inject the JWT as an environment variable (for example: `DUO_WORKFLOW_TOKEN`)
in the CI pipeline.
1. The Duo Workflow Executor running inside the CI job uses this environment
variable value to authenticate the Duo Workflow Service gRPC connection.
#### Duo Workflow Service -> GitLab Rails API
Reasons that the Duo Workflow Service must be able to authenticate requests to
the GitLab Rails API:
1. The Duo Workflow Service will need to periodically make requests to GitLab Rails
to sync workflow state. This means that the Duo Workflow Service must be able
to authenticate these requests.
1. Duo Workflow may need to make other GitLab Rails API queries to gather
context. For example, a Duo Workflow for "solve issue with code" would
require an API request to retrieve the issue content.
1. The end state of a Duo Workflow may take the form of a generated artifact
(for example, Git commit or pull request) on the GitLab platform. To
generate this artifact, the Duo Workflow Service must be able to make API
requests to GitLab Rails.
Requirements for the token used to authenticate requests from the Duo Workflow Service to
the GitLab Rails API:
1. Any artifacts created by a Duo Workflow must be auditable in order
to maintain transparency about AI-generated activities on the GitLab platform.
1. The token's access level must match the access level of the user who
initiated the Workflow to ensure that there is no privilege escalation.
1. We must have the ability to block read/write for all resources that belong to
instances/projects/groups with `duo_features_enabled` set to false.
1. Token must be valid for as long as it takes an agent to execute or be
refreshable by the Duo Workflow Service. Workflow execution may take several hours.
The JWT that the Workflow Executor uses to authenticate to the Duo Workflow
Service could potentially be adapted to also work for this use-case but has some problems:
1. Need to update GitLab Rails to accept this type of token for API authentication.
1. JWTs are not revocable; what if we need to cut off an agent's access?
1. Need to build token rotation. How would the Duo Workflow Service authenticate an API
request to generate a new token if the old JWT is already expired?
For these reasons, OAuth is a better protocol for this use-case. OAuth tokens:
1. Are only valid for 2 hours.
1. Can be revoked.
1. Have a built-in refresh flow.
1. Are an established authentication pattern for federating access between
services.
To use OAuth, we will:
1. Create a new token scope called `ai_workflows`
([related issue](https://gitlab.com/gitlab-org/gitlab/-/issues/467160))
1. When the IDE requests the Duo Workflow Service User JWT from GitLab Rails, we
will also generate and return an OAuth token with the `ai_workflows` scope.
1. The Duo Workflow Executor will send that OAuth token, along with the `base_url`
of the GitLab Rails instance, as metadata to the Duo Workflow Service when
the gRPC connection is opened (see the sketch after this list).
1. The Duo Workflow Service will use the OAuth token for any GitLab Rails API
Requests to read or write data for a Workflow.
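For illustration, the credentials could travel as gRPC metadata roughly like the sketch below. The real executor is written in Go; this Python version only shows the shape of the data, and the metadata key names are assumptions rather than the actual interface:

```python
# Sketch: open a channel to the Duo Workflow Service and prepare call metadata.
# In a CI job the JWT would arrive via an environment variable such as
# DUO_WORKFLOW_TOKEN; locally it comes from the IDE. Key names are illustrative.
import grpc


def open_workflow_channel(service_addr: str, jwt: str, oauth_token: str, base_url: str):
    channel = grpc.secure_channel(service_addr, grpc.ssl_channel_credentials())
    metadata = (
        ("authorization", f"Bearer {jwt}"),     # AI Gateway JWT authenticating the connection
        ("x-gitlab-oauth-token", oauth_token),  # ai_workflows-scoped OAuth token for call-backs
        ("x-gitlab-base-url", base_url),        # GitLab instance the service should call
    )
    return channel, metadata  # pass `metadata=` on each stub call
```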
### Options we've considered and pros/cons
#### Delegate only unsafe execution to local/CI pipelines
This was the option we chose. It attempts to keep as much of the functionality
as possible in services we run while delegating the unsafe execution to Duo
Workflow Executor which can run locally or in CI pipelines.
**Pros**:
1. Running the infrastructure ourselves gives us more control over the versions
being rolled out
1. There are fewer dependencies the user needs to install for local usage
1. It offers a rapid onboarding experience for self-managed customers to try
Duo Workflow without deploying any new GitLab components
**Cons**
1. We need to deploy and maintain new infrastructure which has different
scaling characteristics from other services we run, due to long-running
execution
#### Run it locally
**Pros**:
1. This keeps developers in their local environment where most of them work
1. Compute is absorbed by the local developer so they don't have to worry about
being billed per minute
1. Low latency for user interaction especially where the user needs to
review/edit code while the agent is working
**Cons**:
1. There are more risks running it locally unless you have an isolated
development environment as commands have full access to your computer. This
can be mitigated by UX that limits what commands the agent can run without
user confirmation.
1. This approach will require some local developer setup and may not be suited
to tasks that users are expecting to kick off from the web UI (e.g.
issue/epic planning)
#### CI pipelines (on CI runners)
See <https://gitlab.com/gitlab-org/gitlab/-/issues/457959> for a POC and investigation.
**Pros**:
1. CI pipelines are the only pre-configured infrastructure we have that can run untrusted workflows
1. We have an established billing model for CI minutes
**Cons**:
1. CI pipelines are slow to start up, which might mean that iteration and incremental AI development is slow if the pipelines need to be restarted after timing out while waiting for user input
1. CI minutes will need to be consumed while the agent is awaiting user input. This will likely require a timeout mechanism, and as such, if the user returns after a timeout, we'll need to start a new pipeline when they give input
1. CI pipelines run in a difficult-to-access environment (that is, you cannot SSH into it or introspect it live), and as such it may be difficult for users to interact with code that is being built out live in front of them
1. CI pipelines require there to be some project to run in. This is not likely something we can overcome, but we may be able to simplify the setup process by automatically creating a "workflow project" for your workflow pipelines to run in
1. When we implement non-code workflows (e.g. reviewing MRs) there is no need for an isolated compute environment but we'll still be forcing customers to use compute minutes. We've seen this is not a good experience in other cases like X-Ray reports
#### GitLab workspaces (remote development)
See <https://gitlab.com/gitlab-org/gitlab/-/issues/458339> for a POC and investigation.
**Pros**:
1. This has the fastest iteration cycle, as the agent is working locally in your development environment, can interact with you, and you can even see and edit the same files live alongside it
1. Customers can run it on their own infrastructure and this gives them control over efficient resource usage
**Cons**:
1. Today we only support customers bringing their own infrastructure (a Kubernetes cluster), which means the barrier to getting started is bringing your own cluster, and this is a fairly significant effort
1. If we wanted to build out infrastructure on GitLab.com to save customers having to bring their own Kubernetes cluster, this would be a fairly large effort from a security and infrastructure perspective. It's possible, but dealing with all the complexities of security, abuse, and billing would require many teams' involvement in both initial development and sustained maintenance.
## Security
### Threat modeling
See <https://gitlab.com/gitlab-com/gl-security/product-security/appsec/threat-models/-/issues/46>.
### Security considerations for local execution
Local execution presents the highest value opportunity for developers but also
comes with the greatest risk: a bug or mistake from an LLM could cause
significant harm to a user's local development environment or
compromise confidential information.
Some examples of risks:
1. An AI that can make honest but significant mistakes
1. An AI that might sometimes be adversarial
1. The AI gateway serving the LLM responses may be compromised which would then
allow shell access to all users of this tool
### Sandboxing Duo Workflow Executor
One proposal here to mitigate risks would be to use some form of sandboxing
where the Duo Workflow Executor is only able to run inside of an unprivileged
Docker container. Such a solution would need to:
1. Mount the local working directory into the container so it is still editing
the files the user is working on in the host
1. Install all development dependencies the user or agent would need to run the
application and tests
The above option may also make use of Dev Containers.
### User confirmation for commands
Another option for limiting the risk is to require the user to confirm every
command the agent executes before it runs the command. We will likely be
implementing this as an option anyway but given the desire for efficient
development of larger workflows it might limit the efficiency of the tool if it
needs to execute a lot of commands to finish a task.
We may also consider a hybrid approach where there is a set of user-defined
allowlisted commands (e.g. `ls` and `cat`) which allow the agent to read and
learn about a project without the user needing to confirm. This approach may
not solve all needs though, where the user may want to allowlist commands like
`rspec`, which then effectively still allows for arbitrary code execution as the
agent can put whatever it wants in the spec file.
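A minimal sketch of such an allowlist check on the executor side; the command list and function name are illustrative only:

```python
# Sketch: decide whether a command proposed by the agent needs user confirmation.
# The allowlist is an example; as noted above, allowlisting something like `rspec`
# would still permit arbitrary code execution through the spec files.
import shlex

READ_ONLY_ALLOWLIST = {"ls", "cat"}  # example user-defined allowlist


def needs_confirmation(command: str) -> bool:
    """Return True if the command is outside the allowlist and must be confirmed."""
    args = shlex.split(command)
    return not args or args[0] not in READ_ONLY_ALLOWLIST


assert needs_confirmation("rm -rf /") is True
assert needs_confirmation("ls -la") is False
```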
## Duo Workflow UI
The Duo Workflow UI will need to be available at least in the following places:
1. In GitLab Rails web UI
1. In our editor extensions
The fact that we'll need multiple UIs and, as described above, have
multiple execution environments for the Duo Workflow Executor has led to the
following decisions.
### How do we package and run the local web UI
We will build the majority of data access related to our local IDE UI into the
[GitLab Language Server](https://gitlab.com/gitlab-org/editor-extensions/gitlab-lsp)
to maximize re-use across all our editor extensions. We will also employ a mix
of webviews rendered in the IDE and served by the LSP as well as native IDE UI
elements. Where it doesn't considerably limit our user experience, we'll opt to
build the interface into a web page served from the LSP and then rendered in
the IDE as a web view, because this again maximizes re-use across all our editor
extensions.
### How does the web UI reflect the current state live
The Duo Workflow Service will persist its state frequently to the main GitLab Rails
application. There will be GraphQL subscriptions for streaming updates about a
workflow. The UI will consume these GraphQL APIs and update as
updates stream in.
Given that the user may be running the Duo Workflow Executor locally, which may
be seeing some of the state as it happens, it might be reasonable to want to
just live-render the in-memory state of the running workflow process. We may
choose this option deliberately for latency reasons, but we need to be careful
to architect the frontend and Duo Workflow Executor as completely decoupled,
because they will not always be running together. For example, users may trigger
a workflow locally which runs in GitLab CI, or they may be using the web UI to
interact with and re-run a workflow that was initiated locally.
As such we will generally prefer not to have direct interaction between the UI
and Executor but instead all communication should be happening via GitLab. Any
exceptions to this might be considered case by case but we'll need clear API
boundaries which allow the functionality to easily be changed to consume from
GitLab for the reasons described.
## Duo Workflow Agent's tools
Duo Workflow **agents** are, in a simplified view, a pair: a **prompt** and an **LLM**.
By this definition, agents on their own are not able to interact with the outside world,
which significantly limits the scope of work that can be automated. To overcome this limitation, agents are being equipped with **tools**.
Tools are functions that agents can invoke using the [function calling](https://docs.anthropic.com/en/docs/tool-use) LLM feature.
These functions perform different actions on behalf of the agent. For example, an agent might be equipped with a tool (function)
that executes bash commands like `ls` or `cat` and returns the result of those bash commands back to the agent.
The breadth of the tool set available to **agents** defines the scope of work that can be automated. Therefore, to
set up the Duo Workflow feature for success, we will need to deliver a broad and exhaustive tool set.
Foreseen tools include:
1. Tools to execute bash commands via the Duo Workflow Executor
1. Tools to manipulate files (including reading and writing to files)
1. Tools to manipulate Git VCS
1. Tools to integrate with the [GitLab HTTP API](../../../api/api_resources.md)
The fact that the Duo Workflow Service is going to require Git and GitLab API tools entails that the **Duo Workflow Service
must have the ability to establish an SSH connection and make HTTP requests to the GitLab instance.** This ability can be granted directly to the Duo Workflow Service or can be provided via the Duo Workflow Executor if a direct connection between the Duo Workflow Service and a GitLab instance is not possible due to a firewall or network partition.
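For illustration, a command-execution tool might be defined roughly as follows. This is a sketch using the LangChain `tool` decorator that LangGraph agents can consume; `executor_client` is a hypothetical stand-in for the gRPC connection to the Duo Workflow Executor:

```python
# Sketch: a "run bash command" tool exposed to the agent via function calling.
# executor_client is a hypothetical wrapper around the Duo Workflow Executor
# connection; only the shape of the tool definition is the point here.
from langchain_core.tools import tool


def make_run_command_tool(executor_client):
    @tool
    def run_command(command: str) -> str:
        """Run a bash command (for example `ls` or `cat`) in the workflow's
        working directory and return its output."""
        return executor_client.run(command)

    return run_command
```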
## Milestones
1. All the components implemented and communicating correctly with only a
trivial workflow implemented
1. Checkpointing code as well as LangGraph state
1. Workflow locking in GitLab to ensure only 1 concurrent instance of a
workflow
1. Add more workflows and tools
1. Ability to resume a workflow
## POC - Demos
1. [POC: Solve issue (internal only)](https://www.youtube.com/watch?v=n1mpFirme4o)
1. [POC: Duo Workflow in Workspaces (internal only)](https://youtu.be/x7AxYwiQayg)
1. [POC: Autograph using Docker Executor (internal only)](https://www.youtube.com/watch?v=V-Mw6TXOkKI)
1. [POC: Duo Workflows in CI pipelines with timeout and restart (internal only)](https://youtu.be/v8WWZuAGXMU)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,28 +1,11 @@
---
owning-stage: "~devops::secure"
description: 'EPSS Support ADR 002: Use a new bucket for EPSS data'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/epss/decisions/002_use_new_bucket/'
remove_date: '2025-07-08'
---
# EPSS Support ADR 002: Use a new bucket for EPSS data
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/epss/decisions/002_use_new_bucket/).
## Context
PMDB exports data to GCP buckets. The data is later pulled by GitLab instances. Advisory data and license data are stored in different buckets. This is sensible, because advisory and license data are not directly related, and rather provide additional information about packages. Data is updated based on deltas—changes from the previous state of the data. Only those changes are saved with each addition to the database.
EPSS data is directly associated with advisories, so it feels natural to add it to the existing advisories bucket. However, the current advisories bucket is structured based on `purl_type`. Adding an `epss` data type would couple `epss` with `purl_type` which is a faulty pairing. Due to the tight coupling between `purl_type` and the existing advisories bucket, it would be difficult and convoluted to add `epss` to it.
Following [extensive discussions on the EPSS epic](https://gitlab.com/groups/gitlab-org/-/epics/11544#note_1952695268) and [discussion](https://gitlab.com/gitlab-org/gitlab/-/issues/468131#note_1961344123) during the refinement of PMDB issues, it was initially decided to use the existing bucket as this felt most intuitive and at the time seemed a healthier approach. [Further discussion](https://gitlab.com/gitlab-org/gitlab/-/issues/467672#note_1980715240) during the refinement of the GitLab backend effort led to the decision to use a new bucket, due to the complexity of the coupling of `purl_type` and other, unrelated areas in the monolith. Adding `epss` to `purl_type` would impact other components, and we want to avoid having to work around that. We may want to simplify these areas and reconsider the bucket structure at a later stage.
## Decision
Export EPSS data to a new bucket, rather than exporting it into the existing PMDB advisories bucket.
## Consequences
The implementation is simpler than adding a directory to the existing advisories bucket, but may feel less intuitive.
This change requires the relevant Terraform changes to provision a new bucket.
This should also be addressed in the exporter and the GitLab `package_metadata` sync configuration.
## Alternatives
The other option is to add EPSS data to the advisories bucket, since they are directly related. This was the [initial decision](https://gitlab.com/gitlab-org/gitlab/-/issues/468131#note_1980366323). This would allow us to utilize existing mechanisms and keep related data close. However, EPSS data doesn't fit into the current structure of the advisories bucket. An ideal solution would reconstruct the buckets in a manner more fitting for this approach, but this would be a big effort and is not critical enough.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,279 +1,11 @@
---
status: proposed
creation-date: "2024-06-19"
authors: [ "@YashaRise" ]
coach: [ "@theoretick" ]
approvers: [ "@johncrowley", "@tkopel", "@nilieskou" ]
owning-stage: "~devops::secure"
participating-stages: TBD
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/epss/'
remove_date: '2025-07-08'
---
<!--
Before you start:
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/epss/).
- Copy this file to a sub-directory and call it `index.md` for it to appear in
the blueprint directory.
- Remove comment blocks for sections you've filled in.
When your blueprint is ready for review, all of these comment blocks should be
removed.
To get started with a blueprint you can use this template to inform you about
what you may want to document in it at the beginning. This content will change
/ evolve as you move forward with the proposal. You are not constrained by the
content in this template. If you have a good idea about what should be in your
blueprint, you can ignore the template, but if you don't know yet what should
be in it, this template might be handy.
- **Fill out this file as best you can.** At minimum, you should fill in the
"Summary", and "Motivation" sections. These can be brief and may be a copy
of issue or epic descriptions if the initiative is already on Product's
roadmap.
- **Create a MR for this blueprint.** Assign it to an Architecture Evolution
Coach (i.e. a Principal+ engineer).
- **Merge early and iterate.** Avoid getting hung up on specific details and
instead aim to get the goals of the blueprint clarified and merged quickly.
The best way to do this is to just start with the high-level sections and fill
out details incrementally in subsequent MRs.
Just because a blueprint is merged does not mean it is complete or approved.
Any blueprint is a working document and subject to change at any time.
When editing blueprints, aim for tightly-scoped, single-topic MRs to keep
discussions focused. If you disagree with what is already in a document, open a
new MR with suggested changes.
If there are new details that belong in the blueprint, edit the blueprint. Once
a feature has become "implemented", major changes should get new blueprints.
The canonical place for the latest set of instructions (and the likely source
of this file) is [here](https://gitlab.com/gitlab-org/gitlab/-/blob/master/doc/architecture/blueprints/_template.md).
Blueprint statuses you can use:
- "proposed"
- "accepted"
- "ongoing"
- "implemented"
- "postponed"
- "rejected"
-->
<!-- Blueprints often contain forward-looking statements -->
<!-- vale gitlab.FutureTense = NO -->
# EPSS Support
<!--
This is the title of your blueprint. Keep it short, simple, and descriptive. A
good title can help communicate what the blueprint is and should be considered
as part of any review.
-->
<!--
For long pages, consider creating a table of contents.
The `[_TOC_]` function is not supported on docs.gitlab.com.
-->
For important terms, see [glossary](#glossary).
## Summary
[EPSS](https://www.first.org/epss/faq) scores specify the likelihood a CVE will be exploited in the next 30 days. This data may be used to improve and simplify prioritization efforts when remediating vulnerabilities in a project. EPSS support requirements are outlined in [the EPSS epic](https://gitlab.com/groups/gitlab-org/-/epics/11544) along with an overview of EPSS. This document focuses on the technical implementation of EPSS support.
EPSS scores may be populated from the [EPSS Data page](https://www.first.org/epss/data_stats) or through their provided API. Ultimately, EPSS scores should be reachable through the GitLab GraphQL API, as seen on the vulnerability report and details pages, and be filterable and usable when setting policies.
Package metadata database (PMDB, also known as license-db), an existing advisory pull-and-enrichment mechanism, will be used for this purpose. The flow is as follows:
```mermaid
flowchart LR
A[EPSS Source] -->|Pull| B[PMDB]
B -->|Process and export| C[Bucket]
C -->|Pull| D[GitLab Instance]
```
<!--
This section is very important, because very often it is the only section that
will be read by team members. We sometimes call it an "Executive summary",
because executives usually don't have time to read entire document like this.
Focus on writing this section in a way that anyone can understand what it says,
the audience here is everyone: executives, product managers, engineers, wider
community members.
A good summary is probably at least a paragraph in length.
-->
## Motivation
The classic approach to vulnerability prioritization is using severity based on [CVSS](https://www.first.org/cvss/). This approach provides some guidance, but is too unrefined—more than half of all published CVEs have a high or critical score. Other metrics need to be employed to reduce remediation fatigue and help developers prioritize their work better. EPSS provides a metric to identify which vulnerabilities are most likely to be exploited in the near future. Combined with existing prioritization methods, EPSS helps to focus remediation efforts better and reduce remediation workload. By adding EPSS to the information presented to users, we deliver these benefits to the GitLab platform.
<!--
This section is for explicitly listing the motivation, goals and non-goals of
this blueprint. Describe why the change is important, all the opportunities,
and the benefits to users.
The motivation section can optionally provide links to issues that demonstrate
interest in a blueprint within the wider GitLab community. Links to
documentation for competing products and services is also encouraged in cases
where they demonstrate clear gaps in the functionality GitLab provides.
For concrete proposals we recommend laying out goals and non-goals explicitly,
but this section may be framed in terms of problem statements, challenges, or
opportunities. The latter may be a more suitable framework in cases where the
problem is not well-defined or design details not yet established.
-->
### Goals
- Enable users to use EPSS scores on GitLab as another metric for their vulnerability prioritization efforts.
- Provide scalable means of efficiently repopulating recurring EPSS scores to minimize system load.
#### Phase 1 (MVC)
- Enable access to EPSS scores through GraphQL API.
#### Phase 2
- Show EPSS scores in vulnerability report and details pages.
#### Phase 3
- Allow filtering vulnerabilities based on EPSS scores.
- Allow creating policies based on EPSS scores.
<!--
List the specific goals / opportunities of the blueprint.
- What is it trying to achieve?
- How will we know that this has succeeded?
- What are other less tangible opportunities here?
-->
### Non-Goals
- Dictate priority to users based on EPSS (or any other metric).
<!--
Listing non-goals helps to focus discussion and make progress. This section is
optional.
- What is out of scope for this blueprint?
-->
## Proposal
Support EPSS on the GitLab platform.
Following the discussions in the [EPSS epic](https://gitlab.com/groups/gitlab-org/-/epics/11544), the proposed flow is:
1. PMDB database is extended with a new table to store EPSS scores.
1. PMDB infrastructure runs the feeder daily in order to pull and process EPSS data.
1. The advisory-processor receives the EPSS data and stores it in the PMDB database.
1. PMDB exports EPSS data to a new PMDB EPSS bucket.
- Create a new bucket to store EPSS data.
- Delete former EPSS data once new data is uploaded, as the old data is no longer needed.
- Truncate EPSS scores to two digits after the dot.
1. GitLab instances pull data from the PMDB EPSS bucket.
- Create a new table in rails DB to store EPSS data.
1. GitLab instances expose EPSS data through GraphQL API and present data in vulnerability report and details pages.
```mermaid
flowchart LR
AF[Feeder] -->|pulls| A[EPSS Source]
AF -->|publishes| AP[Advisory Processor]
AP -->|stores| DD[PMDB database]
E[Exporter] -->|loads|DD
E --> |exports| B[Public Bucket]
GitLab[GitLab instance] --> |syncs| B
GitLab --> |stores| GitLabDB
```
<!--
This is where we get down to the specifics of what the proposal actually is,
but keep it simple! This should have enough detail that reviewers can
understand exactly what you're proposing, but should not include things like
API designs or implementation. The "Design Details" section below is for the
real nitty-gritty.
You might want to consider including the pros and cons of the proposed solution so that they can be
compared with the pros and cons of alternatives.
-->
## Design and implementation details
### Decisions
- [002: Use a new bucket for EPSS data](decisions/002_use_new_bucket.md)
### Important notes
- All EPSS scores get updated on a daily basis. This is pivotal to this feature's design.
- The [fields retrieved](https://www.first.org/epss/data_stats) from the EPSS source are `cve`, `score`, and `percentile`. Nine decimal places are maintained.
- To reduce the number of upserts, we will truncate EPSS scores to two decimal places, based on a [spike to check the magnitude of change](https://gitlab.com/gitlab-org/gitlab/-/issues/468286).
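As a small illustration of the truncation behaviour (truncate, not round), assuming Ruby for the example even though the component doing the truncation may be implemented in another language:
```ruby
require 'bigdecimal'

raw_score = BigDecimal('0.087654321') # 9 decimal places, as published by the EPSS feed
truncated = raw_score.truncate(2).to_f

truncated # => 0.08 (rounding would have produced 0.09)
```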
### PMDB
- Create a new EPSS table in [PMDB](https://gitlab.com/gitlab-org/security-products/license-db) with an advisory identifier and the EPSS score. This includes changing the [schema](https://gitlab.com/gitlab-org/security-products/license-db/schema) and any necessary migrations.
- Ingest EPSS data into new PMDB table. We want to keep the EPSS data structure as close as possible to the origin so all of the data may be available to the exporter, and the exporter may choose how to process it. Therefore we will save scores and percentiles with their complete values.
- Export EPSS scores to a separate bucket.
- Delete the previous day's export, as it is no longer needed after the new one is added.
- Add new Pub/Sub topics to the deployment for use by PMDB components, using the existing Terraform modules.
### GitLab Rails backend
- Create a table in the Rails backend to hold EPSS scores.
- Configure the Rails sync to ingest EPSS exports and save them to the new table.
- Include EPSS data attributes in GraphQL API Occurrence objects (see the sketch below).
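A minimal, hypothetical sketch of what the Rails-side pieces could look like; the table name `pm_epss_scores`, column names, and GraphQL type are illustrative assumptions, not the final design:
```ruby
# Hypothetical sketch only: names and columns are assumptions, not the final design.
class CreatePmEpssScores < ActiveRecord::Migration[7.1]
  def change
    create_table :pm_epss_scores do |t|
      t.text :cve, null: false    # advisory identifier, for example "CVE-2024-12345"
      t.float :score, null: false # EPSS score, truncated to two decimal places
      t.timestamps null: false

      t.index :cve, unique: true
    end
  end
end

# Exposing the score through GraphQL (graphql-ruby), for example on an occurrence type.
module Types
  module PackageMetadata
    class EpssType < BaseObject # assumes the usual Types::BaseObject parent class
      graphql_name 'Epss'

      field :score, Float, null: true, description: 'EPSS score of the advisory.'
    end
  end
end
```
With something like this in place, the Phase 1 goal (access to EPSS scores through the GraphQL API) would be satisfied by resolving this field on the existing occurrence objects.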
### GitLab UI
- Add EPSS data to vulnerability report page.
- Add EPSS data to vulnerability details page.
- Allow filtering by EPSS score.
- Allow creating policies based on EPSS score.
<!--
This section should contain enough information that the specifics of your
change are understandable. This may include API specs (though not always
required) or even code snippets. If there's any ambiguity about HOW your
proposal will be implemented, this is the place to discuss them.
If you are not sure how many implementation details you should include in the
blueprint, the rule of thumb here is to provide enough context for people to
understand the proposal. As you move forward with the implementation, you may
need to add more implementation details to the blueprint, as those may become
an important context for important technical decisions made along the way. A
blueprint is also a register of such technical decisions. If a technical
decision requires additional context before it can be made, you probably should
document this context in a blueprint. If it is a small technical decision that
can be made in a merge request by an author and a maintainer, you probably do
not need to document it here. The impact a technical decision will have is
another helpful information - if a technical decision is very impactful,
documenting it, along with associated implementation details, is advisable.
If it's helpful to include workflow diagrams or any other related images.
Diagrams authored in GitLab flavored markdown are preferred. In cases where
that is not feasible, images should be placed under `images/` in the same
directory as the `index.md` for the proposal.
-->
## Alternative Solutions
<!--
It might be a good idea to include a list of alternative solutions or paths considered, although it is not required. Include pros and cons for
each alternative solution/path.
"Do nothing" and its pros and cons could be included in the list too.
-->
## Glossary
- **PMDB** (Package metadata database, also known as License DB): PMDB is a standalone service (and not solely a database), outside of the Rails application, that gathers, stores and exports packages metadata for GitLab instances to consume. See [complete documentation](https://gitlab.com/gitlab-org/security-products/license-db/deployment/-/blob/main/docs/DESIGN.md?ref_type=heads). PMDB components include:
- **Feeder**: a scheduled job called by the PMDB deployment to publish data from the relevant sources to pub/sub messages consumed by PMDB processors.
- **Advisory processor**: runs as a Cloud Run instance, consumes the advisory-related messages published by the advisory feeder, and stores the data in the PMDB database.
- **PMDB database**: a PostgreSQL instance storing license and advisory data.
- **Exporter**: exports license/advisory data from the PMDB database to public GCP buckets.
- **GitLab database**: the database used by GitLab instances.
- **CVE** (Common Vulnerabilities and Exposures): a list of publicly known information-security vulnerabilities. "A CVE" usually refers to a specific vulnerability and its CVE ID.
- **EPSS** (Exploit prediction scoring system) **score**: a score ranging from 0 to 1 representing the probability of exploitation in the wild in the next 30 days of a given vulnerability.
- **EPSS score percentile**: for a given EPSS score (of some vulnerability), the proportion of all scored vulnerabilities with the same or a lower EPSS score.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,41 +1,11 @@
---
status: proposed
creation-date: "2024-01-25"
authors: [ "@shinya.maeda", "@mikolaj_wawrzyniak" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@tlinz" ]
owning-stage: "~devops::ai-powered"
participating-stages: ["~devops::data stores", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/elasticsearch/'
remove_date: '2025-07-08'
---
# Elasticsearch
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/elasticsearch/).
For more information on Elasticsearch and RAG broadly, see the [Elasticsearch article](../gitlab_rag/elasticsearch.md) in [RAG at GitLab](../gitlab_rag/index.md).
## Retrieve GitLab Documentation
A [proof of concept](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/145392) was done to switch the documentation embeddings from being stored in the embedding database to being stored on Elasticsearch.
### Synchronizing embeddings with data source
The same procedure used by [PostgreSQL](postgresql.md) can be followed to keep the embeddings up to date in Elasticsearch.
### Retrieval
To get the nearest neighbours, the following query can be executed against an index containing the embeddings:
```ruby
{
"knn": {
"field": vector_field_containing_embeddings,
"query_vector": embedding_for_question,
"k": limit,
"num_candidates": number_of_candidates_to_compare
}
}
```
### Requirements to get to self-managed
- Productionize the PoC [MR](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/145392)
- Get more self-managed instances to install Elasticsearch by [shipping GitLab with Elasticsearch](https://gitlab.com/gitlab-org/gitlab/-/issues/438178). Elastic gave their approval to ship with the free license. The work required for making it easy for customers to host Elasticsearch is more than 2 milestones.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,37 +1,11 @@
---
status: proposed
creation-date: "2024-01-25"
authors: [ "@shinya.maeda", "@mikolaj_wawrzyniak" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@tlinz" ]
owning-stage: "~devops::ai-powered"
participating-stages: ["~devops::data stores", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/'
remove_date: '2025-07-08'
---
# Retrieval Augmented Generation (RAG) for GitLab Duo on self-managed
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/).
RAG is an application architecture used to provide knowledge to a large language model that doesn't exist in its training set, so that it can use that knowledge to answer user questions. To learn more about RAG, see [RAG for GitLab](../gitlab_rag/index.md).
## Goals of this blueprint
This blueprint aims to drive a decision for a RAG solution for GitLab Duo on self-managed, specifically for shipping GitLab Duo with access to GitLab documentation. We outline three potential solutions, including PoCs for each to demonstrate feasibility for this use case.
## Constraints
- The solution must be viable for self-managed customers to run and maintain
- The solution must be shippable in 1-2 milestones <!-- I don't actually know that this is true, just adding an item for time constraint -->
- The solution should be low-lock-in, since we are still determining our long term technical solution(s) for RAG at GitLab
## Proposals for GitLab Duo Chat RAG for GitLab documentation
The following solutions have been proposed and evaluated for the GitLab Duo Chat for GitLab documentation use case:
- [Vertex AI Search](vertex_ai_search.md)
- [Elasticsearch](elasticsearch.md)
- [PostgreSQL with PGVector extension](postgresql.md)
You can read more about how each evaluation was conducted in the links above.
## Chosen solution
[Vertex AI Search](vertex_ai_search.md) is going to be implemented because of its low lock-in and its ability to reach customers quickly. It could be moved over to another solution in the future.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,95 +1,11 @@
---
status: proposed
creation-date: "2024-01-25"
authors: [ "@shinya.maeda", "@mikolaj_wawrzyniak" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@tlinz" ]
owning-stage: "~devops::ai-powered"
participating-stages: ["~devops::data stores", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/postgresql/'
remove_date: '2025-07-08'
---
# PostgreSQL
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/postgresql/).
## Retrieve GitLab Documentation
PGVector is currently being used for the retrieval of relevant documentation for GitLab Duo chat's RAG.
A separate `embedding` database runs alongside `geo` and `main` which has the `pg-vector` extension installed and contains embeddings for GitLab documentation.
- Statistics (as of January 2024):
- Data type: Markdown written in natural language (Unstructured)
- Data access level: Green (No authorization required)
- Data source: `https://gitlab.com/gitlab-org/gitlab/-/blob/master/doc`
- Data size: 147 MB in `vertex_gitlab_docs`. 2194 pages.
- Service: `https://docs.gitlab.com/` ([source repo](https://gitlab.com/gitlab-org/gitlab-docs))
- Example of user input: "How do I create an issue?"
- Example of expected AI-generated response: "To create an issue:\n\nOn the left sidebar, select Search or go to and find your project.\n\nOn the left sidebar, select Plan > Issues, and then, in the upper-right corner, select New issue."
### Synchronizing embeddings with data source
Here is the overview of synchronizing process that is currently running in GitLab.com:
1. Load the documentation files of the GitLab instance, i.e. `doc/**/*.md`.
1. Compare the checksum of each file to detect new, updated, or deleted documents.
1. If a doc is added or updated:
1. Split the docs with the following strategy:
- Text splitter: split by new lines (`\n`), then pack into chunks of roughly 100-1500 characters (see the sketch below).
1. Bulk-fetch embeddings of the chunks from `textembedding-gecko` model (768 dimensions).
1. Bulk-insert the embeddings into the `vertex_gitlab_docs` table.
1. Cleanup the older embeddings.
1. If a doc is deleted:
1. Delete embeddings of the page.
As of today, there are 17345 rows (chunks) on `vertex_gitlab_docs` table on GitLab.com.
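As a rough sketch of the splitting strategy described in the steps above (greedy packing of lines into chunks; the real splitter in GitLab-Rails may differ):
```ruby
MIN_CHARS = 100
MAX_CHARS = 1500

# Split a Markdown document on new lines, then pack lines into chunks of roughly
# MIN_CHARS..MAX_CHARS characters. Illustration only.
def split_into_chunks(markdown)
  chunks = []
  current = +''

  markdown.split("\n").each do |line|
    if current.length >= MIN_CHARS && current.length + line.length + 1 > MAX_CHARS
      chunks << current
      current = +''
    end
    current << line << "\n"
  end

  chunks << current unless current.strip.empty?
  chunks
end
```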
For Self-managed instances, we serve embeddings from AI Gateway and GCP's Cloud Storage,
so the above process can be simpler:
1. Download an embedding package from Cloud Storage through AI Gateway API.
1. Bulk-insert the embeddings into the `vertex_gitlab_docs` table.
1. Delete older embeddings.
We generate this embeddings package before GitLab monthly release.
A Sidekiq cron worker automatically renews the embeddings by comparing the embedding version and the GitLab version.
If they are outdated, it downloads the new embedding package.
Going further, we can consolidate the business logic between SaaS and Self-managed by generating the package every day (or every grpd deployment).
This reduces the points of failure in the business logic and lets us easily reproduce issues reported by Self-managed users.
Here is the current table schema:
```sql
CREATE TABLE vertex_gitlab_docs (
id bigint NOT NULL,
created_at timestamp with time zone NOT NULL,
updated_at timestamp with time zone NOT NULL,
version integer DEFAULT 0 NOT NULL, -- For replacing the old embeddings by new embeddings (e.g. when doc is updated)
embedding vector(768), -- Vector representation of the chunk
url text NOT NULL,
content text NOT NULL, -- Chunked data
metadata jsonb NOT NULL, -- Additional metadata e.g. page URL, file name
CONSTRAINT check_2e35a254ce CHECK ((char_length(url) <= 2048)),
CONSTRAINT check_93ca52e019 CHECK ((char_length(content) <= 32768))
);
CREATE INDEX index_vertex_gitlab_docs_on_version_and_metadata_source_and_id ON vertex_gitlab_docs USING btree (version, ((metadata ->> 'source'::text)), id);
CREATE INDEX index_vertex_gitlab_docs_on_version_where_embedding_is_null ON vertex_gitlab_docs USING btree (version) WHERE (embedding IS NULL);
```
### Retrieval
After the embeddings are ready, GitLab-Rails can retrieve chunks in the following steps:
1. Fetch embedding of the user input from `textembedding-gecko` model (768 dimensions).
1. Query to `vertex_gitlab_docs` table for finding the nearest neighbors. e.g.:
```sql
SELECT *
FROM vertex_gitlab_docs
ORDER BY vertex_gitlab_docs.embedding <=> '[vectors of user input]' -- nearest neighbors by cosine distance
LIMIT 10
```
### Requirements to get to self-managed
All instances of GitLab have postgres running but allowing instances to administer a separate database for embeddings or combining the embeddings into the main database would require some effort which spans more than a milestone.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,134 +1,11 @@
---
status: proposed
creation-date: "2024-01-25"
authors: [ "@shinya.maeda", "@mikolaj_wawrzyniak" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@tlinz" ]
owning-stage: "~devops::ai-powered"
participating-stages: ["~devops::data stores", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/vertex_ai_search/'
remove_date: '2025-07-08'
---
# Vertex AI Search
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_duo_rag/vertex_ai_search/).
## Retrieve GitLab Documentation
- Statistics (as of January 2024):
- Data type: Markdown (Unstructured) written in natural language
- Data access level: Green (No authorization required)
- Data source: `https://gitlab.com/gitlab-org/gitlab/-/blob/master/doc`
- Data size: approx. 56,000,000 bytes. 2194 pages.
- Service: `https://docs.gitlab.com/` ([source repo](https://gitlab.com/gitlab-org/gitlab-docs))
- Example of user input: "How do I create an issue?"
- Example of expected AI-generated response: "To create an issue:\n\nOn the left sidebar, select Search or go to and find your project.\n\nOn the left sidebar, select Plan > Issues, and then, in the upper-right corner, select New issue."
[The GitLab documentation](https://gitlab.com/gitlab-org/gitlab-docs/-/blob/main/doc/architecture.md) is the SSoT service to serve GitLab documentation for SaaS (both GitLab.com and Dedicated) and Self-managed.
When a user accesses a documentation link in a GitLab instance,
they are [redirected to the service](https://gitlab.com/groups/gitlab-org/-/epics/11600#note_1690083049) (since 16.0, except for air-gapped installations).
In addition, the current search backend of `docs.gitlab.com` needs to transition to [Vertex AI Search](https://cloud.google.com/enterprise-search?hl=en). See [this issue](https://gitlab.com/gitlab-com/legal-and-compliance/-/issues/1876) (GitLab member only) for more information.
We introduce a new semantic search API powered by Vertex AI Search for the documentation tool of GitLab Duo Chat.
### Setup in Vertex AI Search
We [create a search app](https://cloud.google.com/generative-ai-app-builder/docs/create-engine-es) for each GitLab version.
These processes will likely be automated in the [GitLab Documentation project](https://gitlab.com/gitlab-org/gitlab-docs/-/blob/main/doc/architecture.md)
by CI/CD pipelines.
1. Create a new Bigquery table e.g. `gitlab-docs-latest` or `gitlab-docs-v16.4`
1. Download documents from repositories (e.g. `gitlab-org/gitlab/doc`, `gitlab-org/gitlab-runner/docs`, `gitlab-org/omnibus-gitlab/doc`).
1. Split them by Markdown headers and generate metadata (e.g. URL and title).
1. Insert rows into the Bigquery table.
1. [Create a search app](https://cloud.google.com/generative-ai-app-builder/docs/create-engine-es)
See [this notebook](https://colab.research.google.com/drive/1XxYPWkNBnwZ0UG1aJ0Pjb2gfYmLnrHft?usp=sharing) for more implementation details.
The data of the latest version will be refreshed by a nightly build with [Data Store API](https://cloud.google.com/generative-ai-app-builder/docs/reference/rpc).
### AI Gateway API
API design is following the existing patterns in [AI Gateway](../ai_gateway/index.md).
```plaintext
POST /v1/search/docs
```
```json
{
"type": "search",
"metadata": {
"source": "GitLab EE",
"version": "16.3" // Used for switching search apps for older GitLab instances
},
"payload": {
"query": "How can I create an issue?",
"params": { // Params for Vertex AI Search
"page_size": 10,
"filter": "",
},
"provider": "vertex-ai"
}
}
```
The response will include the search results. For example:
```json
{
"response": {
"results": [
{
"id": "d0454e6098773a4a4ebb613946aadd89",
"content": "\nTo create an issue from a group: \n1. On the left sidebar, ...",
"metadata": {
"Header1": "Create an issue",
"Header2": "From a group",
"url": "https://docs.gitlab.com/ee/user/project/issues/create_issues.html"
}
}
]
},
"metadata": {
"provider": "vertex-ai"
}
}
```
See [SearchRequest](https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest) and [SearchResponse](https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchResponse) for Vertex AI API specs.
### Proof of Concept
- [GitLab-Rails MR](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/144719)
- [AI Gateway MR](https://gitlab.com/gitlab-org/modelops/applied-ml/code-suggestions/ai-assist/-/merge_requests/642)
- [Vertex AI Search service](https://console.cloud.google.com/gen-app-builder/engines?referrer=search&project=ai-enablement-dev-69497ba7)
- [Google Colab notebook](https://colab.research.google.com/drive/1XxYPWkNBnwZ0UG1aJ0Pjb2gfYmLnrHft?usp=sharing)
- [Demo video](https://youtu.be/ipEpMt-U6rQ?feature=shared) (Note: In this video, Website URLs are used as data source).
#### Evaluation score
Here is the evaluation scores generated by [Prompt Library](https://gitlab.com/gitlab-org/modelops/ai-model-validation-and-research/ai-evaluation/prompt-library).
|Setup|correctness|comprehensiveness|readability|evaluating_model|
|---|---|---|---|---|
|New (w/ Vertex AI Search)|3.7209302325581382|3.6976744186046511|3.9069767441860455|claude-2|
|Current (w/ Manual embeddings in GitLab-Rails and PgVector)|3.7441860465116279|3.6976744186046511|3.9767441860465116|claude-2|
<details>
<summary>Dataset</summary>
- Input Bigquery table: `dev-ai-research-0e2f8974.duo_chat_external.documentation__input_v1`
- Output Bigquery table:
- `dev-ai-research-0e2f8974.duo_chat_external_results.sm_doc_tool_vertex_ai_search`
- `dev-ai-research-0e2f8974.duo_chat_external_results.sm_doc_tool_legacy`
- Command: `promptlib duo-chat eval --config-file /eval/data/config/duochat_eval_config.json`
</details>
### Estimated Time of Completion
- Milestone N:
- Setup in Vertex AI Search with CI/CD automation.
- Introduce `/v1/search/docs` endpoint in AI Gateway.
- Update the retrieval logic in GitLab-Rails.
- Feature flag clean up.
Total milestones: 1
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,221 +1,11 @@
---
status: proposed
creation-date: "2024-02-20"
authors: [ "@bvenker", "@mikolaj_wawrzyniak" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@shinya.meda", "@mikolaj_wawrzyniak" ]
owning-stage: "~devops::data stores"
participating-stages: ["~devops::ai-powered", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/elasticsearch/'
remove_date: '2025-07-08'
---
# Elasticsearch
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/elasticsearch/).
Elasticsearch is a search engine and data store which allows generating, storing and querying vectors and performing keyword and semantic search at scale.
Elasticsearch employs a distributed architecture, where data is stored across multiple nodes. This allows for parallel processing of queries, ensuring fast results even with massive datasets.
## Using Elasticsearch as a vector store
Elasticsearch can be used to store embedding vectors up to 4096 dimensions and find the closest neighbours for a given embedding.
![Elasticsearch as vector store](img/elasticsearch_vector_store.png)
### Licensing
Does not require a paid license.
### Indexing embeddings
For every document type (e.g. `gitlab_documentation`), an index is created and stores the original source, embeddings and optional metadata such as URL. An initial backfill is required to index all current documents and a process to upsert or delete documents as the source changes.
For GitLab Duo Documentation, the current async process for generating and storing embeddings in the embeddings database can be altered to index into Elasticsearch.
Using the Advanced Search framework, database records are automatically kept up to date in Elasticsearch. [Issue 442197](https://gitlab.com/gitlab-org/gitlab/-/issues/442197) proposes changing the Elasticsearch framework to allow for other datasets to be indexed.
For documents with large sources that need to be split into chunks, [nested kNN search](https://www.elastic.co/guide/en/elasticsearch/reference/8.12/knn-search.html#nested-knn-search) can be used whereby a single top-level document contains nested objects each with a source and embedding. This enables searching for the top K documents with the most relevant chunks. It is not suited for cases where the top k chunks need to be searched within a single document. In such cases, every chunk should be stored as a separate document.
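For illustration, an index mapping for such chunked documents could look like the sketch below (Ruby-hash notation for the Elasticsearch request body; the `chunks` field name and the 768 dimensions are assumptions that mirror the documentation embeddings):
```ruby
# Sketch of a mapping where each top-level document holds nested chunks,
# each with its own text and dense_vector embedding (enables nested kNN search).
nested_mapping = {
  "mappings": {
    "properties": {
      "url": { "type": "keyword" },
      "chunks": {
        "type": "nested",
        "properties": {
          "content":   { "type": "text" },
          "embedding": { "type": "dense_vector", "dims": 768, "index": true, "similarity": "cosine" }
        }
      }
    }
  }
}
```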
### Querying context-relevant information
A given question is passed to a model to generate embeddings. The vector is then sent to Elasticsearch to find the most relevant documents.
### Generation
The N most relevant documents are added to a prompt which is sent to an LLM to generate an answer for the original question.
## RAG in Elasticsearch using hosted models
Similar to the above but the question's embeddings are generated from within Elasticsearch.
![RAG overview](img/elasticsearch_rag_hosted_models.png)
### Licensing
Requires a paid license on every cluster.
### Model hosting
Requires the model(s) used to be hosted on every cluster, which adds effort and cost.
Elasticsearch supports the following models:
- ELSER (Elastic Learned Sparse Encoder): Built-in model provided by Elasticsearch used to generate text embeddings for semantic search.
- TensorFlow Models: Custom TensorFlow models can be deployed for semantic search using the ML APIs.
- Third-Party Models: Elasticsearch supports deploying models from Hugging Face and other providers. This provides access to a wider range of pre-trained models, but deployment and maintenance requires additional work.
## Hybrid Search
Hybrid search combines text and semantic search to return the most relevant sources. A reranker could be used to combine the results from both methods.
![Hybrid search](img/elasticsearch_hybrid_search.png)
### Advanced text search features of Elasticsearch
1. Inverted Indexing: At its core, Elasticsearch relies on a powerful data structure called an inverted index. This index essentially flips the traditional approach, where each document contains a list of words. Instead, the inverted index catalogues every unique word across all documents and tracks where it appears in each one. This enables lightning-fast searches by finding relevant documents based on matching words instantly.
1. Advanced Text Analysis: Elasticsearch doesn't simply match whole words. It leverages text analyzers to break down and understand text intricacies. This includes handling:
- Stemming and lemmatization: Reducing words to their root form (e.g., "running" and "ran" both matching "run").
- Synonyms and related terms: Recognizing synonyms and similar words to expand search results.
- Stop words: Ignoring common words like "the" and "a" that don't contribute much to meaning.
- Custom analysis: Defining your own rules for specific domains or languages.
1. Powerful Query Capabilities: Elasticsearch goes beyond basic keyword searches. It supports complex queries using Boolean operators (AND, OR, NOT), proximity searches (finding words close together), fuzzy searches (handling typos), and more. You can also filter results based on other criteria alongside text matching.
### Reranking
Elasticsearch currently supports [Reciprocal rank fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html), which works out of the box. They also released [Learning to Rank](https://elasticsearch-learning-to-rank.readthedocs.io/en/latest/), which uses ML to improve ranking.
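A sketch of such a hybrid request, assuming Elasticsearch 8.12's RRF syntax (still a technical preview) and placeholder field names; `embedding_for_question` is the vector for the same question:
```ruby
# Sketch of a hybrid query: BM25 keyword search plus kNN semantic search,
# combined with Reciprocal Rank Fusion (RRF).
hybrid_query = {
  "query": {
    "match": { "content": "how to create an issue" }
  },
  "knn": {
    "field": "embedding",
    "query_vector": embedding_for_question,
    "k": 10,
    "num_candidates": 100
  },
  "rank": { "rrf": {} },
  "size": 10
}
```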
## Running Elasticsearch
Elasticsearch is available on GitLab.com and can be integrated on Dedicated and Self-Managed instances. To use as a vector store only:
- [Install Elasticsearch version `8.12`](../../../integration/advanced_search/elasticsearch.md#install-elasticsearch-or-aws-opensearch-cluster) or upgrade to at least version `8.12`.
- Add URL, Username and Password on the Advanced Search settings page: `admin/application_settings/advanced_search`
After the integration is configured, instance admins don't need to do further work to use it as a vector store since the GitLab Elasticsearch framework handles setting mappings, settings and indexing data.
## Supported dimensions
Elasticsearch can store up to 4096 dimensions and OpenSearch up to 16000 dimensions, compared to `pg_vector` which can store up to 2000.
## Limitations
### Licensing
In order to use the ML capabilities offered by Elastic, every cluster has to have a valid license.
If Elastic is used only as a vector store and all embeddings generated outside of Elastic, a license is not required.
### Adoption
The Elastic integration is available to all GitLab instances to unlock Advanced Search but not all instances have chosen to run the integration. There is also an additional cost for every instance to host the integration.
## Performance and scalability
Elasticsearch is horizontally scalable and handles storing and querying at scale. An Elasticsearch cluster consists of multiple nodes each contributing resources.
## Cost
Elastic Cloud pricing for GitLab Documentation vector storage is about $38 per month and the price scales with storage requirements.
## Elasticsearch vs. OpenSearch
### Features
Both offer storing vector embeddings and similarity search (kNN).
Elasticsearch supports custom TensorFlow models which OpenSearch does not offer. Both offer pre-trained models.
The APIs for kNN searching differ slightly between the two platforms but work in the same way.
### Supported platforms
Currently GitLab offers Advanced Search for both Elasticsearch and OpenSearch due to parity between the text search APIs. If both are supported for AI features, there would be a need to adapt to two different AI APIs.
## PoC: Repository X Ray
To test the viability of Elasticsearch for generating embeddings, a PoC was done with the Repository X Ray project.
Repository X Ray hasn't yet implemented any semantic search, and this section is based solely on a [prototype implementation](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/144715).
- Statistics (as of February 2024):
- Data type: JSON document with source code library descriptions in natural language
- Data access level: Red (each JSON document belongs to a specific project, and data access rules should adhere to the data access rules configured for that project)
- Data source: Repository X Ray report CI artifact
- Data size: N/A
- Example of user input: "# generate function that fetches sales report for vendor from App Store"
- Example of expected AI-generated response:
```python
def sales_reports(vendor_id)\n app_store_connect.sales_reports(\n filter: {\n report_type: 'SALES',\n report_sub_type: 'SUMMARY',\n frequency: 'DAILY',
vendor_number: '123456'\n }\n)\nend
```
### Synchronizing embeddings with data source
In a similar manner to the [documentation example](../gitlab_duo_rag/elasticsearch.md#retrieve-gitlab-documentation), Repository X Ray report data is a derivative. It uses the underlying repository source code as a base, and it must be synchronised with it whenever any changes to the source code occur.
Right now there is no synchronisation mechanism that includes embeddings and vector storage. However, there is an existing pipeline that generates and stores Repository X Ray reports.
The ingestion pipeline is performed in the following steps:
1. A CI X Ray scanner job is triggered - a documentation [page](../../../user/project/repository/code_suggestions/repository_xray.md#enable-repository-x-ray) suggests limiting this job to run only when changes occur to the main repository branch. However, repository maintainers may configure trigger rules differently.
- An X Ray [scanner](https://gitlab.com/gitlab-org/code-creation/repository-x-ray) locates and processes one of the supported [dependency files](../../../user/project/repository/code_suggestions/repository_xray.md#supported-languages-and-package-managers), producing JSON report files.
1. After the X Ray scanner job finishes successfully, a [background job](https://gitlab.com/gitlab-org/gitlab/-/blob/c6b2f18eaf0b78a4e0012e88f28d643eb0dfb1c2/ee/app/workers/ai/store_repository_xray_worker.rb#L18) is triggered in the GitLab Rails monolith that imports the JSON report into [`Projects::XrayReport`](https://gitlab.com/gitlab-org/gitlab/-/blob/bc2ad40b4b026dd359e289cf2dc232de1a2d3227/ee/app/models/projects/xray_report.rb#L22).
- There can be only one Repository X Ray report per project and programming language; duplicate records are upserted during the import process.
As of today, there are 84 rows on `xray_reports` table on GitLab.com.
### Retrieval
After the Repository X Ray report gets imported, when the IDE extension sends a request for [code generation](../../../user/project/repository/code_suggestions/index.md),
the Repository X Ray report is retrieved in the following steps:
1. The GitLab Rails monolith fetches the corresponding `xray_reports` record from the main database. `xray_reports` records are filtered on the `project_id` foreign key and the `lang` column.
1. From the retrieved record, the first 50 dependencies are added to a prompt that is forwarded to the AI Gateway (see the sketch below).
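A simplified sketch of this retrieval (column names follow the description above; `project` and the report payload structure are assumptions for illustration):
```ruby
# Fetch the X Ray report for the project and language, then take the first 50 dependencies.
report = Projects::XrayReport.find_by(project_id: project.id, lang: 'ruby')

dependencies = report ? Array(report.payload['libs']).first(50) : []
# `dependencies` is then interpolated into the code generation prompt sent to the AI Gateway.
```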
### Current state overview
```mermaid
sequenceDiagram
actor USR as User
participant IDE
participant GLR as GitLabRails
participant RN as GitLabRunner
participant PG as GitLabPsqlMainDB
participant AIGW as AIGateway
USR->>+GLR: commits changes to Gemfile.lock
GLR->>RN: triggers Repository X Ray CI scanner job
RN->>GLR: Repository X Ray report
GLR->>GLR: triggers Repository X Ray ingestion job
GLR->>-PG: upserts xray_reports record
USR->>+IDE: types: "#35; generate function that fetches sales report for vendor from App Store"
IDE->>+GLR: trigger code generation for line ` "#35; generate function `
GLR->>PG: fetch X Ray report for project and language
PG->>GLR: xray_reports record
GLR->>GLR: include first 50 entities from xray report into code generation prompt
GLR->>-AIGW: trigger code generation ` "#35; generate function `
```
### Embeddings prospect application
As described in the retrieval section above, Repository X Ray reports currently follow a very naive approach that does not include any metric for assessing relevance between the Repository X Ray report content and
the user instruction. Therefore, applying embeddings and semantic search to X Ray reports has a high potential of improving results by selecting a limited set of related entries from the Repository X Ray report based on the user instruction.
To achieve that, embeddings should be generated during Repository X Ray ingestion. Additionally, the user instruction should be turned into an embedding vector to perform semantic search over the stored Repository X Ray report data during the retrieval process.
### Elasticsearch and PGVector comparison
The following paragraphs are the result of [PoC](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/144715) work.
From a product feature implementation point of view, both solutions seem viable, offering in their current state all the necessary tools to support the product feature requirements.
Given Elasticsearch's built-in capabilities, it is acknowledged that it might bring better long-term support, enabling a more powerful RAG solution in the future than `pg_vector`-based ones.
The current Elasticsearch integration only indexes `ActiveRecord` models and
source code from Git repositories. The further work required to build more
generic abstractions to index other data (for example, X-Ray reports)
has been defined in [issue 442197](https://gitlab.com/gitlab-org/gitlab/-/issues/442197).
To prevent suboptimal workarounds of the existing limitation,
which would create technical debt, it is advised that [issue 442197](https://gitlab.com/gitlab-org/gitlab/-/issues/442197)
be completed before Elasticsearch is selected as the main vector storage for RAG.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

View File

@ -1,305 +1,11 @@
---
status: ongoing
creation-date: "2024-02-20"
authors: [ "@maddievn", "@mikolaj_wawrzyniak", "@dgruzd" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@shinya.meda", "@mikolaj_wawrzyniak" ]
owning-stage: "~devops::data stores"
participating-stages: ["~devops::ai-powered", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/'
remove_date: '2025-07-08'
---
# Retrieval Augmented Generation (RAG) for GitLab
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/).
## Goals
The goal of this blueprint is to describe viable options for RAG at GitLab
across deployment types. The aim is to describe RAG implementations that provide
our AI features, and by extension our customers, with best-in-class user
experiences.
## Overview of RAG
RAG, or Retrieval Augmented Generation, involves several key process blocks:
- **Input Transformation**: This step involves processing the user's input,
which can vary from natural language text to JSON or keywords. For effective
query construction, we might utilize Large Language Models (LLMs) to format
the input into a standard expected format or to extract specific keywords.
- **Retrieval**: Here, we fetch relevant data from specified data sources, which
may include diverse storage engines like vector, graph, or relational
databases. It's crucial to conduct [data access checks](#data-access-policy)
during this phase. After retrieval, the data should be optimized for LLMs
through post-processing to enhance the quality of the generated responses.
- **Generation**: This phase involves crafting a prompt with the retrieved data
and submitting it to an LLM, which then generates an AI-powered response.
![Current page](img/blog_figure-1.jpg)
(Image from [Deconstructing RAG](https://blog.langchain.dev/deconstructing-rag/))
## Challenges of RAG
### Data for LLMs
Ensuring data is optimized for LLMs is crucial for consistently generating
high-quality AI responses. Several challenges exist when providing context to
LLMs:
- **Long Contexts:** Extensive contexts can degrade LLM performance, a
phenomenon known as the Lost in the Middle problem. Employing Rerankers can
enhance performance but may also increase computational costs due to longer
processing times.
- **Duplicate Contents:** Repetitive content can reduce the diversity of search
results. For instance, if a semantic search yields ten results indicating
"Tom is a president" but the eleventh reveals "Tom lives in the United States,"
solely using the top ten would omit critical information. Filtering out
duplicate content, for example, through Maximal Marginal Relevance (MMR), can
mitigate this issue.
- **Conflicting Information:** Retrieving conflicting data from multiple sources
can lead to LLM "hallucinations." For example, mixing sources that define
"RAG" differently can confuse the LLM. Careful source selection and content
curation are essential.
- **Irrelevant Content:** Including irrelevant data can negatively impact LLM
performance. Setting a threshold for relevance scores or considering that
certain irrelevant contents might actually enhance output quality are
strategies to address this challenge.
It's highly recommended to evaluate the optimal data format and size for
maximizing LLM performance, as the effects on performance and result quality can
vary significantly based on the data's structure.
References:
- [Benchmarking Methods for Semi-Structured RAG](https://youtu.be/KMZZh7Z5mno?si=-Gr-acXcjg7QXmBU)
- [Edge cases of semantic search](https://youtu.be/DY3sT4yIezs?feature=shared&t=1382)
#### Regenerating Embeddings
The AI field is evolving rapidly, and new models and approaches that could improve
our users' experience seem to appear daily, so we want to be conscious of model
switching costs. If we decide to swap models or change our chunking strategy (as two examples),
we will need to wipe our existing embeddings and do a full
replacement with embeddings from the new model or with the new text chunks, etc.
Factors to consider which could trigger the need for a full regeneration of
embeddings for the affected data include:
- A change in the optimal text chunk size
- A change in a preprocessing step which perhaps adds new fields to a text chunk
- Exclusion of content, such as the removal of a field that was previously embedded
- Addition of new metadata that needs to be embedded
### Multi-source Retrieval
Addressing complex queries may require data from multiple sources. For instance,
queries linking issues to merge requests necessitate fetching details from both.
GitLab Duo Chat, utilizing the
[ReACT framework](https://arxiv.org/abs/2210.03629), sequentially retrieves data from
PostgreSQL tables, which can prolong the retrieval process due to the sequential
execution of multiple tools and LLM inferences.
## Searching for Data
Choosing the appropriate search method is pivotal for feature design and UX optimization. Here are common search techniques:
### Semantic Search using embeddings
Semantic search shines when handling complex queries that demand an
understanding of the context or intent behind the words, not just the words
themselves. It's particularly effective for queries expressed in natural
language, such as full sentences or questions, where the overall meaning
outweighs the importance of specific keywords. Semantic search excels at
providing thorough coverage of a topic, capturing related concepts that may not
be directly mentioned in the query, thus uncovering more nuanced or indirectly
related information.
In the realm of semantic search, the K-Nearest Neighbors (KNN) method is
commonly employed to identify data segments that are semantically closer to the
user's input by using embeddings. To measure the semantic proximity, various methods are used:
- **Cosine Similarity:** Focuses solely on the direction of vectors.
- **L2 Distance (Euclidean Distance):** Takes into account both the direction
and magnitude of vectors.
These vectors, known as "embeddings," are created by
processing the data source through an embedding model. Currently, in GitLab
production, we utilize the `textembedding-gecko` model provided by Vertex AI.
However, there might be scenarios where you consider using alternative embedding
models, such as those available on HuggingFace, to reduce costs. Opting for
different models requires comprehensive evaluation and consultation,
particularly with the legal team, to ensure the chosen model's usage complies
with GitLab policies. See the
[Security, Legal, and Compliance](https://gitlab.com/gitlab-org/gitlab/-/blob/52f4fcb033d13f3d909a777728ba8f3fa2c93256/doc/architecture/blueprints/gitlab_duo_rag/index.md#security-legal-and-compliance)
section for more details. It's also important to note that multilingual support
can vary significantly across different embedding models, and switching models
may lead to regressions.
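A toy illustration of the two proximity measures (plain Ruby, not production code):
```ruby
# Dot product of two vectors of equal length.
def dot(a, b)
  a.zip(b).sum { |x, y| x * y }
end

# Cosine similarity: considers direction only.
def cosine_similarity(a, b)
  dot(a, b) / (Math.sqrt(dot(a, a)) * Math.sqrt(dot(b, b)))
end

# L2 (Euclidean) distance: considers direction and magnitude.
def l2_distance(a, b)
  Math.sqrt(a.zip(b).sum { |x, y| (x - y)**2 })
end

a = [1.0, 2.0, 3.0]
b = [2.0, 4.0, 6.0] # same direction, twice the magnitude

cosine_similarity(a, b) # => ~1.0  (identical direction)
l2_distance(a, b)       # => ~3.74 (magnitude difference still counts)
```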
For large datasets, it's advisable to implement indexes to enhance query
performance. The HNSW (Hierarchical Navigable Small World) method, combined with
approximate nearest neighbors (ANN) search, is a popular strategy for this
purpose. For insights into HNSW's effectiveness, consider reviewing
[benchmarks on its performance in large-scale applications](https://supabase.com/blog/increase-performance-pgvector-hnsw).
Due to the existing framework and scalability of Elasticsearch, embeddings will
be stored on Elasticsearch for large datasets such as
[issues](https://gitlab.com/gitlab-org/gitlab/-/issues/451431), merge requests,
etc. This will be used to perform [Hybrid Search](https://gitlab.com/gitlab-org/gitlab/-/issues/440424)
but will also be useful for other features such as finding duplicates, similar results or
categorizing documents.
### Keyword Search
Keyword search is the go-to method for straightforward, specific queries where
users are clear about their search intent and can provide precise terms or
phrases. This method is highly effective for retrieving exact matches, making it
suitable for searches within structured databases or when looking for specific
documents, terms, or phrases.
Keyword search operates on the principle of matching the query terms directly
with the content in the database or document collection, prioritizing results
that have a high frequency of the query terms. Its efficiency and directness
make it particularly useful for situations where users expect quick and precise
results based on specific keywords or phrases.
Elasticsearch uses the BM25 algorithm to perform keyword search.
If one of the existing [indexed document types](../../../integration/advanced_search/elasticsearch.md#advanced-search-index-scopes)
is not covered, a [new document type](../../../development/advanced_search.md#add-a-new-document-type-to-elasticsearch) can be added.
### Hybrid Search
Hybrid search combines the depth of semantic search with the precision of
keyword search, offering a comprehensive search solution that caters to both
context-rich and specific queries. By running both semantic and keyword searches
simultaneously, it integrates the strengths of both methods—semantic search's
ability to understand the context and keyword search's precision in identifying
exact matches.
The results from both searches are then combined, with their relevance scores
normalized to provide a unified set of results. This approach is particularly
effective in scenarios where queries may not be fully served by either method
alone, offering a balanced and nuanced response to complex search needs. The
computational demands of kNN searches, which are part of semantic search, are
contrasted with the relative efficiency of [BM25](https://pub.aimind.so/understanding-the-bm25-ranking-algorithm-19f6d45c6ce)
keyword searches, making hybrid search a strategic choice for optimizing
performance across diverse datasets.
The first hybrid search scope is for issues which combines keyword search with kNN matches using embeddings.
### Code Search
Like the other data types above, a source code search task can use different
search types, each more suited to address different queries.
Two code searches are available: `Elasticsearch` and `Zoekt`.
Elasticsearch provides blob search which supports [Advanced Search Syntax](../../../user/search/advanced_search.md#syntax).
[Zoekt](../code_search_with_zoekt/index.md) is employed on GitLab.com to provide
exact match keyword search and regular expression search capabilities for source
code.
Semantic search and hybrid search functionalities are yet to be
implemented for code.
### ID Search
Facilitates data retrieval using specific resource IDs, for example retrieving
data from an issue link or a shortcut.
See [ID search](postgresql.md#id-search) for more information.
### Knowledge Graph
Knowledge Graph search transcends the limitations of traditional search methods
by leveraging the interconnected nature of data represented in graph form.
Unlike semantic search, which focuses on content similarity, Knowledge Graph
search understands and utilizes the relationships between different data points,
providing a rich, contextual exploration of data.
This approach is ideal for queries that benefit from understanding the broader
context or the interconnectedness of data entities. Graph databases store
relationships alongside the data, enabling complex queries that can navigate
these connections to retrieve highly contextual and nuanced information.
Knowledge Graphs are particularly useful in scenarios requiring deep insight
into the relationships between entities, such as recommendation systems, complex
data analysis, and semantic querying, offering a dynamic way to explore and
understand large, interconnected datasets.
## Security, Legal and Compliance
### Data access policy
The retrieval process must comply with the
[GitLab Data Classification Standard](https://handbook.gitlab.com/handbook/security/data-classification-standard/).
If the user doesn't have access to the data, GitLab will not fetch the data for
building a prompt.
For example:
- When the data is GitLab Documentation (GREEN level), the data can be fetched
without authorizations.
- When the data is customer data such as issues, merge requests, etc (RED level),
the data must be fetched with proper authorizations based on permissions and roles.
If you're proposing to fetch data from an external public database
(e.g. fetching data from `arxiv.org` so the LLM can answer questions about
quantitative biology), please conduct a thorough review to ensure the external
data isn't inappropriate for GitLab to process.
### Data usage
Using a new embedding model or persisting data into a new storage would require
[legal reviews](https://handbook.gitlab.com/handbook/legal/). See the following
links for more information:
- [Data privacy](../../../user/gitlab_duo/data_usage.md#data-privacy)
- [Data retention](../../../user/gitlab_duo/data_usage.md#data-retention)
- [Training data](../../../user/gitlab_duo/data_usage.md#training-data)
## Evaluation
Evaluation is a crucial step in objectively determining the quality of the
retrieval process. Tailoring the retrieval process based on specific user
feedback can lead to biased optimizations, potentially causing regressions for
other users. It's essential to have a dedicated test dataset and tools for a
comprehensive quality assessment. For assistance with AI evaluation, please
reach out to the [AI Model Validation Group](https://handbook.gitlab.com/handbook/engineering/development/data-science/model-validation/).
## Before Implementing RAG
Before integrating Retrieval Augmented Generation (RAG) into your system, it's
important to evaluate whether it enhances the quality of AI-generated responses.
Consider these essential questions:
- **What does typical user input look like?**
- For instance, "Which class should we use to make an external HTTP request in this repository?"
- **What is the desired AI-generated response?**
- Example: "Within this repository, Class-A is commonly utilized for..."
- **What are the current responses from LLMs?** (This helps determine if the necessary knowledge is already covered by the LLM.)
- Example: Receiving a "Sorry, I don't have an answer for that." from the Anthropic Claude 2.1 model.
- **What data is required in the LLM's context window?**
- Example: The code for Class-A.
- **Consider the current search method used for similar tasks**. (Ask yourself: How would I currently search for this data with the tools at my disposal?)
- Example: Navigate to the code search page and look for occurrences of "http."
- **Have you successfully generated the desired AI response with sample data?** Experiment in a third-party prompt playground or Google Colab to test.
- **If contemplating semantic search**, it's **highly recommended** that you
develop a prototype first to ensure it meets your specific retrieval needs.
Semantic search may interpret queries differently than expected, especially
when the data source lacks natural language context, such as uncommented code.
In such cases, semantic search might not perform as well as traditional
keyword search methods. Here's [an example prototype](https://colab.research.google.com/drive/1K1gf6FibV-cjlXvTJPboQJtjYcSsyYi2?usp=sharing)
that demonstrates semantic search for CI job configurations.
## Evaluated Solutions
The following solutions have been validated with PoCs to ensure they meet the
basic requirements of vector storage and retrieval for GitLab Duo Chat with
GitLab documentation. Click the links to learn more about each solutions
attributes that relate to RAG:
- [PostgreSQL with PGVector](postgresql.md)
- [Elasticsearch](elasticsearch.md)
- [Google Vertex](vertex_ai_search.md)
To read more about the [GitLab Duo Chat PoCs](../gitlab_duo_rag/index.md) conducted, see:
- [PGVector PoC](../gitlab_duo_rag/postgresql.md)
- [Elasticsearch PoC](../gitlab_duo_rag/elasticsearch.md)
- [Google Vertex PoC](../gitlab_duo_rag/vertex_ai_search.md)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,140 +1,11 @@
---
status: proposed
creation-date: "2024-02-20"
authors: [ "@bvenker", "@mikolaj_wawrzyniak" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@shinya.meda", "@mikolaj_wawrzyniak" ]
owning-stage: "~devops::data stores"
participating-stages: ["~devops::ai-powered", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/postgresql/'
remove_date: '2025-07-08'
---
# PostgreSQL
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/postgresql/).
This page explains how to retrieve data from PostgreSQL for [RAG](index.md).
## Semantic search
### Overview
1. Install [PgVector extension](#vector-store-with-pgvector) to the PostgreSQL database.
1. Add a `vector` column to a new or existing table.
1. Data <=> Embedding synchronization
1. Load data which you want to search from.
1. Pass the data to an embedding model and get a vector.
1. Set the vector to the `vector` column.
1. Retrieval
1. Pass the user input to an embedding model and get a vector.
1. Get the nearest neighbors to the user input vector e.g. `SELECT * FROM a_table ORDER BY vector_column <-> '<user-input-vector>' LIMIT 5;`
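A minimal sketch of the first two steps as a Rails migration (the table and column names are placeholders; 768 dimensions assumed to match `textembedding-gecko`):
```ruby
# Sketch: enable the pgvector extension and add a 768-dimension vector column to an existing table.
class AddEmbeddingToATable < ActiveRecord::Migration[7.1]
  def up
    execute 'CREATE EXTENSION IF NOT EXISTS vector'
    execute 'ALTER TABLE a_table ADD COLUMN embedding vector(768)'
  end

  def down
    execute 'ALTER TABLE a_table DROP COLUMN embedding'
    # The extension is left installed, as other tables may use it.
  end
end
```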
### Vector store with PgVector
To store the embeddings for semantic search, we need to add a vector store in GitLab PostgreSQL.
This vector store can be added by installing [PgVector extension](https://github.com/pgvector/pgvector) (Postgres 12+ is required).
A vector store is currently running on GitLab.com and it's separately hosted from the main/CI databases.
Our current architecture of having a separate database for embeddings is probably ideal. We don't gain much by combining them, and because PGVector is new and will likely require a lot of experimentation to get performance at scale (today we only have a tiny amount of data in it), keeping it on a separate database gives us more room to experiment without impacting overall GitLab.com stability or the performance of the main database.
### Limitations
- It could be locked down to a specific embedding model, because you must specify the dimensions of the vector column.
- Vectors with up to 2,000 dimensions can be indexed.
### Performance and scalability implications
- Is there any guidance on how much data we can add to PostgreSQL (whether vector data or normal data)?
- Not really, as we do not usually just add data to the database; rather, it's a result of the instance being used. I don't see any specific [storage requirements](../../../install/requirements.md#storage). If the existing `vertex_gitlab_docs` table size is a good indicator, we can probably add this without causing much trouble, though having an option to opt in or opt out is preferable.
### Availability
- PostgreSQL is available in all GitLab installations (both CNG and Omnibus).
- Most major cloud providers have added PgVector to their offerings by now: Google Cloud SQL and AlloyDB, DigitalOcean, AWS RDS and Aurora, Azure Flexible and Cosmos, and so on. Some customers might need to upgrade to versions that support PGVector.
## ID search
### Overview
1. Execute a few-shot prompt to extract a resource identifier from the user input.
- For example, when a user asks `Can you summarize #12312312?`, the resource identifier is `12312312`, a GitLab issue.
1. Retrieve the record from PostgreSQL, for example `Issue.find(12312312)`.
1. Check if the user can read the resource.
1. Build a prompt with the retrieved data and pass it to an LLM to get an AI-generated response (see the sketch below).
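A rough Python sketch of this flow, with stand-in callables (`extract_identifier_with_llm`, `fetch_issue`, `user_can_read`, `ask_llm`) that are illustrative assumptions rather than GitLab code:
```python
# Sketch only: every helper passed in here is a hypothetical stand-in.
def answer_about_issue(user_input, user, extract_identifier_with_llm,
                       fetch_issue, user_can_read, ask_llm):
    issue_id = extract_identifier_with_llm(user_input)    # few-shot prompt -> "12312312"
    if issue_id is None:
        return "No resource identifier found in the question."

    issue = fetch_issue(issue_id)                         # equivalent of Issue.find(12312312)
    if issue is None or not user_can_read(user, issue):   # data access check
        return "You don't have access to this resource."

    # Build a prompt with the retrieved data and ask the LLM for the answer.
    prompt = f"Summarize the following issue:\n\nTitle: {issue['title']}\n\n{issue['description']}"
    return ask_llm(prompt)
```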
## PoC: Repository X Ray
Repository X Ray does not yet implement semantic search, so this section is based solely on a [prototype implementation](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/142912).
- Statistics (as of February 2024):
- Data type: JSON document with natural-language descriptions of source code libraries
- Data access level: Red (each JSON document belongs to a specific project, and access should adhere to the data access rules configured for that project)
- Data source: Repository X Ray report CI artifact
- Data size: N/A
- Example of user input: "# generate function that fetches sales report for vendor from App Store"
- Example of expected AI-generated response:
```python
def sales_reports(vendor_id)
  app_store_connect.sales_reports(
    filter: {
      report_type: 'SALES',
      report_sub_type: 'SUMMARY',
      frequency: 'DAILY',
      vendor_number: '123456'
    }
  )
end
```
### Synchronizing embeddings with data source
As with the [documentation example](../gitlab_duo_rag/postgresql.md#retrieve-gitlab-documentation), Repository X Ray report data is a derivative: it uses the underlying repository source code as its base,
and it must be synchronized with it whenever the source code changes.
Right now there is no synchronization mechanism that includes embeddings and vector storage. However, there is an existing pipeline that generates and stores Repository X Ray reports.
The ingestion pipeline performs the following steps:
1. A CI X Ray scanner job is triggered - a documentation [page](../../../user/project/repository/code_suggestions/repository_xray.md#enable-repository-x-ray) suggests limiting this job to run only when changes occur on the main repository branch. However, repository maintainers may configure trigger rules differently.
1. An X Ray [scanner](https://gitlab.com/gitlab-org/code-creation/repository-x-ray) locates and processes one of the supported [dependency files](../../../user/project/repository/code_suggestions/repository_xray.md#supported-languages-and-package-managers), producing JSON report files.
1. After the X Ray scanner job finishes successfully, a [background job](https://gitlab.com/gitlab-org/gitlab/-/blob/c6b2f18eaf0b78a4e0012e88f28d643eb0dfb1c2/ee/app/workers/ai/store_repository_xray_worker.rb#L18) is triggered in the GitLab Rails monolith that imports the JSON report into [`Projects::XrayReport`](https://gitlab.com/gitlab-org/gitlab/-/blob/bc2ad40b4b026dd359e289cf2dc232de1a2d3227/ee/app/models/projects/xray_report.rb#L22).
1. There can be only one Repository X Ray report per project and programming language; duplicate records are upserted during the import process.
As of today, there are 84 rows in the `xray_reports` table on GitLab.com.
### Retrieval
After a Repository X Ray report is imported, when the IDE extension sends a request for [code generation](../../../user/project/repository/code_suggestions/index.md), the report is retrieved in the following steps:
1. Fetch the embedding of the user input from the `textembedding-gecko` model (768 dimensions).
1. Query the `vertex_gitlab_docs` table to find the nearest neighbors. For example:
```sql
SELECT *
FROM vertex_gitlab_docs
ORDER BY vertex_gitlab_docs.embedding <=> '[vectors of user input]' -- nearest neighbors by cosine distance
LIMIT 10
```
1. The GitLab Rails monolith fetches the corresponding `xray_reports` record from the main database. `xray_reports` records are filtered on the `project_id` foreign key and the `lang` column.
1. From the retrieved record, the first 50 dependencies are added to a prompt that is forwarded to the AI Gateway.
### Current state overview
```mermaid
sequenceDiagram
actor USR as User
participant IDE
participant GLR as GitLabRails
participant RN as GitLabRunner
participant PG as GitLabPsqlMainDB
participant AIGW as AIGateway
USR->>+GLR: commits changes to Gemfile.lock
GLR->>RN: triggers Repository X Ray CI scanner job
RN->>GLR: Repository X Ray report
GLR->>GLR: triggers Repository X Ray ingestion job
GLR->>-PG: upserts xray_reports record
USR->>+IDE: types: "#35; generate function that fetches sales report for vendor from App Store"
IDE->>+GLR: trigger code generation for line ` "#35; generate function `
GLR->>PG: fetch X Ray report for project and language
PG->>GLR: xray_reports record
GLR->>GLR: include first 50 entities from xray report into code generation prompt
GLR->>-AIGW: trigger code generation ` "#35; generate function `
```
### Embeddings prospect application
As described in the retrieval section above, Repository X Ray currently follows a very naive approach that does not include any metric for assessing relevance between the report content and the user instruction. Applying embeddings and semantic search to X Ray reports therefore has a high potential to improve results by selecting a limited set of related entries from the report based on the user instruction.
To achieve that, embeddings should be generated during Repository X Ray ingestion. Additionally, the user instruction should be turned into an embedding vector to perform semantic search over the stored report data during retrieval.
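A hedged sketch of how that could look, assuming a hypothetical `xray_report_entries` table with an `embedding` vector column and an `embed()` helper; the current `xray_reports` schema has neither:
```python
# Sketch only: xray_report_entries, its embedding column, and embed() are hypothetical.
def to_pgvector(values):
    return "[" + ",".join(str(v) for v in values) + "]"

def ingest_xray_entries(conn, embed, project_id, lang, entries):
    """Store one embedding per X Ray report entry at ingestion time."""
    with conn.cursor() as cur:
        for description in entries:
            cur.execute(
                """
                INSERT INTO xray_report_entries (project_id, lang, payload, embedding)
                VALUES (%s, %s, %s, %s::vector)
                """,
                (project_id, lang, description, to_pgvector(embed(description))),
            )
    conn.commit()

def relevant_entries(conn, embed, project_id, lang, user_instruction, limit=10):
    """Select only the entries semantically closest to the user instruction."""
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT payload
            FROM xray_report_entries
            WHERE project_id = %s AND lang = %s
            ORDER BY embedding <=> %s::vector  -- cosine distance, as in the query above
            LIMIT %s
            """,
            (project_id, lang, to_pgvector(embed(user_instruction)), limit),
        )
        return [row[0] for row in cur.fetchall()]
```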
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,87 +1,11 @@
---
status: proposed
creation-date: "2024-02-20"
authors: [ "@bvenker", "@mikolaj_wawrzyniak" ]
coach: [ "@stanhu" ]
approvers: [ "@pwietchner", "@oregand", "@shinya.meda", "@mikolaj_wawrzyniak" ]
owning-stage: "~devops::data stores"
participating-stages: ["~devops::ai-powered", "~devops::create"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/vertex_ai_search/'
remove_date: '2025-07-08'
---
# Vertex AI Search
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_rag/vertex_ai_search/).
This page explains how to retrieve data from Google Vertex AI Search for [RAG](index.md).
## Overview
Some of our data are public resources that don't require a [data access check](index.md#data-access-policy) when retrieved.
These data are often identical across GitLab instances, so it's redundant to ingest the same data into every single database.
It'd be more efficient to serve the data from a single service.
We can use [Vertex AI Search](https://cloud.google.com/products/agent-builder?hl=en) in this case.
It can search at scale, with high queries per second (QPS), high recall, low latency, and cost efficiency.
This approach allows us to minimize code that we can't update on a customer's behalf, which means avoiding hard-coding AI-related logic in the GitLab monolith codebase. We can retain the flexibility to make changes in our product without asking customers to upgrade their GitLab version.
This is the same as the [AI Gateway](../ai_gateway/index.md)'s design principle.
```mermaid
flowchart LR
subgraph GitLab managed
subgraph AIGateway
VertexAIClient["VertexAIClient"]
end
subgraph Vertex AI Search["Vertex AI Search"]
subgraph SearchApp1["App"]
direction LR
App1DataStore(["BigQuery"])
end
subgraph SearchApp2["App"]
direction LR
App2DataStore(["Cloud Storage / Website URLs"])
end
end
end
subgraph SM or SaaS GitLab
DuoFeatureA["Duo feature A"]
DuoFeatureB["Duo feature B"]
end
DuoFeatureA -- Semantic search --- VertexAIClient
DuoFeatureB -- Semantic search --- VertexAIClient
VertexAIClient -- Search from Gitlab Docs --- SearchApp1
VertexAIClient -- Search from other data store --- SearchApp2
```
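For illustration, a retriever in the AI Gateway could query Vertex AI Search roughly as in the sketch below, using the public `google-cloud-discoveryengine` client; the project, data store, and serving config IDs are placeholders, and this is not the actual AI Gateway code:
```python
# Sketch only: PROJECT_ID and DATA_STORE_ID are placeholders, not GitLab's deployment.
from google.cloud import discoveryengine_v1 as discoveryengine

def search_gitlab_docs(query, page_size=5):
    client = discoveryengine.SearchServiceClient()
    serving_config = (
        "projects/PROJECT_ID/locations/global/collections/default_collection/"
        "dataStores/DATA_STORE_ID/servingConfigs/default_config"
    )
    request = discoveryengine.SearchRequest(
        serving_config=serving_config,
        query=query,
        page_size=page_size,
    )
    # The response is paginated; each result carries the matched document.
    return [result.document for result in client.search(request)]
```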
## Limitations
- Data **must be** [GREEN level](index.md#data-access-policy) and publicly shareable.
- Examples:
- GitLab documentation (`gitlab-org/gitlab/doc`, `gitlab-org/gitlab-runner/docs`, `gitlab-org/omnibus-gitlab/doc`, etc.)
- Dynamically construct few-shot prompt templates with [Example selectors](https://python.langchain.com/v0.1/docs/modules/model_io/prompts/example_selectors/).
**IMPORTANT: We do NOT persist customer data into Vertex AI Search. See the other solutions for persisting customer data.**
## Performance and scalability implications
- GitLab-side: Vertex AI Search can [search at scale, with high queries per second (QPS), high recall, low latency, and cost efficiency](https://cloud.google.com/vertex-ai/docs/vector-search/overview).
- GitLab-side: Vertex AI Search supports [global and multi-region deployments](https://cloud.google.com/generative-ai-app-builder/docs/locations).
- Customer-side: The outbound requests from their GitLab Self-managed instances could cause more network latency than retrieving from a local vector store.
This latency issue is addressable by multi-region deployments.
## Availability
- Customer-side: Air-gapped solutions can't be supported due to the required access to AI Gateway (`cloud.gitlab.com`).
This concern is negligible because GitLab Duo already requires this access.
- Customer-side: Since the service is the single point of failure, retrievers stop working when the service is down.
## Cost implications
- GitLab-side: See [Vertex AI Search pricing](https://cloud.google.com/generative-ai-app-builder/pricing).
- Customer-side: No additional cost required.
## Maintenance
- GitLab-side: GitLab needs to maintain the data store (for example, structured data in BigQuery or unstructured data in Cloud Storage). Google automatically detects the schema and indexes the stored data.
- Customer-side: No maintenance required.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.


View File

@ -1,30 +1,11 @@
---
owning-stage: "~devops::verify"
description: 'GitLab Steps ADR 001: Bootstrap Step Runner'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/decisions/001_initial_support/'
remove_date: '2025-07-08'
---
# GitLab Steps ADR 001: Bootstrap Step Runner
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/decisions/001_initial_support/).
## Context
[GitLab Steps](../index.md) is a new feature that does not have any prior usage at GitLab.
We decided that there are two important objectives at this stage of the project:
- Integrate the project into existing CI pipelines for the purpose of user evaluation as part of an [experiment](../../../../policy/experiment-beta-support.md#experiment) phase.
- Provide a contribution framework for other developers in the form of a project with contribution guidelines.
## Decision
The [GitLab Steps: Iteration 1: Bootstrap Step Runner (MVC)](https://gitlab.com/groups/gitlab-org/-/epics/11736)
was created to achieve the following objectives:
- We defined the initial plan to bootstrap the project.
- The project will be stored in [`gitlab-org/step-runner`](https://gitlab.com/gitlab-org/step-runner).
- We will implement the [Step Definition](../step-definition.md) as a [Protocol Buffer](https://protobuf.dev/). The initial implementation is described in the [Baseline Step Proto](../implementation.md).
- Usage of [Protocol Buffers](https://protobuf.dev/) will provide strong guards for the minimal required definition to be used by the project.
- We will provide documentation on how to use GitLab Steps in existing CI pipelines.
## Alternatives
No alternatives were considered at this phase, since there's no pre-existing work at GitLab
for that type of feature.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,247 +1,11 @@
---
owning-stage: "~devops::verify"
description: Usage of the [GitLab Steps](index.md) with [`.gitlab-ci.yml`](../../../ci/yaml/index.md).
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/gitlab-ci/'
remove_date: '2025-07-08'
---
# Usage of the [GitLab Steps](index.md) with [`.gitlab-ci.yml`](../../../ci/yaml/index.md)
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/gitlab-ci/).
This document describes how [GitLab Steps](index.md) are integrated into the `.gitlab-ci.yml`.
GitLab Steps will be integrated using a three-stage execution cycle
and replace `before_script:`, `script:` and `after_script:`.
- `setup:`: Execution stage responsible for provisioning the environment,
including cloning the repository, restoring artifacts, or installing all dependencies.
This stage will replace the implicit repository clone, artifact restore, and cache download.
- `run:`: Execution stage responsible for running a test, build,
or any other main command required by that job.
- `teardown:`: Execution stage responsible for cleaning the environment,
uploading artifacts, or storing cache. This stage will replace implicit
artifacts and cache uploads.
Before we can achieve three-stage execution we will ship minimal initial support
that does not require any prior GitLab integration.
## Phase 1: Initial support
Initially the Step Runner will be used externally, without any prior dependencies
to GitLab:
- The `step-runner` will be provided as part of a container image.
- The `step-runner` will be explicitly run in the `script:` section.
- The `$STEPS` environment variable will be executed as [`type: steps`](step-definition.md#the-steps-step-type).
```yaml
hello-world:
image: registry.gitlab.com/gitlab-org/step-runner
variables:
STEPS: |
- step: gitlab.com/josephburnett/component-hello-steppy@master
inputs:
greeting: "hello world"
script:
- /step-runner ci
```
## Phase 2: The addition of `run:` to `.gitlab-ci.yml`
In Phase 2 we will add `run:` as a first class way to use GitLab Steps:
- `run:` will use a [`type: steps`](step-definition.md#the-steps-step-type) syntax.
- `run:` will replace usage of `before_script`, `script` and `after_script`.
- All existing functions to support Git cloning, artifacts, and cache would continue to be supported.
- It is yet to be defined how we would support `after_script`, which is executed unconditionally
or when the job is canceled.
- `run:` will not be allowed to be combined with `before_script:`, `script:` or `after_script:`.
- GitLab Rails would not parse `run:`, instead it would only perform static validation
with a JSON schema provided by the Step Runner.
```yaml
hello-world:
image: registry.gitlab.com/gitlab-org/step-runner
run:
- step: gitlab.com/josephburnett/component-hello-steppy@master
inputs:
greeting: "hello world"
```
The following example would **fail** syntax validation:
```yaml
hello-world:
image: registry.gitlab.com/gitlab-org/step-runner
run:
- step: gitlab.com/josephburnett/component-hello-steppy@master
inputs:
greeting: "hello world"
script: echo "This is an ambiguous and invalid example"
```
### Transitioning from `before_script:`, `script:` and `after_script:`
GitLab Rails would automatically convert the `*script:` syntax into relevant `run:` specification:
- Today `before_script:` and `script:` are joined together as a single script for execution.
- The `after_script:` section is always executed in a separate context, representing a separate step to be executed.
- It is yet to be defined how we would retain the existing behavior of `after_script`, which is always executed
regardless of the job status or timeout, and uses a separate timeout.
- We would retain all implicit behavior which defines all environment variables when translating `script:`
into step-based execution.
For example, this CI/CD configuration:
```yaml
hello-world:
before_script:
- echo "Run before_script"
script:
- echo "Run script"
after_script:
- echo "Run after_script"
```
Could be translated into this equivalent specification:
```yaml
hello-world:
run:
- step: gitlab.com/gitlab-org/components/steps/legacy/script@v1.0
inputs:
script:
- echo "Run before_script"
- echo "Run script"
- step: gitlab.com/gitlab-org/components/steps/legacy/script@v1.0
inputs:
script:
- echo "Run after_script"
when: always
```
## Phase 3: The addition of `setup:` and `teardown:` to `.gitlab-ci.yml`
The addition of `setup:` and `teardown:` will replace the implicit functions
provided by GitLab Runner: Git clone, artifacts and cache handling:
- The usage of `setup:` would stop GitLab Runner from implicitly cloning the repository.
- `artifacts:` and `cache:`, when specified, would be translated and appended to `setup:` and `teardown:`
to provide backward compatibility for the old syntax.
- `release:`, when specified, would be translated and appended to `teardown:`
to provide backward compatibility for the old syntax.
- `setup:` and `teardown:` could be used in `default:` to simplify support
of common workflows like where the repository is cloned, or how the artifacts are handled.
- The split into 3-stage execution additionally improves composability of steps with `extends:`.
- The `hooks:pre_get_sources_script` would be implemented similar to [`script:`](#transitioning-from-before_script-script-and-after_script)
and be prepended to `setup:`.
For example, this CI/CD configuration:
```yaml
rspec:
script:
- echo "This job uses a cache."
artifacts:
paths: [binaries/, .config]
cache:
key: binaries-cache
paths: [binaries/*.apk, .config]
```
Could be translated into this equivalent specification executed by a step runner:
```yaml
rspec:
setup:
- step: gitlab.com/gitlab-org/components/git/clone@v1.0
- step: gitlab.com/gitlab-org/components/artifacts/download@v1.0
- step: gitlab.com/gitlab-org/components/cache/restore@v1.0
inputs:
key: binaries-cache
run:
- step: gitlab.com/gitlab-org/components/steps/legacy/script@v1.0
inputs:
script:
- echo "This job uses a cache."
teardown:
- step: gitlab.com/gitlab-org/components/artifacts/upload@v1.0
inputs:
paths: [binaries/, .config]
- step: gitlab.com/gitlab-org/components/cache/restore@v1.0
inputs:
key: binaries-cache
paths: [binaries/*.apk, .config]
```
### Inheriting common operations with `default:`
`setup:` and `teardown:` are likely to become very verbose over time. One way to simplify them
is to allow inheriting the common `setup:` and `teardown:` operations
with `default:`.
The previous example could be simplified to:
```yaml
default:
setup:
- step: gitlab.com/gitlab-org/components/git/clone@v1.0
- step: gitlab.com/gitlab-org/components/artifacts/download@v1.0
- step: gitlab.com/gitlab-org/components/cache/restore@v1.0
inputs:
key: binaries-cache
teardown:
- step: gitlab.com/gitlab-org/components/artifacts/upload@v1.0
inputs:
paths: [binaries/, .config]
- step: gitlab.com/gitlab-org/components/cache/restore@v1.0
inputs:
key: binaries-cache
paths: [binaries/*.apk, .config]
rspec:
run:
- step: gitlab.com/gitlab-org/components/steps/legacy/script@v1.0
inputs:
script:
- echo "This job uses a cache."
linter:
run:
- step: gitlab.com/gitlab-org/components/steps/legacy/script@v1.0
inputs:
script:
- echo "Run linting"
```
### Parallel jobs and `setup:`
With the introduction of `setup:` at some point in the future we will introduce
an efficient way to parallelize the jobs:
- `setup:` would define all steps required to provision the environment.
- The result of `setup:` would be snapshotted and distributed as the base
for all parallel jobs, if `parallel: N` is used.
- The `run:` and `teardown:` would be run on top of the cloned job and all its services.
- The runner would control and intelligently distribute all parallel
jobs, significantly cutting the resource requirements for fixed
parts of the job (Git clone, artifacts, installing dependencies).
```yaml
rspec-parallel:
image: ruby:3.2
services: [postgres, redis]
parallel: 10
setup:
- step: gitlab.com/gitlab-org/components/git/clone@v1.0
- step: gitlab.com/gitlab-org/components/artifacts/download@v1.0
inputs:
jobs: [setup-all]
- script: bundle install --without production
run:
- script: bundle exec knapsack
```
Potential GitLab Runner flow:
1. Runner receives the `rspec-parallel` job with `setup:` and `parallel:` configured.
1. Runner executes a job on top of a Kubernetes cluster using block volumes, up to the end of `setup:`.
1. Runner then runs 10 parallel jobs in Kubernetes, overlaying the block volume from step 2,
and continues execution of `run:` and `teardown:`.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,339 +1,11 @@
---
owning-stage: "~devops::verify"
description: Implementation details for [CI Steps](index.md).
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/implementation/'
remove_date: '2025-07-08'
---
# Design and implementation details
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/implementation/).
## Baseline Step Proto
The internals of Step Runner operate on the baseline step definition
which is defined in Protocol Buffer. All GitLab CI steps (and other
supported formats such as GitHub Actions) compile / fold to baseline steps.
Both step invocations in `.gitlab-ci.yml` and step definitions
in `step.yml` files will be compiled to baseline structures.
The term "step" means "baseline step" for the remainder of this document.
Each step includes a reference `ref` in the form of a URI. The method of
retrieval is determined by the protocol of the URI.
Steps and step traces have fields for inputs, outputs,
environment variables and environment exports.
After steps are downloaded and the `step.yml` is parsed,
a step definition `def` will be added.
If a step defines multiple additional steps, then the
trace will include sub-traces for each sub-step.
```protobuf
message Step {
string name = 1;
string step = 2;
map<string,string> env = 3;
map<string,google.protobuf.Value> inputs = 4;
}
message Definition {
DefinitionType type = 1;
Exec exec = 2;
repeated Step steps = 3;
message Exec {
repeated string command = 1;
string work_dir = 2;
}
}
enum DefinitionType {
definition_type_unspecified = 0;
exec = 1;
steps = 2;
}
message Spec {
Content spec = 1;
message Content {
map<string,Input> inputs = 1;
message Input {
InputType type = 1;
google.protobuf.Value default = 2;
}
}
}
enum InputType {
spec_type_unspecified = 0;
string = 1;
number = 2;
bool = 3;
struct = 4;
list = 5;
}
message StepResult {
Step step = 1;
Spec spec = 2;
Definition def = 3;
enum Status {
unspecified = 0;
running = 1;
success = 2;
failure = 3;
}
Status status = 4;
map<string,Output> outputs = 5;
message Output {
string key = 1;
string value = 2;
bool masked = 3;
}
map<string,string> exports = 6;
int32 exit_code = 7;
repeated StepResult children_step_results = 8;
}
```
## Step Caching
Steps are cached locally by a key comprised of `location`
(URL), `version` and `hash`. This prevents the exact same component
from being downloaded multiple times. The first time a step is
referenced it will be downloaded (unless local) and the cache will
return the path to the folder containing `step.yml` and the other
step files. If the same step is referenced again, the same folder
will be returned without downloading.
If a step is referenced which differs by version or hash from another
cached step, it will be re-downloaded into a different folder and
cached separately.
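A minimal sketch of that cache keying (Step Runner itself is written in Go; this Python sketch with a hypothetical `download()` helper only illustrates the behavior):
```python
# Sketch only: the cache layout and download() helper are assumptions for illustration.
import hashlib
import os

CACHE_ROOT = os.path.expanduser("~/.cache/step-runner/steps")

def step_cache_dir(location, version, file_hash):
    """Derive a stable cache directory from the (location, version, hash) key."""
    key = hashlib.sha256(f"{location}@{version}#{file_hash}".encode()).hexdigest()
    return os.path.join(CACHE_ROOT, key)

def fetch_step(location, version, file_hash, download):
    """Return the cached step folder, downloading only on a cache miss."""
    path = step_cache_dir(location, version, file_hash)
    if not os.path.isdir(path):              # same key is never downloaded twice
        os.makedirs(path, exist_ok=True)
        download(location, version, path)    # places step.yml and other step files in path
    return path
```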
## Execution Context
State is kept by Step Runner across all steps in the form of
an execution context. The context contains the output of each step,
environment variables and overall job and environment metadata.
The execution context can be referenced by expressions in
GitLab CI steps provided by the workflow author.
Example of context available to expressions in `.gitlab-ci.yml`:
```yaml
steps:
previous_step:
outputs:
name: "hello world"
env:
EXAMPLE_VAR: "bar"
job:
id: 1234
```
Expressions in step definitions can also reference execution
context. However they can only access overall
job and environment metadata and the inputs defined in `step.yml`.
They cannot access the outputs of previous steps. In order to
provide the output of one step to the next, the step input
values should include an expression which references another
step's output.
Example of context available to expressions in `step.yml`:
```yaml
inputs:
name: "foo"
env:
EXAMPLE_VAR: "bar"
job:
id: 1234
```
For example, this is not allowed in a `step.yml` file because steps
should not couple to one another.
```yaml
spec:
inputs:
name:
---
type: exec
exec:
command: [echo, hello, ${{ steps.previous_step.outputs.name }}]
```
This is allowed because the GitLab CI steps syntax passes data
from one step to another:
```yaml
spec:
inputs:
name:
---
type: exec
exec:
command: [echo, hello, ${{ inputs.name }}]
```
```yaml
steps:
- name: previous_step
...
- name: greeting
inputs:
name: ${{ steps.previous_step.outputs.name }}
```
Therefore evaluation of expressions will be done in two different kinds
of context: one as a GitLab CI step and one as a step definition.
### Step Inputs
Step inputs can be given in several ways. They can be embedded
directly into expressions in an `exec` command (as above). Or they
can be embedded in expressions for environment variables set during
exec:
```yaml
spec:
inputs:
name:
---
type: exec
exec:
command: [greeting.sh]
env:
NAME: ${{ inputs.name }}
```
### Input Types
Input values are stored as strings. But they can also have a type
associated with them. Supported types are:
- `string`
- `bool`
- `number`
- `object`
String type values can be any string. Bool type values must be either `true`
or `false` when parsed as JSON. Number type values must be a valid float64
when parsed as JSON. Object types will be a JSON serialization of
the YAML input structure.
For example, these would be valid inputs:
```yaml
steps:
- name: my_step
inputs:
foo: bar
baz: true
bam: 1
```
Given this step definition:
```yaml
spec:
inputs:
foo:
type: string
baz:
type: bool
bam:
type: number
---
type: exec
exec:
command: [echo, ${{ inputs.foo }}, ${{ inputs.baz }}, ${{ inputs.bam }}]
```
And it would output `bar true 1`
For an object type, these would be valid inputs:
```yaml
steps:
name: my_step
inputs:
foo:
steps:
- name: my_inner_step
inputs:
name: steppy
```
Given this step definition:
```yaml
spec:
inputs:
foo:
type: object
---
type: exec
exec:
command: [echo, ${{ inputs.foo }}]
```
And it would output `{"steps":[{"name":"my_inner_step","inputs":{"name":"steppy"}}]}`
### Outputs
Output files are created into which steps can write their
outputs and environment variable exports. The file locations are
provided in `OUTPUT_FILE` and `ENV_FILE` environment variables.
After execution, Step Runner will read the output and environment
variable files and populate the trace with their values. The
outputs will be stored under the context for the executed step,
and the exported environment variables will be merged with the environment
provided to the next step.
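As an illustration, a step implemented as a small script could report its outputs and exports like this, assuming the dotenv-style `key=value` line format that the step definition document describes for `$OUTPUT_FILE` (the exact format is an assumption here):
```python
# Sketch only: assumes one key=value pair per line in the files named by
# the OUTPUT_FILE and ENV_FILE environment variables.
import os

def write_step_results(outputs, exports):
    with open(os.environ["OUTPUT_FILE"], "a") as f:
        for key, value in outputs.items():
            f.write(f"{key}={value}\n")   # collected into the step trace by Step Runner
    with open(os.environ["ENV_FILE"], "a") as f:
        for key, value in exports.items():
            f.write(f"{key}={value}\n")   # merged into the environment of subsequent steps

write_step_results({"code_coverage": "87.5"}, {"BUILD_ID": "1234"})
```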
Some steps can be of type `steps` and be composed of a sequence
of GitLab CI steps. These will be compiled and executed in sequence.
Any environment variables exported by nested steps will be available
to subsequent steps, and will be available to higher-level steps
when the nested steps are complete. That is, entering nested steps does
not create a new "scope" or context object; environment variables
are global.
## Containers
We've tried a couple of approaches to running steps in containers.
In the end we've decided to delegate steps entirely to a step runner
in the container.
Here are the options considered:
### Delegation (chosen option)
A provision is made for passing complex structures to steps, which
is to serialize them as JSON (see Inputs above). In this way the actual
step to be run can be merely a parameter to a step running in a container.
So the outer step is a `docker/run` step with a command that executes
`step-runner` with a `steps` input parameter. The `docker/run` step will
run the container and then extract the output files from the container
and re-emit them to the outer steps.
This same technique will work for running steps in VMs or other execution environments.
Step Runner doesn't have to know anything about containerizing or
isolating steps.
### Special Compilation (rejected option)
When we see the `image` keyword in a GitLab CI step we would download
and compile the "target" step. Then manufacture a `docker/run` step
and pass the compiled `exec` command as an input. Then we would compile
the `docker/run` step and execute it.
However, this requires Step Runner to know how to construct a `docker/run`
step, which couples Step Runner to the method of isolation, making
isolation in VMs and other methods more complicated.
### Native Docker (rejected option)
The baseline step can include provisions for running a step in a
Docker container. For example the step could include a `ref` "target"
field and an `image` field.
However this also couples Step Runner with Docker and expands the role
of Step Runner. It is preferable to make Docker an external step
that Step Runner execs in the same way as any other step.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,239 +1,11 @@
---
status: proposed
creation-date: "2023-08-23"
authors: [ "@ayufan", "@josephburnett" ]
coach: "@grzegorz"
approvers: [ "@dhershkovitch", "@DarrenEastman", "@cheryl.li" ]
owning-stage: "~devops::verify"
participating-stages: [ ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/'
remove_date: '2025-07-08'
---
# Step Runner for executing GitLab Steps
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/).
## Summary
This document describes the architecture of a new component called Step Runner, the GitLab Steps syntax it uses,
and how GitHub Actions support will be achieved.
The competing CI products [drone.io](https://www.drone.io/) and
[GitHub Actions](https://docs.github.com/en/actions/creating-actions)
offer composable CI job execution in the form of steps, or actions.
Their usage and our prior evaluation of [GitLab Runner Plugins](https://gitlab.com/gitlab-org/gitlab/-/issues/15067)
show a need for a better way to define CI job execution.
## Glossary
- GitLab Steps: a name of GitLab CI feature to define and use reusable components
within a single job execution context.
- Step Runner: an RFC implementation for GitLab Steps that provides compatibility with GitHub Actions.
- GitHub Actions: similar to GitLab Steps, a reusable execution component used on GitHub.
- CI Catalog: a public or private component catalog that could be used to discover and use shared components.
- GitLab Rails: a main application responsible for pipeline execution, running on GitLab.com or on-premise installation.
## Motivation
Even though the current `.gitlab-ci.yml` file is reasonably flexible, it easily becomes unwieldy
when supporting complex workflows. This complexity shows up as repetitive patterns,
purpose-specific syntax, or long sequences of commands to execute.
This is particularly challenging because the `.gitlab-ci.yml` file
is inflexible for more complex workflows that require fine-tuning or special behavior
in CI job execution. Its prescriptive approach to how Git cloning is handled,
when artifacts are downloaded, or how the shell script is executed quite often
results in the need to work around the system for pipelines that are not "standard"
or when new features are requested.
This proves especially challenging when trying to add new syntax to the
`.gitlab-ci.yml` file
to support a specific feature, like [`secure files`](../../../ci/secure_files/index.md)
or the `release:` keyword. Adding these special features at the syntax level
results in a more complex configuration that is harder to maintain and makes technical debt
harder to deal with when requirements change.
The example of `drone.io` and `GitHub Actions` shows that a lot of workflows do not
have to be part of the CI syntax. Instead, they can be provided in the form of reusable components
that are configured in a generic way in the CI config, and later downloaded and executed according
to inputs and parameters.
GitLab Steps is meant to fill that product gap by following a similar model to competitors
and, to some extent, staying compatible with them. GitLab Steps is meant to replace all
purpose-specific syntax for handling specific features. Providing and using reusable components
that are built outside of `.gitlab-ci.yml`, versioned, and requested when needed
gives customers much more flexibility and allows us to iterate on a catalog much faster.
The reusable components that are part of a CI job execution could be used from a publicly hosted
repository on GitLab.com, from an on-premise repository of steps, or fetched from a local project.
Each CI job would define a list of `steps:` to execute that reference GitLab Steps
or GitHub Actions. Those steps would be executed by the step runner directly in the context of
the target environment. GitLab Runner would be responsible for the connection between GitLab.com
(or an on-premise installation) and Step Runner.
### Goals
GitLab Steps:
- GitLab Steps defines a syntax and structure for GitLab specific Steps implementation.
- GitLab Steps are published in CI Catalog.
- GitLab Steps can be used across instances (federation).
- GitLab Steps define `inputs` and `outputs`.
- GitLab Steps need to explicitly request sensitive information with the expected permissions.
For example: secrets, variables, and tokens.
GitLab Inc. managed repository of GitLab Steps:
- GitLab Inc. provides a repository of GitLab Steps that are a drop-in replacement
for all current purpose-specific syntax: `artifacts:`, `cache:`, `release:`, etc.
- GitLab Inc. will provide a generic step to execute `shell` steps supporting various
shells (`bash`, `powershell`).
- The usage of purpose-specific syntax might be eventually deprecated in favor of steps.
Step Runner:
- Step Runner is hosted in a separate project in `https://gitlab.com/gitlab-org`.
- Step Runner can be used to execute most of GitHub Actions.
- Step Runner is run as a process in a target environment.
- Step Runner can be used by a user on their local machine to run steps of a specific CI job
from a locally stored `.gitlab-ci.yml`.
- Step Runner is an external component to GitLab Runner; GitLab Runner provisions the
environment, constructs the payload, and passes execution to Step Runner.
- Step Runner is to replace all custom handling in GitLab Runner for `clone`, `artifacts`,
`caches`, `script` and `after_script`, and custom handling for all different shells (`bash`, `powershell`).
- Step Runner is responsible for parsing and compiling GitLab Steps and GitHub Actions.
- Step Runner is responsible for downloading, and managing repositories required by GitLab Steps and GitHub Actions.
- Step Runner controls and monitors the execution flow of individual steps.
- Step Runner is required to be executable from the command-line interface (CLI). This means it can be configured via a config file
or an environment file, or can read `.gitlab-ci.yml` directly.
- Step Runner can expose a gRPC or other programmable interface to submit a run configuration or retrieve a trace.
Steps Execution:
- Each Step is defined by a single published or locally defined GitLab Step, or GitHub Action.
- Each Step is executed depending on conditions that are defined by that step.
- Each Step is executed with the least amount of information exposed. Information exposed to a step
is requested explicitly by the step. For example: only environment variables explicitly
requested by the step will be passed to it.
- Each Step is considered untrusted. This means that even though some steps are trusted, the whole
CI job should be considered untrusted, because the system cannot guarantee trust.
- Each Step describes its execution in the form of preconditions, versions used, and output produced.
This is meant to allow signing step execution for the purpose of creating reproducible builds.
Backward compatibility:
- All currently executable syntax (for example: `before_script:`, `script:`, `artifacts:`, `cache:`, etc.)
should be convertible by GitLab (Rails).
## Non-Goals
TBD
## Proposal
Step Runner will be a new Go binary that lives at `https://gitlab.com/gitlab-org/step-runner`.
It will be able to accept a number of input formats which are compiled to a standard proto format.
Output will be a standard proto trace which will include details for debugging and reproducing the build.
### Capabilities
- Read steps
- from environment variable
- from `.gitlab-ci.yml` file
- from gRPC server in step-runner
- from commandline JSON input
- Compile GitLab Steps and GitHub Actions to a baseline step definition
- explicit inputs and outputs
- explicit environment and exports
- baseline steps can be type `exec` or more steps
- Download and run steps from:
- Git repos
- zip files
- locally provided
- A job can be composed of different kinds of steps
- steps can come from different sources and be run in different ways
- steps can access environment exports and output of previous steps
- Produce a step-by-step trace of execution
- including final inputs and outputs
- including final environment and exports
- including logs of each step
- each step specifies the exact runtime and component used (hash)
- (optional) masking sensitive inputs, outputs, environment and exports
- Replaying a trace
- reuses the exact runtimes and components from trace
- output of trace will be the same trace if build is deterministic
### Example invocations
#### Command line
- `STEPS=$(cat steps.yml) step-runner ci`
- `step-runner local .gitlab-ci.yml --format gitlab-ci --job-name hello-world --output-file trace.json`
- `step-runner replay trace.json`
- `step-runner ci --port 8080`
#### GitLab CI
```yaml
hello-world:
image: registry.gitlab.com/gitlab-org/step-runner
variables:
STEPS: |
- step: gitlab.com/josephburnett/component-hello-steppy@master
inputs:
greeting: "hello ${{ env.name }}"
env:
name: world
script:
- /step-runner ci
artifacts:
paths:
- trace.json
```
### Basic compilation and execution process
Steps as expressed in GitLab CI are compiled to the baseline step definition.
Referenced steps are loaded and compiled to produce an `exec` command,
or to produce an additional list of GitLab CI steps which are compiled recursively.
Each step is executed immediately after compilation so its output will be available for subsequent compilations.
![diagram of data during compilation](data.drawio.png)
Steps return outputs and exports via files which are collected by Step Runner after each step.
Finally all the compiled inputs and outputs for each step are collected in a step trace.
![sequenced diagram of step runner compilation and execution](step-runner-sequence.drawio.png)
### GitLab Steps definition and syntax
- [Step Definition](step-definition.md).
- [Syntactic Sugar extensions](steps-syntactic-sugar.md).
### Integration of GitLab Steps
- [Usage of the GitLab Steps with `.gitlab-ci.yml`](gitlab-ci.md).
- [Runner Integration](runner-integration.md).
## Design and implementation details
### 2023-11-28 - GitLab Steps ADR 001: Bootstrap Step Runner
- See the [GitLab Steps ADR 001: Bootstrap Step Runner](decisions/001_initial_support.md).
- See the [Baseline Step Proto](implementation.md).
## References
- [GitLab Issue #215511](https://gitlab.com/gitlab-org/gitlab/-/issues/215511)
- [Step Runner Code](https://gitlab.com/josephburnett/step-runner/-/tree/blueprint2).
This is the exploratory code created during the writing of this blueprint.
It shows the structure of the Step Runner binary and how the pieces fit together.
It runs but doesn't quite do the right thing (see all the TODOs).
- [CI Steps / CI Events / Executors / Taskonaut (video)](https://youtu.be/nZoO547IISM).
Some high-level discussion about how these 4 blueprints relate to each other.
And a good prequel to the video about this MR.
- [Steps in Runner (video)](https://youtu.be/82WLQ4zHYts).
A walk through of the Step Runner details from the code perspective.
- [CI YAML keywords](https://gitlab.com/gitlab-org/gitlab/-/issues/398129#note_1324467337).
An inventory of affected keywords.
- [GitLab Epic 11535](https://gitlab.com/groups/gitlab-org/-/epics/11535)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,293 +1,11 @@
---
owning-stage: "~devops::verify"
description: Runner integration for [CI Steps](index.md).
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/runner-integration/'
remove_date: '2025-07-08'
---
# Runner Integration
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/runner-integration/).
## Non goals
This proposal does not address deployment of the Step Runner binary into
target environments, nor starting of the Step Runner gRPC service
described below. The rest of the proposal assumes both that the Step
Runner binary exists in the target environment and that the gRPC service
is running and listening on a local socket. Similarly, this proposal does
not address the lifecycle of the `Step Runner` service, such as how to handle
restarting the service if it dies, or upgrades.
See [Deployment and Lifecycle Management](service-deployment.md) for the relevant blueprint.
## Steps Service gRPC Definition
The Step Runner service gRPC definition is as follows:
```proto
service StepRunner {
rpc Run(RunRequest) returns (RunResponse);
rpc FollowSteps(FollowStepsRequest) returns (stream FollowStepsResponse);
rpc FollowLogs(FollowLogsRequest) returns (stream FollowLogsResponse);
rpc Finish(FinishRequest) returns (FinishResponse);
rpc Status(StatusRequest) returns (StatusResponse);
}
message Variable {
string key = 1;
string value = 2;
bool file = 3;
bool masked = 4;
}
message Job {
repeated Variable variables = 1;
string job_id = 2;
string pipeline_id = 3;
string build_dir = 4;
repeated string token_prefixes = 5;
}
message Masking {
repeated string phrases = 1;
repeated string token_prefixes = 2;
}
message RunRequest {
string id = 1;
string work_dir = 2;
map<string,string> env = 3;
Masking masking = 4;
Job job = 5;
string steps = 6;
}
message RunResponse {
}
message FollowStepsRequest {
string id = 1;
}
message FollowStepsResponse {
StepResult result = 1;
}
message FollowLogsRequest {
string id = 1;
int32 offset = 2;
}
message FollowLogsResponse {
bytes data = 1;
}
message FinishRequest {
string id = 1;
}
message FinishResponse {
}
message Status {
string id = 1;
bool finished = 2;
int32 exit_code = 3;
google.protobuf.Timestamp start_time = 4;
google.protobuf.Timestamp end_time = 5;
}
message StatusRequest {
string id = 1;
}
message StatusResponse {
repeated Status jobs = 1;
}
```
Runner interacts with Step Runner over the above gRPC service which is
started on a local socket in the execution environment. Runner accesses
the local socket by first connecting to the target environment via
executor-specific protocols, then using a provided `proxy` command to
connect to the `gRPC` service, transparently tunneling `gRPC` requests
from the Runner to Step Runner (see [Proxy Command](#proxy-command)). This
is the same way that Nesting serves a gRPC service in a dedicated Mac
instance. The service has five RPCs, `Run`, `FollowSteps`, `FollowLogs`,
`Finish` and `Status`.
`Run` is the initial delivery of the steps. `FollowSteps` requests a
streaming response of step-result traces. `FollowLogs` similarly requests
a streaming response of output (`stdout`/`stderr`) written by processes
executed as part of running the steps, and logs produced by Step Runner
itself. `Finish` stops execution of the request (if still running) and
cleans up resources as soon as possible. `Status` lists the status of the
specified job, or if no job was specified, of all active jobs in the Step
Runner service (including completed but not `Finish`ed jobs). `Status` can
for example be used by a runner to recover after a crash.
The Step Runner gRPC service will be able to execute multiple `Run`
payloads at once. That is, each call to `Run` will start a new goroutine
and execute the steps until completion. Multiple calls to `Run` may be
made simultaneously.
As steps are executed, step-result traces and sub-process logs can be
streamed back to GitLab Runner. This allows Runner (or any caller) to
follow execution, at the step level for step-result traces
(`FollowSteps`), and as written for sub-process and Step Runner logs
(`FollowLogs`). Logs will be written in a [specific format](#log-format),
and sensitive tokens will be [masked](#masking) by Step Runner before
being streamed to Runner.
All APIs excluding `Status` are idempotent, meaning that multiple calls to
the same API with the same parameters should return the same result. For
example, if `Run` is called multiple times with the same `id`, only the
first invocation should begin processing of the job request, and
subsequent invocations return a success status but otherwise do nothing.
Similarly, multiple calls to `Finish` with the same `id` should finish and
remove the relevant job on the first call, and do nothing on subsequent
calls.
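A sketch of that dedupe-by-`id` contract on the service side (Step Runner is implemented in Go; this Python sketch only illustrates the idempotency semantics):
```python
# Sketch only: illustrates idempotent Run/Finish handling keyed by request id.
import threading

class StepRunnerJobs:
    def __init__(self):
        self._jobs = {}
        self._lock = threading.Lock()

    def run(self, request_id, start_job):
        """The first Run for an id starts the job; repeat calls succeed with no side effects."""
        with self._lock:
            if request_id in self._jobs:
                return
            self._jobs[request_id] = start_job(request_id)

    def finish(self, request_id):
        """The first Finish cleans up; later calls for the same id do nothing."""
        with self._lock:
            job = self._jobs.pop(request_id, None)
        if job is not None:
            job.cancel()   # stop execution (if still running) and release resources
```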
The service should not assume clients will be well-behaved, and should be
able to handle clients that never call or prematurely disconnect from
either of the `Follow` APIs, and also clients that never call `Finish` on
a corresponding `Run` request. To this end the `Step Runner` process
should periodically perform a scan to identify and prune stale or
runaway/stuck jobs. A stale job could be a job that has finished some
specified time ago (and has not been `Finish`ed). A runaway job is a job
that has been running some (long) specified amount of time, possibly
without producing output.
Finally, to facilitate integrating steps into the below runner executors,
it is recommended that steps provide a client library to coordinate
execution of the `Run`/`Follow*`/`Finish` APIs, and to handle reconnecting
to the step-runner service in the event that the `Follow*` calls lose
connectivity.
## RunRequest Parameters
Steps are delivered to Step Runner in the `RunRequest.Steps` field as a
JSON-serialized version of
[step.go](https://gitlab.com/gitlab-org/step-runner/-/blob/main/schema/v1/step.go),
with no processing of the step definition required by runner itself. The
`id` field uniquely identifies each request running on the `Step Runner`
service. The `RunRequest.Env` field holds environment variables that are to
be injected into the environment when each step is executed.
The optional `Job` parameter will include select parameters from the
corresponding CI job. `Job` will include the corresponding CI job's build
directory; `Job.BuildDir` should be copied to `RunRequest.WorkDir`, and
all steps in a request should be invoked in that directory to preserve
existing job script behavior. The `RunRequest` will also include the CI
job's environment variables (i.e. the `variables` defined at the job and
global levels in the CI configuration). When a `RunRequest` is made by
Runner, variables must be included in `Job.Variables`, and
`RunRequest.Env` should be left empty. When the run request is processed,
file-type variables will be written to file, variables will be expanded,
copied into `RunRequest.Env`, and the `Job` field will be discarded from
the remainder of the request. Variables should be expanded by the Step
Runner service since they may reference objects in the execution
environment (like other environment variables or paths). This includes
file-type variables, which should be written to the same path as they
would be in traditional runner job execution. Similarly, from
`Job.Variables`, phrases to be masked should be extracted and used to
populate `Masking.Phrases`, and `Job.TokenPrefixes` should be copied into
`Masking.TokenPrefixes`.
Clients other than Runner wishing to run steps can omit the `Job` field,
and in this case the `Masking` and `Env` fields should be populated
directly by the caller.
## Log Format
Log lines emitted by the `FollowLogs` API should have the format
```plaintext
<timestamp> <stream> <stdout/stderr> <append flag> <message>
```
This is the same log format introduced into runner in [this merge request](https://gitlab.com/gitlab-org/gitlab-runner/-/merge_requests/4591).
The logging library used to produce this format should be shared between
`GitLab Runner` and `Step Runner`.
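For example, a formatter for that line layout might look like the following sketch; the exact field encoding (timestamp format, stream identifier, append flag) used by the shared logging library is an assumption here:
```python
# Sketch only: the concrete encoding of each field is assumed, not taken from the library.
from datetime import datetime, timezone

def format_log_line(stream_id, fd, message, partial=False):
    """Render: <timestamp> <stream> <stdout/stderr> <append flag> <message>."""
    timestamp = datetime.now(timezone.utc).isoformat()
    append_flag = "+" if partial else "-"
    return f"{timestamp} {stream_id} {fd} {append_flag} {message}"

print(format_log_line(0, "stdout", "hello from a step"))
```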
## Masking
`Step Runner` will be responsible for masking sensitive variables or
tokens. This should be done before the raw log message is formatted into
the above log format. The libraries used to mask variables should be shared
between `GitLab Runner` and `Step Runner`. (See
[relevant](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/main/helpers/trace/internal/tokensanitizer/token_masker.go)
[modules](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/main/helpers/trace/internal/masker/masker.go)).
## Proxy Command
The `Step Runner` binary will include a command to proxy data from
(typically text-based) `stdin`/`stdout`/`stderr`-based protocols to the
gRPC service. This command will run in the same host as the gRPC service,
and will read input from `stdin`, forward it to the gRPC service over a
local socket, receive output from the gRPC service over same socket, and
forward it to the client via `stdout`/`stderr`. This command will enable
clients (like Runner) to transparently tunnel to the `gRPC` service via
`stdin`/`stderr`/`stdout`-based protocols like SSH or `docker exec`, which
will eliminate the need to expose the Step Runner service's gRPC port on
Docker images, or set up SSH port forwarding on VMs, and will allow runner
to interact with `Step Runner` using established protocols (i.e. SSH and
`docker exec`). `stdout` should be reserved for writing responses from the
`Step Runner` service, and `stderr` should be reserved for errors
originating in the `proxy` command itself.
## Executors
Here is how GitLab Runner will connect to Step Runner in each runner
executor:
### Instance
The Instance executor is accessed via SSH, the same as today. However
instead of starting a bash shell and piping in commands, it invokes the
[proxy command](#proxy-command), which in turn connects to the Step
Runner socket in a known location. Runner can then make `gRPC` calls
directly, and transparently tunnel through the `SSH` connection to the
`gRPC` service. This is the same as how Runner calls the Nesting server in
dedicated Mac instances to make VMs.
This requires that Step Runner is present and started in the job
execution environment.
### Docker
The same requirement that Step Runner is present and the gRPC service is
running is true for the Docker executor (and `docker-autoscaler`). However
in order to connect to the gRPC service inside the container, Runner will
`docker exec` to the container and execute the proxy command to connect to
the gRPC service in the container. The client can then write to the
`docker exec`'s `stdin`, which will transparently be proxied to the gRPC
service, and read from its `stdout/stderr`, which will contain responses
from the gRPC service.
### Kubernetes
The Kubelet on Kubernetes Nodes exposes an exec API which will start a
process in a container of a running Pod. We will use this to `exec create`
a bridge process that will allow the caller to make `gRPC` calls inside
the Pod, similar to the Docker executor.
In order to access this protected Kubelet API, we must use the
Kubernetes API, which provides an exec sub-resource on Pod. A caller
can POST to the URL of a pod suffixed with `/exec` and then negotiate
the connection up to a SPDY protocol for bidirectional byte
streaming. So GitLab Runner can use the Kubernetes API to connect to
the Step Runner service and deliver job payloads.
This is the same way that `kubectl exec` works. In fact most of the
internals such as SPDY negotiation are provided as `client-go`
libraries. So Runner can call the Kubernetes API directly by
importing the necessary libraries rather than shelling out to
Kubectl.
Historically one of the weaknesses of the Kubernetes Executor was
running a whole job through a single exec. To mitigate this Runner
uses the attach command instead, which can "re-attach" to an existing
shell process and pick up where it left off.
This is not necessary for Step Runner however, because the exec is
just establishing a bridge to the long-running gRPC process. If the
connection drops, Runner will just "re-attach" by exec'ing another
connection and continuing to make RPC calls like `follow`.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,18 +1,11 @@
---
owning-stage: "~devops::verify"
description: Steps Runner Deployment and Lifecycle Management for [Runner Integration](runner-integration.md).
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/service-deployment/'
remove_date: '2025-07-08'
---
# Steps Runner Deployment and Lifecycle Management
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/service-deployment/).
This Blueprint is concerned with:
- The deployment or injection of the Step Runner binary into target
environments. This includes build containers for Docker, Kubernetes and
Instance executors.
- Startup of the Step Runner gRPC service in said environments.
- Any required install-time configuration.
- Service restart in the event of a crash.
- Step Runner binary upgrade for environments where the Step Runner service is long lived.
- Management of any resources used by the Step Runner service
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
View File
@ -1,368 +1,11 @@
---
owning-stage: "~devops::verify"
description: The Step Definition for [GitLab Steps](index.md).
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/step-definition/'
remove_date: '2025-07-08'
---
# The Step definition
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/step-definition/).
A step is the minimum executable unit that a user can provide and is defined in a `step.yml` file.
The following step definition describes the minimal syntax supported.
The syntax is extended with [syntactic sugar](steps-syntactic-sugar.md).
A step definition consists of two documents. The purpose of the document split is
to distinguish between the declaration and implementation:
1. [Specification / Declaration](#step-specification):
Provides the specification which describes step inputs and outputs,
as well as any other metadata that might be needed by the step in the future (license, author, etc.).
In programming language terms, this is similar to a function declaration with arguments and return values.
1. [Implementation](#step-implementation):
The implementation part of the document describes how to execute the step, including how the environment
has to be configured, or how actions can be configured.
## Example step that prints a message to stdout
In the following step example:
1. The declaration specifies that the step accepts a single input named `message`.
The `message` is a required argument that needs to be provided when running the step
because it does not define `default:`.
1. The implementation section specifies that the step is of type `exec`. When run, the step
will execute an `echo` command with a single argument (the `message` value).
```yaml
# .gitlab/ci/steps/exec-echo.yaml
spec:
inputs:
message:
---
type: exec
exec:
command: [echo, "${{inputs.message}}"]
```
## Step specification
The step specification currently only defines inputs and outputs:
- Inputs:
- Can be required or optional.
- Have a name and can have a description.
- Can contain a list of accepted options. Options limit what value can be provided for the input.
- Can define matching regexp. The matching regexp limits what value can be provided for the input.
- Can be expanded with the usage of syntax `${{ inputs.input_name }}`.
  - All **input values** can be accessed when `type: exec` is used,
    by decoding the `$STEP_JSON` file, which provides information about the context of the execution.
- Outputs:
- Have a name and can have a description.
- Can be set by writing to a special [dotenv](https://github.com/bkeepers/dotenv) file named:
`$OUTPUT_FILE` with a format of `output_name=VALUE` per output.
For example:
```yaml
spec:
inputs:
message_with_default:
default: "Hello World"
message_that_is_required:
description: "This description explains that the input is required, because it does not specify a default:"
type_with_limited_options:
options: [bash, powershell, detect]
type_with_default_and_limited_options:
default: bash
options: [bash, powershell, detect]
description: "Since the options are provided, the default: needs to be one of the options"
version_with_matching_regexp:
match: ^v\d+\.\d+$
description: "The match pattern only allows values similar to `v1.2`"
outputs:
code_coverage:
description: "Measured code coverage that was calculated as part of the step"
---
type: steps
steps:
- step: ./bash-script.yaml
inputs:
script: "echo Code Coverage = 95.4% >> $OUTPUT_FILE"
```
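For illustration only, a small Go sketch (not part of the step specification) of how a runner-side component could read the dotenv-style `$OUTPUT_FILE` written by a step:
```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// readOutputs parses the dotenv-style file a step writes its outputs to,
// one `output_name=VALUE` pair per line.
func readOutputs(path string) (map[string]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	outputs := map[string]string{}
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue // skip blank lines
		}
		name, value, found := strings.Cut(line, "=")
		if !found {
			return nil, fmt.Errorf("malformed output line: %q", line)
		}
		outputs[name] = value
	}
	return outputs, scanner.Err()
}

func main() {
	outputs, err := readOutputs(os.Getenv("OUTPUT_FILE"))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	for name, value := range outputs {
		fmt.Printf("%s=%s\n", name, value)
	}
}
```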
## Step Implementation
The step definition can use the following types to implement the step:
- `type: exec`: Run a binary command, using STDOUT/STDERR for tracing the executed process.
- `type: steps`: Run a sequence of steps.
- `type: parallel` (Planned): Run all steps in parallel, waiting for all of them to finish.
- `type: grpc` (Planned): Run a binary command but use gRPC for intra-process communication.
- `type: container` (Planned): Run a nested Step Runner in a container image of choice,
transferring all execution flow.
### The `exec` step type
The ability to run binary commands is one of the primitive functions:
- The command to execute is defined by the `exec:` section.
- The result of the execution is the exit code of the executed command, unless the default behavior is overridden.
- The default working directory in which the command is executed is the directory in which the
step is located.
- By default, the command is not time-limited, but can be time-limited during job execution with `timeout:`.
For example, an `exec` step with no inputs:
```yaml
spec:
---
type: exec
exec:
command: [/bin/bash, ./my-script.sh]
timeout: 30m
workdir: /tmp
```
#### Example step that executes user-defined command
The following example is a minimal step definition that executes a user-provided command:
- The declaration section specifies that the step accepts a single input named `script`.
- The `script` input is a required argument that needs to be provided when running the step
because no `default:` is defined.
- The implementation section specifies that the step is of type `exec`. When run, the step
  will execute `bash`, passing the user command with the `-c` argument.
- The command to be executed will be prefixed with `set -veo pipefail` to print the execution
to the job log and exit on the first failure.
```yaml
# .gitlab/ci/steps/exec-script.yaml
spec:
inputs:
script:
description: 'Run user script.'
---
type: exec
exec:
command: [/usr/bin/env, bash, -c, "set -veo pipefail; ${{inputs.script}}"]
```
### The `steps` step type
The ability to run multiple steps in sequence is one of the primitive functions:
- A sequence of steps is defined by an array of step references: `steps: []`.
- The next step is run only if the previous step succeeded, unless the default behavior is overridden.
- The result of the execution is either:
- A failure at the first failed step.
- Success if all steps in sequence succeed.
#### Steps that use other steps
The `steps` type depends extensively on being able to use other steps.
Each item in a sequence can reference other external steps, for example:
```yaml
spec:
---
type: steps
steps:
- step: ./.gitlab/ci/steps/ruby/install.yml
inputs:
version: 3.1
env:
HTTP_TIMEOUT: 10s
- step: gitlab.com/gitlab-org/components/bash/script@v1.0
inputs:
script: echo Hello World
```
The `step:` value is a string that describes where the step definition is located:
- **Local**: The definition can be retrieved from a local source with `step: ./path/to/local/step.yml`.
A local reference is used when the path starts with `./` or `../`.
The resolved path to another local step is always **relative** to the location of the current step.
There is no limitation where the step is located in the repository.
- **Remote**: The definition can also be retrieved from a remote source with `step: gitlab.com/gitlab-org/components/bash/script@v1.0`.
Using a FQDN makes the Step Runner pull the repository or archive containing
the step, using the version provided after the `@`.
The `inputs:` section is a list of key-value pairs. The `inputs:` specify values
that are passed and matched against the [step specification](#step-specification).
The `env:` section is a list of key-value pairs. `env:` exposes the given environment
variables to all child steps, including [`type: exec`](#the-exec-step-type) or [`type: steps`](#the-steps-step-type).
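To make the reference-resolution rules above concrete, here is a hedged Go sketch; the `StepLocation` type and `resolveStep` function are invented for illustration and are not part of the Step Runner code:
```go
package main

import (
	"fmt"
	"path"
	"strings"
)

// StepLocation describes where a referenced step definition lives.
type StepLocation struct {
	Local   bool
	Path    string // resolved path for local steps, repository path for remote steps
	Version string // only set for remote steps (the part after "@")
}

// resolveStep interprets a `step:` reference. Following the rules above,
// `./` and `../` prefixes mean a local reference resolved relative to the
// directory of the current step; anything else is treated as a fully
// qualified remote reference with an optional `@version` suffix.
func resolveStep(currentStepDir, ref string) StepLocation {
	if strings.HasPrefix(ref, "./") || strings.HasPrefix(ref, "../") {
		return StepLocation{Local: true, Path: path.Join(currentStepDir, ref)}
	}
	repo, version, _ := strings.Cut(ref, "@")
	return StepLocation{Local: false, Path: repo, Version: version}
}

func main() {
	fmt.Printf("%+v\n", resolveStep(".gitlab/ci/steps", "./ruby/install.yml"))
	// {Local:true Path:.gitlab/ci/steps/ruby/install.yml Version:}
	fmt.Printf("%+v\n", resolveStep(".gitlab/ci/steps", "gitlab.com/gitlab-org/components/bash/script@v1.0"))
	// {Local:false Path:gitlab.com/gitlab-org/components/bash/script Version:v1.0}
}
```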
#### Remote Steps
To use remote steps with `step: gitlab.com/gitlab-org/components/bash/script@v1.0`
the step definitions must be stored in a structured way. The step definitions:
- Must be stored in the `steps/` folder.
- Can be nested in sub-directories.
- Can be referenced by the directory name alone if the step definition
is stored in a `step.yml` file.
For example, the file structure for a repository hosted in `git clone https://gitlab.com/gitlab-org/components.git`:
```plaintext
├── steps/
│   ├── secret_detection.yml
│   ├── sast/
│   │   └── step.yml
│   └── dast/
│       ├── java.yml
│       └── ruby.yml
```
This structure exposes the following steps:
- `step: gitlab.com/gitlab-org/components/secret_detection@v1.0`: From the definition stored at `steps/secret_detection.yml`.
- `step: gitlab.com/gitlab-org/components/sast@v1.0`: From the definition stored at `steps/sast/step.yml`.
- `step: gitlab.com/gitlab-org/components/dast/java@v1.0`: From the definition stored at `steps/dast/java.yml`.
- `step: gitlab.com/gitlab-org/components/dast/ruby@v1.0`: From the definition stored at `steps/dast/ruby.yml`.
#### Example step that runs other steps
The following example is a minimal step definition that
runs other steps that are local to the current step.
- The declaration specifies that the step accepts two inputs, each with
a default value.
- The implementation section specifies that the step is of type `steps`, meaning
the step will execute the listed steps in sequence. The usage of a top-level
`env:` makes the `HTTP_TIMEOUT` variable available in all executed steps.
```yaml
spec:
inputs:
ruby_version:
default: 3.1
http_timeout:
default: 10s
---
type: steps
env:
HTTP_TIMEOUT: ${{inputs.http_timeout}}
steps:
- step: ./.gitlab/ci/steps/exec-echo.yaml
inputs:
message: "Installing Ruby ${{inputs.ruby_version}}..."
- step: ./.gitlab/ci/ruby/install.yaml
inputs:
version: ${{inputs.ruby_version}}
```
## Context and interpolation
Every step definition is executed in a context object which
stores the following information that can be used by the step definition:
- `inputs`: The list of inputs, including user-provided or default.
- `outputs`: The list of expected outputs.
- `env`: The current environment variable values.
- `job`: The metadata about the current job being executed.
- `job.project`: Information about the project, for example ID, name, or full path.
- `job.variables`: All [CI/CD Variables](../../../ci/variables/predefined_variables.md) as provided by the CI/CD execution,
including project variables, predefined variables, etc.
- `job.pipeline`: Information about the currently executed pipeline, like the ID, name, or full path.
- `step`: Information about the currently executed step, like the location of the step, the version used, or the [specification](#step-specification).
- `steps` (only for `type: exec`): Information about each step in the sequence to be run, containing information about the
  result of the step execution, like status or trace log.
  - `steps.<name-of-the-step>.status`: The status of the step, like `success` or `failed`.
  - `steps.<name-of-the-step>.outputs.<output-name>`: Fetches the output provided by the step.
The context object is used to enable support for the interpolation in the form of `${{ <value> }}`.
Interpolation:
- Is forbidden in the [step specification](#step-specification) section.
  The specification is static configuration that should not be affected by the runtime environment.
- Can be used in the [step implementation](#step-implementation) section. The implementation
  describes the runtime set of instructions for how the step should be executed.
- Is applied to every value of the hash of each data structure.
- Is possible only for the *values* of each hash (for now). The interpolation of *keys* is forbidden.
- Is done when executing and passing control to a given step, instead of running
it once when the configuration is loaded. This enables chaining outputs to inputs, or making steps depend on the execution
of earlier steps.
For example:
```yaml
# .gitlab/ci/steps/exec-echo.yaml
spec:
inputs:
timeout:
default: 10s
bash_support_version:
---
type: steps
env:
HTTP_TIMEOUT: ${{inputs.timeout}}
PROJECT_ID: ${{job.project.id}}
steps:
- step: ./my/local/step/to/echo.yml
inputs:
message: "I'm currently building a project: ${{job.project.full_path}}"
- step: gitlab.com/gitlab-org/components/bash/script@v${{inputs.bash_support_version}}
```
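The following Go sketch (illustrative only, not the Step Runner implementation) shows the interpolation rules in miniature: `${{ ... }}` tokens are expanded in the values of a hash while keys are left untouched, and unknown references are left as-is:
```go
package main

import (
	"fmt"
	"regexp"
)

// tokenPattern matches `${{ path.to.value }}` tokens, capturing the dotted path.
var tokenPattern = regexp.MustCompile(`\$\{\{\s*([\w.-]+)\s*\}\}`)

// interpolate expands tokens in a single string using a flat lookup table of
// dotted paths (for example "inputs.timeout" or "job.project.id").
func interpolate(value string, context map[string]string) string {
	return tokenPattern.ReplaceAllStringFunc(value, func(token string) string {
		path := tokenPattern.FindStringSubmatch(token)[1]
		if resolved, ok := context[path]; ok {
			return resolved
		}
		return token // leave unknown references untouched
	})
}

// interpolateValues applies interpolation to every value of a hash; keys are
// deliberately left as-is, mirroring the rule above.
func interpolateValues(hash, context map[string]string) map[string]string {
	out := make(map[string]string, len(hash))
	for key, value := range hash {
		out[key] = interpolate(value, context)
	}
	return out
}

func main() {
	context := map[string]string{
		"inputs.timeout":        "10s",
		"job.project.full_path": "gitlab-org/gitlab",
	}
	env := map[string]string{
		"HTTP_TIMEOUT": "${{inputs.timeout}}",
		"MESSAGE":      "building ${{job.project.full_path}}",
	}
	fmt.Println(interpolateValues(env, context))
	// map[HTTP_TIMEOUT:10s MESSAGE:building gitlab-org/gitlab]
}
```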
## Reference data structures describing YAML document
```go
package main

import "time"

type StepEnvironment map[string]string

type StepSpecInput struct {
	Default     *string   `yaml:"default"`
	Description string    `yaml:"description"`
	Options     *[]string `yaml:"options"`
	Match       *string   `yaml:"match"`
}

type StepSpecOutput struct{}

type StepSpecInputs map[string]StepSpecInput

type StepSpecOutputs map[string]StepSpecOutput

type StepSpec struct {
	Inputs  StepSpecInputs  `yaml:"inputs"`
	Outputs StepSpecOutputs `yaml:"outputs"`
}

type StepSpecDoc struct {
	Spec StepSpec `yaml:"spec"`
}

type StepType string

const StepTypeExec StepType = "exec"

const StepTypeSteps StepType = "steps"

type StepDefinition struct {
	Def   StepSpecDoc             `yaml:"-"`
	Env   StepEnvironment         `yaml:"env"`
	Steps *StepDefinitionSequence `yaml:"steps"`
	Exec  *StepDefinitionExec     `yaml:"exec"`
}

type StepDefinitionExec struct {
	Command    []string       `yaml:"command"`
	WorkingDir *string        `yaml:"working_dir"`
	Timeout    *time.Duration `yaml:"timeout"`
}

type StepDefinitionSequence []StepReference

type StepReferenceInputs map[string]string

type StepReference struct {
	Step   string              `yaml:"step"`
	Inputs StepReferenceInputs `yaml:"inputs"`
	Env    StepEnvironment     `yaml:"env"`
}
```
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
Binary file not shown.
View File
@ -1,66 +1,11 @@
---
owning-stage: "~devops::verify"
description: The Syntactic Sugar extensions to the Step Definition
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/steps-syntactic-sugar/'
remove_date: '2025-07-08'
---
# The Syntactic Sugar extensions to the Step Definition
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_steps/steps-syntactic-sugar/).
[The Step Definition](step-definition.md) describes a minimal required syntax
to be supported. To aid common workflows the following syntactic sugar is used
to extend different parts of that document.
## Syntactic Sugar for Step Reference
Each of the syntactic sugar extensions is converted into a simple
[step reference](step-definition.md#steps-that-use-other-steps).
### Easily execute scripts in a target environment
`script:` is a shorthand syntax to aid the execution of simple scripts. It cannot be combined with `step:`
and is run by an externally stored step component provided by GitLab.
The GitLab-provided step component performs shell auto-detection unless overridden,
similar to how GitLab Runner does it today: based on the running system.
`inputs:` and `env:` can be used for additional control of some aspects of that step component.
For example:
```yaml
spec:
---
type: steps
steps:
- script: bundle exec rspec
- script: bundle exec rspec
inputs:
shell: sh # Force runner to use `sh` shell, instead of performing auto-detection
```
This syntax example translates into the following equivalent syntax for
execution by the Step Runner:
```yaml
spec:
---
type: steps
steps:
- step: gitlab.com/gitlab-org/components/steps/script@v1.0
inputs:
script: bundle exec rspec
- step: gitlab.com/gitlab-org/components/steps/script@v1.0
inputs:
script: bundle exec rspec
shell: sh # Force runner to use `sh` shell, instead of performing auto-detection
```
This syntax example is **invalid** (and ambiguous) because `script:` and `step:` cannot be used together:
```yaml
spec:
---
type: steps
steps:
- step: gitlab.com/my-component/ruby/install@v1.0
script: bundle exec rspec
```
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
View File
@ -1,286 +1,11 @@
---
status: ongoing
creation-date: "2024-04-23"
authors: [ "@mikolaj_wawrzyniak" ]
coach: "N/A"
approvers: [ "@jprovaznik", "@maddievn", "@mkaeppler" ]
owning-stage: "~devops::create"
participating-stages: ["~devops::data stores"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_xray_rag/'
remove_date: '2025-07-08'
---
# Repository X-Ray RAG
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_xray_rag/).
Group ~"group::global search" is leading effort to build [RAG at GitLab](../gitlab_rag/index.md). Because this is a global effort, it is in the spirit of efficiency and collaboration values for ~"group::code creation" to join that effort and integrate [Repository X-Ray](https://gitlab.com/gitlab-org/code-creation/repository-x-ray#repository-x-ray) data into GitLab RAG. Doing so should not only result in more efficient resource allocation, but also enable other AI features to integrate and reuse Repository X-Ray data, for example users could ask questions to GitLab Duo Chat that could be answered with X-Ray report data.
## Goal
Integrate the existing Repository X-Ray scan flow with the RAG platform.
## Proof of Concept
A proof of concept has been built in [merge request 144715](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/144715). This MR contains a large amount of information that can be helpful during implementation.
## Implementation
### Current state
Currently, Repository X-Ray does not use semantic search. The MVC approach
naively selects the first 50 entities from the X-Ray report and includes them in a code generation request.
For more details, see the following diagrams.
<details><summary> <b>Current state of Repository X-Ray diagrams</b> </summary>
The Repository X-Ray scan is processed as shown on the following diagram:
```mermaid
sequenceDiagram
actor USR as User
participant RN as GitLab Runner
participant GLR as GitLab Rails
participant PG as GitLab PostgreSQL DB
participant AIGW as AI Gateway
USR->>GLR: commits changes <br> to a package manager file <br>eg. Gemfile.lock
GLR->>+RN: triggers Repository X Ray CI scanner job
loop for each batch of packages
RN->>GLR: Request packages description by AI
GLR->>AIGW: Forward request for packages description
AIGW->>GLR: Packages description
GLR->>RN: Forwards packages description
end
RN->>-GLR: Repository X Ray report
GLR->>+GLR: triggers Repository X Ray ingestion background job
GLR->>-PG: upserts xray_reports record
```
The report is later used as shown in the following diagram:
```mermaid
sequenceDiagram
actor USR as User
participant IDE
participant PG as GitLab PostgreSQL DB
participant GLR as GitLab Rails
participant AIGW as AI Gateway
USR->>+IDE: types: "#35; generate a function that transposes a matrix"
IDE->>+GLR: trigger code generation for line ` "#35; generate function `
GLR->>PG: fetch X Ray report for project and language
PG->>GLR: xray_reports record
GLR->>GLR: include first 50 entities from xray report into code generation prompt
GLR->>-AIGW: trigger code generation ` "#35; generate function `
```
</details>
### Desired outcome
After this effort is completed, the Repository X-Ray scan is processed in the following manner:
```mermaid
sequenceDiagram
actor USR as User
participant RN as GitLab Runner
participant GLR as GitLab Rails
participant ES as Elasticsearch
participant PG as GitLab PostgreSQL DB
participant AIGW as AI Gateway
USR->>GLR: commits changes <br> to a package manager file <br>eg. Gemfile.lock
GLR->>+RN: triggers Repository X Ray CI scanner job
loop for each batch of packages
RN->>GLR: Request packages description by AI
GLR->>AIGW: Forward request for packages description
AIGW-->>GLR: Packages description
GLR-->>RN: Forwards packages description
end
RN->>-GLR: Repository X Ray report
GLR->>+GLR: triggers Repository X Ray ingestion background job
GLR->>PG: Fetch currently stored packages for given repository and language
PG-->>GLR: List of repository packages
GLR->>GLR: Group packages into lists#58; "skip", "add", "remove"
GLR->>PG: Insert new packages from "add" list for given repository
GLR->>PG: Delete packages from "remove" list for given repository
rect rgb(0, 223, 0, .1)
note right of RN: Embeddings flow
opt with Elasticsearch available on an instance
GLR->>ES: Check if packages from "add" list already exists in ES index
ES-->>GLR: List of existing packages from "add" list
GLR->>GLR: Remove already indexed packages from "add" list
loop for each remaining package in "add" list
GLR->>AIGW: Request embeddings for package description
AIGW-->>GLR: Embeddings for package description
end
GLR->>ES: Create documents for packages in "add" list in ES index
GLR->>PG: Check if packages form "remove" list exists in any repository
PG-->>GLR: List of packages from "remove" group still in use
GLR->>GLR: Filter packages that are still in use from "remove" list
GLR->>ES: Delete all remaining packages in "remove" list from ES index
end
end
```
Later on, the Repository X-Ray report is used as follows:
```mermaid
sequenceDiagram
actor USR as User
participant IDE
participant GLR as GitLabRails
participant ES as Elasticsearch
participant PG as PostgreSQL
participant AIGW as AI Gateway
USR->>+IDE: types: "#35; generate method that fetches <br>top charts from Spotify"
IDE->>+GLR: trigger code generation for "#35; generate method <br>that fetches top charts from Spotify"
alt with Elasticsearch available on an instance
rect rgb(0, 223, 0, .1)
note left of GLR: new embeddings flow
GLR->>+AIGW: fetch embedding for instruction "in utils.js generate method that ...
AIGW-->>-GLR: embeddings vector for instruction
GLR->>PG: fetch packages for given repository and programming language
PG-->>GLR: List of repository package names for given language
GLR->>ES: fetches packages documents using KNN for instruction embeddings vector <br> from filtered list using repository package names
ES-->>GLR: spotify/web-api-ts-sdk - A package that wraps ...
GLR->>AIGW: code generation request with prompt including <br>spotify/web-api-ts-sdk - A package that wraps...
end
else
rect rgb(128, 128, 128, .1)
note left of GLR: current flow as fallback
GLR->>PG: fetch X Ray report for repository and language
PG-->>GLR: xray_report_packages records
GLR->>AIGW: code generation request with prompt first 50 <br> entities from xray report
end
end
```
### Required changes
#### On X Ray write path
##### 1. Create a new AI Gateway endpoint that generates embeddings
Embeddings for GitLab documentation are requested directly from GitLab Rails. That approach limits embeddings availability to GitLab.com only, and this is also misaligned with the architecture blueprint that outlines AI Gateway as a provider of AI services for GitLab features (for more information, see the related [AI Gateway epic 13024](https://gitlab.com/groups/gitlab-org/-/epics/13024)). To avoid similar problems for Repository X-Ray, a new endpoint should be added to the AI Gateway API that accepts batches of strings and responds with embeddings vectors.
In the following iterations, it is expected that the AI Gateway endpoint (as the central point that has a complete overview of all connected instances' traffic) will enforce rate limiting and manage token consumption. However, from the very start, clients will be responsible for correctly handling rate-limiting and token-exhaustion errors.
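For illustration only, a Go sketch of such a client call; the endpoint path, payload shape, and gateway URL are assumptions, and the real client would live in GitLab Rails:
```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// The request/response shapes below are assumptions for illustration;
// the actual AI Gateway API contract is defined in the referenced epic.
type embeddingsRequest struct {
	Inputs []string `json:"inputs"`
}

type embeddingsResponse struct {
	Embeddings [][]float64 `json:"embeddings"`
}

func fetchEmbeddings(gatewayURL string, texts []string) ([][]float64, error) {
	payload, err := json.Marshal(embeddingsRequest{Inputs: texts})
	if err != nil {
		return nil, err
	}
	// "/v1/embeddings" is a hypothetical path used only for this sketch.
	resp, err := http.Post(gatewayURL+"/v1/embeddings", "application/json", bytes.NewReader(payload))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusTooManyRequests {
		// Clients are responsible for handling rate limiting from the start.
		return nil, fmt.Errorf("rate limited by AI Gateway")
	}
	var parsed embeddingsResponse
	if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
		return nil, err
	}
	return parsed.Embeddings, nil
}

func main() {
	// The gateway URL is a placeholder.
	embeddings, err := fetchEmbeddings("https://ai-gateway.example.com", []string{
		"kaminari - a Scope & Engine based paginator",
	})
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("received", len(embeddings), "embedding vectors")
}
```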
**Open question:**
[Embeddings](../gitlab_rag/postgresql.md) are generated with the `textembedding-gecko` model (768 dimensions). When adding a new API endpoint, we might be able to change the model if needed; if so, we should decide which one.
Because Repository X-Ray report data has a small volume and size (at this moment there are 379 reports on GitLab.com), the decision to switch models and rebuild embeddings data is low cost and can be deferred in order to unblock iteration.
##### 2. Store embeddings vectors for Repository X-Ray report
The current Elasticsearch framework uses ActiveRecord to keep the index up to date. It uses callbacks on create/update/delete to action the corresponding record in Elasticsearch.
Because the `xray_reports` table in the main PostgreSQL database stores whole reports as JSON blobs, in order to persist the generated embeddings vectors for each report item (each representing a library used in a given repository)
we would need to either:
- Modify the current Elastic framework as defined in [issue 442197](https://gitlab.com/gitlab-org/gitlab/-/issues/442197).
- Or, if urgency is higher, migrate the `xray_reports` table to a new structure, where each record represents a single entity (package/library) in the Repository X-Ray report, which would be compatible with the current Elasticsearch upload pipeline.
The index in Elasticsearch that stores Repository X-Ray packages has the following format:
```json
index: xray-packages
document: {
id:
name:
language:
description:
embedding:
}
```
This index would store:
- Description.
- Name.
- The embeddings generated for the concatenated package name and description.
- Programming language.
This index will be shared between all repositories in a given GitLab instance. However, upon search the list of relevant packages will be filtered using prefetched lists of names of packages that belong to the given repository in a given programming language. That approach should:
- Reduce required storage capacity.
- Improve search performance.
- Reduce AI consumption used to generate embeddings.
###### Open question
We need to check if descriptions vary significantly depending on version, and decide whether to store records per package and its version, or just one per package.
##### 3. Modify storage on GitLab Rails layer
Right now Repository X-Ray packages are stored in the `xray_reports` table in the main PostgreSQL DB.
```sql
CREATE TABLE xray_reports (
    id bigint NOT NULL,
    project_id bigint NOT NULL,
    created_at timestamp with time zone NOT NULL,
    updated_at timestamp with time zone NOT NULL,
    lang text NOT NULL,
    payload jsonb NOT NULL,
    file_checksum bytea NOT NULL
);
```
In order to support sharing embeddings between repositories whilst providing a way to remove stale data from Elasticsearch, we must create a new table, `xray_report_packages`.
```sql
CREATE TABLE xray_report_packages (
    id bigint NOT NULL,
    project_id bigint NOT NULL,
    created_at timestamp with time zone NOT NULL,
    updated_at timestamp with time zone NOT NULL,
    lang text NOT NULL,
    name text NOT NULL,
    version text NOT NULL,
    description text -- nullable field: with Elasticsearch available this field will not be in use
);
```
After the new table is created, all reports from `xray_reports` should be migrated there, and `xray_reports` should be removed.
##### 4. Modify Repository X-Ray import pipeline
The Repository X-Ray report, after being generated during the CI job, is imported using a background job.
That job uses [`Ai::StoreRepositoryXrayService`](https://gitlab.com/gitlab-org/gitlab/blob/c6b2f18eaf0b78a4e0012e88f28d643eb0dfb1c2/ee/app/services/ai/store_repository_xray_service.rb#L4)
to parse and save the report file into the `xray_reports` table at the main PostgreSQL DB.
In order to support semantic search for Repository X-Ray, we must apply the following modifications (a sketch of the grouping step follows the list):
1. Load the new Repository X-Ray report from the scanner into the list of `current_report_packages`.
1. Load the list of packages reported in the previous report for the given project, stored in `xray_report_packages`, into the list of `previous_report_packages`
(For example, `SELECT * FROM xray_report_packages WHERE lang = 'ruby' AND project_id = 123`).
1. Filter packages from the new X-Ray report into three groups:
1. `skip` - a collection of unmodified packages that do not require any action.
1. `add` - a collection of new packages that should be added to `xray_report_packages`.
1. `remove` - a collection of packages that were present in the old report but missing in the new one (`previous_report_packages` - `current_report_packages`).
1. Insert the new records into the PostgreSQL `xray_report_packages` for the `add` list of packages.
1. From the `add` list, filter out packages that already exist in Elasticsearch. Those are packages that are being used by other repositories, and in this case embeddings and
descriptions will be shared.
1. Update and insert (upsert) into Elasticsearch all packages that remain in the `add` list.
1. Delete the `remove` list of packages from PostgreSQL `xray_report_packages` for a given project.
1. Create an `orphaned` list of packages from the `remove` list by selecting ones of the same `name` and `lang` that are not being used by any other project
(For example, `SELECT 1 FROM xray_report_packages WHERE name = 'kaminari' AND lang = 'ruby' LIMIT 1`).
1. Remove the `orphaned` packages from Elasticsearch.
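A sketch of the grouping step referenced above (illustrative Go only; the actual implementation would live in the Ruby `Ai::StoreRepositoryXrayService`):
```go
package main

import "fmt"

// groupPackages compares the packages from the previous report with the ones
// from the newly generated report and splits them into the three lists used
// by the import pipeline: skip, add, and remove.
func groupPackages(previous, current []string) (skip, add, remove []string) {
	prevSet := make(map[string]bool, len(previous))
	for _, pkg := range previous {
		prevSet[pkg] = true
	}
	currSet := make(map[string]bool, len(current))
	for _, pkg := range current {
		currSet[pkg] = true
	}

	for _, pkg := range current {
		if prevSet[pkg] {
			skip = append(skip, pkg) // unchanged, no action needed
		} else {
			add = append(add, pkg) // new in this report
		}
	}
	for _, pkg := range previous {
		if !currSet[pkg] {
			remove = append(remove, pkg) // present before, missing now
		}
	}
	return skip, add, remove
}

func main() {
	previous := []string{"rails", "kaminari", "sidekiq"}
	current := []string{"rails", "sidekiq", "pg"}
	skip, add, remove := groupPackages(previous, current)
	fmt.Println("skip:", skip)     // [rails sidekiq]
	fmt.Println("add:", add)       // [pg]
	fmt.Println("remove:", remove) // [kaminari]
}
```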
#### On X-Ray read path
1. Retrieve the code generation instruction either from:
   1. IDE:
      1. The IDE / Language Server, upon detecting a generation request, should send a code generation instruction (for example, the content of the comment that triggered code generation).
      1. The GitLab Rails Code Suggestions API needs to add an optional string parameter `instruction`.
   1. GitLab Rails: if, due to different priorities, the IDE is not able to deliver the instruction in time to unblock this effort, for the sake of the initial iteration the code generation instruction can also be retrieved by GitLab Rails.
1. GitLab Rails needs to detect if Elasticsearch is available and:
   1. When Elasticsearch is available:
      1. Fetch the list of `names` of packages stored in the `xray_report_packages` PostgreSQL table for the given project and language.
      1. Use the k-nearest neighbor (kNN) search of Elasticsearch, filtered with the `names` list of the project's packages, to retrieve the most relevant context.
   1. When Elasticsearch is not available:
      1. Select the first 50 records from `xray_report_packages` in the PostgreSQL database, aligned with the [current state](#current-state) diagram.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
View File
@ -1,52 +1,11 @@
---
status: proposed
creation-date: "2023-06-21"
authors: [ "@fabiopitino" ]
coach: [ ]
approvers: [ ]
owning-stage: ""
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/bounded_contexts/'
remove_date: '2025-07-08'
---
# Defining bounded contexts
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/bounded_contexts/).
## Historical context
Until May 2024 the GitLab codebase didn't have a clear domain structure.
We have [forced the creation of some modules](https://gitlab.com/gitlab-org/gitlab/-/issues/212156)
as a first step but we didn't have a well defined strategy for doing it consistently.
The majority of the code was not properly namespaced and organized:
- Ruby namespaces used didn't always represent the SSoT. We had overlapping concepts spread across multiple
namespaces. For example: `Abuse::` and `Spam::` or `Security::Orchestration::` and `Security::SecurityOrchestration`.
- Domain code related to the same bounded context was scattered across multiple directories.
- Domain code was present in `lib/` directory under namespaces that differed from the same domain under `app/`.
- Some namespaces were very shallow, containing a few classes while other namespaces were very deep and large.
- A lot of the old code was not namespaced, making it difficult to understand the context where it was used.
In May 2024 we [defined and enforced bounded contexts](decisions/002_bounded_contexts_definition.md).
## Goal
1. Define a list of characteristics that bounded contexts should have. For example: must relate to at least 1 product category.
1. Have a list of top-level bounded contexts into which all domain code is broken down.
1. Engineers can clearly see the list of available bounded contexts and can make an easy decision where to add
new classes and modules.
1. Define a process for adding a new bounded context to the application. This should occur quite infrequently
and new bounded contexts need to adhere to the characteristics defined previously.
1. Enforce the list of bounded contexts so that no new top-level namespaces can be used aside from the authorized ones.
## Iterations
1. [Extract libraries out of the `lib/` directory](https://gitlab.com/gitlab-org/gitlab/-/blob/4c6e120069abe751d3128c05ade45ea749a033df/doc/development/gems.md).
- This step is non-blocking for modularization, but the less generic code exists in `lib/`, the
  easier it will be to identify and separate bounded contexts.
- Isolate code that could live in a separate project, to prevent it from depending on domain code.
1. [ADR-001: Modularize application domain](decisions/001_modular_application_domain.md): Start with modularizing the application domain.
1. [ADR-002: Define bounded context around feature categories](decisions/002_bounded_contexts_definition.md) as a SSoT in the code.
1. [ADR-003: Assign stewards to all modules and libraries](decisions/003_stewardship.md).
1. [Publish the list of bounded contexts](../../../development/software_design.md#use-namespaces-to-define-bounded-contexts).
- Define a SSoT list of bounded contexts.
- Enforce it using the RuboCop static analyzer.
- Autoload non-standard Rails directories based on the given list.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
View File
@ -1,53 +1,11 @@
---
creation-date: "2024-05-07"
authors: [ "@fabiopitino" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/decisions/001_modular_application_domain/'
remove_date: '2025-07-08'
---
# Modular Monolith ADR 001: Modularize the application domain
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/decisions/001_modular_application_domain/).
## Context
Before we could modularize the codebase, we first needed to define how we were going to divide it.
## Decision
We start by focusing on the application domain (backend business logic) leaving the
application adapters (Web controllers and views, REST/GraphQL endpoints) outside the
scope of the modularization initially.
The reasons for this are:
1. Code in application adapters may not always align with a specific
domain. For example: a project settings endpoint or a merge request page contain
references to many domains.
1. There was a need to run separate Rails nodes for the SaaS architecture using different
profiles in order to save on memory.
For example: on SaaS we wanted to be able to spin up more Sidekiq nodes without the need
to load the whole Rails application. The assumption is that for running Sidekiq we don't
need ActionCable, REST endpoints, GraphQL mutations or Rails views.
We only need the application domain and infrastructure code.
This could still be true even with the introduction of [Cells](../../cells/index.md) but
we need to re-evaluate this assumption.
1. Keep the scope and effort smaller. Tackling only the domain code is simpler than dealing with the complexity of
how to break down the application adapters and all their edge cases.
The decision to scope out application adapters is not final and we decided to defer
it to later.
Finally, the infrastructure code containing technical concerns (typically the `lib/`) will
be part of a common "platform" module that every domain module will depend on in order to function.
The "platform" module can be broken down into independent libraries extracted as gems.
## Consequences
By focusing primarily on modularizing the business logic, we simplify the rules and guidelines for
engineers. We can apply the same set of patterns across modules.
## Alternatives
We looked into including application adapters in the modularization effort but noticed that:
1. Modularizing adapters is more delicate as we need to preserve user-facing dependencies like
routes.
1. The size of the adapters code is much smaller than the whole application domain.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
View File
@ -1,61 +1,11 @@
---
creation-date: "2024-05-07"
authors: [ "@fabiopitino" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/decisions/002_bounded_contexts_definition/'
remove_date: '2025-07-08'
---
# Modular Monolith ADR 002: Define bounded contexts
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/decisions/002_bounded_contexts_definition/).
## Context
With the focus primarily on the application domain we needed to define how to
modularize it.
## Decision
The application domain is divided into bounded contexts which define the top-level
modules of GitLab application. The term bounded context is widely used in
Domain-Driven Design.
Defining bounded contexts means to organize the code around product structure rather than
organizational structure.
From the research in [Proposal: split GitLab monolith into components](https://gitlab.com/gitlab-org/gitlab/-/issues/365293)
it seems that following [product categories](https://handbook.gitlab.com/handbook/product/categories/#hierarchy), as a guideline,
would be much better than translating organization structure into folder structure (for example, `app/modules/verify/pipeline-execution/...`).
However, this guideline alone is not sufficient and we need a more specific strategy:
- Bounded contexts (top level modules) should be [sufficiently deep](../../../../development/software_design.md#use-namespaces-to-define-bounded-contexts)
to encapsulate implementation details and provide a smaller interface.
- Some product categories, such as Browser Performance Testing, are just too small to represent
  a bounded context on their own.
  We should have a strategy for grouping product categories together when it makes sense.
- Product categories don't necessarily translate into clean boundaries.
`Category:Pipeline Composition` and `Category:Continuous Integration` are some examples
where Pipeline Authoring team and Pipeline Execution team share a lot of code.
- Some parts of the code might not have a clear product category associated with them.
Despite the above, product categories provide a rough view of the bounded contexts at play in the application.
For that reason, we use product categories to sketch the initial set of bounded contexts.
Then, we group related or strongly coupled categories under the same bounded context and create new bounded contexts where they are missing.
## Consequences
In May 2024 we concluded the [Bounded Contexts working group](https://handbook.gitlab.com/handbook/company/working-groups/bounded-contexts/),
which completed the first phase of modularization, described on this page.
We defined a list of [bounded contexts in code](../../../../development/software_design.md#use-namespaces-to-define-bounded-contexts)
and started enforcing them with RuboCop, in order to move towards a fully namespaced monolith.
Team members can edit this list by creating and deleting bounded contexts explicitly and the decision is reviewed
by Staff+ engineers.
## Alternatives
We evaluated whether to align the code to the organizational structure but we decided it wasn't viable:
- Product categories can change ownership and we have seen some pretty frequent changes, even back and forth.
Moving code every time a product category changes ownership adds too much maintenance overhead.
- Team and organization changes should just mean relabelling the ownership of specific modules.
- Coupling and complexity are directly correlated to business logic and product structure.
A code organization that aligns to organizational structure could generate unnecessary complexity and
much more coupling.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
View File
@ -1,60 +1,11 @@
---
creation-date: "2024-05-08"
authors: [ "@fabiopitino" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/decisions/003_stewardship/'
remove_date: '2025-07-08'
---
# Modular Monolith ADR 003: Module stewardship
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/decisions/003_stewardship/).
## Context
How do we assign stewardship to domain and platform modules? We have a large amount of shared code
that does not have explicit stewards who can provide a vision and direction for that part of the code.
## Decision
We use the term **stewards** instead of **owners** to be more in line with the GitLab principle of
**everyone can contribute**. Stewards are caretakers of the code. They know how a specific
functionality is designed and why. They know the architectural characteristics and constraints.
However, they welcome changes and guide contributors towards success.
A module, whether it is a domain bounded context or a platform module, must have at least 1 group of stewards.
This group can be a team name (or GitLab group handle). Optionally, the list of stewards can include
single IC entries.
When we use a Packwerk package to extract a module, we will be able to indicate stewardship directly
in the `package.yml`:
```yaml
metadata:
stewards:
- group::pipeline execution # team name
- group::pipeline authoring # team name
- @grzesiek # IC
- @ayufan # IC
```
For platform modules (e.g. `Gitlab::Redis`) we might not have a whole team dedicated as stewards since
all platform code is classified as "shared". However, team members can add themselves as experts of a
particular functionality.
## Consequences
Stewardship defined in code can be very powerful:
- Sections of CODEOWNERS could be automatically generated from packages' metadata.
- Review Roulette or Suggested Reviews features can use this list as first preference.
- Engineers can easily identify stewards and have design conversations early.
- Gems living in the monolith (`gems/`), which should be wrapped into a Packwerk package,
  can benefit from having explicit stewards.
## Alternatives
In the initial phase of modularization, before adopting Packwerk, we don't have an explicit concept
of ownership. We are initially relying on each team to know what bounded contexts they are responsible
for. For the "shared code" in the platform modules we initially expect maintainers to fill the role of
stewards.
- Pros: we give trainee maintainers a clear development path and goals. Today it feels unclear what they must
  learn in order to become successful maintainers.
- Cons: The amount of "shared" code is very large and it is still hard to understand who knows best about
  a particular functionality. Even extracting code into gems doesn't solve the lack of explicit ownership.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
Binary file not shown.
View File
@ -1,218 +1,11 @@
---
status: proposed
creation-date: "2023-05-22"
authors: [ "@fabiopitino" ]
coach: [ ]
approvers: [ ]
owning-stage: ""
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/hexagonal_monolith/'
remove_date: '2025-07-08'
---
# Hexagonal Rails Monolith
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/hexagonal_monolith/).
## Background
This design document supersedes the previous [Composable GitLab Codebase](../../composable_codebase_using_rails_engines/index.md)
where we explored the idea of separating the codebase into technical runtime profiles:
for example, run the monolith solely as a Sidekiq node.
With a modular monolith and the use of a Hexagonal Architecture, we can achieve both
separation of domains and separation of application adapters, which may include the usage of engines and/or different runtime profiles.
## Summary
**TL;DR:** Change the Rails monolith from a [big ball of mud](https://en.wikipedia.org/wiki/Big_ball_of_mud) state to
a [modular monolith](https://www.thereformedprogrammer.net/my-experience-of-using-modular-monolith-and-ddd-architectures/)
that uses an [Hexagonal architecture](https://en.wikipedia.org/wiki/Hexagonal_architecture_(software)) (or ports and adapters architecture).
Extract cohesive functional domains into a separate directory structure using Domain-Driven Design practices.
Extract infrastructure code (logging, database tools, instrumentation, etc.) into gems, essentially removing the need for the `lib/` directory.
Define what parts of the functional domains (for example application services) are of public use for integration (the ports)
and what parts are instead private encapsulated details.
Define Web, Sidekiq, REST, GraphQL, and Action Cable as the adapters in the external layer of the architecture.
Use [Packwerk](https://github.com/Shopify/packwerk) to enforce privacy and dependencies between modules of the monolith.
![Hexagonal Architecture for GitLab monolith](hexagonal_architecture.png)
## Details
```mermaid
flowchart TD
u([User]) -- interacts directly with --> AA[Application Adapter: WebUI, REST, GraphQL, git, ...]
AA --uses abstractions from--> D[Application Domain]
AA -- depends on --> Platform
D -- depends on --> Platform[Platform: gems, configs, framework, ...]
```
### Application domain
The application core (functional domains) is composed of all the code that describes the business logic, policies, and data
that is unique to the GitLab product.
It is divided into separate top-level [bounded contexts](../bounded_contexts.md).
A bounded-context is represented in the form of a Ruby module.
This follows the existing [guideline on naming namespaces](../../../../development/software_design.md#use-namespaces-to-define-bounded-contexts)
but puts more structure to it.
Modules should:
- Be deep enough to encapsulate a lot of the internal logic, state and data.
- Have a public interface that is as small as possible, safe to use by other bounded contexts and well documented.
- Be cohesive and represent the SSoT (single source of truth) of the feature it describes.
Feature categories represent a product area that is large enough for the module to be deep, so we don't have a proliferation
of small top-level modules. It also helps the codebase to follow the
[ubiquitous language](../../../../development/software_design.md#use-ubiquitous-language-instead-of-crud-terminology).
A team can be responsible for multiple feature categories, hence owning the vision for multiple bounded contexts.
While feature categories can sometimes change ownership, this change of mapping the bounded context to new owners
is very cheap.
Using feature categories also helps new contributors, either GitLab team members or members of the wider community,
to navigate the codebase.
If multiple feature categories are strongly related, they may be grouped under a single bounded context.
If a feature category is only relevant in the context of a parent feature category, it may be included in the
parent's bounded context. For example: build artifacts exist in the context of the Continuous Integration feature category,
and they may be merged under a single bounded context.
The application domain has no knowledge of outer layers like the application adapters and only depends on the
platform code. This makes the domain code the SSoT of the business logic, reusable and testable regardless
of whether the request came from the Web UI or the REST API.
If a dependency between an outer layer and an inner layer is required (domain code depending on the interface of an adapter), this can be solved using inversion of control techniques, especially dependency injection.
### Application adapters
>>>
_Adapters are the glue between components and the outside world._
_They tailor the exchanges between the external world and the ports that represent the requirements of the inside_
_of the application component. There can be several adapters for one port, for example, data can be provided by_
_a user through a GUI or a command-line interface, by an automated data source, or by test scripts._ -
[Wikipedia](https://en.wikipedia.org/wiki/Hexagonal_architecture_(software)#Principle)
>>>
Application adapters would be:
- Web UI (Rails controllers, view, JS and Vue client)
- REST API endpoints
- GraphQL Endpoints
They are responsible for the interaction with the user. Each adapter should interpret the request, parse parameters
and invoke the right abstraction from the application domain, then present the result back to the user.
Presentation logic, and possibly authentication, would be specific to the adapters layer.
The application adapters layer depends on the platform code to run: the Rails framework, the gems that power the adapter,
the configurations and utilities.
### Platform code
For platform code we consider any classes and modules that are required by the application domain and/or application
adapters to work.
The Rails `lib/` directory today contains multiple categories of code that could live somewhere else,
most of which is platform code:
- REST API endpoints could be part of the [application adapters](#application-adapters).
- Domain code (both large domain code such as `Gitlab::Ci` and small such as `Gitlab::JiraImport`) should be
  moved inside the [application domain](#application-domain).
- The rest could be extracted as separate single-purpose gems under the `gems/` directory inside the monolith.
This can include utilities such as logging, error reporting and metrics, rate limiters,
infrastructure code like `Gitlab::ApplicationRateLimiter`, `Gitlab::Redis`, `Gitlab::Database`
and generic subdomains like `Banzai`.
Base classes that extend the Rails framework, such as `ApplicationRecord` or `ApplicationWorker`, as well as GitLab base classes
such as `BaseService`, could be implemented as gem extensions.
This means that aside from the Rails framework code, the rest of the platform code resides in `gems/`.
Eventually all code inside `gems/` could potentially be extracted in a separate repository or open sourced.
Placing platform code inside `gems/` makes it clear that its purpose is to serve the application code.
### Enforcing boundaries
Ruby does not have the concept of privacy of constants in a given module. Unlike other programming languages, even extracting
well documented gems doesn't prevent other developers from coupling code to implementation details because all constants
are public in Ruby.
We can have a codebase perfectly organized in a hexagonal architecture but still have the application domain, the biggest
part of the codebase, remain a non-modularized [big ball of mud](https://en.wikipedia.org/wiki/Big_ball_of_mud).
Enforcing boundaries is also vital to maintaining the structure long term. We don't want to slowly fall back into a big
ball of mud again after a big modularization effort by violating the boundaries.
We explored the idea of [using Packwerk in a proof of concept](../proof_of_concepts.md#use-packwerk-to-enforce-module-boundaries)
to enforce module boundaries.
[Packwerk](https://github.com/Shopify/packwerk) is a static analyzer that allows packages to be introduced gradually into the
codebase and enforces privacy and explicit dependencies. Packwerk can detect if some Ruby code is using private implementation
details of another package or if it's using a package that wasn't declared explicitly as a dependency.
Being a static analyzer it does not affect code execution, meaning that introducing Packwerk is safe and can be done
gradually.
Companies like Gusto have been developing and maintaining a list of [development and engineering tools](https://github.com/rubyatscale)
for organizations that want to move to using a Rails modular monolith around Packwerk.
### EE and JH extensions
One of the unique challenges of modularizing the GitLab codebase is the presence of EE extensions (managed by GitLab)
and JH extensions (managed by JiHu).
By moving related domain code (e.g. `Ci::`) under the same bounded context and Packwerk package, we would also need to
move the `ee/` extensions into it.
For top-level bounded contexts to also match Packwerk packages, all code related to a specific domain
needs to be placed under the same package directory, including EE extensions, for example.
The following is just an example of a possible directory structure:
```shell
domains
├── ci
│ ├── package.yml # package definition.
│ ├── packwerk.yml # tool configurations for this package.
│ ├── package_todo.yml # existing violations.
│ ├── core # Core features available in Community Edition and always autoloaded.
│ │ ├── app
│ │ │ ├── models/...
│ │ │ ├── services/...
│ │ │ └── lib/... # domain-specific `lib` moved inside `app` together with other classes.
│ │ └── spec
│ │ └── models/...
│ ├── ee # EE extensions specific to the bounded context, conditionally autoloaded.
│ │ ├── models/...
│ │ └── spec
│ │ └── models/...
│ └── public # Public constants are placed here so they can be referenced by other packages.
│ ├── core
│ │ ├── app
│ │ │ └── models/...
│ │ └── spec
│ │ └── models/...
│ └── ee
│ ├── app
│ │ └── models/...
│ └── spec
│ └── models/...
├── merge_requests/
├── repositories/
└── ...
```
## Challenges
- Such changes require a shift in the development mindset to understand the benefits of the modular
  architecture and not fall back into legacy practices.
- Changing the application architecture is a challenging task. It takes time, resources and commitment
but most importantly it requires buy-in from engineers.
- This may require us to have a medium- to long-term team of engineers or a Working Group that makes progress
  on the architecture evolution plan, fosters discussions in various engineering channels, and resolves adoption challenges.
- We need to ensure we build standards and guidelines and not silos.
- We need to ensure we have clear guidelines on where new code should be placed. We must not recreate junk drawer folders like `lib/`.
## Opportunities
The move to a modular monolith architecture enables a lot of opportunities that we could explore in the future:
- We could align the concept of domain expert with explicitly owning specific modules of the monolith.
- The use of static analysis tools (such as Packwerk and RuboCop) can catch design violations in development and CI, ensuring
  that best practices are honored.
- By defining dependencies between modules explicitly we could speed up CI by testing only the parts that are affected by
the changes.
- Such a modular architecture could help to further decompose modules into separate services if needed.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,136 +1,11 @@
---
status: proposed
creation-date: "2023-05-22"
authors: [ "@grzesiek", "@fabiopitino" ]
coach: [ ]
approvers: [ ]
owning-stage: ""
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/).
# GitLab Modular Monolith
## Summary
The main [GitLab Rails](https://gitlab.com/gitlab-org/gitlab)
project has been implemented as a large monolithic application, using
[Ruby on Rails](https://rubyonrails.org/) framework. It has over 2.2 million
lines of Ruby code and hundreds of engineers contributing to it every day.
The application has been growing in complexity for more than a decade. The
monolithic architecture has served us well during this time, making it possible
to keep high development velocity and great engineering productivity.
Even though we strive for having [an approachable open-core architecture](https://about.gitlab.com/blog/2022/07/14/open-core-is-worse-than-plugins/)
we need to strengthen the boundaries between domains to retain velocity and
increase development predictability.
As we grow as an engineering organization, we want to explore a slightly
different, but related, architectural paradigm:
[a modular monolith design](https://en.wikipedia.org/wiki/Modular_programming),
while still using a [monolithic architecture](https://en.wikipedia.org/wiki/Monolithic_application)
with satellite services.
This should allow us to increase engineering efficiency, reduce the cognitive
load, and eventually decouple internal components to the extent that will allow
us to deploy and run them separately if needed.
## Motivation
Working with a large and tightly coupled monolithic application is challenging:
Engineering:
- Onboarding engineers takes time. It takes a while before engineers feel
productive due to the size of the context and the amount of coupling.
- We need to use `CODEOWNERS` file feature for several domains but
[these rules are complex](https://gitlab.com/gitlab-org/gitlab/-/blob/409228f064a950af8ff2cecdd138fc9da41c8e63/.gitlab/CODEOWNERS#L1396-1457).
- It is difficult for engineers to build a mental map of the application due to its size.
Even apparently isolated changes can have [far-reaching repercussions](https://handbook.gitlab.com/handbook/engineering/core-development/#reducing-the-impact-of-far-reaching-work)
on other parts of the monolith.
- Attrition/retention of engineering talent. It is fatiguing and demoralizing for
engineers to constantly deal with the obstacles to productivity.
Architecture:
- There is little structure inside the monolith. We have attempted to enforce
the creation [of some modules](https://gitlab.com/gitlab-org/gitlab/-/issues/212156)
but have no company-wide strategy on what the functional parts of the
monolith should be, and how code should be organized.
- There is no isolation between existing modules. Ruby does not provide
out-of-the-box tools to effectively enforce boundaries. Everything lives
under the same memory space.
- We rarely build abstractions that can boost our efficiency.
- Moving stable parts of the application into separate services is impossible
due to high coupling.
- We are unable to deploy changes to specific domains separately and isolate
failures that are happening inside them.
Productivity:
- High median-time-to-production for complex changes.
- It can be overwhelming for the wider-community members to contribute.
- Reducing testing times requires diligent and persistent efforts.
## Goals
- Increase the development velocity and predictability through separation of concerns.
- Improve code quality by reducing coupling and introducing useful abstractions.
- Build abstractions required to deploy and run GitLab components separately.
## How do we get there?
While we do recognize that modularization is a significant technical endeavor,
we believe that the main challenge is organizational, rather than technical. We
not only need to design separation so that modules are decoupled in a
pragmatic way that works well both on GitLab.com and on self-managed
instances, but we also need to align modularization with the way in which we want to
work at GitLab.
There are many aspects and details required to make modularization of our
monolith successful. We will work on the aspects listed below, refine them, and
add more important details as we move forward towards the goal:
1. [Deliver modularization proof-of-concepts that will deliver key insights](proof_of_concepts.md).
1. Align modularization plans to the product structure by [defining bounded contexts](bounded_contexts.md).
1. [Separate domains into modules](packages_extraction.md) that will reflect product structure.
1. Start a training program for team members on how to work with decoupled domains (TODO)
1. Build tools that will make it easier to build decoupled domains through inversion of control (TODO)
1. [Introduce hexagonal architecture within the monolith](hexagonal_monolith/index.md)
1. Introduce clean architecture with one-way-dependencies and host application (TODO)
1. Build abstractions that will make it possible to run and deploy domains separately (TODO)
## Status
In progress.
- A working group [Bounded Contexts](https://handbook.gitlab.com/handbook/company/working-groups/bounded-contexts/)
was concluded in April 2024 which defined a list of bounded contexts to be enforced for GitLab Rails domain and
infrastructure layer.
## Decisions
1. [ADR-001: Modularize application domain](decisions/001_modular_application_domain.md). Start with modularizing
the application domain and infrastructure code.
1. [ADR-002: Define bounded context around feature categories](decisions/002_bounded_contexts_definition.md) as a SSoT in the code.
1. [ADR-003: Assign stewards to all modules and libraries](decisions/003_stewardship.md).
## Glossary
- `modules` are Ruby modules and can be used to nest code hierarchically.
- `namespaces` are unique hierarchies of Ruby constants. For example, `Ci::` but also `Ci::JobArtifacts::` or `Ci::Pipeline::Chain::`.
- `packages` are Packwerk packages to group together related functionalities. These packages can be big or small depending on the design and architecture. Inside a package all constants (classes and modules) have the same namespace. For example:
- In a package `ci`, all the classes would be nested under `Ci::` namespace. There can be also nested namespaces like `Ci::PipelineProcessing::`.
- In a package `ci-pipeline_creation` all classes are nested under `Ci::PipelineCreation`, like `Ci::PipelineCreation::Chain::Command`.
- In a package `ci` a class named `MergeRequests::UpdateHeadPipelineService` would not be allowed because it would not match the package's namespace.
- This can be enforced easily with [Packwerk-based RuboCop cops](https://github.com/rubyatscale/rubocop-packs/blob/main/lib/rubocop/cop/packs/root_namespace_is_pack_name.rb).
- `bounded context` is a top-level Packwerk package that represents a macro aspect of the domain. For example: `Ci::`, `MergeRequests::`, `Packages::`, etc.
- A bounded context is represented by a single Ruby module/namespace. For example, `Ci::` and not `Ci::JobArtifacts::`.
- A bounded context can be made of 1 or multiple Packwerk packages. Nested packages would be recommended if the domain is quite complex and we want to enforce privacy among all the implementation details. For example: `Ci::PipelineProcessing::` and `Ci::PipelineCreation::` could be separate packages of the same bounded context and expose their public API while keeping implementation details private.
- A new bounded context like `RemoteDevelopment::` can be represented by a single package, while large and complex bounded contexts like `Ci::` would need to be organized into smaller/nested packages.
## References
[List of references](references.md)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,52 +1,11 @@
---
status: proposed
creation-date: "2023-09-29"
authors: [ "@fabiopitino" ]
coach: [ ]
approvers: [ ]
owning-stage: ""
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/packages_extraction/'
remove_date: '2025-07-08'
---
# Convert domain module into packages
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/packages_extraction/).
The general steps of refactoring existing code to modularization could be:
1. Use the same namespace for all classes and modules related to the same [bounded context](bounded_contexts.md).
- **Why?** Without at least a rough understanding of the domains at play in the codebase, it is difficult to draw up a plan.
Having well-namespaced code that everyone else can follow is also a prerequisite for modularization.
- If a domain is already well namespaced and no similar or related namespaces exist, we can move directly to the
next step.
1. Prepare Rails development for Packwerk packages. This is a **one-off step**, with perhaps some improvements
added over time.
- We will make the Rails autoloader work with Packwerk's directory structure, as demonstrated in
[this PoC](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/129254/diffs#note_1512982957) and sketched after this list.
- We will have [Danger-Packwerk](https://github.com/rubyatscale/danger-packwerk) running in CI for merge requests.
- We will possibly have the Packwerk check running in Lefthook on pre-commit or pre-push.
1. Move file into a Packwerk package.
- This should consist of creating a Packwerk package and iteratively moving files into it.
- Constants are auto-loaded correctly whether they are in `app/` or `lib/` inside a Packwerk package.
- This is a phase where the domain code will be split between the package directory and the Rails directory structure.
**We must move quickly here**.
1. Enforce namespace boundaries by requiring packages to declare their [dependencies explicitly](https://github.com/Shopify/packwerk/blob/main/USAGE.md#enforcing-dependency-boundary)
and only depend on other packages' [public interface](https://github.com/rubyatscale/packwerk-extensions#privacy-checker).
- **Why?** Up until now all constants would be public since we have not enforced privacy. By moving existing files
into packages without enforcing boundaries we can focus on wrapping a namespace in a package without being distracted
by Packwerk privacy violations. By enforcing privacy afterwards, we gain an understanding of the coupling between various
constants and domains.
- This way we know what constants need to be made public (as they are used by other packages) and what can
remain private (taking advantage of encapsulation). We will use Packwerk's recorded violations (like RuboCop TODOs)
to refactor the code over time.
- We can update the dependency graph to see where it fits in the overall architecture.
1. Work off Packwerk's recorded violations to make refactorings. **This is a long term phase** that the DRIs of the
domain need to nurture over time. We will use Packwerk failures and the dependency diagram to influence the modular design.
- Revisit whether a class should be private instead of public, and create a better interface.
- Move constants to a different package if they are too coupled to it.
- Join packages if they are too coupled to each other.
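As a minimal sketch of the autoloader preparation from step 2 (the `domains/` layout, glob pattern, and paths are assumptions borrowed from the directory structure PoC, not the actual configuration), registering package directories with the Rails autoloader could look roughly like this:

```ruby
# config/application.rb — sketch only; the domains/ layout is an assumption.
module Gitlab
  class Application < Rails::Application
    # Register each package's app/ subdirectories (models, services, lib, ...)
    # so constants autoload no matter which package they live in. A new
    # package that follows the same layout needs no extra configuration.
    Dir.glob(Rails.root.join("domains/*/core/app/*")).sort.each do |dir|
      config.autoload_paths << dir
      config.eager_load_paths << dir
    end
  end
end
```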
Once we have Packwerk configured for the Rails application (step 2 above), emerging domains could be directly implemented
as Packwerk packages, benefiting from isolation and a clear interface immediately.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,134 +1,11 @@
---
status: proposed
creation-date: "2023-07-05"
authors: [ "@grzesiek", "@fabiopitino" ]
coach: [ ]
owners: [ ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/proof_of_concepts/'
remove_date: '2025-07-08'
---
# Modular Monolith: PoCs
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/proof_of_concepts/).
Modularization of our monolith is a complex project. There will be many
unknowns. One thing that can help us mitigate the risks and deliver key
insights is a set of Proof-of-Concepts that we could deliver early on, to better
understand what will need to be done.
## Inter-module communication
A PoC that we plan to deliver is a PoC of inter-module communication. We do
recognize the need to separate modules, but still allow them to communicate
together using a well-defined interface. Modules can communicate through
facade classes (like libraries usually do), or through an eventing system. Both
ways are important.
The main question is: how do we want to define the interface and how to design
the communication channels?
It is one of our goals to make it possible to plug modules out, and operate
some of them as separate services. This will make it easier to deploy GitLab.com
in the future and scale key domains. One possible way to achieve this goal
would be to design the inter-module communication using a protobuf as an
interface and gRPC as a communication channel. When modules are plugged-in, we
would bypass gRPC and serialization and use in-process communication primitives
(while still using protobuf as an interface). When a module gets plugged-out,
gRPC would carry messages between modules.
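As a purely illustrative sketch of the facade style (all module, class, and method names are invented, and this is not the PoC implementation), callers depend only on a small public entry point, so the in-process implementation behind it could later be replaced by a gRPC client exchanging the same protobuf-defined messages:

```ruby
# Sketch only; module, class, and method names are hypothetical.
module Ci
  # Public, stable entry point that other modules are allowed to call.
  module Facade
    Pipeline = Struct.new(:id, :status, keyword_init: true)

    def self.create_pipeline(project_id:, ref:)
      # In-process call today; could become a gRPC call carrying the same
      # message if the CI module is plugged out as a separate service.
      Internal::CreatePipeline.new.call(project_id: project_id, ref: ref)
    end
  end

  # Implementation detail, private to the CI module.
  module Internal
    class CreatePipeline
      def call(project_id:, ref:)
        Facade::Pipeline.new(id: 1, status: "created")
      end
    end
  end
end

Ci::Facade.create_pipeline(project_id: 42, ref: "main")
# => #<struct Ci::Facade::Pipeline id=1, status="created">
```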
## Use Packwerk to enforce module boundaries
Packwerk is a static analyzer that helps defining and enforcing module boundaries
in Ruby.
[In this PoC merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/98801)
we demonstrate a possible directory structure of the monolith broken down into separate
modules.
The PoC also aims to solve the problem of EE (and JH) extensions, allowing the
Rails autoloader to be tweaked depending on whether to load only the Core codebase or
also the extensions.
The PoC also attempted to only move a small part of the `Ci::` namespace into a
`components/ci` Packwerk package. This seems to be the most iterative approach
explored so far.
There are different approaches we could use to adopt Packwerk. Other PoCs explored
are the [large extraction of the CI package](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/88899)
and [moving the two main CI classes into a package](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/90595).
All three PoCs have a lot in common, from the introduction of Packwerk packages and configurations
to setting paths for the autoloader to work with any packages. What changes between the
various merge requests is the approach on choosing which files to move first.
The main goals of the PoC were:
- understand if Packwerk can be used on the GitLab codebase.
- understand the learning curve for developers.
- verify support for EE and JH extensions.
- allow gradual modularization.
### Positive results
- Using Packwerk would be pretty simple on GitLab since it's designed primarily to work
on Rails codebases.
- We can change the organization of the domain code to be module-oriented instead of following
the MVC pattern. It requires small initial changes to allow the Rails autoloading
to support the new directory structure, which is by the way not imposed by Packwerk.
After that, registering a new top-level package/bounded-context would be a 1 LOC change.
- Using the correct directory structure indicated in the [PoC](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/98801)
allows packages to contain all the code, including EE and JH extensions.
- Gradual modularization is possible, and we can have any degree of modularization we want,
from no enforcement initially down to complete isolation simulating an in-memory micro-service environment.
- Moving files into a Packwerk package doesn't necessarily mean renaming constants.
While this is not advisable long term, it's extra flexibility that the tool provides.
- For example: If we are extracting the `Ci::` module into a Packwerk package there can be
constants that belong to the CI domain but are not namespaced, like `CommitStatus` or
that have a different namespace, like `Gitlab::Ci::`.
Packwerk allows such constants to be moved inside the `ci` package and correctly flags
boundary violations.
- Packwerk enhancements from the RubyAtScale tooling make it possible to enforce that all constants inside
a package share the same Ruby namespace. We would eventually want to leverage that.
- RubyAtScale also provides tools to track metrics about modularization and adoption, which we
would need to monitor and drive as an engineering organization.
- Packwerk has IDE extensions (e.g. for VS Code) to provide realtime feedback on violations
(like RuboCop). It can also be run via CLI during the development workflow against a single
package. It could be integrated into pre-push Git hooks or Danger during code reviews.
### Challenges
Some of these challenges are not specific to Packwerk as a tool or approach. They were observed
during the PoC and relate more generally to the process of modularization:
- There is no right or wrong approach when introducing Packwerk packages. We need to define
clear guidelines to give developers the tools to make the best decision:
- Sometimes it could be creating an empty package and moving files into it gradually.
- Sometimes it could be wrapping an already well designed and isolated part of the codebase.
- Sometimes it could be creating a new package from scratch.
- As we move code to a different directory structure, we need to involve JiHu, as they manage
extensions following the current directory structure.
We may have modules that are partially migrated, and we need to ensure JiHu is up to date
with the current progress.
- After privacy/dependency checks are enabled, Packwerk will log a lot of violations
(like RuboCop TODOs) since constant references in a Rails codebase are very entangled.
- The team owning the package needs to define a vision for the package.
What would the package look like once all violations have been fixed?
This may mean specifying where the package fits in the
[context map](https://www.oreilly.com/library/view/what-is-domain-driven/9781492057802/ch04.html)
of the system. How the current package should be used by another package `A` and how
it should use other packages.
- The vision above should tell developers how they should fix these violations over time.
Should they make a specific constant public? Should the package list another package among its
dependencies? Should events be used in some scenarios?
- Teams will likely need guidance in doing that. We may need to have a team of engineers, such as
maintainers with a very broad understanding of the domains, who will support engineering
teams in this effort.
- Changes to the CI configuration for tuning Knapsack and selective testing were ignored during the
PoC.
## Frontend sorting hat
Frontend sorting-hat is a PoC for combining multiple domains to render a full
page of GitLab (with menus, and items that come from multiple separate
domains).
## Frontend assets aggregation
Frontend assets aggregation is a PoC for a possible separation of micro-frontends.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,70 +1,11 @@
---
status: proposed
creation-date: "2023-06-21"
authors: [ "@fabiopitino" ]
coach: [ ]
approvers: [ ]
owning-stage: ""
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/references/'
remove_date: '2025-07-08'
---
# References
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/modular_monolith/references/).
## Related design docs
- [Composable codebase design doc](../composable_codebase_using_rails_engines/index.md)
## Related Issues
- [Split GitLab monolith into components](https://gitlab.com/gitlab-org/gitlab/-/issues/365293)
- [Make it simple to build and use "Decoupled Services"](https://gitlab.com/gitlab-org/gitlab/-/issues/31121)
- [Use nested structure to organize CI classes](https://gitlab.com/gitlab-org/gitlab/-/issues/209745)
- [Create new models / classes within a module / namespace](https://gitlab.com/gitlab-org/gitlab/-/issues/212156)
- [Make teams to be maintainers of their code](https://gitlab.com/gitlab-org/gitlab/-/issues/25872)
- [Add backend guide for Dependency Injection](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/73644)
## Internal Slack Channels
- [`#modular_monolith`](https://gitlab.slack.com/archives/C03NTK6HZBM)
- [`#architecture`](https://gitlab.slack.com/archives/CJ4DB7517)
## Reference Implementations / Guides
Gusto / RubyAtScale:
- [RubyAtScale toolchain for modularization](https://github.com/rubyatscale)
- [Gusto's engineering blog](https://engineering.gusto.com/laying-the-cultural-and-technical-foundation-for-big-rails/)
- [Gradual modularization](https://gradualmodularization.com/) (successor to CBRA)
- [Component-Based Rails Applications](https://cbra.info) ("deprecated")
Shopify:
- [Packwerk](https://github.com/Shopify/packwerk)
- [Shopify's journey to modularization](https://shopify.engineering/shopify-monolith)
- [Internal GitLab doc transcript of an AMA with a Shopify engineer](https://docs.google.com/document/d/1uZbcaK8Aqs-D_n7_uQ5XE295r5UWDJEBwA6g5bTjcwc/edit#heading=h.d1tml5rlzrpa)
Domain-Driven Rails / Rails Event Store:
Rails Event Store is relevant because it is a mechanism to achieve many
of the goals discussed here, and is based upon patterns used by Arkency
to build production applications.
This doesn't mean we need to use this specific framework or approach.
However, the general concepts of DDD/ES/CQRS are important and in some
cases may be necessary to achieve the goals of this blueprint, so it's
useful to have concrete production-proven implementations of those
concepts to look at as an example.
- [Arkency's domain-driven Rails](https://products.arkency.com/domain-driven-rails/)
- [Arkency's Rails Event Store](https://railseventstore.org)
App Continuum:
An illustration of how an application can evolve from a small, unstructured app, through various
stages including a modular well-structured monolith, all the way to a microservices architecture.
Includes discussion of why you might want to stop at various stages, and specifically the
challenges/concerns with making the jump to microservices, and why sticking with a
well-structured monolith may be preferable in many cases.
- [App Continuum](https://www.appcontinuum.io)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

View File

@ -1,154 +1,11 @@
---
status: ongoing
creation-date: "2024-05-27"
authors: [ "@bsandlin" ]
coach: "@ntepluhina"
approvers: [ "@dhershkovitch", "@marknuzzo" ]
owning-stage: "~devops::verify"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/pipeline_mini_graph/'
remove_date: '2025-07-08'
---
<!-- Blueprints often contain forward-looking statements -->
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/pipeline_mini_graph/).
# Pipeline Mini Graph
This blueprint serves as living documentation for the Pipeline Mini Graph. The Pipeline Mini Graph is used in various places throughout the platform to communicate to users the status of the relevant pipeline. Users are able to re-run jobs directly from the component or drill down into those jobs and linked pipelines for further investigation.
![Pipeline Mini Graph](img/pipeline_mini_graph.png)
## Motivation
While the Pipeline Mini Graph primarily functions via REST, we are updating the component to support GraphQL and subsequently migrating all instances of this component to GraphQL. This documentation will serve as the SSOT for this refactor. Developers have expressed difficulty contributing to this component as the two APIs co-exist, so we need to make this code easier to update while also supporting both REST and GraphQL. This refactor lives behind a feature flag called `ci_graphql_pipeline_mini_graph`.
### Goals
- Improved maintainability
- Backwards compatibility
- Improved query performance
- Deprecation of REST support
### Non-Goals
- Redesign of the Pipeline Mini Graph UI
## Proposal
To break down implementation, we are taking the following steps:
1. Separate the REST version and the GraphQL version of the component into two directories called `pipeline_mini_graph` and `legacy_pipeline_mini_graph`. This way, developers can contribute more easily, and we can easily remove the REST version once all apps are using GraphQL.
1. Finish updating the newer component to fully support GraphQL.
1. Optimize GraphQL query structure to be more performant.
1. Roll out `ci_graphql_pipeline_mini_graph` to globally enable GraphQL instances of the component.
## Implementation Details
| Issue | Milestone | MR | Status |
| ----- | --------- | -- | ------ |
| [Move legacy files to new directory](https://gitlab.com/gitlab-org/gitlab/-/work_items/464375) | [17.1](https://gitlab.com/groups/gitlab-org/-/milestones/99#tab-issues) | [154625](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/154625) | ✅ |
| [Move remaining legacy code](https://gitlab.com/gitlab-org/gitlab/-/work_items/464379) | [17.1](https://gitlab.com/groups/gitlab-org/-/milestones/99#tab-issues) |[154818](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/154818) | ✅ |
| [Create README for PMG](https://gitlab.com/gitlab-org/gitlab/-/work_items/464632) | [17.1](https://gitlab.com/groups/gitlab-org/-/milestones/99#tab-issues) | [154964](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/154964) | ✅ |
| [GraphQL Query Optimization](https://gitlab.com/gitlab-org/gitlab/-/issues/465309) | [17.1](https://gitlab.com/groups/gitlab-org/-/milestones/99#tab-issues) | [465309](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/155129) | ✅ |
| [Dedicated component for downstream pipelines](https://gitlab.com/gitlab-org/gitlab/-/issues/466238) | [17.1](https://gitlab.com/groups/gitlab-org/-/milestones/99#tab-issues) | [155382](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/155382) | ✅ |
| [Fetch Stage by ID](https://gitlab.com/gitlab-org/gitlab/-/issues/464100) | [17.2](https://gitlab.com/groups/gitlab-org/-/milestones/100#tab-issues) | [157506](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/157506) | ✅ |
| [Job Item](https://gitlab.com/gitlab-org/gitlab/-/issues/467278) | [17.2](https://gitlab.com/groups/gitlab-org/-/milestones/100#tab-issues) | [157798](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/157798) | In Review |
| [Job Actions](https://gitlab.com/gitlab-org/gitlab/-/issues/467279) | [17.2](https://gitlab.com/groups/gitlab-org/-/milestones/100#tab-issues) | TBD | In Dev |
| [Rollout `ci_graphql_pipeline_mini_graph`](https://gitlab.com/gitlab-org/gitlab/-/issues/407818) | [17.2](https://gitlab.com/groups/gitlab-org/-/milestones/100#tab-issues) | TBD | Blocked |
| [Migrate MR PMG to GraphQL instance](https://gitlab.com/gitlab-org/gitlab/-/issues/419725) | [17.2](https://gitlab.com/groups/gitlab-org/-/milestones/100#tab-issues) | TBD | Blocked |
| [Migrate pipeline editor PMG to GraphQL instance](https://gitlab.com/gitlab-org/gitlab/-/issues/466275) | TBD | TBD | Blocked |
| [Migrate commit page PMG to GraphQL instance](https://gitlab.com/gitlab-org/gitlab/-/issues/466274) | TBD | TBD | Blocked |
| [Remove dead logic from PMG codebase](https://gitlab.com/gitlab-org/gitlab/-/issues/466277) | TBD | TBD | Blocked |
## Design Details
### REST Structure
#### File Structure
```plaintext
├── pipeline_mini_graph/
│   └── legacy_pipeline_mini_graph/
│       ├── legacy_job_item.vue
│       ├── legacy_linked_pipelines_mini_list.vue
│       ├── legacy_pipeline_mini_graph.vue
│       ├── legacy_pipeline_stage.vue
│       └── legacy_pipeline_stages.vue
```
All data for the legacy pipeline mini graph is passed into the REST instance of the component. This data comes from various API calls throughout different apps which use the component.
#### Properties
| Name | Type | Required | Description |
| ---- | ---- | -------- | ----------- |
|`downstreamPipelines` | Array | false | pipelines triggered by current pipeline |
|`isMergeTrain` | Boolean | false | whether the pipeline is part of a merge train |
|`pipelinePath` | String | false | pipeline URL |
|`stages` | Array | true | stages of current pipeline |
|`updateDropdown` | Boolean | false | whether to fetch jobs when the dropdown is open |
|`upstreamPipeline` | Object | false | upstream pipeline which triggered current pipeline |
### GraphQL Structure
The GraphQL instance of the pipeline mini graph manages its own data.
#### Current File Structure
```plaintext
├── pipeline_mini_graph/
│   ├── job_item.vue
│   ├── linked_pipelines_mini_list.vue
│   ├── pipeline_mini_graph.vue
│   ├── pipeline_stage.vue
│   ├── pipeline_stages.vue
│   └── graphql/
│       ├── get_pipeline_stage_query.query.graphql
│       └── get_pipeline_stages_query.query.graphql
```
#### Current Properties
| Name | Type | Required | Description |
| --- | --- | --- | --- |
|`fullPath` | String | true | full path for the queries |
|`iid` | String | true | pipeline iid for the queries |
|`isMergeTrain` | Boolean | false | whether the pipeline is part of a merge train (under consideration) |
|`pipelineEtag` | String | true | etag for caching (under consideration) |
|`pollInterval` | Number | false | interval for polling updates |
#### Considerations
##### Properties
- `isMergeTrain`: This property is specific to the MR page and is used to display a message in the job dropdown to warn users that merge train jobs cannot be retried. This is an odd flow. Perhaps we should consider either having this data come from somewhere else within the pipeline mini graph, or having it live in the merge train widget itself. It is worth noting that this boolean is not used for any logic other than displaying this message.
- `pipelineEtag`: Consider whether this data must be passed into the pipeline mini graph, or whether we can generate this within the component through a query.
##### Query Structure
Currently, the approach is to use two queries in the parent file:
- [getPipelineStages](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/assets/javascripts/ci/pipeline_mini_graph/graphql/queries/get_pipeline_stages.query.graphql?ref_type=heads): populates the stages of the current pipeline
- [getLinkedPipelines](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/assets/javascripts/ci/pipeline_details/graphql/queries/get_linked_pipelines.query.graphql?ref_type=heads): if there is an upstream pipeline or any downstream pipelines, this query will populate those
Another query called [getPipelineStage](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/assets/javascripts/ci/pipeline_mini_graph/graphql/queries/get_pipeline_stage.query.graphql?ref_type=heads) will be used to populate the dropdown when a stage icon is clicked.
We need to consider whether this is the most performant way to query for the mini graph.
Should pipeline stages and linked pipelines be populated by two separate queries, or is it acceptable to have a single query for all icons and statuses displayed in the mini graph? Considerations below:
- The reason this is currently split is that the upstream/downstream sections of the mini graph were added as a feature enhancement after the initial component already existed, and they used to be a premium feature. Since this has since been restructured to be available for free, we should reconsider.
- The pipeline mini graph exists in several pipeline lists, so there can be up to 15 instances rendering on a page at a time.
- Since we have not implemented subscriptions, we are still using polling to update this component.
- If we merge the two queries, do we hit the maximum query complexity or not?
- Loading time and the UI: What information do we want to render the fastest? Can we defer some information to load with the second query?
##### Directory Structure
There has been [interest](https://gitlab.com/gitlab-org/gitlab/-/issues/345571) in surfacing more information from downstream pipelines. As such, we may want to consider rethinking `linked_pipelines_mini_list.vue` and moving the downstream pipelines into their own file called `downstream_pipelines.vue` instead. The upstream pipeline could then be rendered with a simple `ci-icon`.
## Future Improvements
- [GraphQL Subscriptions](https://gitlab.com/gitlab-org/gitlab/-/issues/406652)
- [Show downstream pipeline jobs](https://gitlab.com/gitlab-org/gitlab/-/issues/345571)
- Possible redesign
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -210,12 +210,49 @@ and all updates should first be made there.
On a regular basis, the changes made in the `gitlab` project to the Vale and markdownlint configuration should be
synchronized to the other projects. In each of the [supported projects](#supported-projects):
1. Create a new branch.
1. Copy the configuration files from the `gitlab` project into this branch, overwriting
the project's old configuration. Make sure no project-specific changes from the `gitlab`
project are included. For example, [`RelativeLinks.yml`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/doc/.vale/gitlab/RelativeLinks.yml)
is hard coded for specific projects.
1. Create a merge request and submit it to a technical writer for review and merge.
1. Create a new branch. Add `docs-` to the beginning or `-docs` to the end of the branch name. Some projects use this
convention to limit the jobs that run.
1. Copy the configuration files from the `gitlab` project. For example, in the root directory of the project, run:
```shell
# Copy markdownlint configuration file
cp ../gitlab/.markdownlint-cli2.yaml .
# Copy Vale configuration files for a project with documentation stored in 'docs' directory
cp -r ../gitlab/doc/.vale docs
```
1. Review the diff created for `.markdownlint-cli2.yaml`. For example, run:
```shell
git diff .markdownlint-cli2.yaml
```
1. Remove any changes that aren't required. For example, `customRules` is only used in the `gitlab` project.
1. Review the diffs created for the Vale configuration. For example, run:
```shell
git diff docs
```
1. Remove unneeded changes to `RelativeLinks.yml`. This rule is specific to each project.
1. Remove any `.tmpl` files. These files are only used in the `gitlab` project.
1. Run `markdownlint-cli2` to check for any violations of the new rules. For example:
```shell
markdownlint-cli2 docs/**/*.md
```
1. Run Vale to check for any violations of the new rules. For example:
```shell
vale --minAlertLevel error docs
```
1. Commit the changes to the new branch. Some projects require
[conventional commits](https://www.conventionalcommits.org/en/v1.0.0/) so check the contributing information for the
project before committing.
1. Submit a merge request for review.
## Update linting images

View File

@ -56,26 +56,6 @@ DETAILS:
- <i class="fa fa-youtube-play youtube" aria-hidden="true"></i> [Watch overview](https://youtu.be/ds7SG1wgcVM)
- [View documentation](../project/repository/code_suggestions/index.md).
### Vulnerability resolution
DETAILS:
**Tier:** Ultimate with [GitLab Duo Enterprise](../../subscriptions/subscription-add-ons.md) add-on
**Offering:** GitLab.com, Self-managed, GitLab Dedicated
- Help resolve a vulnerability by generating a merge request that addresses it.
- LLM: Anthropic's [`claude-3-haiku`](https://docs.anthropic.com/en/docs/about-claude/models#claude-3-a-new-generation-of-ai)
- [View documentation](../application_security/vulnerabilities/index.md#vulnerability-resolution).
### Vulnerability explanation
DETAILS:
**Tier:** Ultimate with [GitLab Duo Enterprise](../../subscriptions/subscription-add-ons.md)
**Offering:** GitLab.com, Self-managed, GitLab Dedicated
- Helps you understand vulnerabilities, how they can be exploited, and how to fix them.
- LLM: Anthropic's [`claude-3-haiku`](https://docs.anthropic.com/en/docs/about-claude/models#claude-3-a-new-generation-of-ai)
- [View documentation](../application_security/vulnerabilities/index.md#explaining-a-vulnerability).
### Code explanation in the IDE
DETAILS:
@ -118,6 +98,26 @@ DETAILS:
- LLM: Vertex AI Codey [`text-bison`](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/text-bison).
- [View documentation](../project/merge_requests/duo_in_merge_requests.md#generate-a-merge-commit-message).
### Vulnerability explanation
DETAILS:
**Tier:** Ultimate with [GitLab Duo Enterprise](../../subscriptions/subscription-add-ons.md)
**Offering:** GitLab.com, Self-managed, GitLab Dedicated
- Helps you understand vulnerabilities, how they can be exploited, and how to fix them.
- LLM: Anthropic's [`claude-3-haiku`](https://docs.anthropic.com/en/docs/about-claude/models#claude-3-a-new-generation-of-ai).
- [View documentation](../application_security/vulnerabilities/index.md#explaining-a-vulnerability).
### Vulnerability resolution
DETAILS:
**Tier:** Ultimate with [GitLab Duo Enterprise](../../subscriptions/subscription-add-ons.md) add-on
**Offering:** GitLab.com, Self-managed, GitLab Dedicated
- Help resolve a vulnerability by generating a merge request that addresses it.
- LLM: Anthropic's [`claude-3-haiku`](https://docs.anthropic.com/en/docs/about-claude/models#claude-3-a-new-generation-of-ai).
- [View documentation](../application_security/vulnerabilities/index.md#vulnerability-resolution).
## Beta features
### Merge request template population

View File

@ -54,6 +54,11 @@ Because this is an experimental feature,
| `work_items_rolledup_dates` | Calculates the start and due dates in a hierarchy for work items. | Group | **Required** |
| `epic_and_work_item_associations_unification` | Delegates other epic and work item associations. | Group | **Required** |
## Feedback
If you run into any issues while trying out this change, you can use the
[feedback issue](https://gitlab.com/gitlab-org/gitlab/-/issues/463598) to provide more details.
## Related topics
- [Work items development](../../../development/work_items.md)

View File

@ -77,6 +77,8 @@ Value streams are container objects for the stages. You can have multiple value
### Value stream stage events
> - Merge request first reviewer assigned event [introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/466383) in GitLab 17.2. Reviewer assignment events in merge requests created or updated prior to GitLab 17.2 are not available for reporting.
Events are the smallest building blocks of the value stream analytics feature. A stage consists of a start event and an end event.
The following stage events are available:
@ -95,6 +97,7 @@ The following stage events are available:
- MR created
- MR first commit time
- MR first assigned
- MR first reviewer assigned
- MR first deployed
- MR label added
- MR label removed

View File

@ -42,6 +42,14 @@ Practically, you can differentiate between achievements that are awarded:
- Once and revocable. For example, a "Core team member" achievement.
- Multiple times. For example, a "Contributor of the month" achievement.
## View group achievements
To view all available and awarded achievements for a group:
- Go to `https://gitlab.com/groups/<group-path>/-/achievements`.
The page displays a list of achievements and the members who were awarded the achievement.
## View a user's achievements
You can view a user's achievements on their profile page.
@ -92,44 +100,52 @@ Prerequisites:
- You must have the Maintainer or Owner role for the namespace.
To create an achievement, call the [`achievementsCreate` GraphQL mutation](../../api/graphql/reference/index.md#mutationachievementscreate).
To create an achievement:
```graphql
mutation achievementsCreate($file: Upload!) {
achievementsCreate(
input: {
namespaceId: "gid://gitlab/Namespace/<namespace id>",
name: "<name>",
description: "<description>",
avatar: $file}
) {
errors
achievement {
id
name
description
avatarUrl
- In the UI:
1. On the [Achievements page](#view-group-achievements), select **New achievement**.
1. Enter a name for the achievement.
1. Optional. Enter a description and upload an avatar for the achievement.
1. Select **Save changes**.
- With the GraphQL API, call the [`achievementsCreate` GraphQL mutation](../../api/graphql/reference/index.md#mutationachievementscreate):
```graphql
mutation achievementsCreate($file: Upload!) {
achievementsCreate(
input: {
namespaceId: "gid://gitlab/Namespace/<namespace id>",
name: "<name>",
description: "<description>",
avatar: $file}
) {
errors
achievement {
id
name
description
avatarUrl
}
}
}
}
```
```
To supply the avatar file, call the mutation using `curl`:
To supply the avatar file, call the mutation using `curl`:
```shell
curl "https://gitlab.com/api/graphql" \
-H "Authorization: Bearer <your-pat-token>" \
-H "Content-Type: multipart/form-data" \
-F operations='{ "query": "mutation ($file: Upload!) { achievementsCreate(input: { namespaceId: \"gid://gitlab/Namespace/<namespace-id>\", name: \"<name>\", description: \"<description>\", avatar: $file }) { achievement { id name description avatarUrl } } }", "variables": { "file": null } }' \
-F map='{ "0": ["variables.file"] }' \
-F 0='@/path/to/your/file.jpg'
```
```shell
curl "https://gitlab.com/api/graphql" \
-H "Authorization: Bearer <your-pat-token>" \
-H "Content-Type: multipart/form-data" \
-F operations='{ "query": "mutation ($file: Upload!) { achievementsCreate(input: { namespaceId: \"gid://gitlab/Namespace/<namespace-id>\", name: \"<name>\", description: \"<description>\", avatar: $file }) { achievement { id name description avatarUrl } } }", "variables": { "file": null } }' \
-F map='{ "0": ["variables.file"] }' \
-F 0='@/path/to/your/file.jpg'
```
When successful, the response returns the achievement ID:
When successful, the response returns the achievement ID:
```shell
{"data":{"achievementsCreate":{"achievement":{"id":"gid://gitlab/Achievements::Achievement/1","name":"<name>","description":"<description>","avatarUrl":"https://gitlab.com/uploads/-/system/achievements/achievement/avatar/1/file.jpg"}}}}
```
```shell
{"data":{"achievementsCreate":{"achievement":{"id":"gid://gitlab/Achievements::Achievement/1","name":"<name>","description":"<description>","avatarUrl":"https://gitlab.com/uploads/-/system/achievements/achievement/avatar/1/file.jpg"}}}}
```
## Update an achievement

View File

@ -45,7 +45,7 @@ Prerequisites:
- Must be between 2 and 255 characters in length.
- Must only include non-accented letters, digits, `_`, `-`, and `.`.
- Must not:
- Start with `-`.
- Start with `_`, `-`, or `.`.
- Contain emoji.
- End with `.` or `.<reserved file extension>`, for example `jon.png`, `jon.git` or `jon.atom`. However,
`jonpng` is valid.

View File

@ -30,7 +30,7 @@ module API
def publish_draft_note(params:)
::DraftNotes::PublishService
.new(merge_request(params: params), current_user)
.execute(get_draft_note(params: params))
.execute(draft: get_draft_note(params: params))
end
def publish_draft_notes(params:)

View File

@ -0,0 +1,77 @@
# frozen_string_literal: true
module Gitlab
module Import
class ImportUserCreator
include Gitlab::Utils::StrongMemoize
NoRootGroupError = Class.new(StandardError)
attr_reader :portable
def initialize(portable:)
@portable = portable
end
def execute
raise NoRootGroupError, 'Portable has no root group' unless root_ancestor
return import_user if import_user
User.transaction do
user = create_user
create_namespace_import_user(user)
user
end
rescue ActiveRecord::RecordNotUnique => e
::Import::Framework::Logger.warn(
message: 'Failed to create namespace_import_user',
error: e.message
)
# Import user already exists, try finding it again
import_user
end
private
def create_user
User.create!(
user_type: :import_user,
name: 'Import User',
username: username_and_email_generator.username,
email: username_and_email_generator.email
) do |u|
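# Build the personal namespace for the new import user in the default organization.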
u.assign_personal_namespace(Organizations::Organization.default_organization)
end
end
def create_namespace_import_user(user)
::Import::NamespaceImportUser.create!(
user_id: user.id,
namespace_id: root_ancestor.id
)
end
def username_and_email_generator
Gitlab::Utils::UsernameAndEmailGenerator.new(
username_prefix: username_prefix,
email_domain: "noreply.#{Gitlab.config.gitlab.host}"
)
end
strong_memoize_attr :username_and_email_generator
def import_user
root_ancestor.import_user
end
def root_ancestor
portable.root_ancestor
end
def username_prefix
"import_user_namespace_#{root_ancestor.id}"
end
end
end
end

View File

@ -0,0 +1,180 @@
# frozen_string_literal: true
NUMBER_OF_GITLAB_TRACES = 1000
DEFAULT_NUMBER_OF_TRACES = 250
namespace :gitlab do
namespace :populate_job_traces do
desc "GitLab | Populates projects builds with real job traces, requires existing builds"
task :populate,
[:target_project_id, :access_token, :custom_project_id, :custom_number_to_populate] =>
[:environment] do |_t, args|
# These projects are chosen as they're open to the public
@projects = [
["gitlab-org/gitlab", 278964, NUMBER_OF_GITLAB_TRACES],
["gitlab-org/modelops/ai-model-validation-and-research/ai-evaluation/prompt-library",
46678122, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitlab-runner", 250833, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitlab-services/design.gitlab.com", 4456656, DEFAULT_NUMBER_OF_TRACES],
["gitlab-com/www-gitlab-com", 7764, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitlab-development-kit", 74823, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/omnibus-gitlab", 20699, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/cli", 34675721, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitaly", 2009901, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitlab-docs", 1794617, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitlab-ui", 7071551, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitlab-vscode-extension", 5261717, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/gitlab-pages", 734943, DEFAULT_NUMBER_OF_TRACES],
["gitlab-org/release-tools", 430285, DEFAULT_NUMBER_OF_TRACES]
].freeze
if args.custom_project_id.present? && args.custom_number_to_populate.present?
@projects = [["custom_project", args.custom_project_id.to_i, args.custom_number_to_populate.to_i]]
end
@project_id = args.target_project_id
@access_token = args.access_token
start_populate
end
desc "GitLab | Populate existing projects builds with only 1 trace per project"
task :populate_trial,
[:target_project_id, :access_token, :custom_project_id, :custom_number_to_populate] =>
[:environment] do |_t, args|
@projects = [
["gitlab-org/gitlab", 278964, 1],
["gitlab-org/modelops/ai-model-validation-and-research/ai-evaluation/prompt-library", 46678122, 1],
["gitlab-org/gitlab-runner", 250833, 1],
["gitlab-org/gitlab-services/design.gitlab.com", 4456656, 1],
["gitlab-com/www-gitlab-com", 7764, 1],
["gitlab-org/gitlab-development-kit", 74823, 1],
["gitlab-org/omnibus-gitlab", 20699, 1],
["gitlab-org/cli", 34675721, 1],
["gitlab-org/gitaly", 2009901, 1],
["gitlab-org/gitlab-docs", 1794617, 1],
["gitlab-org/gitlab-ui", 7071551, 1],
["gitlab-org/gitlab-vscode-extension", 5261717, 1],
["gitlab-org/gitlab-pages", 734943, 1],
["gitlab-org/release-tools", 430285, 1]
].freeze
if args.custom_project_id.present? && args.custom_number_to_populate.present?
@projects = [["custom_project", args.custom_project_id.to_i, args.custom_number_to_populate.to_i]]
end
@project_id = args.target_project_id
@access_token = args.access_token
start_populate
end
def start_populate
@num_builds_processed_this_project = 0
@current_project_index = -1 # This is because process goes to next project at first iteration
@current_project_id = nil
@ids_for_current_project = []
@current_job_index_for_project = 0
project = Project.find(@project_id)
start_time = Time.zone.now
project.builds.failed.each_batch(of: 500, order: :desc) do |batch|
puts "Starting batch with id range: #{batch.first.id} - #{batch.last.id}"
break if process_builds(batch)
end
puts "Time Elapsed: #{Time.zone.now - start_time}"
end
private
def process_builds(batch)
batch.each do |job|
# We've done all we should do for the current project, load the next one
if @current_job_index_for_project >= @ids_for_current_project.length
@current_project_index += 1
if @current_project_index >= @projects.length
puts "Finished!"
return true
end
@current_job_index_for_project = 0
_, project_id, number_to_load = @projects[@current_project_index]
@current_project_id = project_id
@ids_for_current_project = load_ids_from_pagination(project_id, number_to_load)
next if @ids_for_current_project.empty?
end
job_id = @ids_for_current_project[@current_job_index_for_project]
job_trace = load_trace_for_job(@current_project_id, job_id)
begin
job.trace.erase!
job.trace.set(job_trace)
rescue StandardError => e
puts "ERROR: (write log) #{job_id} - #{e}"
end
@current_job_index_for_project += 1
end
false
end
def load_ids_from_pagination(project_id, number_to_load)
url = "https://gitlab.com/api/v4/projects/#{project_id}/jobs?" \
"scope[]=failed" \
"&pagination=keyset" \
"&per_page=100" \
"&order_by=id" \
"&sort=desc"
headers = { "PRIVATE-TOKEN": @access_token }
ids = []
num_loaded = 0
(number_to_load / 100.to_f).ceil.times do
puts "Calling JOBs API: #{url}"
project_jobs_response = Gitlab::HTTP.get(url, headers: headers)
unless project_jobs_response.code == 200
puts "ERROR: (project api) #{project_jobs_response.code} : #{project_jobs_response.message}"
@current_project_index += 1
return []
end
project_jobs_response.each do |response|
break if num_loaded >= number_to_load
ids << response["id"]
num_loaded += 1
end
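# Keyset pagination exposes the next page in the `Link` response header
# (format: `<url>; rel="next"`); extract the URL between the angle brackets.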
url = project_jobs_response.headers["link"]&.match(/<([^>]*)>/)&.[](1)
end
puts "Loaded ##{ids.length} ids for #{project_id}"
ids
end
def load_trace_for_job(project_id, job_id)
job_trace_url = "https://gitlab.com/api/v4/projects/#{project_id}/jobs/#{job_id}/trace"
headers = { "PRIVATE-TOKEN": @access_token }
job_trace_response = Gitlab::HTTP.get(job_trace_url, headers: headers)
unless job_trace_response.code == 200
puts "ERROR: (trace api) #{job_trace_response.code} : #{job_trace_response.message}"
return "Ignore: Failed to fetch actual job trace"
end
job_trace_response
end
end
end

Some files were not shown because too many files have changed in this diff.