Add latest changes from gitlab-org/gitlab@master

This commit is contained in:
GitLab Bot 2024-07-10 21:35:01 +00:00
parent b1016dffb4
commit f3b09181be
156 changed files with 2406 additions and 13530 deletions

View File

@ -333,7 +333,7 @@ gem 'licensee', '~> 9.16' # rubocop:todo Gemfile/MissingFeatureCategory
# Detect and convert string character encoding
# We forked charlock_holmes at https://gitlab.com/gitlab-org/ruby/gems/charlock_holmes
# but changed its name to 'static_holmes' in the gemspec file.
gem 'static_holmes', '~> 0.7.7', require: 'charlock_holmes' # rubocop:todo Gemfile/MissingFeatureCategory
gem 'static_holmes', '~> 0.7.11', require: 'charlock_holmes', feature_category: :shared
# Detect mime content type from content
gem 'ruby-magic', '~> 0.6' # rubocop:todo Gemfile/MissingFeatureCategory

View File

@ -663,7 +663,7 @@
{"name":"state_machines","version":"0.5.0","platform":"ruby","checksum":"23e6249d374a920b528dccade403518b4abbd83841a3e2c9ef13e6f1a009b102"},
{"name":"state_machines-activemodel","version":"0.8.0","platform":"ruby","checksum":"e932dab190d4be044fb5f9cab01a3ea0b092c5f113d4676c6c0a0d49bf738d2c"},
{"name":"state_machines-activerecord","version":"0.8.0","platform":"ruby","checksum":"072fb701b8ab03de0608297f6c55dc34ed096e556fa8f77e556f3c461c71aab6"},
{"name":"static_holmes","version":"0.7.7","platform":"ruby","checksum":"87c426d821728915518444342aaac58d078b359e0f4a34e4e0516b0daadc8207"},
{"name":"static_holmes","version":"0.7.11","platform":"ruby","checksum":"c35cb3ed35986656e0b2bb4d69b89551d45c0a182ff6ae0181f1e2ba080732bb"},
{"name":"strings","version":"0.2.1","platform":"ruby","checksum":"933293b3c95cf85b81eb44b3cf673e3087661ba739bbadfeadf442083158d6fb"},
{"name":"strings-ansi","version":"0.2.0","platform":"ruby","checksum":"90262d760ea4a94cc2ae8d58205277a343409c288cbe7c29416b1826bd511c88"},
{"name":"swd","version":"1.3.0","platform":"ruby","checksum":"bc382a19e1d36a95529b25152976db61b80376c3d486b21c8dd60ac2b5c06389"},

View File

@ -1753,7 +1753,7 @@ GEM
state_machines-activerecord (0.8.0)
activerecord (>= 5.1)
state_machines-activemodel (>= 0.8.0)
static_holmes (0.7.7)
static_holmes (0.7.11)
strings (0.2.1)
strings-ansi (~> 0.2)
unicode-display_width (>= 1.5, < 3.0)
@ -2251,7 +2251,7 @@ DEPENDENCIES
ssh_data (~> 1.3)
stackprof (~> 0.2.25)
state_machines-activerecord (~> 0.8.0)
static_holmes (~> 0.7.7)
static_holmes (~> 0.7.11)
sys-filesystem (~> 1.4.3)
tanuki_emoji (~> 0.9)
telesignenterprise (~> 2.2)

View File

@ -42,7 +42,10 @@ window.snowplowPlugins = [
LinkClickTrackingPlugin(),
FormTrackingPlugin(),
TimezonePlugin(),
GaCookiesPlugin(),
GaCookiesPlugin({
ga4: true,
ga4MeasurementId: window.gl?.ga4MeasurementId,
}),
PerformanceTimingPlugin(),
ClientHintsPlugin(),
];

View File

@ -3,7 +3,7 @@ import { GlChart } from '@gitlab/ui/dist/charts';
import { DATA_VIZ_BLUE_500 } from '@gitlab/ui/src/tokens/build/js/tokens';
import { hexToRgba } from '@gitlab/ui/dist/utils/utils';
import { isNumber } from 'lodash';
import { formatDate } from '~/lib/utils/datetime/date_format_utility';
import { localeDateFormat } from '~/lib/utils/datetime/locale_dateformat';
import { logError } from '~/lib/logger';
function parseTimelineData(timelineData) {
@ -23,7 +23,7 @@ function parseTimelineData(timelineData) {
if (rawDate !== undefined && count !== undefined) {
// dates/timestamps are in seconds
const date = isNumber(rawDate) ? rawDate * 1000 : rawDate;
xData.push(formatDate(date));
xData.push(localeDateFormat.asDateTimeFull.format(date));
yData.push(count);
} else {
invalidDataPoints.push(f);

View File

@ -1,6 +1,7 @@
<script>
import { GlButton, GlIcon, GlSprintf, GlLink, GlFormCheckbox, GlToggle } from '@gitlab/ui';
import ConfirmDanger from '~/vue_shared/components/confirm_danger/confirm_danger.vue';
import glFeatureFlagMixin from '~/vue_shared/mixins/gl_feature_flags_mixin';
import settingsMixin from 'ee_else_ce/pages/projects/shared/permissions/mixins/settings_pannel_mixin';
import { __, s__ } from '~/locale';
import {
@ -17,6 +18,7 @@ import {
featureAccessLevelDescriptions,
modelExperimentsHelpPath,
modelRegistryHelpPath,
duoHelpPath,
} from '../constants';
import { toggleHiddenClassBySelector } from '../external';
import ProjectFeatureSetting from './project_feature_setting.vue';
@ -73,6 +75,8 @@ export default {
releasesHelpText: s__(
'ProjectSettings|Combine git tags with release notes, release evidence, and assets to create a release.',
),
duoLabel: s__('ProjectSettings|GitLab Duo'),
duoHelpText: s__('ProjectSettings|Use AI-powered features in this project.'),
securityAndComplianceLabel: s__('ProjectSettings|Security and Compliance'),
snippetsLabel: s__('ProjectSettings|Snippets'),
wikiLabel: s__('ProjectSettings|Wiki'),
@ -92,6 +96,7 @@ export default {
VISIBILITY_LEVEL_PUBLIC_INTEGER,
modelExperimentsHelpPath,
modelRegistryHelpPath,
duoHelpPath,
components: {
CiCatalogSettings,
ProjectFeatureSetting,
@ -108,7 +113,7 @@ export default {
'jh_component/pages/projects/shared/permissions/components/other_project_settings.vue'
),
},
mixins: [settingsMixin],
mixins: [settingsMixin, glFeatureFlagMixin()],
props: {
requestCveAvailable: {
@ -169,6 +174,16 @@ export default {
required: false,
default: false,
},
licensedAiFeaturesAvailable: {
type: Boolean,
required: false,
default: false,
},
duoFeaturesLocked: {
type: Boolean,
required: false,
default: false,
},
visibilityHelpPath: {
type: String,
required: false,
@ -293,6 +308,7 @@ export default {
emailsEnabled: true,
showDiffPreviewInEmail: true,
cveIdRequestEnabled: true,
duoFeaturesEnabled: false,
featureAccessLevelEveryone,
featureAccessLevelMembers,
featureAccessLevel,
@ -400,6 +416,9 @@ export default {
this.showDiffPreviewInEmail = newValue;
},
},
showDuoSettings() {
return this.licensedAiFeaturesAvailable && this.glFeatures.aiSettingsVueProject;
},
},
watch: {
@ -582,7 +601,7 @@ export default {
s__('ProjectSettings|Manage who can see the project in the public access directory.')
"
>
<div class="project-feature-controls gl-display-flex gl-align-items-center gl-my-3 gl-mx-0">
<div class="project-feature-controls gl-flex gl-items-center gl-my-3 gl-mx-0">
<div class="select-wrapper gl-flex-grow-1">
<select
v-model="visibilityLevel"
@ -1031,6 +1050,23 @@ export default {
name="project[project_feature_attributes][releases_access_level]"
/>
</project-setting-row>
<project-setting-row
v-if="showDuoSettings"
data-testid="duo-settings"
:label="$options.i18n.duoLabel"
:help-text="$options.i18n.duoHelpText"
:help-path="$options.duoHelpPath"
>
<gl-toggle
v-model="duoFeaturesEnabled"
class="gl-mt-2 gl-mb-4"
:disabled="duoFeaturesLocked"
:label="$options.i18n.duoLabel"
label-position="hidden"
name="project[project_setting_attributes][duo_features_enabled]"
data-testid="duo_features_enabled_toggle"
/>
</project-setting-row>
</div>
<project-setting-row v-if="canDisableEmails" ref="email-settings" class="mb-3">

View File

@ -50,3 +50,5 @@ export const modelExperimentsHelpPath = helpPagePath(
);
export const modelRegistryHelpPath = helpPagePath('user/project/ml/model_registry/index.md');
export const duoHelpPath = helpPagePath('user/ai_features');

View File

@ -33,8 +33,8 @@ export default {
return [
{
label: s__('PipelineCharts|Total pipelines'),
identifier: 'total-pipelines',
label: s__('PipelineCharts|Total pipeline runs'),
identifier: 'total-pipeline-runs',
value: formatNumber(this.counts.total),
},
{

View File

@ -22,6 +22,11 @@ export default {
default: false,
required: false,
},
largeTitle: {
type: Boolean,
required: false,
default: true,
},
},
data() {
return {
@ -32,6 +37,14 @@ export default {
ariaExpanded() {
return this.expanded ? 'true' : 'false';
},
titleData() {
// Admin and group settings have different tags and styling for headers
// Should be removed when https://gitlab.com/groups/gitlab-org/gitlab-services/-/epics/19
// is completed
return this.largeTitle
? { element: 'h2', class: 'gl-heading-2' }
: { element: 'h4', class: '' };
},
toggleButtonText() {
return this.expanded ? this.$options.i18n.collapseText : this.$options.i18n.expandText;
},
@ -57,17 +70,19 @@ export default {
<section class="vue-settings-block">
<div class="gl-flex gl-justify-between gl-items-start">
<div class="gl-grow">
<h2
<component
:is="titleData.element"
role="button"
tabindex="-1"
class="gl-heading-2 gl-cursor-pointer !gl-mb-2"
class="gl-cursor-pointer !gl-mb-2 gl-mt-0"
:class="titleData.class"
:aria-expanded="ariaExpanded"
:aria-controls="collapseId"
@click="toggleExpanded"
>
<slot v-if="$scopedSlots.title" name="title"></slot>
<template v-else>{{ title }}</template>
</h2>
</component>
<p class="gl-text-secondary gl-m-0"><slot name="description"></slot></p>
</div>
<div class="gl-flex-shrink-0 gl-px-2">

View File

@ -24,7 +24,7 @@ module Import
end
def importable_orgs
client_orgs.to_a
client_orgs.map(&:to_h)
end
def client_orgs

View File

@ -29,6 +29,8 @@
.settings-content
= render 'groups/settings/permissions'
= render_if_exists 'groups/settings/ai/ai_settings'
- if can?(current_user, :manage_merge_request_settings, @group)
= render_if_exists 'groups/settings/merge_requests/merge_requests', expanded: expanded, group: @group
= render_if_exists 'groups/settings/merge_requests/merge_request_approval_settings', expanded: expanded, group: @group, user: current_user

View File

@ -17,3 +17,4 @@
).to_context.to_json.to_json}
gl.snowplowPseudonymizedPageUrl = #{masked_page_url(group: namespace, project: @project).to_json};
gl.maskedDefaultReferrerUrl = #{masked_referrer_url(request.referer).to_json};
gl.ga4MeasurementId = 'G-ENFH3X7M5Y';

View File

@ -243,7 +243,7 @@
:versions: []
:when: 2019-09-11 13:08:28.431132000 Z
- - :permit
- "(MIT OR CC0-1.0)"
- '(MIT OR CC0-1.0)'
- :who:
:why:
:versions: []
@ -364,7 +364,7 @@
:versions: []
:when: 2022-10-18 16:24:56.611523399 Z
- - :approve
- "@gitlab/fonts"
- '@gitlab/fonts'
- :who: Lukas Eipert
:why: https://gitlab.com/gitlab-com/legal-and-compliance/-/issues/1265
:versions: []
@ -389,3 +389,9 @@
:why: Used by nio4r gem. MIT license.
:versions: []
:when: 2024-01-08 09:05:34.528980000 Z
- - :permit
- MIT-0
- :who: Lukas Eipert
:why: https://blueoakcouncil.org/list#bronze
:versions: []
:when: 2024-07-10 09:44:40.477216000 Z

View File

@ -0,0 +1,9 @@
---
name: ai_settings_vue_admin
feature_issue_url: https://gitlab.com/groups/gitlab-org/-/epics/13782
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/issues/470519
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/470584
milestone: '17.2'
group: group::ai framework
type: wip
default_enabled: false

View File

@ -0,0 +1,9 @@
---
name: ai_settings_vue_group
feature_issue_url: https://gitlab.com/groups/gitlab-org/-/epics/13782
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/issues/470519
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/470584
milestone: '17.2'
group: group::ai framework
type: wip
default_enabled: false

View File

@ -249,7 +249,7 @@ In the following steps, replace `<ssh_host_key_path>` with the one you're using:
gitlab-rake gitlab:geo:check
```
If any of the checks fail, check the [troubleshooting documentation](troubleshooting.md).
If any of the checks fail, check the [troubleshooting documentation](troubleshooting/index.md).
1. SSH into a **Rails or Sidekiq server on your primary** site and login as root to verify the
**secondary** site is reachable or there are any common issues with your Geo setup:
@ -258,7 +258,7 @@ In the following steps, replace `<ssh_host_key_path>` with the one you're using:
gitlab-rake gitlab:geo:check
```
If any of the checks fail, check the [troubleshooting documentation](troubleshooting.md).
If any of the checks fail, check the [troubleshooting documentation](troubleshooting/index.md).
After the **secondary** site is added to the Geo administration page and restarted,
the site automatically starts replicating missing data from the **primary** site
@ -358,14 +358,14 @@ site's **Geo Sites** dashboard in your browser.
![Geo dashboard](img/geo_dashboard_v14_0.png)
If your installation isn't working properly, check the
[troubleshooting document](troubleshooting.md).
[troubleshooting document](troubleshooting/index.md).
The two most obvious issues that can become apparent in the dashboard are:
1. Database replication not working well.
1. Instance to instance notification not working. In that case, it can be
something of the following:
- You are using a custom certificate or custom CA (see the [troubleshooting document](troubleshooting.md)).
- You are using a custom certificate or custom CA (see the [troubleshooting document](troubleshooting/index.md)).
- The instance is firewalled (check your firewall rules).
Disabling a **secondary** site stops the synchronization process.
@ -385,4 +385,4 @@ Currently, this is what is synced:
## Troubleshooting
See the [troubleshooting document](troubleshooting.md).
See the [troubleshooting document](troubleshooting/index.md).

View File

@ -1,11 +0,0 @@
---
redirect_to: 'troubleshooting/index.md'
remove_date: '2024-06-19'
---
This document was moved to [another location](troubleshooting/index.md).
<!-- This redirect file can be deleted after <2024-06-19>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -68,4 +68,4 @@ everything is working correctly:
1. Test the data replication by pushing code to the **primary** site and see if it
is received by **secondary** sites.
If you encounter any issues, see the [Geo troubleshooting guide](troubleshooting.md).
If you encounter any issues, see the [Geo troubleshooting guide](troubleshooting/index.md).

View File

@ -104,6 +104,7 @@ DETAILS:
> - [Introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/17584) as an [experiment](../../policy/experiment-beta-support.md) in GitLab 16.7.
> - [Moved](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/148621) to [beta](../../policy/experiment-beta-support.md) in GitLab 16.11.
> - [Changed](https://gitlab.com/gitlab-org/gitlab-pages/-/issues/1111) implementation from NGINX to the GitLab Pages codebase in GitLab 17.2.
FLAG:
On self-managed GitLab, by default this feature is available.
@ -202,6 +203,7 @@ DETAILS:
> - [Introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/17584) as an [experiment](../../policy/experiment-beta-support.md) in GitLab 16.7.
> - [Moved](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/148621) to [beta](../../policy/experiment-beta-support.md) in GitLab 16.11.
> - [Changed](https://gitlab.com/gitlab-org/gitlab-pages/-/issues/1111) implementation from NGINX to the GitLab Pages codebase in GitLab 17.2.
FLAG:
On self-managed GitLab, by default this feature is available.
@ -225,19 +227,20 @@ Prerequisites:
external_url "http://example.com"
pages_external_url 'http://example.io'
pages_nginx['enable'] = true
# Set this flag to enable this feature
gitlab_pages["namespace_in_path"] = true
```
1. [Reconfigure GitLab](../restart_gitlab.md#reconfigure-a-linux-package-installation).
NGINX uses the custom proxy header `X-Gitlab-Namespace-In-Path`
to send the namespace to the GitLab Pages daemon.
The resulting URL scheme is `http://example.io/<namespace>/<project_slug>`.
WARNING:
GitLab Pages supports only one URL scheme at a time:
with wildcard DNS, or without wildcard DNS.
If you enable `namespace_in_path`, existing GitLab Pages websites
are accessible only on domains without wildcard DNS.
### Wildcard domains with TLS support
Prerequisites:
@ -291,6 +294,7 @@ DETAILS:
> - [Introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/17584) as an [experiment](../../policy/experiment-beta-support.md) in GitLab 16.7.
> - [Moved](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/148621) to [beta](../../policy/experiment-beta-support.md) in GitLab 16.11.
> - [Changed](https://gitlab.com/gitlab-org/gitlab-pages/-/issues/1111) implementation from NGINX to the GitLab Pages codebase in GitLab 17.2.
FLAG:
On self-managed GitLab, by default this feature is available.
@ -315,7 +319,6 @@ daemon doesn't listen to the outside world:
external_url "https://example.com"
pages_external_url 'https://example.io'
pages_nginx['enable'] = true
pages_nginx['redirect_http_to_https'] = true
# Set this flag to enable this feature
@ -344,11 +347,14 @@ daemon doesn't listen to the outside world:
1. [Reconfigure GitLab](../restart_gitlab.md#reconfigure-a-linux-package-installation).
NGINX uses the custom proxy header `X-Gitlab-Namespace-In-Path`
to send the namespace to the GitLab Pages daemon.
The resulting URL scheme is `https://example.io/<namespace>/<project_slug>`.
WARNING:
GitLab Pages supports only one URL scheme at a time:
with wildcard DNS, or without wildcard DNS.
If you enable `namespace_in_path`, existing GitLab Pages websites
are accessible only on domains without wildcard DNS.
### Wildcard domains with TLS-terminating Load Balancer
Prerequisites:
@ -393,7 +399,7 @@ control over how the Pages daemon runs and serves content in your environment.
| `artifacts_server` | Enable viewing [artifacts](../job_artifacts.md) in GitLab Pages. |
| `artifacts_server_timeout` | Timeout (in seconds) for a proxied request to the artifacts server. |
| `artifacts_server_url` | API URL to proxy artifact requests to. Defaults to GitLab `external URL` + `/api/v4`, for example `https://gitlab.com/api/v4`. When running a [separate Pages server](#running-gitlab-pages-on-a-separate-server), this URL must point to the main GitLab server's API. |
| `auth_redirect_uri` | Callback URL for authenticating with GitLab. Defaults to project's subdomain of `pages_external_url` + `/auth`, for example `https://projects.example.io/auth`. When `namespace_in_path` is enabled, defaults to `pages_external_url` + `/projects/auth`, for example `https://example.io/projects/auth`. |
| `auth_redirect_uri` | Callback URL for authenticating with GitLab. Defaults to project's subdomain of `pages_external_url` + `/auth`, for example `https://projects.example.io/auth`. When `namespace_in_path` is enabled, defaults to `pages_external_url` + `/projects/auth`, for example `https://example.io/projects/auth`. |
| `auth_secret` | Secret key for signing authentication requests. Leave blank to pull automatically from GitLab during OAuth registration. |
| `client_cert` | Client certificate used for mutual TLS with the GitLab API. See [Support mutual TLS when calling the GitLab API](#support-mutual-tls-when-calling-the-gitlab-api) for details. |
| `client_key` | Client key used for mutual TLS with the GitLab API. See [Support mutual TLS when calling the GitLab API](#support-mutual-tls-when-calling-the-gitlab-api) for details. |
@ -426,7 +432,7 @@ control over how the Pages daemon runs and serves content in your environment.
| `log_directory` | Absolute path to a log directory. |
| `log_format` | The log output format: `text` or `json`. |
| `log_verbose` | Verbose logging, true/false. |
| `namespace_in_path` | (Beta) Enable or disable namespace in the URL path. This requires `pages_nginx[enable] = true`. Sets `rewrite` configuration in NGINX to support [without wildcard DNS setup](#for-namespace-in-url-path-without-wildcard-dns). Default: `false`. |
| `namespace_in_path` | (Beta) Enable or disable namespace in the URL path to support [without wildcard DNS setup](#for-namespace-in-url-path-without-wildcard-dns). Default: `false`. |
| `propagate_correlation_id` | Set to true (false by default) to re-use existing Correlation ID from the incoming request header `X-Request-ID` if present. If a reverse proxy sets this header, the value is propagated in the request chain. |
| `max_connections` | Limit on the number of concurrent connections to the HTTP, HTTPS or proxy listeners. |
| `max_uri_length` | The maximum length of URIs accepted by GitLab Pages. Set to 0 for unlimited length. |

View File

@ -159,6 +159,28 @@ This field returns a [connection](#connections). It accepts the
four standard [pagination arguments](#pagination-arguments):
`before: String`, `after: String`, `first: Int`, and `last: Int`.
### `Query.blobSearch`
Find code visible to the current user.
DETAILS:
**Introduced** in GitLab 17.2.
**Status**: Experiment.
Returns [`BlobSearch`](#blobsearch).
#### Arguments
| Name | Type | Description |
| ---- | ---- | ----------- |
| <a id="queryblobsearchchunkcount"></a>`chunkCount` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Maximum chunks per file. |
| <a id="queryblobsearchgroupid"></a>`groupId` **{warning-solid}** | [`GroupID`](#groupid) | **Introduced** in GitLab 17.2. **Status**: Experiment. Group to search in. |
| <a id="queryblobsearchpage"></a>`page` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Page number to fetch the results. |
| <a id="queryblobsearchperpage"></a>`perPage` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Number of results per page. |
| <a id="queryblobsearchprojectid"></a>`projectId` **{warning-solid}** | [`ProjectID`](#projectid) | **Introduced** in GitLab 17.2. **Status**: Experiment. Project to search in. |
| <a id="queryblobsearchrepositoryref"></a>`repositoryRef` **{warning-solid}** | [`String`](#string) | **Introduced** in GitLab 17.2. **Status**: Experiment. Repository reference to search in. |
| <a id="queryblobsearchsearch"></a>`search` | [`String!`](#string) | Searched term. |
### `Query.boardList`
Find an issue board list.
@ -17497,6 +17519,21 @@ An emoji awarded by a user.
| <a id="blobwebpath"></a>`webPath` | [`String`](#string) | Web path of the blob. |
| <a id="blobweburl"></a>`webUrl` | [`String`](#string) | Web URL of the blob. |
### `BlobSearch`
Full JSON structure of multi-match results in a single file.
#### Fields
| Name | Type | Description |
| ---- | ---- | ----------- |
| <a id="blobsearchfilecount"></a>`fileCount` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Total number of files with matches. |
| <a id="blobsearchfiles"></a>`files` **{warning-solid}** | [`[SearchBlobFileType!]`](#searchblobfiletype) | **Introduced** in GitLab 17.2. **Status**: Experiment. List of files with matches. |
| <a id="blobsearchmatchcount"></a>`matchCount` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Total number of matches. |
| <a id="blobsearchperpage"></a>`perPage` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Total number of files per page. |
| <a id="blobsearchsearchlevel"></a>`searchLevel` **{warning-solid}** | [`SearchLevel`](#searchlevel) | **Introduced** in GitLab 17.2. **Status**: Experiment. Level of search performed. |
| <a id="blobsearchsearchtype"></a>`searchType` **{warning-solid}** | [`SearchType`](#searchtype) | **Introduced** in GitLab 17.2. **Status**: Experiment. Type of search performed. |
### `BlobViewer`
Represents how the blob content should be displayed.
@ -30820,6 +30857,45 @@ Represents a resource scanned by a security scan.
| <a id="scannedresourcerequestmethod"></a>`requestMethod` | [`String`](#string) | HTTP request method used to access the URL. |
| <a id="scannedresourceurl"></a>`url` | [`String`](#string) | URL scanned by the scanner. |
### `SearchBlobChunk`
JSON structure of a matched chunk.
#### Fields
| Name | Type | Description |
| ---- | ---- | ----------- |
| <a id="searchblobchunklines"></a>`lines` **{warning-solid}** | [`[SearchBlobLine!]`](#searchblobline) | **Introduced** in GitLab 17.2. **Status**: Experiment. Path of the file. |
| <a id="searchblobchunkmatchcountinchunk"></a>`matchCountInChunk` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Number of matches in the chunk. |
### `SearchBlobFileType`
JSON structure of a file with matches.
#### Fields
| Name | Type | Description |
| ---- | ---- | ----------- |
| <a id="searchblobfiletypeblameurl"></a>`blameUrl` **{warning-solid}** | [`String`](#string) | **Introduced** in GitLab 17.2. **Status**: Experiment. Blame URL of the file. |
| <a id="searchblobfiletypechunks"></a>`chunks` **{warning-solid}** | [`[SearchBlobChunk!]`](#searchblobchunk) | **Introduced** in GitLab 17.2. **Status**: Experiment. Maximum matches per file. |
| <a id="searchblobfiletypefileurl"></a>`fileUrl` **{warning-solid}** | [`String`](#string) | **Introduced** in GitLab 17.2. **Status**: Experiment. URL of the file. |
| <a id="searchblobfiletypematchcount"></a>`matchCount` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Matches per file in maximum 50 chunks. |
| <a id="searchblobfiletypematchcounttotal"></a>`matchCountTotal` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Total number of matches per file. |
| <a id="searchblobfiletypepath"></a>`path` **{warning-solid}** | [`String`](#string) | **Introduced** in GitLab 17.2. **Status**: Experiment. Path of the file. |
| <a id="searchblobfiletypeprojectpath"></a>`projectPath` **{warning-solid}** | [`String`](#string) | **Introduced** in GitLab 17.2. **Status**: Experiment. Full path of the project. |
### `SearchBlobLine`
JSON structure of each line in a matched chunk.
#### Fields
| Name | Type | Description |
| ---- | ---- | ----------- |
| <a id="searchbloblinelinenumber"></a>`lineNumber` **{warning-solid}** | [`Int`](#int) | **Introduced** in GitLab 17.2. **Status**: Experiment. Line number of the blob. |
| <a id="searchbloblinerichtext"></a>`richText` **{warning-solid}** | [`String`](#string) | **Introduced** in GitLab 17.2. **Status**: Experiment. Rich text of the blob. |
| <a id="searchbloblinetext"></a>`text` **{warning-solid}** | [`String`](#string) | **Introduced** in GitLab 17.2. **Status**: Experiment. Text content of the blob. |
### `SecurityPolicyValidationError`
Security policy validation error.
@ -35941,6 +36017,26 @@ The status of the security scan.
| <a id="scanstatusreport_error"></a>`REPORT_ERROR` | The report artifact provided by the CI build couldn't be parsed. |
| <a id="scanstatussucceeded"></a>`SUCCEEDED` | The report has been successfully prepared. |
### `SearchLevel`
Level of search.
| Value | Description |
| ----- | ----------- |
| <a id="searchlevelglobal"></a>`GLOBAL` | Global search including all groups and projects. |
| <a id="searchlevelgroup"></a>`GROUP` | Group search. |
| <a id="searchlevelproject"></a>`PROJECT` | Project search. |
### `SearchType`
Type of search.
| Value | Description |
| ----- | ----------- |
| <a id="searchtypeadvanced"></a>`ADVANCED` | Advanced search. |
| <a id="searchtypebasic"></a>`BASIC` | Basic search. |
| <a id="searchtypezoekt"></a>`ZOEKT` | Exact code search. |
### `SecurityPolicyRelationType`
| Value | Description |

View File

@ -39,7 +39,7 @@ POST /import/github
| `repo_id` | integer | yes | GitHub repository ID |
| `new_name` | string | no | Name of the new project. Also used as the new path so must not start or end with a special character and must not contain consecutive special characters. |
| `target_namespace` | string | yes | Namespace to import repository into. Supports subgroups like `/namespace/subgroup`. In GitLab 15.8 and later, must not be blank |
| `github_hostname` | string | no | Custom GitHub Enterprise hostname. Do not set for GitHub.com. |
| `github_hostname` | string | no | Custom GitHub Enterprise hostname. Do not set for GitHub.com. From GitLab 16.5 to GitLab 17.1, you must include the path `/api/v3`. |
| `optional_stages` | object | no | [Additional items to import](../user/project/import/github.md#select-additional-items-to-import). [Introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/373705) in GitLab 15.5 |
| `timeout_strategy` | string | no | Strategy for handling import timeouts. Valid values are `optimistic` (continue to next stage of import) or `pessimistic` (fail immediately). Defaults to `pessimistic`. [Introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/422979) in GitLab 16.5. |

View File

@ -0,0 +1,45 @@
---
owning-stage: "~devops::ai-powered"
description: 'AI Context Management ADR 001: Keeping AI Context Policy Management close to AI Context Retriever'
---
# AI Context Management ADR 001: Keeping AI Context Policy Management close to AI Context Retriever
## Summary
To manage AI Context effectively and ensure flexible and scalable solutions, AI Context Policy Management will reside in the
same environment as the AI Context Retriever and, as a result, as close to the context fetching mechanism as possible. This
approach aims to reduce latency and improve user control over the contextual information sent to AI systems.
## Context
The original blueprint outlined the necessity of a flexible AI Context Management system to provide accurate and relevant
AI responses while addressing security and trust concerns. It suggested that AI Context Policy Management should act as
a filtering solution between the context resolver and the context fetcher in the AI Context Retriever. However, the
blueprint did not specify the exact location for the AI Context Policy Management within the system.
During [a sync discussion](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/155707#note_1978675445), it was determined
that placing the AI Context Policy Management close to AI Context Retriever would provide significant benefits. This decision
aligns with our approach of having shared components, like the AI Gateway and the Duo Chat UI, to ensure consistency and reduce
redundancy across different environments.
## Decision
AI Context Management will happen as close to the user's interaction with Duo features as possible. As a result, the [AI Gateway](https://gitlab.com/gitlab-org/modelops/applied-ml/code-suggestions/ai-assist) will only receive context that is policy-compliant.
Users interact with Duo features in many different environments, including their IDE and the GitLab Web UI. Rather than retrieving the context from this environment and sending it to the AI Gateway for filtering based on the AI Context Policy, this decision states that the AI Context Retriever will filter this content *before* it reaches the AI Gateway.
This decision allows for better security, flexibility and scalability, enabling dynamic user interactions and immediate feedback on context validation.
## Consequences
- *Implementation Complexity*: Users must create, modify, and remove context policies in each environment where they are
interacting with Duo features. This requires multiple implementations to support different environments.
- *Flexibility and Scalability*: Storing AI Context Policy Management close to the AI Context Retriever allows for more flexible
and scalable policy implementations tailored to specific environments, such as IDEs and the Web.
- *Reduced Latency*: Filtering out unwanted context at the earliest possible stage reduces latency and ensures that only
the necessary information is sent to the AI models.
- *User Experience*: This approach facilitates dynamic UX, providing instant feedback to users in case of failed context
validation. Users can manage their supplementary context more effectively through a user-friendly interface.
- *Security*: By managing policies closer to the content retrieving mechanism, sensitive information can be filtered out
locally, enhancing security and user trust.

View File

@ -39,7 +39,7 @@ See [issue #411931](https://gitlab.com/gitlab-org/gitlab/-/issues/411931) for de
## 4. Evaluation
We expect the [majority of Users to perform most of their activity in one single Organization](../../organization/index.md#data-exploration).
We expect the [majority of Users to perform most of their activity in one single Organization](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization#data-exploration).
This is why we deem it acceptable to scope the User Profile to an Organization at first.
More discovery is necessary to understand which aspects of the current User Profile are relevant to showcase contributions in a global context.

View File

@ -49,7 +49,7 @@ Your Work will be scoped to an Organization, giving the user an overview of all
## 4. Evaluation
Scoping Your Work to an Organization makes sense in the context of the [proposed Organization navigation](https://gitlab.com/gitlab-org/gitlab/-/issues/417778).
Considering that [we expect most users to work in a single Organization](../../organization/index.md#data-exploration), we deem this impact acceptable.
Considering that [we expect most users to work in a single Organization](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/#data-exploration), we deem this impact acceptable.
## 4.1. Pros

View File

@ -21,7 +21,7 @@ Cells 1.5 is meant to target existing enterprise customers:
From a development and infrastructure perspective we want to achieve the following goals:
1. Customers that migrated to the Organization model are isolated from each other.
1. Users can be [members of multiple Organizations](../../organization/index.md#organizations-on-cells-15-fy25q3-fy25q3).
1. Users can be [members of multiple Organizations](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/#organizations-on-cells-15-fy25q3-fy25q3).
1. We can migrate Organizations from the Primary Cell to another Cell without user intervention or changing any user workflows.
1. The routing solution can dynamically route customers to the correct Cell once they are migrated.

View File

@ -1,154 +1,11 @@
---
status: proposed
creation-date: "2023-01-25"
authors: [ "@pedropombeiro", "@vshushlin"]
coach: "@grzesiek"
approvers: [ ]
stage: Verify
group: Runner
participating-stages: []
description: 'CI Insights design'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_builds_runner_fleet_metrics/ci_insights/'
remove_date: '2025-07-08'
---
# CI Insights
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_builds_runner_fleet_metrics/ci_insights/).
## Summary
As part of the Fleet Metrics, we would like to have a section dedicated to CI insights to help users monitor pipelines and summarize findings about pipelines speed, common job failures and more. It would eventually offer actionables to help users optimize and fix issues with their CI/CD.
## Motivation
We have a [page for CI/CD Analytics](https://gitlab.com/gitlab-org/gitlab/-/pipelines/charts?chart=pipelines) that contain some very basic analytics on pipelines. Most of this information relates to the **total** number of pipelines over time, which does not give any real value to customers: projects will always see an increase of pipelines number over time, so the total number of pipelines is of little consequence.
![Current page](img/current_page.png)
Because this page lacks real insights, it makes understanding pipeline slowdowns or failures hard to track and becomes a very manual task. We want to empower users to optimize their workflow in a centralized place to avoid all of the manual labor associated with either querying the API for data and then manually parsing it or navigating the UI through dozens of pages until the insights or action required can be found.
As we are going to process large quantities of data relating to a project's pipelines, there is potential to eventually summarize findings with an AI tool to give insights into job failures, pipeline slowdowns and flaky specs. As AI has become a crucial part of our product roadmap and Verify lacks any promising lead in that area, this page could be the center of this new addition.
- Deliver a new Pipelines Analysis Dashboard page
- Have excellent data visualization to help digest information quickly
- Flexible querying to let users get the information they want
- Clear actionables based on information presented in the page
- Show some default information on landing like pipelines duration over time and slowest jobs
- Make the CI/CD Analytics more accessible, liked and remembered (AKA, more page views)
### Non-Goals
We do not aim to improve the GitLab project's pipeline speed. This feature could help us achieve this, but it is not a direct objective of this blueprint.
We also are not aiming to have AI in the first iteration and should instead focus on making as much information available and digestible as possible.
## Proposal
Revamp the [page for CI/CD Analytics](https://gitlab.com/gitlab-org/gitlab/-/pipelines/charts?chart=pipelines) to include more meaningful data so that users can troubleshoot their pipelines with ease. Here is a list of the main improvements:
### Overall statistics
The current "overall statistics" will become a one-line header in a smaller font to keep this information available, but without taking as much visual space. For the pipelines chart, we will replace it with a stacked bar plot where each stack of a bar represents a status and each bar is a unit of time (in the days view, a day; in the months view, a month; and in the years view, a year) so users can keep track of how many pipelines ran in that specific unit of time and what percent of these pipelines ended up failing or succeeding.
### Pipeline duration graph
A new pipeline duration graph that can be customized by type (MR pipelines, pipeline on a specific branch, etc), number of runs and status (success, failed, etc) and will replace the current `Pipeline durations for the last 30 commits` chart. The existing chart checks the latest 30 commits made on the repository with no filtering so the results presented are not very valuable.
We also add jobs that failed multiple times and jobs that are the slowest in the last x pipelines on master. All of this is to support the effort of allowing users to query their pipelines data to figure out what they need to improve on or what kind of problems they are facing with their CI/CD configuration.
### Visibility
Add a link in the `pipelines` page to increase the visibility of this feature. We can add a new option with the `Run pipeline` primary button.
### Master Broken
Add a "Is master broken?" quick option that scans the last x pipelines on the main branch and check for failed jobs. All jobs that failed multiple times will be listed in a table with the option to create an incident from that list.
### Color scheme
Rethink our current color schemes for data visualization when it comes to pipeline statuses. We currently use the default visualization colors, but they don't actually match the colors users have grown accustomed to for pipeline/job statuses. There is an opportunity here to help users better understand their data through more relevant color schemes and better visualization.
### Routing
Change the routing from `pipelines/charts` to `pipelines/analytics` since `charts` is a really restrictive terminology when talking about data visualization. It also doesn't really convey what this page is, which is a way to get information, not just nice charts. Then we can also get rid of the query parameter for the tabs and instead support first-class routing.
## Design and implementation details
### New API for aggregated data
This feature depends on having a new set of data available to us that aggregates jobs and pipelines insights and make them available to the client.
We'll start by aggregating data from ClickHouse, and probably only for `gitlab.com`, as the MVC. We will aggregate the data on the backend on the fly. So far ClickHouse has been very capable of such things.
We won't store the aggregated data anywhere (we'll probably have the materialized views in ClickHouse, but nothing more complex). Then if the features get traction, we can explore ways to bring these features to environments without ClickHouse
This way we can move fast, test our ideas with real users, and get feedback.
### Feature flag
To develop this new analytics page, we will gate the new page behind a feature flag `ci_insights`, and conditionally render the old or new analytics page. Potentially, we could even add the flag on the controller to decide which route to render: the new `/analytics` when the flag is on, and the old `/charts` when it isn't.
### Add analytics on page view
Make sure that we can get information on how often this page is viewed. If we do not have it, then let's implement some analytics to know how visible this page is. The changes to this section should make the view count go up and we want to track this as a measure of success.
### Routing
We are planning to have new routes for the page and some redirects to set up. To read more about the routing proposal, see the [related issue](https://gitlab.com/gitlab-org/gitlab/-/issues/437556)
### Pipelines duration graph
We want a way for users to query data about pipelines with a lot of different criteria. Most notably, query for only pipelines with the scope `finished` or by status `success` or `failed`. There is also the possibility to scope this to a ref, so users could either test for the main branch or maybe even a branch that has introduced a CI/CD change. We want branch comparison for pipeline speed.
To get more accurate data, we want to increase the count of pipelines requested. In GraphQL, we have a limit of 100 items and we will probably get performance degradations quite quickly. We need to define how we could get a larger data set for more accurate data visualization.
### Jobs insights
Currently, there is no way to query a single job across multiple pipelines and it prevents us from doing a query that would look like this:
```graphql
query getJob($projectPath: ID!, $jobName: String!){
project(fullPath:$projectPath){
job(name: $jobName, last: 100){
nodes{
id
duration
}
}
}
}
```
There are plans to create a new unified table to log job analytics and it is not yet defined what this API will look like. Without committing yet to an API definition, we want a unified way to query information for analytics that may look roughly like so:
```ruby
get_jobs(project_id:, job_name: nil, stage: nil, stage_index: nil, *etc)
# >
[{id: 1, duration: 134, status: 'failed'}, *etc]
get_jobs_statistics(project_id, job_name:, *etc)
# >
[{time_bucket: '2024-01-01:00:00:00', avg_duration: 234, count: 123, statuses_count: {success: 123, failed: 45, cancelled: 45}}]
```
### Revamping our charts
Explore new color schemes and a nicer look for our charts. Collaborate with UX to determine whether this is something we had in mind or not and support any initiative to have nicer, more modern-looking charts, as our charts are quite forgettable.
## Alternative Solutions
### New page
We could create a brand new page and leave this section as it is. The pro would be that we could perhaps have a more prominent placement in the Navigation under `Build`, while the cons are that we'd have clear overlap with the section.
### Pipeline analysis per pipeline
There was an [experiment](https://gitlab.com/gitlab-org/gitlab/-/issues/365902) in the past to add performance insights **per pipeline**. The experiment was removed and deemed not viable. Some of the findings were that:
- Users did not interact with the page as much as thought and would not click on the button to view insights
- Users who did click on the button did not try to get more insights into a job.
- Users did not leave feedback in the issue.
This experiment reveals to us mostly that users who go on the pipeline graph page `pipelines/:id` are **not** trying to improve the performance of pipelines. Instead, it is most likely that this page is used to debug pipeline failures, which means that they are from the IC/developer persona, not the DevOps engineer trying to improve the workflow. By having this section in a more "broad" area, we expect a much better adoption and more useful actionables.
### Do nothing
We could leave this section untouched and not add any new form of analytics. The pro here would be the saved resources and time. The cons are that we currently have no way to help customers improve their CI/CD configurations speed except reading our documentation. This revamped section would also be a great gateway for AI features and help user iteration on their setup.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 34 KiB

View File

@ -1,140 +1,11 @@
---
status: proposed
creation-date: "2023-01-25"
authors: [ "@pedropombeiro", "@vshushlin"]
coach: "@grzesiek"
approvers: [ ]
stage: Verify
group: Runner
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_builds_runner_fleet_metrics/'
remove_date: '2025-07-08'
---
# CI Builds and Runner Fleet metrics database architecture
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_builds_runner_fleet_metrics/).
The CI section envisions new value-added features in GitLab for CI Builds and Runner Fleet focused on observability and automation. However, implementing these features and delivering on the product vision of observability, automation, and AI optimization using the current database architecture in PostgreSQL is very hard because:
- CI-related transactional tables are huge, so any modification to them can increase the load on the database and subsequently cause incidents.
- PostgreSQL is not optimized for running aggregation queries.
- We also want to add more information from the build environment, making CI tables even larger.
- We also need a data model to aggregate data sets for the GitLab CI efficiency machine learning models - the basis of the Runner Fleet AI solution
We want to create a new flexible database architecture which:
- will support known reporting requirements for CI builds and Runner Fleet.
- can be used to ingest data from the CI build environment.
We may also use this database architecture to facilitate development of AI features in the future.
Our recent usability research on navigation and other areas suggests that the GitLab UI is overloaded with information and navigational elements.
This results from trying to add as much information as possible and attempting to place features in the most discoverable places.
Therefore, while developing these new observability features, we will rely on the jobs to be done research, and solution validation, to ensure that the features deliver the most value.
## Runner Fleet
### Metrics - MVC
#### What is the estimated wait time in queue for an instance runner?
The following customer problems should be solved when addressing this question. Most of them are quotes from our usability research
**UI**
- "There is no visibility for expected Runner queue wait times."
- "I got here looking for a view that makes it more obvious if I have a bottleneck on my specific runner."
**Types of metrics**
- "Is it possible to get metrics out of GitLab to check for the runners availability & pipeline wait times?
Goal - we need the data to evaluate the data to determine if to scale up the Runner fleet so that there is no waiting times for developers pipelines."
- "What is the estimated time in the Runner queue before a job can start?"
**Interpreting metrics**
- "What metrics for Runner queue performance should I look at and how do I interpret the metrics and take action?"
- "I want to be able to analyze data on Runner queue performance over time so that I can determine if the reports are from developers are really just rare cases regarding availability."
#### What is the estimated wait time in queue on a group runner?
#### What is the mean estimated wait time in queue for all instance runners?
#### What is the mean estimated wait time in queue for all group runners?
#### Which runners have failures in the past hour?
## CI Insights
CI Insights is a page that would mostly expose data on pipelines and jobs duration, with a multitude of different filters, search and dynamic graphs. To read more on this, see [this related sub-section](ci_insights.md).
## Implementation
The current implementation plan is based on a
[Proof of Concept](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/126863).
For an up to date status, see [epic 10682](https://gitlab.com/groups/gitlab-org/-/epics/10682).
### Database selection
In FY23, ClickHouse [was selected as GitLab standard datastore](https://handbook.gitlab.com/handbook/company/working-groups/clickhouse-datastore/#context)
for features with big data and insert-heavy requirements.
So we have chosen it for our CI analytics as well.
### Scope of data
We're starting with the denormalized version of the `ci_builds` table in the main database,
which will include fields from some other tables. For example, `ci_runners` and `ci_runner_machines`.
[Immutability is a key constraint in ClickHouse](../../../development/database/clickhouse/index.md#how-it-differs-from-postgresql),
so we only use `finished` builds.
### Developing behind feature flags
It's hard to fully test data ingestion and query performance in development/staging environments.
That's why we plan to deliver those features to production behind feature flags and test the performance on real data.
Feature flags for data ingestion and APIs will be separate.
### Data ingestion
Every time a job finishes, a record will be created in a new `p_ci_finished_build_ch_sync_events` table, which includes
the `build_id` and a `processed` value.
A background worker loops through unprocessed `p_ci_finished_build_ch_sync_events` records and pushes the denormalized
`ci_builds` information from Postgres to ClickHouse.
At some point we most likely will need to
[parallelize this worker because of the number of processed builds](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/126863#note_1494922639).
This will be achieved by having the cron worker accept an argument determining the number of workers. The cron worker
will use that argument to queue the respective number of workers that will actually perform the syncing to ClickHouse.
We will start with most recent builds and will not upload all historical data.
### "Raw data", materialized views and queries
Ingested data will go to the "raw data" table in ClickHouse.
This table will use `ReplacingMergeTree` engine to deduplicate rows in case data ingestion mechanism accidentally submits the same batch twice.
Raw data can be used directly to execute queries, but most of the time we will create specialized materialized views
using `AggregatingMergeTree` engine.
This will allow us to read significantly less data when performing queries.
### Limitations and open questions
The topics below require further investigation.
#### Efficient way of querying data for namespaces
We start with the PoC available only for administrators,
but very soon we will need to implement features on the group level.
We can't just put denormalized "path" in the source table because it can be changed when groups or projects are moved.
The simplest way of solving this is to always filter builds by `project_id`,
but this may be inefficient and require reading a significant portion of all data because ClickHouse stores data in big batches.
#### Keeping the database schema up to date
Right now we don't have any mechanism equivalent to migrations we use for PostgreSQL.
While developing our first features we will maintain database schema by hand and
continue developing mechanisms for migrations.
#### Re-uploading data after changing the schema
If we need to modify the database schema, old data may be incomplete.
In that case we can simply truncate the ClickHouse tables and re-upload (part of) the data.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 32 KiB

View File

@ -1,198 +1,11 @@
---
status: ongoing
creation-date: "2021-09-10"
authors: [ "@grzesiek" ]
coach: [ "@ayufan", "@grzesiek" ]
approvers: [ "@jporter", "@cheryl.li" ]
owning-stage: "~devops::verify"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/).
# CI/CD data time decay
## Summary
GitLab CI/CD, [integrated into GitLab in 2015](https://about.gitlab.com/releases/2015/09/22/gitlab-8-0-released/),
is a [crucial](https://about.gitlab.com/blog/2017/09/27/gitlab-leader-continuous-integration-forrester-wave/)
yet resource-intensive component that has experienced exponential growth, surpassing 1 billion builds by 2021.
Despite its evolution, the CI/CD data storage architecture remains largely unchanged since 2012,
posing scalability challenges due to the vast volume of data stored in PostgreSQL.
The proposed strategy involves:
1. [Partitioning CI/CD data tables](pipeline_partitioning.md) to efficiently support large scale
of data and reduce the risk of database performance degradation.
1. [Reducing the growth rate of metadata](reduce_data_growth_rate.md) by being efficient in the
data to store at any given stage of the pipeline lifecycle.
1. [Archiving pipeline data](pipeline_archival.md) to consistently move less accessed data to other
storage solutions like object storage and enforce a different access pattern.
1. [Introducing configurable data retention policies](retention_policies.md).
This architectural overhaul aims to enhance reliability, scalability, and performance while maintaining
data accessibility and compliance.
## Goals
This architectural overhaul aims to enhance reliability, scalability, and performance while maintaining
data accessibility and compliance.
## Challenges
There are more than two billion rows describing CI/CD builds in GitLab.com's
database. This data represents a sizable portion of the whole data stored in
PostgreSQL database running on GitLab.com.
This volume contributes to significant performance problems, development
challenges and is often related to production incidents.
We also expect a [significant growth in the number of builds executed on GitLab.com](../ci_scale/index.md)
in the upcoming years.
## Opportunity
CI/CD data is subject to
[time-decay](https://handbook.gitlab.com/handbook/company/working-groups/database-scalability/time-decay/)
because, usually, pipelines that are a few months old are not frequently
accessed or are even not relevant anymore. Restricting access to processing
pipelines that are older than a few months might help us to move this data out
of the primary database, to a different storage, that is more performant and
cost effective.
It is already possible to prevent processing builds
[that have been archived](../../../administration/settings/continuous_integration.md#archive-jobs).
When a build gets archived it will not be possible to retry it, but we still do
keep all the processing metadata in the database, and it consumes resources
that are scarce in the primary database.
To improve performance and make it easier to scale CI/CD data storage
we might want to follow these three tracks described below.
![pipeline data time decay](pipeline_data_time_decay.png)
<!-- markdownlint-disable MD029 -->
1. Partition CI/CD builds queuing database tables
2. Partition CI/CD pipelines database tables
3. Reduce the rate of builds metadata table growth
<!-- markdownlint-enable MD029 -->
## Principles
All the three tracks we will use to implement CI/CD time decay pattern are
associated with some challenges. As we progress with the implementation we will
need to solve many problems and devise many implementation details to make this
successful.
Below, we documented a few foundational principles to make it easier for
everyone to understand the vision described in this architectural blueprint.
### Removing pipeline data
While it might be tempting to remove old or archived data from our
databases this should be avoided. It is usually not desired to permanently
remove user data unless consent is given to do so. We can, however, move data
to a different data store, like object storage.
Archived data can still be needed sometimes (for example for compliance or
auditing reasons). We want to be able to retrieve this data if needed, as long
as permanent removal has not been requested or approved by a user.
### Accessing pipeline data in the UI
Implementing CI/CD data time-decay through partitioning might be challenging
when we still want to make it possible for users to access data stored in many
partitions.
We want to retain simplicity of accessing pipeline data in the UI. It will
require some backstage changes in how we reference pipeline data from other
resources, but we don't want to make it more difficult for users to find their
pipelines in the UI.
We may need to add "Archived" tab on the pipelines / builds list pages, but we
should be able to avoid additional steps / clicks when someone wants to view
pipeline status or builds associated with a merge request or a deployment.
We also may need to disable search in the "Archived" tab on pipelines / builds
list pages.
### Accessing pipeline data through the API
We accept the possible necessity of building a separate API endpoint /
endpoints needed to access pipeline data through the API.
In the new API users might need to provide a time range in which the data has
been created to search through their pipelines / builds. To make it
efficient it might be necessary to restrict access to querying data residing in
more than two partitions at once. We can do that by supporting time ranges
spanning the duration that equals to the builds archival policy.
It is possible to still allow users to use the old API to access archived
pipelines data, although a user provided partition identifier may be required.
### Other strategies considered
#### Partition CI/CD builds queuing database tables
While working on the [CI/CD Scale](../ci_scale/index.md) blueprint, we have
introduced a [new architecture for queuing CI/CD builds](https://gitlab.com/groups/gitlab-org/-/epics/5909#note_680407908)
for execution.
This allowed us to significantly improve performance. We still consider the new
solution as an intermediate mechanism, needed before we start working on the
next iteration. The following iteration that should improve the architecture of
builds queuing even more (it might require moving off the PostgreSQL fully or
partially).
In the meantime we want to ship another iteration, an intermediate step towards
more flexible and reliable solution. We want to partition the new queuing
tables, to reduce the impact on the database, to improve reliability and
database health.
Partitioning of CI/CD queuing tables does not need to follow the policy defined
for builds archival. Instead we should leverage a long-standing policy saying
that builds created more than 24 hours ago need to be removed from the queue. This
business rule is present in the product since the inception of GitLab CI.
Epic: [Partition CI/CD builds queuing database tables](https://gitlab.com/groups/gitlab-org/-/epics/7438).
For more technical details about this topic see
[pipeline data partitioning design](pipeline_partitioning.md).
## Iterations
All three tracks can be worked on in parallel:
1. [Reduce the rate of builds metadata table growth](https://gitlab.com/groups/gitlab-org/-/epics/7434).
1. [Partition CI/CD pipelines database tables](https://gitlab.com/groups/gitlab-org/-/epics/5417).
1. [Partition CI/CD queuing tables using list partitioning](https://gitlab.com/groups/gitlab-org/-/epics/7438)
## Status
In progress.
## Timeline
- 2021-01-21: Parent [CI Scaling](../ci_scale/index.md) blueprint [merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/52203) created.
- 2021-04-26: CI Scaling blueprint approved and merged.
- 2021-09-10: CI/CD data time decay blueprint discussions started.
- 2022-01-07: CI/CD data time decay blueprint [merged](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/70052).
- 2022-02-01: Blueprint [updated](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/79110) with new content and links to epics.
- 2022-02-08: Pipeline partitioning PoC [merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/80186) started.
- 2022-02-23: Pipeline partitioning PoC [successful](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/80186#note_852704724)
- 2022-03-07: A way to attach an existing table as a partition [found and proven](https://gitlab.com/gitlab-org/gitlab/-/issues/353380#note_865237214).
- 2022-03-23: Pipeline partitioning design Google Doc (GitLab internal) started: `https://docs.google.com/document/d/1ARdoTZDy4qLGf6Z1GIHh83-stG_ZLpqsibjKr_OXMgc`.
- 2022-03-29: Pipeline partitioning PoC [concluded](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/80186#note_892674358).
- 2022-04-15: Partitioned pipeline data associations PoC [shipped](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/84071).
- 2022-04-30: Additional [benchmarking started](https://gitlab.com/gitlab-org/gitlab/-/issues/361019) to evaluate impact.
- 2022-06-31: [Pipeline partitioning design](pipeline_partitioning.md) document [merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/87683) merged.
- 2022-09-01: Engineering effort started to implement partitioning.
- 2022-11-01: The fastest growing CI table partitioned: `ci_builds_metadata`.
- 2023-06-30: The second largest table partitioned: `ci_builds`.
- 2023-12-12: `ci_builds` and `ci_builds_metadata` growth is stopped by writing data to new partitions.
- 2024-02-05: `ci_pipeline_variables` is partitioned.
- 2024-03-26: `ci_job_artifacts` is partitioned.
- 2024-04-26: `ci_stages` is partitioned.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,15 +1,11 @@
---
status: ongoing
creation-date: "2024-05-27"
authors: [ "@fabiopitino", "@mbobin" ]
coach: [ "@fabiopitino", "@grzesiek" ]
approvers: [ "@jreporter", "@cheryl.li" ]
owning-stage: "~devops::verify"
description: 'Archiving pipeline data'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/pipeline_archival/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/pipeline_archival/).
# Archiving pipeline data
## Problem to solve
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 13 KiB

View File

@ -1,838 +1,11 @@
---
status: ongoing
creation-date: "2022-05-31"
authors: [ "@grzesiek" ]
coach: [ "@ayufan", "@grzesiek" ]
approvers: [ "@jreporter", "@cheryl.li" ]
owning-stage: "~devops::verify"
description: 'Pipeline data partitioning design'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/pipeline_partitioning/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/pipeline_partitioning/).
# Pipeline data partitioning design
## Context
Even if we move CI/CD metadata to a different store, or reduce the rate of
metadata growth in a different way, the problem of having billions of rows
describing pipelines, builds and artifacts, remains. We still may need to keep
reference to the metadata we might store in object storage and we still do need
to be able to retrieve this information reliably in bulk (or search through
it).
It means that by moving data to object storage we might not be able to reduce
the number of rows in CI/CD tables. Moving data to object storage should help
with reducing the data size, but not the quantity of entries describing this
data. Because of this limitation, we still want to partition CI/CD data to
reduce the impact on the database (indices size, auto-vacuum time and
frequency).
Epic: [Partition CI/CD pipelines database tables](https://gitlab.com/groups/gitlab-org/-/epics/5417).
## What problem are we trying to solve?
Our intent here is not to move this data out of our primary database elsewhere.
We want to divide very large database tables, that store CI/CD data, into
multiple smaller ones, using PostgreSQL partitioning features.
We want to partition the CI/CD dataset, because some of the database tables are
extremely large, which might be challenging in terms of scaling single node
reads, even after we ship the CI/CD database decomposition.
We want to reduce the risk of database performance degradation by transforming
a few of the largest database tables into smaller ones using PostgreSQL
declarative partitioning.
![pipeline data time decay](pipeline_data_time_decay.png)
## How are CI/CD data decomposition, partitioning, and time-decay related?
CI/CD decomposition is an extraction of a CI/CD database cluster out of the
"main" database cluster, to make it possible to have a different primary
database receiving writes. The main benefit is doubling the capacity for writes
and data storage. The new database cluster will not have to serve reads /
writes for non-CI/CD database tables, so this offers some additional capacity
for reads too.
CI/CD partitioning is dividing large CI/CD database tables into smaller ones.
This will improve reads capacity on every CI/CD database node, because it is
much less expensive to read data from small tables, than from large
multi-terabytes tables. We can add more CI/CD database replicas to better
handle the increase in the number of SQL queries that are reading data, but we
need partitioning to perform a single read more efficiently. Performance in
other aspects will improve too, because PostgreSQL will be more efficient in
maintaining multiple small tables than in maintaining a very large database
table.
CI/CD time-decay allows us to benefit from the strong time-decay
characteristics of pipeline data. It can be implemented in many different ways,
but using partitioning to implement time-decay might be especially beneficial.
When implementing a time decay we usually mark data as archived, and migrate it
out of a database to a different place when data is no longer relevant or
needed. Our dataset is extremely large (tens of terabytes), so moving such a
high volume of data is challenging. When time-decay is implemented using
partitioning, we can archive the entire partition (or set of partitions) by
updating a single record in one of our database tables. It is one of the
least expensive ways to implement time-decay patterns at a database level.
![decomposition_partitioning_comparison.png](decomposition_partitioning_comparison.png)
## Why do we need to partition CI/CD data?
We need to partition CI/CD data because our database tables storing pipelines,
builds, and artifacts are too large. The `ci_builds` database table size is
currently around 2.5 TB with an index of around 1.4 GB. This is too much and
violates our [principle of 100 GB max size](../database_scaling/size-limits.md).
We also want to [build alerting](https://gitlab.com/gitlab-com/gl-infra/tamland/-/issues/5)
to notify us when this number is exceeded.
Large SQL tables increase index maintenance time, during which freshly deleted tuples
cannot be cleaned by `autovacuum`. This highlights the need for small tables.
We will measure how much bloat we accumulate when (re)indexing huge tables. Based on this analysis,
we will be able to set up SLO (dead tuples / bloat), associated with (re)indexing.
We've seen numerous S1 and S2 database-related production environment
incidents, over the last couple of months, for example:
- S1: 2022-03-17 [Increase in writes in `ci_builds` table](https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6625)
- S1: 2021-11-22 [Excessive buffer read in replicas for `ci_job_artifacts`](https://gitlab.com/gitlab-com/gl-infra/production/-/issues/5952)
- S2: 2022-04-12 [Transactions detected that have been running for more than 10m](https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6821)
- S2: 2022-04-06 [Database contention plausibly caused by excessive `ci_builds` reads](https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6773)
- S2: 2022-03-18 [Unable to remove a foreign key on `ci_builds`](https://gitlab.com/gitlab-com/gl-infra/production/-/issues/6642)
- S2: 2022-10-10 [The `queuing_queries_duration` SLI apdex violating SLO](https://gitlab.com/gitlab-com/gl-infra/production/-/issues/7852#note_1130123525)
We have approximately 50 `ci_*` prefixed database tables, and some of them
would benefit from partitioning.
A simple SQL query to get this data:
```sql
WITH tables AS (SELECT table_name FROM information_schema.tables WHERE table_name LIKE 'ci_%')
SELECT table_name,
pg_size_pretty(pg_total_relation_size(quote_ident(table_name))) AS total_size,
pg_size_pretty(pg_relation_size(quote_ident(table_name))) AS table_size,
pg_size_pretty(pg_indexes_size(quote_ident(table_name))) AS index_size,
pg_total_relation_size(quote_ident(table_name)) AS total_size_bytes
FROM tables ORDER BY total_size_bytes DESC;
```
See data from March 2022:
| Table name | Total size | Index size |
|-------------------------|------------|------------|
| `ci_builds` | 3.5 TB | 1 TB |
| `ci_builds_metadata` | 1.8 TB | 150 GB |
| `ci_job_artifacts` | 600 GB | 300 GB |
| `ci_pipelines` | 400 GB | 300 GB |
| `ci_stages` | 200 GB | 120 GB |
| `ci_pipeline_variables` | 100 GB | 20 GB |
| (...around 40 more) | | |
Based on the table above, it is clear that there are tables with a lot of
stored data.
While we have almost 50 CI/CD-related database tables, we are initially
interested in partitioning only 6 of them. We can start by partitioning the
most interesting tables in an iterative way, but we also should have a strategy
for partitioning the remaining ones if needed. This document is an attempt to
capture this strategy, describe as many details as possible, to share this
knowledge among engineering teams.
## How do we want to partition CI/CD data?
We want to partition the CI/CD tables in iterations. It might not be feasible
to partition all of the 6 initial tables at once, so an iterative strategy
might be necessary. We also want to have a strategy for partitioning the
remaining database tables when it becomes necessary.
It is also important to avoid large data migrations. We store almost 6
terabytes of data in the biggest CI/CD tables, in many different columns and
indexes. Migrating this amount of data might be challenging and could cause
instability in the production environment. Due to this concern, we've developed
a way to attach an existing database table as a partition zero without downtime
and excessive database locking, which has been demonstrated in one of the
[first proofs of concept](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/80186).
This makes creation of a partitioned schema possible without a downtime (for
example using a routing table `p_ci_pipelines`), by attaching an existing
`ci_pipelines` table as partition zero without exclusive locking. It will be
possible to use the legacy table as usual, but we can create the next partition
when needed and the `p_ci_pipelines` table will be used for routing queries. To
use the routing table we need to find a good partitioning key.
Our plan is to use logical partition IDs. We want to start with the
`ci_pipelines` table and create a `partition_id` column with a `DEFAULT` value
of `100` or `1000`. Using a `DEFAULT` value avoids the challenge of backfilling
this value for every row. Adding a `CHECK` constraint prior to attaching the
first partition tells PostgreSQL that we've already ensured consistency and
there is no need to check it while holding an exclusive table lock when
attaching this table as a partition to the routing table (partitioned schema
definition). We will increment this value every time we create a new partition
for `p_ci_pipelines`, and the partitioning strategy will be `LIST`
partitioning.
We will also create a `partition_id` column in the other initial 6 database
tables we want to iteratively partition. After a new pipeline is created, it
will get a `partition_id` assigned, and all the related resources, like builds
and artifacts, will share the same value. We want to add the `partition_id`
column into all 6 problematic tables because we can avoid backfilling this data
when we decide it is time to start partitioning them.
We want to partition CI/CD data iteratively. We plan to start with the
`ci_builds_metadata` table, because this is the fastest growing table in the CI
database and we want to contain this rapid growth. This table also has the
simplest access patterns - a row from it is being read when a build is exposed to
a runner, and other access patterns are relatively simple too. Starting with
`p_ci_builds_metadata` will allow us to achieve tangible and quantifiable
results earlier, and will become a new pattern that makes partitioning the
largest table possible. We will partition builds metadata using the `LIST`
partitioning strategy.
Once we have many partitions attached to `p_ci_builds_metadata`, with many
`partition_ids` we will choose another CI table to partition next. In that case
we might want to use `RANGE` partitioning for that next table because
`p_ci_builds_metadata` will already have many physical partitions, and
therefore many logical `partition_ids` will be used at that time. For example,
if we choose `ci_builds` as the next partitioning candidate, after having
partitioned `p_ci_builds_metadata`, it will have many different values stored
in `ci_builds.partition_id`. Using `RANGE` partitioning in that case might be
easier.
Physical partitioning and logical partitioning will be separated, and a
strategy will be determined when we implement physical partitioning for the
respective database tables. Using `RANGE` partitioning works similarly to using
`LIST` partitioning in database tables, but because we can guarantee continuity
of `partition_id` values, using `RANGE` partitioning might be a better
strategy.
### Multi-project pipelines
Parent-child pipeline will always be part of the same partition because child
pipelines are considered a resource of the parent pipeline. They can't be
viewed individually in the project pipeline list page.
On the other hand, multi-project pipelines can be viewed in the pipeline list page.
They can also be accessed from the pipeline graph as downstream/upstream links
when created via the `trigger` token or the API using a job token.
They can also be created from other pipelines by using trigger tokens, but in this
case we don't store the source pipeline.
While partitioning `ci_builds` we need to update the foreign keys to the
`ci_sources_pipelines` table:
```plain
Foreign-key constraints:
"fk_be5624bf37" FOREIGN KEY (source_job_id) REFERENCES ci_builds(id) ON DELETE CASCADE
"fk_d4e29af7d7" FOREIGN KEY (source_pipeline_id) REFERENCES ci_pipelines(id) ON DELETE CASCADE
"fk_e1bad85861" FOREIGN KEY (pipeline_id) REFERENCES ci_pipelines(id) ON DELETE CASCADE
```
A `ci_sources_pipelines` record references two `ci_pipelines` rows (parent and
the child). Our usual strategy has been to add a `partition_id` to the
table, but if we do it here we will force all multi-project
pipelines to be part of the same partition.
We should add two `partition_id` columns for this table, a
`partition_id` and a `source_partition_id`:
```plain
Foreign-key constraints:
  "fk_be5624bf37" FOREIGN KEY (source_job_id, source_partition_id) REFERENCES ci_builds(id, partition_id) ON DELETE CASCADE
  "fk_d4e29af7d7" FOREIGN KEY (source_pipeline_id, source_partition_id) REFERENCES ci_pipelines(id, partition_id) ON DELETE CASCADE
"fk_e1bad85861" FOREIGN KEY (pipeline_id, partition_id) REFERENCES ci_pipelines(id, partition_id) ON DELETE CASCADE
```
This solution is the closest to a two way door decision because:
- We retain the ability to reference pipelines in different partitions.
- If we later decide that we want to force multi-project pipelines in the same partition
we could add a constraint to validate that both columns have the same value.
## Why do we want to use explicit logical partition ids?
Partitioning CI/CD data using a logical `partition_id` has several benefits. We
could partition by a primary key, but this would introduce much more complexity
and additional cognitive load required to understand how the data is being
structured and stored in partitions.
CI/CD data is hierarchical data. Stages belong to pipelines, builds belong to
stages, artifacts belong to builds (with rare exceptions). We are designing a
partitioning strategy that reflects this hierarchy, to reduce the complexity
and therefore cognitive load for contributors. With an explicit `partition_id`
associated with a pipeline, we can cascade the partition ID number when trying
to retrieve all resources associated with a pipeline. We know that for a
pipeline `12345` with a `partition_id` of `102`, we are always able to find
associated resources in logical partitions with number `102` in other routing
tables, and PostgreSQL will know in which partitions these records are being
stored in for every table.
Another interesting benefit for using a single and incremental latest
`partition_id` number, associated with pipelines, is that in theory we can
cache it in Redis or in memory to avoid excessive reads from the database to
find this number, though we might not need to do this.
The single and uniform `partition_id` value for pipeline data gives us more
choices later on than primary-keys-based partitioning.
## Altering partitioned tables
It will still be possible to run `ALTER TABLE` statements against partitioned tables,
similarly to how the tables behaved before partitioning. When PostgreSQL runs
an `ALTER TABLE` statement against a parent partitioned table, it acquires the same
lock on all child partitions and updates each to keep them in sync. This differs from
running `ALTER TABLE` on a non-partitioned table in a few key ways:
- PostgreSQL acquires `ACCESS EXCLUSIVE` locks against a larger number of tables, but
not a larger amount of data, than it would were the table not partitioned.
Each partition will be locked similarly to the parent table, and all will be updated
in a single transaction.
- Lock duration will be increased based on the number of partitions involved.
All `ALTER TABLE` statements executed on the GitLab database (other than `VALIDATE CONSTRAINT`)
take small constant amounts of time per table modified. PostgreSQL will need
to modify each partition in sequence, increasing the runtime of the lock. This
time will still remain very small until there are many partitions involved.
- If thousands of partitions are involved in an `ALTER TABLE`, we will need to verify that
the value of `max_locks_per_transaction` is high enough to support all of the locks that
need to be taken during the operation.
## Splitting large partitions into smaller ones
We want to start with the initial `partition_id` number `100` (or higher, like
`1000`, depending on our calculations and estimations). We do not want to start
from 1, because existing tables are also large already, and we might want to
split them into smaller partitions. If we start with `100`, we will be able to
create partitions for `partition_id` of `1`, `20`, `45`, and move existing
records there by updating `partition_id` from `100` to a smaller number.
PostgreSQL will move these records into their respective partitions in a
consistent way, provided that we do it in a transaction for all pipeline
resources at the same time. If we ever decide to split large partitions into
smaller ones (it's not yet clear if we will need to do this), we might be able
to just use background migrations to update partition IDs, and PostgreSQL is
smart enough to move rows between partitions on its own.
### Naming conventions
A partitioned table is called a **routing** table and it will use the `p_`
prefix which should help us with building automated tooling for query analysis.
A table partition will be called **partition** and it can use a physical
partition ID as suffix, for example `ci_builds_101`. Existing CI tables will
become **zero partitions** of the new routing tables. Depending on the chosen
[partitioning strategy](#how-do-we-want-to-partition-cicd-data) for a given
table, it is possible to have many logical partitions per one physical partition.
### Attaching first partition and acquiring locks
We learned when [partitioning](https://gitlab.com/gitlab-org/gitlab/-/issues/378644)
the first table that `PostgreSQL` requires an `AccessExclusiveLock` on the table and
all of the other tables that it references through foreign keys. This can cause a deadlock
if the migration tries to acquire the locks in a different order from the application
business logic.
To solve this problem, we introduced a **priority locking strategy** to avoid
further deadlock errors. This allows us to define the locking order and
then keep retrying aggressively until we acquire the locks or run out of retries.
This process can take up to 40 minutes.
With this strategy, we successfully acquired a lock on `ci_builds` table after 15 retries
during a low traffic period ([after `00:00 UTC`](https://dashboards.gitlab.net/d/web-main/web-overview?orgId=1&viewPanel=537181794&from=now-2d&to=now)).
See an example of this strategy in our [partition tooling](../../../development/database/partitioning/list.md#step-6---create-parent-table-and-attach-existing-table-as-the-initial-partition).
### Partitioning steps
The database [partition tooling](../../../development/database/partitioning/list.md)
docs contain a list of steps to partition a table, but the steps are not enough
for our iterative strategy. As our dataset continues to grow we want to take
advantage of partitioning performance right away and not wait until all tables
are partitioned. For example, after partitioning the `ci_builds_metadata` table
we want to start writing and reading data to/from a new partition. This means
that we will increase the `partition_id` value from `100`, the default value,
to `101`. Now all of the new resources for the pipeline hierarchy will be
persisted with `partition_id = 101`. We can continue following the database
tooling instructions for the next table that will be partitioned, but we require
a few extra steps:
- add `partition_id` column for the FK references with default value of `100`
since the majority of records should have that value.
- change application logic to cascade the `partition_id` value
- correct `partition_id` values for recent records with a post deploy/background
migration, similar to this:
```sql
UPDATE ci_pipeline_metadata
SET partition_id = ci_pipelines.partition_id
FROM ci_pipelines
WHERE ci_pipelines.id = ci_pipeline_metadata.pipeline_id
AND ci_pipelines.partition_id in (101, 102);
```
- change the foreign key definitions
- ...
## Storing partitions metadata in the database
To build an efficient mechanism that will be responsible for creating
new partitions, and to implement time decay we want to introduce a partitioning
metadata table, called `ci_partitions`. In that table we would store metadata
about all the logical partitions, with many pipelines per partition. We may
need to store a range of pipeline ids per logical partition. Using it we will
be able to find the `partition_id` number for a given pipeline ID and we will
also find information about which logical partitions are "active" or
"archived", which will help us to implement a time-decay pattern using database
declarative partitioning.
Doing that will also allow us to use a Unified Resource Identifier for
partitioned resources, that will contain a pointer to a pipeline ID, we could
then use to efficiently lookup a partition the resource is stored in. It might
be important when a resource can be directly referenced by a URL, in the UI or
API. We could use an ID like `1e240-5ba0` for pipeline `123456`, build `23456`.
Using a dash `-` can prevent an identifier from being highlighted and copied
with a mouse double-click. If we want to avoid this problem, we can use any
character of written representation that is not present in base-16 numeral
system - any letter from `g` to `z` in the Latin alphabet, for example `x`. In that
case an example of a URI would look like `1e240x5ba0`. If we decide to update
the primary identifier of a partitioned resource (today it is just a big
integer) it is important to design a system that is resilient to migrating data
between partitions, to avoid changing identifiers when rebalancing happens.
`ci_partitions` table will store information about a partition identifier,
pipeline ids range it is valid for and whether the partitions have been
archived or not. Additional columns with timestamps may be helpful too.
## Implementing a time-decay pattern using partitioning
We can use `ci_partitions` to implement a time-decay pattern using declarative
partitioning. By telling PostgreSQL which logical partitions are archived we
can stop reading from these partitions using a SQL query like the one below.
```sql
SELECT * FROM ci_builds WHERE partition_id IN (
SELECT id FROM ci_partitions WHERE active = true
);
```
This query will make it possible to limit the number of partitions we will read
from, and therefore will cut access to "archived" pipeline data, using our data
retention policy for CI/CD data. Ideally we do not want to read from more than
two partitions at once, so we need to align the automatic partitioning
mechanisms with the time-decay policy. We will still need to implement new
access patterns for the archived data, presumably through the API, but the cost
of storing archived data in PostgreSQL will be reduced significantly this way.
There are some technical details here that are out of the scope of this
description, but by using this strategy we can "archive" data, and make it much
less expensive to reside in our PostgreSQL cluster by toggling a boolean
column value.
## Accessing partitioned data
It will be possible to access partitioned data whether it has been archived or
not, in most places in GitLab. On a merge request page, we will always show
pipeline details even if the merge request was created years ago. We can do
that because `ci_partitions` will be a lookup table associating a pipeline ID
with its `partition_id`, and we will be able to find the partition that the
pipeline data is stored in.
We will need to constrain access to searching through pipelines, builds,
artifacts etc. Search cannot be done through all partitions, as it would not
be efficient enough, hence we will need to find a better way of searching
through archived pipelines data. It will be necessary to have different access
patterns to access archived data in the UI and API.
There are a few challenges in enforcing usage of the `partition_id`
partitioning key in PostgreSQL. To make it easier to update our application to
support this, we have designed a new queries analyzer in our
[proof of concept merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/80186).
It helps to find queries that are not using the partitioning key.
In a [separate proof of concept merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/84071)
and [related issue](https://gitlab.com/gitlab-org/gitlab/-/issues/357090) we
demonstrated that using the uniform `partition_id` makes it possible to extend
Rails associations with an additional scope modifier so we can provide the
partitioning key in the SQL query.
Using instance dependent associations, we can easily append a partitioning key
to SQL queries that are supposed to retrieve associated pipeline resources, for
example:
```ruby
has_many :builds, -> (pipeline) { where(partition_id: pipeline.partition_id) }
```
The problem with this approach is that it makes preloading much more difficult
as instance dependent associations cannot be used with preloads:
```plaintext
ArgumentError: The association scope 'builds' is instance dependent (the
scope block takes an argument). Preloading instance dependent scopes is not
supported.
```
### Query analyzers
We implemented 2 query analyzers to detect queries that need to be fixed so that everything
keeps working with partitioned tables:
- One analyzer to detect queries not going through a routing table.
- One analyzer to detect queries that use routing tables without specifying the `partition_id` in the `WHERE` clauses.
We started by enabling our first analyzer in `test` environment to detect existing broken
queries. It is also enabled on `production` environment, but for a small subset of the traffic (`0.1%`)
because of scalability concerns.
The second analyzer will be enabled in a future iteration.
### Primary key
Primary key must include the partitioning key column to partition the table.
We first create a unique index including the `(id, partition_id)`.
Then, we drop the primary key constraint and use the new index created to set
the new primary key constraint.
`ActiveRecord` [does not support](https://github.com/rails/rails/blob/6-1-stable/activerecord/lib/active_record/attribute_methods/primary_key.rb#L126)
composite primary keys, so we must force it to treat the `id` column as a primary key:
```ruby
class Model < ApplicationRecord
self.primary_key = 'id'
end
```
The application layer is now ignorant of the database structure and all of the
existing queries from `ActiveRecord` continue to use the `id` column to access
the data. There is some risk to this approach because it is possible to
construct application code that results in duplicate models with the same `id`
value, but on a different `partition_id`. To mitigate this risk we must ensure
that all inserts use the database sequence to populate the `id` since they are
[guaranteed](https://www.postgresql.org/docs/12/sql-createsequence.html#id-1.9.3.81.7)
to allocate distinct values and rewrite the access patterns to include the
`partition_id` value. Manually assigning the ids during inserts must be avoided.
### Foreign keys
Foreign keys must reference columns that either are a primary key or form a
unique constraint. We can define them using these strategies:
#### Between routing tables sharing partition ID
For relations that are part of the same pipeline hierarchy it is possible to
share the `partition_id` column to define the foreign key constraint:
```plaintext
p_ci_pipelines:
- id
- partition_id
p_ci_builds:
- id
- partition_id
- pipeline_id
```
In this case, `p_ci_builds.partition_id` indicates the partition for the build
and also for the pipeline. We can add a FK on the routing table using:
```sql
ALTER TABLE ONLY p_ci_builds
ADD CONSTRAINT fk_on_pipeline_and_partition
FOREIGN KEY (pipeline_id, partition_id)
REFERENCES p_ci_pipelines(id, partition_id) ON DELETE CASCADE;
```
#### Between routing tables with different partition IDs
It's not possible to reuse the `partition_id` for all relations in the CI domain,
so in this case we'll need to store the value as a different attribute. For
example, when canceling redundant pipelines we store on the old pipeline row
the ID of the new pipeline that cancelled it as `auto_canceled_by_id`:
```plaintext
p_ci_pipelines:
- id
- partition_id
- auto_canceled_by_id
- auto_canceled_by_partition_id
```
In this case we can't ensure that the canceling pipeline is part of the same
hierarchy as the canceled pipelines, so we need an extra attribute to store its
partition, `auto_canceled_by_partition_id`, and the FK becomes:
```sql
ALTER TABLE ONLY p_ci_pipelines
ADD CONSTRAINT fk_cancel_redundant_pipelines
FOREIGN KEY (auto_canceled_by_id, auto_canceled_by_partition_id)
REFERENCES p_ci_pipelines(id, partition_id) ON DELETE SET NULL;
```
#### Between routing tables and regular tables
Not all of the tables in the CI domain will be partitioned, so we'll have routing
tables that will reference non-partitioned tables, for example we reference
`external_pull_requests` from `ci_pipelines`:
```sql
FOREIGN KEY (external_pull_request_id)
REFERENCES external_pull_requests(id)
ON DELETE SET NULL
```
In this case we only need to move the FK definition from the partition level
to the routing table so that new pipeline partitions may use it:
```sql
ALTER TABLE p_ci_pipelines
ADD CONSTRAINT fk_external_request
FOREIGN KEY (external_pull_request_id)
REFERENCES external_pull_requests(id) ON DELETE SET NULL;
```
#### Between regular tables and routing tables
Most of the tables from the CI domain reference at least one table that will be
turned into routing tables, for example `ci_pipeline_messages` references
`ci_pipelines`. These definitions will need to be updated to use the routing
tables and for this they will need a `partition_id` column:
```plaintext
p_ci_pipelines:
- id
- partition_id
ci_pipeline_messages:
- id
- pipeline_id
- pipeline_partition_id
```
The foreign key can be defined by using:
```sql
ALTER TABLE ci_pipeline_messages ADD CONSTRAINT fk_pipeline_partitioned
FOREIGN KEY (pipeline_id, pipeline_partition_id)
REFERENCES p_ci_pipelines(id, partition_id) ON DELETE CASCADE;
```
The old FK definition will need to be removed, otherwise new inserts in the
`ci_pipeline_messages` with pipeline IDs from non-zero partition will fail with
reference errors.
### Indexes
We [learned](https://gitlab.com/gitlab-org/gitlab/-/issues/360148) that `PostgreSQL`
does not allow to create a single index (unique or otherwise) across all partitions of a table.
One solution to solve this problem is to embed the partitioning key inside the uniqueness constraint.
This might mean prepending the partition ID in a hexadecimal format before the token itself and storing
the concatenated string in a database. To do that we would need to reserve an appropriate number of
leading bytes in a token to accommodate for the maximum number of partitions we may have in the future.
It seems that reserving four characters, which would translate into a 16-bit number in base-16,
might be sufficient. The maximum number we can encode this way would be FFFF, which is 65535 in decimal.
This would provide a unique constraint per-partition which
is sufficient for global uniqueness.
We have also designed a query analyzer that makes it possible to detect direct
usage of zero partitions, legacy tables that have been attached as first
partitions to routing tables, to ensure that all queries are targeting
partitioned schema or partitioned routing tables, like `p_ci_pipelines`.
## Why not partition using the project or namespace ID?
We do not want to partition using `project_id` or `namespace_id` because
sharding and podding is a different problem to solve, on a different layer of
the application. It doesn't solve the original problem statement of performance
growing worse over time as we build up infrequently read data. We may want to
introduce pods in the future, and that might become the primary mechanism of
separating data based on the group or project the data is associated with.
In theory we could use either `project_id` or `namespace_id` as a second
partitioning dimension, but this would add more complexity to a problem that is
already very complex.
## Partitioning builds queuing tables
We also want to partition our builds queuing tables. We currently have two:
`ci_pending_builds` and `ci_running_builds`. These tables are different from
other CI/CD data tables, as there are business rules in our product that make
all data stored in them invalid after 24 hours.
As a result, we will need to use a different strategy to partition those
database tables, by removing partitions entirely after these are older than 24
hours, and always reading from two partitions through a routing table. The
strategy to partition these tables is well understood, but requires a solid
Ruby-based automation to manage the creation and deletion of these partitions.
To achieve that we will collaborate with the Database team to adapt
[existing database partitioning tools](../../../development/database/partitioning/index.md)
to support CI/CD data partitioning.
## Iterating to reduce the risk
This strategy should reduce the risk of implementing CI/CD partitioning to
acceptable levels. We are also focusing on implementing partitioning for
reading only from two partitions initially to make it possible to detach zero
partitions in case of problems in our production environment. Every iteration
phase, described below has a revert strategy and before shipping database
changes we want to test them in our benchmarking environment.
The main way of reducing risk in case of this effort is iteration and making
things reversible. Shipping changes, described in this document, in a safe and
reliable way is our priority.
As we move forward with the implementation we will need to find even more ways
to iterate on the design, support incremental rollouts and have better control
over reverting changes in case of something going wrong. It is sometimes
challenging to ship database schema changes iteratively, and even more
difficult to support incremental rollouts to the production environment. This
can, however, be done, it just sometimes requires additional creativity, that
we will certainly need here. Some examples of how this could look like:
### Incremental rollout of partitioned schema
Once we introduce a first partitioned routing table (presumably
`p_ci_pipelines`) and attach its zero partition (`ci_pipelines`), we will need
to start interacting with the new routing table, instead of a concrete
partition zero. Usually we would override the database table the `Ci::Pipeline`
Rails model would use with something like `self.table_name = 'p_ci_pipelines'`.
Unfortunately this approach might not support incremental rollout, because
`self.table_name` will be read upon application boot up, and later we might be
unable to revert this change without restarting the application.
One way of solving this might be introducing `Ci::Partitioned::Pipeline` model,
that will inherit from `Ci::Pipeline`. In that model we would set
`self.table_name` to `p_ci_pipeline` and return its meta class from
`Ci::Pipeline.partitioned` as a scope. This will allow us to use feature flags
to route reads from `ci_pipelines` to `p_ci_pipelines` with a simple revert
strategy.
### Incremental experimentation with partitioned reads
Another example would be related to the time when we decide to attach another
partition. The goal of Phase 1 will be to have two partitions per partitioned
schema / routing table, meaning that for `p_ci_pipelines` we will have
`ci_pipelines` attached as partition zero, and a new `ci_pipelines_p1`
partition created for new data. All reads from `p_ci_pipelines` will also need
to read data from the `p1` partition and we should also iteratively experiment
with reads targeting more than one partition, to evaluate performance and
overhead of partitioning.
We can do that by moving _old_ data to `ci_pipelines_m1` (minus 1) partition
iteratively. Perhaps we will create `partition_id = 1` and move some really old
pipelines there. We can then iteratively migrate data into `m1` partition to
measure the impact, performance and increase our confidence before creating a
new partition `p1` for _new_ (still not created) data.
## Iterations
We want to focus on Phase 1 iteration first. The goal and the main objective of
this iteration is to partition the biggest 6 CI/CD database tables into 6
routing tables (partitioned schema) and 12 partitions. This will leave our
Rails SQL queries mostly unchanged, but it will also make it possible to
perform emergency detachment of "zero partitions" if there is a database
performance degradation. This will cut users off their old data, but the
application will remain up and running, which is a better alternative to
application-wide outage.
1. **Phase 0**: Build CI/CD data partitioning strategy: Done. ✅
1. **Phase 1**: Partition the 6 biggest CI/CD database tables.
1. Create partitioned schemas for all 6 database tables.
1. Design a way to cascade `partition_id` to all partitioned resources.
1. Implement initial query analyzers validating that we target routing tables.
1. Attach zero partitions to the partitioned database tables.
1. Update the application to target routing tables and partitioned tables.
1. Measure the performance and efficiency of this solution.
**Revert strategy**: Switch back to using concrete partitions instead of routing tables.
1. **Phase 2**: Add a partitioning key to all SQL queries targeting partitioned tables.
1. Implement query analyzer to check if queries targeting partitioned tables
are using proper partitioning keys.
1. Modify existing queries to make sure that all of them are using a
partitioning key as a filter.
**Revert strategy**: Use feature flags, query by query.
1. **Phase 3**: Build new partitioned data access patterns.
1. Build a new API or extend an existing one to allow access to data stored in
partitions that are supposed to be excluded based on the time-decay data
retention policy.
**Revert strategy**: Feature flags.
1. **Phase 4**: Introduce time-decay mechanisms built on top of partitioning.
1. Build time-decay policy mechanisms.
1. Enable the time-decay strategy on GitLab.com.
1. **Phase 5**: Introduce mechanisms for creating partitions automatically.
1. Make it possible to create partitions in an automatic way.
1. Deliver the new architecture to self-managed instances.
The diagram below visualizes this plan on Gantt chart. The dates
on the chart below are just estimates to visualize the plan better, these are
not deadlines and can change at any time.
```mermaid
gantt
title CI Data Partitioning Timeline
dateFormat YYYY-MM-DD
axisFormat %m-%y
section Phase 0
Build data partitioning strategy :done, 0_1, 2022-06-01, 90d
section Phase 1
Partition biggest CI tables :1_1, after 0_1, 200d
Biggest table partitioned :milestone, metadata, 2023-03-01, 1min
Tables larger than 100GB partitioned :milestone, 100gb, after 1_1, 1min
section Phase 2
Add partitioning keys to SQL queries :2_1, 2023-01-01, 120d
Emergency partition detachment possible :milestone, detachment, 2023-04-01, 1min
All SQL queries are routed to partitions :milestone, routing, after 2_1, 1min
section Phase 3
Build new data access patterns :3_1, 2023-05-01, 120d
New API endpoint created for inactive data :milestone, api1, 2023-07-01, 1min
Filtering added to existing API endpoints :milestone, api2, 2023-09-01, 1min
section Phase 4
Introduce time-decay mechanisms :4_1, 2023-08-01, 120d
Inactive partitions are not being read :milestone, part1, 2023-10-01, 1min
Performance of the database cluster improves :milestone, part2, 2023-11-01, 1min
section Phase 5
Introduce auto-partitioning mechanisms :5_1, 2023-09-01, 120d
New partitions are being created automatically :milestone, part3, 2023-12-01, 1min
Partitioning is made available on self-managed :milestone, part4, 2024-01-01, 1min
```
## Conclusions
We want to build a solid strategy for partitioning CI/CD data. We are aware of
the fact that it is difficult to iterate on this design, because a mistake made
in managing the database schema of our multi-terabyte PostgreSQL instance might
not be easily reversible without potential downtime. That is the reason we are
spending a significant amount of time to research and refine our partitioning
strategy. The strategy, described in this document, is subject to iteration as
well. Whenever we find a better way to reduce the risk and improve our plan, we
should update this document as well.
We've managed to find a way to avoid large-scale data migrations, and we are
building an iterative strategy for partitioning CI/CD data. We documented our
strategy here to share knowledge and solicit feedback from other team members.
## Who
DRIs:
<!-- vale gitlab.Spelling = NO -->
| Role | Who |
|---------------------|------------------------------------------------|
| Author | Grzegorz Bizon, Principal Engineer |
| Recommender | Kamil Trzciński, Senior Distinguished Engineer |
| Product Leadership | Jackie Porter, Director of Product Management |
| Engineering Leadership | Caroline Simpson, Engineering Manager / Cheryl Li, Senior Engineering Manager |
| Lead Engineer | Marius Bobin, Senior Backend Engineer |
| Senior Engineer | Maxime Orefice, Senior Backend Engineer |
| Senior Engineer | Tianwen Chen, Senior Backend Engineer |
<!-- vale gitlab.Spelling = YES -->
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,39 +1,11 @@
---
status: ongoing
creation-date: "2024-05-27"
authors: [ "@fabiopitino", "@mbobin" ]
coach: [ "@fabiopitino", "@grzesiek" ]
approvers: [ "@jreporter", "@cheryl.li" ]
owning-stage: "~devops::verify"
description: 'Reduce the growth rate of pipeline data'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/reduce_data_growth_rate/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/reduce_data_growth_rate/).
# Reduce the growth rate of pipeline data
## Problem to solve
TODO
## Strategies
### Delete pipeline processing data
Once a build gets archived, it is no longer possible to resume
pipeline processing in such pipeline. It means that all the metadata, we store
in PostgreSQL, that is needed to efficiently and reliably process builds can be
safely moved to a different data store.
Storing pipeline processing data is expensive as this kind of CI/CD
data represents a significant portion of data stored in CI/CD tables. Once we
restrict access to processing archived pipelines, we can move this metadata to
a different place - preferably object storage - and make it accessible on
demand, when it is really needed again (for example for compliance or auditing purposes).
We need to evaluate whether moving data is the most optimal solution. We might
be able to use de-duplication of metadata entries and other normalization
strategies to consume less storage while retaining ability to query this
dataset. Technical evaluation will be required to find the best solution here.
Epic: [Reduce the rate of builds metadata table growth](https://gitlab.com/groups/gitlab-org/-/epics/7434).
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,19 +1,11 @@
---
status: ongoing
creation-date: "2024-05-27"
authors: [ "@fabiopitino", "@mbobin" ]
coach: [ "@fabiopitino", "@grzesiek" ]
approvers: [ "@jreporter", "@cheryl.li" ]
owning-stage: "~devops::verify"
description: 'Retention policies for CI/CD data'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/retention_policies/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_data_decay/retention_policies/).
# Retention policies for CI/CD data
## Problem to solve
TODO
## Strategies
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,448 +1,11 @@
---
status: proposed
creation-date: "2023-05-15"
authors: [ "@furkanayhan" ]
coach: "@ayufan"
approvers: [ "@jreporter", "@cheryl.li" ]
owning-stage: "~devops::verify"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_pipeline_processing/'
remove_date: '2025-07-08'
---
# Future of CI Pipeline Processing
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_pipeline_processing/).
## Summary
GitLab CI is one of the oldest and most complex features in GitLab.
Over the years its YAML syntax has considerably grown in size and complexity.
In order to keep the syntax highly stable over the years, we have primarily been making additive changes
on top of the existing design and patterns.
Our user base has grown exponentially over the past years. With that came the need to support
their use cases and customization of their workflows.
While delivering huge value over the years, the various additive changes to the syntax have also caused
some surprising behaviors in the pipeline processing logic.
Some keywords accumulated a number of responsibilities, and some ambiguous overlaps were discovered among
keywords and subtle differences in behavior were introduced over time.
The current implementation and YAML syntax also make it challenging to implement new features.
In this design document, we will discuss the problems and propose
a new architecture for pipeline processing. Most of these problems have been discussed before in the
["Restructure CI job when keyword"](https://gitlab.com/groups/gitlab-org/-/epics/6788) epic.
## Goals
- We want to make the pipeline processing more understandable, predictable and consistent.
- We want to unify the behaviors of DAG and STAGE. STAGE can be written as DAG and vice versa.
- We want to decouple the manual jobs' blocking behavior from the `allow_failure` keyword.
- We want to clarify the responsibilities of the `when` keyword.
## Non-Goals
We will not discuss how to avoid breaking changes for now.
## Motivation
The list of problems is the main motivation for this design document.
### Problem 1: The responsibility of the `when` keyword
Right now, the [`when`](../../../ci/yaml/index.md#when) keyword has many responsibilities;
> - `on_success` (default): Run the job only when no jobs in earlier stages fail or have `allow_failure: true`.
> - `on_failure`: Run the job only when at least one job in an earlier stage fails. A job in an earlier stage
> with `allow_failure: true` is always considered successful.
> - `never`: Don't run the job regardless of the status of jobs in earlier stages.
> Can only be used in a [`rules`](../../../ci/yaml/index.md#rules) section or `workflow: rules`.
> - `always`: Run the job regardless of the status of jobs in earlier stages. Can also be used in `workflow:rules`.
> - `manual`: Run the job only when [triggered manually](../../../ci/jobs/job_control.md#create-a-job-that-must-be-run-manually).
> - `delayed`: [Delay the execution of a job](../../../ci/jobs/job_control.md#run-a-job-after-a-delay)
> for a specified duration.
It answers three questions;
- What's required to run? => `on_success`, `on_failure`, `always`
- How to run? => `manual`, `delayed`
- Add to the pipeline? => `never`
As a result, for example; we cannot create a `manual` job with `when: on_failure`.
This can be useful when a user wants to create a job that is only available on failure, but needs to be manually played.
For example, publishing failures to a dedicated page or a dedicated external service.
### Problem 2: Abuse of the `allow_failure` keyword
We control the blocker behavior of a manual job by the [`allow_failure`](../../../ci/yaml/index.md#allow_failure) keyword.
Actually, it has other responsibilities; _"determine whether a pipeline should continue running when a job fails"_.
Currently, a [manual job](../../../ci/jobs/job_control.md#create-a-job-that-must-be-run-manually);
- is not a blocker when it has `allow_failure: true` (by default)
- a blocker when it has `allow_failure: false`.
As a result, for example; we cannot create a `manual` job that is `allow_failure: false` and not a blocker.
```yaml
job1:
stage: test
when: manual
allow_failure: true # default
job2:
stage: deploy
```
Currently;
- `job1` is skipped.
- `job2` runs because `job1` is ignored since it has `allow_failure: true`.
- When we run/play `job1`;
- if it fails, it's marked as "success with warning".
#### `allow_failure` with `rules`
`allow_failure` becomes more confusing when using `rules`.
From [docs](../../../ci/yaml/index.md#when):
> The default behavior of `allow_failure` changes to true with `when: manual`.
> However, if you use `when: manual` with `rules`, `allow_failure` defaults to `false`.
From [docs](../../../ci/yaml/index.md#allow_failure):
> The default value for `allow_failure` is:
>
> - `true` for manual jobs.
> - `false` for jobs that use `when: manual` inside `rules`.
> - `false` in all other cases.
For example;
```yaml
job1:
script: ls
when: manual
job2:
script: ls
rules:
- if: $ALWAYS_TRUE
when: manual
```
`job1` and `job2` behave differently;
- `job1` is not a blocker because it has `allow_failure: true` by default.
- `job2` is a blocker because `rules: when: manual` does not return `allow_failure: true` by default.
### Problem 3: Different behaviors in DAG/needs
The main behavioral difference between DAG and STAGE is about the "skipped" and "ignored" states.
**Background information:**
- skipped:
- When a job is `when: on_success` and its previous status is failed, it's skipped.
- When a job is `when: on_failure` and its previous status is not "failed", it's skipped.
- ignored:
- When a job is `when: manual` with `allow_failure: true`, it's ignored.
**Problem:**
The `skipped` and `ignored` states are considered successful in the STAGE processing but not in the DAG processing.
#### Problem 3.1. Handling of ignored status with manual jobs
**Example 1:**
```yaml
build:
stage: build
script: exit 0
when: manual
allow_failure: true # by default
test:
stage: test
script: exit 0
needs: [build]
```
- `build` is ignored (skipped) because it's `when: manual` with `allow_failure: true`.
- `test` is skipped because "ignored" is not a successful state in the DAG processing.
**Example 2:**
```yaml
build:
stage: build
script: exit 0
when: manual
allow_failure: true # by default
test:
stage: test
script: exit 0
```
- `build` is ignored (skipped) because it's `when: manual` with `allow_failure: true`.
- `test` runs and succeeds.
#### Problem 3.2. Handling of skipped status with when: on_failure
**Example 1:**
```yaml
build_job:
stage: build
script: exit 1
test_job:
stage: test
script: exit 0
rollback_job:
stage: deploy
needs: [build_job, test_job]
script: exit 0
when: on_failure
```
- `build_job` runs and fails.
- `test_job` is skipped.
- Even though `rollback_job` is `when: on_failure` and there is a failed job, it is skipped because the `needs` list has a "skipped" job.
**Example 2:**
```yaml
build_job:
stage: build
script: exit 1
test_job:
stage: test
script: exit 0
rollback_job:
stage: deploy
script: exit 0
when: on_failure
```
- `build_job` runs and fails.
- `test_job` is skipped.
- `rollback_job` runs because there is a failed job before.
### Problem 4: The skipped and ignored states
Let's assume that we solved the problem 3 and the "skipped" and "ignored" states are not different in DAG and STAGE.
How should they behave in general? Are they successful or not? Should "skipped" and "ignored" be different?
Let's examine some examples;
**Example 4.1. The ignored status with manual jobs**
```yaml
build:
stage: build
script: exit 0
when: manual
allow_failure: true # by default
test:
stage: test
script: exit 0
```
- `build` is in the "manual" state but considered as "skipped" (ignored) for the pipeline processing.
- `test` runs because "skipped" is a successful state.
Alternatively;
```yaml
build1:
stage: build
script: exit 0
when: manual
allow_failure: true # by default
build2:
stage: build
script: exit 0
test:
stage: test
script: exit 0
```
- `build1` is in the "manual" state but considered as "skipped" (ignored) for the pipeline processing.
- `build2` runs and succeeds.
- `test` runs because "success" + "skipped" is a successful state.
**Example 4.2. The skipped status with when: on_failure**
```yaml
build:
stage: build
script: exit 0
when: on_failure
test:
stage: test
script: exit 0
```
- `build` is skipped because it's `when: on_failure` and its previous status is not "failed".
- `test` runs because "skipped" is a successful state.
Alternatively;
```yaml
build1:
stage: build
script: exit 0
when: on_failure
build2:
stage: build
script: exit 0
test:
stage: test
script: exit 0
```
- `build1` is skipped because it's `when: on_failure` and its previous status is not "failed".
- `build2` runs and succeeds.
- `test` runs because "success" + "skipped" is a successful state.
### Problem 5: The `dependencies` keyword
The [`dependencies`](../../../ci/yaml/index.md#dependencies) keyword is used to define a list of jobs to fetch
[artifacts](../../../ci/yaml/index.md#artifacts) from. It is a shared responsibility with the `needs` keyword.
Moreover, they can be used together in the same job. We may not need to discuss all possible scenarios but this example
is enough to show the confusion;
```yaml
test2:
script: exit 0
dependencies: [test1]
needs:
- job: test1
artifacts: false
```
### Information 1: Canceled jobs
Are a canceled job and a failed job the same? They have many differences so we could easily say "no".
However, they have one similarity; they can be "allowed to fail".
Let's define their differences first;
- A canceled job;
- It is not a finished job.
- Canceled is a user requested interruption of the job. The intent is to abort the job or stop pipeline processing as soon as possible.
- We don't know the result, there is no artifacts, etc.
- Since it's never run, the `after_script` is not run.
- Its eventual state is "canceled" so no job can run after it.
- There is no `when: on_canceled`.
- Even `when: always` is not run.
- A failed job;
- It is a machine response of the CI system to executing the job content. It indicates that execution failed for some reason.
- It is an answer of the system equal in standing to success. The fact that something failed is relative,
and might be the desired outcome of CI execution, like when executing tests where some are failing.
- We know the result and [there can be artifacts](../../../ci/yaml/index.md#artifactswhen).
- `after_script` is run.
- Its eventual state is "failed" so subsequent jobs can run depending on their `when` values.
- `when: on_failure` and `when: always` are run.
**The one similarity is; they can be "allowed to fail".**
```yaml
build:
stage: build
script: sleep 10
allow_failure: true
test:
stage: test
script: exit 0
when: on_success
```
- If `build` runs and gets `canceled`, then `test` runs.
- If `build` runs and gets `failed`, then `test` runs.
#### An idea on using `canceled` instead of `failed` for some cases
There is another aspect. We often drop jobs with a `failure_reason` before they get executed,
for example when the namespace ran out of compute minutes or when limits are exceeded.
Dropping jobs in the `failed` state has been handy because we could communicate to the user the `failure_reason`
for better feedback. When canceling jobs for various reasons we don't have a way to indicate that.
We cancel jobs because the user ran out of Compute Credits while the pipeline was running,
or because the pipeline is auto-canceled by another pipeline or other reasons.
If we had a `stop_reason` instead of `failure_reason` we could use that for both cancelled and failed jobs
and we could also use the `canceled` status more appropriately.
### Information 2: Empty state
We [recently updated](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/117856) the documentation of
[the `when` keyword](../../../ci/yaml/index.md#when) for clarification;
> - `on_success`: Run the job only when no jobs in earlier stages fail or have `allow_failure: true`.
> - `on_failure`: Run the job only when at least one job in an earlier stage fails.
For example;
```yaml
test1:
when: on_success
script: exit 0
# needs: [] would lead to the same result
test2:
when: on_failure
script: exit 0
# needs: [] would lead to the same result
```
- `test1` runs because there is no job failed in the previous stages.
- `test2` does not run because there is no job failed in the previous stages.
The `on_success` means that "nothing failed", it does not mean that everything succeeded.
The same goes for `on_failure`: it does not mean that everything failed, but it does mean that "something failed".
This semantic follows the expectation that your pipeline succeeds, and this is the happy path.
Not that your pipeline fails, because then it requires user intervention to fix it.
## Technical expectations
All proposals or future decisions must follow these goals;
1. The `allow_failure` keyword must only be responsible for marking **failed** jobs as "success with warning".
- Why: It should not have another responsibility, such as determining a manual job is a blocker or not.
- How: Another keyword will be introduced to control the blocker behavior of a manual job.
1. With `allow_failure`, **canceled** jobs must not be marked as "success with warning".
- Why: "canceled" is a different state than "failed".
- How: Canceled with `allow_failure: true` jobs will not be marked as "success with warning".
1. The `when` keyword must only answer the question "What's required to run?". And it must be the only source of truth
for deciding if a job should run or not.
1. The `when` keyword must not control if a job is added to the pipeline or not.
- Why: It is not its responsibility.
- How: Another keyword will be introduced to control if a job is added to the pipeline or not.
1. The "skipped" and "ignored" states must be reconsidered.
- TODO: We need to discuss this more.
1. A new keyword structure must be introduced to specify if a job is an "automatic", "manual", or "delayed" job.
- Why: It is not the responsibility of the `when` keyword.
- How: A new keyword will be introduced to control the behavior of a job.
1. The `needs` keyword must only control the order of the jobs. It must not be used to control the behavior of the jobs
or to decide if a job should run or not. The DAG and STAGE behaviors must be the same.
- Why: It leads to different behaviors and confuses users.
- How: The `needs` keyword will only define previous jobs, like stage does.
1. The `needs` and `dependencies` keywords must not be used together in the same job.
- Why: It is confusing.
- How: The `needs` and `dependencies` keywords will be mutually exclusive.
## Proposal
N/A
## Design and implementation details
N/A
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 11 KiB

View File

@ -1,223 +1,11 @@
---
status: ongoing
creation-date: "2021-01-21"
authors: [ "@grzesiek" ]
coach: "@grzesiek"
approvers: [ "@cheryl.li", "@jreporter" ]
owning-stage: "~devops::verify"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_scale/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ci_scale/).
# CI/CD Scaling
## Summary
GitLab CI/CD is one of the most data and compute intensive components of GitLab.
Since its initial release in 2012,
the CI/CD subsystem has evolved significantly. It was [integrated into GitLab in September 2015](https://about.gitlab.com/releases/2015/09/22/gitlab-8-0-released/)
and has become [one of the most beloved CI/CD solutions](https://about.gitlab.com/blog/2017/09/27/gitlab-leader-continuous-integration-forrester-wave/).
GitLab CI/CD has come a long way since the initial release, but the design of
the data storage for pipeline builds remains almost the same since 2012. We
store all the builds in PostgreSQL in `ci_builds` table, and because we are
creating more than 5 million builds each day on GitLab.com we are reaching
database limits that are slowing our development velocity down.
On February 1st, 2021, GitLab.com surpassed 1 billion CI/CD builds created. In
February 2022 we reached 2 billion CI/CD builds stored in the database. The
number of builds continues to grow exponentially.
The screenshot below shows our forecast created at the beginning of 2021, that
turned out to be quite accurate.
![CI builds cumulative with forecast](ci_builds_cumulative_forecast.png)
## Goals
**Enable future growth by making processing 20M builds in a day possible.**
## Challenges
The current state of CI/CD product architecture needs to be updated if we want
to sustain future growth.
### We were running out of the capacity to store primary keys: DONE
The primary key in `ci_builds` table is an integer value, generated in a sequence.
Historically, Rails used to use [integer](https://www.postgresql.org/docs/14/datatype-numeric.html)
type when creating primary keys for a table. We did use the default when we
[created the `ci_builds` table in 2012](https://gitlab.com/gitlab-org/gitlab/-/blob/046b28312704f3131e72dcd2dbdacc5264d4aa62/db/ci/migrate/20121004165038_create_builds.rb).
[The behavior of Rails has changed](https://github.com/rails/rails/pull/26266)
since the release of Rails 5. The framework is now using `bigint` type that is 8
bytes long, however we have not migrated primary keys for `ci_builds` table to
`bigint` yet.
In early 2021 we had estimated that we would run out of the capacity of the integer
type to store primary keys in `ci_builds` table before December 2021. If it had
happened without a viable workaround or an emergency plan, GitLab.com would go
down. `ci_builds` was just one of many tables that were running out of the
primary keys available in Int4 sequence.
Before October 2021, our Database team had managed to migrate all the risky
tables' primary keys to big integers.
See the [related Epic](https://gitlab.com/groups/gitlab-org/-/epics/5657) for more details.
### Some CI/CD database tables are too large: IN PROGRESS
There are more than two billion rows in the `ci_builds` table. We store many
terabytes of data in that table, and the total size of indexes is measured in
terabytes as well.
This amount of data contributes to a significant number of performance
problems we experience on our CI PostgreSQL database.
Most of the problems are related to how PostgreSQL database works internally,
and how it is making use of resources on a node the database runs on. We are at
the limits of vertical scaling of the CI primary database nodes and we
frequently see a negative impact of the `ci_builds` table on the overall
performance, stability, scalability and predictability of the CI database
GitLab.com depends on.
The size of the table also hinders development velocity because queries that
seem fine in the development environment may not work on GitLab.com. The
difference in the dataset size between the environments makes it difficult to
predict the performance of even the most simple queries.
Team members and the wider community members are struggling to contribute to
the Verify area, because we restricted the possibility of extending `ci_builds`
even further. Our static analysis tools prevent adding more columns to this
table. Adding new queries is unpredictable because of the size of the dataset
and the amount of queries executed using the table. This significantly hinders
the development velocity and contributes to incidents on the production
environment.
We also expect a significant, exponential growth in the upcoming years.
One of the forecasts done using [Facebook's Prophet](https://facebook.github.io/prophet/)
shows that in the first half of 2024 we expect to see 20M builds created on
GitLab.com each day. In comparison to around 5M we see created today. This is
10x growth from numbers we saw in 2021.
![CI builds daily forecast](ci_builds_daily_forecast.png)
**Status**: As of October 2021 we reduced the growth rate of `ci_builds` table
by writing build options and variables to `ci_builds_metadata` table. We are
also working on partitioning the largest CI/CD database tables using
[time decay pattern](../ci_data_decay/index.md).
### Queuing mechanisms were using the large table: DONE
Because of how large the table is, mechanisms that we used to build queues of
pending builds (there is more than one queue), were not very efficient. Pending
builds represented a small fraction of what we store in the `ci_builds` table,
yet we needed to find them in this big dataset to determine an order in which we
wanted to process them.
This mechanism was very inefficient, and it had been causing problems on the
production environment frequently. This usually resulted in a significant drop
of the CI/CD Apdex score, and sometimes even caused a significant performance
degradation in the production environment.
There were multiple other strategies that we considered to improve performance and
reliability. We evaluated using [Redis queuing](https://gitlab.com/gitlab-org/gitlab/-/issues/322972), or
[a separate table that would accelerate SQL queries used to build queues](https://gitlab.com/gitlab-org/gitlab/-/issues/322766).
We decided to proceed with the latter.
In October 2021 we finished shipping the new architecture of builds queuing
[on GitLab.com](https://gitlab.com/groups/gitlab-org/-/epics/5909#note_680407908).
We then made the new architecture [generally available](https://gitlab.com/groups/gitlab-org/-/epics/6954).
### Moving big amounts of data is challenging: IN PROGRESS
We store a significant amount of data in `ci_builds` table. Some of the columns
in that table store a serialized user-provided data. Column `ci_builds.options`
stores more than 600 gigabytes of data, and `ci_builds.yaml_variables` more
than 300 gigabytes (as of February 2021).
It is a lot of data that needs to be reliably moved to a different place.
Unfortunately, right now, our background migrations
are not reliable enough to migrate this amount of data at scale. We need to
build mechanisms that will give us confidence in moving this data between
columns, tables, partitions or database shards.
Effort to improve background migrations will be owned by our Database Team.
**Status**: In progress. We plan to ship further improvements that will be
described in a separate architectural blueprint.
## Proposal
Below you can find the original proposal made in early 2021 about how we want
to move forward with CI Scaling effort:
> Making GitLab CI/CD product ready for the scale we expect to see in the
> upcoming years is a multi-phase effort.
>
> First, we want to focus on things that are urgently needed right now. We need
> to fix primary keys overflow risk and unblock other teams that are working on
> database partitioning and sharding.
>
> We want to improve known bottlenecks, like
> the builds queuing mechanism that is using the large table, and other things that
> are holding other teams back.
>
> Extending CI/CD metrics is important to get a better sense of how the system
> performs and what growth we should expect. This will make it easier for us
> to identify bottlenecks and perform more advanced capacity planning.
>
> Next step is to better understand how we can leverage strong time-decay
> characteristic of CI/CD data. This might help us to partition CI/CD dataset to
> reduce the size of CI/CD database tables.
## Iterations
Work required to achieve our next CI/CD scaling target is tracked in the
[CI/CD Scaling](https://gitlab.com/groups/gitlab-org/-/epics/5745) epic.
1. ✓ Migrate primary keys to big integers on GitLab.com.
1. ✓ Implement the new architecture of builds queuing on GitLab.com.
1. ✓ [Make the new builds queuing architecture generally available](https://gitlab.com/groups/gitlab-org/-/epics/6954).
1. [Partition CI/CD data using time-decay pattern](../ci_data_decay/index.md).
## Status
Created at 21.01.2021, approved at 26.04.2021.
Status: In progress.
## Who
Proposal:
<!-- vale gitlab.Spelling = NO -->
| Role | Who |
|------------------------------|-----|
| Author | Grzegorz Bizon |
| Architecture Evolution Coach | Kamil Trzciński |
| Engineering Leader | Cheryl Li |
| Product Manager | Jackie Porter |
| Domain Expert / Verify | Fabio Pitino |
| Domain Expert / Database | Jose Finotto |
| Domain Expert / PostgreSQL | Nikolay Samokhvalov |
DRIs:
| Role | Who |
|-------------|-----|
| Leadership | Cheryl Li |
| Product | Jackie Porter |
| Engineering | Grzegorz Bizon |
Domain experts:
| Area | Who |
|----------------------------|-----|
| Domain Expert / Verify | Fabio Pitino |
| Domain Expert / Verify | Marius Bobin |
| Domain Expert / Database | Jose Finotto |
| Domain Expert / PostgreSQL | Nikolay Samokhvalov |
<!-- vale gitlab.Spelling = YES -->
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 46 KiB

View File

@ -1,289 +1,11 @@
---
status: proposed
creation-date: "2023-01-10"
authors: [ "@ankitbhatnagar", "@ahegyi", "@mikolaj_wawrzyniak" ]
coach: "@grzesiek"
approvers: [ "@nhxnguyen", "@stkerr" ]
owning-stage: "~workinggroup::clickhouse"
participating-stages: [ "~section::ops", "~section::dev" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_ingestion_pipeline/'
remove_date: '2025-07-08'
---
# Scalable data ingestion abstraction for ClickHouse
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_ingestion_pipeline/).
## Table of Contents
- [Summary](#summary)
- [Why](#why)
- [How](#how)
- [Motivation](#motivation)
- [Case Studies](#case-studies)
- [Replicating existing data into ClickHouse](#1-replicating-existing-data-into-clickhouse)
- [Ingesting large volumes of data into ClickHouse](#2-ingesting-large-volumes-of-data-into-clickhouse)
- [Goals](#goals)
- [Non-goals](#non-goals)
- [General considerations](#general-considerations)
- [Challenges building this](#major-challenges-around-building-such-a-capability)
- [Proposed solution](#proposed-solution)
- [Design & Implementation](#design--implementation)
- [References](#references)
## Summary
Develop a scalable & reliable data ingestion abstraction to help efficiently ingest large volumes of data from high throughput systems into ClickHouse.
### Why
To enable any application at GitLab to write necessary data into ClickHouse regardless of the scale at which they generate data today, or in the future. Refer to [Motivation](#motivation) for why ClickHouse in the first place.
### How
By building a write abstraction (API/Library) that allows a user to write data into ClickHouse and has all necessary configurations, conventions and best-practices around instrumentation, service-discovery, etc, built into it out of the box.
## Motivation
ClickHouse is an online, analytical processing (OLAP) database that powers use-cases that require fetching real-time, aggregated data that does not mutate a lot. ClickHouse is highly performant and can scale to large volumes of data as compared to traditional transactional relational databases (OLTP) such as Postgres, MySQL. For further reading around ClickHouse's capabilities, see [[1]](https://about.gitlab.com/blog/2022/04/29/two-sizes-fit-most-postgresql-and-clickhouse/), [[2]](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres) and [[3]](https://posthog.com/blog/clickhouse-vs-postgres).
At GitLab, [our current and future ClickHouse uses/capabilities](https://gitlab.com/groups/gitlab-com/-/epics/2075) reference & describe multiple use-cases that could be facilitated by using ClickHouse as a backing datastore. A majority of these talk about the following two major areas of concern:
1. Being able to leverage [ClickHouse's OLAP capabilities](https://clickhouse.com/docs/en/faq/general/olap) enabling underlying systems to perform an aggregated analysis of data, both over short and long periods of time.
1. The fact that executing these operations with our currently existing datasets primarily in Postgres, is starting to become challenging and non-performant.
Looking forward, assuming a larger volume of data being produced by our application(s) and the rate at which it gets produced, the ability to ingest it into a *more* capable system, both effectively and efficiently helps us scale our applications and prepare for business growth.
## Case studies
From an initial assessment of all (reported) use-cases that intend to utilise ClickHouse, the following broad patterns of usage can be observed:
1. Efficiently replicating existing data from other databases into ClickHouse, most prominently Postgres.
1. Directly ingesting large volumes of data into ClickHouse for asynchronous processing, data aggregation & analysis.
The following section(s) explain details of each problem-domain:
### 1. Replicating existing data into ClickHouse
With due reference to our prior work around this, it has been established that logical replication from Postgres is too slow. Instead, we'll need to be able to emit data change events within database transactions which can then get processed asynchronously to write or update corresponding data in ClickHouse.
The following case-studies describe how these groups intend to solve the underlying problem:
- ~group::optimize has been working towards a scalable PostgreSQL data replication strategy which can be implemented on the application layer.
- [Proposal: Scalable data sync/replication strategy](https://gitlab.com/gitlab-org/gitlab/-/issues/382172) talks about such a strategy and the additional challenges with using Sidekiq for queueing/batching needs.
- It has been observed that pumping data from `PostgreSQL` into `ClickHouse` directly might not be the right way to approach the problem at hand.
- In addition to the problems described above, another class of problems when replicating data across systems is also the handling of data backfill and/or data migrations that happen upstream.
- [group::data](https://handbook.gitlab.com/handbook/business-technology/data-team/) has been working around syncing data from some of our Postgres databases into a Snowflake-based data warehouse. See this issue for optioned considered: [List down all possible options for postgres to snowflake pipeline](https://gitlab.com/gitlab-data/gitlab.com-saas-data-pipeline/-/issues/13) before designing the current system in place.
- With the work done around our [Next Gen GitLab SaaS Data Pipeline](https://docs.google.com/presentation/d/1hVaCY42YhaO5UvgLzp3mbuMYJIFuTFYFJjdhixFTxPE/edit#slide=id.g143a48de8a3_0_0), the data team owns a "custom" pipeline that does incremental data extractions based on an `updated_at` timestamp column. This helps import a significant subset of operational database relations into Snowflake data-warehouse.
- As the volume of data grows, we can foresee this (ETL) pipeline warranting more time and resources to execute resulting in delays across the time between data being produced and being available in Snowflake data-warehouse.
- We might also see data inconsistency/incompleteness issues emanating from the current setup since row deletions are not transferred into Snowflake, inflating data volume and skewing analysis. Any information about multiple updates happening between import interval period are also lost.
- Having a scalable ingestion pipeline that can help replicate data from our databases into an intermediate system and/or ClickHouse in near real-time would help improve the operational characteristics around this system.
### 2. Ingesting large volumes of data into ClickHouse
We need to be able to ingest large volumes of potentially unaggregated data into ClickHouse which may result in a large number of small writes as well. This can have an adverse effect on how ClickHouse processes and stores incoming data. To mitigate this problem, we need to queue & batch smaller writes into larger ones to keep the ingestion pipeline efficient at all times.
The following case-studies describe how each group intends to solve the underlying problem:
- ~group::observability explains their need of ingesting large amounts of data into ClickHouse, with the following two issues:
- [Proposal: GitLab Observability Platform - Data Ingestion](https://gitlab.com/gitlab-org/opstrace/opstrace/-/issues/1878) talks about using an external events store, such as Kafka, to first ingest data as received from users, then writing it into ClickHouse in larger batches thereby eliminating the need to write a large number of small writes without hampering write performance from how ClickHouse `MergeTree` processes ingested data.
- In addition, [ClickHouse: Investigate client-side buffering to batch writes into ClickHouse](https://gitlab.com/gitlab-org/opstrace/opstrace/-/issues/2044) talks about their experimentation with using application-local queueing/batching to work around the problems mentioned above.
- ~"group::analytics instrumentation" has been working on building our analytics offering and recently looking at building and/or improving parts of the system.
- [Product Analytics Collector Component](https://gitlab.com/groups/gitlab-org/-/epics/9346) talks about replacing Jitsu with Snowplow for collecting and processing tracking events. For more details of the proposal, see [Jitsu replacement](https://gitlab.com/gitlab-org/analytics-section/analytics-instrumentation/proposals/-/blob/62d332baf5701810d9e7a0b2c00df18431e82f22/doc/jitsu_replacement.md).
- The initial design was prototyped with [Snowplow as Jitsu Replacement PoC](https://gitlab.com/gitlab-org/analytics-section/product-analytics/devkit/-/merge_requests/37).
- From the design, it is easy to observe how large amounts of data will be ingested into ClickHouse and could potentially benefit from the use of a scalable ingestion pipeline.
## Goals
### Well-defined, established client abstractions
We want to define and establish a fully-functional application-side abstraction that can help ingest data into ClickHouse without getting in the way of how an application itself is designed while keeping the underlying code backend-agnostic. The proposed abstraction should become the default choice for any applications, core or satellite, at GitLab.
### Support for high throughput in volume of writes
A solution here should enable an application to write any amount of inserts (on the order of up to 1000-5000 writes per second) to the underlying database efficiently while also allowing for growth as the application scales out. Considering how ClickHouse processes incoming writes, a proposed solution should be able to batch a number of very small writes into larger batches.
### Reliable, consistent delivery of data
A solution here should also ensure reliable & consistent delivery of ingested data into the underlying database minimising undue loss of data before being eventually persisted into ClickHouse.
## Non-goals
### Addressing data types, schemas or formats
At this stage of this proposal, we're not optimizing for addressing which data types, schemas or formats we receive ingested data in. It should be delegated to the backend-specific implementations themselves and not handled within the write abstraction.
### Addressing where our data sources exist today
We're also not addressing any client-side specific details into the design at this point. The write abstraction should only remain a tool for the language in which it is written. As long as an application can use it to write data as any other third-party library, we should be good to build on top of it.
## General Considerations
Having addressed the details of the two aforementioned problem-domains, we can model a proposed solution with the following logical structure:
- Ingestion
- APIs/SDKs
- HTTP2/gRPC Sidecar
- Transport & Routing
- Multi-destination
- Digestion/Compute
- Enrichment
- Processing
- Persisting
## Major challenges around building such a capability
### Self-managed environments
The single, biggest challenge around introducing ClickHouse and related systems would be the ability to make it available to our users running GitLab in self-managed environments. The intended goals of this proposal are intentionally kept within those constraints. It is also prudent to establish that what we're *proposing* here be applicable to applications consuming ClickHouse from inside self-managed environments.
There are ongoing efforts to streamline distribution and deployment of ClickHouse instances for managed environment within the larger scope of [ClickHouse Usage at GitLab](../clickhouse_usage/index.md). A few other issues tackling parts of the aforementioned problem are:
- [Research and understand component costs and maintenance requirements of running a ClickHouse instance with GitLab](https://gitlab.com/gitlab-com/www-gitlab-com/-/issues/14384)
- [ClickHouse maintenance and cost research](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/116669)
### Wide variety of data sources, their structures & usage patterns
The data that we intend to ingest into ClickHouse can come from a wide variety of data sources and be structured in different schemas or formats. With that considered, it's a non-trivial effort to draft a solution that suffices all use-cases efficiently.
Should we decide to build an intermediate ingestion system, any solution should help provide a source/schema/format-agnostic data transport layer with an established, matured client-abstraction to maximise the number of applications that can use it.
### Building on top of our current database infrastructure
Our current database infrastructure operates at a fairly large scale and adding more applications that continuously read/write against it adds to the pressure on the existing resources. It's important we move away any workloads and/or datasets that can be safely processed in a different context altogether.
### Service Discovery
We're still normalising the details around distribution and deployment of ClickHouse clusters and/or instances for our applications. Subject to how we end up doing it, the ability for a client to discover which ClickHouse cluster, shard or table to use would need to become a part of any such solution.
## Proposed Solution
In light of the problems discussed earlier, it'd be in our better interests to allow the usage of an external, intermediate system subject to what one's needs might be, especially around the volume & scale of data being written from an application into ClickHouse.
Therefore, we intend to develop an abstraction that can enable an application to store data into ClickHouse regardless of the scale that they (currently) operate at. It also:
- Facilitates an application to switch from one *technology* to another should their performance and/or scale requirements change over time.
- Allows for backend-specific conventions, configurations & best practices such as instrumentation, service-discovery, etc. to be encoded in one place for all applications to leverage consistently.
## Design & Implementation
### Core assumptions
- We're only going to focus on writing data into ClickHouse as mentioned in aforementioned non-goals. With details of how our data lands into ClickHouse, this document does not (intentionally) address where this data comes from. Some of those details are delegated to the applications generating this data i.e as long as they can consume this abstraction, they should be able to write data into ClickHouse.
- We're going to delegate the choice of different storage backends to a following blueprint or epic since that's outside the scope of this design. With ClickHouse as the eventual destination for our data, this document only talks about writing data into it - either directly or indirectly via a queueing/batching system.
### Architecture
![Architecture](clickhouse_dbwriter.png)
Having an abstraction around writing data helps client-side instrumentation stay backend-agnostic, allowing it to switch code paths depending on where it runs.
An example setup should look like:
```ruby
Gitlab::Database::Writer.config do |config|
#
# when using sync mode, data gets written directly into ClickHouse,
# therefore, it's also assumed the backend here is ClickHouse
config.mode = :sync OR :async
config.backend = :clickhouse # optional
# OR
#
# when using async mode, data is written to an intermediate system
# first, then written into ClickHouse asynchronously
config.mode = :async
config.backend = :pubsub OR :kafka OR :otherbackend
#
# then backend-specific configurations hereafter
#
config.url = 'tcp://user:pwd@localhost:9000/database'
# for example, a serializer helps define how data travels over the wire
config.json_serializer = ClickHouse::Serializer::JsonSerializer
# ...
end
# do application-specific processing
# eventually, write data using the object you just built
Gitlab::Database::Writer.write(
Gitlab::Database::Model::MyTable,
[{ id: 1, foo: 'bar' }],
)
```
We intend to keep `Gitlab::Database::Writer.backend` to be as close to the backend-specific client implementation as possible. Having a wrapper around a vanilla client helps us address peripheral concerns such as service-discovery for the backends while still allowing the user to leverage features of a given client.
### Iterations
Considering the large scope of this undertaking and the need for feedback around actual usage, we intend to build the proposed abstraction(s) across multiple iterations which can be described as follows:
#### Iteration 1 - Develop write abstraction with sync mode enabled
First, research and develop a simple write abstraction that our users can begin to use to write data into ClickHouse. This ensures our choice of the underlying client is well-researched and suffices to fulfill needs of as many reported use-cases as possible. Being able to see this running would help gather user-feedback and improve the write APIs/interfaces accordingly.
Given this feedback and more development with how we aim to deploy ClickHouse across our environments, it'd then be prudent to build into this abstraction necessary conventions, best practices and abstract away details around connection-pooling, service-discovery, etc.
#### Iteration 2 - Add support for schemas & data validation
In the next iteration, we plan to add support for schema usage and validation. This helps keep model definitions sane and allows for validating data to be inserted.
#### Iteration 3 - Add support for async mode, PoC with one backend
With the above two iterations well-executed, we can start to scale up our write abstractions adding the support for writing data into intermediate data stores before writing it into ClickHouse asynchronously. We aim to prototype such an implementation with at least one such backend.
#### Further iterations
With a backend-agnostic abstraction becoming the ingestion interface a client interacts with, there's various other use-cases that can be solved from within this abstraction. Some of them are:
- Zero-configuration data ingestion from multiple sources
- Dynamically enriching data from multiple sources
- Offloading data to long-term retention data stores
### Possible backend implementations
- Applications writing directly to ClickHouse
- Application-local in-memory queueing/batching of data
- Application-local persistent queueing/batching of data
- Non-local queueing/batching of data before eventually writing into ClickHouse
- Managed cloud backends:
- [Google PubSub](https://cloud.google.com/pubsub)
- [AWS Kinesis](https://aws.amazon.com/kinesis/)
- Self-managed backends:
- [CHProxy](https://www.chproxy.org/)
- [Kafka](https://kafka.apache.org/)
- [RedPanda](https://redpanda.com/)
- [Vector](https://vector.dev/)
- [RabbitMQ](https://www.rabbitmq.com/)
### Additional complexity when using a non-local backend
- The need for running an additional process/sub-system that reads data from the concerned backend and writes it into ClickHouse efficiently and reliably.
- The additional hop across the backend also means that there might be potential delays in how soon this data lands into ClickHouse.
Though the points above describe additional complexity for an application, they can be treated as valid trade-off(s) assuming their need for data ingestion at scale.
### Comparing backends across multiple dimensions
| Dimension | CHProxy | Redis | Google PubSub | Apache Kafka |
|---|---|---|---|---|
| Operations | Trivial | Trivial | Managed | Non-trivial, complex |
| Data Retention | Non-durable | Non-durable | Durable | Durable |
| Performance | Good | Good | High | High |
| Data Streaming | None | Minimal | Good | Best |
| Suitable for self-managed environments | Trivial | Trivial | - | Complex |
## References
- [ClickHouse use-cases within Manage](https://gitlab.com/groups/gitlab-org/-/epics/7964)
- [List down all possible options for postgres to snowflake pipeline](https://gitlab.com/gitlab-data/gitlab.com-saas-data-pipeline/-/issues/13)
- [Design Spike for Snowplow For Data Event capture](https://gitlab.com/gitlab-data/analytics/-/issues/12397)
- [Audit Events Performance Limits](https://gitlab.com/gitlab-org/gitlab/-/issues/375545)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,318 +1,11 @@
---
status: proposed
creation-date: "2023-02-23"
authors: [ "@mikolaj_wawrzyniak", "@jdrpereira", "@pskorupa" ]
coach: "@DylanGriffith"
approvers: [ "@nhxnguyen" ]
owning-stage: "~workinggroup::clickhouse"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_read_abstraction_layer/'
remove_date: '2025-07-08'
---
# Consider an abstraction layer to interact with ClickHouse or alternatives
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_read_abstraction_layer/).
## Table of Contents
- [Summary](#summary)
- [Motivation](#motivation)
- [Goals](#goals)
- [Non-goals](#non-goals)
- [Possible solutions](#possible-solutions)
- [Recommended approach](#recommended-approach)
- [Overview of open source tools](#overview-of-open-source-tools)
- [Open Questions](#open-questions)
## Summary
Provide a solution standardizing read access to ClickHouse or its alternatives for GitLab installations that will not opt-in to install ClickHouse. After analyzing different [open-source tools](#overview-of-open-source-tools) and weighing them against an option to [build a solution internally](#recommended-approach). The current recommended approach proposes to use dedicated database-level drivers to connect to each data source. Additionally, it proposes the usage of [repository pattern](https://martinfowler.com/eaaCatalog/repository.html) to confine optionally database availability complexity to a single application layer.
## Motivation
ClickHouse requires significant resources to be run, and smaller installations of GitLab might not get a return on investment with the provided performance improvement. That creates a risk that ClickHouse might not be globally available for all installations and features might need to alternate between the different data stores available. Out of all [present & future ClickHouse use cases](https://gitlab.com/groups/gitlab-com/-/epics/2075) that have already been proposed as part of the working group, 7 out of 10 use data stores different from ClickHouse. Considering that context it is important to
support those use cases in their effort to adopt ClickHouse by providing them with tools and guidelines that will standardize interactions with available data stores.
The proposed solution can take different forms from stand-alone tooling
offering a unified interface for interactions with underlying data stores, to a set of libraries supporting each of the data stored individually backed by implementation guidelines that will describe rules and limitations placed around data stores interactions, and drawing borders of encapsulation.
## Goals
- Limit the impact of optionally available data stores on the overall GitLab application codebase to [single abstraction layer](../../../development/reusing_abstractions.md#abstractions)
- Support all data store specific features
- Support communication for satellite services of the main GitLab application
## Non-goals
- This proposal does not directly consider write communication with database, as this is a subject of [complementary effort](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/111148)
- This proposal does not directly consider schema changes and data migration challenges
Despite the above points being non-goals, it is acknowledged that they might impose some alterations to the final solution, which is expressed at the end of this document in the [Open questions](#open-questions) section.
## Possible Solutions
High-level goals described in the previous paragraph can be achieved by both in-house-built solutions as well as by adopting open-source tools.
The following sections will take a closer look into both of those avenues
### Recommended approach
In the spirit of MVC and iteration, it is proposed to start with a solution that would rely on drivers that directly interact
with corresponding data stores, like ActiveRecord for Ruby. For this solution to be able to achieve goals set for
this exit criteria and help mitigate the issue listed in the _Motivation_ section of this document, such drivers need to be supported
by a set of development guidelines enforced with static code analysis.
Such a solution was selected as preferred upon receiving feedback from different members of the working group concerned
about the risk of limitations that might be imposed by open-source tools, preventing groups from taking advantage of ClickHouse
features to their fullest. Members collaborating around working group criteria presented in this document, agree that
concerns around limitations could be mitigated by building a comprehensive set of prototypes, however time and effort
required to achieve that surpass the limits of this working group. It is also important to notice that ClickHouse adoption
is in an exploratory stage, and groups might not even be able to state what their requirements are just yet.
#### Proposed drivers
Following ClickHouse documentation there are the following drivers for Ruby and Go
##### Ruby
1. [ClickHouse Ruby driver](https://github.com/shlima/click_house) - Previously selected for use in GitLab as part of the Observability group's research (see: [issue](https://gitlab.com/gitlab-org/gitlab/-/issues/358158))
1. [Clickhouse::Activerecord](https://github.com/PNixx/clickhouse-activerecord)
##### Go
1. [ClickHouse/clickhouse-go](https://github.com/ClickHouse/clickhouse-go) - Official SQL database client.
1. [uptrace/go-clickhouse](https://clickhouse.uptrace.dev/) - Alternative client.
##### Proposed client architecture
To keep the codebase well organized and limit coupling to any specific database engine it is important to encapsulate
interactions, including querying data to a single application layer, that would present its interface to layers above in
similar vein to [ActiveRecord interface propagation through abstraction layers](../../../development/reusing_abstractions.md)
Keeping underlying database engines encapsulated makes the recommended solution a good two-way door decision that
keeps the opportunity to introduce other tools later on, while giving groups time to explore and understand their use cases.
At the lowest abstraction layer, it can be expected that there will be a family of classes directly interacting with the ClickHouse driver, those classes
following MVC pattern implemented by Rails should be classified as _Models_.
Models-level abstraction builds well into existing patterns and guidelines but unfortunately does not solve the challenge of the optional availability of the ClickHouse database engine for self-managed instances. It is required to design a dedicated entity that will house the responsibility of selecting the best database to serve a business logic request.
From the already mentioned existing abstraction [guidelines](../../../development/reusing_abstractions.md) `Finders` seems to be the closest to the given requirements, due to the fact that `Finders` encapsulate database specific interaction behind their own public API, hiding database vendors detail from all layers above them.
However, they are closely coupled to `ActiveRecord` ORM framework, and are bound by existing GitLab convention to return `ActiveRecord::Relation` objects, that might be used to compose even more complex queries. That coupling makes `Finders` unfit to deal with the optional availability of ClickHouse because returned data might come from two different databases, and might not be compatible with each other.
With all that above in mind it might be worth considering adding a new entity into the codebase that would exist on a similar level of abstraction as `Finders` yet it would be required to return an `Array` of data objects instead.
The required level of isolation can be achieved with the usage of a [repository pattern](https://martinfowler.com/eaaCatalog/repository.html). The repository pattern is designed to separate business/domain logic from data access concerns, which is exactly what this proposal is looking for.
What is more, the repository pattern does not limit operations performed on underlying databases, allowing for full utilization of their features.
To implement the repository pattern, the following things need to be created:
1. A **strategy** for each of supported databases, for example: `MyAwesomeFeature::Repository::Strategies::ClickHouseStrategy` and `MyAwesomeFeature::Repository::Strategies::PostgreSQLStrategy`. Strategies are responsible for implementing communication with underlying database ie: composing queries
1. A **repository** that is responsible for exposing a high-level interface to interact with the database using one of the available strategies, selected with some predefined criteria, for example: database availability. Strategies used by a single repository must share the same public interface so they can be used interchangeably
1. A **Plain Old Ruby Object (PORO) Model** that represents data in business logic implemented by application layers using the repository. It has to be database agnostic
It is important to notice that the repository pattern based solution has already been implemented by Observability group (kudos to: @ahegyi, @splattael and @brodock). [`ErrorTracking::ErrorRepository`](https://gitlab.com/gitlab-org/gitlab/-/blob/1070c008b9e72626e25296480f82f2ee2b93f847/lib/gitlab/error_tracking/error_repository.rb) is being used to support migration of error tracking features from PostgreSQL to ClickHouse (integrated via API), and uses feature flag toggle as database selection criteria, that is great example of optional availability of database.
`ErrorRepository` is using two strategies:
1. [`OpenApiStrategy`](https://gitlab.com/gitlab-org/gitlab/-/blob/d0bdc8370ef17891fd718a4578e41fef97cf065d/lib/gitlab/error_tracking/error_repository/open_api_strategy.rb) to interact with ClickHouse using API proxy entity
1. [`ActiveRecordStrategy`](https://gitlab.com/gitlab-org/gitlab/-/blob/d0bdc8370ef17891fd718a4578e41fef97cf065d/lib/gitlab/error_tracking/error_repository/active_record_strategy.rb) to interact with PostgreSQL using `ActiveRecord` framework
Each of those strategies returns data back to the abstraction layers above using the following PORO Models:
1. [`Gitlab::ErrorTracking::Error`](https://gitlab.com/gitlab-org/gitlab/-/blob/a8ea29d51ff23cd8f5b467de9063b64716c81879/lib/gitlab/error_tracking/error.rb)
1. [`Gitlab::ErrorTracking::DetailedError`](https://gitlab.com/gitlab-org/gitlab/-/blob/a8ea29d51ff23cd8f5b467de9063b64716c81879/lib/gitlab/error_tracking/detailed_error.rb)
Additionally `ErrorRepository` is great example of remarkable flexibility offered by the repository pattern in terms of supported types of data stores, allowing to integrate solutions as different as a library and external service API under single unified interface. That example presents opportunity that the repository pattern in the future might be expanded beyond needs of ClickHouse and PostgreSQL when some use case would call for it.
Following [merge request](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/85907/diffs) documents changes done by observability group in order to migrate from using current GitLab architecture based on ActiveRecord Models, Services and Finders to the repository pattern.
##### Possible ways to enforce client architecture
It is not enough to propose a client-side architecture for it to fully be established as common practice it needs
to be automatically enforced, reducing the risk of developers unconsciously going against it. There are multiple ways to
introduce automated verification of repository pattern implementation including:
1. Utilize `ActiveRecord` query subscribers in a similar way to [Database::PreventCrossJoins](https://gitlab.com/gitlab-org/gitlab/-/blob/master/spec/support/database/prevent_cross_joins.rb) in order to detect queries to ClickHouse executed outside of _Strategies_
1. Expanding [`CodeReuse`](https://gitlab.com/gitlab-org/gitlab/-/tree/master/rubocop/cop/code_reuse) RuboCop rules to flag all usage of ClickHouse driver outside of _Strategies_
1. Create RuboCop rule that detects calls to utility method that checks the presence of ClickHouse instance (ie: `CurrentSettings.click_house_enabled?`) that are being made outside of _Repositories_
At this development stage, authors see all of the listed options as viable and promising, therefore a decision about which ones to use would be deferred to the moment when the first repository pattern implementation for ClickHouse will emerge.
### Overview of open-source tools
In this section the authors provide an overview of existing 3rd party open-source solutions that were considered as alternative approaches to achieve the stated goal, but were not selected as the recommended approach.
#### Evaluation criteria
##### 1. License (MUST HAVE)
1. Solutions must be open source under an [acceptable license](https://handbook.gitlab.com/handbook/engineering/open-source/#acceptable-licenses).
##### 2. Support for different data stores (MUST HAVE)
1. It focuses on the fact whether the proposed abstraction layer can support both ClickHouse and PostgreSQL (must have)
1. Additional consideration might be if more than the two must-have storages are supported
1. The solution must support the [minimum required versions](../../../install/requirements.md#postgresql-requirements) for PostgreSQL
##### 3. Protocol compatibility
Every abstraction layer comes at the cost of limited API compared to direct access to the tool. This exit criterion is trying to bring understanding to the degree of trade-off being made on limiting tools API for the sake of a common abstraction.
1. List what read operations can be done via PostgreSQL and ClickHouse (`selects`, `joins`, `group by`, `order by`, `union` etc)
1. List what operations can be done with the proposed abstraction layer, how complicated it is to do such operations, and whether are there any performance concerns when compared to running operations natively
1. Does it still allow for direct access to a data source in case the required operation is not supported by the abstraction layer, eg: `ActiveRecord` allows for raw SQL strings to be run with `#execute`
##### 4. Operational effort
1. Deployment process: how complex is it? Is the proposed tool a library tool that is being added into the stack, or does it require additional services to be deployed independently along the GitLab system. What deployment types does the tool support (Kubernetes/VMs, SaaS/self-managed, supported OS, cloud providers). Does it support offline installation.
1. How many hardware resources does it need to operate
1. Does it require complex monitoring and operations to assure stable and performant services
1. Matured maintenance process and documentation around it: upgrades, backup and restore, scaling
1. High-availability support. Does the tool have documentation how to build HA cluster and perform failovers for self-managed? Does the tool support zero-downtime upgrade?
1. FIPS and FedRAMP compliance
1. Replication process and how the new tool would fit in GitLab Geo.
##### 5. Developer experience
1. Solutions must have well-structured, clear, and thoroughly documented APIs to ease adoption and reduce the learning curve.
##### 6. Maturity (nice to have)
1. How long does the solution exist? Is it used often? Does it have a stable community? If the license permits forking tool is also a considerable option
##### 7. Tech fit
1. Is the solution written in one of the programming languages we use at GitLab so that we can more easily contribute with bug fixes and new features?
##### 8. Interoperability (Must have)
1. Can the solution support both the main GitLab application written in Ruby on Rails also satellite services like container registry that might be written in Go
#### Open - Source solutions
##### 1. [Cube.dev](https://cube.dev/)
**Evaluation**
1. License
Apache 2.0 + MIT ✅
1. Support for different data stores
Yes ✅
1. Protocol compatibility
It uses OLAP theory concepts to aggregate data. This might be useful in some use cases like aggregating usage metrics, but not in others. It has APIs for both SQL queries and their own query format.
1. Operational effort
Separate service to be deployed using Docker or k8s. Uses Redis as a cache and data structure store.
1. Developer experience
Good [documentation](https://cube.dev/docs/product/introduction)
1. Maturity
Headless BI tools themselves are a fairly new idea, but Cube.js seems to be the leading open-source solution in this space.
The Analytics section uses it internally for our Product Analytics stack.
1. Tech fit
Uses REST and GraphQL APIs. It has its own query and data schema formats, but they are well-documented. Data definitions in either YAML or JavaScript.
**Comment**
The solution is already being used as a read interface for ClickHouse by ~"group::product analytics",
to gather first hand experience there was a conversation held with @mwoolf with key conclusions being:
1. ClickHouse driver for cube.dev is community-sourced, and it does not have a maintainer as of now, which means there is no active development. It is a small and rather simple repository that should work at least until a new major version of ClickHouse will arrive with some breaking changes
1. Cube.dev is written in TypeScript and JavaScript, which are part of the GitLab technical stack, and there are engineers here with expertise in them; however, Cube.dev is expected to be mostly used by backend developers, who do not have that much experience in the mentioned technologies
1. Abstraction layer for simple SQL works, based on JSON will build correct query depending on the backend
1. Data store-specific functions (like window funnel ClickHouse) are not being translated to other engines, which requires additional cube schemas to be built to represent the same data.
1. Performance so far was not an issue both on local dev and on AWS VPS millions of rows import load testing
1. It exposes a PostgreSQL-like interface for most engines, but unfortunately not for ClickHouse, so for the sake of the working group use case the JSON API might be more feasible
1. Cube.dev can automatically generate schemas on the fly, which can be used conditionally in the runtime handling optional components like ClickHouse
There is also a [recording](https://youtu.be/iBPTCrvOBBs) of that conversation available.
##### 2. [ClickHouse FDW](https://github.com/ildus/clickhouse_fdw)
**Evaluation**
A ClickHouse Foreign Data Wrapper for PostgreSQL. It allows ClickHouse tables to be queried as if they were stored in PostgreSQL.
Could be a viable option to easily introduce ClickHouse as a drop-in replacement when Postgres stops scaling.
1. License
Apache 2.0 ✅
1. Support for different data stores
Yes, by calling ClickHouse through a PostgreSQL instance. ✅
1. Protocol compatibility
Supports SELECT, INSERT statements at a first glance. Not sure about joins. Allows for raw SQL by definition.
1. Operational effort
1. A PostgreSQL extension. Requires some mapping between the two DBs.
1. Might have an adverse impact on PostgreSQL performance, when execution would wait for a response from ClickHouse, wasting CPU cycles on waiting
1. Requires exposing and managing the connection between deployments of PostgreSQL and ClickHouse
1. Developer experience
TBD
1. Maturity
It's been around for a few years and is listed in ClickHouse docs, but doesn't seem to be widely used.
1. Tech fit
Raw SQL statements.
**Comment**
##### 3. [Clickhouse::Activerecord](https://github.com/PNixx/clickhouse-activerecord)
**Evaluation**
1. License
MIT License ✅
1. Support for different data stores
Yes, in the sense that it provides a Clickhouse adapter for ActiveRecord in the application layer so that it can be used to query along PostgreSQL. ✅
1. Protocol compatibility
Not sure about joins - no examples.
1. Operational effort
Ruby on Rails library tool - ORM interface in a form of an ActiveRecord adapter.
1. Developer experience
Easy to work with for developers familiar with Rails.
1. Maturity
Has been around for a few years, but repo activity is scarce (not a bad thing by itself, however).
1. Tech fit
Rails library, so yes.
**Comment**
##### 4. [Metriql](https://metriql.com/)
**Evaluation**
A headless BI solution using DBT to source data. Similar to Cube.dev in terms of defining metrics from data and transforming them with aggregations.
The authors explain the differences between Metriql and other BI tools like Cube.js in this FAQ entry.
1. License
Apache 2.0 ✅
1. Support for different data stores
Uses DBT to read from data sources, so CH and PostgreSQL are possible.
1. Protocol compatibility
It uses OLAP theory concepts to aggregate data. It does allow for impromptu SQL queries through a REST API.
1. Operational effort
It's a separate service to deploy and requires DBT.
1. Developer experience
I assume it requires DBT knowledge to set up and use. It has a fairly simple REST API documented here.
1. Maturity
First release May 2021, but repo activity is scarce (not a bad thing by itself).
1. Tech fit
Connects with BI tools through a REST API or JDBC Adapter. Allows querying using SQL or MQL (which is a SQL flavor/subset).
**Comment**
##### 5. Notable rejected 3rd party solutions
ETL only solutions like Airflow and Meltano, as well as visualization tools like Tableau and Apache Superset, were excluded from the prospect list as they are usually clearly outside our criteria.
**[pg2ch](https://github.com/mkabilov/pg2ch)**
PostgreSQL to ClickHouse mirroring using logical replication.
Repo archived; explicitly labeled not for production use. Logical replication might not be performant enough at our scale - we don't use it in our PostgreSQL DBs because of performance concerns.
**Looker**
BI tooling.
Closed-source; proprietary.
**[Hasura](https://github.com/hasura/graphql-engine)**
GraphQL interface for database sources.
No ClickHouse support yet.
**[dbt Server](https://github.com/dbt-labs/dbt-server)**
HTTP API for dbt. MariaDB Business Source License (BSL) ❌
### Open questions
1. This proposal's main focus is the read interface; however, depending on the outcome of the [complementary effort](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/111148) that focuses on the write interface, similar concerns around optional availability might be applicable to write interactions. In case the ingestion pipeline does not resolve the optional availability challenges for the write interface, it might be worth considering including write interactions in the repository pattern implementation proposed in this document.
1. Concerns around ClickHouse schema changes and data migrations are not covered by any existing working group criteria. Even though solving these challenges as a whole is outside of the scope of this document, it is prudent to raise awareness that some alterations to the proposed repository pattern based implementation might be required in order to support schema changes.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,62 +1,11 @@
---
status: proposed
creation-date: "2023-02-02"
authors: [ "@nhxnguyen" ]
coach: "@grzesiek"
approvers: [ "@dorrino", "@nhxnguyen" ]
owning-stage: "~devops::data stores"
participating-stages: ["~section::ops", "~section::dev"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_usage/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_usage/).
# ClickHouse Usage at GitLab
## Summary
[ClickHouse](https://clickhouse.com/) is an open-source column-oriented database management system. It can efficiently filter, aggregate, and sum across large numbers of rows. In FY23, GitLab selected ClickHouse as its standard data store for features with big data and insert-heavy requirements such as Observability and Analytics. This blueprint is a product of the [ClickHouse working group](https://handbook.gitlab.com/handbook/company/working-groups/clickhouse-datastore/). It serves as a high-level blueprint to ClickHouse adoption at GitLab and references other blueprints addressing specific ClickHouse-related technical challenges.
## Motivation
In FY23-Q2, the Monitor:Observability team developed and shipped a [ClickHouse data platform](https://gitlab.com/groups/gitlab-org/-/epics/7772) to store and query data for Error Tracking and other observability features. Other teams have also begun to incorporate ClickHouse into their current or planned architectures. Given the growing interest in ClickHouse across product development teams, it is important to have a cohesive strategy for developing features using ClickHouse. This will allow teams to more efficiently leverage ClickHouse and ensure that we can maintain and support this functionality effectively for SaaS and self-managed customers.
### Use Cases
Many product teams at GitLab are considering ClickHouse when developing new features and to improve performance of existing features.
During the start of the ClickHouse working group, we [documented existing and potential use cases](https://gitlab.com/groups/gitlab-com/-/epics/2075#use-cases) and found that there was interest in ClickHouse from teams across all DevSecOps stage groups.
### Goals
As ClickHouse has already been selected for use at GitLab, our main goal now is to ensure successful adoption of ClickHouse across GitLab. It is helpful to break down this goal according to the different phases of the product development workflow.
1. Plan: Make it easy for development teams to understand if ClickHouse is the right fit for their feature.
1. Develop and Test: Give teams the best practices and frameworks to develop ClickHouse-backed features.
1. Launch: Support ClickHouse-backed features for SaaS and self-managed.
1. Improve: Successfully scale our usage of ClickHouse.
### Non-goals
A strategy for integrating ClickHouse into GitLab Dedicated has not begun. Leadership guidance has been to wait until there is clearer demand for ClickHouse backed features before prioritizing this.
### Product roadmap
#### FY24 H2 (past)
In FY24 Q2 we began working to integrate ClickHouse with GitLab.com to support multiple features under development (see [issue](https://gitlab.com/gitlab-com/www-gitlab-com/-/issues/34299)). We did not move forward attempting to integrate with self managed at this time due to the uncertain costs and management requirements for self-managed instances. This near-term implementation will be used to develop best practices and strategy to direct self-managed users. This will also constantly shape our recommendations for self-managed instances that want to onboard ClickHouse early. As of FY24 Q3 ClickHouse is available for use with GitLab.com.
#### FY25 H1 (current)
After we have formulated best practices of managing ClickHouse ourselves for GitLab.com, we will begin to offer supported recommendations for self-managed instances that want to run ClickHouse themselves. During this phase we will allow users to "Bring your own ClickHouse" similar to our [approach for Elasticsearch](../../../integration/advanced_search/elasticsearch.md#install-elasticsearch-or-aws-opensearch-cluster). For the features that require ClickHouse for optimal usage (Value Streams Dashboard, [Product Analytics](https://gitlab.com/groups/gitlab-org/-/epics/8921)), this will be the initial go-to-market action. Notably, the Observability team has made the decision to support self-managed users via GitLab Cloud Connector instead of following this approach.
#### Long-term
We will work towards a packaged reference version of ClickHouse capable of being easily managed with minimal cost increases for self-managed users. We should be able to reliably instruct users on the management of ClickHouse and provide accurate costs for usage. This will mean any feature could depend on ClickHouse without decreasing end-user exposure.
## Best Practices
Best practices and guidelines for developing performant, secure, and scalable features using ClickHouse are located in the [ClickHouse developer documentation](../../../development/database/clickhouse/index.md).
## Cost and maintenance analysis
ClickHouse components cost and maintenance analysis is located in the [ClickHouse Self-Managed component costs and maintenance requirements](self_managed_costs_and_requirements/index.md).
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,65 +1,11 @@
---
status: proposed
creation-date: "2023-04-04"
authors: [ "@niskhakova", "@dmakovey" ]
coach: "@grzesiek"
approvers: [ "@dorrino", "@nhxnguyen" ]
owning-stage: "~workinggroup::clickhouse"
participating-stages: ["~section::enablement"]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_usage/self_managed_costs_and_requirements/'
remove_date: '2025-07-08'
---
# ClickHouse Self-Managed component costs and maintenance requirements
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/clickhouse_usage/self_managed_costs_and_requirements/).
## Summary
[ClickHouse](https://clickhouse.com/) requires additional cost and maintenance for self-managed customers:
- **Resource allocation cost**: ClickHouse requires a considerable amount of resources to run optimally.
- [Minimum cost estimation](#minimum-self-managed-component-costs) shows that setting up ClickHouse can be applicable only for very large Reference Architectures: 25k and up.
- **High availability**: ClickHouse SaaS supports HA. No documented HA configuration for self-managed at the moment.
- **Geo setups**: Sync and replication complexity for GitLab Geo setups.
- **Upgrades**: An additional database to maintain and upgrade along with existing Postgres database. This also includes compatibility issues of mapping GitLab version to ClickHouse version and keeping them up-to-date.
- **Backup and restore:** Self-managed customers need to have an engineer who is familiar with backup strategies and disaster recovery process in ClickHouse or switch to ClickHouse SaaS.
- **Monitoring**: ClickHouse can use Prometheus, additional component to monitor and troubleshoot.
- **Limitations**: Azure object storage is not supported. GitLab does not have the documentation or support expertise to assist customers with deployment and operation of self-managed ClickHouse.
- **ClickHouse SaaS**: Customers using a self-managed GitLab instance with regulatory or compliance requirements, or latency concerns likely cannot use ClickHouse SaaS.
### Minimum self-managed component costs
Based on [ClickHouse spec requirements](https://gitlab.com/gitlab-com/www-gitlab-com/-/issues/14384#note_1307456092) analysis
and collaborating with ClickHouse team, we identified the following minimal configurations for ClickHouse self-managed:
1. ClickHouse High Availability (HA)
- ClickHouse - 2 machines with >=16-cores, >=64 GB RAM, SSD, 10 GB Internet. Each machine also runs Keeper.
- [Keeper](https://clickhouse.com/docs/en/guides/sre/keeper/clickhouse-keeper) - 1 machine with 2 CPU, 4 GB of RAM, SSD with high IOPS
1. ClickHouse non-HA
- ClickHouse - 1 machine with >=16-cores, >=64 GB RAM, SSD, 10 GB Internet.
The following [cost table](https://gitlab.com/gitlab-com/www-gitlab-com/-/issues/14384#note_1324085466) was compiled using the machine CPU and memory requirements for ClickHouse, and comparing them to the
GitLab Reference Architecture sizes and [costs](../../../../administration/reference_architectures/index.md#cost-calculator-templates) from the GCP calculator.
| Reference Architecture | ClickHouse type | ClickHouse cost / (GitLab cost + ClickHouse cost) |
|-------------|-----------------|-----------------------------------|
| [20 RPS / 1k users - non HA](https://cloud.google.com/products/calculator#id=a6d6a94a-c7dc-4c22-85c4-7c5747f272ed) | [non-HA](https://cloud.google.com/products/calculator#id=9af5359e-b155-451c-b090-5f0879bb591e) | 78.01% |
| [40 RPS / 2k users- non HA](https://cloud.google.com/products/calculator#id=0d3aff1f-ea3d-43f9-aa59-df49d27c35ca) | [non-HA](https://cloud.google.com/products/calculator#id=9af5359e-b155-451c-b090-5f0879bb591e) | 44.50% |
| [60 RPS / 3k users - HA](https://cloud.google.com/products/calculator/#id=15fc2bd9-5b1c-479d-bc46-d5ce096b8107) | [HA](https://cloud.google.com/products/calculator#id=9909f5af-d41a-4da2-b8cc-a0347702a823) | 37.87% |
| [100 RPS / 5k users - HA](https://cloud.google.com/products/calculator/#id=9a798136-53f2-4c35-be43-8e1e975a6663) | [HA](https://cloud.google.com/products/calculator#id=9909f5af-d41a-4da2-b8cc-a0347702a823) | 30.92% |
| [200 RPS / 10k users - HA](https://cloud.google.com/products/calculator#id=cbe61840-31a1-487f-88fa-631251c2fde5) | [HA](https://cloud.google.com/products/calculator#id=9909f5af-d41a-4da2-b8cc-a0347702a823) | 20.47% |
| [500 RPS / 25k users - HA](https://cloud.google.com/products/calculator#id=b4b8b587-508a-4433-adc8-dc506bbe924f) | [HA](https://cloud.google.com/products/calculator#id=9909f5af-d41a-4da2-b8cc-a0347702a823) | 14.30% |
| [1000 RPS / 50k users - HA](https://cloud.google.com/products/calculator/#id=48b4d817-d6cd-44b8-b069-0ba9a5d123ea) | [HA](https://cloud.google.com/products/calculator#id=9909f5af-d41a-4da2-b8cc-a0347702a823) | 8.16% |
NOTE:
The ClickHouse Self-Managed component evaluation is the minimum estimation for the costs
with a simplified architecture.
The following components increase the cost, and were not considered in the minimum calculation:
- Disk size - depends on data size, hard to estimate.
- Disk types - ClickHouse recommends [fast SSDs](https://clickhouse.com/docs/ru/operations/tips#storage-subsystem).
- Network usage - ClickHouse recommends using [10 GB network, if possible](https://clickhouse.com/docs/en/operations/tips#network).
- For HA we sum minimum cost across all reference architectures from 60 RPS / 3k users to 1000 RPS / 50k users, but HA specs tend to increase with user count.
### Resources
- [Research and understand component costs and maintenance requirements of running a ClickHouse instance with GitLab](https://gitlab.com/gitlab-com/www-gitlab-com/-/issues/14384)
- [ClickHouse for Error Tracking on GitLab.com](https://gitlab.com/gitlab-com/gl-infra/readiness/-/blob/master/library/database/clickhouse/index.md)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,350 +1,11 @@
---
status: ongoing
creation-date: "2022-12-28"
authors: [ "@dgruzd", "@DylanGriffith" ]
coach: "@DylanGriffith"
approvers: [ "@joshlambert", "@changzhengliu" ]
owning-stage: "~devops::enablement"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/code_search_with_zoekt/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/code_search_with_zoekt/).
# Use Zoekt For code search
## Summary
We will be implementing an additional code search functionality in GitLab that
is backed by [Zoekt](https://github.com/sourcegraph/zoekt), an open source
search engine that is specifically designed for code search. Zoekt will be used as
an API by GitLab and remain an implementation detail while the user interface
in GitLab will not change much except for some new features made available by
Zoekt.
This will be rolled out in phases to ensure that the system will actually meet
our scaling and cost expectations and will run alongside code search backed by
Elasticsearch until we can be sure it is a viable replacement. The first step
will be making it available for internal use by `gitlab-org` and then expanding
customer by customer based on customer interest.
## Motivation
GitLab code search functionality today is backed by Elasticsearch.
Elasticsearch has proven useful for other types of search (issues, merge
requests, comments and so-on) but is by design not a good choice for code
search where users expect matches to be precise (that is, no false positives) and
flexible (for example, support
[substring matching](https://gitlab.com/gitlab-org/gitlab/-/issues/325234)
and
[regexes](https://gitlab.com/gitlab-org/gitlab/-/issues/4175)). We have
[investigated our options](https://gitlab.com/groups/gitlab-org/-/epics/7404)
and [Zoekt](https://github.com/sourcegraph/zoekt) is pretty much the only well
maintained open source technology that is suited to code search. Based on our
research we believe it will be better to adopt a well maintained open source
database than attempt to build our own. This is mostly due to the fact that our
research indicates that the fundamental architecture of Zoekt is what we would
implement again if we tried to implement something ourselves.
Our
[early benchmarking](https://gitlab.com/gitlab-org/gitlab/-/issues/370832#note_1183611955)
suggests that Zoekt will be viable at our scale, but we feel strongly
that investing in building a beta integration with Zoekt and rolling it out
group by group on GitLab.com will provide better insights into scalability and
cost than more accurate benchmarking efforts. It will also be relatively low
risk as it will be rolled out internally first and later rolled out to
customers that wish to participate in the trial.
### Goals
The main goals of this integration will be to implement the following highly
requested improvements to code search:
1. [Exact match (substring match) code searches in advanced search](https://gitlab.com/gitlab-org/gitlab/-/issues/325234)
1. [Support regular expressions with Advanced Global Search](https://gitlab.com/gitlab-org/gitlab/-/issues/4175)
1. [Support multiple line matches in the same file](https://gitlab.com/gitlab-org/gitlab/-/issues/668)
The initial phases of the rollout will be designed to catch and resolve scaling
or infrastructure cost issues as early as possible so that we can pivot early
before investing too much in this technology if it is not suitable.
### Non-Goals
The following are not goals initially but could theoretically be built upon
this solution:
1. Improving security scanning features by having access to quickly perform
regex scans across many repositories
1. Saving money on our search infrastructure - this may be possible with
further optimizations, but initial estimates suggest the cost is similar
1. AI/ML features of search used to predict what users might be interested in
finding
1. Code Intelligence and Navigation - likely code intelligence and navigation
features should be built on structured data rather than a trigram index but
regex based searches (using Zoekt) may be a suitable fallback for code which
does not have structured metadata enabled or dynamic languages where static
analysis is not very accurate. Zoekt in particular may not be well suited
initially, despite existing symbol extraction using ctags, because ctags
symbols may not contain enough data for accurate navigation and Zoekt
doesn't understand dependencies which would be necessary for cross-project
navigation.
## Proposal
An
[initial implementation of a Zoekt integration](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/105049)
was created to demonstrate the feasibility of using Zoekt as a drop-in
replacement for Elasticsearch code searches. This blueprint will extend on all
the details needed to provide a minimum viable change as well as steps needed to
scale this to a larger customer rollout on GitLab.com.
## Design and implementation details
### User Experience
When a user performs an advanced search on a group or project that is part
of the Zoekt rollout we will present a toggle somewhere in the UI to change
to "precise search" (or some other UX TBD) which switches them from
Elasticsearch to Zoekt. Early user feedback will help us assess the best way
to present these choices to users and ultimately we will want to remove the
Elasticsearch option if we find Zoekt is a suitable long term option.
### Indexing
Similar to our Elasticsearch integration, GitLab will notify Zoekt every time
there are updates to a repository. We've introduced a new indexer called
[`gitlab-zoekt-indexer`](https://gitlab.com/gitlab-org/gitlab-zoekt-indexer) and
we are going to replace the legacy indexer that needs to clone repositories with it.
The new indexer expects a payload with all required information to connect to
Gitaly in order to index the repository.
The rails side of the integration will be a Sidekiq worker that is scheduled
every time there is an update to a repository and it will simply call this
`/indexer/index` endpoint in Zoekt. This will also need to send a Gitaly token
that can allow Zoekt to connect to Gitaly.
We're going to encrypt the connection with SSL and add basic auth in [Add authentication for GitLab -> Zoekt HTTP calls](https://gitlab.com/gitlab-org/gitlab/-/issues/389749)
before enabling the new indexer since it receives Gitaly secrets from GitLab.
```mermaid
sequenceDiagram
participant user as User
participant gitaly as Gitaly
participant gitlab_sidekiq as GitLab Sidekiq
participant zoekt as Zoekt
user->>gitlab_git: git push git@gitlab.com:gitlab-org/gitlab.git
gitlab_git->>gitlab_sidekiq: ZoektIndexerWorker.perform_async(278964)
gitlab_sidekiq->>zoekt: POST /indexer/index {"GitalyConnectionInfo": {"Address": "tcp://gitaly:2305", "Storage": "default", "Token": "secret_token", "Path": "@hashed/a/b/c.git"}, "RepoId":7}
zoekt->>gitaly: go gitaly client
```
The Sidekiq worker can leverage de-duplication based on the `project_id`.
Zoekt supports indexing multiple projects. We'll likely need to, eventually,
allow a way for users to configure additional branches (beyond the default
branch) and this will need to be sent to Zoekt. We will need to decide if these
branch lists are sent every time we index the project or only when they change
configuration.
There may be race conditions with multiple Zoekt processes indexing the same
repo at the same time. For this reason we should implement a locking mechanism
somewhere to ensure we are only indexing 1 project in 1 place at a time. We
could make use of the same Redis locking we use for indexing projects in
Elasticsearch.
### Searching
Searching will be implemented using the `/api/search` functionality in
Zoekt. There is also
[an open PR to fix this endpoint in Zoekt](https://github.com/sourcegraph/zoekt/pull/506),
and again we may consider working from a fork until this is fixed. GitLab will
prepend all searches with the appropriate filter for repositories based on the
user's search context (group or project) in the same way we do for
Elasticsearch. For Zoekt this will be implemented as a query string regex that
matches all the searched repositories.
### Zoekt infrastructure
Each Zoekt node will need to run a
[`gitlab-zoekt-indexer`](https://gitlab.com/gitlab-org/gitlab-zoekt-indexer/-/blob/main/cmd/gitlab-zoekt-indexer/main.go)
and a
[`zoekt-webserver`](https://github.com/sourcegraph/zoekt/blob/main/cmd/zoekt-webserver/main.go).
These are both webservers with different responsibilities.
The actual `.zoekt` index files will be stored on an SSD for fast searches.
These web servers need to run on the same node as they access the same files.
The `gitlab-zoekt-indexer` is responsible for writing the `.zoekt` index files.
The `zoekt-webserver` is responsible for responding to searches that it performs
by reading these `.zoekt` index files.
### Rollout strategy
Initially Zoekt code search will only be available to `gitlab-org`. After that
we'll start rolling it out to specific customers that have requested better
code search experience. As we learn about scaling and make improvements we will
gradually roll it out to all licensed groups on GitLab.com. We will use a
similar approach to Elasticsearch for keeping track of which groups are indexed
and which are not. This will be based on a new table `zoekt_indexed_namespaces`
with a `namespace_id` reference. We will only allow rolling out to top level
namespaces to simplify the logic of checking for all layers of group
inheritance. Once we've rolled out to all licensed groups we'll enable logic to
automatically enroll newly licensed groups. This table also may be a place to
store per-namespace sharding and replication data as described below.
### Sharding and replication strategy
Zoekt does not have any inbuilt sharding, and we expect that we'll need
multiple Zoekt servers to reach the scale to provide search functionality to
all of GitLab licensed customers.
There are 2 clear ways to implement sharding:
1. Build it on top of, or in front of Zoekt, as an independent component. Building
all the complexities of a distributed database into Zoekt is not likely to
be a good direction for the project so most likely this would be an
independent piece of infrastructure that proxied requests to the correct
shard.
1. Manage the shards inside GitLab. This would be an application layer in
GitLab which chooses the correct shard to send indexing and search requests
to.
Likewise, there are a few ways to implement replication:
1. Server-side where Zoekt replicas are aware of other Zoekt replicas and they
stream updates from some primary to remain in sync
1. Client-side replication where clients send indexing requests to all replicas
and search requests to any replica
We plan to implement sharding inside GitLab application but replication may be
best served at the level of the filesystem of Zoekt servers rather than sending
duplicated updates from GitLab to all replicas. This could be some process on
Zoekt servers that monitors for changes to the `.zoekt` files in a specific
directory and syncs those updates to the replicas. This will need to be
slightly more sophisticated than `rsync` because the files are constantly
changing and files may be getting deleted while the sync is happening so we
would want to be syncing the updates in batches somehow without slowing down
indexing.
Implementing sharding in GitLab simplifies the additional infrastructure
components that need to be deployed and allows more flexibility to control our
rollout to many customers alongside our rollout of multiple shards.
Implementing syncing from primary -> replica on Zoekt nodes at the filesystem
level optimizes that overall resource usage. We only need to sync the index
files to replicas as the bare repo is just a cache. This saves on:
1. Disk space on replicas
1. CPU usage on replicas as it does not need to rebuild the index
1. Load on Gitaly to clone the repos
We plan to defer the implementation of these high availability aspects until
later, but a preliminary plan would be:
1. GitLab is configured with a pool of Zoekt servers
1. GitLab randomly assigns each group a Zoekt primary server
1. There will also be Zoekt replica servers
1. Periodically Zoekt primary servers will sync their `.zoekt` index files to
their respective replicas
1. There will need to be some process by which to promote a replica to a
primary if the primary is having issues. We will be using Consul for
keeping track of which is the primary and which are the replicas.
1. When indexing a project GitLab will queue a Sidekiq job to update the index
on the primary
1. When searching we will randomly select one of the Zoekt primaries or replica
servers for the group being searched. We don't care which is "more up to
date" as code search will be "eventually consistent" and all reads may read
slightly out of date indexes. We will have a target of maximum latency of
index updates and may consider removing nodes from rotation if they are too
far out of date.
1. We will shard everything by top level group as this ensures group search can
always search a single Zoekt server. Aggregation may be possible for global
searches at some point in future if this turns out to be important. Smaller
self-managed instances may use a single Zoekt server allowing global
searches to work without any aggregation being implemented. Depending on our
largest group sizes and scaling limitations of a single node Zoekt server we
may consider implementing an approach where a group can be assigned multiple
shards.
The downside of the chosen path will be added complexity of managing all these
Zoekt servers from GitLab when compared with a "proxy" layer outside of GitLab
that is managing all of these shards. We will consider this decision a work in
progress and reassess if it turns out to add too much complexity to GitLab.
#### Sharding proposal using GitLab `::Zoekt::Shard` model
This is already implemented as the `::Zoekt::IndexedNamespace`
implements a many-to-many relationship between namespaces and shards.
#### Sharding proposal with self-registering Zoekt nodes
This proposal is mostly inspired by GitLab Runner's architecture with the main difference
that the communication is bidirectional. We've arrived at this after discussions in [Zoekt Sharding and Replication](https://gitlab.com/gitlab-org/gitlab/-/issues/419900).
##### Alternatives we've considered
We've considered different options for where to manage the Zoekt cluster state including Raft and Zoekt's own database. We decided that there are many benefits to having the whole cluster state managed by GitLab instead of Zoekt so we're opting to keep Zoekt nodes as naive as possible.
The main benefits are:
1. The deployment cycle for GitLab is faster than Zoekt which requires many version bumps across many projects
1. GitLab already has lots of tooling that can be used for managing the state that we are already familiar with including Postgres, Redis, Sidekiq and others
1. The engineers mainly working on this project have much more experience with Rails than Go and spend more time writing Rails code than Go code as the other search features are mostly in Rails
Some of those benefits could also be seen as downsides and maybe not the right choice for different projects owned by different teams.
##### High level proposal
<img src="diagrams/sharding_proposal_2023-08.drawio.png" height="600" alt="">
1. Zoekt nodes are started with 3 additional arguments: its own address, shard name, and GitLab URL.
1. We'd like to keep shard name separate so that one will be able to migrate a shard to a different address.
1. When Zoekt is running in k8s, we can pass `hostname --fqdn` (for example, `gitlab-zoekt-1.gitlab-zoekt.default.svc.cluster.local`) as an argument for the address. Customers running Zoekt on bare-metal will need to configure it separately.
1. Zoekt most likely will use [Internal API](../../../development/internal_api/index.md) to connect to GitLab. We might also want to use a separate GitLab URL to keep the traffic internal and to avoid extra traffic cost.
1. GitLab will maintain a lookup table with `last_seen_at` and shard's name (we could expand `::Zoekt::Shard`). We'll also need to introduce the concept of replicas and primaries.
1. Zoekt nodes (indexers in this case) will send periodic requests to get new jobs with its address and name to the configured GitLab URL. GitLab will either register a new node or update the existing record in the lookup table.
1. After the job is completed, `zoekt-indexer` will send a callback to GitLab to indicate that the job has been completed.
1. If after a specified time GitLab doesn't receive a request, it can reassign namespaces to different shards and mark the missing shard as unavailable.
1. When executing searches, we can round-robin requests to primaries and replicas. We might even want to implement retries. For example, if a request to primary fails, we send another request to replica right away or vice versa. Here is a related issue: [Consider circuit breaker for Zoekt code search](https://gitlab.com/gitlab-org/gitlab/-/issues/393445).
1. Initially, we might want to skip replication until we implement efficiently moving and copying index files between shards (rsync for example).
1. Rebalancing most likely will happen in a cron Sidekiq worker, which will consider if an indexed namespace has enough replicas as well as available storage.
An example of command we might consider running in k8s:
```shell
./gitlab-zoekt-indexer -index_dir=/data/index -shard_name=`hostname` -address=`hostname --fqdn`
```
When we add more replicas to the stateful set, it should automatically handle addresses and shard names. For example:
- `gitlab-zoekt-0` / `gitlab-zoekt-0.gitlab-zoekt.default.svc.cluster.local`
- `gitlab-zoekt-1` / `gitlab-zoekt-1.gitlab-zoekt.default.svc.cluster.local`
- ...
Possible jobs indexer can receive:
- `index_repositories(ids: [1,2,3,4])`
- `delete_repositories(ids: [5,6])`
- `copy_index(from: 'gitlab-zoekt-0', to: 'gitlab-zoekt-1', repo_id: 4)`
#### Replication and service discovery using Consul
If we plan to replicate at the Zoekt node level as described above we need to
change our data model to use a one-to-many relationship from `zoekt_shards -> namespaces`.
This means making the `namespace_id` column unique in
`zoekt_indexed_namespaces`. Then we need to implement a service discovery
approach where the `index_url` always points at a primary Zoekt node and the
`search_url` is a DNS record with N replicas and the primary. We then choose
randomly from `search_url` records when searching.
### Iterations
1. Make available for `gitlab-org`
1. Improve monitoring
1. Improve performance
1. Make available for select customers
1. Implement sharding
1. Implement replication
1. Make available to many more licensed groups
1. Implement automatic (re)balancing of shards
1. Estimate costs for rolling out to all licensed groups and decide if it's worth it or if we need to optimize further or adjust our plan
1. Rollout to all licensed groups
1. Improve performance
1. Assess costs and decide whether we should roll out to all free customers
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,247 +1,11 @@
---
status: ongoing
creation-date: "2021-02-07"
authors: [ "@alexpooley", "@ifarkas" ]
coach: "@grzesiek"
approvers: [ "@m_gill", "@mushakov" ]
author-stage: "~devops::plan"
owning-stage: "~devops::data stores"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/consolidating_groups_and_projects/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/consolidating_groups_and_projects/).
# Consolidating Groups and Projects
Numerous features exist exclusively within groups or projects. The boundary between group and project features used to be clear.
However, there is growing demand to have group features in projects, and
project features in groups. For example, having issues in groups, and epics
in projects.
The [Simplify Groups & Projects Working Group](https://handbook.gitlab.com/handbook/company/working-groups/simplify-groups-and-projects/)
determined that our architecture is a significant hurdle in sharing features
across groups and projects.
Architecture issue: <https://gitlab.com/gitlab-org/architecture/tasks/-/issues/7>
## Challenges
### Feature duplication
When a feature needs to be made available on a different level, we have
no established process in place. This results in the reimplementation of
the same feature. Those implementations diverge from each other over time as
they all live on their own. A few more problems with this approach:
- Features are coupled to their container. In practice, it is not
straightforward to decouple a feature from its container. The degree of coupling
varies across features.
- Naive duplication of features will result in a more complex and fragile codebase.
- Generalizing solutions across groups and projects may degrade system performance.
- The range of features spans across many teams, and these changes will need to
manage development interference.
- The group/project hierarchy creates a natural feature hierarchy. When features
exist across containers the feature hierarchy becomes ambiguous.
- Duplication of features slows down development velocity.
There is potential for significant architectural changes. These changes will
have to be independent of the product design, so that customer experience
remains consistent.
### Performance
Resources can only be queried in elaborate/complicated ways. This caused
performance issues with authorization, epics, and many other places. As an
example, to query the projects a user has access to, the following sources need
to be considered:
- Personal projects
- Direct group membership
- Direct project membership
- Inherited group membership
- Inherited project membership
- Group sharing
- Inherited membership via group sharing
- Project sharing
Group/project membership, group/project sharing are also examples of
duplicated features.
## Goals
For now, this blueprint strictly relates to the engineering challenges.
- Consolidate the group and project container architecture.
- Develop a set of solutions to decouple features from their container.
- Decouple engineering changes from product changes.
- Develop a strategy to make architectural changes without adversely affecting
other teams.
- Provide a solution for requests asking for features to be made available at other levels.
## Proposal
Use our existing `Namespace` model as a container for features. We already have
a `Namespace` associated with `User` (personal namespace), and with `Group`
(which is a subclass of `Namespace`). We can extend this further, by associating
`Namespace` with `Projects` by introducing `ProjectNamespaces`. Each `Project`
should be owned by its `ProjectNamespace`, and this relation should replace the
existing `Project` <-> `Group` / personal namespace relation.
We also lack a model specific for personal namespaces, and we use the generic
`Namespace` model instead. This is confusing, but can be fixed by creating a
dedicated subclass: `UserNamespace`.
As a result, the `Namespace` hierarchy will transition to:
```mermaid
classDiagram
Namespace <|-- UserNamespace
Namespace <|-- Group
Namespace <|-- ProjectNamespace
```
New features should be implemented on `Namespace`. Similarly, when a feature
needs to be reimplemented on a different level, moving it to `Namespace`
essentially makes it available on all levels:
- Personal namespaces
- Groups
- Projects
Various traversal queries are already available on `Namespaces` to query the
group hierarchy. `Projects` represent the leaf nodes in the hierarchy, but with
the introduction of `ProjectNamespace`, these traversal queries can be used to
retrieve projects as well.
This also enables further simplification of some of our core features:
- Routes should be generated based on the `Namespace` hierarchy, instead of
mixing the project with the group hierarchy.
- There is no need to differentiate between `GroupMembers` and `ProjectMembers`.
All `Members` should be related to a `Namespace`. This can lead to simplified
querying, and potentially deduplicating policies.
As more and more features will be migrated to `Namespace`, the role of the `Project`
model will diminish over time to essentially a container around the repository
related functionality.
## Iterations
The work required to establish `Namespace` as a container for our features is
tracked under [Consolidate Groups and Projects](https://gitlab.com/groups/gitlab-org/-/epics/6473)
epic.
### Phase 1 (complete)
- [Phase 1 epic](https://gitlab.com/groups/gitlab-org/-/epics/6697).
- **Goals**:
1. Ensure every project receives a corresponding record in the `namespaces`
table with `type='Project'`.
1. For user namespaces, the type changes from `NULL` to `User`.
We should make sure that projects, and the project namespace, are equivalent:
- **Create project:** Use Rails callbacks to ensure a new project namespace is
created for each project. Project namespace records should contain `created_at` and
`updated_at` attributes equal to the project's `created_at`/`updated_at` attributes.
- **Update project:** Use the `after_save` callback in Rails to ensure some
attributes are kept in sync between project and project namespaces.
Read [`project#after_save`](https://gitlab.com/gitlab-org/gitlab/blob/6d26634e864d7b748dda0e283eb2477362263bc3/app/models/project.rb#L101-L101)
for more information.
- **Delete project:** Use FKs cascade delete or Rails callbacks to ensure when a `Project`
or its `ProjectNamespace` is removed, its corresponding `ProjectNamespace` or `Project`
is also removed.
- **Transfer project to a different group:** Make sure that when a project is transferred,
its corresponding project namespace is transferred to the same group.
- **Transfer group:** Make sure when transferring a group that all of its sub-projects,
either direct or through descendant groups, have their corresponding project
namespaces transferred correctly as well.
- **Export or import project**
- **Export project** continues to export only the project, and not its project namespace,
in this phase. The project namespace does not contain any specific information
to export at this point. Eventually, we want the project namespace to be exported as well.
- **Import project** creates a new project, so the project namespace is created through
Rails `after_save` callback on the project model.
- **Export or import group:** When importing or exporting a `Group`, projects are not
included in the operation. If that feature is changed to include `Project` when its group is
imported or exported, the logic must include their corresponding project namespaces
in the import or export.
After ensuring these points, run a database migration to create a `ProjectNamespace`
record for every `Project`. Project namespace records created during the migration
should have `created_at` and `updated_at` attributes set to the migration runtime.
The project namespaces' `created_at` and `updated_at` attributes would not match
their corresponding project's `created_at` and `updated_at` attributes. We want
the different dates to help audit any of the created project namespaces, in case we need it.
After this work completes, we must migrate data as described in
[Backfill `ProjectNamespace` for every Project](https://gitlab.com/gitlab-org/gitlab/-/issues/337100).
### Phase 2 (complete)
- [Phase 2 epic](https://gitlab.com/groups/gitlab-org/-/epics/6768).
- **Goal**: Link `ProjectNamespace` to other entities on the database level.
In this phase:
- Communicate the changes company-wide at the engineering level. We want to make
engineers aware of the upcoming changes, even though teams are not expected to
collaborate actively until phase 3.
- Raise awareness to avoid regressions and conflicting or duplicate work that
can be dealt with before phase 3.
### Phase 3 (ongoing)
- [Phase 3 epic](https://gitlab.com/groups/gitlab-org/-/epics/6585).
In this phase we are migrating basic, high-priority project functionality from `Project` to `ProjectNamespace`, or directly to `Namespace`. Problems to solve as part of this phase:
- [Unify members/members actions](https://gitlab.com/groups/gitlab-org/-/epics/8010) - on UI and API level.
- Starring: Right now only projects can be starred. We want to bring this to the group level.
- Common actions: Destroying, transferring, restoring. This can be unified on the controller level and then propagated lower.
- Archiving currently only works on the project level. This can be brought to the group level, similar to the mechanism for "pending deletion".
- Avatar's serving and actions.
### Phase 4
- [Phase 4 epic](https://gitlab.com/groups/gitlab-org/-/epics/8687)
In this phase we are migrating additional functionality from `Project` to `ProjectNamespace`/`Namespace`:
- Replace usages of `Project` with `ProjectNamespace` in the code.
- API changes to expose namespaces and namespace features.
- Investigate if we extend API for `groups` or we introduce a `namespaces` endpoint and slowly deprecate `groups` and `projects` endpoints.
- Break down each feature that needs to be migrated from `Project` to `ProjectNamespace` or `Namespace`.
- Investigate if we can move a feature from `Project -> Namespace` directly vs `Project -> ProjectNamespace -> Namespace`. This can be decided on a feature by feature case.
- [Migrate Project#namespace to reference ProjectNamespace](https://gitlab.com/groups/gitlab-org/-/epics/6581).
- [Routes consolidation between Project & ProjectNamespace](https://gitlab.com/gitlab-org/gitlab/-/issues/337103).
- [Policies consolidation](https://gitlab.com/groups/gitlab-org/-/epics/6689).
### Phase 5
- [Phase 5 epic](https://gitlab.com/groups/gitlab-org/-/epics/6944)
We should strive to do the code clean up as we move through the phases. However, not everything can be cleaned up while something is still being developed. For example, dropping database columns can be done as the last task when we are sure everything is working. This phase will focus on:
- Code cleanup
- Database cleanup
## Migrating features to Namespaces
The initial iteration will provide a framework to house features under `Namespaces`. Stage groups will eventually need to migrate their own features and functionality over to `Namespaces`. This may impact these features in unexpected ways. Therefore, to minimize UX debt and maintain product consistency, stage groups will have to consider several factors when migrating their features over to `Namespaces`:
1. **Conceptual model**: What are the current and future state conceptual models of these features ([see object modeling for designers](https://hpadkisson.medium.com/object-modeling-for-designers-an-introduction-7871bdcf8baf))? These should be documented in Pajamas (example: [merge requests](https://design.gitlab.com/objects/merge-request/)).
1. **Merge conflicts**: What inconsistencies are there across project, group, and administrator levels? How might these be addressed? For an example of how we rationalized this for labels, see [this issue](https://gitlab.com/gitlab-org/gitlab/-/issues/338820).
1. **Inheritance & information flow**: How is information inherited across our container hierarchy currently? How might this be impacted if complying with the new [inheritance behavior](https://gitlab.com/gitlab-org/gitlab/-/issues/343316) framework?
1. **Settings**: Where can settings for this feature be found currently? How will these be impacted by `Namespaces`?
1. **Access**: Who can access this feature and is that impacted by the new container structure? Are there any role or privacy considerations?
1. **Tier**: Is there any tier functionality that is differentiated by projects and groups?
1. **Documentation**: Is the structure and content of documentation impacted by these changes at all?
1. **Solution proposal**:
- Think big: This analysis provides a great opportunity to zoom out and consider the feature UX as a whole. How could you make this feature lovable based on the new structure, inheritance, and capabilities afforded by `Namespaces`? Is there any UI which doesn't comply with Pajamas?
- Start small: What are the product changes that need to be made to assist with the migration?
- Move fast: Prioritize these solution ideas, document in issues, and create a roadmap for implementation.
## Related topics
- [Organization developer documentation](../../../development/organization/index.md)
- [Organization user documentation](../../../user/organization/index.md)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,221 +1,11 @@
---
status: proposed
creation-date: "2023-05-19"
authors: [ "@jcai-gitlab", "@toon" ]
coach: [ ]
approvers: [ ]
owning-stage: "~devops::systems"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/git_data_offloading/'
remove_date: '2025-07-08'
---
# Offload data to cheaper storage
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/git_data_offloading/).
## Summary
Managing Git data storage costs is a critical part of our business. Offloading
Git data to cheaper storage can save on storage cost.
## Motivation
At GitLab, we keep most Git data stored on SSDs to keep data access fast. This
makes sense for data that we frequently need to access. However, given that
storage costs scale with data growth, we can be a lot smarter about what kinds
of data we keep on SSDs and what kinds of data we can afford to offload to
cheaper storage.
For example, large files (or in Git nomenclature, "blobs") are not frequently
modified since they are usually non-text files (images, videos, binaries, etc).
Often, [Git LFS](https://git-lfs.com/) is used for repositories that contain
these large blobs in order to avoid having to push a large file onto the Git
server. However, this relies on client side setup.
Or, if a project is "stale" and hasn't been accessed in a long time, there is no
need to keep paying for fast storage for that project.
Instead, we can choose to put **all** blobs of that stale project onto cheaper
storage. This way, the application still has access to the commit history and
trees so the project browsing experience is not affected, but all files are on
slower storage since they are rarely accessed.
If we had a way to separate Git data into different categories, we could then
offload certain data to a secondary location that is cheaper. For example, we
could separate large files that may not be accessed as frequently from the rest
of the Git data and save it to an HDD rather than an SSD mount.
## Requirements
There are a set of requirements and invariants that must be given for any
particular solution.
### Saves storage cost
Ultimately, this solution needs to save on storage cost. Separating out certain
Git data for cheaper storage can go towards this savings.
We need to evaluate the solution's added cost against the projected savings from
offloading data to cheaper storage. Here are some criteria to consider:
- How much money would we save if all large blobs larger than X were put on HDD?
- How much money would we save if all stale projects had their blobs on HDD?
- What's the operational overhead for running the offloading mechanism in terms
of additional CPU/Memory cost?
- What's the network overhead? e.g. is there an extra roundtrip to a different
node via the network to retrieve large blobs.
- Access cost, e.g. when blobs would be stored in an object store.
### Opaque to downstream consumers of Gitaly
This feature is purely storage optimization and, except for potential
performance slowdown, shouldn't affect downstream consumers of Gitaly. For
example, the GitLab application should not have to change any of its logic in
order to support this feature.
This feature should be completely invisible to any callers of Gitaly. Rails or
any consumer should not need to know about this or manage it in any way.
### Operationally Simple
When working with Git data, we want to keep things as simple as possible to
minimize risk of repository corruption. Keep things operationally simple and
keep moving pieces outside of Git itself to a minimum. Any logic that modifies
repository data should be upstreamed in Git itself.
## Proposal
We will maintain a separate object database for each repository connected
through the [Git alternates mechanism](https://git-scm.com/docs/gitrepository-layout#Documentation/gitrepository-layout.txt-objects).
We can choose to filter out certain Git objects for this secondary object
database (ODB).
Place Git data into this secondary ODB based on a filter. We have
options based on [filters in Git](https://git-scm.com/docs/git-rev-list#Documentation/git-rev-list.txt---filterltfilter-specgt).
We can choose to place large blobs based on some limit into a secondary ODB, or
we can choose to place all blobs onto the secondary ODB.
## Design and implementation details
### Git
We need to add a feature to `git-repack(1)` that will allow us to segment
different kinds of blobs into different object databases. We're tracking this
effort in [this issue](https://gitlab.com/gitlab-org/git/-/issues/159).
### Gitaly
During Gitaly housekeeping, we can do the following:
1. Use `git-repack(1)` to write packfiles into both the main repository's object
database, and a secondary object database. Each repository has its own
secondary object database for offloading blobs based on some criteria.
1. Ensure the `.git/objects/info/alternates` file points to the secondary
object database from step (1).
### Criteria
Whether objects are offloaded to another object database can be determined based
on one or many of the following criteria.
#### By Tier
Free projects might have many blobs offloaded to cheaper storage, while Ultimate
projects have all their objects placed on the fastest storage.
#### By history
If a blob was added a long time ago and is not referred to by any recent commit,
it can get offloaded, while new blobs remain on the main ODB.
#### By size
Large blobs are a quick win to reduce the expensive storage size, so they might
get prioritized to move to cheaper storage.
#### Frequency of Access
Frequently used projects might remain fully on fast storage, while inactive
projects might have their blobs offloaded.
### Open Questions
#### How do we delete objects?
When we want to delete an unreachable object, the repack would need to be aware
of both ODBs and be able to evict unreachable objects regardless of whether
the objects are in the main ODB or in the secondary ODB. This picture is
complicated if the main ODB also has an [object pool](https://gitlab.com/gitlab-org/gitaly/-/blob/master/doc/object_pools.md)
ODB, since we wouldn't ever want to delete an object from the pool ODB.
#### Potential Solution: Modify Git to delete an object from alternates
We would need to modify repack to give it the ability to delete unreachable
objects in alternate ODBs. We could add repack configs `repack.alternates.*`
that tell it how to behave with alternate directories. For example, we could
have `repack.alternates.explodeUnreachable`, which indicates to repack that it
should behave like `-A` in any alternate ODB it is linked to.
#### How does this work with object pools?
When we use alternates, how does this interact with object pools? Does the
object pool also offload data to secondary storage? What about the object pool members?
In the most complex case this means that a single repository has four different
object databases, which may increase complexity.
Possibly we can mark some packfiles as "keep", using the
[--keep-pack](https://git-scm.com/docs/git-pack-objects#Documentation/git-pack-objects.txt---keep-packltpack-namegt)
and
[--honor-pack-keep](https://git-scm.com/docs/git-pack-objects#Documentation/git-pack-objects.txt---honor-pack-keep)
options.
#### Potential Solution: Do not allow object pools to offload their blobs
For the sake of not adding too much complexity, we could decide that object
pools will not offload their blobs. Instead, we can design housekeeping to
offload blobs from the repository before deduplicating with the object pool.
Theoretically, this means that offloaded blobs will not end up in the object
pool.
#### How will this work with Raft + WAL?
How will this mechanism interact with Raft and the write-ahead log?
The WAL uses hard-links and copy-free moves, to avoid slow copy operations. But
that does not work across different file systems. At some point repacks and such
will likely also go through the log. Transferring data between file systems can
lead to delays in transaction processing.
Ideally we keep the use of an alternate internal to the node and not have to
leak this complexity to the rest of the cluster. This is a challenge, given we
have to consider available space when making placement decisions. It's possible
to keep this internal by only showing the lower capacity of the two storages,
but that could also lead to inefficient storage use.
## Problems with the design
### Added complexity
The fact that we are adding another object pool to the mix adds complexity to
the system, and especially with repository replication since we are adding yet
another place to replicate data to.
### Possible change in cost over time
The cost of the different storage types might change over time. To anticipate
for this, it should be easy to adapt to such changes.
### More points of failure
Having some blobs on a separate storage device adds one more failure scenario
where the device hosting the large blobs may fail.
## Alternative Solutions
### Placing entire projects onto cheaper storage
Instead of placing Git data onto cheaper storage, the Rails application could
choose to move a project in its entirety to a mounted HDD drive.
#### Possible optimization
Giving these machines with cheaper storage extra RAM might help to deal with the
slow read/write speeds due to the use of page cache. It's not certain, though,
that this will turn out to be cheaper overall.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,373 +1,11 @@
---
status: proposed
creation-date: "2023-05-30"
authors: [ "@qmnguyen0711" ]
approvers: [ ]
owning-stage: "~devops::enablement"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitaly_adaptive_concurrency_limit/'
remove_date: '2025-07-08'
---
# Gitaly Adaptive Concurrency Limit
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitaly_adaptive_concurrency_limit/).
## Summary
Gitaly, a Git server, needs to push back on its clients to reduce the risk of
incidents. In the past, we introduced per-RPC concurrency limit and pack-objects
concurrency limit. Both systems were successful, but the configurations were
static, leading to some serious drawbacks. This blueprint proposes an adaptive
concurrency limit system to overcome the drawbacks of static limits. The
algorithm primarily uses the [Additive Increase/Multiplicative Decrease](https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease)
approach to gradually increase the limit during normal processing but quickly
reduce it during an incident. The algorithm focuses on lack of resources and
serious latency degradation as criteria for determining when Gitaly is in
trouble.
## Motivation
To reduce the risk of incidents and protect itself, Gitaly should be able to
push back on its clients when it determines some limits have been reached. In
the [prior attempt](https://gitlab.com/groups/gitlab-org/-/epics/7891), we laid
out some foundations for [backpressure](https://gitlab.com/gitlab-org/gitaly/-/blob/382d1e57b2cf02763d3d65e31ff4d38f467b797c/doc/backpressure.md)
by introducing two systems: per-RPC concurrency limits and pack-objects
concurrency limits.
Per-RPC concurrency limits allows us to configure a maximum amount of in-flight
requests simultaneously. It scopes the limit by RPC and repository. Pack-objects
concurrency limit restricts the concurrent Git data transfer request by IP. One
note, the pack-objects concurrency limit is applied on cache misses, only. If
this limit is exceeded, the request is either put in a queue or rejected if the
queue is full. If the request remains in the queue for too long, it will also be
rejected.
Although both of them yielded promising results on GitLab.com, the
configurations, especially the value of the concurrency limit, are static. There
are some drawbacks to this:
- It's tedious to maintain a sane value for the concurrency limit. Looking at
this [production configuration](https://gitlab.com/gitlab-com/gl-infra/chef-repo/-/blob/db11ef95859e42d656bb116c817402635e946a32/roles/gprd-base-stor-gitaly-common.json),
each limit is heavily calibrated based on clues from different sources. When the
overall scene changes, we need to tweak them again.
- Static limits are not good for all usage patterns. It's not feasible to pick a
fit-them-all value. If the limit is too low, big users will be affected. If the
value is too loose, the protection effect is lost.
- A request may be rejected even though the server is idle as the rate is not
necessarily an indicator of the load induced on the server.
To overcome all of those drawbacks while keeping the benefits of concurrency
limiting, one promising solution is to make the concurrency limit adaptive to
the currently available processing capacity of the node. We call this proposed
new mode "Adaptive Concurrency Limit".
## Goals
- Make Gitaly smarter in push-back traffic when it's under heavy load, thus enhancing the reliability and resiliency of Gitaly.
- Minimize the occurrences of Gitaly saturation incidents.
- Decrease the possibility of clients inaccurately reaching the concurrency limit, thereby reducing the ResourceExhausted error rate.
- Facilitate seamless or fully automated calibration of the concurrency limit.
## Non-goals
- Increase the workload or complexity of the system for users or administrators. The adaptiveness proposed here aims for the opposite.
## Proposal
The proposed Adaptive Concurrency Limit algorithm primarily uses the Additive
Increase/Multiplicative Decrease ([AIMD](https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease))
approach. This method involves gradually increasing the limit during normal
process functioning but quickly reducing it when an issue (backoff event)
occurs. There are various criteria for determining whether Gitaly is in trouble.
In this proposal, we focus on two things:
- Lack of resources, particularly memory and CPU, which are essential for
handling Git processes.
- Serious latency degradation.
The proposed solution is heavily inspired by many materials about this subject
shared by folks from other companies in the industry, especially the following:
- TCP Congestion Control ([RFC-2581](https://www.rfc-editor.org/rfc/rfc2581), [RFC-5681](https://www.rfc-editor.org/rfc/rfc5681),
[RFC-9293](https://www.rfc-editor.org/rfc/rfc9293.html#name-tcp-congestion-control), [Computer Networks: A Systems Approach](https://book.systemsapproach.org/congestion/tcpcc.html)).
- Netflix adaptive concurrency limit ([blog post](https://tech.olx.com/load-shedding-with-nginx-using-adaptive-concurrency-control-part-1-e59c7da6a6df)
and [implementation](https://github.com/Netflix/concurrency-limits))
- Envoy Adaptive Concurrency
([doc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/adaptive_concurrency_filter#config-http-filters-adaptive-concurrency))
We cannot blindly apply a solution without careful consideration and expect it
to function flawlessly. The suggested approach considers Gitaly's specific
constraints and distinguishing features, including cgroup utilization and
upload-pack RPC, among others.
The proposed solution does not aim to replace the existing limits in Gitaly
for [RPC concurrency](../../../administration/gitaly/concurrency_limiting.md#limit-rpc-concurrency)
and [pack object concurrency](../../../administration/gitaly/concurrency_limiting.md#limit-pack-objects-concurrency),
but automatically tweak the parameters. This means
that other aspects, such as queuing, in-queue timeout, queue length,
partitioning, and scoping, will remain unchanged. The proposed solution only
focuses on modifying the current **value** of the concurrency limit.
## Design and implementation details
### AIMD Algorithm
The Adaptive Concurrency Limit algorithm primarily uses the Additive
Increase/Multiplicative Decrease ([AIMD](https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease))
approach. This method involves gradually increasing the limit during normal
process functioning but quickly reducing it when an issue occurs.
During initialization, we configure the following parameters:
- `initialLimit`: Concurrency limit to start with. This value is essentially
equal to the current static concurrency limit.
- `maxLimit`: Maximum concurrency limit.
- `minLimit`: Minimum concurrency limit so that the process is considered as
functioning. If it's equal to 0, it rejects all upcoming requests.
- `backoffFactor`: how fast the limit decreases when a backoff event occurs (`0 < backoff < 1`, default to `0.75`)
When the Gitaly process starts, it sets `limit = initialLimit`, in which `limit`
is the maximum in-flight requests allowed at a time.
Periodically, maybe once per 15 seconds, the value of the `limit` is
re-calibrated:
- `limit = limit + 1` if there is no backoff event since the last
calibration. The new limit cannot exceed `maxLimit`.
- `limit = limit * backoffFactor` otherwise. The new limit cannot be lower than
`minLimit`.
When a process can no longer handle requests or will not be able to handle them
soon, it is referred to as a back-off event. Ideally, we would love to see the
efficient state as long as possible. It's the state where Gitaly is at its
maximum capacity.
![Adaptive Concurrency Limit Flow](adaptive_concurrency_limit_flow.png)
Ideally, min/max values are safeguards that aren't ever meant to be hit during
operation, even overload. In fact, hitting either probably means that something
is wrong and the dynamic algorithms aren't working well enough.
### How requests are handled
The concurrency limit restricts the total number of in-flight requests (IFR) at
a time.
- When `IFR < limit`, Gitaly handles new requests without waiting. After an
increment, Gitaly immediately handles the subsequent request in the queue, if
any.
- When `IFR = limit`, it means the limit is reached. Subsequent requests are
queued, waiting for their turn. If the queue length reaches a configured limit,
Gitaly rejects new requests immediately. When a request stays in the queue long
enough, it is also automatically dropped by Gitaly.
- When `IFR > limit`, it's usually a consequence of backoff events. It
means Gitaly handles more requests than the newly appointed limits. In addition
to queueing upcoming requests similarly to the above case, Gitaly may start
load-shedding in-flight requests if this situation is not resolved long enough.
At several points in time we have discussed whether we want to change queueing
semantics. Right now we admit queued processes from the head of the queue
(FIFO), whereas it was proposed several times that it might be preferable to
admit processes from the back (LIFO).
Regardless of the rejection reason, the client receives a `ResourceExhausted`
response code as a signal that it should back off and retry later. Since most
direct clients of Gitaly are internal, especially GitLab Shell and Workhorse,
the actual users receive some friendly messages. Gitaly can attach
[exponential pushback headers](https://gitlab.com/gitlab-org/gitaly/-/issues/5023)
to force internal clients to back off. However, that's a bit brutal and may lead
to unexpected results. We can consider that later.
### Backoff events
Each system has its own set of signals, and in the case of Gitaly, there are two
aspects to consider:
- Lack of resources, particularly memory and CPU, which are essential for
handling Git processes like `git-pack-objects(1)`. When these resources are limited
or depleted, it doesn't make sense for Gitaly to accept more requests. Doing so
would worsen the saturation, and Gitaly addresses this issue by applying cgroups
extensively. The following section outlines how accounting can be carried out
using cgroup.
- Serious latency degradation. Gitaly offers various RPCs for different purposes
besides serving Git data that is hard to reason about latencies. A significant
overall latency decline is an indication that Gitaly should not accept more
requests. Another section below describes how to assert latency degradation
reasonably.
Apart from the above signals, we can consider adding more signals in the future
to make the system smarter. Some examples are Go garbage collector statistics,
networking stats, file descriptors, etc. Some companies have clever tricks, such
as [using time drifting to estimate CPU saturation](https://www.linkedin.com/blog/engineering/data-management/hodor-detecting-and-addressing-overload-in-linkedin-microservic).
#### Backoff events of Upload Pack RPCs
Upload Pack RPCs and their siblings PackObjects RPC are unique to Gitaly. They
are for the heaviest operations: transferring large volumes of Git data. Each
operation may take minutes or even hours to finish. The time span of each
operation depends on multiple factors, most notably the number of requested
objects and the internet speed of clients.
Thus, latency is a poor signal for determining the backoff event. This type of
RPC should only depend on resource accounting at this stage.
#### Backoff events of other RPCs
As stated above, Gitaly serves various RPCs for different purposes. They can
also vary in terms of acceptable latency as well as when to recognize latency
degradation. Fortunately, the current RPC concurrency limits implementation
scopes the configuration by RPC and repository individually. The latency signal
makes sense in this setting.
Apart from latency, resource usage also plays an important role. Hence, other
RPCs should use both latency measurement and resource accounting signals.
### Resource accounting with cgroup
The issue with saturation is typically not caused by Gitaly itself, but rather by the
spawned Git processes that handle most of the work. These processes are contained
within a [cgroup](https://gitlab.com/gitlab-org/gitaly/-/blob/382d1e57b2cf02763d3d65e31ff4d38f467b797c/doc/cgroups.md),
and the algorithm for bucketing cgroup can be
found [here](https://gitlab.com/gitlab-org/gitaly/-/blob/382d1e57b2cf02763d3d65e31ff4d38f467b797c/internal/cgroups/v1_linux.go#L166-166).
Typically, Gitaly selects the appropriate cgroup for a request based on the
target repository. There is also a parent cgroup to which all repository-level
cgroups belong.
Cgroup statistics are widely accessible. Gitaly can trivially fetch both
resource capacity and current resource consumption via the following information
in [cgroup control file](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt):
- `memory.limit_in_bytes`
- `memory.usage_in_bytes`
- `cpu.cfs_period_us`
- `cpu.cfs_quota_us`
- `cpuacct.usage`
Fetching those statistics may imply some overheads. It's not necessary to keep
them updated in real-time. Thus, they can be processed periodically in the limit
adjustment cycle.
In the past, cgroup has been reliable in preventing spawned processes from
exceeding their limits. It is generally safe to trust cgroup and allow processes
to run without interference. However, when the limits set by cgroup are reached
(at 100%), overloading can occur. This often leads to a range of issues such as
an increase in page faults, slow system calls, memory allocation problems, and
even out-of-memory kills. The consequences of such incidents are
highlighted in
[this example](https://gitlab.com/gitlab-com/gl-infra/production/-/issues/8713#note_1352403481). Inflight requests are significantly impacted, resulting in unacceptable delays,
timeouts, and even cancellations.
Besides, through various observations in the past, some Git processes such as
`git-pack-objects(1)` build up memory over time. When a wave of `git-pull(1)`
requests comes, the node can be easily filled up with various memory-hungry
processes. It's much better to stop this accumulation in the first place.
As a result, to avoid overloading, Gitaly employs a set of soft limits, such as
utilizing only 75% of memory capacity and 90% of CPU capacity instead of relying
on hard limits. Once these soft limits are reached, the concurrency adjuster
reduces the concurrency limit in a multiplicative manner. This strategy ensures
that the node has enough headroom to handle potential overloading events.
In theory, the cgroup hierarchy allows us to determine the overloading status
individually. Thus, Gitaly can adjust the concurrency limit for each repository
separately. However, this approach would be unnecessarily complicated in
practice. In contrast, it may lead to confusion for operators later.
As a good start, Gitaly recognizes an overloading event in _either_ condition:
- Soft limits of the parent cgroup are reached.
- Soft limits of **any** of the repository cgroups are reached.
It is logical for the second condition to be in place since a repository's
capacity limit can be significant to the parent cgroup's capacity. This means
that when the repository cgroup reaches its limit, fewer resources are available
for other cgroups. As a result, reducing the concurrency limit delays the
occurrence of overloading.
#### Latency measurement
When re-calibrating the concurrency limit, latency is taken into account for RPCs
other than Upload Pack. Two things to consider when measuring latencies:
- How to record latencies
- How to recognize a latency degradation
It is clear that a powerful gRPC server such as Gitaly has the capability to
manage numerous requests per second per node. A production server can serve up
to thousands of requests per second. Keeping track and storing response times in
a precise manner is not practical.
The heuristic determining whether the process is facing latency degradation is
interesting. The most naive solution is to pre-define a static latency
threshold. Each RPC may have a different threshold. Unfortunately, similar to
static concurrency limiting, it's challenging and tedious to pick a reasonable
up-to-date value.
Fortunately, there are some famous algorithms for this line of problems, mainly
applied in the world of TCP Congestion Control:
- Vegas Algorithm ([CN: ASA - Chapter 6.4](https://book.systemsapproach.org/congestion/avoidance.html), [Reference implementation](https://Github.com/Netflix/concurrency-limits/blob/master/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/VegasLimit.java))
- Gradient Algorithm ([Paper](https://link.springer.com/chapter/10.1007/978-3-642-20798-3_25), [Reference implementation](https://Github.com/Netflix/concurrency-limits/blob/master/concurrency-limits-core/src/main/java/com/netflix/concurrency/limits/limit/Gradient2Limit.java))
The two algorithms are capable of automatically determining the latency
threshold without any pre-defined configuration. They are highly efficient and
statistically reliable for real-world scenarios. In my opinion, both algorithms
are equally suitable for our specific use case.
### Load-shedding
Gitaly being stuck in the overloaded situation for too long can be denoted by
two signs:
- A certain amount of consecutive backoff events
- More in-flight requests than the concurrency limit for a certain amount of time
In such cases, a particular cgroup or the whole Gitaly node may become
unavailable temporarily. In-flight requests are likely to either be canceled or
timeout. On GitLab.com production, an incident is triggered and called for human
intervention. We can improve this situation by load-shedding.
This mechanism deliberately starts to kill in-flight requests selectively. The
main purpose is to prevent cascading failure of all inflight requests.
Hopefully, after some of them are dropped, the cgroup/node can recover back to
the normal situation fast without human intervention. As a result, it leads to
net availability and resilience improvement.
Picking which request to kill is tricky. In many systems, request criticality is
considered. A request from downstream is assigned with a criticality point.
Requests with lower points are targeted first. Unfortunately, GitLab doesn't
have a similar system. We have an
[Urgency system](https://docs.Gitlab.com/ee/development/application_slis/rails_request.html),
but it is used for response time committing rather than criticality.
As a replacement, we can prioritize requests harming the system the most. Some
criteria to consider:
- Requests consuming a significant percentage of memory
- Requests consuming a significant amount of CPU over time
- Slow clients
- Requests from IPs dominating the traffic recently
- In-queue requests/requests at an early stage. We don't want to reject requests that are almost finished.
To get started, we can pick the first two criteria first. The list can be
reinforced when learning from production later.
## References
- Linkedin HODOR system
- [https://www.youtube.com/watch?v=-haM4ZpYNko](https://www.youtube.com/watch?v=-haM4ZpYNko)
- [Hodor: Detecting and addressing overload in LinkedIn microservices](https://www.linkedin.com/blog/engineering/data-management/hodor-detecting-and-addressing-overload-in-linkedin-microservic)
- [https://www.linkedin.com/blog/engineering/infrastructure/hodor-overload-scenarios-and-the-evolution-of-their-detection-a](https://www.linkedin.com/blog/engineering/infrastructure/hodor-overload-scenarios-and-the-evolution-of-their-detection-a)
- Google SRE chapters about load balancing and overload:
- [https://sre.google/sre-book/load-balancing-frontend/](https://sre.google/sre-book/load-balancing-frontend/)
- [https://sre.google/sre-book/load-balancing-datacenter/](https://sre.google/sre-book/load-balancing-datacenter/)
- [https://sre.google/sre-book/handling-overload/](https://sre.google/sre-book/handling-overload/)
- [https://sre.google/sre-book/addressing-cascading-failures/](https://sre.google/sre-book/addressing-cascading-failures/)
- [https://sre.google/workbook/managing-load/](https://sre.google/workbook/managing-load/)
- [Netflix Performance Under Load](https://netflixtechblog.medium.com/performance-under-load-3e6fa9a60581)
- [Netflix Adaptive Concurrency Limit](https://Github.com/Netflix/concurrency-limits)
- [Load Shedding with NGINX using adaptive concurrency control](https://tech.olx.com/load-shedding-with-nginx-using-adaptive-concurrency-control-part-1-e59c7da6a6df)
- [Overload Control for Scaling WeChat Microservices](http://web1.cs.columbia.edu/~junfeng/papers/dagor-socc18.pdf)
- [ReactiveConf 2019 - Jay Phelps: Backpressure: Resistance is NOT Futile](https://www.youtube.com/watch?v=I6eZ4ZyI1Zg)
- [AWS re:Invent 2021 - Keeping Netflix reliable using prioritized load shedding](https://www.youtube.com/watch?v=TmNiHbh-6Wg)
- [AWS Using load shedding to avoid overload](https://aws.amazon.com/builders-library/using-load-shedding-to-avoid-overload/)
- ["Stop Rate Limiting! Capacity Management Done Right" by Jon Moore](https://www.youtube.com/watch?v=m64SWl9bfvk)
- [Using load shedding to survive a success disaster—CRE life lessons](https://cloud.google.com/blog/products/gcp/using-load-shedding-to-survive-a-success-disaster-cre-life-lessons)
- [Load Shedding in Web Services](https://medium.com/helpshift-engineering/load-shedding-in-web-services-9fa8cfa1ffe4)
- [Load Shedding in Distributed Systems](https://blog.sofwancoder.com/load-shedding-in-distributed-systems)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,342 +1,11 @@
---
status: proposed
creation-date: "2023-02-01"
authors: [ "@samihiltunen" ]
owning-stage: "~devops::enablement"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitaly_plugins/'
remove_date: '2025-07-08'
---
# Gitaly Plugins
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitaly_plugins/).
## Summary
This blueprint describes a plugin interface for Gitaly. Plugins would be executables that Gitaly invokes before and after committing a transaction.
Plugins would allow for implementing access check logic locally on the Gitaly node. Performing the access checks locally on the Gitaly node improves performance. Plugins would reduce complexity by hiding internal details that are currently leaking through Gitaly's API.
The hard-coded access check logic and custom hooks would be replaced by plugins.
## Motivation
### Background
Gitaly is a database system for storing Git repositories and strives to be decoupled from rest of GitLab. While separation of concerns in general is a good practice, Gitaly is also used outside of GitLab. The decoupling functions as a guideline on what sort of functionality should be implemented in Gitaly. Only functionality supporting Gitaly's main goal of providing repository storage and access should be implemented directly in Gitaly.
Some use cases require tighter integration with Gitaly's write flows. For example:
- Authorization checks must run before Gitaly accepts a write in order to reject unauthorized writes.
- Notifications should be sent after writes in order to trigger CI jobs.
This logic is not built directly into Gitaly to separate concerns. Gitaly calls [the Rails application's internal API](../../../development/internal_api/index.md) for both cases:
- Before accepting a write, Gitaly calls `/internal/allowed`. The response from the endpoint decides whether or not Gitaly accepts the write.
- After accepting a write, Gitaly calls `/internal/post_receive`.
In addition to calling Rails application's internal API, Gitaly supports [custom hooks](../../../administration/server_hooks.md). Custom hooks are
executables Gitaly invokes before and after accepting a write and conform to the interface of
[Git hooks](https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks). The `pre-receive` hook can reject a write, `update` hook can drop single reference
update, and `post-receive` can be used to be notified of accepted writes.
'Custom logic' is used to refer to both of the internal API calls and custom hooks in the blueprint.
### Problems
#### Overlapping functionality
Both the internal API and custom hooks are invoked in the same locations, and have similar functionality. They provide redundant functionality with the main difference being the author of the logic:
- Internal API is used by GitLab.
- Custom hooks are used on self-managed instances to plug in custom logic.
Maintaining redundant functionality is a burden. Custom hooks not being used by GitLab means they are not thoroughly exercised in production on GitLab.com.
The internal API calls are essentially GitLab-specific hooks that are hard-coded into Gitaly.
#### Not all writes can be hooked into
Gitaly is hard-coded to invoke the custom logic only during certain write operations. These operations are decided by where GitLab needs to invoke custom
logic. Roughly, these points are:
- The various `*ReceivePack` variants that invoke custom logic. These are used to run the custom logic on pushes.
- `OperationService` RPCs that are invoked by Rails when a user performs changes in a repository. Custom logic is invoked during these because the Rails
application wants to run authorization checks to see whether the user is allowed to make the change.
Custom logic is not invoked for other writes as GitLab doesn't need it. For example, excluded writes include repository creations, repository deletions, and reference updates through `WriteRef`.
This creates a tight coupling with the needs of GitLab.
#### Custom hooks interface determined by Git
Custom hooks conform to the interface of Git's hooks. This can be limiting and has led to an inconsistent interface:
- Reference updates are streamed through stdin.
- Other variables are passed in through environment variables.
- Custom prefixes required on error messages to pass them through Gitaly to the user.
- Invoking `update` hook for each reference is inefficient.
- No natural point to extend the payload to cover other supported writes than reference updates.
A better interface could be defined as Gitaly executes the custom hooks. There's no need to conform to the interface Git uses for hooks.
#### Performance issues
Rails performs authorization checks in the internal API by fetching data through Gitaly's public API. This leads to performance issues as Rails may need to fetch the complete write to perform its checks. For example, [pre-receive secret detection](../secret_detection/index.md) would require fetching all new blobs in a write to perform its checks. This could be avoided if there was a way to run the check locally on the Gitaly node.
#### Leaking internals
Before a write is accepted, Gitaly holds the new objects in a quarantine directory. These objects should not be accessible through Gitaly's API for other users. The authorization checks need access to these objects though. This is handled by sending the path of the quarantine directory to Rails, and Rails passes this to Gitaly in follow-up calls. This leaks internal details:
- The quarantines are exposed in the public API.
- On Gitaly Cluster, the quarantine directory is in a different location on each node. This requires Praefect to support force routing calls to the primary replica so the quarantine path points to the correct location in the follow-up calls.
- This gets even more complicated with [transactions](../gitaly_transaction_management/index.md). The quarantine paths are relative to the transaction's snapshot. In order for the quarantine paths to apply, the relative path sent in the request should be the snapshot repository's relative path. Praefect however requires the original relative path of the repository to route the request to the correct Gitaly node.
Leaking internals adds complexity and has to be worked around.
#### Complexity
In addition to the performance issues and leaking internals, the current access check flow has proven to be complex and difficult to understand:
1. The Rails application calling Gitaly for an RPC.
1. Gitaly calling back to the Rails application for access checks.
1. The Rails application calling Gitaly multiple times again to fetch data required by the access checks.
This complexity could be reduced by not fanning out multiple other RPCs from a given RPC but instead keeping all of the checks in the context of
currently-executing RPC.
## Solution
Define a plugin interface in Gitaly that enables efficient implementation of pre-commit checks without leaking internal details to the API.
### Goals
- Single plugin interface to replace calls to internal API and custom hooks.
- Enable efficient execution of access checks.
- Enables execution of the checks close to the data on the Gitaly node.
- Do not dictate the implementation, and enable the plugin authors to implement their plugin
in the best manner for the use case.
- Remove GitLab-specific assumptions from the API.
- Hard-coded calls to the internal API removed and moved behind the plugin interface.
- Invoke the plugin for every write.
- Remove GitLab specific fields such as `gl_repository` and `gl_project_path` from Gitaly's API.
- Do not leak internal details.
- Remove quarantine directories from the API.
- Remove force routing to primary.
- Remove the need to pipe the transaction's snapshot path through the public API.
- A clean, well-defined interface set by the needs of Gitaly and its users, not Git.
## Proposal
The proposal is written with Gitaly's upcoming [transaction management](../gitaly_transaction_management/index.md) in mind.
### Plugins
Plugins would be executables that Gitaly invokes at certain points during transaction execution. Plugins being executables ensures:
- Protection: plugins would run in separate processes. Gitaly would be guarded against memory leaks and crashes in the plugins.
- Flexibility:
- Plugins could be implemented using whatever tools the authors prefer.
- Plugins could be implemented as one-off executables, or may call into a server daemon.
- Separation of concerns: Gitaly would just need to execute an executable and defers other responsibility to it.
Plugins would be invoked at two points during transaction execution:
- `before-commit`:
- Invoked before a transaction is committed.
- Allows for rejecting a transaction.
- `after-commit`:
- Invoked after a transaction is committed.
- Transaction is already committed, so this just serves as a notification.
The names are chosen to disambiguate from Git's `pre-commit` and `post-commit` hooks. `Plugin` is used to further disambiguate from Git's hooks.
Gitaly would allow for configuring a single plugin. Gitaly would be relieved from deciding whether to execute multiple plugins concurrently or sequentially, in
which order, and whether a single plugin failing the write stops the execution of further plugins. These decisions would be delegated to plugins. Support for
running multiple plugin executables can be implemented in a plugin if truly needed.
The single plugin covers all partitions/repositories. Repository-specific logic can be implemented in the plugin to support similar use cases previously served by repository-specific custom hooks.
Gitaly and the plugin would communicate over `stdin` and `stdout` of the plugin process. Each message written to the other process would be prefixed with an `uint64` describing
the length of the payload that follows. This would make for a simple message passing protocol. Protocol buffers would be used to define the schema and serialize the payload.
When the plugin is invoked, it would write its supported protocol version to `stdout` as a big-endian encoded `uint16`. This would enable Gitaly to evolve the API in
backwards-incompatible manner by supporting multiple versions of the protocol side-by-side. If the plugin's protocol were not supported, Gitaly would kill the plugin and fail
the write. If the protocol were supported, Gitaly would proceed.
The initial version of the protocol is described below.
After the version negotiation, Gitaly sends messages to the plugin via `stdin`. Below is the message schema:
```protobuf
// PluginRequest is the payload that describes the transaction being executed.
message PluginRequest {
// ReferenceChanges describes reference changes of the transaction.
message ReferenceChanges {
// Change describes a single reference change made in the transaction.
message Change {
// reference_name is the name of the reference being changed.
bytes reference_name = 1;
// old_oid is the reference's previous object ID. Zero OID indicates the reference did not exist.
string old_oid = 2;
// new_oid is the reference's new object ID. Zero OID indicates a reference deletion.
string new_oid = 3;
}
repeated Change changes = 1;
}
// Header contains details of the plugin's execution environment and the transaction.
message Header {
// storage is the name of the target storage of the transaction.
string storage = 1;
// relative_path is the relative path of the target repository of the transaction.
string relative_path = 2;
// push_options contains the push options sent by the client during a push, if any.
repeated bytes push_options = 3;
// plugin_metadata contains a blob sent by the client to Gitaly to pass through to
// the plugin. It allows the client to send parameters to the plugin transparently
// through Gitaly. The metadata is sent by the client by setting the gRPC metadata
// header `gitaly-plugin-metadata-bin` in the request to Gitaly.
//
// This can be used to pipe GitLab specific data from Rails to the plugin, such as
// `gl_project` and `gl_user`.
bytes plugin_metadata = 4;
// git_command_path contains the absolute path to the Git command that should be used to
// access the repository.
string git_command_path = 5;
// repository_path contains the absolute path of the transaction's target repository. It
// points to a snapshot of the actual repository.
string repository_path = 6;
// git_object_directory is an absolute path to the transaction's quarantine directory
// where the new objects are written. It must be set for the Git invocations as an
// environment variable through `GIT_OBJECT_DIRECTORY` for the objects to be readable.
string git_object_directory = 7;
// git_alternate_object_directories points to the object database that contains the objects
// that existed prior to the transaction. It must be set for the Git invocation as an
// environment variable through `GIT_ALTERNATE_OBJECT_DIRECTORIES` for the objects to be
// readable.
//
// `GIT_ALTERNATE_OBJECT_DIRECTORIES` can be left unset if the Git invocation should only
// read the new objects introduced in the transaction. This can be useful for some operations
// that may for example want to scan only new blobs.
string git_alternate_object_directories = 8;
}
oneof message {
// header is always the first message sent to the plugin.
Header header = 1;
// reference_changes are the reference changes being performed by this transaction. reference_changes
// may be chunked over multiple messages.
ReferenceChanges reference_changes = 2;
}
}
```
The header is always sent in the first message. Reference changes follow the header and may be chunked over multiple messages. Gitaly closes the plugin's `stdin` once it is done sending messages.
The initial protocol supports only hooking into reference changes of a single repository as hooks currently do. The protocol can later be extended as needed to support for example:
- Other write types, such as repository creations and deletions.
- Transactions targeting multiple repositories.
After receiving the payload, the plugin would run its logic. It would access the Git repository at the given path through the provided Git command to
retrieve the data it needs. On finishing, the plugin would write a response to stdout that contains
[a status](https://github.com/googleapis/googleapis/blob/master/google/rpc/status.proto#L35) to indicate whether or not the transaction should be committed
or aborted.
```protobuf
// PluginResponse is the response message the plugin passes back to Gitaly after finishing.
message PluginResponse {
// status indicates whether the transaction should succeed or fail.
google.rpc.Status status = 1;
}
```
- If the status has code `OK`, the transaction is committed.
- If the status has any other code, the transaction is aborted. The status is passed back
to the client as is. This enables the plugin to communicate a rejection reason with an
accurate status code, message, and additional details.
The plugin should return a zero exit code if it executed successfully. This should be done even if the transaction is rejected.
If the plugin returns a non-zero exit code, the plugin is considered to have failed. This is different from the plugin rejecting a write. When the plugin fails, `stdout` is ignored and an error is returned to the client. Gitaly logs an error with the contents of `stderr` and rejects the write.
If the plugin is taking too long, as defined by RPC deadlines, Gitaly kills the plugin. This is handled as a plugin failure as described above.
#### Compatibility guarantees
Gitaly would guarantee the repository at the given path is accessible through the `git` command at the given path. All access to the repository must go through the provided
`git` binary.
The repository's layout, location, file formats, and storage details are not included in the compatibility guarantee. This enables Gitaly to iterate freely
on the storage formats. Modifying the repository is not supported.
The payload's format should be guaranteed to remain backwards compatible within a protocol version. New keys may be introduced without bumping the version.
#### Custom hooks
Because Gitaly would support only configuring a single plugin, a question may arise on how the GitLab plugin could be configured along with a custom one. While running multiple
plugins wouldn't be directly supported, a custom plugin could be configured and the plugin could invoke the GitLab plugin. This would allow for plugging in custom logic
alongside the GitLab access check plugin. Because the custom plugin would control calling the GitLab plugin, it would have full control on whether to run GitLab plugin before,
after, or concurrently with its own logic.
### Migration
Migration to plugins would take a few steps:
1. Create a project for the GitLab access check plugin. This is where the existing internal API logic will be migrated.
1. Package the plugin in the distributions and deploy and configure it on the Gitaly nodes.
1. Migrate the internal API calling logic from Gitaly to the plugin.
1. At this point, we'd have the existing internal API calling logic from Gitaly executed as a plugin.
1. We can now begin migrating the access checks step by step into the plugin.
All access to the new objects of a transaction should go through the plugin. Gitaly provides just the interface to hook into its writes.
How exactly the access checks are performed is left up to the teams responsible for the access checks. The access checks could fetch only policy from Rails and run the checks locally if the check in question depends heavily on the Git data. This improves efficiency by removing the loopback calls and accessing the data directly on the Gitaly node. Some access check logic could still remain in Rails if it is best fit for it.
Once all of repository access in access checks goes through the plugin, the quarantine directories and the force-route-to-primary functionality can be removed from the API of Gitaly.
#### End state
The following diagram demonstrates the desired end state:
1. Gitaly no longer calls to Rails directly.
1. The plugin fetches only access check policy from Rails. The access checks are run locally on the repository.
1. Gitaly commits or aborts the transaction based on the plugin's result.
```mermaid
sequenceDiagram
Rails->>+Gitaly: Mutator RPC
Gitaly-->>Gitaly: Perform Operations
Gitaly-->>Gitaly: Queue for Commit
Gitaly->>+Plugin: Execute
Plugin->>Gitaly: Version Negotiation
Gitaly->>Plugin: Payload
Plugin->>+Rails: Fetch Access Check Policy
Plugin->>+Repository: Run Access Checks
Plugin->>-Gitaly: Respond
Gitaly->>Gitaly: Commit / Abort
Gitaly->>-Rails: Respond
```
### Considerations
#### Security
The plugins should be configurable only by administrators and are considered trusted. There are no plans to sandbox them.
#### Subprocesses
If the plugin launches a subprocess, it should keep it in the same process group so Gitaly can kill the entire process group if it wants to terminate the plugin. Gitaly should consider running the plugins in their own cgroups so we can guarantee killing all subprocesses.
#### `update` hook's functionality is not supported
The plugin currently does not support dropping certain reference updates as is possible with `update` hook. Support for the hook was left out as it's not clear how useful it is. If necessary, it could be supported by having the plugin write messages to Gitaly on which references to drop from the update.
### Future opportunities
#### Guaranteed post-receive notification deliveries ([#5411](https://gitlab.com/gitlab-org/gitaly/-/issues/5411))
Currently Gitaly does not guarantee the delivery of post-receive notifications. The delivery could fail for any reason. For example, Gitaly could crash or
the Rails application could be unavailable. This can lead to unexpected behavior.
1. The notifications trigger CI pipelines after writes. If the delivery fails, the pipelines may not be triggered.
1. Code search indexes new changes based on the notifications. If the delivery fails, the indexes can become stale.
With transactions stored in the write-ahead log, Gitaly could guarantee the delivery of the `after-commit` notifications after a crash by recovering the transactions from the log and reattempting delivery. To facilitate this, all necessary information should be stored in the log entry, for example the push options.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,427 +1,11 @@
---
status: ongoing
creation-date: "2023-05-30"
authors: [ "@samihiltunen" ]
owning-stage: "~devops::enablement"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitaly_transaction_management/'
remove_date: '2025-07-08'
---
# Transaction management in Gitaly
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitaly_transaction_management/).
## Summary
Gitaly is a database system for storing Git repositories. This blueprint covers implementing transaction management in Gitaly that guarantees
ACID-properties by introducing:
- Write-ahead logging. Work on this is already underway and tracked in [Implement write-ahead logging in Gitaly](https://gitlab.com/groups/gitlab-org/-/epics/8911).
- Serializable snapshot isolation through multiversion concurrency control.
The goal is to improve reliability when dealing with concurrent access and interrupted writes. Transaction management makes it easier to contribute to Gitaly because transactions
deal with the concurrency and failure-related anomalies.
This is the first stage of implementing a [decentralized Raft-based architecture for Gitaly Cluster](https://gitlab.com/groups/gitlab-org/-/epics/8903).
## Motivation
Transaction management in Gitaly is lacking. Gitaly doesn't provide the guarantees typically expected from database-like software. Databases typically guarantee the ACID
properties:
- Atomicity: all changes in a transaction happen completely or not at all.
- Consistency: all changes leave the data in a consistent state.
- Isolation: concurrent transactions execute as if they were the only transaction running in the system.
- Durability: changes in a transaction persist and survive system crashes once acknowledged.
Gitaly does not access storage transactionally and violates these properties in countless ways. To give some examples:
- Atomicity:
- References are updated one by one with Git. If the operation is interrupted, some references
may be updated and some not.
- Objects may be written into a repository but fail to be referenced.
- Custom hooks are updated by moving their old directory out of the way and moving the new one in place. If this operation fails half way, the repository's
existing hooks are removed but new ones are not written.
- Consistency:
- Gitaly migrates objects from a quarantine directory to the main repository. It doesn't consider the dependencies between objects while doing so. If this process is interrupted, and an object missing its dependencies is later referenced, the repository ends up corrupted.
- Crashes might leave stale locks on the disk that prevent further writes.
- Isolation:
- Any operation can fail due to the repository being deleted concurrently.
- References and object database contents can be modified while another operation is reading them.
- Backups can be inconsistent due to concurrent write operations modifying the data. Backups can even end up containing state that never existed on the
server, which can happen if custom hooks are updated while they are being backed up.
- Modifying and executing custom hooks concurrently can lead to custom hooks not being executed. This can happen if the execution happens between the old
hooks being removed and new ones being put in place.
- Durability: multiple missing fsyncs were recently discovered in Gitaly.
Not adhering to ACID properties can lead to:
- Inconsistent reads.
- Inconsistent backups that contain state that never existed on the server.
- Repository corruption.
- Writes missing after crashes.
- Stale locks that lead to unavailability.
Lack of isolation makes some features infeasible. These are generally long running read operations, such as online checksums for verifying data and online backups. The data being modified concurrently can cause these to yield incorrect results.
The list is not exhaustive. Compiling an exhaustive list is not fruitful due to the large number of various scenarios that can happen due to concurrent interactions and
write interruptions. However, there is a clear need to solve these problems in a systematic manner.
## Solution
The solution is to implement a transaction manager in Gitaly that guarantees ACID-properties. This centralizes the transactional logic into a single component.
All operations accessing user data will run in a transaction with the transaction manager upholding transactional guarantees. This eases developing Gitaly as the RPC handlers can be developed as if they were the only one running in the system with durability and atomicity of changes guaranteed on commit.
### Goals
- Transaction management that guarantees ACID-properties.
- Transactional guarantees cover access to all user data:
- References
- Objects
- Custom hooks
- Write-ahead log for durability and atomicity.
- Serializable Snapshot Isolation (SSI). Multiversion concurrency control (MVCC) for non-blocking concurrency.
- Minimal changes to existing code in Gitaly.
- Make it easier to contribute to Gitaly.
- Enable future use cases:
- [Backups with WAL archiving](#continuous-backups-with-wal-archiving).
- [Replication with Raft](#raft-replication).
- [Expose transactional interface to Gitaly clients](#expose-transactions-to-clients).
## Proposal
The design below is the end state we want to reach. The in-progress implementation in Gitaly deviates in some aspects. We'll gradually get closer to the end state as the work progresses.
### Partitioning
The user data in Gitaly is stored in repositories. These repositories are accessed independently from each other.
Each repository lives on a single storage. Gitaly identifies repositories with a composite key of `(storage_name, relative_path)`. Storage names are unique. Two storages may contain a repository with the same relative path. Gitaly considers these two distinct repositories.
The synchronization required for guaranteeing transactional properties has a performance impact. To reduce the impact, a transaction only spans a subset of the data stored on a Gitaly node.
The first boundary is the storage. The storages are independent of each other and host distinct repositories. Transactions never span across storages.
Storages are further divided into partitions:
- Transactional properties are maintained within a partition. Transactions never span across partitions.
- A partition stores some data and provides access to that data with transactional guarantees. The data will generally be repositories. Partitions may also
store key-value data, which will be used in future with [the new cluster architecture](#raft-replication) to store cluster metadata.
- Partitions will be the unit of replication with [Raft](#raft-replication).
Repositories:
- Within a storage might depend on each other. This is the case with objects pools and the repositories that borrow from them. Their operations must be
synchronized because changes in the pool would affect the object database content of the borrowing repository.
- That are not borrowing from an object pool are independent from each other. They are also accessed independently.
- That depend on each other go in the same partition. This generally means object pools and their borrowers. Most repositories will have their own partition.
The logical data hierarchy looks as follows:
``` mermaid
graph
subgraph "Gitaly Node"
G[Process] --> S1[Storage 1]
G[Process] --> S2[Storage 2]
S1 --> P1[Partition 1]
S1 --> P2[Partition 2]
S2 --> P3[Partition 3]
S2 --> P4[Partition 4]
P1 --> R1[Object Pool]
P1 --> R2[Member Repo 1]
P1 --> R3[Member Repo 2]
R2 --> R1
R3 --> R1
P2 --> R4[Repository 3]
P3 --> R5[Repository 4]
P4 --> R6[Repository 5]
P4 --> R7[Repository 6]
end
```
### Transaction management
Transactional properties are guaranteed within a partition. Everything described here is within the scope of a single partition.
Each partition will have a transaction manager that manages the transactions operating on data in the partition. Higher-level concepts used in the
transaction management are covered below.
#### Serializable snapshot isolation
Prior to transactions, Gitaly didn't isolate concurrent operations from each other. Reads could read an in-between state due to writes running concurrently. Reading the same data multiple times could lead to different results if a concurrent operation modified the data in-between the two reads. Other anomalies were also possible.
The transaction manager provides serializable snapshot isolation (SSI) for transactions. Each transaction is assigned a read snapshot when it begins. The read snapshot contains the latest committed data for a repository. The data remains the same despite any concurrent changes being committed.
Multiversion concurrency control (MVCC) is used for non-blocking concurrency. MVCC works by always writing updates into a new location, leaving the old
versions intact. With multiple versions maintained, the reads are isolated from the updates as they can keep reading the old versions. The old versions are
garbage collected after there are no transactions reading them anymore.
The snapshot covers all user data:
- References
- Objects
- Custom hooks
Git doesn't natively provide tools to implement snapshot isolation. Therefore, repository snapshots are implemented on the file system by copying the directory
structure of the repository into a temporary directory and hard linking the contents of the repository in place. Git never updates references or objects in
place but always writes new files so the hard-linked files remain unchanged in the snapshots. The correct version of custom hooks for the read snapshot is
also linked into place. For information on performance concerns, see [Performance Considerations](#performance-considerations).
The snapshot works for both reading and writing because it is a normal Git repository. The Git writes performed in the snapshot are captured through the
reference transaction hook. After the transaction commits, the performed changes are write-ahead logged and ultimately applied to the repository from the log.
After the transaction commits or aborts, the transaction's temporary state, including the snapshot, is removed. Old files are automatically removed by the
file system after they are not linked to by the repository nor any transaction's snapshot.
To maintain consistency, writes into the actual repository are blocked while the snapshot is taken. The transaction manager is the single-writer to the
repository, which means that only the log application is blocked while a snapshot is taken.
#### Serializability
Serializability is a strong correctness guarantee. It ensures that the outcome of concurrent transactions is equal to some serial execution of them. Guaranteeing serializability makes life easy for users of the transactions. They can perform their changes as if they were the only user of the system and trust that the result is correct regardless of any concurrent activity.
The transaction manager provides serializability through optimistic locking.
Each read and write is operating on a snapshot of the repository. The locks acquired by Git are targeting different snapshot repositories, which allows all of
the transactions to proceed concurrently, staging their changes because they are not operating on shared resources.
When committing a transaction, the transaction manager checks whether any resources being updated or read were changed by an overlapping transaction that committed. If so, the later transaction is rejected due to a serialization violation. If there are no conflicts, the transaction is appended to the log. Once the transaction is logged, it is successfully committed. The transaction gets ultimately applied to the repository from the log. This locking mechanism allows all transactions to proceed unblocked until commit. It is general enough for identifying write conflicts of any resource.
For true serializability, we would also have to track reads performed. This is to prevent write skew, where a transaction bases its update on a stale read of
another value that was updated by a concurrent transaction. Git does not provide a way to track which references were read as part of a command. Because we
don't have a general way to track references a transaction read, write skew is permitted.
Predicate locks can be explicitly acquired in a transaction. These provide hints to the transaction manager that allow it to prevent write skew to the extent
they are used.
#### Write-ahead log
Prior to transactions, the writes updated the target data on the disk directly. This creates a problem if the writes are interrupted while they are being performed.
For example, given a write:
- `ref-a new-oid old-oid`
- `ref-b new-oid old-oid`
If the process crashes after updating `ref-a` but not yet updating `ref-b`, the state now contains a partially-applied transaction. This violates atomicity.
The transaction manager uses a write-ahead log to provide atomicity and durability. Transaction's changes are written into a write-ahead log on commit prior to applying them to log's projections. If a crash occurs, the transaction is recovered from the log and performed to completion.
All writes into a partition go through the write-ahead log. Once a transaction is logged, it's applied from the log to:
- The Git repository. The repository's current state is constructed from the logged transactions.
- An embedded database shared between all partitions on a storage. Write-ahead logging-related bookkeeping state is kept here.
Most writes are fully self-contained in the log entry. Reference updates that include new objects are not. The new objects are logged in a packfile. The objects in a packfile may
depend on existing objects in the repository. This is problematic for two reasons:
- The dependencies might be garbage collected while the packfile is in the log waiting for application.
- The dependencies in the actual repository's object database might be garbage collected while a transaction is verifying connectivity of new objects against
its snapshot.
Both of these issues can be solved by writing internal references to the packfile's dependencies before committing the log entry. These internal references
can be cleared when the log entry is pruned. For more information, see [issue 154](https://gitlab.com/gitlab-org/git/-/issues/154) on the GitLab fork of Git.
### Integration
Gitaly contains over 150 RPCs. We want to plug in the transaction management without having to modify all of them. This can be achieved by plugging in a
gRPC interceptor that handles opening and committing transactions before each handler. The interceptor:
1. Begins the transaction.
1. Rewrites the repository in the request to point to the transaction's snapshot repository.
1. Invokes the RPC handler with the rewritten repository.
1. Commits or rolls back the transaction depending on whether the handler returns successfully or not.
The existing code in the handlers already knows how to access the repositories from the request. Because we rewrite the repository to point to the snapshot,
they'll be automatically snapshot-isolated because their operations will target the snapshot.
RPCs that perform non-Git writes, such as `SetCustomHooks`, will need to be adapted because we don't have a way to hook into their writes like we do with
the reference transaction hook. However, these are a small minority, namely:
- Custom hook updates.
- Repository creations.
- Repository deletions.
To support integrating these, we'll provide a helper function to include the data in the transaction. We'll pipe the transaction through the request context.
The biggest concern with integrating the transaction management is missing some locations that write to the repository without respecting the transaction logic. Because
we are rewriting the request's repository to the snapshot repository, this is not an issue. The RPC handlers do not know the real location of the repository so they can't
accidentally write there. Any writes they perform to the snapshot repository that are not included in the transaction will be discarded. This should fail tests and alert
us to the problem.
There may be some locations in Gitaly that would benefit from having the real repository's relative path. An example could be a cache, such as the pack objects cache, that uses the relative path as cache key. It would be problematic if each transaction has its own snapshot repository and thus a different relative path. If needed, the real relative path could be piped through the request context. The snapshots can be shared between multiple read only transactions which would keep the relative path stable. This should work for at least some of the cases where the cache should expire anyway when the data changes.
The pre-receive hook would send the rewritten repositories to the authorization endpoint at `internal/allowed`. The follow up requests from the endpoint to Gitaly would already contain the relative path pointing to the snapshot repository with a quarantine configured. The transaction middleware can detect this and not start another transaction.
To retain backwards compatibility with Praefect, the transaction manager will cast votes to Praefect when committing a transaction. Reference transaction hooks won't cast votes because
the changes there are only captured in the transaction, not actually committed yet.
Housekeeping must be integrated with the transaction processing. Most of the clean up-related housekeeping tasks, such as removing temporary files or stale locks, are no longer needed. All of the trash left by Git on failures is contained in the snapshots and removed with them when the transaction finishes.
That leaves reference and object repacking, object pruning, and building the various indexes. All of these can be done in transactions. The new packs, for
example, can be computed in a snapshot. When committing, the transaction manager can check whether their changes conflict with any other concurrently-committed transaction.
For example, an object that was pruned in a snapshot could be concurrently referenced from another transaction. If there are conflicts, the transaction manager either:
- Resolves the conflict if possible.
- Aborts the transaction and retries the housekeeping task.
The transaction manager should keep track of how many packfiles and loose references there are in a repository, and trigger a repack when necessary.
The above allows for almost completely transparent integration with the existing code in Gitaly. We only have to update a couple of write RPCs to include the data in the transaction if it is set. This keeps the migration period manageable with minimal conditional logic spread throughout the code base.
### Performance considerations
The most glaring concern is the cost of snapshotting a repository. We are copying the directory structure of the repository and hard linking the files in
place before a request is processed. This might not be as problematic as it first sounds because:
- The snapshotting is essentially only creating directory entries. These are quick syscalls. The number of files in the repository increases the number of
directory entries and links we need to create in the snapshot. This can be mitigated by maintaining the repositories in good shape by repacking objects
and references. Reftables will also eventually help reduce the number of loose references. The write-ahead log only writes objects into the repository
as packfiles so loose objects won't be a concern in the future.
- These will be in-memory operations. They'll target the page cache and don't need to be fsynced.
- The snapshots can be shared between read-only transactions because they don't perform any modifications in them. This means that we only have to create
snapshots for writes, and for reads when a new version was committed after creating the previous read-only snapshot. Writes are relatively rare.
- The isolation level can be configurable on a per-transaction level for performance. Snapshot isolation is not needed when an RPC fetches a single blob.
Serializing the writes requires them to be committed one by one, which could become a bottleneck. However:
- The data partitioning minimizes this bottleneck:
- We only have to serialize writes within a partition.
- Most repositories will have their own partition.
- Object pools and their borrowers must be in the same partition. This could result in large partitions which may lead to a performance degradation. However:
- The object pools are currently undergoing a redesign. See [the blueprint](../object_pools/index.md) for more details.
- The partition assignments of the object pools, the origin repository, and the forks are better handled in context of the object deduplication design.
Some possible approaches include:
- Keeping the origin repository in its own partition. This ensures forking a repository does not lead to performance degradation for the forked repository.
- Splitting the forks into multiple partitions with each having their own copy of the object pool. This ensures the forks will retain acceptable
performance at the cost of increased storage use due to object pool duplication.
- Checking for write conflicts can be done entirely in memory because the transaction manager can keep track of which resources have been modified by
concurrent transactions. This allows for finer-grained locking than Git supports, especially when it comes to reference deletions.
The snapshot isolation requires us to keep multiple versions of data. This will increase storage usage. The actual impact depends on the amount of the new data written and the open transactions that are holding on to the old data.
On the other hand, the snapshot isolation brings performance benefits:
- `fsync` can be turned off for most writes because they target the snapshots. The writes that are committed to the real repository will be `fsync`ed by the transaction manager.
- Transactions never block each other because they'll write locks in their own snapshot. For example, transactions can concurrently delete references because they each have
their own `packed-refs` file.
- Writes into the main repository can be batched together. For example, if multiple reference deletions are committed around the same time, they can be applied to the repository
in a single write, resulting in rewriting the `packed-refs` file only once.
Snapshot isolation also enables features that were not previously feasible. These are generally long-running read operations:
- Online checksumming requires that the data doesn't change during the checksumming operation. This would previously require a lock on the repository. This can be done without
any blocking because the checksum can be computed from the snapshot.
- Online (consistent) backups become possible because they can be built from the snapshot.
## Life of a transaction
The diagram below models the flow of a write transaction that updates some references. The diagram shows the key points of how the transactions are handled:
- Each transaction has a snapshot of the repository.
- The RPC handlers never operate on the repository itself.
- The changes performed in the snapshot are captured in the transaction.
- The changes are committed after the RPC has returned successfully.
- The transaction is asynchronously applied to the repository from the log.
Beginning and committing a transaction may block other transactions. Open transactions proceed concurrently without blocking:
1. Shared lock is acquired on the repository when the snapshot is being created. Multiple snapshots can be taken at the same time but no changes can be written into
the repository.
1. Transactions run concurrently without any blocking until the commit call where the serializability checks are done.
1. Log application acquires an exclusive lock on the repository, which blocks snapshotting.
```mermaid
sequenceDiagram
autonumber
gRPC Server->>+Transaction Middleware: Request
Transaction Middleware->>+Transaction Manager: Begin
Transaction Manager->>+Transaction: Open Transaction
participant Repository
critical Shared Lock on Repository
Transaction->>+Snapshot: Create Snapshot
end
Transaction->>Transaction Manager: Transaction Opened
Transaction Manager->>Transaction Middleware: Begun
Transaction Middleware->>+RPC Handler: Rewritten Request
RPC Handler->>+git update-ref: Update References
git update-ref->>Snapshot: Prepare
Snapshot->>git update-ref: Prepared
git update-ref->>Snapshot: Commit
Snapshot->>git update-ref: Committed
git update-ref->>+Reference Transaction Hook: Invoke
Reference Transaction Hook->>Transaction: Capture Updates
Transaction->>Reference Transaction Hook: OK
Reference Transaction Hook->>-git update-ref: OK
git update-ref->>-RPC Handler: References Updated
RPC Handler->>-Transaction Middleware: Success
Transaction Middleware->>Transaction: Commit
Transaction->>Transaction Manager: Commit
critical Serializability Check
Transaction Manager->>Transaction Manager: Verify Transaction
end
Transaction Manager->>Repository: Log Transaction
Repository->>Transaction Manager: Transaction Logged
Transaction Manager->>Transaction: Committed
Transaction->>Snapshot: Remove Snapshot
deactivate Snapshot
Transaction->>-Transaction Middleware: Committed
Transaction Middleware->>-gRPC Server: Success
critical Exclusive Lock on Repository
Transaction Manager->>-Repository: Apply Transaction
end
```
## Future opportunities
### Expose transactions to clients
Once Gitaly internally has transactions, the next natural step is to expose them to the clients. For example, Rails could run multiple operations in a single transaction. This would
extend the ACID guarantees to the clients, which would solve a number of issues:
- The clients would have ability to commit transactions atomically. Either all changes they make are performed or none are.
- The operations would automatically be guarded against races through the serializability guarantees.
For Gitaly maintainers, extending the transactions to clients enables reducing our API surface. Gitaly has multiple RPCs that perform the same operations. For example, references
are updated in multiple RPCs. This increases complexity. If the clients can begin, stage changes, and commit a transaction, we can have fewer, more fine grained RPCs. For
example, `UserCommitFiles` could be modeled with more fine grained commands as:
- `Begin`
- `WriteBlob`
- `WriteTree`
- `WriteCommit`
- `UpdateReference`
- `Commit`
This makes the API composable because the clients can use the single-purpose RPCs to compose more complex operations. This might lead to a concern that each operation requires
multiple RPC calls, increasing the latency due to roundtrips. This can be mitigated by providing an API that allows for batching commands.
Other databases provide these features through explicit transactions and a query language.
### Continuous backups with WAL archiving
Incremental backups are currently prohibitively slow because they must always compute the changes between the previous backup and the current state of the repository. Because
all writes to a partition go through the write-ahead log, it's possible to stream the write-ahead log entries to incrementally back up the repository. For more information,
see [Repository Backups](../repository_backups/index.md).
### Raft replication
The transactions provide serializability on a single partition. The partition's write-ahead log can be replicated using a consensus algorithm such as Raft. Because Raft
guarantees linearizability for log entry commits, and the transaction manager ensures serializability of transactions prior to logging them, all operations across the replicas
get serializability guarantees. For more information, see [epic 8903](https://gitlab.com/groups/gitlab-org/-/epics/8903).
## Alternative solutions
No alternatives have been proposed to the transaction management. The current state of squashing concurrency- and write interruption-related bugs one by one is not scalable.
### Snapshot isolation with reftables
Our preliminary designs for snapshot isolation relied on reftables, a new reference backend in Git. Reftables have been a work in progress for years and there doesn't seem to
be a clear timeline for when they'll actually land in Git. They have a number of shortcomings compared to the proposed solution here:
- Reftables only cover references in a snapshot. The snapshot design here covers the complete repository, most importantly object database content.
- Reftables would require heavy integration as each Git invocation would have to be wired to read the correct version of a reftable. The file-system-based snapshot design
here requires no changes to the existing Git invocations.
- The design here gives a complete snapshot of a repository, which enables running multiple RPCs on the same transaction because the transaction's state is stored on the disk
during the transaction. Each RPC is able to read the transaction's earlier writes but remain isolated from other transactions. It's unclear how this would be implemented with
reftables, especially when it comes to object isolation. This is needed if we want to extend the transaction interface to the clients.
- The snapshots are independent from each other. This reduces synchronization because each transaction can proceed with staging their changes without being blocked by any
other transactions. This enables optimistic locking for better performance.
Reftables are still useful as a more efficient reference backend but they are not needed for snapshot isolation.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,62 +1,11 @@
---
owning-stage: "~devops::verify"
description: 'GitLab CI Events ADR 001: Use hierarchical events'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/decisions/001_hierarchical_events/'
remove_date: '2025-07-08'
---
# GitLab CI Events ADR 001: Use hierarchical events
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/decisions/001_hierarchical_events/).
## Context
We did some brainstorming in [an issue](https://gitlab.com/gitlab-org/gitlab/-/issues/424865)
with multiple use-cases for running CI pipelines based on subscriptions to CI
events. The pattern of using hierarchical events emerged, and it became clear that
events may be grouped together by type or by origin.
For example:
```yaml
annotate:
on: issue/created
script: ./annotate $[[ event.issue.id ]]
summarize:
on: issue/closed
script: ./summarize $[[ event.issue.id ]]
```
When making this decision we didn't focus on the syntax yet, but the grouping
of events seems to be useful in the majority of use cases.
We considered making it possible for users to subscribe to multiple events in a
group at once:
```yaml
audit:
on: events/gitlab/gitlab-org/audit/*
script: ./audit $[[ event.operation.name ]]
```
The implication of this is that events within the same groups should share the same
fields / schema definition.
## Decision
Use hierarchical events: events that can be grouped together and that will
share the same fields following a stable contract. For example: all _issue_
events will contain `issue.iid` field.
How we group events has not been decided yet; we can do that either by
labeling or by grouping using path-like syntax.
## Consequences
The implication is that we will need to build a system with stable interface
describing events' payload and / or schema.
## Alternatives
An alternative is not to use hierarchical events, and making it necessary to
subscribe to every event separately, without giving users any guarantees around
common schema for different events. This would be especially problematic for
events that naturally belong to some group and users expect a common schema
for, like audit events.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,96 +1,11 @@
---
status: proposed
creation-date: "2023-03-15"
authors: [ "@furkanayhan" ]
owners: [ "@fabiopitino" ]
coach: "@grzesiek"
approvers: [ "@fabiopitino", "@jreporter", "@cheryl.li" ]
owning-stage: "~devops::verify"
participating-stages: [ "~devops::package", "~devops::deploy" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/'
remove_date: '2025-07-08'
---
# GitLab CI Events
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/).
## Summary
In order to unlock innovation and build more value, GitLab is expected to be
the center of automation related to DevSecOps processes. We want to transform
GitLab into a programming environment, that will make it possible for engineers
to model various workflows on top of CI/CD pipelines. Today, users must create
custom automation around webhooks or scheduled pipelines to build required
workflows.
In order to make this automation easier for our users, we want to build a
powerful CI/CD eventing system, that will make it possible to run pipelines
whenever something happens inside or outside of GitLab.
A typical use-case is to run a CI/CD job whenever someone creates an issue,
posts a comment, changes a merge request status from "draft" to "ready for
review" or adds a new member to a group.
To build that new technology, we should:
1. Emit many hierarchical events from within GitLab in a more advanced way than we do it today.
1. Make it affordable to run this automation, that will react to GitLab events, at scale.
1. Provide a set of conventions and libraries to make writing the automation easier.
## Goals
While ["GitLab Events Platform"](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/113700)
aims to build new abstractions around emitting events in GitLab, "GitLab CI
Events" blueprint is about making it possible to:
1. Define a way in which users will configure when an event emitted will result in a CI pipeline being run.
1. Describe technology required to match subscriptions with events at GitLab.com scale and beyond.
1. Describe technology we could use to reduce the cost of running automation jobs significantly.
## Proposal
### Decisions
- [001: Use hierarchical events](decisions/001_hierarchical_events.md)
### Requirements
Any accepted proposal should take in consideration the following requirements and characteristics:
1. Defining events should be done in separate files.
- If we define all events in a single file, then the single file gets too complicated and hard to
maintain for users. Then, users need to separate their configs with the `include` keyword again and we end up
with the same solution.
- The structure of the pipelines, the personas and the jobs will be different depending on the events being
subscribed to and the goals of the subscription.
1. A single subscription configuration file should define a single pipeline that is created when an event is triggered.
- The pipeline config can include other files with the `include` keyword.
- The pipeline can have many jobs and trigger child pipelines or multi-project pipelines.
1. The events and handling syntax should use the existing CI config syntax where it is pragmatic to do so.
- It'll be easier for users to adapt. It'll require less work to implement.
1. The event subscription and emitting events should be performant, scalable, and non-blocking.
- Reading from the database is usually faster than reading from files.
- A CI event can potentially have many subscriptions.
This also includes evaluating the right YAML files to create pipelines.
- The main business logic (e.g. creating an issue) should not be affected
by any subscriptions to the given CI event (e.g. issue created).
1. The CI events design should be implemented in a maintainable and extensible way.
- If there is an `issues/create` event, then any new event (`merge_request/created`) can be added without
much effort.
- We expect that many events will be added. It should be trivial for developers to
register domain events (e.g. 'issue closed') as GitLab-defined CI events.
- Also, we should consider the opportunity of supporting user-defined CI events long term (e.g. 'order shipped').
### Options
For now, we have 5 technical proposals:
1. [Proposal 1: Using the `.gitlab-ci.yml` file](proposal-1-using-the-gitlab-ci-file.md)
Based on;
- [GitLab CI Workflows PoC](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/91244)
- [PoC NPM CI events](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/111693)
1. [Proposal 2: Using the `rules` keyword](proposal-2-using-the-rules-keyword.md)
Highly inefficient way.
1. [Proposal 3: Using the `.gitlab/ci/events` folder](proposal-3-using-the-gitlab-ci-events-folder.md)
Involves file reading for every event.
1. [Proposal 4: Creating events via a CI config file](proposal-4-creating-events-via-ci-files.md)
Separate configuration files for defining events.
1. [Proposal 5: Combined proposal](proposal-5-combined-proposal.md)
Combination of all of the proposals listed above.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,54 +1,11 @@
---
owning-stage: "~devops::verify"
description: 'GitLab CI Events Proposal 1: Using the .gitlab-ci.yml file'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-1-using-the-gitlab-ci-file/'
remove_date: '2025-07-08'
---
# GitLab CI Events Proposal 1: Using the `.gitlab-ci.yml` file
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-1-using-the-gitlab-ci-file/).
Currently, we have two proof-of-concept (POC) implementations:
- [GitLab CI Workflows PoC](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/91244)
- [PoC NPM CI events](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/111693)
They both have similar ideas;
1. Find a new CI Config syntax to define pipeline events.
Example 1:
```yaml
workflow:
events:
- events/package/published
# or
workflow:
on:
- events/package/published
```
Example 2:
```yaml
spec:
on:
- events/package/published
- events/package/removed
# on:
# package: [published, removed]
---
do_something:
script: echo "Hello World"
```
1. Upsert a workflow definition to the database when new configuration gets
pushed.
1. Match subscriptions and publishers whenever something happens at GitLab.
## Discussion
1. How to efficiently detect changes to the subscriptions?
1. How do we handle differences between workflows / events / subscriptions on
different branches?
1. Do we need to upsert subscriptions on every push?
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,35 +1,11 @@
---
owning-stage: "~devops::verify"
description: 'GitLab CI Events Proposal 2: Using the rules keyword'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-2-using-the-rules-keyword/'
remove_date: '2025-07-08'
---
# GitLab CI Events Proposal 2: Using the `rules` keyword
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-2-using-the-rules-keyword/).
Can we do it with our current [`rules`](../../../ci/yaml/index.md#rules) system?
```yaml
workflow:
rules:
- events: ["package/*"]
test_package_published:
script: echo testing published package
rules:
- events: ["package/published"]
test_package_removed:
script: echo testing removed package
rules:
- events: ["package/removed"]
```
1. We don't upsert subscriptions to the database.
1. We'll have a single worker which runs when something happens in GitLab.
1. The worker just tries to create a pipeline with the correct parameters.
1. Pipeline runs when `rules` subsystem finds a job to run.
## Challenges
1. For every defined event run, we need to enqueue a new pipeline creation worker.
1. Creating pipelines and selecting builds to run is a relatively expensive operation.
1. This will not work on GitLab.com scale.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,57 +1,11 @@
---
owning-stage: "~devops::verify"
description: 'GitLab CI Events Proposal 3: Using the .gitlab/ci/events folder'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-3-using-the-gitlab-ci-events-folder/'
remove_date: '2025-07-08'
---
# GitLab CI Events Proposal 3: Using the `.gitlab/ci/events` folder
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-3-using-the-gitlab-ci-events-folder/).
In this proposal we want to create separate files for each group of events. We
can define events in the following format:
```yaml
# .gitlab/ci/events/package-published.yml
spec:
events:
- name: package/published
---
include:
- local: .gitlab-ci.yml
with:
event: $[[ gitlab.event.name ]]
```
And in the `.gitlab-ci.yml` file, we can use the input;
```yaml
# .gitlab-ci.yml
spec:
inputs:
event:
default: push
---
job1:
script: echo "Hello World"
job2:
script: echo "Hello World"
job-for-package-published:
script: echo "Hello World"
rules:
- if: $[[ inputs.event ]] == "package/published"
```
When an event happens;
1. We'll enqueue a new job for the event.
1. The job will search for the event file in the `.gitlab/ci/events` folder.
1. The job will run `Ci::CreatePipelineService` for the event file.
## Problems & Questions
1. For every defined event run, we need to enqueue a new job.
1. Every event-job will need to search for files.
1. This would be only for the project-scope events.
1. This will not work for GitLab.com scale.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,79 +1,11 @@
---
owning-stage: "~devops::verify"
description: 'GitLab CI Events Proposal 4: Defining subscriptions in a dedicated configuration file'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-4-creating-events-via-ci-files/'
remove_date: '2025-07-08'
---
# GitLab CI Events Proposal 4: Defining subscriptions in a dedicated configuration file
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-4-creating-events-via-ci-files/).
Each project can have its own configuration file for defining subscriptions to
events. For example, `.gitlab-ci-event.yml`. In this file, we can define events
in the following format:
```yaml
events:
- package/published
- issue/created
```
When this file is changed in the project repository, it is parsed and the
events are created, updated, or deleted. This is highly similar to
[Proposal 1](proposal-1-using-the-gitlab-ci-file.md) except that we don't need
to track pipeline creations every time.
1. Upsert events to the database when `.gitlab-ci-event.yml` gets updated.
1. Create inline reactions to events in code to trigger pipelines.
## Filtering jobs
We can filter jobs by using the `rules` keyword. For example:
```yaml
test_package_published:
script: echo testing published package
rules:
- events: ["package/published"]
test_package_removed:
script: echo testing removed package
rules:
- events: ["package/removed"]
```
Alternatively, we can make it work with either a CI variable:
```yaml
test_package_published:
script: echo testing published package
rules:
- if: $CI_EVENT == "package/published"
test_package_removed:
script: echo testing removed package
rules:
- if: $CI_EVENT == "package/removed"
```
or an input like in the [Proposal 3](proposal-3-using-the-gitlab-ci-events-folder.md):
```yaml
spec:
inputs:
event:
default: push
---
test_package_published:
script: echo testing published package
rules:
- if: $[[ inputs.event ]] == "package/published"
test_package_removed:
script: echo testing removed package
rules:
- if: $[[ inputs.event ]] == "package/removed"
```
## Challenges
1. This will not work on GitLab.com scale.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,99 +1,11 @@
---
owning-stage: "~devops::verify"
description: 'GitLab CI Events Proposal 5: Combined proposal'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-5-combined-proposal/'
remove_date: '2025-07-08'
---
# GitLab CI Events Proposal 5: Combined proposal
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ci_events/proposal-5-combined-proposal/).
In this proposal we have separate files for cohesive groups of events. The
files are being included into the main `.gitlab-ci.yml` configuration file.
```yaml
# my/events/packages.yaml
spec:
events:
- events/package/published
- events/audit/package/*
inputs:
env:
---
do_something:
script: ./run_for $[[ event.name ]] --env $[[ inputs.env ]]
rules:
- if: $[[ event.payload.package.name ]] == "my_package"
```
In the `.gitlab-ci.yml` file, we can enable the subscription:
```yaml
# .gitlab-ci.yml
include:
- local: my/events/packages.yaml
inputs:
env: test
```
GitLab will detect changes in the included files, and parse their specs. All
the information required to define a subscription will be encapsulated in the
spec, hence we will not need to read a whole file. We can easily read the `spec`
header and calculate its checksum, which can become a workflow identifier.
Once we see a new identifier, we can redefine subscriptions for a particular
project and then upsert them into the database.
We will use an efficient GIN index matching technique to match publishers with
the subscribers to run pipelines.
The syntax is also compatible with CI Components, and makes it easier to define
components that are only designed to run for events happening inside
GitLab.
## No entrypoint file variant
Another variant of this proposal is to move away from the single GitLab CI YAML
configuration file. In such case we would define another search **directory**,
like `.gitlab/workflows/` where we would store all YAML files.
We wouldn't need to `include` workflow / events files anywhere, because these
would be found by GitLab automatically. In order to implement this feature this
way we would need to extend features like "custom location for `.gitlab-ci.yml`
file".
Example, without using a main configuration file (the GitLab CI YAML file would
be still supported):
```yaml
# .gitlab/workflows/push.yml
spec:
events:
- events/repository/push
---
rspec-on-push:
script: bundle exec rspec
```
```yaml
# .gitlab/workflows/merge_requests.yml
spec:
events:
- events/merge_request/push
---
rspec-on-mr-push:
script: bundle exec rspec
```
```yaml
# .gitlab/workflows/schedules.yml
spec:
events:
- events/pipeline/schedule/run
---
smoke-test:
script: bundle exec rspec --smoke
```
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,120 +1,11 @@
---
status: proposed
creation-date: "2023-03-06"
authors: [ "@grzesiek", "@fabiopitino" ]
coach: "@ayufan"
approvers: [ "@jreporter", "@sgoldstein" ]
owning-stage: "~devops::ops section"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_events_platform/'
remove_date: '2025-07-08'
---
# GitLab Events Platform
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_events_platform/).
## Summary
GitLab codebase has grown a lot since the [first commit](https://gitlab.com/gitlab-org/gitlab/-/commit/93efff945215)
made in 2011. We've been able to implement many features that got adopted by
millions of users. There is a demand for more features, but there is also an
opportunity of a paradigm change: instead of delivering features that cover
specific use-cases, we can start building a platform that our users will be
able to extend with automation as they see fit. We can build a flexible and
generic DevSecOps solution that will integrate with external and internal
workflows using a robust eventing system.
In this design document we propose to add a few additional layers of
abstraction to make it possible to:
1. Design a notion of events hierarchy that encodes their origin and schema.
1. Publish events from within the application code using Publishers.
1. Intercept and transform events from external sources using Gateways.
1. Subscribe to internal / external events using Subscribers.
1. Hide queueing and processing implementation details behind an abstraction.
This will allow us to transform GitLab into a generic automation tooling, but
will also reduce the complexity of existing events-like features:
1. [Webhooks](../../../user/project/integrations/webhook_events.md)
1. [Audit Events](../../../administration/audit_event_reports.md)
1. [GitLab CI Events](https://about.gitlab.com/blog/2022/08/03/gitlab-ci-event-workflows/)
1. [Package Events](https://gitlab.com/groups/gitlab-org/-/epics/9677)
1. [GraphQL Events](https://gitlab.com/gitlab-org/gitlab/-/blob/dabf4783f5d758f69d947f5ff2391b4b1fb5f18a/app/graphql/graphql_triggers.rb)
## Goals
Build required abstractions and their implementation needed to better manage
internally and externally published events.
## Challenges
1. There is no solution allowing users to build subscribers and publishers.
1. There is no solution for managing subscriptions outside of the Ruby code.
1. There are many events-like features inside GitLab not using common abstractions.
1. Our current eventing solution `Gitlab::EventStore` is tightly coupled with Sidekiq.
1. There is no unified and resilient way to subscribe to externally published events.
1. Payloads associated with events differ a lot, similarly to how we define schemas.
1. Not all events are strongly typed, there is no solution to manage their hierarchy.
1. Events are not being versioned, it is easy to break schema contracts.
1. We want to build more features based on events, but because of missing
abstractions the value we could get from the implementations is limited.
## Proposal
### Publishers
Publishing events from within our Rails codebase is an important piece of the
proposed architecture. Events should be strongly typed, ideally using Ruby classes.
For example, we could emit events in the following way:
```ruby
include Gitlab::Events::Emittable
emit Gitlab::Events::Package::Published.new(package)
```
- Publishing events should be a non-blocking, and near zero-cost operation.
- Publishing events should take their origin and identity into the account.
- Publishing events should build their payload based on their lineage.
- `emit` can be a syntactic sugar over mechanism used in `GitLab::EventStore`.
### Subscribers
Subscribers will allow application developers to subscribe to arbitrary events,
published internally or externally. Subscribers could also allow application
developers to build subscription mechanisms that could be used by our users to,
for example, subscribe to project events to trigger pipelines.
Events that subscribers subscribe to will become contracts, hence we
should version them or use a backwards- and forward-compatible solution (like
Protobuf).
### Gateways
Gateways can be used to intercept internal and external events and change their
type, augment lineage and transform their payloads.
Gateways can be used, for example, to implement sink endpoints to intercept
Cloud Events, wrap into an internally used Ruby classes and allow developers /
users to subscribe to them.
We also may be able to implement [cross-Cell](../cells/index.md) communication through a
generic events bus implemented using Gateways.
There are also ideas around cross-instance communication to improve how GitLab
can coordinate complex deployments that involve multiple instances.
### Processing
Today in order to queue events, we either use PostgreSQL or Sidekiq. Both
mechanisms are being used interchangeably and are tightly coupled with existing
solution.
The main purpose of building an abstraction for queuing and processing is to be
able to switch to a different queuing backend when needed. For example, we
could queue some of the events on Google Pub/Sub, and send those through a
dedicated Gateway on their way back to the application.
### Observability
In order to understand interactions between events, publishers and subscribers
we may need to deliver a proper instrumentation _via_ OpenTelemetry. This will
allow us to visualize these interactions with Distributed Tracing Backends.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,283 +1,11 @@
---
status: proposed
creation-date: "2023-04-13"
authors: [ "@andrewn" ]
coach: "@grzesiek"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ml_experiments/'
remove_date: '2025-07-08'
---
# GitLab Service-Integration: AI and Beyond
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/gitlab_ml_experiments/).
This document is an abbreviated proposal for Service-Integration to allow teams
within GitLab to rapidly build new application features that leverage AI, ML,
and data technologies.
## Executive Summary
This document proposes a service-integration approach to setting up
infrastructure to allow teams within GitLab to build new application features
that leverage AI, ML, and data technologies at a rapid pace. The scope of the
document is limited specifically to internally hosted features, not third-party
APIs. The current application architecture runs most GitLab application features
in Ruby. However, many ML/AI experiments require different resources and tools,
implemented in different languages, with huge libraries that do not always play
nicely together, and have different hardware requirements. Adding all these
features to the existing infrastructure will increase the size of the GitLab
application container rapidly, resulting in slower startup times, increased
number of dependencies, security risks, negatively impacting development
velocity, and increasing complexity due to different hardware requirements. As
an alternative, the proposal suggests adding services to avoid overloading
GitLab's main workloads. These services will run independently with isolated
resources and dependencies. By adding services, GitLab can maintain the
availability and security of GitLab.com, and enable engineers to rapidly iterate
on new ML/AI experiments.
## Scope
The infrastructure, platform, and other changes related to ML/AI experiments is
broad. This blueprint is limited specifically to the following scope:
1. Production workloads, running (directly or indirectly) as a result of
requests into the GitLab application (`gitlab.com`), or an associated
subdomains (for example, `codesuggestions.gitlab.com`).
1. Excludes requests from the GitLab application, made to third-party APIs
outside of our infrastructure. From an Infrastructure point-of-view, external
AI/ML API requests are no different from other API (non ML/AI) requests and
generally follow the existing guidelines that are in place for calling
external APIs.
1. Excludes training and tuning workloads not _directly_ connected to our
production workloads. Training and tuning workloads are distinct from
production workloads and will be covered by their own blueprint(s).
## Running Production ML/AI experiment workloads
### Why Not Simply Continue To Use The Existing Application Architecture?
Let's start with some background on how the application is deployed:
1. Most GitLab application features are implemented in Ruby and run in one of
two types of Ruby deployments: broadly Rails and Sidekiq (although we do
partition this traffic further for different workloads).
1. These Ruby workloads have two main container images `gitlab-webservice-ee`
and `gitlab-sidekiq-ee`. All the code, libraries, binaries, and other
resources that we use to support the main Ruby part of the codebase are
embedded within these images.
1. There are thousands of pods running these containers in production for
GitLab.com at any moment in time. They are started up and shut down at a high
rate throughout the day as traffic demands on the site fluctuate.
1. For _most_ new features developed, any new supporting resources need to be
added to either one, or both of these containers.
![current containers](https://docs.google.com/drawings/d/e/2PACX-1vQh9ToJDy6ceKVMZxSJK5kjBjgKUKdnHcigqTz-Jte1G65aV9js5XZhCC-VYNtkJ_gnoNfob4z-DCui/pub?w=692&h=286)\
[source](https://docs.google.com/drawings/d/1RiTUnsDSkTGaMqK_RfUlCd_rQ6CgSInhfQJNewIKf1M/edit)
Many of the initial discussions focus on adding supporting resources to these
existing containers ([example](https://gitlab.com/gitlab-org/gitlab/-/issues/403630#note_1345192671)).
Choosing this approach would have many downsides, in terms of both the velocity
at which new features can be iterated on, and in terms of the availability of
GitLab.com.
Many of the AI experiments that GitLab is considering integrating into the
application are substantially different from other libraries and tools that have
been integrated in the past.
1. ML toolkits are **implemented in a plethora of languages**, each requiring
separate runtimes. Python, C, C++ are the most common, but there is a long
tail of languages used.
1. There are a very large number of tools that we're looking to integrate with and
**no single tool will support all the features that are being investigated**.
Tensorflow, PyTorch, Keras, Scikit-learn, Alpaca are just a
few examples.
1. **These libraries are huge**. Tensorflow's container image with GPU support
is 3GB, PyTorch is 5GB, Keras is 300MB. Prophet is ~250MB.
1. Many of these **libraries do not play nicely together**: they may have
dependencies that are not compatible, or require different versions of
Python, or GPU driver versions.
It's likely that in the next few months, GitLab will experiment with many
different features, using many different libraries.
Trying to deploy all of these features into the existing infrastructure would
have many downsides:
1. **The size of the GitLab application container would expand very rapidly** as
each new experiment introduces a new set of supporting libraries, each
library is as big, or bigger, than the existing GitLab application within the
container.
1. **Startup times for new workloads would increase**, potentially impacting the
availability of GitLab.com during high-traffic periods.
1. The number of dependencies within the container would increase rapidly,
putting pressure on the engineering teams to **keep ahead of exploits and vulnerabilities**.
1. **The security attack surface within the container would be greatly increased**
with each new dependency. These containers include secrets which,
if leaked via an exploit would need costly application-wide secret rotation
to be done.
1. **Development velocity will be negatively impacted** as engineers work to
avoid dependency conflicts between libraries.
1. Additionally there may be **extra complexity due to different hardware
requirements** for different libraries with appropriate drivers etc for GPUs,
TPUs, CUDA versions, etc.
1. Our Kubernetes workloads have been tuned for the existing multithreaded Ruby
request (Rails) and message (Sidekiq) processes. Adding extremely
resource-intensive applications into these workloads would affect unrelated
requests, **starving requests of CPU and memory and requiring complex tuning
to ensure fairness**. Failure to do this would impact our availability of
GitLab.com.
![fat containers](https://docs.google.com/drawings/d/e/2PACX-1vSW0Pm_7yZV-0JNmgfOHhQlvh6XsJYtrrzkPPhURf5sCbsQDKc0I0kCIbfios3ifD5tmcNvuchXSVUB/pub?w=686&h=364)
\
[source](https://docs.google.com/drawings/d/1aYffBzzea5QuZ-mTMteowefbV7VmsOuq2v4BqbPd6KE/edit)
### Proposal: Avoid Overfilling GitLab's Application Containers with Service-Integration
GitLab.com migrated to Kubernetes several years back, but for numerous good
reasons, the application architecture deployed for GitLab.com remains fairly
simple.
Instead of embedding these applications directly into the Rails and/or Sidekiq
containers, we run them as small, independent Kubernetes deployments, isolated
from the main workload.
![use services instead of fat containers](https://docs.google.com/drawings/d/e/2PACX-1vSRrPo0TNtXG8Yqj37TO2PaND9PojGZzNRs2rcTA37-vBZm5WZlfxLDCKVJD1vYHTbGy1KY1rDYHwlg/pub?w=1008&h=564)\
[source](https://docs.google.com/drawings/d/1ZPprcSYH5Oqp8T46I0p1Hhr-GD55iREDvFWcpQq9dTQ/edit)
The service-integration approach has already been used for the
[GitLab Duo Suggested Reviewers feature](https://gitlab.com/gitlab-com/gl-infra/readiness/-/merge_requests/114)
that has been deployed to GitLab.com.
This approach would have many advantages:
1. **Componentization and Replaceability**: some of these AI feature experiments
will likely be short-lived. Being able to shut them down (possibly quickly,
in an emergency, such as a security breach) is important. If they are
terminated, they are less likely to leave technical debt behind in our main
application workloads.
1. **Security Isolation**: experimental services can run with access to a
minimal set of secrets, or possibly none. Ideally, the services would be
stateless, with data being passed in, processed, and returned to the caller
without access to PostgreSQL or other data sources. In the event of a remote
code exploit or other security breach, the attacker would have limited access
to sensitive data.
1. In lieu of direct access to the main or CI Postgres clusters, services
would be provided with access to the internal GitLab API through a
predefined internal URL. The platform should provide instrumentation and
monitoring on this address.
1. In future iterations, but out of scope for the initial delivery, the
platform could facilitate automatic authentication against the internal
API, for example by managing and injecting short-lived API tokens into
internal API calls, or OIDC etc.
1. **Resource Isolation**: resource-intensive workloads would be isolated to
individual containers. OOM failures would not impact requests outside of the
experiment. CPU saturation would not slow down unrelated requests.
1. **Dependency Isolation**: different AI libraries will have conflicting
dependencies. This will not be an issue if they're run as separate services
in Kubernetes.
1. **Container Size**: the size of the main application containers is not
drastically increased, which would otherwise place a burden on the application.
1. **Distribution Team Bottleneck**: The Distribution team avoids becoming a
bottleneck as demands for many different libraries to be included in the main
application containers increase.
1. **Stronger Ownership of Workloads**: teams can better understand how their
workloads are running as they run in isolation.
However, there are several outstanding questions:
1. **Availability Requirements**: would experimental services have the same
availability requirements (and alerting requirements) as the main
application?
1. **Oncall**: would teams be responsible for handling pager alerts for their
services?
1. **Support for non-SAAS GitLab instances**: initially all experiments would
target GitLab.com, but eventually we may need to consider how to support
other instances.
1. There are three possible modes for services:
1. `M1`: GitLab.com only: only GitLab.com supports the service.
1. `M2`: SAAS-hosted for use with self-managed instance and
instance-hosted: a singular SAAS-hosted service supports self-managed
instances and GitLab.com. This is similar to the [GitLab Plus proposal](https://gitlab.com/groups/gitlab-org/-/epics/308).
1. `M3`: Instance-hosted: each instance has a copy of the service.
GitLab.com has a copy for GitLab.com. Self-managed instances host their
copy of the service. This is similar to the container registry or
Gitaly today.
1. Initially, most experiments will probably be option 1 but may be promoted
to 2 or 3 as they mature.
1. **Promotion Process**: ML/AI experimental features will need to be promoted
to non-experimental status as they mature. A process for this will need to be
established.
#### Proposed Guidelines for Building ML/AI Services
1. Avoid adding any large ML/AI libraries needed to support experimentation to
the main application.
1. Create a platform to support individual ML/AI experiments.
1. Encourage supporting services to be stateless (excluding deployed models and
other resources generated during ML training).
1. ML/AI experiment support services must not access main application
datastores, including but not limited to main PostgreSQL, CI PostgreSQL, and
main application Redis instances.
1. In the main application, client code for services should reside behind a
feature-flag toggle, for fine-grained control of the feature.
#### Technical Details
Some points, in greater detail:
##### Traffic Access
1. Ideally these services should not be exposed to external Internet traffic: only internal traffic from our existing Rails and Sidekiq workloads should be routed to them.
1. For services intended to run at `M2`: "SAAS-hosted for use with self-managed instance and instance-hosted", we would expect to migrate the service to a public endpoint once sufficient security review has been performed.
##### Platform Requirements
In order to quickly deploy and manage experiments, a minimally viable platform
will need to be provided to stage-group teams. The technical implementation
details of this platform are out of scope for this blueprint and will require
their own blueprint (to follow).
However, Service-Integration will establish certain necessary and optional
requirements that the platform will need to satisfy.
###### Ease of Use, Ownership Requirements
| ID | Required | Detail | Epic/Issue | Done? |
|---|---|---|---|---|
| `R100` | Required | The platform should be easy to use: imagine Heroku with [GitLab Production Readiness-approved](https://handbook.gitlab.com/handbook/engineering/infrastructure/production/readiness/) defaults. | [Runway to [BETA] : Increased Adoption and Self Service](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/1115) | **{dotted-circle}** No |
| `R110` | Required | With the exception of an Infrastructure-led onboarding process, services are owned, deployed and managed by stage-group teams. In other words, services follow a "You Build It, You Run It" model of ownership.| [[Paused] Discussion: Tiered Support Model for Runway](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/97) | **{dotted-circle}** No |
| `R120` | Required | Programming-language agnostic: no requirements for services. Services should be packaged as container images.| [Runway to [BETA] : Increased Adoption and Self Service](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/1115) | **{dotted-circle}** No |
| `R130` | Recommended | Each service should be evaluated against the GitLab.com [Service Maturity Model](https://handbook.gitlab.com/handbook/engineering/infrastructure/service-maturity-model/).| [Discussion: Introduce an 'Infrastructure Well-Architected Service Framework'](https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/2537) | **{dotted-circle}** No |
| `R140` | Recommended | Services using the platform have expedited production-readiness processes. {::nomarkdown}<ol><li>Production-readiness requirements graded by service maturity: low-traffic, low-maturity experimental services will have lower requirement thresholds than more mature services. </li><li> By default, the platform should provide services with defaults that would pass production-readiness review for the lowest service maturity-level. </li><li> At introduction, lowest maturity services can be deployed without production readiness, provided the meet certain automatically validated requirements. This removes Infrastructure gate-keeping from being a blocker to experimental service delivery.</li></ol>{:/} | | |
###### Observability Requirements
| ID | Required | Detail | Epic/Issue | Done? |
|---|---|---|---|---|
| `R200` | Required | The platform must provide SLIs for services out-of-the-box.{::nomarkdown}<ol><li>While it is recommended that services expose internal metrics, it is not mandatory. The platform will provide monitoring from the load-balancer. This is to speed up deployment by removing barriers to experimentation.</li><li>For services that provide internal metrics scrape endpoints, the platform must be configurable to collect these.</li><li>The platform must provide generic load-balancer level SLIs for all services. Service owners must be able to select from constructing SLIs from internal application metrics, the platform-provided external SLIs, or a combination of both.</li></ol>{:/} | [Observability: Default Metrics](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/72), [Observability: Custom Metrics](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/67) | **{check-circle}** Yes |
| `R210` | Required | Observability dashboards, rules, alerts (with per-term routing) must be generated from a manifest. | [Observability: Metrics Catalog](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/74) | **{check-circle}** Yes |
| `R220` | Required | Standardized logging infrastructure.{::nomarkdown}<ol><li>Mandate that all logging emitted from services must be Structured JSON. Text logs are permitted but not recommended.</li><li>See <a href="#common-service-libraries">Common Service Libraries</a> for more details of building common SDKs for observability.</li></ol>{:/} | [Observability: Logs in Elasticsearch for model-gateway](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/75), [Observability: Runway logs available to users](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/84) | |
###### Deployment Requirements
| ID | Required | Detail | Epic/Issue | Done? |
|---|---|---|---|---|
| `R300` | Required | No secrets stored in CI/CD. {::nomarkdown} <ol><li>Authentication with Cloud Provider Resources should be exclusively via OIDC, managed as part of the platform.</li><li> Secrets should be stored in the Infrastructure-provided Hashicorp Vault for the environment and passed to applications through files or environment variables. </li><li>Generation and management of service account tokens should be done declaratively, without manual interaction.</li></ul>{:/} | [Secrets Management](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/52) | **{dotted-circle}** No |
| `R310` | Required | Multiple environment should be supported, eg Staging and Production. | | **{check-circle}** Yes |
| `R320` | Required | The platform should be cost-effective. Kubernetes clusters should support multiple services and teams. | | |
| `R330` | Recommended | Gradual rollouts, rollbacks, blue-green deployments. | | |
| `R340` | Required | Services should be isolated from one another. | | |
| `R350` | Recommended | Services should have the ability to specify node characteristic requirements (eg, GPU). | | |
| `R360` | Required | Developers should not need knowledge of Helm, Kubernetes, Prometheus in order to deploy. All required values are configured and validated in project-hosted manifest before generating Kubernetes manifests, Prometheus rules, etc. | | |
| `R370` | | Initially services should be synchronous only - using REST or GRPC requests.{::nomarkdown}<ol><li>This does not however preclude long-running HTTP(s) requests, for example long-polling or Websocket requests.</li></ol>{:/} | | |
| `R390` | | Each service hosted in its own GitLab repository with deployment manifest stored in the repository. {::nomarkdown}<ol><li>Continuous deployments that are initiated from the CI pipeline of the corresponding GitLab repository.</li></ol>{:/} | | |
##### Security Requirements
| ID | Required | Detail | Epic/Issue | Done? |
|---|---|---|---|---|
| `R400` | | Stateful services deployed on the platform that utilize their own stateful storage (for example, custom deployed Postgres instance), must not store application security tokens, cloud-provider service keys or other long-lived security tokens in their stateful stores. | | |
| `R410` | | Long-lived shared secrets are discouraged, and should be referenced in the service manifest as such, to allow for accounting and monitoring. | | |
| `R420` | | Services using long-lived shared secrets should ensure that secret rotation can take place without downtime. {::nomarkdown}<ol><li>During a rotation, old and new generations of secrets should pass authentication, allowing gradual roll-out of new secrets.</li></ol>{:/} | | |
##### Common Service Libraries
| ID | Required | Detail | Epic/Issue | Done? |
|---|---|---|---|---|
| `R500` | Required | Experimental services would be required to adopt and use [LabKit](https://gitlab.com/gitlab-org/labkit) (for Go services), or [LabKit-Ruby](https://gitlab.com/gitlab-org/ruby/gems/labkit-ruby) for observability, context, correlation, FIPS verification, etc. {::nomarkdown}<ol><li>At present, there is no LabKit-Python library, but some experiments will run in Python, so building a library to provide observability, context, correlation services in Python will be required. </li></ol>{:/} | [Scalability: Labkit as the in-application platform toolkit](https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/2793) | |
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,162 +1,11 @@
---
status: accepted
creation-date: "2021-01-07"
authors: [ "@grzesiek" ]
coach: [ "@ayufan", "@grzesiek" ]
approvers: [ "@dsatcher", "@deuley" ]
owning-stage: "~devops::manage"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/graphql_api/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/graphql_api/).
# GraphQL API
[GraphQL](https://graphql.org/) is a data query and manipulation language for
APIs, and a runtime for fulfilling queries with existing data.
At GitLab we want to adopt GraphQL to make it easier for the wider community to
interact with GitLab in a reliable way, but also to advance our own product by
modeling communication between backend and frontend components using GraphQL.
We've recently increased the pace of the adoption by defining quarterly OKRs
related to GraphQL migration. This resulted in us spending more time on the
GraphQL development and helped to surface the need of improving tooling we use
to extend the new API.
This document describes the work that is needed to build a stable foundation that
will support our development efforts and a large-scale usage of the [GraphQL API](../../../api/graphql/index.md).
## Summary
The GraphQL initiative at GitLab [started around three years ago](https://gitlab.com/gitlab-org/gitlab/-/commit/9c6c17cbcdb8bf8185fc1b873dcfd08f723e4df5).
Most of the work around the GraphQL ecosystem has been done by volunteers that are
[GraphQL experts](https://gitlab.com/groups/gitlab-org/graphql-experts/-/group_members?with_inherited_permissions=exclude).
The [retrospective on our progress](https://gitlab.com/gitlab-org/gitlab/-/issues/235659)
surfaced a few opportunities to streamline our GraphQL development efforts and
to reduce the risk of performance degradations and possible outages that may
be related to the gaps in the essential mechanisms needed to make the GraphQL
API observable and operable at scale.
Amongst small improvements to the GraphQL engine itself we want to build a
comprehensive monitoring dashboard, that will enable team members to make sense
of what is happening inside our GraphQL API. We want to make it possible to define
SLOs, triage breached SLIs and to be able to zoom into relevant details using
Grafana and Elastic. We want to see historical data and predict future usage.
It is an opportunity to learn from our experience in evolving the REST API, for
the scale, and to apply this knowledge onto the GraphQL development efforts. We
can do that by building query-to-feature correlation mechanisms, adding
scalable state synchronization support and aligning GraphQL with other
architectural initiatives being executed in parallel, like
[the support for direct uploads](https://gitlab.com/gitlab-org/gitlab/-/issues/280819).
GraphQL should be secure by default. We can avoid common security mistakes by
building mechanisms that will help us to enforce
[OWASP GraphQL recommendations](https://cheatsheetseries.owasp.org/cheatsheets/GraphQL_Cheat_Sheet.html)
that are relevant to us.
Understanding what are the needs of the wider community will also allow us to
plan deprecation policies better and to design parity between GraphQL and REST
API that suits their needs.
## Challenges
### Make sense of what is happening in GraphQL
Being able to see how GraphQL performs in a production environment is a
prerequisite for improving performance and reliability of that service.
We do not yet have tools that would make it possible for us to answer a
question of how GraphQL performs and what the bottlenecks we should optimize
are. This, combined with the pace of GraphQL adoption and the scale at which we
expect it to operate, imposes a risk of an increased rate of production incidents
that will be difficult to resolve.
We want to build a comprehensive Grafana dashboard that will focus on
delivering insights of how GraphQL endpoint performs, while still empowering
team members with capability of zooming in into details. We want to improve
logging to make it possible to better correlate GraphQL queries with feature
using Elastic and to index them in a way that performance problems can be
detected early.
- Build a comprehensive Grafana dashboard for GraphQL
- Build a GraphQL query-to-feature correlation mechanisms
- Improve logging GraphQL queries in Elastic
- Redesign error handling on frontend to surface warnings
### Manage volatile GraphQL data structures
Our GraphQL API will evolve with time. GraphQL has been designed to make such
evolution easier. GraphQL APIs are easier to extend because of how composable
GraphQL is. On the other hand this is also a reason why versioning of GraphQL
APIs is considered unnecessary. Instead of versioning the API we want to mark
some fields as deprecated, but we need to have a way to understand what is the
usage of deprecated fields, types and a way to visualize it in a way that is
easy to understand. We might want to detect usage of deprecated fields and
notify users that we plan to remove them.
- Define a data-informed deprecation policy that will serve our users better
- Build a dashboard showing usage frequency of deprecated GraphQL fields
- Build mechanisms required to send deprecated fields usage in Service Ping
### Ensure consistency with the rest of the codebase
GraphQL is not the only thing we work on, but it cuts across the entire
application. It is being used to expose data collected and processed in almost
every part of our product. It makes it tightly coupled with our monolithic
codebase.
We need to ensure that how we use GraphQL is consistent with other mechanisms
we've designed to improve performance and reliability of GitLab.
We have extensive experience with evolving our REST API. We want to apply
this knowledge onto GraphQL and make it performant and secure by default.
- Design direct uploads for GraphQL
- Build GraphQL query depth and complexity histograms
- Visualize the amount of GraphQL queries reaching limits
- Add support for GraphQL ETags for existing features
### Design GraphQL interoperability with REST API
We do not plan to deprecate our REST API. It is a simple way to interact with
GitLab, and GraphQL might never become a full replacement of a traditional REST
API. The two APIs will need to coexist together. We will need to remove
duplication between them to make their codebases maintainable. This symbiosis,
however, is not only a technical challenge we need to resolve on the backend.
Users might want to use the two APIs interchangeably or even at the same time.
Making it interoperable by exposing a common scheme for resource identifiers is
a prerequisite for interoperability.
- Make GraphQL and REST API interoperable
- Design common resource identifiers for both APIs
### Design scalable state synchronization mechanisms
One of the most important goals related to GraphQL adoption at GitLab is using
it to model interactions between GitLab backend and frontend components. This
is an ongoing process that has already surfaced the need of building better
state synchronization mechanisms and hooking into existing ones.
- Design a scalable state synchronization mechanism
- Evaluate state synchronization through pub/sub and websockets
- Build a generic support for GraphQL feature correlation and feature ETags
- Redesign frontend code responsible for managing shared global state
## Iterations
### In the scope of the blueprint
1. [GraphQL API architecture](https://gitlab.com/groups/gitlab-org/-/epics/5842)
1. [Build comprehensive Grafana dashboard for GraphQL](https://gitlab.com/groups/gitlab-org/-/epics/5841)
1. [Improve logging of GraphQL requests in Elastic](https://gitlab.com/groups/gitlab-org/-/epics/4646)
1. [Build GraphQL query correlation mechanisms](https://gitlab.com/groups/gitlab-org/-/epics/5320)
1. [Design a better data-informed deprecation policy](https://gitlab.com/groups/gitlab-org/-/epics/5321)
### Future iterations
1. [Build a scalable state synchronization for GraphQL](https://gitlab.com/groups/gitlab-org/-/epics/5319)
1. [Add support for direct uploads for GraphQL](https://gitlab.com/gitlab-org/gitlab/-/issues/280819)
1. [Review GraphQL design choices related to security](https://gitlab.com/gitlab-org/security/gitlab/-/issues/339)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 43 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 91 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

View File

@ -1,255 +1,11 @@
---
status: proposed
creation-date: "2023-03-04"
authors: [ "@eduardobonet" ]
coach: "@shekharpatnaik"
approvers: [ "@kbychu", "@mray2020" ]
owning-stage: "~devops::data science"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/model_experiments_and_registry/'
remove_date: '2025-07-08'
---
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/model_experiments_and_registry/).
<!-- Blueprints often contain forward-looking statements -->
<!-- vale gitlab.FutureTense = NO -->
# Merge Model experiments into Model registry
## Summary
Keeping Model experiments and model registry as two different features will add
unnecessary overhead to UX and code maintenance, and we aim to merge model
experiments into model registry. This will require some changes to the data layer
and might lead to data loss depending on the chosen strategy.
## Motivation
Removing Model experiments while providing all functionality on Model registry
consolidates the user journey into a single feature without sacrificing usability.
### Goals
- Include all functionality from Model Experiments into Model registry
- Deprecate Model experiments (<https://docs.gitlab.com/ee/user/project/ml/experiment_tracking/>)
## Context
[Machine learning model experiments](../../../user/project/ml/experiment_tracking/index.md)
is a feature released in 16.2 that allows users to store model candidates and their
associated metadata into GitLab. The two main entities in Model experiments are
a candidate, a combination of training code, parameters and data, and an experiment,
a collection of comparable candidates. Model experiments are used to track evolution
of candidates within an experiment according to user defined metrics, and to manage
the metadata associated to these candidates. One of the key functionalities in Model
experiments is the compatibility layer with [MLflow client](../../../user/project/ml/experiment_tracking/mlflow_client.md),
allowing existing MLflow users to use GitLab as their new solution without
changes to their codebase.
[More context on Model experiments](https://www.youtube.com/watch?v=qC8yssVEh8A)
[Model registry](../../../user/project/ml/model_registry/index.md) is a
feature being released in 16.8 to address the follow up need of model experiments:
managing and deploying models and their versions. In its current form it is a
package registry that allows users to manage their model's metadata. In addition
to having a collection of versions, users can also create candidates within that
model. In terms of usage, a model candidate can then be promoted into a model version.
![Experiments vs Models](img/experiments_vs_models.png)
## Design and implementation details
### Data Layer changes
This blueprint proposes the following architectural changes:
- Ml::Candidates will belong to Ml::Model instead of Ml::Experiment
- All Ml::Candidates to use `ml_model` package type instead of `generic`
- When creating an experiment using the MLflow client API, an Ml::Model will be created instead of an Ml::Experiment
- Deletion of Ml::Experiments and Ml::ExperimentMetadata
Data topology changes:
| | |
|---------|---------------------------------------------|
| Before | ![img_1.png](img/data_topology_before.png) |
| Changes | ![img_1.png](img/data_topology_changes.png) |
| After | ![img.png](img/data_topology_after.png) |
### Milestone 1: Migrate Ml::Candidates packages to ml_model type
Goal:
- \[ \] All packages associated to an Ml::Candidate are of `ml_model` type
Existing candidates use the generic package type to store artifacts, while model
registry uses the new ml model type and its endpoints. These endpoints allow more domain control
over the creation of packages, and should simplify the logic behind storing
candidate artifacts. We can keep existing candidates using the generic package,
or find a way to migrate them to the new ml model package type.
As of Feb 2024, the gitlab.com database has ~38k candidates out of which ~4k have packages that need to be migrated. We
don't have much data on self managed instances, but the migration should work
for them as well.
#### Iteration 1: New candidates use the ml_model package type
This is already being done for candidates that are part of a model_version.
For others, when we identify the model registry feature flag is enabled we could
already support candidate by adding a new endpoint to [`ml_model_packages.rb`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/api/ml_model_packages.rb#L38)
#### Iteration 2: Packages for existing candidates to be migrated to ml_model
Add a migration to update all generic packages associated to a candidate to
package_type `ml_model`. These packages follow a different naming convention
(`ml_experiment_{experiment_iid}/{candidate_iid}`)
than that supported by ml_model packages (`{ml_model_name}/{semver_version}`),
but since for now they will still belong to an experiment we should allow
ml_models to accept the existing name and version.
### Milestone 2: Use Ml::Model as the parent of Ml::Candidates
Goal:
- \[ \] `experiment_id` column on Ml::Candidate is removed, and a column `model_id` is added
- \[ \] Add candidate comparison table to the model detail page
Ml::Model is currently composed of a default_experiment Ml::Experiment, with the same
name as the model, which holds the Ml::Candidates assigned to the model. We will
remove this indirection.
The only feature that Model experiments has and Model registry doesn't support
as of now is a table view to compare and sort candidates by metrics; model
registry only shows a list without much information.
### Milestone 3: Replace Ml::Experiment with Ml::Model
Goals:
- \[ \] MLflow client compat endpoints (`experiments/create`) create a model instead of an experiment
- \[ \] A new model is created for each experiment
- \[ \] All candidates are created within models
- \[ \] Ml::Candidates parent changes from an experiment to a model
Experiments is an abstraction that serves only to group candidates. But if
candidates are assigned to models, a model already plays the role of experiment
and this becomes unnecessary. In the worst case, a user can still create a
scratch model to collect candidates without the intention of promoting them,
which is the exact same as an experiment. Removing the experiment table will
simplify the codebase.
#### Step 1: Add display name to Ml::Models
Model names follow a strict naming regex which experiment names don't. We will need to implement
a Display name for models so that the original name of the experiment is kept,
but add an additional slugified version of the display name.
#### Step 2: Block creation of new experiments, only create models
Change experiment mlflow endpoints to create models instead of experiments.
#### Step 3: Migrate existing experiments and candidates to models
Create a model for every experiment, and associate existing candidates to those.
New candidates will always be associated with models. An Experiment of name `My Experiment`
will have a model of Display Name `My Experiment` and name `my_experiment`.
In this step, all candidates will be associated to a model, either the newly
created model or the model which the experiment is the default_experiment for.
A new column `model_id` needs to be added to Ml::Candidate
### Milestone 4: Cleanup
Goals:
- \[ \] Delete tables Ml::Experiments and Ml::ExperimentsMetadata
- \[ \] Delete ExperimentsController (and related helpers)
- \[ \] Delete frontend code under [`ml/experiment_tracking`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/assets/javascripts/ml/experiment_tracking/)
# Alternatives
## Keeping existing separation
Pros:
- No work required on the short term
Cons:
- Overhead in maintaining two different features that partially do the same thing
- Increase in code complexity to handle two different features
- More complex user journey
## Deprecate Model experiments without migrating data
Since Model experiments is still an early-stage feature, its data could be removed without a migration.
Pros:
- Considerably less work to be done, we can simply delete the tables.
Cons:
- Loss of early adopters confidence on testing experimental features
# Extra
Existing topology as diagram
```mermaid
erDiagram
MlExperiment ||--o{ MlCandidate : compares
MlExperiment ||--o{ MlExperimentMetadata : has
MlCandidate ||--o{ MlCandidateParam : has
MlCandidate ||--o{ MlCandidateMetric : has
MlCandidate ||--o{ MlCandidateMetadata : has
MlCandidate ||--o{ PackagesPackage : stores
MlModel ||--o{ MlExperiment : has_with_same_name
MlModel ||--o{ MlModelVersion : organizes
MlModel ||--o{ MlModelMetadata : has
MlModelVersion ||--o{ MlCandidate : has
MlModelVersion ||--o{ MlModelVersionMetadata : has
MlModelVersion ||--o{ PackagesPackage : stores
MlCandidateParam {
string name
string value
}
MlCandidateMetric {
string name
float value
int step
}
MlCandidateMetadata {
string name
string value
}
MlExperimentMetadata {
string name
string value
}
MlCandidate {
bigint id
bigint iid
string name
uuid eid
}
MlExperiment {
bigint id
bigint iid
string name
}
MlModel {
bigint id
string name
}
MlModelVersion {
bigint id
string version
}
MlModelVersionMetadata {
string name
string value
}
MlModelMetadata {
string name
string value
}
```
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,910 +1,11 @@
---
status: proposed
creation-date: "2023-03-30"
authors: [ "@pks-gitlab" ]
coach: [ ]
approvers: [ ]
owning-stage: "~devops::systems"
participating-stages: [ "~devops::create" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/object_pools/'
remove_date: '2025-07-08'
---
# Iterate on the design of object pools
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/object_pools/).
## Summary
Forking repositories is at the heart of many modern workflows for projects
hosted in GitLab. As most of the objects between a fork and its upstream project
will typically be the same, this opens up potential for optimizations:
- Creating forks can theoretically be lightning fast if we reuse much of the
parts of the upstream repository.
- We can save on storage space by deduplicating objects which are shared.
This architecture is currently implemented with object pools which hold objects
of the primary repository. But the design of object pools has organically grown
and is nowadays showing its limits.
This blueprint explores how we can iterate on the design of object pools to fix
long standing issues with it. Furthermore, the intent is to arrive at a design
that lets us iterate more readily on the exact implementation details of object
pools.
## Motivation
The current design of object pools is showing problems with scalability in
various different ways. For a large part the problems come from the fact that
object pools have organically grown and that we learned as we went by.
It is proving hard to fix the overall design of object pools because there is no
clear ownership. While Gitaly provides the low-level building blocks to make
them work, it does not have enough control over them to be able to iterate on
their implementation details.
There are thus two major goals: taking ownership of object pools so that it
becomes easier to iterate on the design, and fixing scalability issues once we
can iterate.
### Lifecycle ownership
While Gitaly provides the interfaces to manage object pools, the actual life
cycle of them is controlled by the client. A typical lifecycle of an object pool
looks as following:
1. An object pool is created via `CreateObjectPool()`. The caller provides the
path where the object pool shall be created as well as the origin repository
from which the repository shall be created.
1. The origin repository needs to be linked to the object pool explicitly by
calling `LinkRepositoryToObjectPool()`.
1. The object pool needs to be regularly updated via `FetchIntoObjectPool()`
that fetches all changes from the primary pool member into the object pool.
1. To create forks, the client needs to call `CreateFork()` followed by
`LinkRepositoryToObjectPool()`.
1. Repositories of forks are unlinked by calling `DisconnectGitAlternates()`.
This will reduplicate objects.
1. The object pool is deleted via `DeleteObjectPool()`.
This lifecycle is complex and leaks a lot of implementation details to the
caller. This was originally done in part to give the Rails side control and
management over Git object visibility. GitLab project visibility rules are
complex and not a Gitaly concern. By exposing these details Rails can control
when pool membership links are created and broken. It is not clear at the
current point in time how the complete system works and its limits are not
explicitly documented.
In addition to the complexity of the lifecycle we also have multiple sources of
truth for pool membership. Gitaly never tracks the set of members of a pool
repository but can only tell for a specific repository that it is part of said
pool. Consequently, Rails is forced to maintain this information in a database,
but it is hard to maintain that information without becoming stale.
### Repository maintenance
Related to the lifecycle ownership issues is the issue of repository
maintenance. As mentioned, keeping an object pool up to date requires regular
calls to `FetchIntoObjectPool()`. This is leaking implementation details to the
client, but was done to give the client control over syncing the primary
repository with its object pool. With this control, private repositories can be
prevented from syncing and consequently leaking objects to other repositories in
the fork network.
We have had good success with moving repository maintenance into Gitaly so that
clients do not need to know about on-disk details. Ideally, we would do the same
for repositories that are the primary member of an object pool: if we optimize
its on-disk state, we will also automatically update the object pool.
There are two issues that keep us from doing so:
- Gitaly does not know about the relationship between an object pool and its
members.
- Updating object pools is expensive.
By making Gitaly the single source of truth for object pool memberships we would
be in a position to fix both issues.
### Fast forking
In the current implementation, Rails first invokes `CreateFork()` which results
in a complete `git-clone(1)` being performed to generate the fork repository.
This is followed by `LinkRepositoryToObjectPool()` to link the fork with the
object pool. It is not until housekeeping is performed on the fork repository
that objects are deduplicated. This is not only leaking implementation details
to clients, but it also keeps us from reaping the full potential benefit of
object pools.
In particular, creating forks is a lot slower than it could be since a clone is
always performed before linking. If the steps of creating the fork and linking
the fork to the pool repository were unified, the initial clone could be
avoided.
### Clustered object pools
Gitaly Cluster and object pools development overlapped. Consequently they are
known to not work well together. Praefect does neither ensure that repositories
with object pools have their object pools present on all nodes, nor does it
ensure that object pools are in a known state. If at all, object pools only work
by chance.
The current state has led to cases where object pools were missing or had
different contents per node. This can result in inconsistently observed state in
object pool members and writes that depend on the object pool's contents to
fail.
One way object pools might be handled for clustered Gitaly could be to have the
pool repositories duplicated on nodes that contain repositories dependent on
them. This would allow members of a fork network to exist of different nodes. To
make this work, repository replication would have to be aware of object pools
and know when it needs to duplicate them onto a particular node.
## Requirements
There are a set of requirements and invariants that must be given for any
particular solution.
### Private upstream repositories should not leak objects to forks
When a project has a visibility setting that is not public, the objects in the
repository should not be fetched into an object pool. An object pool should only
ever contain objects from the upstream repository that were at one point public.
This prevents private upstream repositories from having objects leaked to forks
through a shared object pool.
### Forks cannot sneak objects into upstream projects
It should not be possible to make objects uploaded in a fork repository
accessible in the upstream repository via a shared object pool. Otherwise
potentially unauthorized users would be able to "sneak in" objects into
repositories by simply forking them.
Despite leading to confusion, this could also serve as a mechanism to corrupt
upstream repositories by introducing objects that are known to be broken.
### Object pool lifetime exceeds upstream repository lifetime
If the upstream repository gets deleted, its object pool should remain in place
to provide continued deduplication of shared objects between the other
repositories in the fork network. Thus it can be said that the lifetime of the
object pool is longer than the lifetime of the upstream repository. An object
pool should only be deleted if there are no longer any repositories referencing
it.
### Object lifetime
By deduplicating objects in a fork network, repositories become dependent on the
object pool. Missing objects in the pooled repository could lead to corruption
of repositories in the fork network. Therefore, objects in the pooled repository
must continue to exist as long as there are repositories referencing them.
Without a mechanism to accurately determine if a pooled object is referenced
by one or more repositories, all objects in the pooled repository must remain.
Only when there are no repositories referencing the object pool can the pooled
repository, and therefore all its objects, be removed.
### Object sharing
An object that is deduplicated will become accessible from all forks of a
particular repository, even if it has never been reachable in any of the forks.
The consequence is that any write to an object pool immediately influences all
of its members.
We need to be mindful of this property when repositories connected to an object
pool are replicated. As the user-observable state should be the same on all
replicas, we need to ensure that both the repository and its object pool are
consistent across the different nodes.
## Proposal
In the current design, management of object pools mostly happens on the client
side as they need to manage their complete lifecycle. This requires Rails to
store the object pool relationships in the Rails database, perform fine-grained
management of every single step of an object pool's life, and perform periodic
Sidekiq jobs to enforce state by calling idempotent Gitaly RPCs. This design
significantly increases complexity of an already-complex mechanism.
Instead of handling the full lifecycle of object pools on the client-side, this
document proposes to instead encapsulate the object pool lifecycle management
inside of Gitaly. Instead of performing low-level actions to maintain object
pools, clients would only need to tell Gitaly about updated relationships
between a repository and its object pool.
This brings us multiple advantages:
- The inherent complexity of the lifecycle management is encapsulated in a
single place, namely Gitaly.
- Gitaly is in a better position to iterate on the low-level technical design of
object pools in case we find a better solution compared to "alternates" in the
future.
- We can ensure better interplay between Gitaly Cluster, object pools and
repository housekeeping.
- Gitaly becomes the single source of truth for object pool relationships and
can thus start to manage it better.
Overall, the goal is to raise the abstraction level so that clients need to
worry less about the technical details while Gitaly is in a better position to
iterate on them.
### Move lifecycle management of pools into Gitaly
The lifecycle management of object pools is leaking too many details to the
client, and by doing so makes things both hard to understand and
inefficient.
The current solution relies on a set of fine-grained RPCs that manage the
relationship between repositories and their object pools. Instead, we are aiming
for a simplified approach that only exposes the high-level concept of forks to
the client. This will happen in the form of three RPCs:
- `ForkRepository()` will create a fork of a given repository. If the upstream
repository does not yet have an object pool, Gitaly will create it. It will
then create the new repository and automatically link it to the object pool.
The upstream repository will be recorded as primary member of the object pool,
the fork will be recorded as a secondary member of the object pool.
- `UnforkRepository()` will remove a repository from the object pool it is
connected to. This will stop deduplication of objects. For the primary object
pool member this also means that Gitaly will stop pulling new objects into the
object pool.
- `GetObjectPool()` returns the object pool for a given repository. The pool
description will contain information about the pool's primary object pool
member as well as all secondary object pool members.
Furthermore, the following changes will be implemented:
- `RemoveRepository()` will remove the repository from its object pool. If it
was the last object pool member, the pool will be removed.
- `OptimizeRepository()`, when executed on the primary object pool member, will
also update and optimize the object pool.
- `ReplicateRepository()` needs to be aware of object pools and replicate them
correctly. Repositories shall be linked to and unlink from object pools as
required. While this is a step towards fixing the Praefect world, which may
seem redundant given that we plan to deprecate Praefect anyway, this RPC call
is also used for other use cases like repository rebalancing.
With these changes, Gitaly will have much tighter control over the lifecycle of
object pools. Furthermore, as it starts to track the membership of repositories
in object pools it can become the single source of truth for fork networks.
### Fix inefficient maintenance of object pools
In order to update object pools, Gitaly performs a fetch of new objects from the
primary object pool member into the object pool. This fetch is inefficient as it
needs to needlessly negotiate objects that are new in the primary object pool
member. But given that objects are deduplicated already in the primary object
pool member it means that it should only have objects in its object database
that do not yet exist in the object pool. Consequently, we should be able to
skip the negotiation completely and instead link all objects into the object
pool that exist in the source repository.
In the current design, these objects are kept alive by creating references to
the just-fetched objects. If the fetch deleted references or force-updated any
references, then it may happen that previously-referenced objects become
unreferenced. Gitaly thus creates keep-around references so that they cannot
ever be deleted. Furthermore, those references are required in order to properly
replicate object pools as the replication is reference-based.
These two things can be solved in different ways:
- We can set the `preciousObjects` repository extension. This will instruct all
versions of Git which understand this extension to never delete any objects
even if `git-prune(1)` or similar commands were executed. Versions of Git that
do not understand this extension would refuse to work in this repository.
- Instead of replicating object pools via `git-fetch(1)`, we can instead
replicate them by sending over all objects part of the object database.
Taken together this means that we can stop writing references in object pools
altogether. This leads to efficient updates of object pools by simply linking
all new objects into place, and it fixes issues we have seen with unbounded
growth of references in object pools.
## Design and implementation details
### Moving lifecycle management of object pools into Gitaly
As stated, the goal is to move the ownership of object pools into Gitaly.
Ideally, the concept of object pools should not be exposed to callers at all
anymore. Instead, we want to only expose the higher-level concept of networks of
repositories that share objects with each other in order to deduplicate them.
The following subsections review the current object pool-based architecture and
then propose the new object deduplication network-based architecture.
#### Object pool-based architecture
Managing the object pool lifecycle in the current architecture requires a
plethora of RPC calls and requires a lot of knowledge from the calling side. The
following sequence diagram shows a simplified version of the lifecycle of an
object pool. It is simplified insofar as we only consider there to be a single
object pool member.
```mermaid
sequenceDiagram
Rails->>+Gitaly: CreateObjectPool
Gitaly->>+Object Pool: Create
activate Object Pool
Object Pool-->>-Gitaly: Success
Gitaly-->>-Rails: Success
Rails->>+Gitaly: LinkRepositoryToObjectPool
Gitaly->>+Upstream: Link
Upstream-->>-Gitaly: Success
Gitaly-->>-Rails: Success
Rails->>+Gitaly: OptimizeRepository
Gitaly->>+Upstream: Optimize
Upstream-->>-Gitaly: Success
Gitaly-->>-Rails: Success
Rails->>+Gitaly: CreateFork
Gitaly->>+Fork: Create
activate Fork
Fork-->>-Gitaly: Success
Gitaly-->>-Rails: CreateFork
note over Rails, Fork: Fork exists but is not connected to the object pool.
Rails->>+Gitaly: LinkRepositoryToObjectPool
Gitaly->>+Fork: Link
Fork-->>-Gitaly: Success
Gitaly-->>-Rails: Success
note over Rails, Fork: Fork is connected to object pool, but objects are duplicated.
Rails->>+Gitaly: OptimizeRepository
Gitaly->>+Fork: Optimize
Fork-->>-Gitaly: Success
Gitaly-->>-Rails: Success
loop Regularly
note over Rails, Fork: Rails needs to ensure that the object pool is regularly updated.
Rails->>+Gitaly: FetchIntoObjectPool
Gitaly->>+Object Pool: Fetch
Object Pool-->>-Gitaly: Success
Gitaly-->>-Rails: Success
end
alt Disconnect Fork
note over Rails, Fork: Forks can be disconnected to stop deduplicating objects.
Rails->>+Gitaly: DisconnectGitAlternates
Gitaly->>+Fork: Disconnect
Fork-->>-Gitaly: Success
Gitaly-->>-Rails: Success
else Delete Fork
note over Rails, Fork: Or the fork is deleted eventually.
Rails->>+Gitaly: RemoveRepository
Gitaly->>+Fork: Remove
Fork-->>-Gitaly: Success
deactivate Fork
Gitaly-->>-Rails: Success
end
Rails->>+Gitaly: DisconnectGitAlternates
Gitaly->>+Upstream: Disconnect
Upstream-->>-Gitaly: Success
Gitaly-->>-Rails: Success
Rails->>+Gitaly: DeleteObjectPool
Gitaly->>+Object Pool: Remove
Object Pool-->>-Gitaly: Success
deactivate Object Pool
Gitaly-->>-Rails: Success
```
The following steps are involved in creating the object pool:
1. The object pool is created from its upstream repository by calling
`CreateObjectPool()`. It contains all the objects that the upstream
repository contains at the time of creation.
1. The upstream repository is linked to the object pool by calling
`LinkRepositoryToObjectPool()`. Its objects are not automatically
deduplicated.
1. Objects in the upstream repository get deduplicated by calling
`OptimizeRepository()`.
1. The fork is created by calling `CreateFork()`. This RPC call only takes the
upstream repository as input and does not know about the already-created
object pool. It thus performs a second full copy of objects.
1. Fork and object pool are linked by calling `LinkRepositoryToObjectPool()`.
This writes the `info/alternates` file in the fork so that it becomes
aware of the additional object database, but doesn't cause the objects to
become deduplicated.
1. Objects in the fork get deduplicated by calling `OptimizeRepository()`.
1. The calling side is now expected to regularly call `FetchIntoObjectPool()` to
fetch new objects from the upstream repository into the object pool. Fetched
objects are not automatically deduplicated in the upstream repository.
1. The fork can be detached from the object pool in two ways:
- Explicitly by calling `DisconnectGitAlternates()`, which removes the
`info/alternates` file and reduplicates all objects.
- By calling `RemoveRepository()` to delete the fork altogether.
1. When the object pool is empty, it must be removed by calling
`DeleteObjectPool()`.
It is clear that the whole lifecycle management is not well-abstracted and that
the clients need to be aware of many of its intricacies. Furthermore, we have
multiple sources of truth for object pool memberships that can (and in practice
do) diverge.
#### Object deduplication network-based architecture
The proposed new architecture simplifies this process by completely removing the
notion of object pools from the public interface. Instead, Gitaly exposes the
high-level notion of "object deduplication networks". Repositories can join
these networks with one of two roles:
- Read-write object deduplication network members regularly update the set of
objects that are part of the object deduplication network.
- Read-only object deduplication network members are passive members and never
update the set of objects that are part of the object deduplication network.
The set of objects that can be deduplicated across members of the object
deduplication network thus consists only of objects fetched from the read-write
members. All members benefit from the deduplication regardless of their role.
Typically:
- The original upstream repository is designated as the read-write member of
the object deduplication network.
- Forks are read-only object deduplication network members.
It is valid for object deduplication networks to only have read-only members.
In that case the network is not updated with new shared objects, but the
existing shared objects remain in use.
Though object pools continue to be the underlying mechanism, the higher level of
abstraction would allow us to swap out the mechanism if we ever decide to do so.
While clients of Gitaly need to perform fine-grained lifecycle management of
object pools in the object pool-based architecture, the object deduplication
network-based architecture only requires them to manage memberships of object
deduplication networks. The following diagram shows the equivalent flow to the
object pool-based architecture in the object deduplication network-based
architecture:
```mermaid
sequenceDiagram
Rails->>+Gitaly: CreateFork
Gitaly->>+Object Pool: Create
activate Object Pool
Object Pool-->>-Gitaly: Success
Gitaly->>+Fork: Create
Fork->>+Object Pool: Join
Object Pool-->>-Fork: Success
Fork-->>-Gitaly: Success
activate Fork
Gitaly-->>-Rails: CreateFork
loop Regularly
Rails->>+Gitaly: OptimizeRepository
Gitaly->>+Fork: Optimize
Gitaly->>+Object Pool: Optimize
Object Pool-->>-Gitaly: Success
Fork-->>-Gitaly: Success
Gitaly-->>-Rails: Success
end
alt Disconnect Fork
Rails->>+Gitaly: RemoveRepositoryFromObjectDeduplicationNetwork
Gitaly->>+Fork: Disconnect
alt Last member
Gitaly->>+Object Pool: Remove
Object Pool-->>-Gitaly: Success
end
Fork-->>-Gitaly: Success
Gitaly-->>-Rails: Success
else Delete Fork
Rails->>+Gitaly: RemoveRepository
Gitaly->>+Fork: Remove
alt Last member
Gitaly->>+Object Pool: Remove
Object Pool-->>-Gitaly: Success
end
Fork-->>-Gitaly: Success
deactivate Fork
Gitaly-->>-Rails: Success
end
```
The following major steps are involved:
1. The fork is created, where the request instructs Gitaly to have both
upstream and fork repository join an object deduplication network. If the
upstream project is part of an object deduplication network already, then
the fork joins that object deduplication network. If it isn't, Gitaly
creates an object pool and joins the upstream repository as a read-write
member and the fork as a read-only member. Objects of the fork are
immediately deduplicated. Gitaly records the membership of both repositories
in the object pool.
1. The client regularly calls `OptimizeRepository()` on either the upstream or
the fork project, which is something that clients already know to do. The
behavior changes depending on the role of the object deduplication network
member:
- When executed on a read-write object deduplication network member, the
object pool may be updated based on a set of heuristics. This will pull
objects which have been newly created in the read-write object
deduplication network member into the object pool so that they are
available for all members in the object deduplication network.
- When executed on a read-only object deduplication network member, the
object pool will not be updated so that objects which are only part of the
read-only object deduplication network member will not get shared across
members. The object pool may still be optimized though as required, for
example by repacking objects.
1. Both the upstream and the fork project can leave the object deduplication
network by calling `RemoveRepositoryFromObjectDeduplicationNetwork()`. This reduplicates
all objects and disconnects the repositories from the object pool.
Furthermore, if the repository was a read-write object deduplication network
member, Gitaly will stop using it as a source to update the pool.
Alternatively, the fork can be deleted with a call to `RemoveRepository()`.
Both calls update the memberships of the object pool to reflect that
repositories have left it. Gitaly deletes the object pool if it has no
members left.
With this proposed flow the creation, maintenance, and removal of object pools
is handled opaquely inside of Gitaly. In addition to the above, two more
supporting RPCs may be provided:
- `AddRepositoryToObjectDeduplicationNetwork()` to let a preexisting repository
join into an object deduplication network with a specified role.
- `ListObjectDeduplicationNetworkMembers()` to list all members and their roles
of the object deduplication network that a repository is a member of.
#### Migration to the object deduplication network-based architecture
Migration towards the object deduplication network-based architecture involves
a lot of small steps:
1. `CreateFork()` starts automatically linking against preexisting object
pools. This allows fast forking and removes the notion of object pools for
callers when creating a fork.
1. Introduce `AddRepositoryToObjectDeduplicationNetwork()` and
`RemoveRepositoryFromObjectDeduplicationNetwork()`. Deprecate
`AddRepositoryToObjectPool()` and `DisconnectGitAlternates()` and migrate
Rails to use the new RPCs. The object deduplication network is identified
via a repository, so this drops the notion of object pools when handling
memberships.
1. Start recording object deduplication network memberships in `CreateFork()`,
`AddRepositoryToObjectDeduplicationNetwork()`,
`RemoveRepositoryFromObjectDeduplicationNetwork()` and `RemoveRepository()`.
This information empowers Gitaly to take control over the object pool
lifecycle.
1. Implement a migration so that we can be sure that Gitaly has an up-to-date
view of all members of object pools. A migration is required so that Gitaly
can automatically handle the lifecycle of an object pool, which:
- Enables `OptimizeRepository()` to automatically fetch objects from
read-write object pool members.
- Allows Gitaly to automatically remove empty object pools.
1. Change `OptimizeRepository()` so that it also optimizes object pools
connected to the repository, which allows us to deprecate and eventually
remove `FetchIntoObjectPool()`.
1. Adapt `RemoveRepositoryFromObjectDeduplicationNetwork()` and
`RemoveRepository()` to remove empty object pools.
1. Adapt `CreateFork()` to automatically create object pools, which allows us
to remove the `CreateObjectPool()` RPC.
1. Remove the `ObjectPoolService` and the notion of object pools from the Gitaly
public API.
This plan is of course subject to change.
### Gitaly Cluster concerns
#### Repository creation
When a repository is forked for the first time, Rails creates an object pool via
the `CreateObjectPool()` RPC. This means object pool creation is handled outside
of Gitaly. Subsequently, the object pool is linked to upstream and fork
repositories. When a repository has its Git `alternates` file configured to link
to another repository, these two repositories must exist on the same physical
storage.
The repository and its object pool existing on the same physical storage is
particularly important for Praefect because it is dependent on the repository's
replication factor. A replication factor is a configuration that controls how
many storages the repository is replicated to in the Praefect virtual storage.
By default, the replication factor is equal to the number of storages in
Praefect. This means that when using the default replication factor, a
repository is available on all storages in the cluster. When a custom
replication factor is used, the number of replicas can be reduced so that a
repository only exists on a subset of storages in Praefect.
Gitaly Cluster persists repositories and their assigned storages in the Praefect
PostgreSQL database. The database is updated when new repositories are created
on the virtual storage. When a new repository is created, the replication factor
specifies how many storages are randomly assigned to the repository. The
following scenario outlines how a custom replication factor can be problematic
for object pools:
1. A new repository is created in a Gitaly Cluster that has five storage nodes.
The replication factor is set to three. Therefore, three storages are
randomly selected and assigned for this new repository in Praefect. For
example, the assignments are storages 1, 2, and 3. Storages 4 and 5 do not
have a copy of this repository.
1. The repository gets forked for the first time thus requiring an object pool
repository be created with the `CreateObjectPool()` RPC. Because the
replication factor is set to three, another randomly selected set of three
storages are assigned in Praefect for the new object pool repository. For
example, the object pool repository is assigned to storages 3, 4, and 5. Note
that these assignments do not entirely match the upstream repository's.
1. The forked copy of the repository gets created with the `CreateFork()` RPC
and is also assigned to three randomly-selected storages. For example, the
fork repository gets assigned storages 1, 3, and 5. These assignments also do
not entirely match the upstream and object pool repository's storage
assignments.
1. Both the upstream and fork repositories are linked to the object pool via
separate invocations of `LinkRepositoryToObjectPool()`. For this RPC to
succeed the object pool must exist on the same storage as the repository
that is linking to it. The upstream repository fails to link on storages 1
and 2. The fork repository fails to link on storage 2. The
`LinkRepositoryToObjectPool()` RPC is not transactional so a single failure
of the RPC on any of the storages results in an error being proxied back to
the client. Therefore, in this scenario `LinkRepositoryToObjectPool()` on
both the upstream and fork repository always result in an error response.
To fix this problem, we must ensure Praefect always routes `CreateObjectPool()`
and `CreateFork()` RPC requests to the same set of storages as the upstream
repository. This ensures that these repositories always have the required object
pool repository available so that linking to them can succeed.
The main downside of this is that repositories in an object deduplication
network are pinned to the same set of storages. This could unevenly stress
individual storages as an object deduplication network grows larger. In the
future this can be avoided altogether when Praefect has the ability to create
object pools on a storage where it is required but not already present.
#### Repository replication
The `ReplicateRepository()` RPC is not aware of object pools and only replicates
from the source repository. This means that replication of a source repository
linked to an object pool repository results in a target repository with no Git
`alternates` file and consequently no deduplication of objects.
The `ReplicateRepository()` RPC has two main uses:
- Storage moves performed in the GitLab API rely on the `ReplicateRepository()`
RPC to replicate repositories from one storage to another. Since this RPC is
currently not object pool aware, the resulting replica on the target storage
does not replicate the Git `alternates` file from the source repository or
recreate any object pools. Instead, the replica is always a complete
self-contained copy of the source repository. Consequently, the object pool
relationship for the repository project in Rails is also removed. When moving
repositories in an object deduplication network from one storage to another,
the replicated repositories can result in increased storage usage because
there is no longer any deduplication of objects on the target storage.
- When a repository replica becomes outdated in Praefect, the
`ReplicateRepository()` RPC is internally used by Praefect replication jobs to
replicate over the out-of-date replica from an up-to-date replica. Replication
jobs are queued by the Praefect replication manager when replicas become
outdated. Though the `ReplicateRepository()` RPC is not aware of object pools,
the replication job checks if the source repository is linked to an object
pool. If the source repository is linked, the job recreates the corresponding
Git `alternates` file on the target repository. However, it is currently
possible for an object pool to not exist on the same storage as the replica.
When this happens, replication always fails because the replica is unable to
link to the non-existent object pool. This means it is possible for replicas
to remain outdated permanently.
Object pools required by the source repository should be replicated to the
target storage along with the repository during the `ReplicateRepository()` RPC.
This preserves object deduplication for repositories in an object deduplication
network. Because storage moves performed in the GitLab API remove any object
pool relationships, recreating object pools on the target storage results in
orphaned object pools. This new object pool replication behavior of the
`ReplicateRepository()` RPC should be controlled by the client to prevent
breaking changes. Object pool replication for storage moves can be enabled once
either:
- The Rails side is updated to preserve the object pool relationship.
- The object pool lifecycle is managed within Gitaly.
When it comes to replication of object pools, there are scenarios Praefect needs
to be capable of handling. Special consideration must be made in these cases so
Praefect can keep track of all the repositories it manages in its PostgreSQL
database to ensure they stay up to date.
- Replication of an external source repository that is linked to an object pool
to Gitaly Cluster can result in the target virtual storage's Praefect needing
to create a new object pool repository. To handle this, it needs to be known
if the source repository is using an object pool. From there it can be checked
if Praefect has an entry for the object pool repository in its `repositories`
database table, and if not, create one. Next, Praefect storage assignments for
the object pool need to be generated and persisted in the
`repository_assignments` database table.
- It cannot be guaranteed that the target repository storage in Praefect
already contains the required object pool. Thus an individual storage may need
to have an object pool assigned to it. This new assignment must also be
tracked by the object pool repository in Praefect. To handle this, Praefect
has to detect when a target storage does not contain the required object pool
and persist the new storage assignment in the `repository_assignments`
database table.
## Problems with the design
As mentioned before, object pools are not a perfect solution. This section goes
over the most important issues.
### Complexity of lifecycle management
Even though the lifecycle of object pools becomes easier to handle once it is
fully owned by Gitaly, it is still complex and needs to be considered in many
ways. Handling object pools in combination with their repositories is not an
atomic operation as any action by necessity spans over at least two different
resources.
### Performance issues
As object pools deduplicate objects, the end result is that object pool members
never have the full closure of objects in a single packfile. This is not
typically an issue for the primary object pool member, which by definition
cannot diverge from the object pool's contents. But secondary object pool
members can and often will diverge from the original contents of the upstream
repository.
This leads to two different sets of reachable objects in secondary object pool
members. Unfortunately, due to limitations in Git itself, this precludes the use
of a subset of optimizations:
- Packfiles cannot be reused as efficiently when serving fetches to serve
already-deltified objects. This requires Git to recompute deltas on the fly
for object pool members which have diverged from object pools.
- Packfile bitmaps can only exist in object pools as it is not possible nor
easily feasible for these bitmaps to cover multiple object databases. This
requires Git to traverse larger parts of the object graph for many operations
and especially when serving fetches.
### Dependent writes across repositories
The design of object pools introduces significant complexity into the Raft world
where we use a write-ahead log for all changes to repositories. In the ideal
case, a Raft-based design would only need to care about the write-ahead log of a
single repository when considering requests. But with object pools, we are
forced to consider both reads and writes for a pooled repository to be dependent
on all writes in its object pool having been applied.
## Alternative Solutions
The proposed solution is not obviously the best choice as it has issues both
with complexity (management of the lifecycle) and performance (inefficiently
served fetches for pool members).
This section explores alternatives to object pools and why they have not been
chosen as the new target architecture.
### Stop using object pools altogether
An obvious way to avoid all of the complexity is to stop using object pools
altogether. While it is charming from an engineering point of view as we can
significantly simplify the architecture, it is not a viable approach from the
product perspective as it would mean that we cannot support efficient forking
workflows.
### Primary repository as object pool
Instead of creating an explicit object pool repository, we could just use the
upstream repository as an alternate object database of all forks. This avoids a
lot of complexity around managing the lifetime of the object pool, at least
superficially. Furthermore, it circumvents the issue of how to update object
pools as it will always match the contents of the upstream repository.
It has a number of downsides though:
- Repositories can now have different states, where some of the
repositories are allowed to prune objects and others aren't. This introduces a
source of uncertainty and makes it easy to accidentally delete objects in a
repository and thus corrupt its forks.
- When upstream repositories go private we must stop updating objects which are
supposed to be deduplicated across members of the fork network. This means
that we would ultimately still be forced to create object pools once this
happens in order to freeze the set of deduplicated objects at the point in
time where the repository goes private.
- Deleting repositories becomes more complex as we need to take into account
whether a repository is linked to by forks.
### Reference namespaces
With `gitnamespaces(7)`, Git provides a mechanism to partition references into
different sets of namespaces. This allows us to serve all forks from a single
repository that contains all objects.
One neat property is that we have the global view of objects referenced by all
forks together in a single object database. We can thus easily perform shared
housekeeping across all forks at once, including deletion of objects that are
not used by any of the forks anymore. Regarding objects, this is likely to be
the most efficient solution we could potentially aim for.
There are again some downsides though:
- Calculating usage quotas must by necessity take actual reachability of objects
into account, which is expensive to compute. This is not a showstopper, but
something to keep in mind.
- One stated requirement is that it must not be possible to make objects
reachable in other repositories from forks. This property could theoretically
be enforced by only allowing access to reachable objects. That way an object
can only be accessed through the virtual repository if the object is reachable
from its references. Reachability checks are too compute-heavy for this to be practical.
- Even though references are partitioned, large fork networks would still easily
end up with multiple millions of references. It is unclear what the impact on
performance would be.
- The blast radius for any repository-level attacks significantly increases as
you would not only impact your own repository, but also all forks.
- Custom hooks would have to be isolated for each of the virtual repositories.
Since the execution of Git hooks is controlled, it should be possible to handle
this for each of the namespaces.
### Filesystem-based deduplication
The idea of deduplicating objects on the filesystem level was floating around at
several points in time. While it would be nice if we could shift the burden of
this to another component, it is likely not easy to implement due to the nature
of how Git works.
The most important contributing factor to repository sizes are Git objects.
While it would be possible to store the objects in their loose representation
and thus deduplicate on that level, this is infeasible:
- Git would not be able to deltify objects, which is an extremely important
mechanism to reduce on-disk size. It is unlikely that the size reduction
caused by deduplication would outweigh the size reduction gained from the
deltification mechanism.
- Loose objects are significantly less efficient when accessing the repository.
- Serving fetches requires us to send a packfile to the client. Usually, Git is
able to reuse large parts of already-existing packfiles, which significantly
reduces the computational overhead.
Deduplicating on the loose-object level is thus infeasible.
The other unit that one could try to deduplicate is packfiles. But packfiles are
not deterministically generated by Git and will furthermore be different once
repositories start to diverge from each other. So packfiles are not a natural
fit for filesystem-level deduplication either.
An alternative could be to use hard links of packfiles across repositories. This
would cause us to duplicate storage space whenever any repository decides to
perform a repack of objects and would thus be unpredictable and hard to manage.
### Custom object backend
In theory, it would be possible to implement a custom object backend that allows
us to store objects in such a way that we can deduplicate them across forks.
There are several technical hurdles though that keep us from doing so without
significant upstream investments:
- Git is not currently designed to have different backends for objects. Accesses
to files part of the object database are littered across the code base with no
abstraction level. This is in contrast to the reference database, which has at
least some level of abstraction.
- Implementing a custom object backend would likely necessitate a fork of the
Git project. Even if we had the resources to do so, it would introduce a major
risk factor due to potential incompatibilities with upstream changes. It would
become impossible to use vanilla Git, which is often a requirement that exists
in the context of Linux distributions that package GitLab.
Both the initial and the operational risk of ongoing maintenance are too high to
really justify this approach for now. We might revisit this approach in the
future.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,204 +1,11 @@
---
status: accepted
creation-date: "2021-11-18"
authors: [ "@nolith" ]
coach: "@glopezfernandez"
approvers: [ "@marin" ]
owning-stage: "~devops::data stores"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/object_storage/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/object_storage/).
# Object storage: `direct_upload` consolidation
## Abstract
GitLab stores three classes of user data: database records, Git
repositories, and user-uploaded files (which are referred to as
file storage throughout the blueprint).
The user and contributor experience for our file
storage has room for significant improvement:
- Initial GitLab setup experience requires creation and setup of 13
buckets, instead of just 1.
- Features using file storage require contributors to think about both local
storage and object storage, which leads to friction and
complexity. This often results in broken features and security issues.
- Contributors who work on file storage often also have to write code
for Workhorse, Omnibus, and cloud native GitLab (CNG).
## Problem definition
Object storage is a fundamental component of GitLab, providing the
underlying implementation for shared, distributed, highly-available
(HA) file storage.
Over time, we have built support for object storage across the
application, solving specific problems in a
[multitude of iterations](https://handbook.gitlab.com/handbook/company/working-groups/object-storage/#company-efforts-on-uploads).
This has led to increased complexity across the board, from development
(new features and bug fixes) to installation:
- New GitLab installations require the creation and configuration of
several object storage buckets instead of just one, as each group of
features requires its own. This has an impact on the installation
experience and new feature adoption, and takes us further away from
boring solutions.
- The release of cloud native GitLab required the removal of NFS
shared storage and the development of direct upload, a feature that
was expanded, milestone after milestone, to several types of uploads,
but never enabled globally.
- Today, GitLab supports both local storage and object storage. Local
storage only works on single box installations or with a NFS, which
[we no longer recommend](../../../administration/nfs.md) to our
users and is no longer in use on GitLab.com.
- Understanding all the moving parts and the flow is extremely
complicated: we have CarrierWave, Fog, Go S3/Azure SDKs, all
being used, and that complicates testing as well.
- Fog and CarrierWave are not maintained to the level of the native
SDKs (for example, AWS S3 SDK), so we have to maintain or monkey
patch those tools to support requested customer features
(for example, [issue #242245](https://gitlab.com/gitlab-org/gitlab/-/issues/242245))
that would normally be "free".
- In many cases, we copy around object storage files needlessly
(for example, [issue #285597](https://gitlab.com/gitlab-org/gitlab/-/issues/285597)).
Large files (for example, LFS and packages) are slow to finalize or don't work
at all as a result.
## Improvements over the current situation
The following is a brief description of the main directions we can take to
remove the pain points affecting our object storage implementation.
This is also available as [a YouTube video](https://youtu.be/X9V_w8hsM8E) recorded for the
[Object Storage Working Group](https://handbook.gitlab.com/handbook/company/working-groups/object-storage/).
### Simplify GitLab architecture by shipping MinIO
In the beginning, object storage support was a Premium feature, not
part of our CE distribution. Because of that, we had to support both
local storage and object storage.
With local storage, there is the assumption of a shared storage
between components. This can be achieved by having a single box
installation, without HA, or with a NFS, which
[we no longer recommend](../../../administration/nfs.md).
We have a testing gap on object storage. It also requires Workhorse
and MinIO, which are not present in our pipelines, so too much is
replaced by a mock implementation. Furthermore, the presence of a
shared disk, both in CI and in local development, often hides broken
implementations until we deploy on an HA environment.
One consideration we can take is to investigate shipping MinIO as part of the product. This could reduce the differences
between a cloud and a local installation, standardizing our file
storage on a single technology.
The removal of local disk operations would reduce the complexity of
development as well as mitigate several security attack vectors as
we no longer write user-provided data on the local storage.
It would also reduce human errors as we will always run a local object
storage in development mode and any local file disk access should
raise a red flag during the merge request review.
This effort is described in [this epic](https://gitlab.com/groups/gitlab-org/-/epics/6099).
Before considering any specific third-party technology, the
open source software licensing implications should be considered. As of 23 April 2021, [MinIO is subject to the AGPL v3 license](https://github.com/minio/minio/commit/069432566fcfac1f1053677cc925ddafd750730a). GitLab Legal must be consulted before any decision is taken to ship MinIO as proposed in this blueprint.
### Enable direct upload by default on every upload
Because every group of features requires its own bucket, we don't have
direct upload enabled everywhere. Contributing a new upload requires
coding it in both Ruby on Rails and Go.
Implementing a new feature that does not have a dedicated bucket
requires the developer to also create a merge request in Omnibus
and CNG, as well as coordinate with SREs to configure the new bucket
for our own environments.
This also slows down feature adoptions, because our users need to
reconfigure GitLab and prepare a new bucket in their
infrastructure. It also makes the initial installation more complex
feature after feature.
Implementing a direct upload by default, with a
[consolidated object storage configuration](../../../administration/object_storage.md#configure-a-single-storage-connection-for-all-object-types-consolidated-form)
will reduce the number of merge requests needed to ship a new feature
from four to only one. It will also remove the need for SRE
intervention as the bucket will always be the same.
This will simplify our development and review processes, as well as
the GitLab configuration file. And every user will immediately have
access to new features without infrastructure chores.
### Simplify object storage code
Our implementation is built on top of a 3rd-party framework where
every object storage client is a 3rd-party library. Unfortunately some
of them are unmaintained.
[We have customers who cannot push 5 GB Git LFS objects](https://gitlab.com/gitlab-org/gitlab/-/issues/216442),
but with such a vital feature implemented in 3rd-party libraries we
are slowed down in fixing it, and we also rely on external maintainers
to merge and release fixes.
Before the introduction of direct upload, using the
[CarrierWave](https://github.com/carrierwaveuploader/carrierwave)
library, _"a gem that provides a simple and extremely flexible way to
upload files from Ruby applications."_, was the boring solution.
However this is no longer our use-case, as we upload files from
Workhorse, and we had to [patch CarrierWave's internals](https://gitlab.com/gitlab-org/gitlab/-/issues/285597#note_452696638)
to support direct upload.
A brief proposal covering CarrierWave removal and a new streamlined
internal upload API is described
[in this issue comment](https://gitlab.com/gitlab-org/gitlab/-/issues/213288#note_325358026).
Ideally, we wouldn't need to duplicate object storage clients in Go
and Ruby. By removing CarrierWave, we can make use of the officially
supported native clients when the provider S3 compatibility level is
not sufficient.
## Iterations
In this section we list some possible iterations. This is not
intended to be the final roadmap, but is a conversation started for the
Object Storage Working Group.
1. Create a new catchall bucket and a unified internal API for
authorization without CarrierWave.
1. Ship MinIO with Omnibus (CNG images already include it).
1. Expand GitLab-QA to cover all the supported configurations.
1. Deprecate local disk access.
1. Deprecate configurations with multiple buckets.
1. Implement a bucket-to-bucket migration.
1. Migrate the current CarrierWave uploads to the new implementation.
1. On the next major release: Remove support for local disk access and
configurations with multiple buckets.
### Benefits of the current iteration plan
The current plan is designed to provide tangible benefits from the
first step.
With the introduction of the catchall bucket, every upload currently
not subject to direct upload will get its benefits, and new features
could be shipped with a single merge request.
Shipping MinIO with Omnibus will allow us to default new installations
to object storage, and Omnibus could take care of creating
buckets. This will simplify HA installation outside of Kubernetes.
Then we can migrate each CarrierWave uploader to the new
implementation, up to a point where GitLab installation will only
require one bucket.
## Additional reading materials
- [Uploads development guide](../../../development/uploads/index.md).
- [Speed up the monolith, building a smart reverse proxy in Go](https://archive.fosdem.org/2020/schedule/event/speedupmonolith/): a presentation explaining a bit of workhorse history and the challenge we faced in releasing the first cloud-native installation.
- [Object Storage improvements epic](https://gitlab.com/groups/gitlab-org/-/epics/483).
- We are moving to GraphQL API, but [we do not support direct upload](https://gitlab.com/gitlab-org/gitlab/-/issues/280819).
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -14,14 +14,14 @@ participating-stages: []
## Summary
This design document outlines a system for storing and querying logs which will be a part of GitLab Observability Backend (GOB), together with [tracing](../observability_tracing/index.md) and [metrics](../observability_metrics/index.md).
This design document outlines a system for storing and querying logs which will be a part of GitLab Observability Backend (GOB), together with [tracing](../observability_tracing/index.md) and [metrics](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/observability_metrics/).
At its core the system is leveraging [OpenTelemetry logging](https://opentelemetry.io/docs/specs/otel/logs/) specification for data ingestion and ClickHouse database for storage.
The users will interact with the data through GitLab UI.
The system itself is multi-tenant and offers our users a way to store their application logs, query them, and in future iterations correlate with other observability signals (traces, errors, metrics, etc...).
## Motivation
After [tracing](../observability_tracing/index.md) and [metrics](../observability_metrics/index.md), logging is the last observability signal that we need to support to be able to provide our users with a fully-fledged observability solution.
After [tracing](../observability_tracing/index.md) and [metrics](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/observability_metrics/), logging is the last observability signal that we need to support to be able to provide our users with a fully-fledged observability solution.
One could argue that logging itself is also the most important observability signal because it is so widespread.
It predates metrics and tracing in the history of application observability and is usually implemented as one of the first things during development.
@ -55,7 +55,7 @@ Without logging support, it would be very hard if not impossible to fully unders
## Proposal
The architecture of logs ingestion follows the patterns outlined in the [tracing](../observability_tracing/index.md) and [metrics](../observability_metrics/index.md) proposals:
The architecture of logs ingestion follows the patterns outlined in the [tracing](../observability_tracing/index.md) and [metrics](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/observability_metrics/) proposals:
![System Overview](system_overview.png)
@ -585,7 +585,7 @@ The `()|AND|OR` are nesting operands and can only include other non-nesting oper
We may defer the implementation of the nesting operands for later iterations.
There is implicit AND between the operands at the top level of the query structure.
The query schema is intentionally kept simple compared to [the one used in the metrics proposal](../observability_metrics/index.md#api-structure).
The query schema is intentionally kept simple compared to [the one used in the metrics proposal](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/observability_metrics/#api-structure).
We may add fields like `QueryContext`, `BackendContext`, etc... in later iterations once a need arises.
For now, we keep the schema as simple as possible and just make sure that the API is versioned so that we can change it easily in the future.

View File

@ -1,286 +1,11 @@
---
status: proposed
creation-date: "2022-11-09"
authors: [ "@ankitbhatnagar" ]
coach: "@mappelman"
approvers: [ "@sguyon", "@nicholasklick" ]
owning-stage: "~monitor::observability"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/observability_metrics/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/observability_metrics/).
# GitLab Observability - Metrics
## Summary
Developing a multi-user system to store & query observability data typically formatted in widely accepted, industry-standard formats such as OpenTelemetry using Clickhouse as the underlying storage with support for long-term data retention and aggregation.
## Motivation
From the six pillars of Observability, commonly abbreviated as `TEMPLE` - Traces, Events, Metrics, Profiles, Logs & Errors, Metrics constitute one of the most important of those for modern day systems helping their users gather insights about the operational posture of monitored systems.
Metrics, which are commonly structured as timeseries data, have the following characteristics:
- indexed by their corresponding timestamps;
- continuously expanding in size;
- usually aggregated, down-sampled, and queried in ranges; and
- have very write-intensive requirements.
Within GitLab Observability Backend, we aim to add the support for our customers to ingest and query observability data around their systems & applications, helping them improve the operational health of their systems.
### Goals
With the development of the proposed system, we have the following goals:
- Scalable, low latency & cost-effective monitoring system backed by Clickhouse whose performance has been proven via repeatable benchmarks.
- Support for long-term storage for metrics, ingested via an OpenTelemetry-compliant agent and queried via GitLab-native UI with probable support for metadata and exemplars.
The aforementioned goals can further be broken down into the following four sub-goals:
#### Ingesting data
- For the system to be capable of ingesting large volumes of writes and reads, we aim to ensure that it must be horizontally scalable & provide durability guarantees to ensure no writes are dropped once ingested.
#### Persisting data
- We aim to support ingesting telemetry/data instrumented using OpenTelemetry specifications. For a first iteration, any persistence we design for our dataset will be multi-tenant by default, ensuring we can store observability data for multiple groups/projects within the same storage backend.
#### Reading data
- We aim to support querying data via a GitLab-native UX which would mean using a custom DSL/Query Builder sending API requests to our backend which would then translate them into Clickhouse SQL. From our internal discussions around this, [Product Analytics Visualisation Designer](https://gitlab.com/gitlab-org/gitlab-services/design.gitlab.com/-/analytics/dashboards/visualization-designer) is a good source of inspiration for this.
#### Deleting data
- We aim to support being able to delete any ingested data should such a need arise. This is also in addition to us naturally deleting data when a configured TTL expires and/or respective retention policies are enforced. We must, within our schemas, build a way to delete data by labels OR their content, and also add to our offering the necessary tooling to do so.
### Non-Goals
With the goals established above, we also want to establish what specific things are non-goals with the current proposal. They are:
- With our first iteration here, we do not aim to support querying ingested telemetry via [PromQL](https://prometheus.io/docs/prometheus/latest/querying/basics/) deferring that to as & when such a business need arises. However, users will be able to ingest their metrics using the OpenTelemetry Line Protocol (OTLP), e.g. via the [Prometheus Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/prometheusreceiver/README.md) in case of Prometheus metrics.
## Proposal
We intend to use GitLab Observability Backend (GOB) as a framework for the Metrics implementation so that its lifecycle can be managed via already established components of our backend.
![Architecture](metrics_indexing_at_ingestion.png)
As depicted in the diagram above, an OTEL-collector pipeline, indexer & query service are components that need to be developed as proposed here while the remaining peripheral components either already exist or can be provisioned via existing code in our centralised `scheduler` within GOB.
**On the write path**:
- We expect to receive incoming data via `HTTP/JSON` similar to what we do for our existing services, e.g. errortracking, tracing.
- We aim to heavily deduplicate incoming timeseries by indexing/caching per-series metadata to reduce our storage footprint.
- We aim to ensure avoiding writing a lot of small writes into Clickhouse by batching data before writing it into Clickhouse.
**On the read path**:
![MetricsReadPath](metrics-read-path.png)
- We aim to allow our users to use GitLab itself to read ingested data, which will necessitate building a dedicated `Query Service` on our backend to be able to service API requests originating from GitLab.
- We aim to implement the necessary query validation, sanitization and rate-limiting for any resource consumption to ensure underlying systems remain in good operational health at all times.
### GitLab Observability Tenant
With the recent changes to our backend design especially around deprecating the use of a Grafana-based UX, we have found opportunities to streamline how we provision tenants within our system. This initiative has led to the development of a custom CR - `GitLabObservabilityTenant` intended to model a dedicated set of resources **per top-level GitLab namespace**. From a scalability perspective, this means we deploy a dedicated instance of `Ingress` & `Ingester` per top-level GitLab namespace to make sure we can scale each tenant subject to traffic volumes of its respective groups & projects. It also helps isolate resource consumption across tenants in an otherwise multi-tenant system such as ours.
### Indexing per-series metadata
As an internal part of the `ingester`, we aim to index per-series labels and/or metadata to be able to deduplicate incoming timeseries data and segregate them into metadata and points-data. This helps reduce our storage footprint by an order of magnitude keeping total cost of operation low. This indexed data can also be consumed by the `Query Service` to efficiently compute timeseries for all incoming read requests. This part of our architecture is also described in more detail in [Proposal: Indexing metrics labels for efficiently deduplicating & querying time series data](https://gitlab.com/gitlab-org/opstrace/opstrace/-/issues/2397).
### Query Service
The `Query Service` consists of two primary components - 1. a request parser & 2. a backend-specific querier implementation. On the request path, once a request is received on the designated endpoint(s), it is handled by a handler which is part of the request parser. The parser's responsibility is to unmarshal incoming query payloads, validate the contents and produce a `SearchContext` object which describes how this query/request must be processed. Within a `SearchContext` object is a `QueryContext` attribute which further defines one or more `Query` objects - each a completely independent data query against one of our backends.
![QueryServiceInternals](query-service-internals.png)
#### API structure
For the user-facing API, we intend to add support via HTTP/JSON endpoint(s) with user-queries marshalled as payloads within a request body. For example, to compute the sum of a minutely delta of metric:`apiserver_request_total` over all values of label:`instance`, you'd send a POST request to `https://observe.gitlab.com/query/$GROUP/$PROJECT/metrics` with the following as body:
```json
{
"queries": {
"A": {
"type": "metrics",
"filters": [
{
"key": "__name__",
"value": "apiserver_request_total",
"operator": "eq"
}
],
"aggregation": {
"function": "rate",
"interval": "1m"
},
"groupBy": {
"attribute": [
"instance"
],
"function": "sum"
},
"sortBy": {},
"legend": {}
}
},
"expression": "A"
}
```
#### Query representation as an AST
```plaintext
type SearchContext struct {
UserContext *UserContext `json:"authContext"`
BackendContext *BackendContext `json:"backendContext"`
StartTimestamp int64 `json:"start"`
EndTimestamp int64 `json:"end"`
StepIntervalSeconds int64 `json:"step"`
QueryContext *QueryContext `json:"queryContext"`
CorrelationContext *CorrelationContext `json:"correlationContext"`
Variables map[string]interface{} `json:"variables,omitempty"`
}
```
Generally speaking:
- `SearchContext` defines how a search must be executed.
- It internally contains a `QueryContext` which points to one or more `Query`(s) each targeting a given backend.
- Each `Query` must be parsed & processed independently, supplemented by other common attributes within a `QueryContext` or `SearchContext`.
- `Query` defines an AST-like object which describes how a query must be performed.
- It is intentionally schema-agnostic allowing it to be serialised and passed around our system(s).
- It is also an abstraction that hides details of how we model data internal to our databases from the querying entity.
- Assuming an incoming query can be parsed & validated into a `Query` object, a `Querier` can execute a search/query against it.
- `UserContext` defines if a request has access to the data being searched for.
- It is perhaps a good place to model & enforce request quotas, rate-limiting, etc.
- Populating parts of this attribute depend on the parser reading other global state via the API gateway or Gatekeeper.
- `BackendContext` defines which backend a request must be processed against.
- It helps route requests to an appropriate backend in a multitenant environment.
- For this iteration though, we intend to work with only one backend as is the case with our architecture.
- `CorrelationContext` defines how multiple queries can be correlated to each other to build a cohesive view on the frontend.
- For this iteration though, we intend to keep it empty and only work on adding correlation vectors later.
## Intended target-environments
Keeping inline with our current operational structure, we intend to deploy the metrics offering as a part of GitLab Observability Backend, deployed on the following two target environments:
- kind cluster (for local development)
- GKE cluster (for staging/production environments)
## Production Readiness
### Batching
Considering we'll need to batch data before ingesting large volumes of small writes into Clickhouse, the design must account for app-local persistence to allow it to locally batch incoming data before landing it into Clickhouse in batches of a predetermined size in order to increase performance and allow the table engine to continue to persist data successfully.
We have considered the following alternatives to implement app-local batching:
- In-memory - non durable
- BadgerDB - durable, embedded, performant
- Redis - trivial, external dependency
- Kafka - non-trivial, external dependency but it can augment multiple other use-cases and help other problem domains at GitLab.
**Note**: Similar challenges have also surfaced with the CH interactions `errortracking` - the subsystem has in its current implementation. There have been multiple attempts to solve this problem domain in the past - [this MR](https://gitlab.com/gitlab-org/opstrace/opstrace/-/merge_requests/1660) implemented an in-memory alternative while [this one](https://gitlab.com/gitlab-org/opstrace/opstrace/-/merge_requests/1767) attempted an on-disk alternative.
Any work done in this area of concern would also benefit other subsystems such as errortracking, logging, etc.
### Scalability
We intend to start testing the proposed implementation with 10K metric-points per second to test/establish our initial hypothesis, though ideally, we must design the underlying backend for 1M points ingested per second.
### Benchmarking
We propose the following three dimensions be tested while benchmarking the proposed implementation:
- Data ingest performance (functional)
- Mean query response times (functional)
- Storage requirements (operational)
For understanding performance, we'll need to first compile a list of such queries given the data we ingest for our tests. Clickhouse query logging is super helpful while doing this.
NOTE:
Ideally, we aim to benchmark the system to be able to ingest >1M metric points/sec while consistently serving most queries under <1 sec.
### Past work & references
- [Benchmark ClickHouse for metrics](https://gitlab.com/gitlab-org/opstrace/opstrace/-/issues/1666)
- [Incubation:APM ClickHouse evaluation](https://gitlab.com/gitlab-org/incubation-engineering/apm/apm/-/issues/4)
- [Incubation:APM ClickHouse metrics schema](https://gitlab.com/gitlab-org/incubation-engineering/apm/apm/-/issues/10)
- [Our research around TimescaleDB](https://gitlab.com/gitlab-com/gl-infra/reliability/-/issues/14137)
- [Current Workload on our Thanos-based setup](https://gitlab.com/gitlab-com/gl-infra/reliability/-/issues/15420#current-workload)
- [Scaling-200m-series](https://opstrace.com/blog/scaling-200m-series)
### Cost-estimation
- We aim to make sure the system is cost-effective to our users for ingesting & querying telemetry data. One of the more significant factors affecting underlying costs are how we model & store ingested data which the intended proposal must optimize for by measures such as reducing data redundancy, pruning unused metrics, etc.
- We must consider the usage of multiple storage medium(s), especially:
- Tiered storage
- Object storage
### Tooling
As an overarching outcome here, we aim to build the necessary tooling and/or telemetry around ingested data to enable all user personas to have visibility into high cardinality metrics to help prune or drop unused metrics. It'd be prudent to have usage statistics e.g. per-metric scrape frequencies, to make sure our end-users are not ingesting data at a volume they do not need and/or find useful.
## Future iterations
### Linkage across telemetry pillars, exemplars
We must build the metrics system in a way to be able to cross-reference ingested data with other telemetry pillars, such as traces, logs and errors, so as to provide a more holistic view of all instrumentation a system sends our way.
### Support for user-defined SQL queries to aggregate data and/or generate materialized views
We should allow users of the system to be able to run user-defined, ad-hoc queries similar to how Prometheus recording rules help generate custom metrics from existing ones.
### Support for scalable data ingestion
We believe that should we feel the need to start buffering data local to the ingestion application and/or move away from Clickhouse for persisting data, on-disk WALs would be a good direction to proceed into given their prevalent usage among other monitoring systems.
### Query Service features
- Adding support for compound queries and/or expressions.
- Consolidation of querying capabilities for tracing, logs & errortracking via the query engine.
- Using the query engine to build integrations such as alerting.
- Adding support for other monitoring/querying standards such as PromQL, MetricQL, OpenSearch, etc
- Adding automated insights around metric cardinality & resource consumption.
## Planned roadmap
The following section enlists how we intend to implement the aforementioned proposal around building Metrics support into GitLab Observability Service. Each corresponding document and/or issue contains further details of how each next step is planned to be executed.
### 16.5
- Research & draft design proposal and/or requirements.
- Produce architectural blueprint, open for feedback.
### 16.6
- Develop support for OpenTelemetry-based ingestion.
- Develop support for querying data; begin with an API to list all ingested metrics scoped to a given tenant.
- Develop support for displaying a list of ingested metrics within GitLab UI.
- Release experimental version.
### 16.7
- Develop support for querying data, add metrics search endpoints for supported metric-types.
- Develop our first iteration of the query builder, enable querying backend APIs.
- Develop a metrics details page with the ability to graph data returned via backend APIs.
- Setup testing, ensure repeatable benchmarking/testing can be performed.
- Release Beta version, open for early usage by internal and external customers.
### 16.9 (Gap to allow for user feedback for GA release)
- Develop end-to-end testing, complete necessary production readiness, address feedback from users.
- Release GA version.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 55 KiB

View File

@ -1,296 +1,11 @@
---
status: ongoing
creation-date: "2023-04-05"
authors: [ "@lohrc", "alexpooley" ]
coach: "@ayufan"
approvers: [ "@lohrc" ]
owning-stage: "~devops::data stores"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/).
# Organization
This document is a work in progress and represents the current state of the Organization design.
## Glossary
- Organization: An Organization is the umbrella for one or multiple top-level Groups. Organizations are isolated from each other by default meaning that cross-Namespace features will only work for Namespaces that exist in a single Organization.
- Top-level Group: Top-level Group is the name given to the topmost Group of all other Groups. Groups and Projects are nested underneath the top-level Group.
- Cell: A Cell is a set of infrastructure components that contains multiple Organizations. The infrastructure components provided in a Cell are shared among Organizations, but not shared with other Cells. This isolation of infrastructure components means that Cells are independent from each other.
- User: An Organization has many Users. Joining an Organization makes someone a Member of that Organization.
- Member: Adding a User to a Group or Project within an Organization makes them a Member. Members are always Users, but Users are not necessarily Members of a Group or Project within an Organization. For instance, a User could just have accepted the invitation to join an Organization, but not be a Member of any Group or Project it contains.
- Non-User: A Non-User of an Organization means a User is not part of that specific Organization. Non-Users are able to interact with public Groups and Projects of an Organization, and can raise issues and comment on them.
## Summary
Organizations solve the following problems:
1. Enables grouping of top-level Groups. For example, the following top-level Groups would belong to the Organization `GitLab`:
1. `https://gitlab.com/gitlab-org/`
1. `https://gitlab.com/gitlab-com/`
1. Allows different Organizations to be isolated. Top-level Groups of the same Organization can interact with each other but not with Groups in other Organizations, providing clear boundaries for an Organization, similar to a self-managed instance. Isolation should have a positive impact on performance and availability as things like User dashboards can be scoped to Organizations.
1. Allows integration with Cells. Isolating Organizations makes it possible to allocate and distribute them across different Cells.
1. Removes the constraint of having a single hierarchy. An Organization is a container that could be filled with any collection of hierarchies that make sense.
1. Enables centralized control of user profiles. With an Organization-specific user profile, administrators can control the user's role in a company, enforce user emails, or show a graphical indicator that a user is part of the Organization. An example could be adding a "GitLab employee" stamp on comments.
1. Organizations allows us to better unify the experience on SaaS and self-managed deployments. The Organization admin will have access to instance-equivalent Admin Area settings with most of the configuration controlled at the Organization level. Instance-level workflows like Dashboards can also be shifted to the Organization.
## Motivation
### Goals
The Organization focuses on creating a better experience for Organizations to manage their GitLab experience. By introducing Organizations and [Cells](../cells/index.md) we can improve the reliability, performance and availability of GitLab.com.
- Wider audience: Many instance-level features are admin only. We do not want to lock out users of GitLab.com in that way. We want to make administrative capabilities that previously only existed for self-managed users available to our GitLab.com users as well. This also means we would give users of GitLab.com more independence from GitLab.com admins in the long run. Today, there are actions that self-managed admins can perform that GitLab.com users have to request from GitLab.com admins, for instance banning malicious actors.
- Improved UX: Inconsistencies between the features available at the Project and Group levels create navigation and usability issues. Moreover, there isn't a dedicated place for Organization-level features.
- Aggregation: Data from all Groups and Projects in an Organization can be aggregated.
- An Organization includes settings, data, and features from all Groups and Projects under the same owner (including personal Namespaces).
- Cascading behavior: Organization cascades behavior to all the Projects and Groups that are owned by the same Organization. It can be decided at the Organization level whether a setting can be overridden or not on the levels beneath.
- Minimal burden on customers: The addition of Organizations should not change existing Group and Project paths to minimize the impact of URL changes.
### Non-Goals
Due to urgency of delivering Organizations as a prerequisite for Cells, it is currently not a goal to build Organization functionality on the Namespace framework.
## Proposal
We create Organizations as a new lightweight entity, with just the features and workflows which it requires. We already have much of the functionality present in Groups and Projects, and Groups themselves are essentially already the top-level entity. It is unlikely that we need to add significant features to Organizations outside of some key settings, as top-level Groups can continue to serve this purpose at least on GitLab.com. From an infrastructure perspective, cluster-wide shared data must be both minimal (small in volume) and infrequently written.
```mermaid
graph TD
o[Organization] -. has many .- g
ns[Namespace] --> g[Group]
ns[Namespace] --> pns[ProjectNamespace] -. has one .- p[Project]
ns --> un[UserNamespace]
g -. has many .- p
un -. has many .- p
ns[Namespace] -. has many .- ns[Namespace]
```
All instances would set a default Organization.
### Benefits
- No changes to URL's for Groups moving under an Organization, which makes moving around top-level Groups very easy.
- Low risk rollout strategy, as there is no conversion process for existing top-level Groups.
- The Organization becomes the key for identifying what is part of an Organization, which is on its own table for performance and clarity.
### Drawbacks
- By not basing Organizations on the existing namespace construct, it is not clear how we would avoid duplicating the effort of achieving parity for features like reporting between GitLab.com and self-managed, without doing the work twice. (At instance/organization level for top-level reporting, and at group-level for sub-group level reporting)
- Long term, it may make sense to shift billing from top-level Groups to the Organization level.
## Data Exploration
From an initial [data exploration](https://gitlab.com/gitlab-data/analytics/-/issues/16166#note_1353332877), we retrieved the following information about Users and Organizations:
- Of the users that are connected to an organization, the vast majority (98%) are associated with only a single organization. This means we expect about 2% of Users to navigate across multiple Organizations.
- The majority of Users (78%) are only Members of a single top-level Group.
- 25% of current top-level Groups can be matched to an organization.
- Most of these top-level Groups (83%) are associated with an organization that has more than one top-level Group.
- Of the organizations with more than one top-level Group, the median number of top-level Groups is 3.
- Most top-level Groups that are matched to organizations with more than one top-level Group are assumed to be intended to be combined into a single organization (82%).
- Most top-level Groups that are matched to organizations with more than one top-level Group are using only a single pricing tier (59%).
- Most of the current top-level Groups are set to public visibility (85%).
- Less than 0.5% of top-level Groups share Groups with another top-level Group. However, this means we could potentially break 76,000 existing links between top-level Groups by introducing the Organization.
Based on this analysis we expect to see similar behavior when rolling out Organizations.
## Design and Implementation Details
Cells will be rolled out in three phases: Cells 1.0, Cells 1.5 and Cells 2.0.
The Organization functionality available in each phase is described below.
### Organization MVC
#### Organizations on Cells 1.0 (FY24Q2-FY25Q2)
The Organization MVC for Cells 1.0 will contain the following functionality:
- Instance setting to allow the creation of multiple Organizations. This will be enabled by default on GitLab.com, and disabled for self-managed GitLab.
- Organizations for 1.0 will contain the minimal set of features required to implement isolation. Features that are present in top-level groups for SaaS, such as billing or enterprise users, will remain here.
- The only users who will need to have a role defined and be invited specifically to an Organization are its Owners. Typical end users will be invited at the group level, re-using the existing invitation workflows. The organization can be inferred from either the group or the user.
- Admin overview of Organizations. All created Organizations are listed in the Admin Area section `Organizations`.
- All existing top-level Groups on GitLab.com are part of the `default Organization`.
- Organization Owner. The creation of an Organization appoints that User as the Organization Owner. Once established, the Organization Owner can appoint other Organization Owners.
- Organization Users. A User can only be part of one Organization for Cells 1.0. A new account needs to be created for each Organization a User wants to be part of. Users can only be deleted from an Organization, but not removed.
- Organization creation form. Containing the Organization name, ID, description, and avatar. Organization settings are editable by the Organization Owner.
- Setup flow. New Users are able to create new Organizations. They can also create new top-level Groups in an Organization.
- Private visibility. Initially, Organizations can only be `private`. Private Organizations can only be seen by the Users that are part of the private Organization. They can only contain private Groups and Projects. The only exception to this is the default Organization on the Primary Cell, which is `public`, and contains all currently existing Groups and Projects on GitLab.com.
- Organization settings page with the added ability to remove an Organization. Deletion of the default Organization is prevented.
- Groups. This includes the ability to create, edit, and delete Groups, as well as a Groups overview that can be accessed by the Organization Owner and Users.
- Projects. This includes the ability to create, edit, and delete Projects, as well as a Projects overview that can be accessed by the Organization Owner and Users.
- Personal Namespaces. Users get [a personal Namespace in each Organization](../cells/impacted_features/personal-namespaces.md) they are associated with.
- User Profile. Each [User Profile will be scoped to the Organization](../cells/impacted_features/user-profile.md).
- Isolation. Organizations themselves are not fully isolated, isolation is a result of being on a Secondary Cell. We aim to complete [phase 1 of Organization isolation](https://gitlab.com/groups/gitlab-org/-/epics/11837), with the goal to `define sharding_key` and `desired_sharding_key` rules.
##### Dependencies on other services
- Organizations rely on the Topology Service
- to guarantee the uniqueness of global claims (like usernames, emails, namespaces, SSH public keys, and more) across the cluster.
- provides IDs that are unique across the cluster.
- Organizations rely on the router to route requests to the correct Cell based on, for example, path, token prefix, users, or SSH public keys.
- All Cells have their own application secrets
- Application settings are synchronized across Cells
##### Some affected features
- All forms of authentication. As the Topology Service cannot classify the request with an unauthenticated user, the process is as follows:
1. Cell #1 displays the login form.
1. Cell #1 identifies the user based on the request data.
1. Cell #1 looks up the user's associated Cell from the Topology Service.
1. Cell #1 sets a cookie indicating the associated Cell and redirects the user.
1. The router routes the request to the correct Cell based on the cookie.
1. Cell X authenticates the user
- Audit events are not available as there is an ongoing discussion related to a ClickHouse migration.
- Billing stays at top-level Group.
- Enterprise Users or verified domains are not required to be used with Organizations.
- Public visibility of Groups and Projects, or unauthenticated requests are not allowed apart from Cell #1.
##### Open questions
- To minimize the number of cluster-wide resources, consider refactoring [Standalone resources](../../../api/api_resources.md#standalone-resources) to scope them to an Organization, Group, or Project.
- Consider refactoring global endpoints (e.g. `/jwt/auth`) to be scoped to an Organization, Group, or Project, unless they are supporting cluster-wide resources.
#### Organizations on Cells 1.5 (FY25Q3-FY25Q3)
Organizations in the context of Cells 1.5 will contain the following functionality:
- Organization Users can be part of multiple Organizations using one account. Users are able to navigate between their Organizations using an Organization switcher. Non-Enterprise Users can be removed from or leave an Organization.
- Organizations are fully isolated. We aim to complete [phase 2 of Organization isolation](https://gitlab.com/groups/gitlab-org/-/epics/11838), with the goal to implement isolation constraints.
#### Organizations on Cells 2.0 (FY25Q4-FY26Q1)
Organizations in the context of Cells 2.0 will contain the following functionality:
- Public visibility. Organizations can now also be `public`, containing both private and public Groups and Projects.
- [Users can transfer existing top-level Groups into Organizations](https://gitlab.com/groups/gitlab-org/-/epics/11711).
### Organization Access
See [Organization Users](organization-users.md).
### Roles and Permissions
Organizations will have an Owner role. Compared to Users, they can perform the following actions:
| Action | Owner | User |
| ------ | ------ | ----- |
| View Organization settings | ✓ | |
| Edit Organization settings | ✓ | |
| Delete Organization | ✓ | |
| Remove Users | ✓ | |
| View Organization front page | ✓ | ✓ |
| View Groups overview | ✓ | ✓ (1) |
| View Projects overview | ✓ | ✓ (1) |
| View Users overview | ✓ | ✓ (2) |
| View Organization activity page | ✓ | ✓ (1) |
| Transfer top-level Group into Organization if Owner of both | ✓ | |
(1) Members can only see what they have access to.
(2) Users can only see Users from Groups and Projects they have access to.
[Roles](../../../user/permissions.md) at the Group and Project level remain as they currently are.
#### Relationship between Organization Owner and Instance Admin
Users with the (Instance) Admin role can currently [administer a self-managed GitLab instance](../../../administration/index.md).
As functionality is moved to the Organization level, Organization Owners will be able to access more features that are currently only accessible to Admins.
On our SaaS platform, this helps us in empowering enterprises to manage their own Organization more efficiently without depending on the Instance Admin, which is currently a GitLab team member.
On SaaS, we expect the Instance Admin and the Organization Owner to be different users.
Self-managed instances are generally scoped to a single organization, so in this case it is possible that both roles are fulfilled by the same person.
There are situations that might require intervention by an Instance Admin, for instance when Users are abusing the system.
When that is the case, actions taken by the Instance Admin overrule actions of the Organization Owner.
For instance, the Instance Admin can ban or delete a User on behalf of the Organization Owner.
### Routing
Today only Users, Projects, Namespaces and container images are considered routable entities which require global uniqueness on `https://gitlab.com/<path>/-/`.
Initially, Organization routes will be [unscoped](../../../development/routing.md).
Organizations will follow the path `https://gitlab.com/-/organizations/org-name/` as one of the design goals is that the addition of Organizations should not change existing Group and Project paths.
## Impact of the Organization on Other Domains
We want a minimal amount of infrequently written tables in the shared database.
If we have high write volume or large amounts of data in the shared database then this can become a single bottleneck for scaling and we lose the horizontal scalability objective of Cells.
With isolation being one of the main requirements to make Cells work, this means that existing features will mostly be scoped to an Organization rather than work across Organizations.
One exception to this are Users, which are stored in the cluster-wide shared database.
For a deeper exploration of the impact on select features, see the [list of features impacted by Cells](../cells/index.md#impacted-features).
### Alignment between Organization and Fulfillment
Fulfillment enhancements for Organizations will happen in a different timeline to the [Cells](../cells/index.md) project and should not be seen as blockers to any Cells timelines.
For Cells 1.0, Billing remains at the top-level Group. Said otherwise, Billing will not occur at the Organization level. The guidance for Cells 1.0 is for GitLab.com SaaS customers to use a single top-level Group to keep Billing consolidated.
We are currently [evaluating future architecture designs](https://gitlab.com/gitlab-org/gitlab/-/issues/443708) (e.g. Zuora Billing Accounts being aligned to Organizations) but have yet to determine the North star direction and how/if it aligns to the Cells iterations.
### Open-source Contributions in Organizations
Several aspects of the current open-source workflow will be impacted by the introduction of Organizations.
We are conducting deeper research around this specific problem in [issue 420804](https://gitlab.com/gitlab-org/gitlab/-/issues/420804).
## Post-MVC Iterations
After the initial rollout of Organizations, the following functionality will be added to address customer needs relating to their implementation of GitLab:
1. [Organizations can invite Users](https://gitlab.com/gitlab-org/gitlab/-/issues/420166).
1. Complete [phase 3 of Organization isolation](https://gitlab.com/groups/gitlab-org/-/epics/11839), with the goal to allow customers to move existing namespaces out of the default Organization into a new Organization.
1. Internal visibility will be made available on Organizations that are part of GitLab.com.
1. Restrict inviting Users outside of the Organization.
1. Enterprise Users will be made available at the Organization level.
1. Organizations are able to ban Users.
1. Projects can be created from the Organization-level Projects overview.
1. Groups can be created from the Organization-level Groups overview.
1. Move billing from top-level Group to Organization.
1. Audit events at the Organization level.
1. Set merge request approval rules at the Organization level and cascade to all Groups and Projects.
1. Security policies at the Organization level.
1. Vulnerability Report and Dependency List at the Organization level.
1. Cascading Organization setting to enforce security scans.
1. Merge request approval policies at the Organization level.
1. Compliance frameworks.
1. [Support the agent for Kubernetes sharing at the Organization level](https://gitlab.com/gitlab-org/gitlab/-/issues/382731).
## Organization Rollout
We propose the following steps to successfully roll out Organizations:
- Phase 1: Rollout
- Organizations will be rolled out using the concept of a `default Organization`. All existing top-level groups on GitLab.com are already part of this `default Organization`. The Organization UI is feature flagged and can be enabled for a specific set of users initially, and the global user pool at the end of this phase. This way, users will already become familiar with the concept of an Organization and the Organization UI. No features would be impacted by enabling the `default Organization`. See issue [#418225](https://gitlab.com/gitlab-org/gitlab/-/issues/418225) for more details.
- Phase 2: Temporary onboarding changes
- New customers can create new Organizations from scratch. Top-level Groups cannot be migrated yet into a new Organization, so all content must be newly created in an Organization.
- Phase 3: Migration of existing customers
- GitLab, the organization, will be one of the first entities to migrate into a separate Organization. We move all top-level Groups that belong to GitLab into the new GitLab Organization, including the `gitlab-org` and `gitlab-com` top-level Groups. See issue [#418228](https://gitlab.com/gitlab-org/gitlab/-/issues/418228) for more details.
- Once top-level Group transfer from the default Organization to another Organization becomes available, existing customers can create their own Organization and migrate their top-level Groups into it. Creation of an Organization remains optional.
- Phase 4: Permanent onboarding changes
- All new customers will only have the option to start their journey by creating a new Organization.
- Phase 5: Targeted efforts
- Organizations are promoted, e.g. via a banner message, targeted conversations with large customers via the CSMs. Creating a separate Organization will remain a voluntary action.
- We increase the value proposition of the Organization, for instance by moving billing to the Organization level to provide incentives for more customers to move to a separate Organization. Adoption will be monitored.
A force-option will only be considered if we do not achieve the load distribution we are aiming for with Cells.
## Alternative Solutions
An alternative approach to building Organizations is to convert top-level Groups into Organizations. The main advantage of this approach is that features could be built on top of the Namespace framework and therewith leverage functionality that is already available at the Group level. We would avoid building the same feature multiple times. However, Organizations have been identified as a critical driver of Cells. Due to the urgency of delivering Cells, we decided to opt for the quickest and most straightforward solution to deliver an Organization, which is the lightweight design described above. More details on comparing the two Organization proposals can be found [here](https://gitlab.com/gitlab-org/tenant-scale-group/group-tasks/-/issues/56).
## Frequently Asked Questions
See [Organization: Frequently Asked Questions](organization-faq.md).
## Decision Log
- 2023-05-10: [Billing is not part of the Organization MVC](https://gitlab.com/gitlab-org/gitlab/-/issues/406614#note_1384055365)
- 2023-05-15: [Organization route setup](https://gitlab.com/gitlab-org/gitlab/-/issues/409913#note_1388679761)
## Links
- [Organization epic](https://gitlab.com/groups/gitlab-org/-/epics/9265)
- [Organization MVC design](https://gitlab.com/groups/gitlab-org/-/epics/10068)
- [Enterprise Users](../../../user/enterprise_user/index.md)
- [Cells blueprint](../cells/index.md)
- [Cells epic](https://gitlab.com/groups/gitlab-org/-/epics/7582)
- [Namespaces](../../../user/namespace/index.md)
- [Organization Isolation](isolation.md)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,152 +1,11 @@
---
status: ongoing
creation-date: "2023-10-11"
authors: [ "@DylanGriffith" ]
coach:
approvers: [ "@lohrc", "@alexpooley" ]
owning-stage: "~devops::data stores"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/isolation/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/isolation/).
# Organization Isolation
This blueprint details requirements for Organizations to be isolated.
Watch a [video introduction](https://www.youtube.com/watch?v=kDinjEHVVi0) that summarizes what Organization isolation is and why we need it.
Read more about what an Organization is in [Organization](index.md).
## What?
<img src="diagrams/organization-isolation.drawio.png" width="800" alt="">
All Cell-local data and functionality in GitLab (all data except the few
things that need to exist on all Cells in the cluster) must be isolated.
Isolation means that data or features can never cross Organization boundaries.
Many features in GitLab can link data together.
A few examples of things that would be disallowed by Organization Isolation are:
1. [Related issues](../../../user/project/issues/related_issues.md): Users would not be able to take an issue in one Project in `Organization A` and relate that issue to another issue in a Project in `Organization B`.
1. [Share a project/group with a group](../../../user/group/manage.md#share-a-group-with-another-group): Users would not be allowed to share a Group or Project in `Organization A` with another Group or Project in `Organization B`.
1. [System notes](../../../user/project/system_notes.md): Users would not get a system note added to an issue in `Organization A` if it is mentioned in a comment on an issue in `Organization B`.
## Why?
<img src="diagrams/organization-isolation-broken.drawio.png" width="800" alt="">
[GitLab Cells](../cells/index.md) depend on using the Organization as the sharding key, which will allow us to shard data between different Cells.
Initially, when we start rolling out Organizations, we will be working with a single Cell `Cell 1`.
`Cell 1` is our current GitLab.com deployment.
Newly created Organizations will be created on `Cell 1`.
Once Cells are ready, we will deploy `Cell 2` and begin migrating Organizations from `Cell 1` to `Cell 2`.
Migrating workloads off will be critical to allowing us to rebalance our data across a fleet of servers and eventually run much smaller GitLab instances (and databases).
If today we allowed users to create Organizations that linked to data in other Organizations, these links would suddenly break when an Organization is moved to a different Cell (because it won't know about the other Organization).
For this reason we need to ensure from the very beginning of rolling out Organizations to customers that it is impossible to create any links that cross the Organization boundary, even when Organizations are still on the same Cell.
If we don't, we will create even more mixed up related data that cannot be migrated between Cells.
Not fulfilling the requirement of isolation means we risk creating a new top-level data wrapper (Organization) that cannot actually be used as a sharding key.
The Cells project initially started with the assumption that we'd be able to shard by top-level Groups.
We quickly learned that there were no constraints in the application that isolated top-level Groups.
Many users (including ourselves) had created multiple top-level Groups and linked data across them.
So we decided that the only way to create a viable sharding key was to create another wrapper around top-level Groups.
Organizations were something our customers already wanted to gain more administrative capabilities as available in self-managed, and aggregate data across multiple top-level Groups, so this became a logical choice.
Again, this leads us to realize that we cannot allow multiple Organizations to get mixed together the same way top-level Groups were, otherwise we will end up back where we started.
## How?
Multiple POCs have been implemented to demonstrate how we will provide robust developer facing and customer facing constraints in the GitLab application and database that enforce the described isolation constraint.
These are:
1. [Enforce Organization Isolation based on `project_id` and `namespace_id` column on every table](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/133576)
1. [Enforce Organization Isolation based on `organization_id` on every table](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/129889)
1. [Validate if a top-level group is isolated to be migrated to an Organization](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/131968)
The major constraint these POCs were trying to overcome was that there is no standard way in the GitLab application or database to even determine what Organization (or Project or namespace) a piece of data belongs to.
This means that the first step is to implement a standard way to efficiently find the parent Organization for any model or row in the database.
The proposed solution is ensuring that every single table that exists in the `gitlab_main_cell`, `gitlab_ci` and `gitlab_pm` (Cell-local) databases must include a valid sharding key that is a reference to `projects`, `namespaces` or `organizations`.
At first we considered enforcing everything to have an `organization_id`, but we determined that this would be too expensive to update for customers that need to migrate large Groups out of the default Organization.
The added benefit is that more than half of our tables already have one of these columns.
Additionally, if we can't consistently attribute data to a top-level Group, then we won't be able to validate if a top-level Group is safe to be moved to a new Organization.
Once we have consistent sharding keys we can use them to validate all data on insert are not crossing any Organization boundaries.
We can also use these sharding keys to help us decide whether:
- Existing namespaces in the default Organization can be moved safely to a new Organization, because the namespace is already isolated.
- The namespace owner would need to remove some links before migrating to a new Organization.
- A set of namespaces is isolated as a group and could be moved together in bulk to a new Organization.
## Detailed steps
1. Implement developer facing documentation explaining the requirement to add these sharding keys and how they should choose.
1. Add a way to declare a sharding key in `db/docs` and automatically populate it for all tables that already have a sharding key.
1. Implement automation in our CI pipelines and/or DB migrations that makes it impossible to create new tables without a sharding key.
1. Implement a way for people to declare a desired sharding key in `db/docs` as
well as a path to the parent table from which it is migrated. Will only be
needed temporarily for tables that don't have a sharding key
1. Attempt to populate as many "desired sharding key" as possible in an
automated way and delegate the MRs to other teams
1. Fan out issues to other teams to manually populate the remaining "desired
sharding key"
1. Start manually creating then automating the creation of migrations for
tables to populate sharding keys from "desired sharding key"
1. Once all tables have sharding keys or "desired sharding key", we ship an
evolved version of the
[POC](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/133576), which
will enforce that newly inserted data cannot cross Organization boundaries.
This may need to be expanded to more than just foreign keys, and should also
include loose foreign keys and possibly any relationships described in
models. It can temporarily depend on inferring, at runtime, the sharding key
from the "desired sharding key" which will be a less performant option while
we backfill the sharding keys to all tables but allow us to unblock
implementing the isolation rules and user experience of isolation.
1. Finish migration of ~300 tables that are missing a sharding key:
1. The Tenant Scale team migrates the first few tables.
1. We build a dashboard showing our progress and continue to create
automated MRs for the sharding keys that can be automatically inferred
and automate creating issues for all the sharding keys that can't be
automatically inferred
1. Validate that all existing sharding key columns on all Cell-local tables can reliably be assumed to be the sharding key. This requires assigning issues to teams to confirm that these columns aren't used for some other purpose that would actually not be suitable.
1. We allow customers to create new Organizations without the option to migrate namespaces into them. All namespaces need to be newly created in their new Organization.
1. Implement new functionality in GitLab similar to the [POC](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/131968), which allows a namespace owner to see if their namespace is fully isolated.
1. Implement functionality that allows namespace owners to migrate an existing namespace from one Organization to another. Most likely this will be existing customers that want to migrate their namespace out of the default Organization into a newly created Organization. Only isolated namespaces as implemented in the previous step will be allowed to move.
1. Expand functionality to validate if a namespace is isolated, so that users can select multiple namespaces they own and validate that the selected group of namespaces is isolated. Links between the selected namespaces would stay intact.
1. Implement functionality that allows namespace owners to migrate multiple existing namespaces from one Organization to another. Only isolated namespaces as implemented in the previous step will be allowed to move.
1. We build better tooling to help namespace owners with cleaning up unwanted links outside of their namespace to allow more customers to migrate to a new Organization. This step would be dependent on the amount of existing customers that actually have links to clean up.
The implementation of this effort will be tracked in [#11670](https://gitlab.com/groups/gitlab-org/-/epics/11670).
## Alternatives considered
### Add any data that need to cross Organizations to cluster-wide tables
We plan on having some data at the cluster level in our Cells architecture (for example
Users), so it might stand to reason that we can make any data cluster-wide
that might need to cross Organization boundaries and this would solve the problem.
This could be an option for a limited set of features and may turn out to be
necessary for some critical workflows.
However, this should not become the default option, because it will ultimately lead to the Cells architecture not achieving the horizontal scaling goals.
Features like [sharing a group with a group](../../../user/group/manage.md#share-a-group-with-another-group) are very tightly connected to some of the worst performing functionality in our
application with regard to scalability.
We are hoping that by splitting up our databases in Cells we will be able to unlock more scaling headroom and reduce the problems associated with supporting these features.
### Do nothing and treat these anomalies as an acceptable edge case
This idea hasn't been explored deeply but is rejected on the basis that these
anomalies will appear as data loss while moving customer data between Cells.
Data loss is a very serious kind of bug, especially when customers are not opting into being moved between servers.
### Solve these problems feature by feature
This could be done, for example, by implementing an application rule that
prevents users from adding an issue link between Projects on different Organizations.
We would need to find all such features by asking teams, and
they would need to fix them all as a special case business rule.
This may be a viable, less robust option, but it does not give us a lot of confidence in our system.
Without a robust way to ensure that all Organization data is isolated, we would have to trust that each feature we implement has been manually checked.
This creates a real risk that we miss something, and again we would end up with customer data loss.
Another challenge here is that if we are not confident in our isolation constraints, then we may end up attributing various unrelated bugs to possible data loss.
As such it could become a rabbit hole to debug all kinds of unrelated bugs.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,46 +1,11 @@
---
stage: enablement
group: Tenant Scale
description: 'Organization: FAQ'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/organization-faq/'
remove_date: '2025-07-08'
---
# Organization: Frequently Asked Questions
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/organization-faq/).
## Do we expect large SaaS customers to be licensed at the Organization level?
For example, should they have the ability to include multiple top-level Groups under one license?
Yes, this has been discussed with Fulfillment and is part of the post MVC roadmap for Organizations.
See also [Alignment between Organization and Fulfillment](index.md#alignment-between-organization-and-fulfillment).
## Do we expect to be able to configure alternate GitLab domain names for Organizations (such as `customer.gitlab.com`)?
There is no plan at this point to allow configuration of alternate GitLab domain names.
We have previously heard that sub-domains bring administrative challenges.
GitLab Dedicated will be a much better fit for that at this moment.
## Do we expect Organizations to have visibility settings (public/private) of their own?
Will visibility remain a property of top-level Groups?
Organizations are public for now but will have their own independent visibility settings.
See also [When can Users see an Organization?](organization-users.md#when-can-users-see-an-organization).
## What would the migration of a feature from the top-level Group to the Organization look like?
One of our requirements is that everything needs to be mapped to an Organization.
Only that way will we achieve the isolation we are striving for.
For SaaS, all existing Groups and Projects are already mapped to `Org_ID = 1` in the backend.
`Org_ID = 1` corresponds to the `Default Organization`, meaning that upon Organization rollout, all existing Groups and Projects will be part of the default Organization and will be seen in that context.
Because we want to achieve as much parity as possible between SaaS and self-managed, self-managed customers would also get everything mapped to the default Organization.
The difference between SaaS and self-managed is that for SaaS we expect users to create many Organizations, and for self-managed we do not.
We will control this via a `can_create_organization` application setting that will be enabled by default on SaaS and disabled by default for self-managed users.
Consider whether your feature can support cascading, or in other words, whether the functionality is capable of existing on multiple nested levels without causing conflicts.
If your feature can support cascading:
- Today, you should add your feature to the top-level Group for both SaaS and self-managed, and to the instance for self-managed.
- Once the Organization is ready, you would migrate your instance level feature over the Organization object at which point it would be available at both the Organization and top-level Group for all customers.
If your feature cannot support cascading:
- Today, you should add your feature to the top-level Group for SaaS only, and to the instance for self-managed. The top-level Group functionality would be hidden for self-managed users.
- Once the Organization is ready, you would migrate instance functionality to the Organization for self-managed customers, but hide it at the Organization level for SaaS. On SaaS, users would continue to manage their functionality at the top-level Group, and not at the Organization level. At some point in the future when 99% of paying customers have moved to their own Organization, you could clean things up by introducing a breaking change and unhiding it from the Organization level for all customers (SaaS and self-managed) and removing the functionality from the top-level Group.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,295 +1,11 @@
---
stage: enablement
group: Tenant Scale
description: 'Organization Users'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/organizations-users/'
remove_date: '2025-07-08'
---
# Organization Users
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/organization/organizations-users/).
Users can become an Organization member in the following way:
- Organization Owners create an account on behalf of a user, and then share it with the user.
Organization members can get access to Groups and Projects in an Organization as:
- A Group Member: this grants access to the Group and all its Projects,
regardless of their visibility.
- A Project Member: this grants access to the Project, and limited access to
parent Groups, regardless of their visibility.
- A Non-Member: this grants access to public and internal Groups and Projects of
that Organization. To access a private Group or Project in an Organization, a
user must become a member. Internal visibility will not be available for
Organization in Cells 1.0.
Organization members can be managed in the following ways:
- As [Enterprise Users](../../../user/enterprise_user/index.md), managed by the
Organization. This includes control over their User account and the ability to
block the User. In the context of Cells 1.0, Organization members will
essentially function like Enterprise Users.
- As Non-Enterprise Users, managed by the default Organization. Non-Enterprise Users
can be removed from an Organization, but the User keeps ownership of
their User account. This will only be considered post Cells 1.0.
Enterprise Users are only available to Organizations with a Premium or Ultimate
subscription. Organizations on the free tier will only be able to host
Non-Enterprise Users.
## How do Users join an Organization?
Users are visible across all Organizations. This allows Users to move between
Organizations. Users can join an Organization by:
1. Being invited by an Organization Owner. Because Organizations are private on
Cells 1.0, only the Organization Owner can add new Users to an Organization
   by inviting them to create an account.
1. Becoming a Member of a Namespace (Group, Subgroup, or Project) contained
within an Organization. A User can become a Member of a Namespace by:
- Being invited by username
- Being invited by email address
- Requesting access. This requires visibility of the Organization and
Namespace and must be accepted by the owner of the Namespace. Access cannot
be requested to private Groups or Projects.
1. Becoming an Enterprise User of an Organization. Bringing Enterprise Users to
the Organization level is planned post MVC. For the Organization MVC
Enterprise Users will remain at the top-level Group.
The creator of an Organization automatically becomes the Organization Owner. It
is not necessary to become a User of a specific Organization to comment on or
create public issues, for example. All existing Users can create and comment on
all public issues.
## How do Users sign in to an Organization?
TBD
## When can Users see an Organization?
For Cells 1.0, an Organization can only be private. Private Organizations can
only be seen by their Organization members. They can only contain private Groups
and Projects.
For Cells 1.5, Organizations can also be public. Public Organizations can be
seen by everyone. They can contain public and private Groups and Projects.
In the future, Organizations will get an additional internal visibility setting
for Groups and Projects. This will allow us to introduce internal Organizations
that can only be seen by the Users it contains. This would mean that only Users
that are part of the Organization will see:
- The Organization front page, instead of a 404 when navigating to the
Organization URL
- Name of the Organization
- Description of the Organization
- Organization pages, such as the Activity page, Groups, Projects, and Users
overview. Content of these pages will be determined by each User's access to
specific Groups and Projects. For instance, private Projects would only be
seen by the members of this Project in the Project overview.
- Internal Groups and Projects
As an end goal, we plan to offer the following scenarios:
| Organization visibility | Group/Project visibility | Who sees the Organization? | Who sees Groups/Projects? |
| ------ | ------ | ------ | ------ |
| public | public | Everyone | Everyone |
| public | internal | Everyone | Organization members |
| public | private | Everyone | Group/Project members |
| internal | internal | Organization members | Organization members |
| internal | private | Organization members | Group/Project members |
| private | private | Organization members | Group/Project members |
## What can Users see in an Organization?
Users can see the things that they have access to in an Organization. For
instance, an Organization member would be able to access only the private Groups
and Projects that they are a member of, but could see all public Groups and
Projects. Actionable items such as issues, merge requests and the to-do list are
seen in the context of the Organization. This means that a User might see
10 merge requests they created in `Organization A`, and 7 in `Organization B`, when
in total they have created 17 merge requests across both Organizations.
## What is a Billable Member?
How Billable Members are defined differs between GitLab's two main offerings:
- Self-managed (SM): [Billable Members are Users who consume seats against the SM License](../../../subscriptions/self_managed/index.md#subscription-seats).
Custom roles elevated above the Guest role are consuming seats.
- GitLab.com (SaaS): [Billable Members are Users who are Members of a Namespace (Group or Project) that consume a seat against the SaaS subscription for the top-level Group](../../../subscriptions/gitlab_com/index.md#how-seat-usage-is-determined).
Currently, [Users with Minimal Access](../../../user/permissions.md#users-with-minimal-access)
and Users without a Group count towards a licensed seat, but [that's changing](https://gitlab.com/gitlab-org/gitlab/-/issues/330663#note_1133361094).
These differences and how they are calculated and displayed often cause
confusion. For both SM and SaaS, we evaluate whether a User consumes a seat
against the same core rule set:
1. They are active users
1. They are not bot users
1. For the Ultimate tier, they are not a Guest
For (1) this is determined differently per offering, in terms of both what
classifies as active and also due to the underlying model that we refer to
(User vs Member). To help demonstrate the various associations used in GitLab relating
to Billable Members, here is a relationship diagram:
```mermaid
graph TD
A[Group] <-.type of.- B[Namespace]
C[Project] -.belongs to.-> A
E[GroupMember] <-.type of.- D[Member]
G[User] -.has many.-> F
F -.belongs to.-> C
F[ProjectMember] <-.type of.- D
G -.has many.-> E -.belongs to.-> A
GGL[GroupGroupLink] -.belongs to.->A
PGL[ProjectGroupLink] -.belongs to.->A
PGL -.belongs to.->C
```
GroupGroupLink is the join table between two Group records, indicating that one
Group has invited the other. ProjectGroupLink is the join table between a Group
and a Project, indicating the Group has been invited to the Project.
SaaS has some additional complexity when it comes to the relationships that
determine whether or not a User is considered a Billable Member, particularly
relating to Group/Project membership that can often lead to confusion. An
example of that are Members of a Group that have been invited into another Group
or Project and thereby become billable.
There are two charts as the flow is different for each:
- [SaaS chart](#saas-chart)
- [SM chart](#sm-chart)
(These charts are placed at the bottom of the page, due to length.)
## How can Users switch between different Organizations?
For Organizations in the context of Cells 1.0, Users will only be able to be
part of a single Organization. If a user wants to be part of multiple
Organizations, they have to join every additional Organization with a new user
account.
Later, in the context of Cells 1.5, Users can utilize a
[context switcher](https://gitlab.com/gitlab-org/gitlab/-/issues/411637). This feature
allows easy navigation and access to different Organizations' content and
settings. By clicking on the context switcher and selecting a specific
Organization from the provided list, Users can seamlessly transition their view
and permissions, enabling them to interact with the resources and
functionalities of the chosen Organization.
## What happens when a User is deleted?
We've identified three different scenarios where a User can be removed from an Organization:
1. Removal: The User is removed from the organization_users table. This is
similar to the User leaving a company, but the User can join the Organization
again after access approval.
1. Banning: The User is banned. This can happen in case of misconduct but the
User cannot be added again to the Organization until they are unbanned. In
this case, we keep the organization_users entry and change the permission to
none.
1. Deleting: The User is deleted. We assign everything the User has authored to
the Ghost User and delete the entry from the organization_users table.
As part of the Organization MVC, Organization Owners can remove Organization
members. This means that the User's membership entries are deleted from all
Groups and Projects that are contained within the Organization. In addition, the
User entry is removed from the `organization_users` table.
Actions such as banning and deleting a User will be added to the Organization at a later point.
## Organization Non-Users
Non-Users are external to the Organization and can only access the public
resources of an Organization, such as public Projects.
## SaaS chart
```mermaid
flowchart TD
root[SaaS User Billable Flow]-->UserActive{`User.state` is active?}
UserActive -.Yes.-> IsMember[Check if User is a Member <br/>of the Root Group hierarchy]
UserActive -.No.-> NotBillable[Not Billable]
IsMember --> DM
Member -.Yes.->IsBot{Is User a Bot? <br/>See note 2}
NotMember -.No.->NotBillable
IsBot -.Yes.->NotBillable
IsBot -.No.->Active[Active `Member` state?]
Active --> MemberStateIsActive
ActiveMember-.Yes.-> MinAccess{Member has <br/>Minimal Access level?}
NotActive-.No.-> NotBillable
MinAccess -.Yes.-> NotBillable
MinAccess -.No.-> HighestRoleGuest?{Member Highest Role<br/> is Guest?}
HighestRoleGuest? -.Yes.-> LicenseType{Ultimate License?}
LicenseType -.No.-> Billable
HighestRoleGuest? -.No.-> Billable
LicenseType -.Yes.-> NotBillable
subgraph in_hierarchy[User Is a Member of the Root Group hierarchy]
DM{Direct Member of the Root Group?} -.No.->DMSub{Direct Member of a sub-group?}
DM -.Yes.->Member[Is a Member]
DMSub -.Yes.->Member
DMSub -.No.->DMProject{Member of a Project in the hierarchy?}
DMProject -.Yes.->Member
DMProject -.No.-> InvitedMember{Member of an invited Group?}
InvitedMember -.Yes.-> Member
InvitedMember -.No.-> NotMember[Not a Member<br/>See note 1]
end
subgraph activesub[Is Member Active?]
MemberStateIsActive{`Member.state` is active?} -.Yes.-> RequestedInvite{User Requested Access?<br/>See note 3}
MemberStateIsActive -.No.-> NotActive
RequestedInvite -.Yes.-> AcceptedRequest{Request was accepted?}
AcceptedRequest -.No.-> NotActive[Not an active member]
AcceptedRequest -.Yes.-> ActiveMember[Active Member]
RequestedInvite -.No.-> Invited{User was Invited?<br/>See note 4}
Invited -.No.-> NotActive
Invited -.Yes.-> AcceptedInvite{User accepted invite?}
AcceptedInvite -.No.->NotActive
AcceptedInvite -.Yes.->ActiveMember
end
```
## SM chart
```mermaid
flowchart TD
user[Is the User Billable?]
user -->UserState{Active `User` State?}
UserState -.Yes.-> H{Human?}
UserState -.No.-> NotBillable
H -.No.-> PB{Project Bot?}
PB -.No.-> SU{Service User?}
SU -.No.-> NotBillable[Not Billable]
SU -.Yes.-> InGroupOrProject
PB -.Yes.-> InGroupOrProject
H -.Yes.-> InGroupOrProject{Member of a Group or Project?}
InGroupOrProject -.No.-> LicenseType
InGroupOrProject -.Yes.-> HighestRoleGuest?{Highest Role is Guest?}
HighestRoleGuest? -.Yes.-> LicenseType{Ultimate License?}
LicenseType -.No.-> Billable
HighestRoleGuest? -.No.-> Billable
LicenseType -.Yes.-> NotBillable
```
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,188 +1,11 @@
---
status: proposed
creation-date: "2023-03-10"
authors: [ "@jessieay", "@jarka" ]
coach: "@grzesiek"
approvers: [ "@hsutor", "@adil.farrukh" ]
owning-stage: "~devops::manage"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/permissions/'
remove_date: '2025-07-08'
---
# Permissions Changes required to enable Custom Roles
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/permissions/).
## Summary
Today, the GitLab permissions system is a backend implementation detail of our
static [role-based access control system](../../../user/permissions.md#roles).
In %15.9, we [announced](https://about.gitlab.com/blog/2023/03/08/expanding-guest-capabilities-in-gitlab-ultimate/)
a customer MVC of the custom roles feature. The MVC introduced the ability to
add one single permission (`read_code`) to a custom role based on a default
GitLab Guest role. The MVC was [implemented](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/106256) by
taking an existing permission from the GitLab authorization framework and
enabling it if a custom role has it set to `true`.
Post-MVC, the Auth group has started work on making more permissions
customizable, with the ultimate goal of making *all* permissions customizable.
As we've started planning this work, there are two large challenges:
1. The GitLab permissions system is not a stable, backwards-compatible API.
But [the custom roles feature is built on top of the current permissions system](https://gitlab.com/gitlab-org/gitlab/-/issues/352891#note_993031741).
Which means that custom roles relies on permissions being a stable,
backwards-compatible API. So we must change how we approach our permissions
system if we plan to continue on with the current architecture.
1. Refactoring our permissions system is difficult due to the sheer number of
permissions (over 700), duplication of permissions checks throughout the
codebase, and the importance of permissions for security (cost
of errors is very high).
This blueprint goes into further detail on each of these challenges and suggests
a path for addressing them.
## What are custom roles?
Our permissions system supports five default roles (Guest, Reporter, Developer, Maintainer, Owner) that users are assigned per Project or Group; these roles can't be modified. Custom roles should solve the problem that our current system is static.
With custom roles, customers can define their own roles and give them permissions they want to. For every role they create they can assign set of permissions. For example, a newly created role "Engineer" could have `read code` and `admin merge requests` enabled but abilities such as `admin issues` disabled.
## Motivation
This plan is important to define because the [custom roles project](https://gitlab.com/groups/gitlab-org/-/epics/4035)'s
current architecture is built off of our current permissions system, [Declarative Policy](https://gitlab.com/gitlab-org/ruby/gems/declarative-policy).
Declarative Policy makes it inexpensive to add new permissions, which has
resulted in our current state of having [over 700 permissions](https://gitlab.com/gitlab-org/gitlab/-/issues/393454#more-context)
in the `gitlab-org/gitlab` codebase. Even our [permissions documentation](../../../user/permissions.md)
contains a table with over 200 rows, each row representing a unique
"permission." Up until now, the proliferation of permissions in the code has
been manageable because these checks are not part of a public API. With custom
roles, however, that is changing.
Our current authorization checks are [often duplicated and sprinkled throughout application code](https://gitlab.com/gitlab-org/gitlab/-/issues/352891#note_958192650). For a single web request, there might be several different
permissions checked in the UI to determine if a user can see those page
elements, another few permissions checks in the Rails controller to determine if
the user can access the route at all, and maybe a few more permissions checks
sprinkled into other Ruby service classes that run as part of the page load.
This approach is [recommended in the GitLab developer documentation](../../../development/permissions/authorizations.md#where-should-permissions-be-checked)
as a "defense-in-depth" measure.
In the context of custom roles, however, this approach will not work. When a
group admin wants to enable a user to take a single action via a custom role,
that group admin should be able to toggle a single, well-named permission to
enable the user with the custom role to view or update a resource. This means
that, for a single web request, we must ensure that only one well-named
permission is checked. And, the access granted for that permission must be
relatively stable so that the admin is not giving users more access than they
think they are. Otherwise, creating and managing custom roles will be overly
complex and a security nightmare.
While the Auth group owns permissions as a feature, each team owns a set of permissions related to their domain area, and these permissions are defined and checked in every
corner of the `gitlab-org/gitlab` codebase. As a result, all engineering teams that
are contributing to the `gitlab-org/gitlab` codebase touch permissions. This
means that it is even more important to provide clear guidelines on the future
of permissions and automate the enforcement of these guidelines.
### Goals
- Make it possible to customize all permissions via custom roles.
- Make the GitLab permissions system worthy of being a public API.
- Improve the naming and consistency of permissions.
- Reduce the overall number of permissions from 700+ to < 100.
- Reduce risk of refactors related to permissions.
- Make refactoring permissions easier by having a way to evaluate behavior other than unit tests and documentation.
- Track ownership of individual permissions so that DRIs can be consulted on any changes related to a permission that they own
- Create a SSoT for permissions behavior.
- Automate generation of permissions documentation.
### Non-Goals
- Pause custom roles project indefinitely while we refactor our existing permissions system (there is high demand for this as an Ultimate feature).
- Perform a total re-write or re-build of our permissions system (too much upfront investment without providing customer value).
- Iteratively work on custom roles without ever getting to feature complete ("iterate to nowhere").
## Proposal
1. Introduce a linter that ensures all new permissions adhere to naming
conventions.
1. Reduce the overall number of permissions from 700+ to < 100 by consolidating
our existing permissions.
1. Introduce ownership tags for each permission that requires owning group to
review any MRs that update that permission.
1. Create a Rake task for generating permissions documentation from code so that
we have a Single Source of Truth for permissions.
## Alternative Solutions
### Do nothing
Pros:
- No need for a lengthy architecture conversation or plan
- May discover methods for improving permissions system organically as we move
forward.
Cons:
- Slow progress in building the custom roles feature without a blueprint for how to
  think about the permissions system as a whole
- Permissions system can spiral into an unmaintainable code if we iterate on it without a strategically important vision.
### Leave the current permissions system as-is and build a parallel system
Instead, build a parallel Declarative Policy-based system alongside it to use for custom roles.
Pros:
- Faster to design and build a new system than to do a large-scale refactor of the existing system.
- Auth team can own this new system entirely.
Cons:
- Maintaining 2 systems
- Each new "regular" permission added needs a parallel addition to the
custom roles system. This makes it difficult to have feature parity between
custom roles and default roles.
- Replacing our existing RBAC system with custom roles (an eventual goal of the
custom roles feature) is more difficult with this approach because it requires
retiring the legacy permissions system.
### Bundle existing permissions into custom permissions
Use "custom permissions" for the custom roles API.
Pros:
- Faster to design and build a new system than to do a large-scale refactor of the existing system.
- Auth team can own these new bundled permissions
Cons:
- Bundling permissions is less granular; the goal of custom permissions is to
enable granular access.
- Each new "regular" permission added needs a parallel addition to the
bundled permissions for custom roles. This makes it difficult to have feature
parity between custom roles and default roles.
## Glossary
- **RBAC**: Role-based access control; "a method of restricting network access based
on the roles of individual users." RBAC is the method of access control that
GitLab uses.
- **Default roles**: the 5 categories that GitLab users can be grouped into: Guest,
Reporter, Developer, Maintainer, Owner ([documentation](../../../user/permissions.md#roles)).
A default role can be thought of as a group of permissions.
- **Declarative Policy**: [code library](https://gitlab.com/gitlab-org/ruby/gems/declarative-policy/)
used by GitLab to define our authorization logic.
- **Permissions**: a specific ability that a user with a Role has. For example, a
Developer can create merge requests but a Guest cannot. Each row listed in
[the permissions documentation](../../../user/permissions.md#project-members-permissions)
represents a "permission" but these may not have a 1:1 mapping with a Declarative Policy
[ability](https://gitlab.com/gitlab-org/ruby/gems/declarative-policy/-/blob/main/doc/defining-policies.md#invocation).
An ability is how permissions are represented in the GitLab codebase.
- **Access level**: integer value representing a default role, used for determining access and calculating inherited user access in group hierarchies ([documentation](../../../api/access_requests.md#valid-access-levels)).
## Resources
- [Custom Roles MVC announcement](https://github.blog/changelog/2021-10-27-enterprise-organizations-can-now-create-custom-repository-roles/)
- [Custom Roles lunch and learn notes](https://docs.google.com/document/d/1x2ExhGJl2-nEibTaQE_7e5w2sDCRRHiakrBYDspPRqw/edit#)
- [Discovery on auto-generating documentation for permissions](https://gitlab.com/gitlab-org/gitlab/-/issues/352891#note_989392294).
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,416 +1,11 @@
---
status: accepted
creation-date: "2022-09-08"
authors: [ "@grzesiek", "@marshall007", "@fabiopitino", "@hswimelar" ]
coach: "@andrewn"
approvers: [ "@sgoldstein" ]
owning-stage: "~devops::enablement"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/rate_limiting/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/rate_limiting/).
# Next Rate Limiting Architecture
## Summary
Introducing reasonable application limits is a very important step in any SaaS
platform scaling strategy. The more users a SaaS platform has, the more
important it is to introduce sensible rate limiting and policies enforcement
that will help to achieve availability goals, reduce the problem of noisy
neighbours for users and ensure that they can keep using a platform
successfully.
This is especially true for GitLab.com. Our goal is to have a reasonable and
transparent strategy for enforcing application limits, which will become a
definition of a responsible usage, to help us with keeping our availability and
user satisfaction at a desired level.
We've been introducing various application limits for many years already, but
we've never had a consistent strategy for doing it. What we want to build now is
a consistent framework used by engineers and product managers, across entire
application stack, to define, expose and enforce limits and policies.
Lack of consistency in defining limits, not being able to expose them to our
users, support engineers and satellite services, has negative impact on our
productivity, makes it difficult to introduce new limits and eventually
prevents us from enforcing responsible usage on all layers of our application
stack.
This blueprint has been written to consolidate our limits and to describe the
vision of our next rate limiting and policies enforcement architecture.
## Goals
**Implement a next architecture for rate limiting and policies definition.**
## Challenges
- We have many ways to define application limits, in many different places.
- It is difficult to understand what limits have been applied to a request.
- It is difficult to introduce new limits, even more to define policies.
- Finding what limits are defined requires performing a codebase audit.
- We don't have a good way to expose limits to satellite services like Registry.
- We enforce a number of different policies via opaque external systems
(Pipeline Validation Service, Bouncer, Watchtower, Cloudflare, HAProxy).
- There is no standardized way to define policies in a way consistent with defining limits.
- It is difficult to understand when a user is approaching a limit threshold.
- There is no way to automatically notify a user when they are approaching thresholds.
- There is no single way to change limits for a namespace / project / user / customer.
- There is no single way to monitor limits through real-time metrics.
- There is no framework for hierarchical limit configuration (instance / namespace / subgroup / project).
- We allow disabling rate-limiting for some marquee SaaS customers, but this
increases a risk for those same customers. We should instead be able to set
higher limits.
## Opportunity
We want to build a new framework, making it easier to define limits, quotas and
policies, and to enforce / adjust them in a controlled way, through robust
monitoring capabilities.
<!-- markdownlint-disable MD029 -->
1. Build a framework to define and enforce limits in GitLab Rails.
2. Build an API to consume limits in satellite service and expose them to users.
3. Extract parts of this framework into a dedicated GitLab Limits Service.
<!-- markdownlint-enable MD029 -->
The most important opportunity here is consolidation happening on multiple
levels:
1. Consolidate on the application limits tooling used in GitLab Rails.
1. Consolidate on the process of adding and managing application limits.
1. Consolidate on the behavior of hierarchical cascade of limits and overrides.
1. Consolidate on the application limits tooling used across entire application stack.
1. Consolidate on the policies enforcement tooling used across entire company.
Once we do that we will unlock another opportunity: to ship the new framework /
tooling as a GitLab feature to unlock these consolidation benefits for our
users, customers and entire wider community audience.
### Limits, quotas and policies
This document aims to describe our technical vision for building the next rate
limiting architecture for GitLab.com. We refer to this architectural evolution
as "the next rate limiting architecture", but this is a mental shortcut,
because we actually want to build a better framework that will make it easier
for us to manage not only rate limits, but also quotas and policies.
Below you can find a short definition of what we understand by a limit, by a
quota and by a policy.
- **Limit:** A constraint on application usage, typically used to mitigate
risks to performance, stability, and security.
- _Example:_ API calls per second for a given IP address
- _Example:_ `git clone` events per minute for a given user
- _Example:_ maximum artifact upload size of 1 GB
- **Quota:** A global constraint in application usage that is aggregated across an
entire namespace over the duration of their billing cycle.
- _Example:_ 400 compute minutes per namespace per month
- _Example:_ 10 GB transfer per namespace per month
- **Policy:** A representation of business logic that is decoupled from application
code. Decoupled policy definitions allow logic to be shared across multiple services
and/or "hot-loaded" at runtime without releasing a new version of the application.
- _Example:_ decode and verify a JWT, determine whether the user has access to the
given resource based on the JWT scopes and claims
- _Example:_ deny access based on group-level constraints
(such as IP allowlist, SSO, and 2FA) across all services
Technically, all of these are limits, because rate limiting is still
"limiting", quota is usually a business limit, and policy limits what you can
do with the application to enforce specific rules. By referring to a "limit" in
this document we mean a limit that is defined to protect business, availability
and security.
### Framework to define and enforce limits
First we want to build a new framework that will allow us to define and enforce
application limits, in the GitLab Rails project context, in a more consistent
and established way. In order to do that, we will need to build a new
abstraction that will tell engineers how to define a limit in a structured way
(presumably using YAML or Cue format) and then how to consume the limit in the
application itself.
We already do have many limits defined in the application, we can use them to
triangulate to find a reasonable abstraction that will consolidate how we
define, use and enforce limits.
We envision building a simple Ruby library here (we can add it to LabKit) that
will make it trivial for engineers to check if a certain limit has been
exceeded or not.
```yaml
name: my_limit_name
actors: user
context: project, group, pipeline
type: rate / second
group: pipeline::execution
limits:
warn: 2B / day
soft: 100k / s
hard: 500k / s
```
```ruby
Gitlab::Limits::RateThreshold.enforce(:my_limit_name) do |threshold|
actor = current_user
context = current_project
threshold.available do |limit|
# ...
end
threshold.approaching do |limit|
# ...
end
threshold.exceeded do |limit|
# ...
end
end
```
In the example above, when `my_limit_name` is defined in YAML, engineers will
be able to check the current state and execute the appropriate code block
depending on the past usage / resource consumption.
Things we want to build and support by default:
1. Comprehensive dashboards showing how often limits are being hit.
1. Notifications about the risk of hitting limits.
1. Automation checking if limits definitions are being enforced properly.
1. Different types of limits - time bound / number per resource etc.
1. A panel that makes it easy to override limits per plan / namespace.
1. Logging that will expose limits applied in Kibana.
1. An automatically generated documentation page describing all the limits.
### Support rate limits based on resources used
One of the problems of our rate limiting system is that values are static
(e.g. 100 requests per minute) and irrespective of the complexity or resources
used by the operation. For example:
- Firing 100 requests per minute to fetch a simple resource can have very different
implications than creating a CI pipeline.
- Each pipeline creation action can perform very differently depending on the
pipeline being created (small MR pipeline VS large scheduled pipeline).
- Paginating resources after an offset of 1000 starts to become expensive on the database.
We should allow some rate limits to be defined as `computing score / period` where for
computing score we calculate the milliseconds accumulated (for all requests executed
and inflight) within a given period (for example: 1 minute).
This way if a user is sending expensive requests they are likely to hit the rate limit earlier.
### API to expose limits and policies
Once we have established a consistent way to define application limits we
can build a few API endpoints that will allow us to expose them to our users,
customers and other satellite services that may want to consume them.
Users will be able to ask the API about the limits / thresholds that have been
set for them, how often they are hitting them, and what impact those might have
on their business. This kind of transparency can help them with communicating
their needs to customer success team at GitLab, and we will be able to
communicate how the responsible usage is defined at a given moment.
Because of how GitLab architecture has been built, GitLab Rails application, in
most cases, behaves as a central enterprise service bus (ESB) and there are a
few satellite services communicating with it. Services like container registry,
GitLab Runners, Gitaly, Workhorse, KAS could use the API to receive a set of
application limits they are supposed to enforce. This will still allow us to
define all of them in a single place.
We should, however, avoid the possible negative-feedback-loop, that will put
additional strain on the Rails application when there is a sudden increase in
usage happening. This might be a big customer starting a new automation that
traverses our API or a Denial of Service attack. In such cases, the additional
traffic will reach GitLab Rails and subsequently also other satellite services.
Then the satellite services may need to consult Rails again to obtain new
instructions / policies around rate limiting the increased traffic. This can
put additional strain on Rails application and eventually degrade performance
even more. In order to avoid this problem, we should extract the API endpoints
to a separate service (see the section below) if the request rate to those
endpoints depends on the volume of incoming traffic. Alternatively we can keep
those endpoints in Rails if the increased traffic will not translate into
increase of requests rate or increase in resources consumption on these API
endpoints on the Rails side.
#### Decoupled Limits Service
At some point we may decide that it is time to extract a stateful backend
responsible for storing metadata around limits, all the counters and state
required, and exposing API, out of Rails.
It is impossible to make a decision about extracting such a decoupled limits
service yet, because we will need to ship more proof-of-concept work, and
concrete iterations to inform us better about when and how we should do that. We
will depend on the Evolution Architecture practice to guide us towards either
extracting Decoupled Limits Service or not doing that at all.
As we evolve this blueprint, we will document our findings and insights about
how this service should look like, in this section of the document.
### GitLab Policy Service
_Disclaimer_: Extracting a GitLab Policy Service might be out of scope
of the current workstream organized around implementing this blueprint.
Not all limits can be easily described in YAML. There are some more complex
policies that require a bit more sophisticated approach and a declarative
programming language used to enforce them. One example of such a language might be
[Rego](https://www.openpolicyagent.org/docs/latest/policy-language/) language.
It is a standardized way to define policies in
[OPA - Open Policy Agent](https://www.openpolicyagent.org/). At GitLab we are
already using OPA in some departments. We envision the need for additional
consolidation to not only consolidate on the tooling we are using internally at
GitLab, but to also transform the Next Rate Limiting Architecture into
something we can make a part of the product itself.
Today, we already do have a policy service we are using to decide whether a
pipeline can be created or not. There are many policies defined in
[Pipeline Validation Service](https://gitlab.com/gitlab-org/modelops/anti-abuse/pipeline-validation-service).
There is a significant opportunity here in transforming Pipeline Validation
Service into a general purpose GitLab Policy Service / GitLab Policy Agent that
will be well integrated into the GitLab product itself.
Generalizing Pipeline Validation Service into GitLab Policy Service can bring a
few interesting benefits:
1. Consolidate on our tooling across the company to improve efficiency.
1. Integrate our GitLab Rails limits framework to resolve policies using the policy service.
1. Do not struggle to define complex policies in YAML and hack evaluating them in Ruby.
1. Build a policy for GraphQL queries limiting using query execution cost estimation.
1. Make it easier to resolve policies that do not need "hierarchical limits" structure.
1. Make GitLab Policy Service part of the product and integrate it into the single application.
We envision using GitLab Policy Service to be the place to define policies that do
not require knowing anything about the hierarchical structure of the limits.
There are limits that do not need this, like IP addresses allow-list, spam
checks, configuration validation etc.
We defined "Policy" as a stateless, functional-style, limit. It takes input
arguments and evaluates to either true or false. It should not require a global
counter or any other volatile global state to get evaluated. It may still
require globally defined rules / configuration, but this state is not
volatile in the same way a rate limiting counter may be, or the megabytes consumed
to evaluate a quota limit.
#### Policies used internally and externally
The GitLab Policy Service might be used in two different ways:
1. Rails limits framework will use it as a source of policies enforced internally.
1. The policy service feature will be used as a backend to store policies defined by users.
These are two slightly different use-cases: first one is about using
internally-defined policies to ensure the stability / availability of a GitLab
instance (GitLab.com or self-managed instance). The second use-case is about
making GitLab Policy Service a feature that users will be able to build on top
of.
Both use-cases are valid but we will need to make technical decision about how
to separate them. Even if we decide to implement them both in a single service,
we will need to draw a strong boundary between the two.
The same principle might apply to the Decoupled Limits Service described in one of
the sections of this document above.
#### The two limits / policy services
It is possible that GitLab Policy Service and Decoupled Limits Service can
actually be the same thing. It, however, depends on the implementation details
that we can't predict yet, and the decision about merging these services
together will need to be informed by subsequent iterations' feedback.
## Hierarchical limits
GitLab application aggregates users, projects, groups and namespaces in a
hierarchical way. This hierarchical structure has been designed to make it
easier to manage permissions, streamline workflows, and allow users and
customers to store related projects, repositories, and other artifacts,
together.
It is important to design the new rate limiting framework in a way that it is
built on top of this hierarchical structure and engineers, customers, SREs and
other stakeholders can understand how limits are being applied, enforced and
overridden within the hierarchy of namespaces, groups and projects.
We want to reduce the cognitive load required to understand how limits are
being managed within the existing permissions structure. We might need to build
a simple and easy-to-understand formula for how our application decides which
limits and thresholds to apply for a given request and a given actor:
> GitLab will read default limits for every operation, all overrides configured
> and will choose a limit with the highest precedence configured. A limit
> precedence needs to be explicitly configured for every override, a default
> limit has precedence 100.
One way in which we can simplify limits management in general is to:
1. Have default limits / thresholds defined in YAML files with a default precedence 100.
1. Allow limits to be overridden through the API, store overrides in the database.
1. Every limit / threshold override needs to have an integer precedence value provided.
1. Build an API that will take an actor and expose limits applicable for it.
1. Build a dashboard showing actors with non-standard limits / overrides.
1. Build observability around this, showing in Kibana when non-standard limits are being used.
The points above represent an idea to use precedence score (or Z-Index for
limits), but there may be better solutions, like just defining a direction of
overrides - a lower limit might always override a limit defined higher in the
hierarchy. Choosing a proper solution will require a thoughtful research.
## Principles
1. Try to avoid building rate limiting framework in a tightly coupled way.
1. Build application limits API in a way that it can be easily extracted to a separate service.
1. Build application limits definition in a way that is independent from the Rails application.
1. Build tooling that produces consistent behavior and results across programming languages.
1. Build the new framework in a way that we can extend to allow self-managed administrators to customize limits.
1. Maintain consistent features and behavior across SaaS and self-managed codebase.
1. Be mindful about a cognitive load added by the hierarchical limits, aim to reduce it.
## Phases and iterations
1. **Compile examples of current most important application limits (Owning Team)**
- Owning Team (in collaboration with Stage Groups) compiles a list of the
most important application limits used in Rails today.
1. **Implement Rate Limiting Framework in Rails (Owning Team)**
- Triangulate rate limiting abstractions based on the data gathered in Phase 1.
- Develop YAML model for limits.
- Build Rails SDK.
- Create examples showcasing usage of the new rate limits SDK.
1. **Team fan out of Rails SDK (Stage Groups)**
- Individual stage groups begin using the SDK built in Phase 2 for new limit and policies.
- Stage groups begin replacing historical ad hoc limit implementations with the SDK.
- (Owning team) Provides means to monitor and observe the progress of the replacement effort. Ideally this is broken down to the `feature_category` level to drive group-level buy-in.
1. **Enable Satellite Services to Use the Rate Limiting Framework (Owning Team)**
- Determine if the goals of Phase 4 are best met by either:
- Extracting the Rails rate limiting service into a decoupled service.
- Implementing a separate Go library which uses the same backend (for example, Redis) for rate limiting.
1. **SDK for Satellite Services (Owning Team)**
- Build Go SDK.
- Create examples showcasing usage of the new rate limits SDK.
1. **Team fan out for Satellite Services (Stage Groups)**
- Individual stage groups begin using the SDK built in Phase 5 for new limit and policies.
- Stage groups begin replacing historical ad hoc limit implementations with the SDK.
## Status
Request For Comments.
## Timeline
- 2022-04-27: [Rate Limit Architecture Working Group](https://handbook.gitlab.com/handbook/company/working-groups/rate-limit-architecture/) started.
- 2022-06-07: Working Group members [started submitting technical proposals](https://gitlab.com/gitlab-org/gitlab/-/issues/364524) for the next rate limiting architecture.
- 2022-06-15: We started [scoring proposals](https://docs.google.com/spreadsheets/d/1DFHU1kSdTnpydwM5P2RK8NhVBNWgEHvzT72eOhB8F9E) submitted by Working Group members.
- 2022-07-06: A fourth, [consolidated proposal](https://gitlab.com/gitlab-org/gitlab/-/issues/364524#note_1017640650), has been submitted.
- 2022-07-12: Started working on the design document following [Architecture Evolution Workflow](https://handbook.gitlab.com/handbook/engineering/architecture/workflow/).
- 2022-09-08: The initial version of the blueprint has been merged.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,37 +1,11 @@
---
owning-stage: "~devops::create"
description: 'GitLab Remote Development ADR 100: New agent authorization strategy'
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/remote_development/decisions/100_new_agent_authorization_strategy/'
remove_date: '2025-07-08'
---
# GitLab Remote Development ADR 001: New agent authorization strategy
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/remote_development/decisions/100_new_agent_authorization_strategy/).
## Context
A decision was made to drop the legacy agent authorization strategy in favor of the new agent authorization strategy. As covered in [detailed proposal](https://gitlab.com/gitlab-org/remote-development/gitlab-remote-development-docs/-/blob/main/doc/proposal-for-mapping-projects-to-agents.md?ref_type=heads#problems-with-the-current-solution), the current solution has several issues that make it unsuitable for the long-term needs of the GitLab Remote Development feature. The main problems are:
1. **Limited flexibility**: The legacy agent authorization strategy relies on granting group-level Developer role to potential users. This makes it unsuitable for use in some organisations where users are not granted access at a group level.
1. **Potential security risks**: The legacy approach allows any user with Developer role within a limited scope to spin up a GitLab Agent and have it be potentially used for workspaces by users with relevant access to the project. Since workspaces contain privileged information such as secrets, more control should be enforced on which GitLab Agents may be selected for hosting workspaces within a given scope (for example, a group), as it is with GitLab CI Runners.
## Decision
Taking inspiration from the authorization model for GitLab CI Runners, a new authorization strategy for GitLab Agents will be introduced. In order to understand how workspaces can be created using the new authorization strategy, it's important to understand the following rules:
- A user can only create workspaces using a cluster agent that is "available", and which has been configured for remote_development.
- A user must have Developer role to both the agent project and the workspace project.
- An agent is considered "available" for use with a workspace if a group owner or administrator has mapped the cluster agent to any parent group of the workspace project. Another way of looking at it is; a mapping between a cluster agent and a group is inherited by its subgroups.
- Mapping between a cluster agent and a group is a new concept that has been introduced with the revamped authorization strategy. A group owner may create a mapping between the group and any cluster agent residing within the group or its subgroup. **NOTE:** By default, no cluster agent is mapped to a group. Additionally, if a project resides within a group, it does NOT imply that the cluster agents of this project are mapped to the parent group(s).
In addition to the above, the first phase of delivery will have the following restrictions:
- A GitLab Agent may only be mapped to a group. In the future, mapping cluster agents to the instance, user namespaces etc. can/should be explored.
- A GitLab Agent may only be mapped to a parent group. The group in question may or may not be a direct parent. For example, if an agent belongs to a project with path `root-group/nested-group/agent-project`, then the agent may be mapped to either `root-group` and/or `nested-group`. In the future, there may be a need to consider mapping agents to a non-parent group. However, this will increase the scope of the task significantly due to additional considerations: for example, what if some owners/maintainers of a group do not have access to the agent being mapped? This is not a problem when the agent is contained within the group. However, this use case will have to be thought through if such a capability must be supported consistently.
For more details on the new authorization strategy, please refer to the [detailed technical design](https://gitlab.com/gitlab-org/remote-development/gitlab-remote-development-docs/-/blob/e28003334fda100295ed41bd84eef2b1770d86af/doc/tech-designs/2024-01-23-support-group-agent-authorization.md).
## Consequences
Since the new strategy is incompatible with the legacy authorization strategy, this feature will be put behind a feature flag and rolled out gradually. Additionally, in order to provide a smooth user experience during feature rollout, a one-time data migration will take place to create mappings between root groups and remote development cluster agents within these groups. After this migration, for any changes desired to the list of cluster agents available during workspace creation, users will be required to explicitly create/delete mappings.
## Alternatives
NA
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,752 +1,11 @@
---
status: ongoing
creation-date: "2022-11-15"
authors: [ "@vtak" ]
coach: "@grzesiek"
approvers: [ "@michelle-chen", "@adebayo_a" ]
owning-stage: "~devops::create"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/remote_development/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/remote_development/).
# Remote development
## Decisions
- [100: New agent authorization strategy](decisions/100_new_agent_authorization_strategy.md)
## Summary
Remote development is a new architecture for our software-as-a-service platform that provides a more consistent user experience writing code hosted in GitLab. It may also provide additional features in the future, such as a purely browser-based workspace and the ability to connect to an already running VM/Container or to use a GitLab-hosted VM/Container.
## Web IDE and remote development
It is important to note that `remote development !== Web IDE`, and this is something we want to be explicit about in this document as the terms can become conflated when they shouldn't. Our new Web IDE is a separate ongoing effort that is running in parallel to remote development.
These two separate categories do have some overlap as it is a goal to allow a user to connect a running workspace to the Web IDE, **but** this does not mean the two are dependent on one another.
You can use the [Web IDE](../../../user/project/web_ide/index.md) to commit changes to a project directly from your web browser without installing any dependencies or cloning any repositories. The Web IDE, however, lacks a native runtime environment on which you would compile code, run tests, or generate real-time feedback in the IDE. For a more complete IDE experience, you can pair the Web IDE with a remote development workspace that has been properly configured to run as a host.
## Long-term vision
As a [new Software Developer to a team such as Sasha](https://handbook.gitlab.com/handbook/product/personas/#sasha-software-developer) with no local development environment, I should be able to:
- Go to a repository on GitLab.com or self-managed.
- Click a button that will provide a list of current workspaces for this repository.
- Click a button that will create a new workspace or select an existing workspace from a list.
- Go through a configuration wizard that will let me select various options for my workspace (memory/CPU).
- Start up a workspace from the Web IDE and within a minute have a fully interactive terminal panel at my disposal.
- Make code changes, run tests, troubleshoot based on the terminal output, and commit new changes.
- Submit MRs of any kind without having to clone the repository locally or to manually update a local development environment.
## Terminology
We use the following terms to describe components and properties of the remote development architecture.
### Remote development
Remote development allows you to use a secure development environment in the cloud that you can connect to from your local machine through a web browser or a client-based solution with the purpose of developing a software product there.
#### Remote development properties
- Separate your development environment to avoid impacting your local machine configuration.
- Make it easy for new contributors to get started and keep everyone on a consistent environment.
- Use tools or runtimes not available on your local OS or manage multiple versions of them.
- Access an existing development environment from multiple machines or locations.
Discouraged synonyms: VS Code for web, remote development Extension, browser-only WebIDE, Client only WebIDE
### Workspace
Container/VM-based developer machines providing all the tools and dependencies needed to code, build, test, run, and debug applications.
#### Workspace properties
- Workspaces should be isolated from each other by default and are responsible for managing the lifecycle of their components. This isolation can be multi-layered: namespace isolation, network isolation, resources isolation, node isolation, sandboxing containers, etc. ([reference](https://kubernetes.io/docs/concepts/security/multi-tenancy/)).
- A workspace should contain project components as well as editor components.
- A workspace should be a combination of resources that support cloud-based development environment.
- Workspaces are constrained by the amount of resources provided to them.
### Web IDE
VS Code for web - replacement of our current legacy Web IDE.
#### Web IDE properties
A package for bootstrapping GitLab context-aware Web IDE that:
- Is built on top of Microsoft's VS Code. We customize and add VS Code features in the [GitLab fork of the VS Code project](https://gitlab.com/gitlab-org/gitlab-web-ide-vscode-fork).
- Can be configured in a way that it connects to the workspace rather than only using the browser. When connected to a workspace, a user should be able to do the following from the Web IDE:
- Edit, build, or debug on a different OS than they are running locally.
- Make use of larger or more specialized hardware than their local machine for development.
- Separate developer environments to avoid conflicts, improve security, and speed up onboarding.
### Remote development extension for desktop
Something that plugs into the desktop IDE and connects you to the workspace.
#### Remote development extension for desktop properties
- Allows you to open any folder in a workspace.
- Should be desktop IDE agnostic.
- Should have access to local files or APIs.
## Goals
### A consistent experience
Organizations should have the same user experience on our SaaS platform as they do on a self-managed GitLab instance. We want to abstract away the user's development environment to avoid impacting their local machine configuration. We also want to provide support for developing on the same operating system you deploy to or use larger or more specialized hardware.
A major goal is that each member of a development team should have the same development experience minus any specialized local configuration. This will also make it easy for new contributors to get started and keep everyone on a consistent environment.
### Increased availability
A workspace should allow access to an existing development environment from multiple machines and locations across a single or multiple teams. It should also allow a user to make use of tools or runtimes not available on their local OS or manage multiple versions of them.
Additionally, remote development workspaces could provide a way to implement disaster recovery if we are able to leverage the capabilities of [Cells](../../../architecture/blueprints/cells/index.md).
### Scalability
As an organization begins to scale, they quickly realize the need to support additional types of projects that might require extensive workflows. Remote development workspaces aim to solve that issue by abstracting away the burden of complex machine configuration, dependency management, and possible data-seeding issues.
To facilitate working on different features across different projects, remote development should allow each user to provision multiple workspaces to enable quick context switching.
Eventually, we should be able to allow users to vertically scale their workspaces with more compute cores, memory, and other resources. If a user is currently working against a 2 CPU and 4 GB RAM workspace but comes to find they need more CPU, they should be able to upgrade their compute layer to something more suitable with a click or CLI command in the workspace.
### Built-in security and enterprise readiness
As remote development becomes a viable replacement for virtual desktop infrastructure solutions, it must be secure and support enterprise requirements. These include role-based access control and the ability to remove all source code from developer machines.
### Faster project and developer onboarding
As a zero-install development environment that runs in your browser, remote development makes it easy for anyone to join your team and contribute to a project.
### Regions
GitLab.com is only hosted in the United States of America. Organizations located in other regions have voiced demand for local SaaS offerings. BYO infrastructure helps work in conjunction with [GitLab Regions](https://gitlab.com/groups/gitlab-org/-/epics/6037) because a user's workspace may be deployed in different geographies. The ability to deploy workspaces to different geographies might also help to solve data residency and compliance problems.
## Market analysis
We have conducted a market analysis to understand the broader market and what others can offer us by way of open-source libraries, integrations, or partnership opportunities. We have broken down the effort into a set of issues where we investigate each potential competitor/pathway/partnership as a spike.
- [Market analysis](https://gitlab.com/groups/gitlab-org/-/epics/8131)
- [YouTube results](https://www.youtube.com/playlist?list=PL05JrBw4t0KrRQhnSYRNh1s1mEUypx67-)
## Che vs DevWorkspace Operator vs custom-built solution
After an investigation into using [Che](https://gitlab.com/gitlab-org/gitlab/-/issues/366052) as our backend to accelerate remote development, we ultimately opted to [write our own custom-built solution using DevWorkspace Operator](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/97449#note_1131215629).
Some advantages of us opting to write our own custom-built solution are:
- We can still use the core DevWorkspace Operator and build on top of it.
- It is easier to add support for other configurations apart from `devfile` in the future if the need arises.
- We have the ability to choose which tech stack to use (for example, instead of using `traefik`, which is used in Che, explore NGINX itself or use the GitLab agent for Kubernetes).
After writing our own custom-built solution using DevWorkspace Operator,
we decided to [remove the dependency on DevWorkspace Operator](https://gitlab.com/groups/gitlab-org/-/epics/9895)
and thus the transitive dependency of Cert Manager.
## Architecture details
Remote development is delivered as a module in the
[GitLab agent for Kubernetes](../../../user/clusters/agent/index.md) project.
The overall goal of this architecture is to ensure that the **actual state** of all
remote development workspaces running in the Kubernetes clusters is reconciled with the **desired state** of the
workspaces as set by the user.
This is accomplished as follows:
1. The desired state of the workspaces is obtained from user actions in the GitLab UI or API and persisted in the Rails database.
1. There is a reconciliation loop between the agent and Rails, which:
- Retrieves the actual state of the workspaces from the Kubernetes clusters through the agent and sends it to Rails to be persisted.
- Rails compares the actual state with the desired state and responds with actions to bring the actual state in line with the desired state for all workspaces.
### System design
```plantuml
@startuml
title
System design for Remote Development
end title
node "Kubernetes" {
[Ingress Controller] --> [GitLab Workspaces Proxy] : Decrypt traffic
note left of "Ingress Controller"
Customers can choose
an ingress controller
of their choice
end note
[GitLab Workspaces Proxy] ..> [Workspace n] : Forward traffic\nfor workspace n
[GitLab Workspaces Proxy] ..> [Workspace 2] : Forward traffic\nfor workspace 2
[GitLab Workspaces Proxy] --> [Workspace 1] : Forward traffic\nfor workspace 1
[Agent] .up.> [Workspace n] : Applies kubernetes resources\nfor workspace n
[Agent] .up.> [Workspace 2] : Applies kubernetes resources\nfor workspace 2
[Agent] .up.> [Workspace 1] : Applies kubernetes resources\nfor workspace 1
[Agent] --> [Kubernetes API Server] : Interact and get/apply\nKubernetes resources
}
node "GitLab" {
[Nginx] --> [GitLab Rails] : Forward\ntraffic
[GitLab Rails] --> [Postgres] : Access database
[GitLab Rails] --> [Gitaly] : Fetch files
[KAS] -up-> [GitLab Rails] : Proxy
}
[Agent] -up-> [KAS] : Initiate reconciliation loop
[L7 Load Balancer] --> [Ingress Controller]
[Browser] --> [Nginx] : Browse GitLab
[Browser] -right-> [L7 Load Balancer] : Browse workspace URL
[GitLab Workspaces Proxy] --> [GitLab Rails] : Authenticate and authorize\nthe user accessing the workspace
[L7 Load Balancer] -right[hidden]-> [L4 Load Balancer]
[L4 Load Balancer] --> [GitLab Workspaces Proxy] : Forward traffic
[Terminal] -left-> [L4 Load Balancer] : Connect to workspace SSH URL
@enduml
```
### Remote development with the GitLab agent for Kubernetes topology
- The Kubernetes API is not shown in this diagram, but it is assumed that it is managing the workspaces through the agent.
- The numbers of components in each Kubernetes cluster are arbitrary.
```plantuml
@startuml
title
Remote Development with GitLab agent for Kubernetes topology
end title
node "GitLab Monolith" as gitlab {
rectangle "kas deployment" as kas_deployment {
collections kas1..kas8
}
rectangle rails
database postgres
kas_deployment - rails
rails -left- postgres
}
node "kubernetes cluster 1" as kubernetes1 {
rectangle "agent A workspaces" as agent_a_workspaces {
collections workspace2..workspace8
rectangle workspace1
}
rectangle "agent B workspaces" as agent_b..agent_b_8_workspaces {
collections workspace10..workspace16
rectangle workspace9
}
rectangle "agent A deployment" as agent_a_deployment {
rectangle agent_a_1
}
rectangle "agent B deployment" as agent_b_deployment {
collections agent_b_1..agent_b_8
}
agent_a_1 - agent_a_workspaces
agent_b_1..agent_b_8 - agent_b..agent_b_8_workspaces
}
node "kubernetes cluster 2" as kubernetes2 {
rectangle "agent C workspaces" as agent_c_workspaces {
collections workspace18..workspace24
rectangle workspace17
}
rectangle "agent C deployment" as agent_c_deployment {
rectangle agent_c_1
}
agent_c_1 -down- agent_c_workspaces
}
cloud cloud
cloud - kas1..kas8
cloud - agent_a_1
cloud - agent_b_1..agent_b_8
cloud - agent_c_1
'the following hidden line is a hack to get the diagram to render correctly
agent_a_1 -[hidden]- agent_b_1..agent_b_8
gitlab -[hidden]d- kubernetes2
@enduml
```
### High-level overview of the communication between Rails and the agent
```plantuml
@startuml
!pragma teoz true
title
High level overview of the communication between rails and agent
end title
box gitlab monolith #Beige
participant rails order 20
box kas #Bisque
participant "kas" as kas order 40
end box
end box
box Kubernetes cluster #Beige
box agent #Bisque
participant "agent remote_development\nmodule" as agent_rd_mod order 50
end box
participant kubernetes order 60
end box
loop forever
agent_rd_mod -> kubernetes: Subscribe to Kubernetes for\nworkspace changes associated with the agent
activate agent_rd_mod
autoactivate on
agent_rd_mod -> kas: POST request with\nupdated workspace information
note right
Any updated workspace information from
Kubernetes is pushed with next reconciliation.
end note
kas -> rails: proxy POST request from agent to rails
return Respond with all workspaces to be created/updated/terminated\nalong with an acknowledgement that all information sent by\nagent have been persisted into the database successfully
return proxy workspace information to agent
autoactivate off
agent_rd_mod -> kubernetes: Apply any received workspace information to kubernetes
deactivate agent_rd_mod
end loop
@enduml
```
### Types of messages between Rails and the agent
The agent can send different types of messages to Rails to capture different information. Depending on what type of message the agent sends, Rails will respond accordingly.
Different types of messages are:
- `prerequisites` (yet to be implemented) - This is the first message the agent sends to Rails after the agent starts or restarts or after a leader-election.
- Actions performed by the agent:
- Fetch Kubernetes resources that are required to be available in the Kubernetes cluster
- Actions performed by Rails:
- Send the Kubernetes manifests for `gitlab-workspaces-proxy` that need to be available in the Kubernetes cluster.
- `reconcile` - Messages sent to Rails to persist the current state of the workspaces. There are two types of updates specified by the `update_type` field with the following possible values: `full` and `partial`. The payload schema remains the same for both update types.
- `full`
- Actions performed by the agent:
- Send the current state of all the workspaces in the Kubernetes cluster managed by the agent.
- To keep things consistent between the agent and Rails, the agent will send this message every time the agent undergoes a full reconciliation cycle, which occurs
- when an agent starts or restarts
- after a leader-election
- periodically, as set using the `full_sync_interval` configuration (default: once every hour)
- whenever the agent configuration is updated
- Actions performed by Rails:
- Update Postgres with the current state and respond with all the workspaces managed by the agent and their last resource version that Rails has persisted in Postgres.
- Returning the persisted resource version back to the agent gives it a confirmation that the updates for that workspace have been successfully processed on the Rails end.
- This persisted resource version will also help with sending only the latest workspace changes from the agent to Rails for a `reconcile` message with the `partial` update type.
- `partial`
- Actions performed by the agent:
- Send the latest workspace changes to Rails that are not yet persisted in Postgres. This persisted resource version will help with sending only the latest workspace changes from the agent to Rails.
- Actions performed by Rails:
- Update Postgres with the current state and respond with the workspaces to be created/updated/deleted in the Kubernetes cluster and their last resource version that Rails has persisted in Postgres.
- The workspaces to be created/updated/deleted are calculated by using the filter `desired state updated at >= agent info reported at`.
- Returning the persisted resource version back to the agent gives it a confirmation that the updates for that workspace have been successfully processed on the Rails end.
### Event-driven polling vs full or partial reconciliation
It was initially considered desirable to be able to tell the agent to not wait for the next reconciliation loop but instead poll immediately. This would grant the following benefits:
1. This would grant the ability to trigger a full reconciliation on demand that would allow on-demand recovery/resetting of module state in the agent.
1. Apart from making the architecture more event-driven and real-time it would also help to increase the interval between reconciliation polls, thus reducing the load on the infrastructure.
However, as the prospective solutions were evaluated, it was concluded that there are very few/rare cases that would merit this capability, especially given the complexity of the viable options. An eventual reconciliation of state would suffice for most cases and it could be simply achieved through full reconciliation that is carried out periodically (with a longer interval as compared to partial reconciliation).
You can read more in this [issue](https://gitlab.com/gitlab-org/gitlab/-/issues/387090) and [conclusion comment](https://gitlab.com/gitlab-org/remote-development/gitlab-remote-development-docs/-/merge_requests/13#note_1282495106).
## Workspace states
- `CreationRequested` - Initial state of a Workspace; Creation requested by user but hasn't yet been acted on
- `Starting` - In the process of being ready for use
- `Running` - Ready for use
- `Stopping` - In the process of scaling down
- `Stopped` - Persistent storage is still available but workspace has been scaled down
- `Failed` - Kubernetes resources have been applied by `agentk` but are not ready due to various reasons (for example, crashing container)
- `Error` - Kubernetes resources failed to get applied by `agentk`
- `RestartRequested` - User has requested a restart of the workspace but the restart has not yet successfully happened
- `Terminating` - User has requested the termination of the workspace and the action has been initiated but not yet completed.
- `Terminated` - Persistent storage has been deleted and the workspace has been scaled down
- `Unknown` - Not able to understand the actual state of the workspace
### Possible `actual_state` values
The `actual_state` values are determined from the `status` attribute in the Kubernetes deployment changes, which the agent listens to and sends to Rails.
The following diagram represents the typical flow of the `actual_state` values for a `Workspace` record based on the
`status` values received from the agent. The `status` is parsed to derive the `actual_state` of the workspace based on different conditions.
However, any of these states can be skipped if there have been any
transitional `status` updates that were not received from the agent for some reason (a quick transition, a
failure to send the event, etc).
```plantuml
[*] --> CreationRequested
CreationRequested : Initial state before\nworkspace creation\nrequest is sent\nto kubernetes
CreationRequested -right-> Starting : status=Starting
CreationRequested -right-> Error : Could not create\nworkspace
Starting : Workspace config is being\napplied to kubernetes
Starting -right-> Running : status=Running
Starting -down-> Failed : status=Failed\n(container crashing)
Running : Workspace is running
Running -down-> Stopping : status=Stopping
Running -down-> Failed : status=Failed\n(container crashing)
Running -down-> Terminated : status=Terminated
Running -right-> Error : Could not\nstop/terminate\nworkspace
Stopping : Workspace is stopping
Stopping -down-> Stopped : status=Stopped
Stopping -left-> Failed : status=Failed\n(could not\nunmount volume\nand stop workspace)
Stopped : Workspace is Stopped\nby user request
Stopped -left-> Failed : status=Failed\n(could not\nunmount volume\nterminate workspace)
Stopped -right-> Error : Could not\nstart/terminate\nworkspace
Stopped -down-> Terminated : status=Terminated
Stopped -up-> Starting : status=Starting
Terminated: Workspace has been deleted
Failed: Workspace is not ready due to\nvarious reasons(for example, crashing container)
Failed -up-> Starting : status=Starting\n(container\nnot crashing)
Failed -right-> Stopped : status=Stopped
Failed -down-> Terminated : status=Terminated
Failed -down-> Error : Could not\nstop/terminate\nworkspace
Error: Kubernetes resources failed to get applied
Error -up-> Terminated : status=Terminated
Unknown: Unable to understand the actual state of the workspace
```
### Possible `desired_state` values
The `desired_state` values are determined from the user's request to Rails and are sent to the agent by Rails.
`desired_state` is a subset of the `actual_state` with only `Running`, `Stopped`, `Terminated` and `RestartRequested` values.
The state reconciliation logic in Rails will
continually attempt to transition the `actual_state` to the `desired_state` value, unless the workspace is in an unrecoverable state.
There is also an additional supported state of `RestartRequested` which is only valid for `desired_state`.
This value is not a valid value for `actual_state`. It is required in order for Rails to
initiate a restart of a started workspace. It will only persist until a `status` of `Stopped` is received
from the agent, indicating that the restart request was successful and in progress or completed.
At this point, the `desired_state` will be automatically changed to `Running` to trigger the workspace to restart again.
If there is a failure to restart the workspace, and a `Stopped` status is never received, the
`desired_state` will remain `RestartRequested` until a new `desired_state` is specified.
```plantuml
[*] --> Running
Running : Workspace is running
Running -down-> Stopped : status=Stopped
Running -left-> Terminated : status=Terminated
Stopped : Workspace is Stopped\nby user request
Stopped -up-> Running : status=Running
Stopped -down-> Terminated : status=Terminated
Terminated: Workspace has been deleted
RestartRequested : User has requested a workspace restart.\n**desired_state** will automatically change\nto **'Running'** if actual state\nof **'Stopped'** is received.
RestartRequested -left-> Running : status=Running
```
## Injecting environment variables and files into a workspace
Like CI, there is a need to inject environment variables and files into a workspace.
These environment variables and files will be frozen in time during workspace creation to ensure the same values
are injected into the workspace every time it starts/restarts.
Thus, a new database table, on the lines of `ci_job_variables` will be required.
This table will contain the following columns -
- `key` - To store the name of the environment variable or the file.
- `encrypted_value` - To store the encrypted value of the environment variable or the file.
- `encrypted_value_iv` - To store the initialization vector used for encryption.
- `workspace_id` - To reference the workspace the environment variable or the file is to be injected into.
- `variable_type` - To store whether this data is to be injected as an environment variable or a file.
To perform the encryption, the GitLab instance level secret key is used. The data about the environment variables
and files will only be sent to the agent when required, that is:
- When a new workspace creation request has been received from the user and an agent initiates a partial reconciliation request
- When an agent initiates a full reconciliation request
More details about the implementation can be found in this [epic](https://gitlab.com/groups/gitlab-org/-/epics/10882).
We need to keep in mind potential performance concerns of decrypting workspace variables on the Rails side,
and perform benchmarks of what scale we will reach unacceptably long request times for a reconcile request.
e.g. a reconcile request for 100 workspaces with 20 encrypted values each == 2000 decryptions in a single request.
More details about the benchmarking can be found in this [issue](https://gitlab.com/gitlab-org/gitlab/-/issues/421504).
When a workspace is created from a project, it will inherit all the variables from the group/subgroup/project hierarchy
which are defined under [`Settings > CI/CD > Variables`](../../../ci/variables/index.md#define-a-cicd-variable-in-the-ui).
This aspect will be generalized to allow for defining `Variables` which will be inherited in both CI/CD and Workspaces.
A user will also be able to define, at a user level, environment variables and files to be injected into each
workspace created by them. While creating a workspace, a user would be able to override any environment variable
or file that is inherited from the group/subgroup/project/user hierarchy.
## Git operations from within a workspace
When a new workspace is created, a new personal access token associated to the user who created the workspace
will be generated. This personal access token will be tied to the lifecycle of the workspace and will be injected
into the workspace as a file to allow for cloning private projects and supporting transparent Git operations from
within the workspace out-of-the-box among other things using a
[custom Git credential helper](https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage).
[Investigation](https://gitlab.com/gitlab-org/gitlab/-/issues/421289#note_1511631931) into using
ephemeral tokens(JWTs/OAuth/OIDC/etc.) instead of Personal Access Tokens revealed the need to have a common
JWT Authentication/Authorization layer at GitLab which can be tracked in this [issue](https://gitlab.com/gitlab-org/gitlab/-/issues/421983).
Once such a feature is available, Personal Access Tokens for each workspace would be replaced with JWT tokens.
## Workspace user traffic authentication and authorization
We need to only allow certain users to access workspaces. Currently, we are restricting this to the creator/owner of the workspace. After the workspace is created, it needs to be exposed to the network so that the user can connect to it.
Thus, any traffic incoming to the workspace needs to be authenticated and authorized.
[`gitlab-workspaces-proxy`](../../../user/workspace/set_up_workspaces_proxy.md) handles discovery, authentication and authorization of the workspaces running in a Kubernetes cluster.
It will proxy all HTTP and WebSocket calls to the correct workspace. It will perform the following tasks:
1. Workspace discovery - The proxy will auto-discover workspaces based on labels of Kubernetes service resources. The proxy will watch the Kubernetes API for the creation/update/deletion of Kubernetes service resources. When a service resource is created, the proxy will automatically configure itself to use the corresponding service as an upstream. Thus it will require a Kubernetes service account and a role that allows it to watch, list, and get service resources.
1. Authentication - It will use the [OAuth 2.0 flow](../../../api/oauth2.md) with GitLab to authenticate the user. GitLab will act as the identity provider. If the customer uses a third party SSO service to sign in to GitLab, the flow would automatically delegate authentication to that provider. One of the complexities with authentication is the fact that each workspace is served on its own domain, and therefore we can't set the redirect URI on the GitLab app to a specific workspace. We need to set a state in the OAuth 2.0 flow to redirect to the correct workspace.
1. Authorization - The proxy will make a call to a GitLab GraphQL endpoint with the user's credentials obtained in the authentication phase. The endpoint will validate if the user has access to the workspace and accordingly return either a 404 or a 200.
1. Session Management - The proxy will be stateless and be deployed without additional third party software such as a cache or database, and therefore will be using a signed JWT to manage sessions. The JWT is signed using a key provided to the proxy during startup.
All traffic incoming to the Kubernetes cluster on a given domain is forwarded to `gitlab-workspaces-proxy`, which then decides how to serve that traffic.
```mermaid
flowchart TB
UserWorkspaceTraffic[User Workspace Traffic] --> Ingress
subgraph WorkspaceCluster[Workspace Cluster]
Ingress --> GitLabWorkspacesProxy[GitLab Workspaces Proxy]
GitLabWorkspacesProxy --Proxy--> Workspace1[Workspace 1]
GitLabWorkspacesProxy --Proxy--> Workspace2[Workspace 2]
GitLabWorkspacesProxy --Proxy--> Workspace3[Workspace 3]
end
GitLabWorkspacesProxy --OAuth 2--> GitLab
GitLab --Redirect--> GitLabWorkspacesProxy
GitLabWorkspacesProxy --Authz API--> GitLab
```
- Advantages
- Single instance of proxy, and therefore it is easy to manage and get metrics from.
- Easy to upgrade as a single instance exists - workspaces do not need to be restarted.
- Disadvantages
- Single point of failure
- It will have to scale with traffic
- New component (other than the GitLab agent) that would have to be deployed in the Kubernetes cluster by the customer
- Does need Kubernetes privileges to list service resources.
### Other options considered
#### Sidecar proxy
A sidecar will be injected into each workspace and all traffic to the workspace will flow through the sidecar. The sidecar will only handle the traffic for a single workspace. The sidecar can communicate with the workspace over the loopback interface (localhost) because the two share a network namespace.
```mermaid
flowchart TB
UserWorkspaceTraffic[User Workspace Traffic] --> Ingress
subgraph WorkspaceCluster[Workspace Cluster]
Ingress --> Workspace1Proxy
Ingress --> Workspace2Proxy
Ingress --> Workspace3Proxy
subgraph workspace1[Workspace 1]
Workspace1Proxy[Workspace Sidecar Proxy] --Proxy--> Workspace1[Workspace 1]
end
subgraph workspace2[Workspace 2]
Workspace2Proxy[Workspace Sidecar Proxy] --Proxy--> Workspace2[Workspace 2]
end
subgraph workspace3[Workspace 3]
Workspace3Proxy[Workspace Sidecar Proxy] --Proxy--> Workspace3[Workspace 3]
end
end
Workspace3Proxy --OAuth 2--> GitLab
GitLab --Redirect--> Workspace3Proxy
Workspace3Proxy --Authz API--> GitLab
```
- Advantages
- Does not need to handle a large volume of traffic
- If a sidecar stops working it does not hamper the working of other workspaces
- Disadvantages
- Inefficient usage of resources as a sidecar will have to be deployed for every workspace
- Workspace might take slightly longer to come up because of the additional proxy element
#### Auth annotations on the Ingress resource
Use auth annotations on the Ingress resource to allow Ingress controllers(for example, `ingress-nginx`) to delegate authentication and authorization to a separate process. The challenge is that these annotations are not standardized (that is, not part of the [Ingress specification](https://kubernetes.io/docs/concepts/services-networking/ingress/)) and may not be supported across different Ingress controllers. We would need to document the process to set up our Auth provider for each of the Ingress controllers. However, if they do become a part of the new [Gateway API](https://gateway-api.sigs.k8s.io/concepts/security-model/), we will reconsider this decision.
For `ingress-nginx`, the auth annotations would be:
```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
annotations:
nginx.ingress.kubernetes.io/auth-url: "https://$host/oauth2/auth"
nginx.ingress.kubernetes.io/auth-signin: "https://$host/oauth2/start?rd=$escaped_request_uri"
name: example-ingress
namespace: example-namespace
spec:
ingressClassName: nginx
rules:
- host: "*.workspaces.example.dev"
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: example-workspace
port:
number: 80
```
For `traefik`, the auth annotations would be:
```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: example-ingress
namespace: example-namespace
annotations:
kubernetes.io/ingress.class: traefik
ingress.kubernetes.io/auth-type: forward
ingress.kubernetes.io/auth-url: http://traefik-forward-auth:4181
ingress.kubernetes.io/auth-response-headers: X-Forwarded-User
spec:
ingressClassName: traefik
rules:
- host: "*.workspaces.example.dev"
http:
paths:
- backend:
name: example-workspace
servicePort: http
```
For `Google Cloud Load Balancer using IAP`, the auth annotations would be:
```yaml
apiVersion: cloud.google.com/v1
kind: BackendConfig
metadata:
name: example-backend-config
namespace: example-namespace
spec:
iap:
enabled: true
oauthclientCredentials:
secretName: example-secret
```
## Accessing the Web IDE from the workspace
Currently, we only support GitLab fork of VS Code as the editor that is injected inside a workspace during runtime.
The [editor injector](https://gitlab.com/gitlab-org/gitlab-web-ide-vscode-fork/-/tree/main/scripts/gl/editor-injector) is a container image that contains GitLab fork of VS Code server.
The editor injector contains scripts for copying this server into a workspace and starting the server.
The editor injector packages both the WebUI and the Extension Host (VS Code backend). [Currently](https://gitlab.com/gitlab-org/gitlab/-/issues/393006), we also package the WebUI in the workspace. That means that the GitLab fork of VS Code editor can be used two ways:
- Access the Workspace URL **directly** and use the bundled WebUI
- Access the Workspace through **WebIDE** (ignore the bundled WebUI)
```plantuml
@startuml
title
Accessing the editor inside a workspace
end title
node "Workspace" {
node "GitLab fork of VS Code" {
[HTTP Server] .[#blue]up.> [Static assets of the WebUI]
[HTTP Server] -[#blue]-> [Extension Server (VS Code backend)]
[HTTP Server] -[#green]-> [Extension Server (VS Code backend)]
}
}
[Workspace URL in browser] -[#blue]right-> [HTTP Server]
[Workspace URL in browser] .[#blue]right.> [HTTP Server]
[WebIDE URL in browser] -[#green]left-> [HTTP Server]
note right of "Static assets of the WebUI"
We build WebIDE on top of these assets
end note
note bottom of "Workspace URL in browser"
Accessing the workspace directly
uses the packaged WebUI
end note
note bottom of "WebIDE URL in browser"
Accessing the workspace through WebIDE
ignores the web assets in the workspace
and only uses the WebSocket connection
end note
@enduml
```
## Building container images for workspaces
We rely on file group permissions to be able to modify and run any file in a container.
Thus, while creating the workspace, we use an arbitrary Linux user to run the container.
If the container image you want to use does not support arbitrary user IDs, you can build your own by using the snippet below. This snippet is provided only for reference. If there are other locations in the container that should have write access to the Linux user running the container, make sure those files and folders have the desired root Linux group permissions that we rely on.
```dockerfile
FROM IMAGE_OF_YOUR_CHOICE
RUN useradd -l -u 33333 -G sudo -md /home/gitlab-workspaces -s /bin/bash -p gitlab-workspaces gitlab-workspaces
ENV HOME=/home/gitlab-workspaces
WORKDIR $HOME
RUN mkdir -p /home/gitlab-workspaces && chgrp -R 0 /home && chmod -R g=u /etc/passwd /etc/group /home
USER gitlab-workspaces
```
You can read more about this decision in this [issue](https://gitlab.com/gitlab-org/gitlab/-/issues/396300#note_1375061754).
## Links
- [Remote Development direction](https://about.gitlab.com/direction/create/ide/remote_development/)
- [Remote Development presentation](https://docs.google.com/presentation/d/1XHH_ZilZPufQoWVWViv3evipI-BnAvRQrdvzlhBuumw/edit#slide=id.g131f2bb72e4_0_8)
- [Category Strategy epic](https://gitlab.com/groups/gitlab-org/-/epics/7419)
- [Minimal Maturity epic](https://gitlab.com/groups/gitlab-org/-/epics/9189)
- [Viable Maturity epic](https://gitlab.com/groups/gitlab-org/-/epics/9190)
- [Complete Maturity epic](https://gitlab.com/groups/gitlab-org/-/epics/9191)
- [Remote Development Engineering Sync](https://docs.google.com/document/d/1hWVvksIc7VzZjG-0iSlzBnLpyr-OjwBVCYMxsBB3h_E/edit#)
- [Market analysis and architecture](https://gitlab.com/groups/gitlab-org/-/epics/8131)
- [Developer Documentation](https://gitlab.com/gitlab-org/remote-development/gitlab-remote-development-docs/)
- [BYO infrastructure](https://gitlab.com/groups/gitlab-org/-/epics/8290)
- [Browser runtime](https://gitlab.com/groups/gitlab-org/-/epics/8291)
- [GitLab-hosted infrastructure](https://gitlab.com/groups/gitlab-org/-/epics/8292)
- [Browser runtime spike](https://gitlab.com/gitlab-org/gitlab-web-ide/-/merge_requests/58)
- [Building container images for workspaces](https://gitlab.com/gitlab-org/gitlab/-/issues/396300#note_1375061754)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,268 +1,11 @@
---
status: proposed
creation-date: "2023-04-26"
authors: [ "@proglottis" ]
coach: "@DylanGriffith"
approvers: []
owning-stage: "~devops::systems"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/repository_backups/'
remove_date: '2025-07-08'
---
<!-- Blueprints often contain forward-looking statements -->
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/repository_backups/).
# Repository Backups
<!-- For long pages, consider creating a table of contents. The `[_TOC_]`
function is not supported on docs.gitlab.com. -->
## Summary
This proposal seeks to provide an out-of-a-box repository backup solution to
GitLab that gives more opportunities to apply Gitaly specific optimisations. It
will do this by moving repository backups out of `backup.rake` into a
coordination worker that enumerates repositories and makes per-repository
decisions to trigger repository backups that are streamed directly from Gitaly
to object-storage.
The advantages of this approach are:
- The backups are only transferred once, from the Gitaly hosting the physical
repository to object-storage.
- Smarter decisions can be made by leveraging specific repository access
patterns.
- Distributes backup and restore load.
- Since the entire process is run within Gitaly existing monitoring can be
used.
- Provides architecture for future WAL archiving and other optimisations.
This should relieve the major pain points of the existing two strategies:
- `backup.rake` - Repository backups are streamed from outside of Gitaly using
RPCs and stored in a single large tar file. Due to the amount of data
transferred these backups are limited to small installations.
- Snapshots - Cloud providers allow taking physical storage snapshots. These
are not an out-of-a-box solution as they are specific to the cloud provider.
## Motivation
### Goals
- Improve time to create and restore repository backups.
- Improve monitoring of repository backups.
### Non-Goals
- Improving filesystem based snapshots.
### Filesystem based Snapshots
Snapshots rely on cloud platforms to be able to take physical snapshots of the
disks that Gitaly and Praefect use to store data. While never officially
recommended, this strategy tends to be used once creating or restoring backups
using `backup.rake` takes too long.
Gitaly and Git use lock files and fsync in order to prevent repository
corruption from concurrent processes and partial writes from a crash. This
generally means that if a file is written, then it will be valid. However,
because Git repositories are composed of many files and many write operations
may be taking place, it would be impossible to schedule a snapshot while no
file operations are ongoing. This means the consistency of a snapshot cannot be
guaranteed and restoring from a snapshot backup may require manual
intervention.
[WAL](https://gitlab.com/groups/gitlab-org/-/epics/8911) may improve crash
resistance and so improve automatic recovery from snapshots, but each
repository will likely still require a majority of voting replicas in sync.
Since each node in a Gitaly Cluster is not homogeneous, depending on
replication factor, in order to create a complete snapshot backup all nodes
would need to have snapshots taken. This means that snapshot backups have a lot
of repository data duplication.
Snapshots are heavily dependent on the cloud provider and so they would not
provide an out-of-a-box experience.
### Downtime
An ideal repository backup solution would allow both backup and restore
operations to be done online. Specifically we would not want to shutdown or
pause writes to ensure that each node/repository is consistent.
### Consistency
Consistency in repository backups means:
- That the Git repositories are valid after restore. There are no partially
applied operations.
- That all repositories in a cluster are healthy after restore, or are made
healthy automatically.
Backups without consistency may result in data-loss or require manual
intervention on restore.
Both types of consistency are difficult to achieve using snapshots as this
requires that snapshots of the filesystems on multiple hosts are taken
synchronously and without repositories on any of those hosts currently being
mutated.
### Distribute Work
We want to distribute the backup/restore work such that it isn't bottlenecked
on the machine running `backup.rake`, a single Gitaly node, or a single network
connection.
On backup, `backup.rake` aggregates all repository backups onto its local
filesystem. This means that all repository data needs to be streamed from
Gitaly (possibly via Praefect) to where the Rake task is being run. If this is
CNG then it also requires a large volume on Kubernetes. The resulting backup
tar file then gets transferred to object storage. A similar process happens on
restore, the entire tar file needs to be downloaded and extracted on the local
filesystem, even for a partial restore when restoring a subset of repositories.
Effectively all repository data gets transferred, in full, multiple times
between multiple hosts.
If each Gitaly could directly upload backups it would mean only transferring
repository data a single time, reducing the number of hosts and so the amount
of data transferred over all.
### Gitaly Controlled
Gitaly is looking to become self-contained and so should own its backups.
`backup.rake` currently determines which repositories to backup and where those
backups are stored. This restricts the kind of optimisations that Gitaly could
apply and adds development/testing complexity.
### Monitoring
`backup.rake` is run in a variety of different environments. Historically
backups from Gitaly's perspective are a series of disconnected RPC calls. This
has resulted in backups having almost zero monitoring. Ideally the process
would run within Gitaly such that the process could be monitored using existing
metrics and log scraping.
### Automatic Backups
When `backup.rake` is set up on cron it can be difficult to tell if it has been
running successfully, if it is still running, how long it took, and how much
space it has taken. It is difficult to ensure that cron always has access to
the previous backup to allow for incremental backups or to determine if
updating the backup is required at all.
Having a coordination process running continuously will allow moving from a
single-shot backup strategy to one where each repository determines its own
backup schedule based on usage patterns and priority. This way each repository
should be able to have a reasonably up-to-date backup without adding excess
load to any Gitaly node.
### Updated Repositories Only
`backup.rake` packages all repository backups into a tar file and generally has
no access to the previous backup. This makes it difficult to determine if the
repository has changed since last backup.
Having access to previous backups on object-storage would mean that Gitaly
could more easily determine if a backup needs to be taken at all. This allows
us to waste less time backing up repositories that are no longer being
modified.
### Point-in-time Restores
There should be a mechanism by which a set of repositories can be restored to a
specific point in time. The identifier (backup ID) used should be able to be
determined by an admin and apply to all repositories.
### WAL (write ahead log)
We want to be able to provide infrastructure to allow continuous archiving of
the WAL. This means providing a central place to stream the archives to and
being able to match any full backup to a place in the log such that
repositories can be restored from the full backup, and the WAL applied up to a
specific point in time.
### WORM
Any Gitaly accessible storage should be WORM (write once, read many) in order
to prevent existing backups being modified in the case an attacker gains access
to a node's object-storage credentials.
[The pointer layout](https://gitlab.com/gitlab-org/gitaly/-/blob/master/doc/gitaly-backup.md#pointer-layout)
currently used by repository backups relies on being able to overwrite the
pointer files, and as such would not be suitable for use on a WORM file store.
WORM is likely object-storage provider specific:
- [AWS object lock](https://aws.amazon.com/blogs/storage/protecting-data-with-amazon-s3-object-lock/)
- [Google Cloud WORM retention policy](https://cloud.google.com/blog/products/storage-data-transfer/protecting-cloud-storage-with-worm-key-management-and-more-updates).
- [MinIO object lock](https://min.io/docs/minio/linux/administration/object-management/object-retention.html)
### `bundle-uri`
Having direct access to backup data may open the door for clone/fetch transfer
optimisations using bundle-uri. This allows us to point Git clients directly to
a bundle file instead of transferring packs from the repository itself. The
bulk repository transfer can then be faster and is offloaded to a plain http
server, rather than the Gitaly servers.
## Proposal
The proposal is broken down into an initial MVP and per-repository coordinator.
### MVP
The goal of the MVP is to validate that moving backup processing server-side
will improve the worst case, total-loss, scenario. That is, reduce the total
time to create and restore a full backup.
The MVP will introduce backup and restore repository RPCs. There will be no
coordination worker. The RPCs will stream a backup directly from the
called Gitaly node to object storage. These RPCs will be called from
`backup.rake` via the `gitaly-backup` tool. `backup.rake` will no longer
package repository backups into the backup archive.
This work is already underway, tracked by the [Server-side Backups MVP epic](https://gitlab.com/groups/gitlab-org/-/epics/10077).
### Per-Repository Coordinator
Instead of taking a backup of all repositories at once via `backup.rake`, a
backup coordination worker will be created. This worker will periodically
enumerate all repositories to decide if a backup needs to be taken. These
decisions could be determined by usage patterns or priority of the repository.
When restoring, since each repository will have a different backup state, a
timestamp will be provided by the user. This timestamp will be used to
determine which backup to restore for each repository. Once WAL archiving is
implemented, the WAL could then be replayed up to the given timestamp.
This wider effort is tracked in the [Server-side Backups epic](https://gitlab.com/groups/gitlab-org/-/epics/10826).
## Design and implementation details
### MVP
There will be a pair of RPCs `BackupRepository` and `RestoreRepository`. These
RPCs will synchronously create/restore backups directly onto object storage.
`backup.rake` will continue to use `gitaly-backup` with a new `--server-side`
flag. Each Gitaly will need a backup configuration to specify the
object-storage service to use.
Initially the structure of the backups in object-storage will be the same as
the existing [pointer layout](https://gitlab.com/gitlab-org/gitaly/-/blob/master/doc/gitaly-backup.md#pointer-layout).
For MVP the backup ID must match an exact backup ID on object-storage.
The configuration of object-storage will be controlled by a new config
`config.backup.go_cloud_url`. The [Go Cloud Development Kit](https://gocloud.dev)
tries to use a provider specific way to configure authentication. This can be
inferred from the VM or from environment variables.
See [Supported Storage Services](https://gocloud.dev/howto/blob/#services).
## Alternative Solutions
<!--
It might be a good idea to include a list of alternative solutions or paths considered, although it is not required. Include pros and cons for
each alternative solution/path.
"Do nothing" and its pros and cons could be included in the list too.
-->
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,286 +1,11 @@
---
status: proposed
creation-date: "2023-03-07"
authors: [ "@ajwalker", "@johnwparent" ]
coach: [ "@ayufan" ]
approvers: [ "@DarrenEastman", "@engineering-manager" ]
owning-stage: "~devops::<stage>"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/runner_admission_controller/'
remove_date: '2025-07-08'
---
# GitLab Runner Admissions Controller
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/runner_admission_controller/).
The GitLab `admission controller` (inspired by the [Kubernetes admission controller concept](https://kubernetes.io/docs/reference/access-authn-authz/admission-controllers/)) is a proposed technical solution to intercept jobs before they're persisted or added to the build queue for execution.
An admission controller can be registered to the GitLab instance and receive a payload containing jobs to be created. Admission controllers can be _mutating_, _validating_, or both.
- When _mutating_, mutable job information can be modified and sent back to the GitLab instance. Jobs can be modified to conform to organizational policy, security requirements, or have, for example, their tag list modified so that they're routed to specific runners.
- When _validating_, a job can be denied execution.
## Motivation
To comply with the segregation of duties, organizational policy, or security requirements, customers in financial services, the US federal government market segment, or other highly regulated industries must ensure that only authorized users can use runners associated with particular CI job environments.
In this context, using the term environments is not equivalent to the definition of the environment used in the GitLab CI environments and deployments documentation. Using the definition from the [SLSA guide](https://slsa.dev/spec/v0.1/terminology), an environment is the "machine, container, VM, or similar in which the job runs."
An additional requirement comes from the Remote Computing Enablement (RCE) group at [Lawrence Livermore National Laboratory](https://hpc.llnl.gov/). In this example, users must have a user ID on the target Runner CI build environment for the CI job to run. To simplify administration across the entire user base, RCE needs to be able to associate a Runner with a GitLab user entity.
### Current GitLab CI job handling mechanism
Before going further, it is helpful to level-set the current job handling mechanism in GitLab CI and GitLab Runners.
- First, a runner associated with a GitLab instance continuously queries the GitLab instance API to check if there is a new job that it could run.
- With every push to a project repository on GitLab with a `.gitlab-ci.yml` file present, the CI service present on the GitLab instance catches the event and triggers a new CI job.
- The CI job enters a pending state in the queue until a Runner requests a job from the instance.
- On the request from a runner to the API for a job, the database is queried to verify that the job parameters match those of the runner. In other words, when runners poll a GitLab instance for a job to execute they're assigned a job if it matches the specified criteria.
- If the job matches the runner in question, then the GitLab instance connects the job to the runner and changes the job state to running. In other words, GitLab connects the `job` object with the `Runner` object.
- A runner can be configured to run un-tagged jobs. Tags are the primary mechanism used today to enable customers to have some control of which Runners run certain types of jobs.
- So while runners are scoped to the instance, group, or project, there are no additional access control mechanisms today that can be expanded on to deny access to a runner based on a user or group identifier.
The current CI jobs queue logic is as follows. **Note - in the code we still use the very old `build` naming construct, but we've migrated from `build` to `job` in the product and documentation.**
```ruby
jobs =
if runner.instance_type?
jobs_for_shared_runner
elsif runner.group_type?
jobs_for_group_runner
else
jobs_for_project_runner
end
# select only jobs that have tags known to the runner
jobs = jobs.matches_tag_ids(runner.tags.ids)
# select builds that have at least one tag if required
unless runner.run_untagged?
jobs = jobs.with_any_tags
end
```
## Goals
- Implement an initial solution that provides an easy-to-configure and use mechanism to `allow`, `deny` or `redirect` CI job execution on a specific runner entity based on some basic job details (like user, group or project membership).
## Non-Goals
- A re-design of the CI job queueing mechanism is not in the scope of this blueprint.
## Proposal
Implement a mechanism, `admission controllers`, to intercept CI jobs, allowing them to either mutate jobs, validate them or do both. An admission controller is a mutating webhook that can modify the CI job or reject the job according to a policy. The webhook is called before the job is inserted into the CI jobs queue.
### Guiding principles
- The webhook payload schema will be part of our public facing APIs.
- We must maintain backwards compatibility when extending the webhook payload.
- Controllers should be idempotent.
### How will the admissions controller work?
**Scenario 1**: I want to deny access to a certain runner.
1. Configure an admissions controller to only accept jobs from specific projects.
1. When a job is created the `project information` (`project_id`, `job_id`, `api_token`) will be used to query GitLab for specific details.
1. If the `project information` matches the allow list, then the job payload is not modified and the job is able to run on the target runner.
1. If the `project information` does not match the allow list, then the job payload is not modified and the job is dropped.
1. The job tags are not changed.
1. Admission controller may optionally send back an arbitrary text description of why a decline decision was made.
**Scenario 2**: Large runner fleet with using a common configuration and tags.
Each runner has a tag such as `zone_a`, `zone_b`. In this scenario the customer does not know where a specific job can run as some users have access to `zone_a`, and some to `zone_b`. The customer does not want to fail a job that should run on `zone_a`, but instead redirect a job if it is not correctly tagged to run in `zone_a`.
1. Configure an admissions controller to mutate jobs based on `user_id`.
1. When a job is created the `project information` (`project_id`, `job_id`, `api_token`) will be used to query GitLab for specific details.
1. If the `user_id` matches then the admissions controller modifies the job tag list. `zone_a` is added to the tag list as the controller has detected that the user triggering the job should have their jobs run IN `zone_a`.
**Scenario 3**: Runner pool with specific tag scheme, user only has access to a specific subset
Each runner has a tag identifier unique to that runner, e.g. `DiscoveryOne`, `tugNostromo`, `MVSeamus`, etc. Users have arbitrary access to these runners, however we don't want to fail a job on access denial, instead we want to prevent the job from being executed on runners to which the user does not have access. We also don't want to reduce the pool of runners the job can be run on.
1. Configure an admissions controller to mutate jobs based on `user_id`.
1. When a job is created the `project information` (`project_id`, `job_id`, `api_token`) will be used to query GitLab for specific details.
1. The admission controller queries available runners with the `user_id` and collects all runners for which the job cannot be run. If this is _all_ runners, the admission controller rejects the job, which is dropped. No tags are modified, and a message is included indicating the reasoning. If there are runners for which the user has permissions, the admission controller filters the associated runners for which there are no permissions.
### MVC
#### Admission controller
1. A single admission controller can be registered at the instance level only.
1. The admission controller must respond within 1 hr.
1. The admission controller will receive individual jobs. The response must contain only responses to that job.
1. The admission controller will receive an API callback for rejection and acceptance, with the acceptance callback accepting mutation parameters.
#### Job Lifecycle
1. The `preparing` job state will be expanded to include the validation process prerequisite.
```mermaid
stateDiagram-v2
created --> preparing
state preparing {
[*] --> accept
[*] --> reject
}
reject --> failed
accept --> pending
pending --> running: picked by runner
running --> executed
state executed {
[*] --> failed
[*] --> success
[*] --> canceled
}
executed --> created: retry
```
1. When the state is `preparing`, the mutating webhook payload is sent to the admission controller asynchronously. This will be retried a number of times as needed.
1. The `preparing` state will wait for a response from the webhook or until timeout.
1. The UI should be updated with the current status of the job prerequisites and admission.
1. For jobs where the webhook times out (1 hour) their status should be set as though the admission was denied with a timeout reasoning. This should
be rare in typical circumstances.
1. Jobs with denied admission can be retried. Retried jobs will be resent to the admission controller without tag mutations or runner filtering reset.
1. [`allow_failure`](../../../ci/yaml/index.md#allow_failure) should be updated to support jobs that fail on denied admissions, for example:
```yaml
job:
script:
- echo "I will fail admission"
allow_failure:
on_denied_admission: true
```
1. The UI should be updated to display the reason for any job mutations (if provided) or rejection.
1. Tag modifications applied by the Admission Controller should be persisted by the system with associated reasoning for any modifications, acceptances, or rejections.
#### Payload
1. The payload is comprised of individual job entries consisting of:
- Job ID.
- [Predefined variables](../../../ci/variables/predefined_variables.md)
- Job tag list.
1. The response payload is comprised of individual job entries consisting of:
- Job ID.
- Admission state: `accepted` or `denied`.
- Mutations: `additions` and `removals`. `additions` supplements the existing set of tags, `removals` removes tags from the current tag list
- Reason: A controller can provide a reason for admission and mutation.
- Accepted Runners: runners to be considered for job matching, can be empty to match all runners
- Rejected Runners: runners that should not be considered for job matching, can be empty to match all runners
##### Example request
```json
[
{
"id": 123,
"variables": {
# predefined variables: https://docs.gitlab.com/ee/ci/variables/predefined_variables.html
"CI_PROJECT_ID": 123,
"CI_PROJECT_NAME": "something",
"GITLAB_USER_ID": 98123,
...
},
"tags": [ "docker", "windows" ]
}
]
[
{
"id": 245,
"variables": {
"CI_PROJECT_ID": 245,
"CI_PROJECT_NAME": "foobar",
"GITLAB_USER_ID": 98123,
...
},
"tags": [ "linux", "eu-west" ]
}
]
[
{
"id": 666,
"variables": {
"CI_PROJECT_ID": 666,
"CI_PROJECT_NAME": "do-bad-things",
"GITLAB_USER_ID": 98123,
...
},
"tags": [ "secure-runner" ]
},
]
```
##### Example response
```json
[
{
"id": 123,
"admission": "accepted",
"reason": "it's always-allow-day-wednesday"
}
]
[
{
"id": 245,
"admission": "accepted",
"tags": {
"add": [ "linux", "us-west" ],
"remove": [...]
},
"runners": {
"accepted_ids": ["822993167"],
"rejected_ids": ["822993168"]
},
"reason": "user is US employee: retagged region; user only has uid on runner 822993167"
}
]
[
{
"id": 666,
"admission": "rejected",
"reason": "you have no power here"
}
]
```
### MVC +
1. Multiple admissions controllers on groups and project levels.
1. Passing job definition through a chain of the controllers (starting on the project, through all defined group controllers up to the instance controller).
1. Each level gets the definition modified by the previous controller in the chain and makes decisions based on the current state.
1. Modification reasons, if reported by multiple controllers, are concatenated.
1. Usage of the admission controller is optional, so we can have a chain containing project+instance, project+group+parent group+instance, project+group, group+instance, etc
### Implementation Details
#### GitLab
1. Expand `preparing` state to engage the validation process via the `prerequisite` interface.
1. Amend `preparing` state to indicate to user, via the UI and API, the status of job preparation with regard to the job prerequisites
1. Should indicate status of each prerequisite resource for the job separately as they are asynchronous
1. Should indicate overall prerequisite status
1. Introduce a 1 hr timeout to the entire `preparing` state
1. Add an `AdmissionValidation` prerequisite to the `preparing` status dependencies via `Gitlab::Ci::Build::Prerequisite::Factory`
1. Convert the Prerequisite factory and `preparing` status to operate asynchronously
1. Convert `PreparingBuildService` to operate asynchronously
1. `PreparingBuildService` transitions the job from preparing to failed or pending depending on success of validation.
1. AdmissionValidation performs a reasonable amount of retries when sending request
1. Add API endpoint for Webhook/Admission Controller response callback
1. Accepts Parameters:
- Acceptance/Rejection
- Reason String
- Tag mutations (if accepted, otherwise ignored)
1. Callback encodes one time auth token
1. Introduce new failure reasoning on validation rejection
1. Admission controller impacts on job should be persisted
1. Runner selection filtering per job as a function of the response from the Admission controller (mutating web hook) should be added
## Technical issues to resolve
| Issue | Resolution |
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|
| Rule definition for the queue web hook | |
| What data to send to the admissions controller? Is it a subset or all of the [predefined variables](../../../ci/variables/predefined_variables.md)? | |
| Is the `queueing web hook` able to run at GitLab.com scale? On GitLab.com we would trigger millions of webhooks per second and the concern is that would overload Sidekiq or be used to abuse the system. | |
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 37 KiB

View File

@ -1,460 +1,11 @@
---
status: ongoing
creation-date: "2022-01-19"
authors: [ "@grzesiek", "@tmaczukin", "@josephburnett" ]
coach: [ "@ayufan", "@grzesiek" ]
approvers: [ "@DarrenEastman" ]
owning-stage: "~devops::verify"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/runner_scaling/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/runner_scaling/).
# Next Runner Auto-scaling Architecture
## Summary
GitLab Runner is a core component of GitLab CI/CD. It makes it possible to run
CI/CD jobs in a reliable and concurrent environment. It has been initially
introduced by Kamil Trzciński in early 2015 to replace a Ruby version of the
same service. GitLab Runner written in Go turned out to be easier to use by the
wider community, it was more efficient and reliable than the previous,
Ruby-based, version.
In February 2016 Kamil Trzciński [implemented an auto-scaling feature](https://gitlab.com/gitlab-org/gitlab-runner/-/merge_requests/53)
to leverage cloud infrastructure to run many CI/CD jobs in parallel. This
feature has become a foundation supporting CI/CD adoption on GitLab.com over
the years, where we now run around 4 million builds per day at peak.
During the initial implementation a decision was made to use Docker Machine:
> Is easy to use. Is well documented. Is well supported and constantly
> extended. It supports almost any cloud provider or virtualization
> infrastructure. We need minimal amount of changes to support Docker Machine:
> machine enumeration and inspection. We don't need to implement any "cloud
> specific" features.
This design choice was crucial for the GitLab Runner success. Since that time
the auto-scaling feature has been used by many users and customers and enabled
rapid growth of CI/CD adoption on GitLab.com.
We can not, however, continue using Docker Machine. Work on that project
[was paused in July 2018](https://github.com/docker/machine/issues/4537) and there
was no development made since that time (except for some highly important
security fixes). In 2018, after Docker Machine entered the "maintenance mode",
we decided to create [our own fork](https://gitlab.com/gitlab-org/ci-cd/docker-machine)
to be able to keep using this and ship fixes and updates needed for our use case.
[On September 26th, 2021 the project got archived](https://github.com/docker/docker.github.io/commit/2dc8b49dcbe85686cc7230e17aff8e9944cb47a5)
and the documentation for it has been removed from the official page. This
means that the original reason to use Docker Machine is no longer valid too.
To keep supporting our customers and the wider community and to improve our SaaS runners
maintenance we need to design a new mechanism for GitLab Runner auto-scaling. It not only
needs to support auto-scaling, but it also needs to do that in the way to enable us to
build on top of it to improve efficiency, reliability and availability.
We call this new mechanism the "next GitLab Runner Scaling architecture".
## Continuing building on Docker Machine
At this moment one of our core products - GitLab Runner - and one of its most
important features - ability to auto-scale job execution environments - depends
on an external product that is abandoned.
Docker Machine project itself is also hard to maintain. Its design starts to
show its age, which makes it hard to bring new features and fixes. A huge
codebase that it brings with a lack of internal knowledge about it makes it
hard for our maintainers to support and properly handle incoming feature
requests and community contributions.
Docker Machine and its 20+ integrated drivers for cloud and virtualization
providers also create another set of problems, like:
- Each cloud/virtualization environment brings features that come and go
and we would need to maintain support for them (add new features, fix
bugs).
- We basically need to become experts for each of the virtualization/cloud
providers to properly support integration with their APIs.
- Every single provider that Docker Machine integrates with has its
bugs, security releases, vulnerabilities - to maintain the project properly
we would need to be on top of all of that and handle updates whenever
they are needed.
Another problem is the fact that Docker Machine, from its beginnings, was
focused on managing Linux based instances only. Despite that at some moment
Docker got official and native integration on Windows, Docker Machine never
followed this step. Nor is it designed to make such integration easy.
There is also no support for MacOS. This one is obvious - Docker Machine is a
tool to maintain hosts for Docker Engine and there is no native Docker Engine
for MacOS. And by native we mean MacOS containers executed within MacOS
operating system. The Docker for MacOS product is not native support - it's just
tooling and a virtualized Linux instance installed with it that make it
easier to develop **Linux containers** on MacOS development machines.
This means that only one of our three officially supported platforms -
Linux, Windows and MacOS - has fully-featured support for CI/CD
auto-scaling. For Windows there is a possibility to use Kubernetes (which in
some cases have limitations) and maybe with a lot of effort we could bring
support for Windows into Docker Machine. But for MacOS, there is no
auto-scaling solution provided natively by GitLab Runner.
This is a huge limitation for our users and a frequently requested feature.
It's also a limitation for our SaaS runners offering. We've managed to
create some sort of auto-scaling for our SaaS Windows and SaaS MacOS runners
by hacking around the Custom executor. But experience from the past three years shows
that it's not the best way of doing this. And yet, after this time, Windows
and MacOS runner autoscaling lacks much of the performance and feature support
that we have with our SaaS Linux runners.
To keep supporting our customers and the wider community and to improve our
SaaS runners maintenance we need to design a new mechanism for GitLab Runner
auto-scaling. It not only needs to support auto-scaling, but it also needs to
do that in the way to enable us to build on top of it to improve efficiency,
reliability and availability.
## Proposal
Currently, GitLab Runner auto-scaling can be configured in a few ways. Some
customers are successfully using an auto-scaled environment in Kubernetes. We
know that a custom and unofficial GitLab Runner version has been built to make
auto-scaling on Kubernetes more reliable. We recognize the importance of having
a really good Kubernetes solution for running multiple jobs in parallel, but
refinements in this area are out of scope for this architectural initiative.
We want to focus on resolving problems with Docker Machine and replacing this
mechanism with a reliable and flexible mechanism. We might be unable to build a
drop-in replacement for Docker Machine, as there are presumably many reasons
why it has been deprecated. It is very difficult to maintain compatibility with
so many cloud providers, and it seems that Docker Machine has been deprecated
in favor of Docker Desktop, which is not a viable replacement for us.
[This issue](https://github.com/docker/roadmap/issues/245) contains a discussion
about how people are using Docker Machine right now, and it seems that GitLab
CI is one of the most frequent reasons for people to keep using Docker Machine.
There is also an opportunity in being able to optionally run multiple jobs in a
single, larger virtual machine. We can't do that today, but we know that this
can potentially significantly improve efficiency. We might want to build a new
architecture that makes it easier and allows us to test how efficient it is
with PoCs. Running multiple jobs on a single machine can also make it possible
to reuse what we call a "sticky context" - a space for build artifacts / user
data that can be shared between job runs.
### 💡 Design a simple abstraction that users will be able to build on top of
Because there is no viable replacement and we might be unable to support all
cloud providers that Docker Machine used to support, the key design requirement
is to make it really simple and easy for the wider community to write a custom
GitLab plugin for whatever cloud provider they might be using. We
want to design a simple abstraction that users will be able to build on top of,
as will we, to support existing workflows on GitLab.com.
The designed mechanism should abstract what Docker Machine executor has been doing:
providing a way to create an external Docker environment, waiting to execute
jobs by provisioning this environment and returning credentials required to
perform these operations.
The new plugin system should be available for all major platforms: Linux,
Windows, MacOS.
### 💡 Migrate existing Docker Machine solution to a plugin
Once we design and implement the new abstraction, we should be able to migrate
existing Docker Machine mechanisms to a plugin. This will make it possible for
users and customers to immediately start using the new architecture, but still
keep their existing workflows and configuration for Docker Machine. This will
give everyone time to migrate to the new architecture before we drop support
for the legacy auto-scaling entirely.
### 💡 Build plugins for AWS, Google Cloud Platform and Azure
Although we might be unable to add support for all the cloud providers that
Docker Machine used to support, it seems to be important to provide
GitLab-maintained plugins for the major cloud providers like AWS, Google Cloud
Platform and Azure.
We should build them, presumably in separate repositories, in a way that they
are easy to contribute to, fork, modify for certain needs the wider community
team members might have. It should be also easy to install a new plugin without
the need of rebuilding GitLab Runner whenever it happens.
### 💡 Write a solid documentation about how to build your own plugin
It is important to show users how to build a plugin, so that they
can implement support for their own cloud infrastructure.
Building new plugins should be simple and supported with great
documentation. We want to design the plugin system in a way that the entry barrier
for contributing new plugins is very low.
### 💡 Build a PoC to run multiple builds on a single machine
We want to better understand what kind of efficiency can running multiple jobs
on a single machine bring. It is difficult to predict that, so ideally we
should build a PoC that will help us to better understand what we can expect
from this.
To run this experiment, we will most likely need to build an experimental
plugin, that not only allows us to schedule running multiple builds on a single
machine, but also has a set of comprehensive metrics built into it, to make it
easier to understand how it performs.
## Details
How the abstraction will look exactly is something that
we will need to prototype, PoC and decide in a data-informed way. There are a
few proposals that we should describe in detail, develop requirements for, PoC
and score. We will choose the solution that seems to support our goals the
most.
In order to describe the proposals we first need to better explain what part of
the GitLab Runner needs to be abstracted away. To make this easier to grasp
these concepts, let's take a look at the current auto-scaling architecture and
sequence diagram.
![GitLab Runner Autoscaling Overview](gitlab-autoscaling-overview.png)
On the diagrams above we see that currently a runner manager runs on a
machine that has access to a cloud provider's API. It is using Docker Machine
to provision new Virtual Machines with Docker Engine installed and it
configures the Docker daemon there to allow external authenticated requests. It
stores credentials to such ephemeral Docker environments on disk. Once a
machine has been provisioned and made available for the runner manager to
run builds, it is using one of the existing executors to run a user-provided
script. In auto-scaling, this is typically done using the Docker executor.
### Separation of concerns
There are several concerns represented in the current architecture. They are
coupled in the current implementation so we will break them out here to consider
them each separately.
- **Virtual Machine (VM) shape**. The underlying provider of a VM requires configuration to
know what kind of machine to create. For example, Cores, memory, failure domain,
etc... This information is very provider specific.
- **VM lifecycle management**. Multiple machines will be created and a
system must keep track of which machines belong to this executor. Typically
a cloud provider will have a way to manage a set of homogeneous machines.
For example, GCE Instance Group. The basic operations are increase, decrease and
usually delete a specific machine.
- **VM autoscaling**. In addition to low-level lifecycle management,
job-aware capacity decisions must be made to the set of machines to provide
capacity when it is needed but not maintain excess capacity for cost reasons.
- **Job to VM mapping (routing)**. Currently the system assigns only one job to a
given machine. A machine may be reused based on the specific executor
configuration.
- **In-VM job execution**. Within each VM a job must be driven through
various pre-defined stages and results and trace information returned
to the Runner system. These details are highly dependent on the VM
architecture and operating system as well as Executor type.
See also Glossary below.
#### Current state
The current architecture has several points of coupling between concerns.
Coupling reduces opportunities for abstraction (for example, community supported
plugins) and increases complexity, making the code harder to understand,
test, maintain and extend.
A primary design decision will be which concerns to externalize to the plugin
and which should remain with the runner system. The current implementation
has several abstractions internally which could be used as cut points for a
new abstraction.
For example the [`Build`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/common/build.go#L125)
type uses the [`GetExecutorProvider`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/common/executor.go#L171)
function to get an executor provider based on a dispatching executor string.
Various executor types register with the system by being imported and calling
[`RegisterExecutorProvider`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/common/executor.go#L154)
during initialization. Here the abstractions are the [`ExecutorProvider`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/common/executor.go#L80)
and [`Executor`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/common/executor.go#L59)
interfaces.
Within the `docker+autoscaling` executor the [`machineExecutor`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/executors/docker/machine/machine.go#L19)
type has a [`Machine`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/helpers/docker/machine.go#L7)
interface which it uses to acquire a VM during the common [`Prepare`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/executors/docker/machine/machine.go#L71)
phase. This abstraction primarily creates, accesses and deletes VMs.
There is no current abstraction for the VM autoscaling logic. It is tightly
coupled with the VM lifecycle and job routing logic. Creating idle capacity
happens as a side-effect of calling [`Acquire`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/executors/docker/machine/provider.go#L449) on the `machineProvider` while binding a job to a VM.
There is also no current abstraction for in-VM job execution. VM-specific
commands are generated by the runner manager using the [`GenerateShellScript`](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/common/build.go#L336)
function and [injected](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/267f40d871cd260dd063f7fbd36a921fedc62241/common/build.go#L373)
into the VM as the manager drives the job execution stages.
### Design principles
Our goal is to design a GitLab Runner plugin system interface that is flexible
and simple for the wider community to consume. As we cannot build plugins for
all cloud platforms, we want to ensure a low entry barrier for anyone who needs
to develop a plugin. We want to allow everyone to contribute.
To achieve this goal, we will follow a few critical design principles. These
principles will guide our development process for the new plugin system
abstraction.
#### General high-level principles
- Design the new auto-scaling architecture aiming for having more choices and
flexibility in the future, instead of imposing new constraints.
- Design the new auto-scaling architecture to experiment with running multiple
jobs in parallel, on a single machine.
- Design the new provisioning architecture to replace Docker Machine in a way
that the wider community can easily build on top of the new abstractions.
- New auto-scaling method should become a core component of GitLab Runner product so that
we can simplify maintenance, use the same tooling, test configuration and Go language
setup as we do in our other main products.
- It should support multiple job execution environments - not only Docker containers
on Linux operating system.
The best design would be to bring auto-scaling as a feature wrapped around
our current executors like Docker or Shell.
#### Principles for the new plugin system
- Make the entry barrier for writing a new plugin low.
- Developing a new plugin should be simple and require only basic knowledge of
a programming language and a cloud provider's API.
- Strive for a balance between the plugin system's simplicity and flexibility.
These are not mutually exclusive.
- Abstract away as many technical details as possible but do not hide them completely.
- Build an abstraction that serves our community well but allows us to ship it quickly.
- Invest in a flexible solution, avoid one-way-door decisions, foster iteration.
- When in doubt, err on the side of making things simpler for the wider community.
- Limit coupling between concerns to make the system more simple and extensible.
- Concerns should live on one side of the plug or the other--not both, which
duplicates effort and increases coupling.
#### The most important technical details
- Favor gRPC communication between a plugin and GitLab Runner.
- Make it possible to version communication interface and support many versions.
- Make Go a primary language for writing plugins but accept other languages too.
- Autoscaling mechanism should be fully owned by GitLab.
Cloud provider autoscalers don't know which VM to delete when scaling down so
they make sub-optimal decisions. Rather than teaching all autoscalers about GitLab
jobs, we prefer to have one, GitLab-owned autoscaler (not in the plugin).
It will also ensure that we can shape the future of the mechanism and make decisions
that fit our needs and requirements.
## Plugin boundary proposals
The following are proposals for where to draw the plugin boundary. We will evaluate
these proposals and others by the design principles and technical constraints
listed above.
### Custom provider
In order to reduce the scope of work, we only want to introduce the new
abstraction layer in one place.
A few years ago we introduced the [Custom Executor](https://docs.gitlab.com/runner/executors/custom.html)
feature in GitLab Runner. It allows users to design custom build execution
methods. The custom executor driver can be implemented in any way - from a
simple shell script to a dedicated binary - that is then used by a Runner
through os/exec system calls.
Thanks to the custom executor abstraction there is no more need to implement
new executors internally in Runner. Users who have specific needs can implement
their own drivers and don't need to wait for us to make their work part of the
"official" GitLab Runner. As each driver is a separate project, it also makes
it easier to create communities around them, where interested people can
collaborate together on improvements and bug fixes.
We want to design the new Custom Provider to replicate the success of the
Custom Executor. It will make it easier for users to build their own ways to
provide a context and an environment in which a build will be executed by one
of the Custom Executors.
There are multiple solutions to implementing a custom provider abstraction. We
can use raw Go plugins, HashiCorp's Go Plugin, HTTP interface or gRPC based
facade service. There are many solutions, and we want to choose the most
optimal one. In order to do that, we will describe the solutions in a separate
document, define requirements and score the solution accordingly. This will
allow us to choose a solution that will work best for us and the wider
community.
This proposal places VM lifecycle and autoscaling concerns as well as job to
VM mapping (routing) into the plugin. The build need only ask for a VM and
it will get one with all aspects of lifecycle and routing already accounted
for by the plugin.
Rationale: [Description of the Custom Executor Provider proposal](https://gitlab.com/gitlab-org/gitlab-runner/-/issues/28848#note_823321515)
### Taskscaler provider
We can introduce a simpler version of the `Machine` abstraction in the
form of a "Fleeting" interface. Fleeting provides a low-level interface to
a homogeneous VM group which allows increasing and decreasing the set size
as well as consuming a VM from within the set.
Plugins for cloud providers and other VM sources are implemented via the
HashiCorp go-plugin library. This is in practice gRPC over STDIN/STDOUT
but other wire protocols can be used also.
In order to make use of the new interface, the autoscaling logic is pulled
out of the Docker Executor and placed into a new Taskscaler library.
This places the concerns of VM lifecycle, VM shape and job routing within
the plugin. It also places the concern of VM autoscaling into a separate
component so it can be used by multiple Runner Executors (not just `docker+autoscaling`).
Rationale: [Description of the InstanceGroup / Fleeting proposal](https://gitlab.com/gitlab-org/gitlab-runner/-/issues/28848#note_823430883)
POC: [Merge request](https://gitlab.com/gitlab-org/gitlab-runner/-/merge_requests/3315)
## Glossary
- **[GitLab Runner](../../../development/documentation/styleguide/word_list.md#gitlab-runner)** - the software application that you can choose to install and manage, whose source code is hosted at `gitlab.com/gitlab-org/gitlab-runner`.
- **[runners](../../../development/documentation/styleguide/word_list.md#runner-runners)** - the runner is the agent that's responsible for running GitLab CI/CD jobs in an environment and reporting the results to a GitLab instance. It /1/ retrieves jobs from GitLab, /2/ configures a local or remote build environment, and /3/ executes jobs within the provisioned environment, passing along log data and status updates to GitLab.
- **runner manager** - the runner process is often referred to as the `Runner Manager` as it manages multiple runners, which are the `[[runners]]` workers defined in the runners `config.toml` file.
- **executor** - a concrete environment which can be prepared and used to run a job. A new executor is created for each job.
- **executor provider** - an implementation capable of providing executors on demand. Executor providers are registered on import and initialized once when a runner starts up.
- **custom executor** - works as an interface between GitLab Runner and a set of binaries or shell scripts with environment variable inputs that enable executing CI jobs in any host computing environment. New custom executors can be added to the system without making any changes to the GitLab Runner codebase.
- **custom executor provider** - a new abstraction, proposed under the custom provider heading in the plugin boundary proposal section above, which allows new executor providers to be created without modifying the GitLab Runner codebase. The protocol could be similar to custom executors or done over gRPC. This abstraction places all the mechanics of producing executors within the plugin, delegating autoscaling and lifecycle management concerns to each implementation.
- **taskscaler** - a new library, proposed under the taskscaler provider heading in the plugin boundary proposal section above, which is parameterized with a concrete executor provider and a fleeting provider. Taskscaler is responsible for the autoscaling concern and can be used to autoscale any executor provider using any VM shape. Taskscaler is also responsible for the runner-specific aspect of VM lifecycle and keeps track of how many jobs are using a given VM and how many times a VM has been used.
- **fleeting** - a new library proposed along with taskscaler which provides abstractions for cloud provider VMs.
- **fleeting instance group** - the abstraction that fleeting uses to represent a pool of like VMs. This would represent a GCP IGM or an AWS ASG (without the autoscaling). Instance groups can be increased, decreased or can provide connection details for a specific VM.
- **fleeting plugin** - a concrete implementation of a fleeting instance group representing a specific IGM or ASG (when initialized). There will be N of these, one for each provider, each in its own project. We will own and maintain the core ones but some will be community supported. A new fleeting plugin can be created without making any changes to the runner, taskscaler or fleeting code bases. This makes it analogous to the custom executor provider in terms of self-service and decoupling, but along a different line of concerns.
- **fleeting plugin Google Compute** - the fleeting plugin which creates GCP instances. This lives in a separate project from the fleeting and taskscaler.
- **fleeting plugin AWS** - the fleeting plugin which creates AWS instances. This lives in a separate project from the fleeting and taskscaler.
## Status
Status: RFC.
## Who
Proposal:
<!-- vale gitlab.Spelling = NO -->
| Role | Who |
|------------------------------|-------------------------------------------------|
| Authors | Grzegorz Bizon, Tomasz Maczukin, Joseph Burnett |
| Architecture Evolution Coach | Kamil Trzciński |
| Engineering Leader | Elliot Rushton, Cheryl Li |
| Product Manager | Darren Eastman, Jackie Porter |
| Domain Expert / Runner | Arran Walker |
DRIs:
| Role | Who |
|-------------|-----------------|
| Leadership | Elliot Rushton |
| Product | Darren Eastman |
| Engineering | Tomasz Maczukin |
Domain experts:
| Area | Who |
|------------------------|--------------|
| Domain Expert / Runner | Arran Walker |
<!-- vale gitlab.Spelling = YES -->
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,486 +1,11 @@
---
status: ongoing
creation-date: "2022-10-27"
authors: [ "@pedropombeiro", "@tmaczukin" ]
coach: "@ayufan"
approvers: [ "@erushton" ]
owning-stage: "~devops::verify"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/runner_tokens/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/runner_tokens/).
# Next GitLab Runner Token Architecture
## Summary
GitLab Runner is a core component of GitLab CI/CD that runs
CI/CD jobs in a reliable and concurrent environment. Ever since the beginnings
of the service as a Ruby program, runners are registered in a GitLab instance with
a registration token - a randomly generated string of text. The registration token is unique for its given scope
(instance, group, or project). The registration token proves that the party that registers the runner has
administrative access to the instance, group, or project to which the runner is registered.
This approach has worked well in the initial years, but some major known issues started to
become apparent as the target audience grew:
| Problem | Symptoms |
|---------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Single token per scope | - The registration token is shared by multiple runners: <br/>- Single tokens lower the value of auditing and make traceability almost impossible; <br/>- Copied in many places for [self-registration of runners](https://docs.gitlab.com/runner/install/kubernetes.html#required-configuration); <br/>- Reports of users storing tokens in unsecured locations; <br/>- Makes rotation of tokens costly. <br/>- In the case of a security event affecting the whole instance, rotating tokens requires users to update a table of projects/namespaces, which takes a significant amount of time. |
| No provision for automatic expiration | Requires manual intervention to change token. Addressed in [#30942](https://gitlab.com/gitlab-org/gitlab/-/issues/30942). |
| No permissions model | Used to register a runner for protected branches, and for any tags. In this case, the registration token has permission to do everything. Effectively, someone taking possession of a registration token could steal secrets or source code. |
| No traceability | Given that the token is not created by a user, and is accessible to all administrators, there is no possibility to know the source of a leaked token. |
| No historical records | When reset, the previous value of the registration token is not stored so there is no historical data to enable deeper auditing and inspection. |
| Token stored in project/namespace model | Inadvertent disclosure of token is possible. |
| Too many registered runners | It is too straightforward to register a new runner using a well-known registration token. |
In light of these issues, it is important that we redesign the way in which we connect runners to the GitLab instance so that we can guarantee traceability, security, and performance.
We call this new mechanism the "next GitLab Runner Token architecture".
## Proposal
The proposal addresses the issues of a _single token per scope_ and _token storage_
by eliminating the need for a registration token. Runner creation happens
in the GitLab Runners settings page for the given scope, in the context of the authenticated user,
which provides traceability. The page provides instructions to configure the newly-created
runner in supported environments using the existing `gitlab-runner register` command.
The remaining concerns become non-issues due to the elimination of the registration token.
### Comparison of current and new runner registration flow
```mermaid
graph TD
subgraph new[<b>New registration flow</b>]
A[<b>GitLab</b>: User creates a runner in GitLab UI and adds the runner configuration] -->|<b>GitLab</b>: creates ci_runners record and returns<br/>new 'glrt-' prefixed authentication token| B
B(<b>Runner</b>: User runs 'gitlab-runner register' command with</br>authentication token to register new runner manager with<br/>the GitLab instance) --> C{<b>Runner</b>: Does a .runner_system_id file exist in<br/>the gitlab-runner configuration directory?}
C -->|Yes| D[<b>Runner</b>: Reads existing system ID] --> F
C -->|No| E[<b>Runner</b>: Generates and persists unique system ID] --> F
F[<b>Runner</b>: Issues 'POST /runner/verify' request<br/>to verify authentication token validity] --> G{<b>GitLab</b>: Is the authentication token valid?}
G -->|Yes| H[<b>GitLab</b>: Creates ci_runner_machine database record if missing] --> J[<b>Runner</b>: Store authentication token in .config.toml]
G -->|No| I(<b>GitLab</b>: Returns '403 Forbidden' error) --> K(gitlab-runner register command fails)
J --> Z(Runner and runner manager are ready for use)
end
subgraph current[<b>Current registration flow</b>]
A'[<b>GitLab</b>: User retrieves runner registration token in GitLab UI] --> B'
B'[<b>Runner</b>: User runs 'gitlab-runner register' command<br/>with registration token to register new runner] -->|<b>Runner</b>: Issues 'POST /runner request' to create<br/>new runner and obtain authentication token| C'{<b>GitLab</b>: Is the registration token valid?}
C' -->|Yes| D'[<b>GitLab</b>: Create ci_runners database record] --> F'
C' -->|No| E'(<b>GitLab</b>: Return '403 Forbidden' error) --> K'(gitlab-runner register command fails)
F'[<b>Runner</b>: Store authentication token<br/>from response in .config.toml] --> Z'(Runner is ready for use)
end
style new fill:#f2ffe6
```
### Using the authentication token in place of the registration token
<!-- vale gitlab.Spelling = NO -->
In this proposal, runners created in the GitLab UI are assigned
[authentication tokens](../../../security/token_overview.md#runner-authentication-tokens)
prefixed with `glrt-` (**G**it**L**ab **R**unner **T**oken).
<!-- vale gitlab.Spelling = YES -->
The prefix allows the existing `register` command to use the authentication token _in lieu_
of the current registration token (`--registration-token`), requiring minimal adjustments in
existing workflows.
The authentication token is shown to the user only once - after completing the creation flow - to
discourage unintended reuse.
Given that the runner is pre-created through the GitLab UI, the `register` command fails if
provided with arguments that are exposed in the runner creation form.
Some examples are `--tag-list`, `--run-untagged`, `--locked`, or `--access-level` as these are
sensitive parameters that should be decided at creation time by an administrator/owner.
The runner configuration is generated through the existing `register` command, which can behave in
two different ways depending on whether it is supplied a registration token or an authentication
token in the `--registration-token` argument:
| Token type | Behavior |
| ---------- | -------- |
| [Registration token](../../../security/token_overview.md#runner-authentication-tokens) | Leverages the `POST /api/v4/runners` REST endpoint to create a new runner, creating a new entry in `config.toml` and a `system_id` value in a sidecar file if missing (`.runner_system_id`). |
| [Runner authentication token](../../../security/token_overview.md#runner-authentication-tokens) | Leverages the `POST /api/v4/runners/verify` REST endpoint to ensure the validity of the authentication token. Creates an entry in `config.toml` file and a `system_id` value in a sidecar file if missing (`.runner_system_id`). |
### Transition period
During a transition period, legacy tokens ("registration tokens") continue to be shown on the
GitLab Runners settings page and to be accepted by the `gitlab-runner register` command.
The legacy workflow is nevertheless discouraged in the UI.
Users are steered towards the new flow consisting of creating the runner in the UI and using the
resulting authentication token with the `gitlab-runner register` command as they do today.
This approach reduces disruption to users responsible for deploying runners.
### Reusing the runner authentication token across many machines
In the existing autoscaling model, a new runner is created whenever a new job needs to be executed.
This has led to many situations where runners are left behind and become stale.
In the proposed model, a `ci_runners` table entry describes a configuration that the user can reuse
across multiple machines, and runner state from each machine (for example, IP address, platform,
or architecture) is moved to a separate table (`ci_runner_machines`).
A unique system identifier is [generated automatically](#generating-a-system_id-value) whenever the
runner application starts up or the configuration is saved.
This allows differentiating the machine in which the runner is being used.
The `system_id` value complements the short runner token that is used to identify a runner in
command line output, CI job logs, and GitLab UI.
Given that the creation of runners involves user interaction, it should be possible
to eventually lower the per-plan limit of CI runners that can be registered per scope.
#### Generating a `system_id` value
We ensure that a unique system identifier is assigned at all times to a `gitlab-runner`
installation.
The ID is derived from an existing machine identifier such as `/etc/machine-id` (on Linux) and
hashed for privacy, in which case it is prefixed with `s_`.
If an ID is not available, a random string is used instead, in which case it is prefixed with `r_`.
This unique ID identifies the `gitlab-runner` process and is sent
on `POST /api/v4/jobs` requests for all runners in the `config.toml` file.
The ID is generated and saved both at `gitlab-runner` startup and whenever the configuration is
saved to disk.
Instead of saving the ID at the root of `config.toml` though, we save it to a new file that lives
next to it - `.runner_system_id`. The goal for this new file is to make it less likely that IDs
get reused due to manual copying of the `config.toml` file.
```plain
s_cpwhDr7zFz4xBJujFeEM
```
### Runner identification in CI jobs
For users to identify the machine where the job was executed, the unique identifier needs to be
visible in CI job contexts.
As a first iteration, GitLab Runner will include the unique system identifier in the build logs,
wherever it publishes the short token SHA.
Given that the runner can potentially be reused with different unique system identifiers,
we should store the unique system ID in the database.
This ensures the unique system ID maps to a GitLab Runner's `system_id` value with the runner token.
A new `ci_runner_machines` table holds information about each unique runner manager,
with information regarding when the runner last connected, and what type of runner it was.
In the long term, the relevant fields are to be moved from the `ci_runners` into
`ci_runner_machines`.
Until the removal milestone though, they should be kept in the `ci_runners` as a fallback when a
matching `ci_runner_machines` record does not exist.
An expected scenario is the case when the table is created but the runner hasn't pinged the GitLab
instance (for example if the runner is offline).
In addition, we should add the following columns to `ci_runners`:
- a `creator_id` column to keep track of who created a runner;
- a `registration_type` enum column to `ci_runners` to signal whether a runner has been created
using the legacy `register` method, or the new UI-based method.
Possible values are `:registration_token` and `:authenticated_user`.
This allows the stale runner cleanup service to determine which runners to clean up, and allows
future uses that may not be apparent.
```sql
CREATE TABLE ci_runners (
...
creator_id bigint
registration_type int8
)
```
A new `p_ci_runner_machine_builds` table joins the `ci_runner_machines` and `ci_builds` tables, to avoid
adding more pressure to those tables.
We might consider a more efficient way to store `contacted_at` than updating the existing record.
```sql
CREATE TABLE p_ci_runner_machine_builds (
partition_id bigint DEFAULT 100 NOT NULL,
build_id bigint NOT NULL,
runner_machine_id bigint NOT NULL
)
PARTITION BY LIST (partition_id);
CREATE TABLE ci_runner_machines (
id bigint NOT NULL,
system_xid character varying UNIQUE NOT NULL,
contacted_at timestamp without time zone,
version character varying,
revision character varying,
platform character varying,
architecture character varying,
ip_address character varying,
executor_type smallint,
config jsonb DEFAULT '{}'::jsonb NOT NULL
);
```
## Advantages
- Easier for users to wrap their minds around the concept: instead of two types of tokens,
there is a single type of token - the per-runner authentication token. Having two types of tokens
frequently results in misunderstandings when discussing issues;
- Runners can always be traced back to the user who created them, using the audit log;
- The claims of a CI runner are known at creation time, and cannot be changed from the runner
(for example, changing the `access_level`/`protected` flag). Authenticated users
may however still edit these settings through the GitLab UI;
- Easier cleanup of stale runners, which doesn't touch the `ci_runner` table.
## Details
In the proposed approach, we create a distinct way to configure runners that is usable
alongside the current registration token method during a transition period. The idea is
to avoid having the Runner make API calls that allow it to leverage a single "god-like"
token to register new runners.
The new workflow looks as follows:
1. The user opens the Runners settings page (instance, group, or project level);
1. The user fills in the details regarding the new desired runner, namely description,
tags, protected, locked, etc.;
1. The user clicks `Create`. That results in the following:
1. Creates a new runner in the `ci_runners` table (and corresponding `glrt-` prefixed authentication token);
1. Presents the user with instructions on how to configure this new runner on a machine,
with possibilities for different supported deployment scenarios (for example, shell, `docker-compose`, Helm chart, etc.)
This information contains a token which is available to the user only once, and the UI
makes it clear to the user that the value shall not be shown again, as registering the same runner multiple times
is discouraged (though not impossible).
1. The user copies and pastes the instructions for the intended deployment scenario (a `register` command), leading to the following actions:
1. Upon executing the new `gitlab-runner register` command in the instructions, `gitlab-runner` performs
a call to the `POST /api/v4/runners/verify` with the given runner token;
1. If the `POST /api/v4/runners/verify` GitLab endpoint validates the token, the `config.toml`
file is populated with the configuration;
1. Whenever a runner pings for a job, the respective `ci_runner_machines` record is
["upserted"](https://en.wiktionary.org/wiki/upsert) with the latest information about the
runner (with Redis cache in front of it like we do for Runner heartbeats).
As part of the transition period, we provide admins and top-level group owners with an
instance/group-level setting (`allow_runner_registration_token`) to disable the legacy registration
token functionality and enforce using only the new workflow.
Any attempt by a `gitlab-runner register` command to hit the `POST /api/v4/runners` endpoint
to register a new runner with a registration token results in a `HTTP 410 Gone` status code.
The instance setting is inherited by the groups. This means that if the legacy registration method
is disabled at the instance level, the descendant groups/projects must also prevent the legacy
registration method.
The registration token workflow is to be deprecated (with a deprecation notice printed by the `gitlab-runner register` command)
and removed at a future major release after the concept is proven stable and customers have migrated to the new workflow.
### Handling of legacy runners
Legacy versions of GitLab Runner do not send the unique system identifier in its requests, and we
will not change logic in Workhorse to handle unique system IDs. This can be improved upon in the
future after the legacy registration system is removed, and runners have been upgraded to newer
versions.
Job pings from such legacy runners result in a `ci_runner_machines` record containing a
`<legacy>` `system_xid` field value.
Not using the unique system ID means that all connected runners with the same token are
notified, instead of just the runner matching the exact system identifier. While not ideal, this is
not an issue per-se.
### `ci_runner_machines` record lifetime
New records are created in 2 situations:
- When the runner calls the `POST /api/v4/runners/verify` endpoint as part of the
`gitlab-runner register` command, if the specified runner token is prefixed with `glrt-`.
This allows the frontend to determine whether the user has successfully completed the registration and take an
appropriate action;
- When GitLab is pinged for new jobs and a record matching the `token`+`system_id` does not already exist.
Due to the time-decaying nature of the `ci_runner_machines` records, they are automatically
cleaned after 7 days after the last contact from the respective runner.
### Required adaptations
#### Migration to `ci_runner_machines` table
When details from `ci_runner_machines` are needed, we need to fall back to the existing fields in
`ci_runner` if a match is not found in `ci_runner_machines`.
#### REST API
API endpoints receiving runner tokens should be changed to also take an optional
`system_id` parameter, sent alongside with the runner token (most often as a JSON parameter on the
request body).
#### GraphQL `CiRunner` type
The [`CiRunner` type](../../../api/graphql/reference/index.md#cirunner) closely reflects the
`ci_runners` model. This means that machine information such as `ipAddress`, `architectureName`,
and `executorName` among others are no longer singular values in the proposed approach.
We can live with that fact for the time being and start returning lists of unique values, separated
by commas.
The respective `CiRunner` fields must return the values for the `ci_runner_machines` entries
(falling back to `ci_runner` record if non-existent).
#### Stale runner cleanup
The functionality to
[clean up stale runners](../../../ci/runners/runners_scope.md#clean-up-stale-group-runners) needs
to be adapted to clean up `ci_runner_machines` records instead of `ci_runners` records.
At some point after the removal of the registration token support, we'll want to create a background
migration to clean up stale runners that have been created with a registration token (leveraging the
enum column created in the `ci_runners` table).
### Runner creation through API
Automated runner creation is possible through a new GraphQL mutation and the existing
[`POST /user/runners` REST API endpoint](../../../api/users.md#create-a-runner-linked-to-a-user).
These endpoints are only available to users that are
[allowed](../../../user/permissions.md#gitlab-cicd-permissions) to create runners at the specified
scope.
## Implementation plan
### Stage 1 - Deprecations
| Component | Milestone | Changes |
|----------------------------------|----------:|---------|
| GitLab Rails app | `15.6` | Deprecate `POST /api/v4/runners` endpoint for `17.0`. This hinges on a [proposal](https://gitlab.com/gitlab-org/gitlab/-/issues/373774) to allow deprecating REST API endpoints for security reasons. |
| GitLab Runner | `15.6` | Add deprecation notice for `register` command for `17.0`. |
| GitLab Runner Helm Chart | `15.6` | Add deprecation notice for `runnerRegistrationToken` command for `17.0`. |
| GitLab Runner Operator | `15.6` | Add deprecation notice for `runner-registration-token` command for `17.0`. |
| GitLab Runner / GitLab Rails app | `15.7` | Add deprecation notice for registration token reset for `17.0`. |
### Stage 2 - Prepare `gitlab-runner` for `system_id`
| Component | Milestone | Changes |
|---------------|----------:|---------|
| GitLab Runner | `15.7` | Ensure a sidecar TOML file exists with a `system_id` value.<br/>Log new system ID values with `INFO` level as they get assigned. |
| GitLab Runner | `15.9` | Log unique system ID in the build logs. |
| GitLab Runner | `15.9` | Label Prometheus metrics with unique system ID. |
| GitLab Runner | `15.8` | Prepare `register` command to fail if runner server-side configuration options are passed together with a new `glrt-` token. |
### Stage 2a - Prepare GitLab Runner Helm Chart and GitLab Runner Operator
| Component | Milestone | Changes |
|--------------------------|----------:|---------|
| GitLab Runner Helm Chart | `%15.10` | Update the Runner Helm Chart to support registration with the authentication token. |
| GitLab Runner Operator | `%15.10` | Update the Runner Operator to support registration with the authentication token. |
| GitLab Runner Helm Chart | `%16.2` | Add `systemID` to Runner Helm Chart. |
### Stage 3 - Database changes
<!-- markdownlint-disable MD056 -->
| Component | Milestone | Changes |
|------------------|----------:|---------|
| GitLab Rails app | `%15.8` | Create database migration to add columns to `ci_runners` table. |
| GitLab Rails app | `%15.8` | Create database migration to add `ci_runner_machines` table. |
| GitLab Rails app | `%15.9` | Create database migration to add `ci_runner_machines.id` foreign key to `ci_builds_metadata` table. |
| GitLab Rails app | `%15.8` | Create database migrations to add `allow_runner_registration_token` setting to `application_settings` and `namespace_settings` tables (default: `true`). |
| GitLab Rails app | `%15.8` | Create database migration to add `config` column to `ci_runner_machines` table. |
| GitLab Runner | `%15.9` | Start sending `system_id` value in `POST /jobs/request` request and other follow-up requests that require identifying the unique system. |
| GitLab Rails app | `%15.9` | Create service similar to `StaleGroupRunnersPruneCronWorker` service to clean up `ci_runner_machines` records instead of `ci_runners` records.<br/>Existing service continues to exist but focuses only on legacy runners. |
| GitLab Rails app | `%15.9` | Implement the `create_runner_machine` [feature flag](../../../administration/feature_flags.md). |
| GitLab Rails app | `%15.9` | Create `ci_runner_machines` record in `POST /runners/verify` request if the runner token is prefixed with `glrt-`. |
| GitLab Rails app | `%15.9` | Use runner token + `system_id` JSON parameters in `POST /jobs/request` request in the [heartbeat request](https://gitlab.com/gitlab-org/gitlab/blob/c73c96a8ffd515295842d72a3635a8ae873d688c/lib/api/ci/helpers/runner.rb#L14-20) to update the `ci_runner_machines` cache/table. |
| GitLab Rails app | `%15.9` | Implement the `create_runner_workflow_for_admin` [feature flag](../../../administration/feature_flags.md). |
| GitLab Rails app | `%15.9` | Implement `create_{instance|group|project}_runner` permissions. |
| GitLab Rails app | `%15.9` | Rename `ci_runner_machines.machine_xid` column to `system_xid` to be consistent with `system_id` passed in APIs. |
| GitLab Rails app | `%15.10` | Remove the ignore rule for `ci_runner_machines.machine_xid` column. |
| GitLab Rails app | `%15.10` | Replace `ci_builds_metadata.runner_machine_id` with a new join table. |
| GitLab Rails app | `%15.11` | Drop `ci_builds_metadata.runner_machine_id` column. |
| GitLab Rails app | `%16.0` | Remove the ignore rule for `ci_builds_metadata.runner_machine_id` column. |
<!-- markdownlint-enable MD056 -->
### Stage 4 - Create runners from the UI
| Component | Milestone | Changes |
|------------------|----------:|---------|
| GitLab Rails app | `%15.9` | [Add prefix to newly generated runner authentication tokens](https://gitlab.com/gitlab-org/gitlab/-/issues/383198). |
| GitLab Rails app | `%15.9` | Add a new runner field with a token that is used in registration. |
| GitLab Rails app | `%15.9` | Implement new GraphQL user-authenticated API to create a new runner. |
| GitLab Rails app | `%15.10` | Return token and runner ID information from `/runners/verify` REST endpoint. |
| GitLab Runner | `%15.10` | [Modify register command to allow new flow with glrt- prefixed authentication tokens](https://gitlab.com/gitlab-org/gitlab-runner/-/issues/29613). |
| GitLab Runner | `%15.10` | Make the `gitlab-runner register` command happen in a single operation. |
| GitLab Rails app | `%15.10` | Define feature flag and policies for "New Runner creation workflow" for groups and projects. |
| GitLab Rails app | `%15.10` | Only update runner `contacted_at` and `status` when polled for jobs. |
| GitLab Rails app | `%15.10` | Add GraphQL type to represent runner managers under `CiRunner`. |
| GitLab Rails app | `%15.11` | Implement UI to create new instance runner. |
| GitLab Rails app | `%15.11` | Update service and mutation to accept groups and projects. |
| GitLab Rails app | `%15.11` | Implement UI to create new group/project runners. |
| GitLab Rails app | `%15.11` | Add `runner_machine` field to CiJob GraphQL type. |
| GitLab Rails app | `%15.11` | UI changes to runner details view (listing of platform, architecture, IP address, etc.) (?) |
| GitLab Rails app | `%15.11` | Adapt `POST /api/v4/runners` REST endpoint to accept a request from an authorized user with a scope instead of a registration token. |
| GitLab Runner | `%15.11` | Handle `glrt-` runner tokens in `unregister` command. |
| GitLab Runner | `%15.11` | Runner asks for registration token when a `glrt-` runner token is passed in `--token`. |
| GitLab Rails app | `%15.11` | Move from 'runner machine' terminology to 'runner manager'. |
### Stage 5 - Optional disabling of registration token
<!-- markdownlint-disable MD056 -->
| Component | Milestone | Changes |
|------------------|----------:|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GitLab Rails app | `%16.0` | Adapt `register_{group|project}_runner` permissions to take [application setting](https://gitlab.com/gitlab-org/gitlab/-/issues/386712) in consideration. |
| GitLab Rails app | `%16.1` | Make the [`POST /api/v4/runners`](../../../api/runners.md#create-a-runner) endpoint return `HTTP 410 Gone` permanently if the instance-level or group-level `allow_runner_registration_token` setting disables registration tokens. The Runners API v5 should return `HTTP 404 Not Found`. |
| GitLab Rails app | `%16.1` | Add runner group metadata to the runner list. |
| GitLab Rails app | `%16.11` | Add UI to allow disabling use of registration tokens in top-level group settings. |
| GitLab Rails app | `%16.11` | Add UI to allow disabling use of registration tokens in admin panel. |
| GitLab Rails app | `%16.11` | Hide legacy UI showing registration with a registration token, if it is disabled in top-level group settings or by admins. |
<!-- markdownlint-enable MD056 -->
### Stage 6 - Enforcement
| Component | Milestone | Changes |
|------------------|----------:|---------|
| GitLab Rails app | `%17.0` | Disable registration tokens for all groups by running database migration (only on GitLab.com) |
| GitLab Rails app | `%17.0` | Disable registration tokens on the instance level by running database migration (except GitLab.com) |
| GitLab Rails app | `%16.3` | Implement new `:create_runner` PPGAT scope so that we don't require a full `api` scope. |
| GitLab Rails app | | Document gotchas when [automatically rotating runner tokens](../../../ci/runners/configure_runners.md#automatically-rotate-runner-authentication-tokens) with multiple machines. |
### Stage 7 - Removals
| Component | Milestone | Changes |
|------------------|----------:|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GitLab Rails app | `18.0` | Remove UI enabling registration tokens on the group and instance levels. |
| GitLab Rails app | `18.0` | Remove legacy UI showing registration with a registration token. |
| GitLab Runner | `18.0` | Remove runner model arguments from `register` command (for example `--run-untagged`, `--tag-list`, etc.) |
| GitLab Rails app | `18.0` | Create database migrations to drop `allow_runner_registration_token` setting columns from `application_settings` and `namespace_settings` tables. |
| GitLab Rails app | `18.0` | Create database migrations to drop:<br/>- `runners_registration_token`/`runners_registration_token_encrypted` columns from `application_settings`;<br/>- `runners_token`/`runners_token_encrypted` from `namespaces` table;<br/>- `runners_token`/`runners_token_encrypted` from `projects` table. |
| GitLab Rails app | `18.0` | Remove `GITLAB_SHARED_RUNNERS_REGISTRATION_TOKEN`. |
## FAQ
Follow [the user documentation](../../../ci/runners/new_creation_workflow.md).
## Status
Status: RFC.
## Who
Proposal:
<!-- vale gitlab.Spelling = NO -->
| Role | Who |
|------------------------------|--------------------------------------------------|
| Authors | Kamil Trzciński, Tomasz Maczukin, Pedro Pombeiro |
| Architecture Evolution Coach | Kamil Trzciński |
| Engineering Leader | Nicole Williams, Cheryl Li |
| Product Manager | Darren Eastman, Jackie Porter |
| Domain Expert / Runner | Tomasz Maczukin |
DRIs:
| Role | Who |
|------------------------------|---------------------------------|
| Leadership | Nicole Williams |
| Product | Darren Eastman |
| Engineering | Tomasz Maczukin, Pedro Pombeiro |
Domain experts:
| Area | Who |
|------------------------------|-----------------|
| Domain Expert / Runner | Tomasz Maczukin |
<!-- vale gitlab.Spelling = YES -->
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,32 +1,11 @@
---
owning-stage: "~devops::secure"
description: "GitLab Secret Detection ADR 001: Use Ruby Push Check approach within monolith"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/001_use_ruby_push_check_approach_within_monolith/'
remove_date: '2025-07-08'
---
# GitLab Secret Detection ADR 001: Use Ruby Push Check approach within monolith
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/001_use_ruby_push_check_approach_within_monolith/).
## Context
There are a number of concerns around the performance of secret detection using a regex-based approach at scale. The primary considerations include transfer latency between nodes and both CPU and memory bloat. These concerns manifested in two ways: the language to be used for performing regex matching and the deployment architecture.
The original discussion in [the exploration issue](https://gitlab.com/gitlab-org/gitlab/-/issues/428499) covers many of these concerns and background.
### Implementation language
The two primary languages considered were Ruby and Go.
The choice to use other languages (such as C++) for implementation was discarded in favour of Ruby and Go due to team familiarity, speed of deployment, and portability. See [this benchmarking issue](https://gitlab.com/gitlab-org/gitlab/-/issues/423832) for performance comparisons between the two.
### Deployment architecture
Several options were considered for deployments: directly embedding the logic within the Rails monolith's Push Check execution path, placement as a sidecar within a Rails node deployment, placement as a sidecar within a Gitaly node as a [server-side hook](../../../../administration/server_hooks.md), and deployment as a standalone service.
## Decision
For the initial iteration around blocking push events using a prereceive integration, the decision was made to proceed with Ruby-based approach, leveraging `re2` for performant regex processing. Additionally, the decision was made to integrate the logic directly into the monolith rather than as a discrete service or server-side hook within Gitaly.
A Gitaly server-side hook would have performance benefits around minimal transfer latency for Git blobs between scanning service and Gitaly blob storage. However, an extra request would be needed between Gitaly and the Rails application to contextualize the scan. Additionally, the current hook architecture is [discouraged and work is planned to migrate towards a new plugin architecture in the near future](https://gitlab.com/gitlab-org/gitaly/-/issues/5642).
The Ruby Push Check approach follows a clear execution plan to achieve delivery by anticipated timeline and is more closely aligned with the long-term direction of platform-wide scanning. For example, future scanning of issuables will require execution within the trust boundary of the Rails application rather than Gitaly context. This approach, however, has raised concerns around elevated memory usage within the Rails application leading to availability concerns. This direction may also require migrating towards Gitaly's new plugin architecture in the future once the timeline is known.
A standalone service may be considered in the future but requires considerations of a technical approach that should be better informed by data gathered during [pre-production profiling](https://gitlab.com/gitlab-org/gitlab/-/issues/428499).
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,40 +1,11 @@
---
owning-stage: "~devops::secure"
description: "GitLab Secret Detection ADR 002: Store the Secret Detection Gem in the same repository"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/002_store_the_secret_detection_gem_in_the_same_repository/'
remove_date: '2025-07-08'
---
# GitLab Secret Detection ADR 002: Store the Secret Detection Gem in the same repository
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/002_store_the_secret_detection_gem_in_the_same_repository/).
## Context
During [Phase 1](../index.md#phase-1---ruby-pushcheck-pre-receive-integration), we opted for using the [Ruby-based push check approach](../decisions/001_use_ruby_push_check_approach_within_monolith.md) to block secrets from being committed to a repository, and as such the scanning of secrets was performed by a library (or a Ruby gem) developed internally within GitLab for this specific purpose.
Part of the process to create this library and make it available for use within the Rails monolith, we had to make a decision on the best way to distribute the library.
## Approach
We evaluated two possible approaches:
1. Store the library [in the same repository](../../../../development/gems.md#in-the-same-repo) as the monolith.
1. Store the library [in an external repository](../../../../development/gems.md#in-the-external-repo).
Each approach came with some advantages and disadvantages, mostly around distribution, consistency, maintainability, and the overhead of having to set up review and release workflows and similar processes. See below for more information.
### Within the same repository as the monolith
Having the gem developed and stored in the same repository meant having it packaged within GitLab monolith itself, and with that ensuring it does not have to be installed as a dependency. This would also reduce maintainability overhead in terms of defining workflows and processes from scratch. On the other hand, the library would have less visibility as it is not exposed or published to the wider community.
### In an external repository
Storing the library in an external repository meant having more visibility, especially as the gem would be published on RubyGems.org, which would have garnered more interest and possibly contributions from the community into the feature. Additionally, the gem would be available to be used in other projects and applications. However, in doing so, the maintainability overhead would have increased significantly for various reasons such as:
- Changes would need to be coordinated between multiple repositories when a new version is released.
- Review and release workflows, and similar processes would need to be defined separately.
## Decision
The decision was made to store the library in the same repository during the first phase to ensure easier distribution since it's packaged within GitLab and will be available immediately without having to install external dependencies.
With that said, we still followed [the process](../../../../development/gems.md#reserve-a-gem-name) to reserve the gem on [RubyGems.org](https://rubygems.org/gems/gitlab-secret_detection) to avoid name-squatters from taking over the name and providing malicious code to 3rd-parties.
We have no plans to publish the gem externally at least until [Phase 2](../index.md#phase-2---standalone-secret-detection-service) as we begin to consider building a standalone service to perform secret detection.
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,40 +1,11 @@
---
owning-stage: "~devops::secure"
description: "GitLab Secret Detection ADR 003: Run scan within subprocess"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/003_run_scan_within_subprocess/'
remove_date: '2025-07-08'
---
# GitLab Secret Detection ADR 003: Run scan within subprocesses
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/003_run_scan_within_subprocess/).
## Context
During the [spike](https://gitlab.com/gitlab-org/gitlab/-/issues/422574#note_1582015771) conducted for evaluating regex for Pre-receive Secret Detection, Ruby using RE2 library came out on the top of the list. Although Ruby has an acceptable regex performance, its language limitations have certain pitfalls like more memory consumption and lack of parallelism despite the language supporting multi-threading and Ractors (3.1+) as they are suitable for running I/O-bound operations in parallel but not CPU-bound operations.
One of the concerns running the Pre-receive Secret Detection feature in the critical path is memory consumption, especially by the regex operations involved in the scan. In a scan with 300+ regex-based rule patterns running on every line of the commit blobs, the memory could go up to ~2-3x the size of the commit blobs[1](https://gitlab.com/gitlab-org/gitlab/-/issues/422574#note_1582015771). The occupied memory is not released despite scan operation being complete, until the Garbage Collector triggers. Eventually, the servers might choke on the memory.
The [original discussion issue](https://gitlab.com/gitlab-org/gitlab/-/issues/430160) covers many of these concerns and more background.
### Approach
We can tackle the memory consumption problem to an extent by running the scan within a separate process forked from the main process. Once the scan is complete, we kill the spawned process such that the occupied memory releases to the OS immediately instead of waiting for Ruby to trigger GC.
## Technical Solution
There are several scenarios to consider while managing a process's lifecycle. Failing to do so would lead to an orphan process having no control over it, defeating the whole purpose of conserving memory. We offload this burden to a Ruby library called [`Parallel`](https://github.com/grosser/parallel) that provides the ability to run operations via subprocesses. Its simple interface for communication between parent and child processes, handling exit signals, and easy capping of the number of processes makes it a suitable solution for achieving our needs. It additionally supports parallelism (spawning and running multiple subprocesses simultaneously) that solves another problem not covered in this document.
### Scope of the operation within Subprocess
It is crucial to determine which operation runs within the subprocess because spawning a new process comes with an additional latency overhead from the OS (copying file descriptors, etc). For example, running the scan on each blob inside a new subprocess is `~2.5x` slower than when the scan runs on the main process. On the contrary, dedicating one subprocess for each commit request isn't feasible either as the scan on all the blobs runs within a single process and we wouldn't be able to release memory quickly until all the scans are complete, taking us back to square one.
*Bucket Approach*: A compromise between the two extremes would be when we group all the blobs whose cumulative size is at least a fixed chunk size ([`2MiB` in our case](https://gitlab.com/gitlab-org/gitlab/-/blob/5dfcf7431bfff25519c05a7e66c0cbb8d7b362be/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb#L32)) and then run each group within a separate sub-process as illustrated below.
![Bucketed Subprocesses](../img/003_subprocess.jpg "Bucketed Subprocess by Fixed Chunk size")
### Addendum
- Running operations within a subprocess isn't a silver bullet to the above mentioned problems. We could say it *delays* our servers from getting choked by releasing the memory *faster* than the usual process via GC. Even this approach can fail when the burst of requests is too huge to handle^.
- There's always a latency overhead on the process creation of the lifecycle. For the smaller commits^, the latency of the scan operation *might* be slower than when run on the main process.
- The parallelism factor, or the number of processes forked per request, is currently capped at [`5` processes](https://gitlab.com/gitlab-org/gitlab/-/blob/5dfcf7431bfff25519c05a7e66c0cbb8d7b362be/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb#L29), beyond which pending requests wait in the queue to avoid over-forking processes which would also lead to resource exhaustion.
_^Threshold numbers will be added here soon for reference._
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,127 +1,11 @@
---
owning-stage: "~devops::secure"
description: "GitLab Secret Detection ADR 004: Secret Detection Scanner Service"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/004_secret_detection_scanner_service/'
remove_date: '2025-07-08'
---
# GitLab Secret Detection ADR 004: Secret Detection Scanner Service
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/004_secret_detection_scanner_service/).
## Context
In the [phase 2](../index.md#phase-2---standalone-secret-detection-service) of Secret Push Protection, the goal is to have a
dedicated service responsible for running Secret Detection scans on the given input blobs. This is done primarily from
the scalability standpoint. Regex operations in the Secret Detection scan [consume](https://gitlab.com/gitlab-org/gitlab/-/issues/422574#note_1582015771)
high resources so running scans within Rails or Gitaly instances would impact the resource availability for running
other operations. Running scans in isolation provides greater control over resource allocation and scaling the service
independently as needed.
## Proposed Solution
We will build a standalone Secret Detection service responsible for running the Secret Detection scans.
The main change in the workflow of Secret Push Protection would be the delegation of scanning responsibility from the
[Secret Detection gem](https://gitlab.com/gitlab-org/gitlab/-/tree/master/gems/gitlab-secret_detection) to the RPC
service for GitLab SaaS i.e., the [secrets push check](https://gitlab.com/gitlab-org/gitlab/-/blob/master/ee/lib/gitlab/checks/secrets_check.rb) invokes the RPC
service with an array of blobs to scan for secrets. Note that the project eligibility checks are still performed at the
[Rails side](https://gitlab.com/gitlab-org/gitlab/-/blob/1a6db446abce0aa02f41d060511d7e085e3c7571/ee/lib/gitlab/checks/secrets_check.rb#L49-51).
### High-Level Architecture
The service architecture involves extracting the secret detection logic into a standalone service
which communicates directly with both the Rails application and Gitaly. This provides a means to scale
the secret detection nodes independently, and reduce resource usage overhead on the Rails application.
Scans still run synchronously as a (potentially) blocking pre-receive transaction. The blob size remains limited to 1MB.
Note that the node count is purely illustrative, but serves to emphasize the independent scaling requirements for the
scanning service.
```plantuml
@startuml Phase2
skinparam linetype ortho
card "**External Load Balancer**" as elb #6a9be7
card "**Internal Load Balancer**" as ilb #9370DB
together {
collections "**GitLab Rails** x3" as gitlab #32CD32
collections "**Sidekiq** x3" as sidekiq #ff8dd1
}
together {
collections "**Consul** x3" as consul #e76a9b
}
card "SecretScanningService Cluster" as prsd_cluster {
collections "**SecretScanningService** x5" as prsd #FF8C00
}
card "Gitaly Cluster" as gitaly_cluster {
collections "**Gitaly** x3" as gitaly #FF8C00
}
card "Database" as database {
collections "**PGBouncer** x3" as pgbouncer #4EA7FF
}
elb -[#6a9be7]-> gitlab
gitlab -[#32CD32,norank]--> ilb
gitlab .[#32CD32]----> database
gitlab -[hidden]-> consul
sidekiq -[#ff8dd1,norank]--> ilb
sidekiq .[#ff8dd1]----> database
sidekiq -[hidden]-> consul
ilb -[#9370DB]--> prsd_cluster
ilb -[#9370DB]--> gitaly_cluster
ilb -[#9370DB]--> database
ilb -[hidden]u-> consul
consul .[#e76a9b]u-> gitlab
consul .[#e76a9b]u-> sidekiq
consul .[#e76a9b]-> database
consul .[#e76a9b]-> gitaly_cluster
consul .[#e76a9b]-> prsd_cluster
@enduml
```
#### Service Level Indicators (SLIs)
We will adopt the same SLIs followed for [GitLab Applications](../../../../development/application_slis/index.md) i.e.,
**Apdex score**, **Error Ratio** and two additional metrics specific to the service - **Request Latency** and
**Memory Saturation rate**.
#### Service Level Objectives (SLOs)
_We will define threshold limits after obtaining benchmark scores from the RPC service._
### Service Implementation
We will build an RPC service primarily responsible for detecting secrets in the given input blobs with RPC as the
communication interface. This service will initially be invoked by Rails monolith when performing change access checks
for Git Push event, and eventually extended for the other use cases too.
To reuse the same business logic of Scanning, in addition to offering the feature as an RPC service, the same project
will also include the provision for distributing the feature into a Ruby Gem.
#### Language/Tools/Framework
- Ruby `3.2+`
- gRPC framework for serving RPC requests
- [Protobuf Service Definition](https://gitlab.com/gitlab-org/security-products/secret-detection/secret-detection-service/-/raw/main/rpc/secret_detection.proto) file
### Addendum
- The RPC service should also expose [Health Check](https://github.com/grpc/grpc/blob/master/doc/health-checking.md) RPC endpoint for ensuring the availability of the service.
- Unlike in Gem-based approach, we wouldn't be able to use [scan within the subprocess](003_run_scan_within_subprocess.md) approach in the RPC server
since they [removed support](https://github.com/grpc/grpc/blob/master/doc/fork_support.md) for forking subprocess. However, we can explore optimizations like batch requests
concurrently from the RPC client side.
### Reference links
- [Standalone Service as a concept](../../gitlab_ml_experiments/index.md)
- [Runway: Service Deployment & Docs](https://gitlab.com/gitlab-com/gl-infra/platform/runway)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,104 +1,11 @@
---
owning-stage: "~devops::secure"
description: "GitLab Secret Detection ADR 005: Use Runway for service deployment"
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/005_use_runway_for_deployment/'
remove_date: '2025-07-08'
---
# GitLab Secret Detection ADR 005: Use Runway for service deployment
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/decisions/005_use_runway_for_deployment/).
## Context
The [Secret Detection Service](004_secret_detection_scanner_service.md) requires a strategy for running automated
deployments via GitLab CI environment.
## Proposed Solution: Runway
We could use [Runway](https://gitlab.com/gitlab-com/gl-infra/platform/runway#runway) - a GitLab internal Platform as a
Service, which aims to enable teams to deploy and run their services quickly and safely.
### Platform Tooling Support
- **Logging**: Logging on GitLab-managed Elasticsearch/Kibana stack [isn't available](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/84#top)
in Runway and [there doesn't seem to be plans](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/84#note_1691419608) to support it anytime soon. At the moment, the workaround is to view
logs on [Google Cloud Run UI](https://cloud.google.com/run/docs/logging).
- **Observability**: Runway supports observability for service by integrating with the monitoring stack. The
[default metrics](https://docs.runway.gitlab.com/reference/observability/#dashboards)([example dashboard](https://dashboards.gitlab.net/d/runway-service/runway3a-runway-service-metrics?orgId=1)) provided by Runway covers all the necessary system metrics for
monitoring.
- **Pager alerts on failures**: Runway generates [alerts](https://docs.runway.gitlab.com/reference/observability/#alerts) for the following anomalies by default, which we believe
are sufficient to get started with:
- `Apdex SLO violation`
- `Error SLO violation`
- `Traffic absent SLO violation`
- **Service Level Indicators (SLIs)**: The [default metrics](https://docs.runway.gitlab.com/reference/observability/#dashboards)([example dashboard](https://dashboards.gitlab.net/d/runway-service/runway3a-runway-service-metrics?orgId=1)) provided by Runway covers
necessary [SLI requirements](004_secret_detection_scanner_service.md#service-level-indicatorsslis).
- **Insights**: We might need additional metrics on rule patterns like their latency, usage count, source, etc. We may
use custom metrics, which we will evaluate further soon.
### Known Limitations (relevant to Secret Detection Service)
- ~~No support for GRPC protocol~~ Update: [GRPC is now supported](https://gitlab.com/gitlab-com/gl-infra/platform/runway/runwayctl/-/merge_requests/421#note_1934369305)
- No support for GitLab Self-Managed environments ([Reference](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/236))
### Working with Limitations
The limitation of Runway's missing support for Self-Managed (SM) environments made us evaluate other solutions for SM
environments. The [Cloud Connector](../../cloud_connector/index.md)'s API-based approach would generally address the missing deployment solution
for SM environments. However, the Secret Push Protection feature involves frequently transferring large amounts of data between
Gitaly and Service in real-time so REST-based APIs aren't the right fit as they'd add significant network overhead
unlike streaming data on an RPC request. We could optimize the Cloud Connector approach with some additional complexity but it will be a matter of time
until Runway introduces a [deployment solution](https://gitlab.com/gitlab-com/gl-infra/platform/runway/team/-/issues/236)
for SM environments. One more [alternative solution](https://gitlab.com/gitlab-org/gitlab/-/issues/462359#note_1913306661) for SM environments was to share the Docker image artifact
along with deployment instructions with the customers (similar to [custom models approach](../../custom_models/index.md#ai-gateway-deployment)) but the horizontal
scaling could be a concern.
We came up with a hybrid solution. To address the scale of GitLab SaaS, we will have a dedicated RPC-based Secret
Detection service deployed using [Runway](https://gitlab.com/gitlab-com/gl-infra/platform/runway). This service will isolate the SD resource usage without impacting the
resources of other services (Rails and Gitaly) and can scale independently as needed. Whereas for Self-Managed instances,
we will continue using the current gem-based approach since that approach [performed adequately](https://gitlab.com/gitlab-org/gitlab/-/issues/431076#note_1755614298 "Enable/gather metrics - latency, memory, cpu, etc.")
for up to GET [50K Reference architecture](https://gitlab.com/gitlab-org/quality/performance/-/wikis/Benchmarks/Latest/50k). We will eventually migrate Self-Managed environments to Runway when it
introduces the deployment support.
**TL;DR:** We will use RPC service (deployed using Runway) for GitLab SaaS, and continue using the current Ruby gem
approach for GitLab Self-Managed instances.
To reuse the core implementation of Secret Scanning, we will have a single source code with two different distributions:
1. Wrap a Ruby gem around the secret detection logic and use it in the Rails(replacing the current gem).
1. Wrap an RPC service around the secret detection logic, deploy it using [Runway](https://gitlab.com/gitlab-com/gl-infra/platform/runway), and invoke the service from Rails for GitLab SaaS
![rpc_service.png](../img/004_rpc_service.jpg){width="1001" height="311"}
Here's the workflow illustrating the proposed change:
```mermaid
sequenceDiagram
autonumber
%% Phase 2: Iter 1
Gitaly->>+Rails: invokes `internal/allowed`
Rails->>Rails: Perform project eligibility checks
alt On project eligibility check failure
Rails-->>Gitaly: Scanning Skipped
end
Rails->>Gitaly: Get blobs
Gitaly->>Rails: Quarantined Blobs
Note over Rails,SD Ruby Gem: For GitLab Self-Managed
Rails->>SD Ruby Gem: Invoke RPC and forward quarantined blobs
SD Ruby Gem->>SD Ruby Gem: Runs Secret Detection on input blobs
SD Ruby Gem->>Rails: Result
Note over Rails,SD RPC Service: For GitLab SaaS (GitLab.com & Dedicated)
Rails->>SD RPC Service: Invoke RPC and forward quarantined blobs
SD RPC Service->>SD RPC Service: Runs Secret Detection on input blobs
SD RPC Service->>Rails: Result
Rails->>Gitaly: Result
```
## Reference Links
- [Runway Docs](https://runway.gitlab.com/)
- [Epic: Runway - Platform tooling to support AI Innovation](https://gitlab.com/groups/gitlab-com/gl-infra/-/epics/969)
- [Blueprint: GitLab Service-Integration: AI and Beyond](../../../blueprints/gitlab_ml_experiments/index.md)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 61 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 516 KiB

View File

@ -1,478 +1,11 @@
---
status: ongoing
creation-date: "2022-11-25"
authors: [ "@theoretick", "@vbhat161", "@ahmed.hemdan" ]
coach: [ "@theoretick" ]
approvers: [ "@connorgilbert", "@amarpatel" ]
owning-stage: "~devops::secure"
participating-stages: [ "~devops::systems" ]
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/secret_detection/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/secret_detection/).
# Secret Detection as a platform-wide experience
## Summary
Today's secret detection feature is built around containerized scans of repositories
within a pipeline context. This feature is quite limited compared to where leaks
or compromised tokens may appear and should be expanded to include a much wider scope.
Secret detection as a platform-wide experience encompasses detection across
platform features with high risk of secret leakage, including repository contents,
job logs, and project management features such as issues, epics, and MRs.
## Motivation
### Goals
- Support platform-wide detection of tokens to avoid secret leaks
- Prevent exposure by rejecting detected secrets
- Provide scalable means of detection without harming end user experience
- Unified list of token patterns and masking
See [target types](#target-types) for scan target priorities.
### Non-Goals
Phase1 is limited to detection and alerting across platform, with rejection only
during [prereceive Git interactions and browser-based detection](#iterations).
Secret revocation and rotation is also beyond the scope of this new capability.
Scanned object types beyond the scope of this MVC are included within [target types](#target-types).
#### Management UI
Development of an independent interface for managing secrets is out of scope
for this blueprint. Any detections will be managed using the existing
Vulnerability Management UI.
Management of detected secrets will remain distinct from the
[Secret Management feature capability](../../../ci/secrets/index.md) as
"detected" secrets are categorically distinct from actively "managed" secrets.
When a detected secret is identified, it has already been compromised due to
its presence in the target object (that is, a repository). Alternatively, managed
secrets should be stored with stricter standards for secure storage, including
encryption and masking when visible (such as job logs or in the UI).
As a long-term priority we should consider unifying the management of the two
secret types; however, that work is out of scope for the current blueprint's goals,
which remain focused on active detection.
### Target types
Target object types refer to the scanning targets prioritized for detection of leaked secrets.
In order of priority this includes:
1. non-binary Git blobs under 1 megabyte
1. job logs
1. issuable creation (issues, MRs, epics)
1. issuable updates (issues, MRs, epics)
1. issuable comments (issues, MRs, epics)
Targets out of scope for the initial phases include:
- non-binary Git blobs over 1 megabyte
- binary Git blobs
- Media types (JPEG, PDF, ...)
- Snippets
- Wikis
- Container images
- External media (Youtube platform videos)
### Token types
The existing Secret Detection configuration covers 100+ rules across a variety
of platforms. To reduce total cost of execution and likelihood of false positives
the dedicated service targets only well-defined, low-FP tokens.
Token types to identify in order of importance:
1. Well-defined GitLab tokens (including Personal Access Tokens and Pipeline Trigger Tokens)
1. Verified Partner tokens (including AWS)
1. Well-defined low-FP third party tokens
1. Remainder tokens currently included in Secret Detection analyzer configuration
A well-defined token is a token with a precise definition, most often a fixed
substring prefix (or suffix) and fixed length.
For GitLab and partner tokens, we have good domain understanding of our own tokens
and by collaborating with partners verified the accuracy of their provided patterns.
An observed low-FP token relies on user reports and dismissal reports. With delivery of
[this data issue](https://gitlab.com/gitlab-data/product-analytics/-/issues/1225)
we will have aggregates on FP-rates but primarily this is user-reported data, at present.
In order to minimize false positives, there are no plans to introduce or alert on high-entropy,
arbitrary strings; i.e. patterns such as `3lsjkw3a22`.
#### Uniformity of rule configuration
Rule pattern configuration should remain centralized in the `secrets` analyzer's packaged `gitleaks.toml`
configuration, vendored to the monolith for Phase 1, and checksum-checked to ensure it matches the
specific release version to avoid drift. Each token can be filtered by `tags` to form both high-confidence
and blocking groupings. For example:
```ruby
prereceive_blocking_rules = toml.load_file('gitleaks.toml')['rules'].select do |r|
r.tags.include?('gitlab_blocking_p1') &&
r.tags.include?('gitlab_blocking')
end
```
### Auditability
A critical aspect of both secret detection and [suppression](#detection-suppression) is administrative visibility.
With each phase we must include audit capabilities (events or logging) to enable event discovery.
## Proposal
The first iteration of the experimental capability will feature a blocking
pre-receive hook implemented in the Rails application. This iteration
will be released in an experimental state to select users and provide
opportunity for the team to profile the capability before considering extraction
into a dedicated service.
In the future state, to achieve scalable secret detection for a variety of domain objects a dedicated
scanning service must be created and deployed alongside the GitLab distribution.
This is referred to as the `SecretScanningService`.
This service must be:
- highly performant
- horizontally scalable
- generic in domain object scanning capability
Platform-wide secret detection should be enabled by-default on GitLab SaaS as well
as self-managed instances.
### Decisions
- [001: Use Ruby Push Check approach within monolith](decisions/001_use_ruby_push_check_approach_within_monolith.md)
- [002: Store the Secret Detection Gem in the same repository](decisions/002_store_the_secret_detection_gem_in_the_same_repository.md)
- [003: Run scan within subprocess](decisions/003_run_scan_within_subprocess.md)
- [004: Standalone Secret Detection Service](decisions/004_secret_detection_scanner_service.md)
- [005: Use Runway for service deployment](decisions/005_use_runway_for_deployment.md)
## Challenges
- Secure authentication to GitLab.com infrastructure
- Performance of scanning against large blobs
- Performance of scanning against volume of domain objects (such as push frequency)
- Queueing of scan requests
### Transfer optimizations for large Git data blobs
As described in [Gitaly's upload-pack traffic blueprint](../gitaly_handle_upload_pack_in_http2_server/index.md#git-data-transfer-optimization-with-sidechannel), we have faced problems in the past handling large data transfers over gRPC. This could be a concern as we expand secret detection to large blob sizes to increase coverage over leaked secrets. We expect to rollout pre-receive scanning with a 1 megabyte blob size limit which should be well within boundaries. From [Protobuffers' documentation](https://protobuf.dev/programming-guides/techniques/#large-data):
> As a general rule of thumb, if you are dealing in messages larger than a megabyte each, it may be time to consider an alternate strategy.
In expansion phases we must explore chunking or alternative strategies like the optimized sidechannel approach used by Gitaly.
## Design and implementation details
The detection capability relies on a multiphase rollout, from an experimental component implemented directly in the monolith to a standalone service capable of scanning text blobs generically.
The implementation of the secret scanning service is highly dependent on the outcomes of our benchmarking
and capacity planning against both GitLab.com and
[Reference Architectures](../../../administration/reference_architectures/index.md).
As the scanning capability must be an on-by-default component of both our SaaS and self-managed
instances, [each iteration's](#iterations) deployment characteristic defines whether
the service will act as a standalone component, or executed as a subprocess of the Rails architecture
(as mirrors the implementation of our Elasticsearch indexing service).
See [technical discovery](https://gitlab.com/gitlab-org/gitlab/-/issues/376716)
for further background exploration.
See [this thread](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/105142#note_1194863310)
for past discussion around scaling approaches.
### Detection engine
Our current secret detection offering uses [Gitleaks](https://github.com/zricethezav/gitleaks/)
for all secret scanning in pipeline contexts. By using its `--no-git` configuration
we can scan arbitrary text blobs outside of a repository context and continue to
use it for non-pipeline scanning.
Changes to the detection engine are out of scope until benchmarking unveils performance concerns.
For the long-term direction of GitLab Secret Detection, the scope is greater than that of the Gitleaks tool. As such, we should consider feature encapsulation to limit the Gitleaks domain to the relevant build context only.
In the case of pre-receive detection, we rely on a combination of keyword/substring matches
for pre-filtering and `re2` for regex detections. See [spike issue](https://gitlab.com/gitlab-org/gitlab/-/issues/423832) for initial benchmarks.
Notable alternatives include high-performance regex engines such as [Hyperscan](https://github.com/intel/hyperscan) or its portable fork [Vectorscan](https://github.com/VectorCamp/vectorscan).
These systems may be worth exploring in the future if our performance characteristics show a need to grow beyond the existing stack, however the team's velocity in building an independently scalable and generic scanning engine was prioritized, see [ADR 001](decisions/001_use_ruby_push_check_approach_within_monolith.md) for more on the implementation language considerations.
### Organization-level Controls
Configuration and workflows should be oriented around [Organizations](../organization/index.md). Detection controls and governance patterns should support configuration across multiple projects and groups in a uniform way that emphasizes shared allowlists, organization-wide policies (i.e. disablement of push option bypass), and auditability.
Each phase documents the paradigm used as we iterate from Instance-level to Organization-level controls.
### Phase 1 - Ruby pushcheck pre-receive integration
The critical paths as outlined under [goals above](#goals) cover two major object
types: Git text blobs (corresponding to push events) and arbitrary text blobs. In Phase 1,
we focus entirely on Git text blobs.
The detection flow for push events relies on subscribing to the PreReceive hook
to scan commit data using the [PushCheck interface](https://gitlab.com/gitlab-org/gitlab/blob/3f1653f5706cd0e7bbd60ed7155010c0a32c681d/lib/gitlab/checks/push_check.rb). This `SecretScanningService`
service fetches the specified blob contents from Gitaly, scans
the commit contents, and rejects the push when a secret is detected.
See [Push event detection flow](#push-event-detection-flow) for sequence.
In the case of a push detection, the commit is rejected inline and an error is returned to the end user.
#### Configuration
This phase will be considered "experimental" with limited availability for customer opt-in, through instance level application settings.
#### High-Level Architecture
The Phase 1 architecture involves no additional components and is entirely encapsulated in the Rails application server. This provides a rapid deployment with tight integration within auth boundaries and no distribution coordination.
The primary drawback relies on resource utilization, adding additional CPU, memory, transfer volume, and request latency to existing application nodes.
```plantuml
@startuml Phase2
skinparam linetype ortho
card "**External Load Balancer**" as elb #6a9be7
together {
card "**GitLab Rails**" as gitlab #32CD32
card "**Gitaly**" as gitaly #FF8C00
card "**PostgreSQL**" as postgres #4EA7FF
card "**Redis**" as redis #FF6347
card "**Sidekiq**" as sidekiq #ff8dd1
}
gitlab -[#32CD32]--> gitaly
gitlab -[#32CD32]--> postgres
gitlab -[#32CD32]--> redis
gitlab -[#32CD32]--> sidekiq
elb -[#6a9be7]-> gitlab
gitlab .[#32CD32]----> postgres
sidekiq .[#ff8dd1]----> postgres
@enduml
```
#### Push Event Detection Flow
```mermaid
sequenceDiagram
autonumber
actor User
User->>+Workhorse: git push with-secret
Workhorse->>+Gitaly: tcp
Gitaly->>+Rails: PreReceive
Rails->>-Gitaly: ListAllBlobs
Gitaly->>-Rails: ListAllBlobsResponse
Rails->>+GitLabSecretDetection: Scan(blob)
GitLabSecretDetection->>-Rails: found
Rails->>User: rejected: secret found
User->>+Workhorse: git push without-secret
Workhorse->>+Gitaly: tcp
Gitaly->>+Rails: PreReceive
Rails->>-Gitaly: ListAllBlobs
Gitaly->>-Rails: ListAllBlobsResponse
Rails->>+GitLabSecretDetection: Scan(blob)
GitLabSecretDetection->>-Rails: not_found
Rails->>User: accepted
```
#### Gem Scanning Interface
For the Phase1, we use the private [Secret Detection Ruby Gem](https://gitlab.com/gitlab-org/gitlab/-/tree/5dfcf7431bfff25519c05a7e66c0cbb8d7b362be/gems/gitlab-secret_detection) that is invoked by the [Secrets Push Check](https://gitlab.com/gitlab-org/gitlab/-/blob/5dfcf7431bfff25519c05a7e66c0cbb8d7b362be/ee/lib/gitlab/checks/secrets_check.rb) on the GitLab Rails platform.
The private SD gem offers the following support in addition to running scan on multiple blobs:
- Configurable Timeout on the entire scan-level and on each blob level.
- Ability to run the scan within subprocess instead of the main process. The number of processes spawned per request is capped to [`5`](https://gitlab.com/gitlab-org/gitlab/-/blob/5dfcf7431bfff25519c05a7e66c0cbb8d7b362be/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb#L29).
The Ruleset file referred during the Pre-receive Secret Detection scan is
located [here](https://gitlab.com/gitlab-org/gitlab/-/blob/2da1c72dbc9df4d9130262c6b79ea785b6bb14ac/gems/gitlab-secret_detection/lib/gitleaks.toml).
More details about the Gem can be found in the [README](https://gitlab.com/gitlab-org/gitlab/-/blob/master/gems/gitlab-secret_detection/README.md) file. Also see [ADR 002](decisions/002_store_the_secret_detection_gem_in_the_same_repository.md) for more on how the Gem code is stored and distributed.
### Phase 2 - Standalone Secret Detection service
This phase emphasizes scaling the service outside of the monolith for general availability, isolating the feature's resource
consumption, and ease of maintainability. The critical paths as outlined under [goals above](#goals) cover
two major object types: Git text blobs (corresponding to push events) and arbitrary text blobs. In Phase 2, we continue
to focus on Git text blobs.
The responsibility of the service will be limited to running Secret Detection scan on the given set of input blobs. More
details about the service are outlined in [ADR 004: Secret Detection Scanner Service](decisions/004_secret_detection_scanner_service.md).
The introduction of a dedicated service impacts the workflow for Secret Push Protection as follows:
```mermaid
sequenceDiagram
autonumber
%% Phase 2: Iter 1
Gitaly->>+Rails: invokes `/internal/allowed` API endpoint
Rails->>Rails: Perform project eligibility checks
alt On access check failure
Rails-->>Gitaly: Scanning Skipped
end
Rails->>Gitaly: Fetch blobs
Gitaly->>Rails: Quarantined Blobs
Rails->>Secret Detection Service: Invoke scan by embedding blobs
Secret Detection Service->>Secret Detection Service: Runs Secret Detection on input blobs
Secret Detection Service->>Rails: Result
Rails->>Gitaly: Result
```
The Secret Detection service addresses the previous phase's limitations of feature scalability and shared-resource
consumption. However, the Secret Push Protection workflow still requires the Rails monolith to load a large amount of
Git blobs fetched from Gitaly into its own memory before passing it down to the Secret Detection Service.
### Phase 2.1 - Invoke Push Protection directly from Gitaly
Until the previous phase, there are multiple hops made between Gitaly and Rails for running Pre-receive checks,
particularly for Secret Push protection so a fairly large amount of Rails memory is occupied for holding Git blobs to
pass them to the Gem/Service for running secret scan. This problem can be mitigated through a direct interaction between
the Secret Detection service and Gitaly via standard interface (either [Custom pre-receive hook](../../../administration/server_hooks.md#create-global-server-hooks-for-all-repositories)
or Gitaly's new [Plugin-based architecture](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/143582)). This setup
skips the need for Rails to be a blob messenger between Gitaly and Service.
Gitaly's new [Plugin-based architecture](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/143582) is the
preferred interface for interaction between Gitaly and the RPC service, as it provides streamlined access to the Git blob
repository. However, the Gitaly team has yet to take it up for development.
_More details on Phase 2.1 will be added once there are updates on the development of Plugin architecture._
### Phase 3 - Expansion beyond Push Protection service
The detection flow for arbitrary text blobs, such as issue comments, relies on
subscribing to `Notes::PostProcessService` (or equivalent service) to enqueue
Sidekiq requests to the `SecretScanningService` to process the text blob by object type
and primary key of domain object. The `SecretScanningService` service fetches the
relevant text blob, scans the contents, and notifies the Rails application when a secret
is detected.
The detection flow for job logs requires processing the log during archive to object
storage. See discussion [in this issue](https://gitlab.com/groups/gitlab-org/-/epics/8847#note_1116647883)
around scanning during streaming and the added complexity in buffering lookbacks
for arbitrary trace chunks.
In the case of a push detection, the commit is rejected and an error is returned to the end user.
In any other case of detection, the Rails application manually creates a vulnerability
using the `Vulnerabilities::ManuallyCreateService` to surface the finding in the
existing Vulnerability Management UI.
#### Configuration
This phase will be considered "generally available" and on-by-default, with disablement configuration through organization-level settings.
#### High-Level Architecture
There is no change to the architecture defined in Phase 2, however the individual load requirements may require scaling up the node counts for the detection service.
#### Push Event Detection Flow
There is no change to the push event detection flow defined in Phase 2, however the added capability to scan
arbitrary text blobs directly from Rails allows us to emulate a pre-receive behavior for issuable creations,
as well (see [target types](#target-types) for priority object types).
```mermaid
sequenceDiagram
autonumber
actor User
User->>+Workhorse: git push with-secret
Workhorse->>+Gitaly: tcp
Gitaly->>+GitLabSecretDetection: PreReceive
GitLabSecretDetection->>-Gitaly: ListAllBlobs
Gitaly->>-GitLabSecretDetection: ListAllBlobsResponse
Gitaly->>+GitLabSecretDetection: PreReceive
GitLabSecretDetection->>GitLabSecretDetection: Scan(blob)
GitLabSecretDetection->>-Gitaly: found
Gitaly->>+Rails: PreReceive
Rails->>User: rejected: secret found
User->>+Workhorse: POST issuable with-secret
Workhorse->>+Rails: tcp
Rails->>+GitLabSecretDetection: PreReceive
GitLabSecretDetection->>GitLabSecretDetection: Scan(blob)
GitLabSecretDetection->>-Rails: found
Rails->>User: rejected: secret found
```
### Future Phases
These are key items for delivering a feature-complete always-on experience but have not yet been prioritized into phases.
### Large blob sizes (1mb+)
Current phases do not include expansions of blob sizes beyond 1mb. While the main limitation was chosen [to conform to RPC transfer limits for future iterations](#transfer-optimizations-for-large-git-data-blobs) we should expand to supporting additional blob sizes. This can be achieved in two ways:
1. *Post-receive processing*
Accept blobs in a non-blocking fashion, process scanning as background job and alert passively on detection of a given secret.
1. *Improvements to scanning logic batching*
Maintaining the constraint of 1MB is primarily futureproofing to match an expected transport protocol. This can be mitigated by using separate transport (http, reads from disk, ...) or by slicing blob sizes.
### Detection Suppression
Suppression of detection and action on leaked secrets will be supported at several levels.
1. *Global suppression* - If a secret is highly likely to be a false token (for example, `EXAMPLE`) it should be suppressed in workflow contexts where the user would be seriously inconvenienced.
We should still provide some means of triaging these results, whether via [audit events](#auditability) or as [automatic vulnerability resolution](../../../user/application_security/sast/index.md#automatic-vulnerability-resolution).
1. *Organization suppression* - If a secret matches an organization's allowlist (or was previously flagged and remediated as irrelevant) it should not reoccur. See [Organization-level controls](#organization-level-controls).
1. *Inline suppression* - Inline annotations should be supported in later phases with the Organization-level configuration to ignore annotations.
### External Token Verification
As a post-processing step for detection we should explore verification of detected secrets. This requires processors per supported token type in which we can distinguish tokens that are valid leaks from false positives. Similar to our [automatic response to leaked secrets](../../../user/application_security/secret_detection/automatic_response.md), we must externally verify a given token to give a high degree of confidence in our alerting.
There are two token types: internal and external:
- Internal tokens are verifiable and revocable as part of `ScanSecurityReportSecretsWorker` worker
- External tokens require external verification, in which [the architecture](../../../user/application_security/secret_detection/automatic_response.md#high-level-architecture) will closely match the [Secret Revocation Service](https://gitlab.com/gitlab-com/gl-security/engineering-and-research/automation-team/secret-revocation-service/)
## Iterations
- ✓ Define [requirements for detection coverage and actions](https://gitlab.com/gitlab-org/gitlab/-/issues/376716)
- ✓ Implement [Browser-based detection of GitLab tokens in comments/issues](https://gitlab.com/gitlab-org/gitlab/-/issues/368434)
- ✓ [PoC of secret scanning service](https://gitlab.com/gitlab-org/secure/pocs/secret-detection-go-poc/)
- ✓ [PoC of secret scanning gem](https://gitlab.com/gitlab-org/gitlab/-/issues/426823)
- [Pre-Production Performance Profiling for pre-receive PoCs](https://gitlab.com/gitlab-org/gitlab/-/issues/428499)
- Profiling service capabilities
- ✓ [Benchmarking regex performance between Ruby and Go approaches](https://gitlab.com/gitlab-org/gitlab/-/issues/423832)
- transfer latency, CPU, and memory footprint
- ✓ Implementation of secret scanning gem integration MVC (targeting individual commits)
- Phase1 - Deployment and monitoring
- Capacity planning for addition of service component to Reference Architectures headroom
- Security and readiness review
- Phase2 - Deployment and monitoring
- Implementation of secret scanning service (targeting arbitrary text blobs)
- Phase3 - Deployment and monitoring
- High priority domain object rollout (priority `TBD`)
- Issuable comments
- Issuable bodies
- Job logs
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

View File

@ -1,242 +1,11 @@
---
status: accepted
creation-date: "2022-09-28"
authors: [ "@ntepluhina" ]
coach: "@ayufan"
approvers: [ "@gweaver" ]
owning-stage: "~devops::plan"
participating-stages: []
redirect_to: 'https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/work_items/'
remove_date: '2025-07-08'
---
<!-- vale gitlab.FutureTense = NO -->
This document was moved to [another location](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/work_items/).
# Work Items
This document is a work-in-progress. Some aspects are not documented, though we expect to add them in the future.
## Summary
Work Items is a new architecture created to support the various types of built and planned entities throughout the product, such as issues, requirements, and incidents. It will make these types easy to extend and customize while sharing the same core functionality.
## Terminology
We use the following terms to describe components and properties of the Work items architecture.
### Work Item
Base type for issue, requirement, test case, incident and task (this list is planned to extend in the future). Different work items have the same set of base properties but their [widgets](#work-item-widgets) list is different.
### Work Item types
A set of predefined types for different categories of work items. Currently, the available types are:
- [Incident](../../../operations/incident_management/incidents.md)
- [Test case](../../../ci/test_cases/index.md)
- [Requirement](../../../user/project/requirements/index.md)
- [Task](../../../user/tasks.md)
- [OKRs](../../../user/okrs.md)
Work is underway to convert existing objects to Work Item Types or add new ones:
- [Issue](https://gitlab.com/groups/gitlab-org/-/epics/9584)
- [Epic](https://gitlab.com/groups/gitlab-org/-/epics/9290)
- [Ticket](https://gitlab.com/groups/gitlab-org/-/epics/10419)
#### Work Item properties
Every Work Item type has the following common properties:
**NOTE:**
You can also refer to fields of [Work Item](../../../api/graphql/reference/index.md#workitem) to learn more.
- `id` - a unique Work Item global identifier;
- `iid` - internal ID of the Work Item, relative to the parent workspace (currently workspace can only be a project)
- Work Item type;
- properties related to Work Item modification time: `createdAt`, `updatedAt`, `closedAt`;
- title string;
- Work Item confidentiality state;
- Work Item state (can be open or closed);
- lock version, incremented each time the work item is updated;
- permissions for the current user on the resource
- a list of [Work Item widgets](#work-item-widgets)
### Work Item widgets
All Work Item types share the same pool of predefined widgets and are customized
by which widgets are active on a specific type. The list of widgets for any
certain Work Item type is currently predefined and is not customizable. However,
in the future we plan to allow users to create new Work Item types and define a
set of widgets for them.
### Widget types (updating)
| Widget | Description | Feature flag | Write permission | GraphQL Subscription Support |
|---|---|---|---|---|
| [WorkItemWidgetAssignees](../../../api/graphql/reference/index.md#workitemwidgetassignees) | List of work item assignees | |`Guest`|Yes|
| [WorkItemWidgetAwardEmoji](../../../api/graphql/reference/index.md#workitemwidgetawardemoji) | Emoji reactions added to work item, including support for upvote/downvote counts | |Anyone who can view|No|
| [WorkItemWidgetColor](../../../api/graphql/reference/index.md#workitemwidgetcolor) | Set color of a work item. **Note:** Color is available only for epics. | |`Reporter`|No|
| [WorkItemWidgetCurrentUserTodos](../../../api/graphql/reference/index.md#workitemwidgetcurrentusertodos) | User todo state of work item | |Anyone who can view|No|
| [WorkItemWidgetDescription](../../../api/graphql/reference/index.md#workitemwidgetdescription) | Description of work item, including support for edited state, timestamp, and author | |`Reporter`|No|
| [WorkItemWidgetDesigns](../../../api/graphql/reference/index.md#workitemwidgetdesigns) | Design attachments for work items | |`Reporter`|No|
| [WorkItemWidgetDevelopment](../../../api/graphql/reference/index.md#workitemwidgetdevelopment) | Show related branches and merge requests for work items | |`Reporter`|No|
| [WorkItemWidgetHealthStatus](../../../api/graphql/reference/index.md#workitemwidgethealthstatus) | Health status assignment support for work item | |`Reporter`|No|
| [WorkItemWidgetHierarchy](../../../api/graphql/reference/index.md#workitemwidgethierarchy) | Hierarchy of work items, including support for boolean representing presence of children. | |`Guest`|No|
| [WorkItemWidgetIteration](../../../api/graphql/reference/index.md#workitemwidgetiteration) | Iteration assignment support for work item | |`Reporter`|No|
| [WorkItemWidgetLabels](../../../api/graphql/reference/index.md#workitemwidgetlabels) | List of labels added to work items, including support for checking whether scoped labels are supported | |`Reporter`|Yes|
| [WorkItemWidgetLinkedItems](../../../api/graphql/reference/index.md#workitemwidgetlinkeditems) | List of work items added as related to a given work item, with possible relationship types being `relates_to`, `blocks`, and `blocked_by`. Includes support for individual counts of blocked status, blocked by, blocking, and related to. | |`Guest`|No|
| [WorkItemWidgetMilestone](../../../api/graphql/reference/index.md#workitemwidgetmilestone) | Milestone assignment support for work item | |`Reporter`|No|
| [WorkItemWidgetNotes](../../../api/graphql/reference/index.md#workitemwidgetnotes) | List of discussions within a work item | |`Guest`|Yes|
| [WorkItemWidgetNotifications](../../../api/graphql/reference/index.md#workitemwidgetnotifications) | Notifications subscription status of a work item for current user | |Anyone who can view|No|
| [WorkItemWidgetParticipants](../../../api/graphql/reference/index.md#workitemwidgetparticipants) | Participants of a work item | |Anyone who can view|No|
| [WorkItemWidgetProgress](../../../api/graphql/reference/index.md#workitemwidgetprogress) | Progress value of a work item. **Note:** Progress is currently available only for OKRs. | `okrs_mvc` |`Reporter`|No|
| [WorkItemWidgetRequirementLegacy](../../../api/graphql/reference/index.md#workitemwidgetrequirementlegacy) | Legacy requirements | | |No|
| [WorkItemWidgetRolledupDates](../../../api/graphql/reference/index.md#workitemwidgetrolledupdates) | Set the start date and due date for epic work items, and roll up the start date and due date from child work items | |`Reporter`|No|
| [WorkItemWidgetStartAndDueDate](../../../api/graphql/reference/index.md#workitemwidgetstartandduedate) | Set start and due dates for a work item | |`Reporter`|No|
| [WorkItemWidgetStatus](../../../api/graphql/reference/index.md#workitemwidgetstatus) | Status of a work item when type is Requirement, with possible status types being `unverified`, `satisfied`, or `failed` | | |No|
| [WorkItemWidgetTestReports](../../../api/graphql/reference/index.md#workitemwidgettestreports) | Test reports associated with a work item | | | |
| [WorkItemWidgetTimeTracking](../../../api/graphql/reference/index.md#workitemwidgettimetracking) | Track total time spent on a work item | |`Reporter`|No|
| [WorkItemWidgetWeight](../../../api/graphql/reference/index.md#workitemwidgetweight) | Set weight of a work item | |`Reporter`|No|
| WorkItemWidgetLock | Lock/Unlock a work item | |`Reporter`|No|
#### Widget availability (updating)
| Widget | Epic | Issue | Task | Objective | Key Result |
|---|---|---|---|---|---|
| [WorkItemWidgetAssignees](../../../api/graphql/reference/index.md#workitemwidgetassignees) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetAwardEmoji](../../../api/graphql/reference/index.md#workitemwidgetawardemoji) | ✅ | ✔️ | ✅ | ✅ | ✅ |
| [WorkItemWidgetColor](../../../api/graphql/reference/index.md#workitemwidgetcolor) | ✅ | ❌ | ❌ | ❌ | ❌ |
| [WorkItemWidgetCurrentUserTodos](../../../api/graphql/reference/index.md#workitemwidgetcurrentusertodos) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetDescription](../../../api/graphql/reference/index.md#workitemwidgetdescription) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetDesigns](../../../api/graphql/reference/index.md#workitemwidgetdesigns) | ✔️ | ✅ | ❌ | ❌ | ❌ |
| [WorkItemWidgetDevelopment](../../../api/graphql/reference/index.md#workitemwidgetdevelopment) | ❌ | ✅ | ❌ | ❌ | ❌ |
| [WorkItemWidgetHealthStatus](../../../api/graphql/reference/index.md#workitemwidgethealthstatus) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetHierarchy](../../../api/graphql/reference/index.md#workitemwidgethierarchy) | ✅ | ✅ | ❌ | ✅ | ❌ |
| [WorkItemWidgetIteration](../../../api/graphql/reference/index.md#workitemwidgetiteration) | ❌ | ✅ | ✅ | ❌ | ❌ |
| [WorkItemWidgetLabels](../../../api/graphql/reference/index.md#workitemwidgetlabels) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetLinkedItems](../../../api/graphql/reference/index.md#workitemwidgetlinkeditems) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetMilestone](../../../api/graphql/reference/index.md#workitemwidgetmilestone) | ❌ | ✅ | ✅ | ✅ | ❌ |
| [WorkItemWidgetNotes](../../../api/graphql/reference/index.md#workitemwidgetnotes) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetNotifications](../../../api/graphql/reference/index.md#workitemwidgetnotifications) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetParticipants](../../../api/graphql/reference/index.md#workitemwidgetparticipants) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [WorkItemWidgetProgress](../../../api/graphql/reference/index.md#workitemwidgetprogress) | ❌ | ❌ | ❌ | ✅ | ✅ |
| [WorkItemWidgetRequirementLegacy](../../../api/graphql/reference/index.md#workitemwidgetrequirementlegacy) | ❌ | ❌ | ❌ | ❌ | ❌ |
| [WorkItemWidgetRolledupDates](../../../api/graphql/reference/index.md#workitemwidgetrolledupdates) | ✅ | ❌ | ❌ | ❌ | ❌ |
| [WorkItemWidgetStartAndDueDate](../../../api/graphql/reference/index.md#workitemwidgetstartandduedate) | ❌ | ✅ | ✅ | ❌ | ✅ |
| [WorkItemWidgetStatus](../../../api/graphql/reference/index.md#workitemwidgetstatus) | ❓ | ❓ | ❓ | ❓ | ❓ |
| [WorkItemWidgetTestReports](../../../api/graphql/reference/index.md#workitemwidgettestreports) | ❌ | ❌ | ❌ | ❌ | ❌ |
| [WorkItemWidgetTimeTracking](../../../api/graphql/reference/index.md#workitemwidgettimetracking) | ✅ | ✅ | ✅ | ❌ | ❌ |
| [WorkItemWidgetWeight](../../../api/graphql/reference/index.md#workitemwidgetweight) | ❌ | ✅ | ✅ | ❌ | ❌ |
##### Legend
- ✅ - Widget available
- ✔️ - Widget planned to be available
- ❌ - Widget not available
- ❓ - Widget pending for consideration
- 🔍 - Alternative widget planned
### Work item relationships
Work items can be related to other work items in a number of different ways:
- Parent: A direct ancestor to the current work item, whose completion relies on completing this work item.
- Child: A direct descendant of the current work item, which contributes to this work item's completion.
- Blocked by: A work item preventing the completion of the current work item.
- Blocks: A work item whose completion is blocked by the current work item.
- Related: A work item that is relevant to the subject of the current work item, but does not directly contribute to or block the completion of this work item.
#### Hierarchy
Parent-child relationships form the basis of **hierarchy** in work items. Each work item type has a defined set of types that can be parents or children of that type.
As types expand, and parent items have their own parent items, the hierarchy capability can grow exponentially.
Currently, following are the allowed Parent-child relationships:
| Type | Can be parent of | Can be child of |
|------------|------------------|------------------|
| Epic | Epic | Epic |
| Issue | Task | Epic |
| Task | None | Issue |
| Objective | Objective | Objective |
| Key result | None | Objective |
### Work Item view
The new frontend view that renders Work Items of any type using the global Work Item `id` as an identifier.
### Task
Task is a special Work Item type. Tasks can be added to issues as child items and can be displayed in the modal on the issue view.
### Feature flags
Since this is a large project with numerous moving parts, feature flags are being used to track promotions of available widgets. The table below shows the different feature flags that are being used, and the audience that they are available to.
| feature flag name | audience |
|---|---|
| `work_items` | defaulted to on |
| `work_items_beta` | `gitlab-org`, `gitlab-com` |
| `work_items_mvc_2` | `gitlab-org/plan-stage` |
For epic work item specific feature flags, please see the [Epic Work Item Migration Epic](https://gitlab.com/groups/gitlab-org/-/epics/11777#feature-flags).
## Motivation
The main goal of Work Items is to enhance the planning toolset to become the most popular collaboration tool for knowledge workers in any industry.
- Puts all like-items (issues, incidents, epics, test cases etc.) on a standard platform to simplify maintenance and increase consistency in experience
- Enables first-class support of common planning concepts to lower complexity and allow users to plan without learning GitLab-specific nuances.
## Goals
### Scalability
Currently, different entities like issues, epics, merge requests etc share many
similar features but these features are implemented separately for every entity
type. This makes implementing new features or refactoring existing ones
problematic: for example, if we plan to add a new feature to issues and incidents,
we would need to implement it separately on issue and incident types. With work
items, any new feature is implemented via widgets for all existing types which
makes the architecture more scalable.
### Flexibility
With existing implementation, we have a rigid structure for issuables,
merge requests, epics etc. This structure is defined on both backend and frontend,
so any change requires a coordinated effort. Also, it would be very hard to make
this structure customizable for the user without introducing a set of flags to
enable/disable any existing feature. Work Item architecture allows frontend to
display Work Item widgets in a flexible way: whatever is present in Work Item
widgets, will be rendered on the page. This allows us to make changes fast and
makes the structure way more flexible. For example, if we want to stop
displaying labels on the Incident page, we remove labels widget from Incident
Work Item type on the backend. Also, in the future this will allow users to
define the set of widgets they want to see on custom Work Item types.
### A consistent experience
As much as we try to have consistent behavior for similar features on different
entities, we still have differences in the implementation. For example, updating
labels on merge request via GraphQL API can be done with dedicated
`setMergeRequestLabels` mutation, while for the issue we call more
coarse-grained `updateIssue`. This provides inconsistent experience for both
frontend and external API users. As a result, epics, issues, requirements, and
others all have similar but just subtle enough differences in common
interactions that the user needs to hold a complicated mental model of how they
each behave.
Work Item architecture is designed to make all the features for all the types consistent, implemented as Work Item widgets.
## High-level architecture problems to solve
- how can we bypass groups and projects consolidation to migrate epics to Work Item type;
- dealing with parent-child relationships for certain Work Item types: epic > issue > task, and to the same Work Item types: issue > issue.
- [implementing custom Work Item types and custom widgets](https://gitlab.com/gitlab-org/gitlab/-/issues/335110)
### Links
- [Work items initiative epic](https://gitlab.com/groups/gitlab-org/-/epics/6033)
- [Tasks roadmap](https://gitlab.com/groups/gitlab-org/-/epics/7103?_gl=1*zqatx*_ga*NzUyOTc3NTc1LjE2NjEzNDcwMDQ.*_ga_ENFH3X7M5Y*MTY2MjU0MDQ0MC43LjEuMTY2MjU0MDc2MC4wLjAuMA..)
- [Work Item "Vision" Prototype](https://gitlab.com/gitlab-org/gitlab/-/issues/368607)
- [Work Item Discussions](https://gitlab.com/groups/gitlab-org/-/epics/7060)
<!-- This redirect file can be deleted after <2025-07-08>. -->
<!-- Redirects that point to other docs in the same project expire in three months. -->
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->

Some files were not shown because too many files have changed in this diff Show More