diff --git a/docs/admin/code-hosts/aws-codecommit.mdx b/docs/admin/code-hosts/aws-codecommit.mdx
index 45bb54f32..474afd091 100644
--- a/docs/admin/code-hosts/aws-codecommit.mdx
+++ b/docs/admin/code-hosts/aws-codecommit.mdx
@@ -37,7 +37,7 @@ AWS CodeCommit connections support the following configuration options, which ar
{/* SCHEMA_SYNC_START: admin/code_hosts/aws_codecommit.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// REQUIRED:
diff --git a/docs/admin/code-hosts/azuredevops.mdx b/docs/admin/code-hosts/azuredevops.mdx
index 08f4c56e2..7b3ba03b5 100644
--- a/docs/admin/code-hosts/azuredevops.mdx
+++ b/docs/admin/code-hosts/azuredevops.mdx
@@ -65,7 +65,7 @@ Azure DevOps connections support the following configuration options, which are
{/* SCHEMA_SYNC_START: admin/code_hosts/azuredevops.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
// Authentication alternatives: token OR windowsPassword
diff --git a/docs/admin/code-hosts/bitbucket-cloud.mdx b/docs/admin/code-hosts/bitbucket-cloud.mdx
index 8349f8ef6..e3076e34d 100644
--- a/docs/admin/code-hosts/bitbucket-cloud.mdx
+++ b/docs/admin/code-hosts/bitbucket-cloud.mdx
@@ -116,7 +116,7 @@ Bitbucket Cloud connections support the following configuration options, which a
{/* SCHEMA_SYNC_START: admin/code_hosts/bitbucket_cloud.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// The workspace access token to use when authenticating with Bitbucket Cloud.
diff --git a/docs/admin/code-hosts/bitbucket-server.mdx b/docs/admin/code-hosts/bitbucket-server.mdx
index 01f1bd272..fb04ccd26 100644
--- a/docs/admin/code-hosts/bitbucket-server.mdx
+++ b/docs/admin/code-hosts/bitbucket-server.mdx
@@ -202,7 +202,7 @@ Bitbucket Server / Bitbucket Data Center connections support the following confi
{/* SCHEMA_SYNC_START: admin/code_hosts/bitbucket_server.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
// Authentication alternatives: token OR password
diff --git a/docs/admin/code-hosts/gerrit.mdx b/docs/admin/code-hosts/gerrit.mdx
index 651b5093a..674f96ffc 100644
--- a/docs/admin/code-hosts/gerrit.mdx
+++ b/docs/admin/code-hosts/gerrit.mdx
@@ -113,7 +113,7 @@ Gerrit connections support the following configuration options, which are specif
{/* SCHEMA_SYNC_START: admin/code_hosts/gerrit.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// If non-null, enforces Gerrit repository permissions. This requires that there is an item in the [site configuration json](https://sourcegraph.com/docs/admin/config/site_config#auth-providers) `auth.providers` field, of type "gerrit" with the same `url` field as specified in this `GerritConnection`.
diff --git a/docs/admin/code-hosts/github.mdx b/docs/admin/code-hosts/github.mdx
index 777eedc88..bf7d0297e 100644
--- a/docs/admin/code-hosts/github.mdx
+++ b/docs/admin/code-hosts/github.mdx
@@ -454,7 +454,7 @@ GitHub connections support the following configuration options, which are specif
{/* SCHEMA_SYNC_START: admin/code_hosts/github.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
// Authentication alternatives: token OR gitHubAppDetails OR externalAccount OR useRandomExternalAccount
diff --git a/docs/admin/code-hosts/gitlab.mdx b/docs/admin/code-hosts/gitlab.mdx
index e125d1fee..1e9f3ea97 100644
--- a/docs/admin/code-hosts/gitlab.mdx
+++ b/docs/admin/code-hosts/gitlab.mdx
@@ -189,7 +189,7 @@ See [Internal rate limits](/admin/code-hosts/rate-limits#internal-rate-limits).
{/* SCHEMA_SYNC_START: admin/code_hosts/gitlab.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// If non-null, enforces GitLab repository permissions. This requires that there be an item in the `auth.providers` field of type "gitlab" with the same `url` field as specified in this `GitLabConnection`.
diff --git a/docs/admin/code-hosts/gitolite.mdx b/docs/admin/code-hosts/gitolite.mdx
index d085e26ed..68aa4f062 100644
--- a/docs/admin/code-hosts/gitolite.mdx
+++ b/docs/admin/code-hosts/gitolite.mdx
@@ -25,7 +25,7 @@ To connect Gitolite to Sourcegraph:
{/* SCHEMA_SYNC_START: admin/code_hosts/gitolite.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// A list of repositories to never mirror from this Gitolite instance. Supports excluding by exact name ({"name": "foo"}).
diff --git a/docs/admin/code-hosts/other.mdx b/docs/admin/code-hosts/other.mdx
index cb0ee85c5..99441bd11 100644
--- a/docs/admin/code-hosts/other.mdx
+++ b/docs/admin/code-hosts/other.mdx
@@ -68,7 +68,7 @@ Repositories must be listed individually:
{/* SCHEMA_SYNC_START: admin/code_hosts/other_external_service.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// A list of repositories to never mirror by name after applying repositoryPathPattern. Supports excluding by exact name ({"name": "myrepo"}) or regular expression ({"pattern": ".*secret.*"}).
diff --git a/docs/admin/code-hosts/phabricator.mdx b/docs/admin/code-hosts/phabricator.mdx
index 5e05fdfaa..b145506c9 100644
--- a/docs/admin/code-hosts/phabricator.mdx
+++ b/docs/admin/code-hosts/phabricator.mdx
@@ -76,7 +76,7 @@ The Sourcegraph instance's site admin must [update the `corsOrigin` site config
{/* SCHEMA_SYNC_START: admin/code_hosts/phabricator.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// SSH cipher to use when cloning via SSH. Must be a valid choice from `ssh -Q cipher`.
diff --git a/docs/admin/config/settings.mdx b/docs/admin/config/settings.mdx
index b0347f255..300424882 100644
--- a/docs/admin/config/settings.mdx
+++ b/docs/admin/config/settings.mdx
@@ -27,7 +27,7 @@ Settings options and their default values are shown below.
{/* SCHEMA_SYNC_START: admin/config/settings.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
diff --git a/docs/admin/config/site-config.mdx b/docs/admin/config/site-config.mdx
index 36d8d9fe5..e4936c5fc 100644
--- a/docs/admin/config/site-config.mdx
+++ b/docs/admin/config/site-config.mdx
@@ -21,7 +21,7 @@ All site configuration options and their default values are shown below.
{/* SCHEMA_SYNC_START: admin/config/site.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
@@ -648,7 +648,8 @@ All site configuration options and their default values are shown below.
"tls.external": null,
- // The channel on which to automatically check for Sourcegraph updates.
+ // ⚠️ DEPRECATED: This setting has no effect.
+ // DEPRECATED: This setting has no effect.
// Valid options: "release", "none"
// Other example values:
// - "none"
diff --git a/docs/admin/repo/perforce.mdx b/docs/admin/repo/perforce.mdx
index 55a6d31e4..c0ba5fcba 100644
--- a/docs/admin/repo/perforce.mdx
+++ b/docs/admin/repo/perforce.mdx
@@ -228,7 +228,7 @@ With this setting, Sourcegraph will ignore any rules with a host other than `*`,
{/* SCHEMA_SYNC_START: admin/code_hosts/perforce.schema.json */}
{/* WARNING: This section is auto-generated during releases. Do not edit manually. */}
-{/* Last updated: 2026-03-06T20:59:17Z */}
+{/* Last updated: 2026-03-20T11:19:49Z */}
```json
{
// If non-null, enforces Perforce depot permissions.
diff --git a/docs/admin/telemetry/private-metadata-allowlist.mdx b/docs/admin/telemetry/private-metadata-allowlist.mdx
index a887e247f..609fe90f9 100644
--- a/docs/admin/telemetry/private-metadata-allowlist.mdx
+++ b/docs/admin/telemetry/private-metadata-allowlist.mdx
@@ -33,4 +33,5 @@ To learn more, refer to the [telemetry documentation](https://sourcegraph.com/do
| `cody.modelSelector` | _(all)_ | `modelId` — High-cardinality model identifier; helpful for determining the model selected in the model selector.
`modelProvider` — High-cardinality model provider; helpful for determining the model selected in the model selector. |
| `cody.smart-apply.context` | `applied` | `model` — High-cardinality model identifier; helpful for determining the model that was selected. |
| `deepsearch` | `search.toolcall` | `toolName` — High-cardinality tool name; helpful for determining which tools are being used during deep search.
`toolId` — High-cardinality tool identifier; helpful for determining which tools are being used during deep search.
`model` — High-cardinality model identifier; helpful for determining which models are being used during deep search. |
+| `admin.users` | `delete` | `userIDs` — Numeric identifiers of users being deleted; needed for audit and analytics of admin user management actions. |
| `externalApi` | `request` | `procedure` — ConnectRPC procedure path (e.g. '/sourcegraph.users.v1.UsersService/GetUser'); not sensitive and needed to distinguish which external API RPCs are being used. |
\ No newline at end of file
diff --git a/docs/cli/references/index.mdx b/docs/cli/references/index.mdx
index c511872ba..5f606371d 100644
--- a/docs/cli/references/index.mdx
+++ b/docs/cli/references/index.mdx
@@ -15,11 +15,11 @@
* [`lsp`](references/lsp)
* [`orgs`](references/orgs)
* [`repos`](references/repos)
-* [`sbom` (deprecated)](references/sbom)
+* [`sbom`](references/sbom)
* [`search`](references/search)
* [`search-jobs`](references/search-jobs)
* [`serve-git`](references/serve-git)
-* [`signature` (deprecated)](references/signature)
+* [`signature`](references/signature)
* [`snapshot`](references/snapshot)
* [`teams`](references/teams)
* [`users`](references/users)
diff --git a/docs/cli/references/sbom.mdx b/docs/cli/references/sbom.mdx
index e3750cf6a..76dd910b3 100644
--- a/docs/cli/references/sbom.mdx
+++ b/docs/cli/references/sbom.mdx
@@ -1,13 +1,11 @@
# `src sbom`
-
+## frontend: goroutine_error_percentage_long_window
+
+
percentage of failed periodic goroutine executions over a long window
+ +**Descriptions** + +- warning frontend: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s +- critical frontend: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s + +**Next steps** + +- Check service logs for error details related to the failing periodic routine +- Check if the routine depends on external services that may be unavailable +- Consider temporarily disabling the routine if it's non-critical and causing cascading issues +- More help interpreting this metric is available in the [dashboards reference](dashboards#frontend-goroutine_error_percentage_long_window). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_frontend_goroutine_error_percentage_long_window", + "critical_frontend_goroutine_error_percentage_long_window" +] +``` + +*Managed by the Sourcegraph Services team.* + +mean blocked seconds per conn request
@@ -1386,37 +1423,6 @@ Generated query for critical alert: `min(((src_gitserver_disk_space_available /container CPU throttling time %
- -**Descriptions** - -- warning gitserver: 75%+ container CPU throttling time % for 2m0s - -**Next steps** - -- - Consider increasing the CPU limit for the container. -- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-cpu_throttling_time). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_gitserver_cpu_throttling_time" -] -``` - -*Managed by the Sourcegraph Services team.* - -echo test command duration
@@ -1513,6 +1519,99 @@ Generated query for warning alert: `max((sum(src_gitserver_clone_queue)) >= 2CPU usage
+ +**Descriptions** + +- warning gitserver: 95%+ CPU usage for 10m0s + +**Next steps** + +- Consider increasing CPU limits or scaling out. +- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-cpu_usage_percentage). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_gitserver_cpu_usage_percentage" +] +``` + +*Managed by the Sourcegraph Services team.* + +memory (RSS)
+ +**Descriptions** + +- warning gitserver: 90%+ memory (RSS) for 10m0s + +**Next steps** + +- Consider increasing memory limits or scaling out. +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-memory_rss). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_gitserver_memory_rss" +] +``` + +*Managed by the Sourcegraph Services team.* + +container CPU throttling time %
+ +**Descriptions** + +- warning gitserver: 75%+ container CPU throttling time % for 2m0s + +**Next steps** + +- Consider increasing the CPU limit for the container. +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-cpu_throttling_time). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_gitserver_cpu_throttling_time" +] +``` + +*Managed by the Sourcegraph Services team.* + +rate of git command corruption retry attempts over 5m
@@ -1613,23 +1712,27 @@ Generated query for warning alert: `max((sum by (name, job_name) (rate(src_perioCPU usage
+percentage of failed periodic goroutine executions over a long window
**Descriptions** -- warning gitserver: 95%+ CPU usage for 10m0s +- warning gitserver: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s +- critical gitserver: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s **Next steps** -- Consider increasing CPU limits or scaling out. -- Learn more about the related dashboard panel in the [dashboards reference](dashboards#gitserver-cpu_usage_percentage). +- Check service logs for error details related to the failing periodic routine +- Check if the routine depends on external services that may be unavailable +- Consider temporarily disabling the routine if it's non-critical and causing cascading issues +- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-goroutine_error_percentage_long_window). - **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: ```json "observability.silenceAlerts": [ - "warning_gitserver_cpu_usage_percentage" + "warning_gitserver_goroutine_error_percentage_long_window", + "critical_gitserver_goroutine_error_percentage_long_window" ] ``` @@ -1638,38 +1741,9 @@ Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periomemory (RSS)
- -**Descriptions** +Generated query for warning alert: `max(((sum by (name, job_name) (increase(src_periodic_goroutine_errors_total{job=~".*gitserver.*"\}[6h])) / clamp_min(sum by (name, job_name) (increase(src_periodic_goroutine_total\{job=~".*gitserver.*"}[6h])), 1)) * 100) >= 30)` -- warning gitserver: 90%+ memory (RSS) for 10m0s - -**Next steps** - -- Consider increasing memory limits or scaling out. -- More help interpreting this metric is available in the [dashboards reference](dashboards#gitserver-memory_rss). -- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: - -```json -"observability.silenceAlerts": [ - "warning_gitserver_memory_rss" -] -``` - -*Managed by the Sourcegraph Services team.* - -percentage of failed periodic goroutine executions over a long window
+ +**Descriptions** + +- warning worker: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s +- critical worker: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s + +**Next steps** + +- Check service logs for error details related to the failing periodic routine +- Check if the routine depends on external services that may be unavailable +- Consider temporarily disabling the routine if it's non-critical and causing cascading issues +- More help interpreting this metric is available in the [dashboards reference](dashboards#worker-goroutine_error_percentage_long_window). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_worker_goroutine_error_percentage_long_window", + "critical_worker_goroutine_error_percentage_long_window" +] +``` + +*Managed by the Sourcegraph Services team.* + +mean blocked seconds per conn request
@@ -4488,6 +4599,43 @@ Generated query for warning alert: `max((sum by (name, job_name) (rate(src_periopercentage of failed periodic goroutine executions over a long window
+ +**Descriptions** + +- warning searcher: 30%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s +- critical searcher: 50%+ percentage of failed periodic goroutine executions over a long window for 3h0m0s + +**Next steps** + +- Check service logs for error details related to the failing periodic routine +- Check if the routine depends on external services that may be unavailable +- Consider temporarily disabling the routine if it's non-critical and causing cascading issues +- More help interpreting this metric is available in the [dashboards reference](dashboards#searcher-goroutine_error_percentage_long_window). +- **Silence this alert:** If you are aware of this alert and want to silence notifications for it, add the following to your site configuration and set a reminder to re-evaluate the alert: + +```json +"observability.silenceAlerts": [ + "warning_searcher_goroutine_error_percentage_long_window", + "critical_searcher_goroutine_error_percentage_long_window" +] +``` + +*Managed by the Sourcegraph Services team.* + +mean blocked seconds per conn request
diff --git a/docs/self-hosted/observability/dashboards.mdx b/docs/self-hosted/observability/dashboards.mdx index dce0c86d8..cf0d3326c 100644 --- a/docs/self-hosted/observability/dashboards.mdx +++ b/docs/self-hosted/observability/dashboards.mdx @@ -4539,6 +4539,32 @@ sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*frontePercentage of failed periodic goroutine executions over a long window
+ +The percentage of failed executions over the last 6 hours for each periodic goroutine. +A value above 30% sustained for at least 3 hours indicates persistent failures. +A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously. + +Refer to the [alerts reference](alerts#frontend-goroutine_error_percentage_long_window) for 2 alerts related to this panel. + +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102920` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +95th percentile handler execution time
@@ -4548,7 +4574,7 @@ Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102920` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102930` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -4573,7 +4599,7 @@ This represents how long a complete loop iteration takes before sleeping for the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102921` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102931` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -4598,7 +4624,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102930` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102940` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -4623,7 +4649,7 @@ Consistently high values might indicate problematic tenants or inefficient proce This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102931` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102941` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -4648,7 +4674,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102940` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102950` on your Sourcegraph instance. 
*Managed by the Sourcegraph Platform team.* @@ -4673,7 +4699,7 @@ A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102941` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102951` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -4698,7 +4724,7 @@ Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102950` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102960` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -4723,7 +4749,7 @@ Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102951` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/frontend/frontend?viewPanel=102961` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -4993,7 +5019,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^(frontend|sourcegraph-Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#frontend-memory_rss) for 1 alert related to this panel. @@ -6025,13 +6051,20 @@ Query:Container CPU throttling time %
+Number of git commands that exceeded the threshold for high memory usage
-- A high value indicates that the container is spending too much time waiting for CPU cycles. +This graph tracks the number of git subcommands that gitserver ran that exceeded the threshold for high memory usage. +This graph in itself is not an alert, but it is used to learn about the memory usage of gitserver. -Refer to the [alerts reference](alerts#gitserver-cpu_throttling_time) for 1 alert related to this panel. +If gitserver frequently serves requests where the status code is KILLED, this graph might help to correlate that +with the high memory usage. + +This graph spiking is not a problem necessarily. But when subcommands or the whole gitserver service are getting +OOM killed and this graph shows spikes, increasing the memory might be useful. + +This panel has no related alerts. To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100010` on your Sourcegraph instance. @@ -6043,20 +6076,17 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10001 Query: ``` -sum by (container_label_io_kubernetes_pod_name) ((rate(container_cpu_cfs_throttled_periods_total{container_label_io_kubernetes_container_name="gitserver", container_label_io_kubernetes_pod_name=~`${shard:regex}`}[5m]) / rate(container_cpu_cfs_periods_total{container_label_io_kubernetes_container_name="gitserver", container_label_io_kubernetes_pod_name=~`${shard:regex}`}[5m])) * 100) +sort_desc(sum(sum_over_time(src_gitserver_exec_high_memory_usage_count{instance=~`${shard:regex}`}[2m])) by (cmd)) ```Cpu usage seconds
+#### gitserver: running_git_commands -- This value should not exceed 75% of the CPU limit over a longer period of time. - - We cannot alert on this as we don`t know the resource allocation. +Git commands running on each gitserver instance
- - If this value is high for a longer time, consider increasing the CPU limit for the container. +A high value signals load. This panel has no related alerts. @@ -6070,20 +6100,44 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10001 Query: ``` -sum by (container_label_io_kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{container_label_io_kubernetes_container_name="gitserver", container_label_io_kubernetes_pod_name=~`${shard:regex}`}[5m])) +sum by (instance, cmd) (src_gitserver_exec_running{instance=~`${shard:regex}`}) ```Gitserver page faults
+Rate of git commands received
-The number of major page faults in a 5 minute window for gitserver. If this number increases significantly, it indicates that more git API calls need to load data from disk. There may not be enough memory to efficiently support the amount of API requests served concurrently. +per second rate per command This panel has no related alerts. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100012` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +Echo test command duration
+ + + +Refer to the [alerts reference](alerts#gitserver-echo_command_duration_test) for 1 alert related to this panel. + To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100020` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6094,28 +6148,71 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10002 Query: ``` -rate(container_memory_failures_total{failure_type="pgmajfault", name=~"^gitserver.*"}[5m]) +max(src_gitserver_echo_duration_seconds) ```Number of git commands that exceeded the threshold for high memory usage
+Number of times a repo corruption has been identified
-This graph tracks the number of git subcommands that gitserver ran that exceeded the threshold for high memory usage. -This graph in itself is not an alert, but it is used to learn about the memory usage of gitserver. +A non-null value here indicates that a problem has been detected with the gitserver repository storage. +Repository corruptions are never expected. This is a real issue. Gitserver should try to recover from them +by recloning repositories, but this may take a while depending on repo size. -If gitserver frequently serves requests where the status code is KILLED, this graph might help to correlate that -with the high memory usage. +Refer to the [alerts reference](alerts#gitserver-repo_corrupted) for 1 alert related to this panel. -This graph spiking is not a problem necessarily. But when subcommands or the whole gitserver service are getting -OOM killed and this graph shows spikes, increasing the memory might be useful. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100021` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +Repository clone queue size
+ +Refer to the [alerts reference](alerts#gitserver-repository_clone_queue_size) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100030` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +Number of concurrent requests running against gitserver client
+ +This metric is only for informational purposes. It indicates the current number of concurrently running requests by process against gitserver gRPC. + +It does not indicate any problems with the instance, but can give a good indication of load spikes or request throttling. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100021` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100031` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6125,21 +6222,58 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10002 Query: ``` -sort_desc(sum(sum_over_time(src_gitserver_exec_high_memory_usage_count{instance=~`${shard:regex}`}[2m])) by (cmd)) +sum by (job, instance) (src_gitserver_client_concurrent_requests) ```Git commands running on each gitserver instance
+Gitserver leverages memory mapping to optimize file reads: it is generally expected to consume all the memory provided to it, if it can. When it finds data that is not available in memory yet, this causes a 'page fault', and the data is loaded into memory from disk. -A high value signals load. +A trend to watch out for: when something in-application happens to take a lot of memory, and active file previously used nearly all remaining memory, then: + +1. 'Memory (RSS)' goes up, due to in-application usage +2. 'Memory usage (Active file)' goes down, as file data held in memory is evicted +3. 'Page faults' go up, as less data is held in memory (and with that, IOPS, disk read throughput, ...) + +This can also happen without 'Memory (RSS)' increasing, if the provisioned memory is insufficient to start with. +A small degree of this behaviour is generally expected, but if it happens significantly or causes user-noticeable impact, it's likely gitserver could benefit from more memory. Look for more user-facing metrics to make a final determination on appropriate resource allocation. + +_See https://en.wikipedia.org/wiki/Memory-mapped_file and the related articles for more information about memory maps._ + +#### gitserver: cpu_usage_percentage + +CPU usage
+ +Refer to the [alerts reference](alerts#gitserver-cpu_usage_percentage) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100100` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +Memory usage percentage (total)
+ +An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100030` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100101` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6149,21 +6283,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10003 Query: ``` -sum by (instance, cmd) (src_gitserver_exec_running{instance=~`${shard:regex}`}) +cadvisor_container_memory_usage_percentage_total{name=~"^gitserver.*"} ```Rate of git commands received
+Memory usage bytes (total)
-per second rate per command +An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100031` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100102` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6173,21 +6307,45 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10003 Query: ``` -sum by (cmd) (rate(src_gitserver_exec_duration_seconds_count{instance=~`${shard:regex}`}[5m])) +max by (name) (container_memory_working_set_bytes{name=~"^gitserver.*"}) ```Echo test command duration
+Memory (RSS)
+The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. +Refer to the [alerts reference](alerts#gitserver-memory_rss) for 1 alert related to this panel. -Refer to the [alerts reference](alerts#gitserver-echo_command_duration_test) for 1 alert related to this panel. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100110` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +Memory usage (active file)
+ +This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. + +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100040` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100111` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6197,23 +6355,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10004 Query: ``` -max(src_gitserver_echo_duration_seconds) +max(container_memory_total_active_file_bytes{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0 ```Number of times a repo corruption has been identified
+Memory usage (kernel)
-A non-null value here indicates that a problem has been detected with the gitserver repository storage. -Repository corruptions are never expected. This is a real issue. Gitserver should try to recover from them -by recloning repositories, but this may take a while depending on repo size. +The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. -Refer to the [alerts reference](alerts#gitserver-repo_corrupted) for 1 alert related to this panel. +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100041` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100112` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6223,19 +6379,21 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10004 Query: ``` -sum(rate(src_gitserver_repo_corrupted[5m])) +max(container_memory_kernel_usage{name=~"^gitserver.*"} / container_spec_memory_limit_bytes{name=~"^gitserver.*"}) by (name) * 100.0 ```Repository clone queue size
+Gitserver page faults
-Refer to the [alerts reference](alerts#gitserver-repository_clone_queue_size) for 1 alert related to this panel. +The number of major page faults in a 5 minute window for gitserver. If this number increases significantly, it indicates that more git API calls need to load data from disk. There may not be enough memory to efficiently support the amount of API requests served concurrently. + +This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100050` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100120` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6245,23 +6403,47 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10005 Query: ``` -sum(src_gitserver_clone_queue) +rate(container_memory_failures_total{failure_type="pgmajfault", name=~"^gitserver.*"}[5m]) ```Number of concurrent requests running against gitserver client
+Container CPU throttling time %
-This metric is only for informational purposes. It indicates the current number of concurrently running requests by process against gitserver gRPC. +A high value indicates that the container is spending too much time waiting for CPU cycles. -It does not indicate any problems with the instance, but can give a good indication of load spikes or request throttling. +Refer to the [alerts reference](alerts#gitserver-cpu_throttling_time) for 1 alert related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100130` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +Cpu usage seconds
+ +- This value should not exceed 75% of the CPU limit over a longer period of time. +- We cannot alert on this as we don't know the resource allocation. +- If this value is high for a longer time, consider increasing the CPU limit for the container. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100051` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100131` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6271,7 +6453,7 @@ To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=10005 Query: ``` -sum by (job, instance) (src_gitserver_client_concurrent_requests) +sum by (container_label_io_kubernetes_pod_name) (rate(container_cpu_usage_seconds_total{container_label_io_kubernetes_container_name="gitserver", container_label_io_kubernetes_pod_name=~`${shard:regex}`}[5m])) ``` @@ -6287,7 +6469,7 @@ A high value means any internal service trying to clone a repo from gitserver is This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100100` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100200` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6311,7 +6493,7 @@ Per shard gitservice request rate This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100101` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100201` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6335,7 +6517,7 @@ Per shard gitservice requests running This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100102` on your Sourcegraph instance. 
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100202` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6361,7 +6543,7 @@ The rate of housekeeping tasks performed in repositories, broken down by task ty This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100200` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100300` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6385,7 +6567,7 @@ The 90th percentile latency of successful housekeeping tasks, broken down by tas This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100210` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100310` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6409,7 +6591,7 @@ The 95th percentile latency of successful housekeeping tasks, broken down by tas This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100211` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100311` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6433,7 +6615,7 @@ The 99th percentile latency of successful housekeeping tasks, broken down by tas This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100212` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100312` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6457,7 +6639,7 @@ The 90th percentile latency of failed housekeeping tasks, broken down by task ty This panel has no related alerts. 
-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100220` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100320` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6481,7 +6663,7 @@ The 95th percentile latency of failed housekeeping tasks, broken down by task ty This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100221` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100321` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6505,7 +6687,7 @@ The 99th percentile latency of failed housekeeping tasks, broken down by task ty This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100222` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100322` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6529,7 +6711,7 @@ The rate of files pruned during cleanup, broken down by file type This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100230` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100330` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6553,7 +6735,7 @@ The count distribution of various Git data structures in repositories This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100240` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100340` on your Sourcegraph instance. 
*Managed by the Sourcegraph Services team.* @@ -6577,7 +6759,7 @@ The size distribution of various Git data structures in repositories This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100250` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100350` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6601,7 +6783,7 @@ The time elapsed since last optimization of various Git data structures This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100260` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100360` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6625,7 +6807,7 @@ The rate at which data structures are reported to exist in repositories This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100270` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100370` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6653,7 +6835,7 @@ This metric helps track how often the retry mechanism is triggered. Refer to the [alerts reference](alerts#gitserver-git_command_retry_attempts_rate) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100300` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100400` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6678,7 +6860,7 @@ This indicates how effective the retry mechanism is at resolving transient corru This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100301` on your Sourcegraph instance. 
+To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100401` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6703,7 +6885,7 @@ These failures will result in repository corruption marking and potential reclon This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100310` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100410` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6729,7 +6911,7 @@ Common causes include network issues, permission changes, or concurrent reposito This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100311` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100411` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6755,7 +6937,7 @@ A low ratio may indicate persistent corruption issues requiring investigation. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100312` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100412` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6782,7 +6964,7 @@ A value of 0 indicates the routine isn`t running currently, it awaits it`s next This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100400` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100500` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6807,7 +6989,7 @@ A low or zero value could indicate that a routine is stalled or encountering err This panel has no related alerts. 
-To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100401` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100501` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6832,7 +7014,7 @@ A sustained high error rate may indicate a problem with the routine`s configurat Refer to the [alerts reference](alerts#gitserver-goroutine_error_rate) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100410` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100510` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6857,7 +7039,7 @@ A value above 5% indicates that a significant portion of routine executions are Refer to the [alerts reference](alerts#gitserver-goroutine_error_percentage) for 1 alert related to this panel. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100411` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100511` on your Sourcegraph instance. *Managed by the Sourcegraph Services team.* @@ -6873,6 +7055,32 @@ sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*gitserPercentage of failed periodic goroutine executions over a long window
+ +The percentage of failed executions over the last 6 hours for each periodic goroutine. +A value above 30% sustained for at least 3 hours indicates persistent failures. +A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously. + +Refer to the [alerts reference](alerts#gitserver-goroutine_error_percentage_long_window) for 2 alerts related to this panel. + +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100520` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +95th percentile handler execution time
@@ -6882,7 +7090,7 @@ Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100420` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100530` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6907,7 +7115,7 @@ This represents how long a complete loop iteration takes before sleeping for the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100421` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100531` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6932,7 +7140,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100430` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100540` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6957,7 +7165,7 @@ Consistently high values might indicate problematic tenants or inefficient proce This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100431` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100541` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -6982,7 +7190,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100440` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100550` on your Sourcegraph instance. 
*Managed by the Sourcegraph Platform team.* @@ -7007,7 +7215,7 @@ A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100441` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100551` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -7032,7 +7240,7 @@ Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100450` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100560` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -7057,7 +7265,7 @@ Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100451` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100561` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -7073,150 +7281,6 @@ Query:CPU usage
- -Refer to the [alerts reference](alerts#gitserver-cpu_usage_percentage) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100500` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory usage percentage (total)
- -An estimate for the active memory in use, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100501` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory usage bytes (total)
- -An estimate for the active memory in use in bytes, which includes anonymous memory, file memory, and kernel memory. Some of this memory is reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100502` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory (RSS)
- -The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." - -Refer to the [alerts reference](alerts#gitserver-memory_rss) for 1 alert related to this panel. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100510` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory usage (active file)
- -This metric shows the total active file-backed memory currently in use by the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100511` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Memory usage (kernel)
- -The kernel usage metric shows the amount of memory used by the kernel on behalf of the application. Some of it may be reclaimable, so high usage does not necessarily indicate memory pressure. - -This panel has no related alerts. - -To see this panel, visit `/-/debug/grafana/d/gitserver/gitserver?viewPanel=100512` on your Sourcegraph instance. - -*Managed by the Sourcegraph Services team.* - -Percentage of failed periodic goroutine executions over a long window
+ +The percentage of failed executions over the last 6 hours for each periodic goroutine. +A value above 30% sustained for at least 3 hours indicates persistent failures. +A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously. + +Refer to the [alerts reference](alerts#worker-goroutine_error_percentage_long_window) for 2 alerts related to this panel. + +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101820` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +95th percentile handler execution time
@@ -17139,7 +17229,7 @@ Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101820` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101830` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17164,7 +17254,7 @@ This represents how long a complete loop iteration takes before sleeping for the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101821` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101831` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17189,7 +17279,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101830` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101840` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17214,7 +17304,7 @@ Consistently high values might indicate problematic tenants or inefficient proce This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101831` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101841` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17239,7 +17329,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101840` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101850` on your Sourcegraph instance. 
*Managed by the Sourcegraph Platform team.* @@ -17264,7 +17354,7 @@ A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101841` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101851` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17289,7 +17379,7 @@ Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101850` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101860` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17314,7 +17404,7 @@ Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101851` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/worker/worker?viewPanel=101861` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -17584,7 +17674,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^worker.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#worker-memory_rss) for 1 alert related to this panel. @@ -20833,6 +20923,32 @@ sum by (name, job_name) (rate(src_periodic_goroutine_errors_total{job=~".*searchPercentage of failed periodic goroutine executions over a long window
+ +The percentage of failed executions over the last 6 hours for each periodic goroutine. +A value above 30% sustained for at least 3 hours indicates persistent failures. +A value above 50% sustained for at least 3 hours indicates that most executions are failing continuously. + +Refer to the [alerts reference](alerts#searcher-goroutine_error_percentage_long_window) for 2 alerts related to this panel. + +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101420` on your Sourcegraph instance. + +*Managed by the Sourcegraph Services team.* + +95th percentile handler execution time
@@ -20842,7 +20958,7 @@ Longer durations might indicate increased load or processing time. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101420` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101430` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20867,7 +20983,7 @@ This represents how long a complete loop iteration takes before sleeping for the This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101421` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101431` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20892,7 +21008,7 @@ Higher values indicate that tenant processing is taking longer and may affect ov This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101430` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101440` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20917,7 +21033,7 @@ Consistently high values might indicate problematic tenants or inefficient proce This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101431` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101441` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20942,7 +21058,7 @@ Unexpected changes can indicate tenant configuration issues or scaling events. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101440` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101450` on your Sourcegraph instance. 
*Managed by the Sourcegraph Platform team.* @@ -20967,7 +21083,7 @@ A healthy routine should maintain a consistent processing rate. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101441` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101451` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -20992,7 +21108,7 @@ Consistent errors indicate problems with specific tenants. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101450` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101460` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -21017,7 +21133,7 @@ Values above 5% indicate significant tenant processing problems. This panel has no related alerts. -To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101451` on your Sourcegraph instance. +To see this panel, visit `/-/debug/grafana/d/searcher/searcher?viewPanel=101461` on your Sourcegraph instance. *Managed by the Sourcegraph Platform team.* @@ -21287,7 +21403,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^searcher.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#searcher-memory_rss) for 1 alert related to this panel. @@ -21815,7 +21931,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^syntect-server.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#syntect-server-memory_rss) for 1 alert related to this panel. @@ -22369,7 +22485,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^zoekt-indexserver.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#zoekt-memory_rss) for 1 alert related to this panel. @@ -22439,6 +22555,19 @@ max(container_memory_kernel_usage{name=~"^zoekt-indexserver.*"} / container_spec ### Zoekt: Zoekt-webserver (CPU, Memory) +Zoekt web server leverages memory mapping to optimize file reads: it is generally expected to consume all the memory provided to it, if it can. When it finds data that is not available in memory yet, this causes a 'page fault', and the data is loaded into memory from disk. + +A trend to watch out for: when something in-application happens to take a lot of memory, and active file previously used nearly all remaining memory, then: + +1. 'Memory (RSS)' goes up, due to in-application usage +2. 'Memory usage (Active file)' goes down, as file data held in memory is evicted +3. 'Page faults' go up, as less data is held in memory (and with that, IOPS, disk read throughput, ...) + +This can also happen without 'Memory (RSS)' increasing, if the provisioned memory is insufficient to start with. +A small degree of this behaviour is generally expected, but if it happens significantly or causes user-noticeable impact, it's likely zoekt web server could benefit from more memory. Look for more user-facing metrics to make a final determination on appropriate resource allocation. + +_See https://en.wikipedia.org/wiki/Memory-mapped_file and the related articles for more information about memory maps._ + #### zoekt: cpu_usage_percentageCPU usage
@@ -22513,7 +22642,7 @@ max by (name) (container_memory_working_set_bytes{name=~"^zoekt-webserver.*"})Memory (RSS)
-The total anonymous memory in use by the application, which includes Go stack and heap. This memory is is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS because to match the cadvisor name, but `anonymous` is more accurate." +The total anonymous memory in use by the application, which includes Go stack and heap. This memory is non-reclaimable, and high usage may trigger OOM kills. Note: the metric is named RSS to match the cadvisor name, but "anonymous" is more accurate. Refer to the [alerts reference](alerts#zoekt-memory_rss) for 1 alert related to this panel.