grafana/apps/dashboard/pkg/migration/testdata/input/v16.span_zero_demo.json

688 lines
21 KiB
JSON

{
"__requires": [
{
"id": "grafana",
"name": "Grafana",
"type": "grafana",
"version": "8.0.0"
}
],
"annotations": {
"list": []
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"links": [
{
"icon": "external link",
"targetBlank": true,
"title": "External Documentation",
"type": "link",
"url": "https://example.com/docs"
}
],
"panels": [
{
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 0
},
"options": {
"content": "This dashboard demonstrates various monitoring components for application observability and performance metrics.\n",
"mode": "markdown"
},
"title": "Application Monitoring",
"type": "text"
}
],
"refresh": "10s",
"rows": [
{
"collapse": false,
"collapsed": false,
"height": "250px",
"panels": [
{
"gridPos": {
"h": 11,
"w": 24,
"x": 0,
"y": 5
},
"id": 6,
"options": {
"content": "This service handles background processing tasks for the application system. It manages various types of operations including data synchronization, resource management, and batch processing.\n\nSupported operation types:\n1. Sync: Synchronizes data between different systems\n2. Process: Handles batch data processing tasks\n3. Cleanup: Removes outdated or temporary resources\n4. Update: Applies configuration changes across services\n\nService dependencies:\n- Data API: For reading and writing application data\n- Configuration Service: For managing system settings\n- Queue Service: For handling task scheduling\n- Storage Service: For persistent data management\n- Auth Service: For authentication and authorization\n- Metrics Service: For collecting operational statistics\n",
"mode": "markdown"
},
"span": 0,
"title": "Service Overview",
"type": "text"
},
{
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 16
},
"id": 7,
"options": {
"content": "Error monitoring helps identify issues in the system. This section displays error logs and success rates for operations.",
"mode": "markdown"
},
"span": 0,
"title": "Error Monitoring",
"type": "text"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "yellow",
"value": 0.95
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percentunit"
}
},
"gridPos": {
"h": 9,
"w": 3,
"x": 0,
"y": 19
},
"id": 8,
"span": 0,
"targets": [
{
"expr": "sum by (action) (app_jobs_processed_total{outcome=\"success\", cluster=\"$cluster\", namespace=\"default\"})\n/\nsum by (action) (app_jobs_processed_total{cluster=\"$cluster\", namespace=\"default\"})\n",
"legendFormat": "{{action}}"
}
],
"title": "Job Success Rate",
"type": "stat"
},
{
"datasource": {
"type": "loki",
"uid": "${loki}"
},
"gridPos": {
"h": 9,
"w": 10,
"x": 3,
"y": 19
},
"id": 9,
"options": {
"enableLogDetails": true,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"span": 0,
"targets": [
{
"expr": "{namespace=\"default\", cluster=\"$cluster\", job=\"app-service\"} | logfmt | level=\"error\""
}
],
"title": "Errors",
"type": "logs"
},
{
"datasource": {
"type": "loki",
"uid": "${loki}"
},
"gridPos": {
"h": 9,
"w": 11,
"x": 13,
"y": 19
},
"id": 10,
"options": {
"enableLogDetails": true,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"span": 0,
"targets": [
{
"expr": "{namespace=\"default\", cluster=\"$cluster\", job=\"app-service\"} | logfmt"
}
],
"title": "All",
"type": "logs"
},
{
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 28
},
"id": 11,
"options": {
"content": "Performance monitoring examines factors that affect system response times, including operation duration, queue lengths, and processing delays. This section provides metrics and traces for performance analysis.\n",
"mode": "markdown"
},
"span": 0,
"title": "Performance Analysis",
"type": "text"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"description": "Number of concurrent processing threads available for handling operations",
"gridPos": {
"h": 6,
"w": 5,
"x": 0,
"y": 31
},
"id": 12,
"span": 0,
"targets": [
{
"expr": "max(app_worker_threads_active{cluster=\"$cluster\", namespace=\"default\"})",
"instant": true
}
],
"title": "Concurrent Job Drivers",
"type": "stat"
},
{
"datasource": {
"type": "tempo",
"uid": "${tempo}"
},
"gridPos": {
"h": 6,
"w": 19,
"x": 5,
"y": 31
},
"id": 13,
"span": 0,
"targets": [
{
"filters": [
{
"id": "span-name",
"operator": "=",
"scope": "span",
"tag": "name",
"value": [
"provisioning.sync.process"
]
},
{
"id": "k8s-cluster-name",
"operator": "=",
"scope": "resource",
"tag": "k8s.cluster.name",
"value": [
"$cluster"
]
}
],
"query": "{name=\"app.operation.process\"}",
"queryType": "traceqlSearch"
}
],
"title": "Recent Operation Traces",
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"description": "Histogram showing p99, p95, p50, and p10 percentiles for job processing duration based on number of resources changed",
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 2
},
{
"color": "red",
"value": 5
}
]
},
"unit": "s"
}
},
"gridPos": {
"h": 10,
"w": 8,
"x": 0,
"y": 55
},
"id": 14,
"span": 0,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (le, resources_changed_bucket, action)) and on(resources_changed_bucket, action) sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (resources_changed_bucket, action) > 0",
"legendFormat": "{{action}} q0.99 - size {{resources_changed_bucket}}",
"refId": "B"
},
{
"expr": "histogram_quantile(0.9, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (le, resources_changed_bucket, action)) and on(resources_changed_bucket, action) sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (resources_changed_bucket, action) > 0",
"legendFormat": "{{action}} q0.95 - size {{resources_changed_bucket}}",
"refId": "C"
},
{
"expr": "histogram_quantile(0.5, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (le, resources_changed_bucket, action)) and on(resources_changed_bucket, action) sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (resources_changed_bucket, action) > 0",
"legendFormat": "{{action}} q0.5 - size {{resources_changed_bucket}}",
"refId": "D"
},
{
"expr": "histogram_quantile(0.1, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (le, resources_changed_bucket, action)) and on(resources_changed_bucket, action) sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (resources_changed_bucket, action) > 0",
"legendFormat": "{{action}} q0.1 - size {{resources_changed_bucket}}",
"refId": "E"
}
],
"timeFrom": "7d",
"title": "7d avg of job durations",
"transformations": [
{
"id": "reduce",
"options": {
"mode": "seriesToRows",
"reducers": [
"mean"
]
}
},
{
"id": "seriesToRows"
},
{
"id": "organize",
"options": {
"renameByName": {
"Field": "Type",
"Mean": "Avg Duration",
"Metric": "Legend",
"Value": "Duration"
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"description": "Histogram showing p99, p95, p50, and p10 percentiles for job processing duration based on number of resources changed",
"gridPos": {
"h": 10,
"w": 16,
"x": 8,
"y": 55
},
"id": 15,
"span": 0,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[5m])) by (le, resources_changed_bucket, action))",
"legendFormat": "{{action}} q0.99 - size {{resources_changed_bucket}}",
"refId": "B"
},
{
"expr": "histogram_quantile(0.95, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[5m])) by (le, resources_changed_bucket, action))",
"legendFormat": "{{action}} q0.95 - size {{resources_changed_bucket}}",
"refId": "C"
},
{
"expr": "histogram_quantile(0.5, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[5m])) by (le, resources_changed_bucket, action))",
"legendFormat": "{{action}} q0.5 - size {{resources_changed_bucket}}",
"refId": "D"
},
{
"expr": "histogram_quantile(0.1, sum(rate(app_operation_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[5m])) by (le, resources_changed_bucket, action))",
"legendFormat": "{{action}} q0.1 - size {{resources_changed_bucket}}",
"refId": "E"
}
],
"title": "Job Duration",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"description": "Total number of jobs waiting to be processed",
"gridPos": {
"h": 5,
"w": 4,
"x": 0,
"y": 65
},
"id": 16,
"span": 0,
"targets": [
{
"expr": "clamp_min(sum(app_operation_queue_size{cluster=\"$cluster\", namespace=\"default\"}), 0)",
"legendFormat": "Queue size"
}
],
"title": "Queue Size",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"fieldConfig": {
"defaults": {
"unit": "s"
}
},
"gridPos": {
"h": 5,
"w": 4,
"x": 4,
"y": 65
},
"id": 17,
"span": 0,
"targets": [
{
"expr": "avg(histogram_quantile(0.5, sum(rate(app_operation_queue_wait_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[7d])) by (le)))",
"legendFormat": "Queue size"
}
],
"timeFrom": "7d",
"title": "7d avg Queue Wait Time",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"description": "How long a job is in the queue before being picked up",
"gridPos": {
"h": 5,
"w": 16,
"x": 8,
"y": 65
},
"id": 18,
"span": 0,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(app_operation_queue_wait_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[$__rate_interval])) by (le))",
"legendFormat": "q0.99",
"refId": "B"
},
{
"expr": "histogram_quantile(0.95, sum(rate(app_operation_queue_wait_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[$__rate_interval])) by (le))",
"legendFormat": "q0.95",
"refId": "C"
},
{
"expr": "histogram_quantile(0.5, sum(rate(app_operation_queue_wait_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[$__rate_interval])) by (le))",
"legendFormat": "q0.5",
"refId": "D"
},
{
"expr": "histogram_quantile(0.1, sum(rate(app_operation_queue_wait_seconds_bucket{cluster=\"$cluster\", namespace=\"default\"}[$__rate_interval])) by (le))",
"legendFormat": "q0.1",
"refId": "E"
}
],
"title": "Queue Wait Time",
"type": "timeseries"
},
{
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 52
},
"id": 19,
"options": {
"content": "Resource utilization monitoring for application containers",
"mode": "markdown"
},
"span": 0,
"title": "Resource Monitoring",
"type": "text"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"gridPos": {
"h": 9,
"w": 7,
"x": 0,
"y": 55
},
"id": 20,
"span": 0,
"targets": [
{
"expr": "count by (cluster, channel)(label_replace(label_replace(kube_pod_container_info{namespace=\"default\", container=\"app-worker\", pod=~\"app-worker.*\", cluster=~\"$cluster\"}, \"version\", \"$1\", \"image\", \".+:(.+)\"), \"channel\", \"$1\", \"container\", \".+-(.+)\"))",
"legendFormat": "{{cluster}}"
}
],
"title": "Running Pod(s)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"gridPos": {
"h": 9,
"w": 8,
"x": 7,
"y": 55
},
"id": 21,
"span": 0,
"targets": [
{
"expr": "max(kube_pod_container_resource_requests{namespace=\"default\", resource=\"memory\", cluster=~\"$cluster\", container=\"app-worker\", pod=~\"app-worker.*\"})",
"legendFormat": "Memory Request"
},
{
"expr": "max(kube_pod_container_resource_limits{namespace=\"default\", resource=\"memory\", cluster=~\"$cluster\", container=\"app-worker\", pod=~\"app-worker.*\"})",
"legendFormat": "Memory Limit"
},
{
"expr": "max(container_memory_usage_bytes{namespace=\"default\",cluster=~\"$cluster\", container=\"app-worker\", pod=~\"app-worker.*\"}) by (pod)",
"legendFormat": "Container usage {{pod}}"
}
],
"title": "Memory Utilization",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"gridPos": {
"h": 9,
"w": 9,
"x": 15,
"y": 55
},
"id": 22,
"span": 0,
"targets": [
{
"expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"default\", cluster=~\"$cluster\", container=\"app-worker\", pod=~\"app-worker-.*\"}[$__rate_interval])) by (pod, container, cpu)",
"legendFormat": "Usage {{pod}}"
},
{
"expr": "sum(irate(container_cpu_cfs_throttled_seconds_total{namespace=\"default\", cluster=~\"$cluster\", container=\"app-worker\", pod=~\"app-worker-.*\"}[$__rate_interval])) by (pod, container)",
"legendFormat": "Throttling {{pod}}"
},
{
"expr": "max(kube_pod_container_resource_limits{namespace=\"default\", cluster=~\"$cluster\", container=\"app-worker\", pod=~\"app-worker-.*\", resource=\"cpu\"})",
"legendFormat": "CPU limit"
},
{
"expr": "max(kube_pod_container_resource_requests{namespace=\"default\", cluster=~\"$cluster\", container=\"app-worker\", pod=~\"app-worker-.*\", resource=\"cpu\"})",
"legendFormat": "CPU request"
}
],
"title": "CPU Utilization",
"type": "timeseries"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Application Service",
"titleSize": "h6"
}
],
"schemaVersion": 15,
"style": "dark",
"tags": [
"as-code"
],
"templating": {
"list": [
{
"current": {
"value": "prometheus-datasource"
},
"hide": 0,
"label": "Data source",
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"current": {
"value": "prometheus-datasource"
},
"name": "prom",
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"current": {
"value": "loki-datasource"
},
"name": "loki",
"query": "loki",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"current": {
"text": "tempo-datasource",
"value": "tempo-datasource"
},
"name": "tempo",
"query": "tempo",
"refresh": 1,
"regex": ".*tempo.*",
"type": "datasource"
},
{
"current": {
"text": "demo-cluster",
"value": "demo-cluster"
},
"datasource": {
"type": "prometheus",
"uid": "${prom}"
},
"name": "cluster",
"query": "label_values(app_worker_threads_active,cluster)",
"refresh": 1,
"type": "query"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "utc",
"title": "Span Zero Demo Dashboard",
"uid": "span-zero-demo-dashboard",
"version": 0
}