mirror of https://github.com/kubevela/kubevela.git
Feat: Application Status Metrics & Structured Logs (#6857)
Feat: Application Status Metrics & Structured Logs Signed-off-by: Brian Kane <briankane1@gmail.com>
This commit is contained in:
parent
721c75e44a
commit
1a934e1618
|
@ -99,6 +99,7 @@ helm install --create-namespace -n vela-system kubevela kubevela/vela-core --wai
|
|||
| `featureGates.sharedDefinitionStorageForApplicationRevision` | use definition cache to reduce duplicated definition storage for application revision, must be used with InformerCacheFilterUnnecessaryFields | `true` |
|
||||
| `featureGates.disableWorkflowContextConfigMapCache` | disable the workflow context's configmap informer cache | `true` |
|
||||
| `featureGates.enableCueValidation` | enable the strict cue validation for cue required parameter fields | `false` |
|
||||
| `featureGates.enableApplicationStatusMetrics` | enable application status metrics and structured logging | `false` |
|
||||
|
||||
### MultiCluster parameters
|
||||
|
||||
|
|
|
@ -312,6 +312,7 @@ spec:
|
|||
- "--feature-gates=SharedDefinitionStorageForApplicationRevision={{- .Values.featureGates.sharedDefinitionStorageForApplicationRevision | toString -}}"
|
||||
- "--feature-gates=DisableWorkflowContextConfigMapCache={{- .Values.featureGates.disableWorkflowContextConfigMapCache | toString -}}"
|
||||
- "--feature-gates=EnableCueValidation={{- .Values.featureGates.enableCueValidation | toString -}}"
|
||||
- "--feature-gates=EnableApplicationStatusMetrics={{- .Values.featureGates.enableApplicationStatusMetrics | toString -}}"
|
||||
{{ if .Values.authentication.enabled }}
|
||||
{{ if .Values.authentication.withUser }}
|
||||
- "--authentication-with-user"
|
||||
|
|
|
@ -124,6 +124,7 @@ optimize:
|
|||
##@param featureGates.sharedDefinitionStorageForApplicationRevision use definition cache to reduce duplicated definition storage for application revision, must be used with InformerCacheFilterUnnecessaryFields
|
||||
##@param featureGates.disableWorkflowContextConfigMapCache disable the workflow context's configmap informer cache
|
||||
##@param featureGates.enableCueValidation enable the strict cue validation for cue required parameter fields
|
||||
##@param featureGates.enableApplicationStatusMetrics enable application status metrics and structured logging
|
||||
##@param
|
||||
featureGates:
|
||||
gzipResourceTracker: false
|
||||
|
@ -140,6 +141,7 @@ featureGates:
|
|||
sharedDefinitionStorageForApplicationRevision: true
|
||||
disableWorkflowContextConfigMapCache: true
|
||||
enableCueValidation: false
|
||||
enableApplicationStatusMetrics: false
|
||||
|
||||
## @section MultiCluster parameters
|
||||
|
||||
|
|
|
@ -475,6 +475,9 @@ func (r *Reconciler) writeStatusByMethod(ctx context.Context, method method, app
|
|||
executor.StepStatusCache.Store(fmt.Sprintf("%s-%s", app.Name, app.Namespace), -1)
|
||||
return err
|
||||
}
|
||||
if feature.DefaultMutableFeatureGate.Enabled(features.EnableApplicationStatusMetrics) {
|
||||
r.updateMetricsAndLog(ctx, app)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -591,6 +594,9 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
|
|||
|
||||
// Setup adds a controller that reconciles App.
|
||||
func Setup(mgr ctrl.Manager, args core.Args) error {
|
||||
// Register application status metrics after feature gates are initialized
|
||||
metrics.RegisterApplicationStatusMetrics()
|
||||
|
||||
reconciler := Reconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Scheme: mgr.GetScheme(),
|
||||
|
|
|
@ -0,0 +1,206 @@
|
|||
/*
|
||||
Copyright 2021 The KubeVela Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package application
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
workflowv1alpha1 "github.com/kubevela/workflow/api/v1alpha1"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
"github.com/oam-dev/kubevela/apis/core.oam.dev/common"
|
||||
"github.com/oam-dev/kubevela/apis/core.oam.dev/v1beta1"
|
||||
"github.com/oam-dev/kubevela/pkg/monitor/metrics"
|
||||
)
|
||||
|
||||
// HealthStatus is the aggregated health view of an application, derived from
// the health flags of its individual services.
type HealthStatus struct {
	// Healthy is the overall verdict for the application.
	Healthy bool
	// HealthyCount is the number of services reported as healthy.
	HealthyCount int
	// UnhealthyCount is the number of services reported as not healthy.
	UnhealthyCount int
}
|
||||
|
||||
// updateMetricsAndLog updates Prometheus metrics and logs application status with service details
|
||||
func (r *Reconciler) updateMetricsAndLog(_ context.Context, app *v1beta1.Application) {
|
||||
healthStatus := calculateHealthStatus(app.Status.Services)
|
||||
|
||||
updateHealthMetric(app, healthStatus.Healthy)
|
||||
updatePhaseMetrics(app)
|
||||
|
||||
workflowStatus := buildWorkflowStatus(app.Status.Workflow)
|
||||
serviceDetails := buildServiceDetails(app.Status.Services)
|
||||
logApplicationStatus(app, healthStatus, workflowStatus, serviceDetails)
|
||||
}
|
||||
|
||||
// calculateHealthStatus calculates the health status from services
|
||||
func calculateHealthStatus(services []common.ApplicationComponentStatus) HealthStatus {
|
||||
status := HealthStatus{
|
||||
Healthy: true,
|
||||
}
|
||||
|
||||
for _, svc := range services {
|
||||
if svc.Healthy {
|
||||
status.HealthyCount++
|
||||
} else {
|
||||
status.UnhealthyCount++
|
||||
status.Healthy = false
|
||||
}
|
||||
}
|
||||
|
||||
return status
|
||||
}
|
||||
|
||||
// updateHealthMetric updates the application health status metric
|
||||
func updateHealthMetric(app *v1beta1.Application, healthy bool) {
|
||||
healthValue := float64(1)
|
||||
if !healthy {
|
||||
healthValue = float64(0)
|
||||
}
|
||||
|
||||
metrics.ApplicationHealthStatus.WithLabelValues(
|
||||
app.Name,
|
||||
app.Namespace,
|
||||
).Set(healthValue)
|
||||
}
|
||||
|
||||
// updatePhaseMetrics updates the application and workflow phase metrics
|
||||
func updatePhaseMetrics(app *v1beta1.Application) {
|
||||
metrics.ApplicationPhase.WithLabelValues(
|
||||
app.Name,
|
||||
app.Namespace,
|
||||
).Set(appPhaseToNumeric(app.Status.Phase))
|
||||
|
||||
if app.Status.Workflow != nil && app.Status.Workflow.Phase != "" {
|
||||
metrics.WorkflowPhase.WithLabelValues(
|
||||
app.Name,
|
||||
app.Namespace,
|
||||
).Set(workflowPhaseToNumeric(app.Status.Workflow.Phase))
|
||||
}
|
||||
}
|
||||
|
||||
// buildWorkflowStatus builds workflow status information for logging
|
||||
func buildWorkflowStatus(workflow *common.WorkflowStatus) map[string]interface{} {
|
||||
if workflow == nil {
|
||||
return make(map[string]interface{})
|
||||
}
|
||||
|
||||
return map[string]interface{}{
|
||||
"app_revision": workflow.AppRevision,
|
||||
"finished": workflow.Finished,
|
||||
"phase": workflow.Phase,
|
||||
"message": workflow.Message,
|
||||
}
|
||||
}
|
||||
|
||||
// buildServiceDetails builds service details for logging
|
||||
func buildServiceDetails(services []common.ApplicationComponentStatus) []map[string]interface{} {
|
||||
serviceDetails := make([]map[string]interface{}, 0, len(services))
|
||||
|
||||
for _, svc := range services {
|
||||
svcDetails := map[string]interface{}{
|
||||
"name": svc.Name,
|
||||
"namespace": svc.Namespace,
|
||||
"cluster": svc.Cluster,
|
||||
"healthy": svc.Healthy,
|
||||
"message": svc.Message,
|
||||
}
|
||||
if len(svc.Details) > 0 {
|
||||
svcDetails["details"] = svc.Details
|
||||
}
|
||||
serviceDetails = append(serviceDetails, svcDetails)
|
||||
}
|
||||
|
||||
return serviceDetails
|
||||
}
|
||||
|
||||
// logApplicationStatus logs the application status with structured data
|
||||
func logApplicationStatus(app *v1beta1.Application, healthStatus HealthStatus, workflowStatus map[string]interface{}, serviceDetails []map[string]interface{}) {
|
||||
statusDetails := map[string]interface{}{
|
||||
"app_uid": app.UID,
|
||||
"app_name": app.Name,
|
||||
"version": app.ResourceVersion,
|
||||
"namespace": app.Namespace,
|
||||
"labels": app.Labels,
|
||||
"status": map[string]interface{}{
|
||||
"phase": string(app.Status.Phase),
|
||||
"healthy": healthStatus.Healthy,
|
||||
"healthy_services_count": healthStatus.HealthyCount,
|
||||
"unhealthy_services_count": healthStatus.UnhealthyCount,
|
||||
"services": serviceDetails,
|
||||
"workflow": workflowStatus,
|
||||
},
|
||||
}
|
||||
|
||||
klog.InfoS("application update",
|
||||
"app_uid", app.UID,
|
||||
"app_name", app.Name,
|
||||
"namespace", app.Namespace,
|
||||
"phase", string(app.Status.Phase),
|
||||
"healthy", healthStatus.Healthy,
|
||||
"data", statusDetails,
|
||||
)
|
||||
}
|
||||
|
||||
// appPhaseToNumeric converts application phase to numeric value for metrics
|
||||
func appPhaseToNumeric(phase common.ApplicationPhase) float64 {
|
||||
switch phase {
|
||||
case common.ApplicationStarting:
|
||||
return 0
|
||||
case common.ApplicationRunning:
|
||||
return 1
|
||||
case common.ApplicationRendering:
|
||||
return 2
|
||||
case common.ApplicationPolicyGenerating:
|
||||
return 3
|
||||
case common.ApplicationRunningWorkflow:
|
||||
return 4
|
||||
case common.ApplicationWorkflowSuspending:
|
||||
return 5
|
||||
case common.ApplicationWorkflowTerminated:
|
||||
return 6
|
||||
case common.ApplicationWorkflowFailed:
|
||||
return 7
|
||||
case common.ApplicationUnhealthy:
|
||||
return 8
|
||||
case common.ApplicationDeleting:
|
||||
return 9
|
||||
default:
|
||||
return -1
|
||||
}
|
||||
}
|
||||
|
||||
// workflowPhaseToNumeric converts workflow phase to numeric value for metrics
|
||||
func workflowPhaseToNumeric(phase workflowv1alpha1.WorkflowRunPhase) float64 {
|
||||
switch phase {
|
||||
case workflowv1alpha1.WorkflowStateInitializing:
|
||||
return 0
|
||||
case workflowv1alpha1.WorkflowStateSucceeded:
|
||||
return 1
|
||||
case workflowv1alpha1.WorkflowStateExecuting:
|
||||
return 2
|
||||
case workflowv1alpha1.WorkflowStateSuspending:
|
||||
return 3
|
||||
case workflowv1alpha1.WorkflowStateTerminated:
|
||||
return 4
|
||||
case workflowv1alpha1.WorkflowStateFailed:
|
||||
return 5
|
||||
case workflowv1alpha1.WorkflowStateSkipped:
|
||||
return 6
|
||||
default:
|
||||
return -1
|
||||
}
|
||||
}
|
|
@ -0,0 +1,545 @@
|
|||
/*
|
||||
Copyright 2021 The KubeVela Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package application
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
workflowv1alpha1 "github.com/kubevela/workflow/api/v1alpha1"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||
"github.com/stretchr/testify/assert"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
|
||||
"github.com/oam-dev/kubevela/apis/core.oam.dev/common"
|
||||
"github.com/oam-dev/kubevela/apis/core.oam.dev/v1beta1"
|
||||
"github.com/oam-dev/kubevela/pkg/monitor/metrics"
|
||||
)
|
||||
|
||||
func TestCalculateHealthStatus(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
services []common.ApplicationComponentStatus
|
||||
expected HealthStatus
|
||||
}{
|
||||
{
|
||||
name: "all services healthy",
|
||||
services: []common.ApplicationComponentStatus{
|
||||
{Healthy: true},
|
||||
{Healthy: true},
|
||||
{Healthy: true},
|
||||
},
|
||||
expected: HealthStatus{
|
||||
Healthy: true,
|
||||
HealthyCount: 3,
|
||||
UnhealthyCount: 0,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "some unhealthy",
|
||||
services: []common.ApplicationComponentStatus{
|
||||
{Healthy: true},
|
||||
{Healthy: false},
|
||||
{Healthy: true},
|
||||
},
|
||||
expected: HealthStatus{
|
||||
Healthy: false,
|
||||
HealthyCount: 2,
|
||||
UnhealthyCount: 1,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "all services unhealthy",
|
||||
services: []common.ApplicationComponentStatus{
|
||||
{Healthy: false},
|
||||
{Healthy: false},
|
||||
},
|
||||
expected: HealthStatus{
|
||||
Healthy: false,
|
||||
HealthyCount: 0,
|
||||
UnhealthyCount: 2,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "no services",
|
||||
services: []common.ApplicationComponentStatus{},
|
||||
expected: HealthStatus{
|
||||
Healthy: true,
|
||||
HealthyCount: 0,
|
||||
UnhealthyCount: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := calculateHealthStatus(tt.services)
|
||||
assert.Equal(t, tt.expected, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppPhaseToNumeric(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
phase common.ApplicationPhase
|
||||
want float64
|
||||
}{
|
||||
{"starting", common.ApplicationStarting, 0},
|
||||
{"running", common.ApplicationRunning, 1},
|
||||
{"rendering", common.ApplicationRendering, 2},
|
||||
{"policy generating", common.ApplicationPolicyGenerating, 3},
|
||||
{"running workflow", common.ApplicationRunningWorkflow, 4},
|
||||
{"workflow suspending", common.ApplicationWorkflowSuspending, 5},
|
||||
{"workflow terminated", common.ApplicationWorkflowTerminated, 6},
|
||||
{"workflow failed", common.ApplicationWorkflowFailed, 7},
|
||||
{"unhealthy", common.ApplicationUnhealthy, 8},
|
||||
{"deleting", common.ApplicationDeleting, 9},
|
||||
{"unknown", common.ApplicationPhase("unknown"), -1},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := appPhaseToNumeric(tt.phase)
|
||||
assert.Equal(t, tt.want, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkflowPhaseToNumeric(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
phase workflowv1alpha1.WorkflowRunPhase
|
||||
want float64
|
||||
}{
|
||||
{"initializing", workflowv1alpha1.WorkflowStateInitializing, 0},
|
||||
{"succeeded", workflowv1alpha1.WorkflowStateSucceeded, 1},
|
||||
{"executing", workflowv1alpha1.WorkflowStateExecuting, 2},
|
||||
{"suspending", workflowv1alpha1.WorkflowStateSuspending, 3},
|
||||
{"terminated", workflowv1alpha1.WorkflowStateTerminated, 4},
|
||||
{"failed", workflowv1alpha1.WorkflowStateFailed, 5},
|
||||
{"skipped", workflowv1alpha1.WorkflowStateSkipped, 6},
|
||||
{"unknown", workflowv1alpha1.WorkflowRunPhase("unknown"), -1},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := workflowPhaseToNumeric(tt.phase)
|
||||
assert.Equal(t, tt.want, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildWorkflowStatus(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
workflow *common.WorkflowStatus
|
||||
want map[string]interface{}
|
||||
}{
|
||||
{
|
||||
name: "nil workflow",
|
||||
workflow: nil,
|
||||
want: map[string]interface{}{},
|
||||
},
|
||||
{
|
||||
name: "workflow with data",
|
||||
workflow: &common.WorkflowStatus{
|
||||
AppRevision: "rev-1",
|
||||
Finished: true,
|
||||
Phase: workflowv1alpha1.WorkflowStateSucceeded,
|
||||
Message: "Workflow completed",
|
||||
},
|
||||
want: map[string]interface{}{
|
||||
"app_revision": "rev-1",
|
||||
"finished": true,
|
||||
"phase": workflowv1alpha1.WorkflowStateSucceeded,
|
||||
"message": "Workflow completed",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := buildWorkflowStatus(tt.workflow)
|
||||
assert.Equal(t, tt.want, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildServiceDetails(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
services []common.ApplicationComponentStatus
|
||||
want []map[string]interface{}
|
||||
}{
|
||||
{
|
||||
name: "empty services",
|
||||
services: []common.ApplicationComponentStatus{},
|
||||
want: []map[string]interface{}{},
|
||||
},
|
||||
{
|
||||
name: "services with details",
|
||||
services: []common.ApplicationComponentStatus{
|
||||
{
|
||||
Name: "web",
|
||||
Namespace: "default",
|
||||
Cluster: "local",
|
||||
Healthy: true,
|
||||
Message: "Running",
|
||||
Details: map[string]string{"replicas": "3"},
|
||||
},
|
||||
{
|
||||
Name: "db",
|
||||
Namespace: "default",
|
||||
Cluster: "local",
|
||||
Healthy: false,
|
||||
Message: "Connection failed",
|
||||
},
|
||||
},
|
||||
want: []map[string]interface{}{
|
||||
{
|
||||
"name": "web",
|
||||
"namespace": "default",
|
||||
"cluster": "local",
|
||||
"healthy": true,
|
||||
"message": "Running",
|
||||
"details": map[string]string{"replicas": "3"},
|
||||
},
|
||||
{
|
||||
"name": "db",
|
||||
"namespace": "default",
|
||||
"cluster": "local",
|
||||
"healthy": false,
|
||||
"message": "Connection failed",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := buildServiceDetails(tt.services)
|
||||
assert.Equal(t, tt.want, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateHealthMetric(t *testing.T) {
|
||||
// Reset the metric before testing
|
||||
metrics.ApplicationHealthStatus.Reset()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
app *v1beta1.Application
|
||||
healthy bool
|
||||
expectedValue float64
|
||||
}{
|
||||
{
|
||||
name: "healthy application",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app",
|
||||
Namespace: "default",
|
||||
},
|
||||
},
|
||||
healthy: true,
|
||||
expectedValue: 1,
|
||||
},
|
||||
{
|
||||
name: "unhealthy application",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app",
|
||||
Namespace: "default",
|
||||
},
|
||||
},
|
||||
healthy: false,
|
||||
expectedValue: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
updateHealthMetric(tt.app, tt.healthy)
|
||||
|
||||
value := testutil.ToFloat64(metrics.ApplicationHealthStatus.WithLabelValues(
|
||||
tt.app.Name,
|
||||
tt.app.Namespace,
|
||||
))
|
||||
assert.Equal(t, tt.expectedValue, value)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdatePhaseMetrics(t *testing.T) {
|
||||
// Reset metrics before testing
|
||||
metrics.ApplicationPhase.Reset()
|
||||
metrics.WorkflowPhase.Reset()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
app *v1beta1.Application
|
||||
expectedAppPhase float64
|
||||
expectedWorkflowPhase float64
|
||||
hasWorkflowMetric bool
|
||||
}{
|
||||
{
|
||||
name: "app with workflow",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app",
|
||||
Namespace: "default",
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationRunning,
|
||||
Workflow: &common.WorkflowStatus{
|
||||
Phase: workflowv1alpha1.WorkflowStateSucceeded,
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedAppPhase: 1, // ApplicationRunning
|
||||
expectedWorkflowPhase: 1, // WorkflowStateSucceeded
|
||||
hasWorkflowMetric: true,
|
||||
},
|
||||
{
|
||||
name: "app without workflow",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app-2",
|
||||
Namespace: "default",
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationStarting,
|
||||
},
|
||||
},
|
||||
expectedAppPhase: 0, // ApplicationStarting
|
||||
hasWorkflowMetric: false,
|
||||
},
|
||||
{
|
||||
name: "app with empty workflow phase",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app-3",
|
||||
Namespace: "default",
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationUnhealthy,
|
||||
Workflow: &common.WorkflowStatus{
|
||||
Phase: "", // Empty phase
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedAppPhase: 8, // ApplicationUnhealthy
|
||||
hasWorkflowMetric: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
updatePhaseMetrics(tt.app)
|
||||
|
||||
appPhaseValue := testutil.ToFloat64(metrics.ApplicationPhase.WithLabelValues(
|
||||
tt.app.Name,
|
||||
tt.app.Namespace,
|
||||
))
|
||||
assert.Equal(t, tt.expectedAppPhase, appPhaseValue)
|
||||
|
||||
if tt.hasWorkflowMetric {
|
||||
workflowPhaseValue := testutil.ToFloat64(metrics.WorkflowPhase.WithLabelValues(
|
||||
tt.app.Name,
|
||||
tt.app.Namespace,
|
||||
))
|
||||
assert.Equal(t, tt.expectedWorkflowPhase, workflowPhaseValue)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogApplicationStatus(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
app *v1beta1.Application
|
||||
healthStatus HealthStatus
|
||||
workflowStatus map[string]interface{}
|
||||
serviceDetails []map[string]interface{}
|
||||
}{
|
||||
{
|
||||
name: "complete status",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app",
|
||||
Namespace: "default",
|
||||
UID: "12345",
|
||||
Labels: map[string]string{"env": "prod"},
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationRunning,
|
||||
},
|
||||
},
|
||||
healthStatus: HealthStatus{
|
||||
Healthy: true,
|
||||
HealthyCount: 2,
|
||||
UnhealthyCount: 0,
|
||||
},
|
||||
workflowStatus: map[string]interface{}{
|
||||
"phase": workflowv1alpha1.WorkflowStateSucceeded,
|
||||
"finished": true,
|
||||
},
|
||||
serviceDetails: []map[string]interface{}{
|
||||
{
|
||||
"name": "web",
|
||||
"healthy": true,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "minimal status",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app-minimal",
|
||||
Namespace: "default",
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationStarting,
|
||||
},
|
||||
},
|
||||
healthStatus: HealthStatus{Healthy: true},
|
||||
workflowStatus: map[string]interface{}{},
|
||||
serviceDetails: []map[string]interface{}{},
|
||||
},
|
||||
{
|
||||
name: "nil values",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app-nil",
|
||||
Namespace: "default",
|
||||
},
|
||||
},
|
||||
healthStatus: HealthStatus{},
|
||||
workflowStatus: nil,
|
||||
serviceDetails: nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.NotPanics(t, func() {
|
||||
logApplicationStatus(tt.app, tt.healthStatus, tt.workflowStatus, tt.serviceDetails)
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMetricsAndLogFunction(t *testing.T) {
|
||||
// Reset metrics before testing
|
||||
metrics.ApplicationHealthStatus.Reset()
|
||||
metrics.ApplicationPhase.Reset()
|
||||
metrics.WorkflowPhase.Reset()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
app *v1beta1.Application
|
||||
}{
|
||||
{
|
||||
name: "complete application",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app",
|
||||
Namespace: "default",
|
||||
UID: "12345",
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationRunning,
|
||||
Services: []common.ApplicationComponentStatus{
|
||||
{
|
||||
Name: "web",
|
||||
Namespace: "default",
|
||||
Healthy: true,
|
||||
Message: "Running",
|
||||
},
|
||||
{
|
||||
Name: "db",
|
||||
Namespace: "default",
|
||||
Healthy: false,
|
||||
Message: "Starting",
|
||||
},
|
||||
},
|
||||
Workflow: &common.WorkflowStatus{
|
||||
Phase: workflowv1alpha1.WorkflowStateExecuting,
|
||||
Finished: false,
|
||||
AppRevision: "v1",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "application with no services",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app-empty",
|
||||
Namespace: "test",
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationStarting,
|
||||
Services: []common.ApplicationComponentStatus{},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "application with nil workflow",
|
||||
app: &v1beta1.Application{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "test-app-no-workflow",
|
||||
Namespace: "default",
|
||||
},
|
||||
Status: common.AppStatus{
|
||||
Phase: common.ApplicationUnhealthy,
|
||||
Services: []common.ApplicationComponentStatus{
|
||||
{
|
||||
Name: "failing-service",
|
||||
Healthy: false,
|
||||
},
|
||||
},
|
||||
Workflow: nil,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
r := &Reconciler{}
|
||||
ctx := context.Background()
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.NotPanics(t, func() {
|
||||
r.updateMetricsAndLog(ctx, tt.app)
|
||||
})
|
||||
|
||||
labels := prometheus.Labels{
|
||||
"app_name": tt.app.Name,
|
||||
"namespace": tt.app.Namespace,
|
||||
}
|
||||
|
||||
_, err := metrics.ApplicationHealthStatus.GetMetricWith(labels)
|
||||
assert.NoError(t, err)
|
||||
|
||||
_, err = metrics.ApplicationPhase.GetMetricWith(labels)
|
||||
assert.NoError(t, err)
|
||||
})
|
||||
}
|
||||
}
|
|
@ -114,6 +114,9 @@ const (
|
|||
|
||||
// EnableCueValidation enable strict cue validation fields for the required parameter field verification
|
||||
EnableCueValidation = "EnableCueValidation"
|
||||
|
||||
// EnableApplicationStatusMetrics enable the collection and export of application status metrics and structured logging
|
||||
EnableApplicationStatusMetrics = "EnableApplicationStatusMetrics"
|
||||
)
|
||||
|
||||
var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
|
||||
|
@ -139,6 +142,7 @@ var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
|
|||
SharedDefinitionStorageForApplicationRevision: {Default: true, PreRelease: featuregate.Alpha},
|
||||
DisableWorkflowContextConfigMapCache: {Default: true, PreRelease: featuregate.Alpha},
|
||||
EnableCueValidation: {Default: false, PreRelease: featuregate.Beta},
|
||||
EnableApplicationStatusMetrics: {Default: false, PreRelease: featuregate.Alpha},
|
||||
}
|
||||
|
||||
func init() {
|
||||
|
|
|
@ -66,6 +66,27 @@ var (
|
|||
Name: "workflow_step_phase_number",
|
||||
Help: "workflow step phase number",
|
||||
}, []string{"step_type", "phase"})
|
||||
|
||||
// ApplicationHealthStatus reports the overall health status of each application
|
||||
ApplicationHealthStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "kubevela_application_health_status",
|
||||
Help: "Application health status (1 = healthy, 0 = unhealthy)",
|
||||
}, []string{"app_name", "namespace"})
|
||||
|
||||
// ApplicationPhase reports the numeric phase of each application
|
||||
ApplicationPhase = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "kubevela_application_phase",
|
||||
Help: "Application phase as numeric value (0=starting, 1=running, 2=rendering, 3=policy_generating, 4=running_workflow, " +
|
||||
"5=workflow_suspending, 6=workflow_terminated, 7=workflow_failed, 8=unhealthy, 9=deleting, " +
|
||||
"-1=unknown)",
|
||||
}, []string{"app_name", "namespace"})
|
||||
|
||||
// WorkflowPhase reports the numeric phase of each workflow
|
||||
WorkflowPhase = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "kubevela_application_workflow_phase",
|
||||
Help: "Workflow phase as numeric value (0=initializing, 1=succeeded, 2=executing, 3=suspending, 4=terminated, " +
|
||||
"5=failed, 6=skipped, -1=unknown)",
|
||||
}, []string{"app_name", "namespace"})
|
||||
)
|
||||
|
||||
var (
|
||||
|
|
|
@ -15,10 +15,13 @@ package metrics
|
|||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"k8s.io/apiserver/pkg/util/feature"
|
||||
"k8s.io/klog/v2"
|
||||
"sigs.k8s.io/controller-runtime/pkg/metrics"
|
||||
|
||||
velametrics "github.com/kubevela/pkg/monitor/metrics"
|
||||
|
||||
"github.com/oam-dev/kubevela/pkg/features"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -53,6 +56,10 @@ var collectorGroup = []prometheus.Collector{
|
|||
ClusterCPUUsageGauge,
|
||||
}
|
||||
|
||||
// applicationStatusMetricsRegistered records whether the application status
// collectors have already been registered, so registration happens only once.
var applicationStatusMetricsRegistered bool
|
||||
|
||||
func init() {
|
||||
for _, collector := range collectorGroup {
|
||||
if err := metrics.Registry.Register(collector); err != nil {
|
||||
|
@ -60,3 +67,27 @@ func init() {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterApplicationStatusMetrics registers the application status metrics
|
||||
// This should be called after the feature gate system is initialized
|
||||
func RegisterApplicationStatusMetrics() {
|
||||
if applicationStatusMetricsRegistered {
|
||||
return
|
||||
}
|
||||
|
||||
if feature.DefaultMutableFeatureGate.Enabled(features.EnableApplicationStatusMetrics) {
|
||||
statusMetrics := []prometheus.Collector{
|
||||
ApplicationHealthStatus,
|
||||
ApplicationPhase,
|
||||
WorkflowPhase,
|
||||
}
|
||||
|
||||
for _, metric := range statusMetrics {
|
||||
if err := metrics.Registry.Register(metric); err != nil {
|
||||
klog.Errorf("Failed to register application status metric: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
applicationStatusMetricsRegistered = true
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue