mirror of https://github.com/minio/minio.git
				
				
				
			export cluster health as prometheus metrics (#17741)
This commit is contained in:
		
							parent
							
								
									c2edbfae55
								
							
						
					
					
						commit
						114fab4c70
					
				| 
						 | 
				
			
			@ -2062,8 +2062,12 @@ type HealthOptions struct {
 | 
			
		|||
// HealthResult is the outcome of an object layer Health call.
type HealthResult struct {
	// Healthy reports the overall verdict of the health check.
	Healthy bool

	// HealingDrives is the number of drives reported in the aggregated
	// heal state at the time of the check.
	HealingDrives int

	// UnhealthyPools lists every erasure set whose online drive count
	// fell below the write quorum of its pool.
	UnhealthyPools []struct {
		Maintenance bool
		PoolID      int
		SetID       int
		WriteQuorum int
	}

	// WriteQuorum is the maximum write quorum across all pools.
	WriteQuorum int

	// UsingDefaults indicates that config was not initialized and
	// defaults are being used on this node.
	UsingDefaults bool
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -2164,24 +2168,6 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
 | 
			
		|||
		usingDefaults = true
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for poolIdx := range erasureSetUpCount {
 | 
			
		||||
		for setIdx := range erasureSetUpCount[poolIdx] {
 | 
			
		||||
			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
 | 
			
		||||
				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 | 
			
		||||
					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 | 
			
		||||
						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
 | 
			
		||||
				return HealthResult{
 | 
			
		||||
					Healthy:       false,
 | 
			
		||||
					HealingDrives: len(aggHealStateResult.HealDisks),
 | 
			
		||||
					PoolID:        poolIdx,
 | 
			
		||||
					SetID:         setIdx,
 | 
			
		||||
					WriteQuorum:   poolWriteQuorums[poolIdx],
 | 
			
		||||
					UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	var maximumWriteQuorum int
 | 
			
		||||
	for _, writeQuorum := range poolWriteQuorums {
 | 
			
		||||
		if maximumWriteQuorum == 0 {
 | 
			
		||||
| 
						 | 
				
			
			@ -2192,6 +2178,35 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
 | 
			
		|||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	result := HealthResult{
 | 
			
		||||
		HealingDrives: len(aggHealStateResult.HealDisks),
 | 
			
		||||
		WriteQuorum:   maximumWriteQuorum,
 | 
			
		||||
		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for poolIdx := range erasureSetUpCount {
 | 
			
		||||
		for setIdx := range erasureSetUpCount[poolIdx] {
 | 
			
		||||
			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
 | 
			
		||||
				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 | 
			
		||||
					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 | 
			
		||||
						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
 | 
			
		||||
				result.UnhealthyPools = append(result.UnhealthyPools, struct {
 | 
			
		||||
					Maintenance                bool
 | 
			
		||||
					PoolID, SetID, WriteQuorum int
 | 
			
		||||
				}{
 | 
			
		||||
					Maintenance: opts.Maintenance,
 | 
			
		||||
					SetID:       setIdx,
 | 
			
		||||
					PoolID:      poolIdx,
 | 
			
		||||
					WriteQuorum: poolWriteQuorums[poolIdx],
 | 
			
		||||
				})
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		if len(result.UnhealthyPools) > 0 {
 | 
			
		||||
			// We have unhealthy pools, return the result collected so far.
 | 
			
		||||
			return result
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// when maintenance is not specified we don't have
 | 
			
		||||
	// to look at the healing side of the code.
 | 
			
		||||
	if !opts.Maintenance {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -28,22 +28,17 @@ import (
 | 
			
		|||
 | 
			
		||||
const unavailable = "offline"
 | 
			
		||||
 | 
			
		||||
func isServerNotInitialized() bool {
 | 
			
		||||
	return newObjectLayerFn() == nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// ClusterCheckHandler returns if the server is ready for requests.
 | 
			
		||||
func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 | 
			
		||||
	ctx := newContext(r, w, "ClusterCheckHandler")
 | 
			
		||||
 | 
			
		||||
	if isServerNotInitialized() {
 | 
			
		||||
	objLayer := newObjectLayerFn()
 | 
			
		||||
	if objLayer == nil {
 | 
			
		||||
		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 | 
			
		||||
		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	objLayer := newObjectLayerFn()
 | 
			
		||||
 | 
			
		||||
	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
 | 
			
		||||
	defer cancel()
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -52,16 +47,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 | 
			
		|||
		DeploymentType: r.Form.Get("deployment-type"),
 | 
			
		||||
	}
 | 
			
		||||
	result := objLayer.Health(ctx, opts)
 | 
			
		||||
	if result.WriteQuorum > 0 {
 | 
			
		||||
	w.Header().Set(xhttp.MinIOWriteQuorum, strconv.Itoa(result.WriteQuorum))
 | 
			
		||||
	}
 | 
			
		||||
	w.Header().Set(xhttp.MinIOStorageClassDefaults, strconv.FormatBool(result.UsingDefaults))
 | 
			
		||||
 | 
			
		||||
	if !result.Healthy {
 | 
			
		||||
	// return how many drives are being healed if any
 | 
			
		||||
	if result.HealingDrives > 0 {
 | 
			
		||||
		w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
 | 
			
		||||
	}
 | 
			
		||||
	if !result.Healthy {
 | 
			
		||||
		// As a maintenance call we are purposefully asked to be taken
 | 
			
		||||
		// down, this is for orchestrators to know if we can safely
 | 
			
		||||
		// take this server down, return appropriate error.
 | 
			
		||||
| 
						 | 
				
			
			@ -79,14 +71,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 | 
			
		|||
func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
 | 
			
		||||
	ctx := newContext(r, w, "ClusterReadCheckHandler")
 | 
			
		||||
 | 
			
		||||
	if isServerNotInitialized() {
 | 
			
		||||
	objLayer := newObjectLayerFn()
 | 
			
		||||
	if objLayer == nil {
 | 
			
		||||
		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 | 
			
		||||
		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	objLayer := newObjectLayerFn()
 | 
			
		||||
 | 
			
		||||
	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
 | 
			
		||||
	defer cancel()
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -106,17 +97,17 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) {
 | 
			
		|||
 | 
			
		||||
// LivenessCheckHandler - Checks if the process is up. Always returns success.
 | 
			
		||||
func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) {
 | 
			
		||||
	peerCall := r.Header.Get("x-minio-from-peer") != ""
 | 
			
		||||
 | 
			
		||||
	if peerCall {
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if isServerNotInitialized() {
 | 
			
		||||
	objLayer := newObjectLayerFn()
 | 
			
		||||
	if objLayer == nil {
 | 
			
		||||
		// Service not initialized yet
 | 
			
		||||
		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	peerCall := r.Header.Get(xhttp.MinIOPeerCall) != ""
 | 
			
		||||
	if peerCall {
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if int(globalHTTPStats.loadRequestsInQueue()) > globalAPIConfig.getRequestsPoolCapacity() {
 | 
			
		||||
		apiErr := getAPIError(ErrBusy)
 | 
			
		||||
		switch r.Method {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -54,6 +54,7 @@ func init() {
 | 
			
		|||
		getClusterTierMetrics(),
 | 
			
		||||
		getClusterUsageMetrics(),
 | 
			
		||||
		getKMSMetrics(),
 | 
			
		||||
		getClusterHealthMetrics(),
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	peerMetricsGroups = []*MetricsGroup{
 | 
			
		||||
| 
						 | 
				
			
			@ -2642,6 +2643,63 @@ func getLocalDriveStorageMetrics() *MetricsGroup {
 | 
			
		|||
	return mg
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getClusterWriteQuorumMD() MetricDescription {
 | 
			
		||||
	return MetricDescription{
 | 
			
		||||
		Namespace: clusterMetricNamespace,
 | 
			
		||||
		Subsystem: "write",
 | 
			
		||||
		Name:      "quorum",
 | 
			
		||||
		Help:      "Maximum write quorum across all pools and sets",
 | 
			
		||||
		Type:      gaugeMetric,
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getClusterHealthStatusMD() MetricDescription {
 | 
			
		||||
	return MetricDescription{
 | 
			
		||||
		Namespace: clusterMetricNamespace,
 | 
			
		||||
		Subsystem: "health",
 | 
			
		||||
		Name:      "status",
 | 
			
		||||
		Help:      "Get current cluster health status",
 | 
			
		||||
		Type:      gaugeMetric,
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getClusterHealthMetrics() *MetricsGroup {
 | 
			
		||||
	mg := &MetricsGroup{
 | 
			
		||||
		cacheInterval: 10 * time.Second,
 | 
			
		||||
	}
 | 
			
		||||
	mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
 | 
			
		||||
		objLayer := newObjectLayerFn()
 | 
			
		||||
		// Service not initialized yet
 | 
			
		||||
		if objLayer == nil {
 | 
			
		||||
			return
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		metrics = make([]Metric, 0, 2)
 | 
			
		||||
 | 
			
		||||
		opts := HealthOptions{}
 | 
			
		||||
		result := objLayer.Health(ctx, opts)
 | 
			
		||||
 | 
			
		||||
		metrics = append(metrics, Metric{
 | 
			
		||||
			Description: getClusterWriteQuorumMD(),
 | 
			
		||||
			Value:       float64(result.WriteQuorum),
 | 
			
		||||
		})
 | 
			
		||||
 | 
			
		||||
		health := 1
 | 
			
		||||
		if !result.Healthy {
 | 
			
		||||
			health = 0
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		metrics = append(metrics, Metric{
 | 
			
		||||
			Description: getClusterHealthStatusMD(),
 | 
			
		||||
			Value:       float64(health),
 | 
			
		||||
		})
 | 
			
		||||
 | 
			
		||||
		return
 | 
			
		||||
	})
 | 
			
		||||
 | 
			
		||||
	return mg
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getClusterStorageMetrics() *MetricsGroup {
 | 
			
		||||
	mg := &MetricsGroup{
 | 
			
		||||
		cacheInterval: 1 * time.Minute,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -138,8 +138,8 @@ func isServerResolvable(endpoint Endpoint, timeout time.Duration) error {
 | 
			
		|||
	if err != nil {
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	req.Header.Set("x-minio-from-peer", "true")
 | 
			
		||||
	// Indicate that the liveness check is for a peer call
 | 
			
		||||
	req.Header.Set(xhttp.MinIOPeerCall, "true")
 | 
			
		||||
 | 
			
		||||
	resp, err := httpClient.Do(req)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,6 +40,8 @@ These metrics can be obtained from any MinIO server once per collection.
 | 
			
		|||
| `minio_cluster_kms_uptime`                    | The time the KMS has been up and running in seconds.                                                            |
 | 
			
		||||
| `minio_cluster_nodes_offline_total`           | Total number of MinIO nodes offline.                                                                            |
 | 
			
		||||
| `minio_cluster_nodes_online_total`            | Total number of MinIO nodes online.                                                                             |
 | 
			
		||||
| `minio_cluster_write_quorum`                  | Maximum write quorum across all pools and sets.                                                                 |
 | 
			
		||||
| `minio_cluster_health_status`                 | Current cluster health status: 1 when healthy, 0 otherwise.                                                     |
 | 
			
		||||
| `minio_heal_objects_errors_total`             | Objects for which healing failed in current self healing run.                                                   |
 | 
			
		||||
| `minio_heal_objects_heal_total`               | Objects healed in current self healing run.                                                                     |
 | 
			
		||||
| `minio_heal_objects_total`                    | Objects scanned in current self healing run.                                                                    |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -152,6 +152,9 @@ const (
 | 
			
		|||
	// Deployment id.
 | 
			
		||||
	MinioDeploymentID = "x-minio-deployment-id"
 | 
			
		||||
 | 
			
		||||
	// Peer call
 | 
			
		||||
	MinIOPeerCall = "x-minio-from-peer"
 | 
			
		||||
 | 
			
		||||
	// Server-Status
 | 
			
		||||
	MinIOServerStatus = "x-minio-server-status"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue