mirror of https://github.com/minio/minio.git
				
				
				
			prom: Add drive failure tolerance per erasure set (#18424)
This commit is contained in:
		
							parent
							
								
									4598827dcb
								
							
						
					
					
						commit
						fe63664164
					
				|  | @ -2255,9 +2255,10 @@ type HealthOptions struct { | |||
| type HealthResult struct { | ||||
| 	Healthy       bool | ||||
| 	HealingDrives int | ||||
| 	UnhealthyPools []struct { | ||||
| 	ESHealth      []struct { | ||||
| 		Maintenance   bool | ||||
| 		PoolID, SetID int | ||||
| 		HealthyDrives int | ||||
| 		WriteQuorum   int | ||||
| 	} | ||||
| 	WriteQuorum   int | ||||
|  | @ -2372,50 +2373,40 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea | |||
| 	} | ||||
| 
 | ||||
| 	result := HealthResult{ | ||||
| 		HealingDrives: len(aggHealStateResult.HealDisks), | ||||
| 		Healthy:       true, | ||||
| 		WriteQuorum:   maximumWriteQuorum, | ||||
| 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | ||||
| 	} | ||||
| 
 | ||||
| 	for poolIdx := range erasureSetUpCount { | ||||
| 		for setIdx := range erasureSetUpCount[poolIdx] { | ||||
| 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] { | ||||
| 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo), | ||||
| 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d", | ||||
| 						poolIdx, setIdx, poolWriteQuorums[poolIdx])) | ||||
| 				result.UnhealthyPools = append(result.UnhealthyPools, struct { | ||||
| 			result.ESHealth = append(result.ESHealth, struct { | ||||
| 				Maintenance                bool | ||||
| 					PoolID, SetID, WriteQuorum int | ||||
| 				PoolID, SetID              int | ||||
| 				HealthyDrives, WriteQuorum int | ||||
| 			}{ | ||||
| 				Maintenance:   opts.Maintenance, | ||||
| 				SetID:         setIdx, | ||||
| 				PoolID:        poolIdx, | ||||
| 				HealthyDrives: erasureSetUpCount[poolIdx][setIdx], | ||||
| 				WriteQuorum:   poolWriteQuorums[poolIdx], | ||||
| 			}) | ||||
| 
 | ||||
| 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] { | ||||
| 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo), | ||||
| 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d", | ||||
| 						poolIdx, setIdx, poolWriteQuorums[poolIdx])) | ||||
| 				result.Healthy = false | ||||
| 			} | ||||
| 		} | ||||
| 		if len(result.UnhealthyPools) > 0 { | ||||
| 			// We have unhealthy pools return error.
 | ||||
| 	} | ||||
| 
 | ||||
| 	if opts.Maintenance { | ||||
| 		result.Healthy = result.Healthy && len(aggHealStateResult.HealDisks) == 0 | ||||
| 		result.HealingDrives = len(aggHealStateResult.HealDisks) | ||||
| 	} | ||||
| 
 | ||||
| 	return result | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	// when maintenance is not specified we don't have
 | ||||
| 	// to look at the healing side of the code.
 | ||||
| 	if !opts.Maintenance { | ||||
| 		return HealthResult{ | ||||
| 			Healthy:       true, | ||||
| 			WriteQuorum:   maximumWriteQuorum, | ||||
| 			UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return HealthResult{ | ||||
| 		Healthy:       len(aggHealStateResult.HealDisks) == 0, | ||||
| 		HealingDrives: len(aggHealStateResult.HealDisks), | ||||
| 		WriteQuorum:   maximumWriteQuorum, | ||||
| 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| // PutObjectMetadata - replace or add tags to an existing object
 | ||||
|  |  | |||
|  | @ -22,6 +22,7 @@ import ( | |||
| 	"fmt" | ||||
| 	"net/http" | ||||
| 	"runtime" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
| 	"sync" | ||||
| 	"sync/atomic" | ||||
|  | @ -3187,6 +3188,16 @@ func getClusterHealthStatusMD() MetricDescription { | |||
| 	} | ||||
| } | ||||
| 
 | ||||
// getClusterErasureSetToleranceMD returns the description of the
// "erasure_set_tolerance" gauge in the cluster "health" subsystem.
// In the health metrics hunk visible in this diff, the metric is emitted
// once per erasure set, labelled by "pool" and "set", with the value
// HealthyDrives - WriteQuorum.
| func getClusterErasureSetToleranceMD() MetricDescription { | ||||
| 	return MetricDescription{ | ||||
| 		Namespace: clusterMetricNamespace, | ||||
| 		Subsystem: "health", | ||||
| 		Name:      "erasure_set_tolerance", | ||||
| 		Help:      "Get erasure set tolerance status", | ||||
| 		Type:      gaugeMetric, | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func getClusterHealthMetrics() *MetricsGroup { | ||||
| 	mg := &MetricsGroup{ | ||||
| 		cacheInterval: 10 * time.Second, | ||||
|  | @ -3218,6 +3229,18 @@ func getClusterHealthMetrics() *MetricsGroup { | |||
| 			Value:       float64(health), | ||||
| 		}) | ||||
| 
 | ||||
| 		for _, h := range result.ESHealth { | ||||
| 			labels := map[string]string{ | ||||
| 				"pool": strconv.Itoa(h.PoolID), | ||||
| 				"set":  strconv.Itoa(h.SetID), | ||||
| 			} | ||||
| 			metrics = append(metrics, Metric{ | ||||
| 				Description:    getClusterErasureSetToleranceMD(), | ||||
| 				VariableLabels: labels, | ||||
| 				Value:          float64(h.HealthyDrives - h.WriteQuorum), | ||||
| 			}) | ||||
| 		} | ||||
| 
 | ||||
| 		return | ||||
| 	}) | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue