mirror of https://github.com/minio/minio.git
				
				
				
			prom: Add drive failure tolerance per erasure set (#18424)
This commit is contained in:
		
							parent 4598827dcb
							commit fe63664164
					
				|  | @ -2253,11 +2253,12 @@ type HealthOptions struct { | ||||||
| // additionally with any specific heuristic information which
 | // additionally with any specific heuristic information which
 | ||||||
| // was queried
 | // was queried
 | ||||||
| type HealthResult struct { | type HealthResult struct { | ||||||
| 	Healthy        bool | 	Healthy       bool | ||||||
| 	HealingDrives  int | 	HealingDrives int | ||||||
| 	UnhealthyPools []struct { | 	ESHealth      []struct { | ||||||
| 		Maintenance   bool | 		Maintenance   bool | ||||||
| 		PoolID, SetID int | 		PoolID, SetID int | ||||||
|  | 		HealthyDrives int | ||||||
| 		WriteQuorum   int | 		WriteQuorum   int | ||||||
| 	} | 	} | ||||||
| 	WriteQuorum   int | 	WriteQuorum   int | ||||||
|  | @ -2372,50 +2373,40 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	result := HealthResult{ | 	result := HealthResult{ | ||||||
| 		HealingDrives: len(aggHealStateResult.HealDisks), | 		Healthy:       true, | ||||||
| 		WriteQuorum:   maximumWriteQuorum, | 		WriteQuorum:   maximumWriteQuorum, | ||||||
| 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	for poolIdx := range erasureSetUpCount { | 	for poolIdx := range erasureSetUpCount { | ||||||
| 		for setIdx := range erasureSetUpCount[poolIdx] { | 		for setIdx := range erasureSetUpCount[poolIdx] { | ||||||
|  | 			result.ESHealth = append(result.ESHealth, struct { | ||||||
|  | 				Maintenance                bool | ||||||
|  | 				PoolID, SetID              int | ||||||
|  | 				HealthyDrives, WriteQuorum int | ||||||
|  | 			}{ | ||||||
|  | 				Maintenance:   opts.Maintenance, | ||||||
|  | 				SetID:         setIdx, | ||||||
|  | 				PoolID:        poolIdx, | ||||||
|  | 				HealthyDrives: erasureSetUpCount[poolIdx][setIdx], | ||||||
|  | 				WriteQuorum:   poolWriteQuorums[poolIdx], | ||||||
|  | 			}) | ||||||
|  | 
 | ||||||
| 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] { | 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] { | ||||||
| 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo), | 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo), | ||||||
| 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d", | 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d", | ||||||
| 						poolIdx, setIdx, poolWriteQuorums[poolIdx])) | 						poolIdx, setIdx, poolWriteQuorums[poolIdx])) | ||||||
| 				result.UnhealthyPools = append(result.UnhealthyPools, struct { | 				result.Healthy = false | ||||||
| 					Maintenance                bool |  | ||||||
| 					PoolID, SetID, WriteQuorum int |  | ||||||
| 				}{ |  | ||||||
| 					Maintenance: opts.Maintenance, |  | ||||||
| 					SetID:       setIdx, |  | ||||||
| 					PoolID:      poolIdx, |  | ||||||
| 					WriteQuorum: poolWriteQuorums[poolIdx], |  | ||||||
| 				}) |  | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 		if len(result.UnhealthyPools) > 0 { |  | ||||||
| 			// We have unhealthy pools return error.
 |  | ||||||
| 			return result |  | ||||||
| 		} |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	// when maintenance is not specified we don't have
 | 	if opts.Maintenance { | ||||||
| 	// to look at the healing side of the code.
 | 		result.Healthy = result.Healthy && len(aggHealStateResult.HealDisks) == 0 | ||||||
| 	if !opts.Maintenance { | 		result.HealingDrives = len(aggHealStateResult.HealDisks) | ||||||
| 		return HealthResult{ |  | ||||||
| 			Healthy:       true, |  | ||||||
| 			WriteQuorum:   maximumWriteQuorum, |  | ||||||
| 			UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 |  | ||||||
| 		} |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return HealthResult{ | 	return result | ||||||
| 		Healthy:       len(aggHealStateResult.HealDisks) == 0, |  | ||||||
| 		HealingDrives: len(aggHealStateResult.HealDisks), |  | ||||||
| 		WriteQuorum:   maximumWriteQuorum, |  | ||||||
| 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 |  | ||||||
| 	} |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // PutObjectMetadata - replace or add tags to an existing object
 | // PutObjectMetadata - replace or add tags to an existing object
 | ||||||
|  |  | ||||||
|  | @ -22,6 +22,7 @@ import ( | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"net/http" | 	"net/http" | ||||||
| 	"runtime" | 	"runtime" | ||||||
|  | 	"strconv" | ||||||
| 	"strings" | 	"strings" | ||||||
| 	"sync" | 	"sync" | ||||||
| 	"sync/atomic" | 	"sync/atomic" | ||||||
|  | @ -3187,6 +3188,16 @@ func getClusterHealthStatusMD() MetricDescription { | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | func getClusterErasureSetToleranceMD() MetricDescription { | ||||||
|  | 	return MetricDescription{ | ||||||
|  | 		Namespace: clusterMetricNamespace, | ||||||
|  | 		Subsystem: "health", | ||||||
|  | 		Name:      "erasure_set_tolerance", | ||||||
|  | 		Help:      "Get erasure set tolerance status", | ||||||
|  | 		Type:      gaugeMetric, | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
| func getClusterHealthMetrics() *MetricsGroup { | func getClusterHealthMetrics() *MetricsGroup { | ||||||
| 	mg := &MetricsGroup{ | 	mg := &MetricsGroup{ | ||||||
| 		cacheInterval: 10 * time.Second, | 		cacheInterval: 10 * time.Second, | ||||||
|  | @ -3218,6 +3229,18 @@ func getClusterHealthMetrics() *MetricsGroup { | ||||||
| 			Value:       float64(health), | 			Value:       float64(health), | ||||||
| 		}) | 		}) | ||||||
| 
 | 
 | ||||||
|  | 		for _, h := range result.ESHealth { | ||||||
|  | 			labels := map[string]string{ | ||||||
|  | 				"pool": strconv.Itoa(h.PoolID), | ||||||
|  | 				"set":  strconv.Itoa(h.SetID), | ||||||
|  | 			} | ||||||
|  | 			metrics = append(metrics, Metric{ | ||||||
|  | 				Description:    getClusterErasureSetToleranceMD(), | ||||||
|  | 				VariableLabels: labels, | ||||||
|  | 				Value:          float64(h.HealthyDrives - h.WriteQuorum), | ||||||
|  | 			}) | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
| 		return | 		return | ||||||
| 	}) | 	}) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue