// Copyright 2013 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"flag"
	"log"
	"os"
	"os/signal"
	"sync"
	"time"

	"github.com/prometheus/prometheus/config"
	"github.com/prometheus/prometheus/retrieval"
	"github.com/prometheus/prometheus/retrieval/format"
	"github.com/prometheus/prometheus/rules"
	"github.com/prometheus/prometheus/storage/metric"
	"github.com/prometheus/prometheus/storage/raw/leveldb"
	"github.com/prometheus/prometheus/web"
	"github.com/prometheus/prometheus/web/api"
)

const (
	// deletionBatchSize bounds how many stale samples a single deletion
	// mutation batch may remove.
	deletionBatchSize = 100
)

// Command-line flags.
var (
	printVersion                 = flag.Bool("version", false, "print version information")
	configFile                   = flag.String("configFile", "prometheus.conf", "Prometheus configuration file name.")
	metricsStoragePath           = flag.String("metricsStoragePath", "/tmp/metrics", "Base path for metrics storage.")
	scrapeResultsQueueCapacity   = flag.Int("scrapeResultsQueueCapacity", 4096, "The size of the scrape results queue.")
	ruleResultsQueueCapacity     = flag.Int("ruleResultsQueueCapacity", 4096, "The size of the rule results queue.")
	concurrentRetrievalAllowance = flag.Int("concurrentRetrievalAllowance", 15, "The number of concurrent metrics retrieval requests allowed.")
	diskAppendQueueCapacity      = flag.Int("queue.diskAppendCapacity", 1000000, "The size of the queue for items that are pending writing to disk.")
	memoryAppendQueueCapacity    = flag.Int("queue.memoryAppendCapacity", 10000, "The size of the queue for items that are pending writing to memory.")

	headCompactInterval = flag.Duration("compact.headInterval", 10*3*time.Minute, "The amount of time between head compactions.")
	bodyCompactInterval = flag.Duration("compact.bodyInterval", 10*5*time.Minute, "The amount of time between body compactions.")
	tailCompactInterval = flag.Duration("compact.tailInterval", 10*7*time.Minute, "The amount of time between tail compactions.")

	headGroupSize = flag.Int("compact.headGroupSize", 50, "The minimum group size for head samples.")
	bodyGroupSize = flag.Int("compact.bodyGroupSize", 250, "The minimum group size for body samples.")
	tailGroupSize = flag.Int("compact.tailGroupSize", 5000, "The minimum group size for tail samples.")

	headAge = flag.Duration("compact.headAgeInclusiveness", 5*time.Minute, "The relative inclusiveness of head samples.")
	bodyAge = flag.Duration("compact.bodyAgeInclusiveness", time.Hour, "The relative inclusiveness of body samples.")
	tailAge = flag.Duration("compact.tailAgeInclusiveness", 24*time.Hour, "The relative inclusiveness of tail samples.")

	deleteInterval = flag.Duration("delete.interval", 10*11*time.Minute, "The amount of time between deletion of old values.")

	deleteAge = flag.Duration("delete.ageMaximum", 10*24*time.Hour, "The relative maximum age for values before they are deleted.")

	arenaFlushInterval = flag.Duration("arena.flushInterval", 15*time.Minute, "The period at which the in-memory arena is flushed to disk.")
	arenaTTL           = flag.Duration("arena.ttl", 10*time.Minute, "The relative age of values to purge to disk from memory.")
)
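
// An example invocation overriding a few of these flags (the binary name and
// paths are hypothetical; anything omitted keeps the default shown above):
//
//	./prometheus \
//	    -configFile=/etc/prometheus/prometheus.conf \
//	    -metricsStoragePath=/var/lib/prometheus/metrics \
//	    -compact.headInterval=45m \
//	    -delete.ageMaximum=360h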

// prometheus bundles the long-lived server state: the curation and reporting
// tickers, the channels carrying scrape and rule results, and the tiered
// storage everything is written to.
type prometheus struct {
	headCompactionTimer      *time.Ticker
	bodyCompactionTimer      *time.Ticker
	tailCompactionTimer      *time.Ticker
	deletionTimer            *time.Ticker
	reportDatabasesTimer     *time.Ticker
	curationMutex            sync.Mutex
	curationState            chan metric.CurationState
	databaseStates           chan []leveldb.DatabaseState
	stopBackgroundOperations chan bool

	ruleResults   chan *rules.Result
	scrapeResults chan format.Result

	storage *metric.TieredStorage
}

// interruptHandler blocks until SIGINT arrives and then shuts the server
// down cleanly.
func (p *prometheus) interruptHandler() {
	// The channel is buffered because signal delivery does not block: with
	// an unbuffered channel, a signal arriving before the receive below
	// would be dropped.
	notifier := make(chan os.Signal, 1)
	signal.Notify(notifier, os.Interrupt)

	<-notifier

	log.Println("Received SIGINT; exiting gracefully...")
	p.close()
	os.Exit(0)
}
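
// Note: a deployment that also wants to shut down cleanly on SIGTERM could
// register it alongside os.Interrupt (hypothetical extension; it would need
// the syscall import):
//
//	signal.Notify(notifier, os.Interrupt, syscall.SIGTERM)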

// compact curates samples older than olderThan, merging them into groups of
// at least groupSize. Only one curation pass may run at a time.
func (p *prometheus) compact(olderThan time.Duration, groupSize int) error {
	p.curationMutex.Lock()
	defer p.curationMutex.Unlock()

	processor := &metric.CompactionProcessor{
		MaximumMutationPoolBatch: groupSize * 3,
		MinimumGroupSize:         groupSize,
	}

	curator := metric.Curator{
		Stop: p.stopBackgroundOperations,
	}

	return curator.Run(olderThan, time.Now(), processor, p.storage.DiskStorage.CurationRemarks, p.storage.DiskStorage.MetricSamples, p.storage.DiskStorage.MetricHighWatermarks, p.curationState)
}

// delete curates samples older than olderThan out of storage entirely,
// removing at most batchSize samples per mutation batch.
func (p *prometheus) delete(olderThan time.Duration, batchSize int) error {
	p.curationMutex.Lock()
	defer p.curationMutex.Unlock()

	processor := &metric.DeletionProcessor{
		MaximumMutationPoolBatch: batchSize,
	}

	curator := metric.Curator{
		Stop: p.stopBackgroundOperations,
	}

	return curator.Run(olderThan, time.Now(), processor, p.storage.DiskStorage.CurationRemarks, p.storage.DiskStorage.MetricSamples, p.storage.DiskStorage.MetricHighWatermarks, p.curationState)
}

// close stops the background timers, signals any running curation pass to
// stop, and tears down the storage and channels.
func (p *prometheus) close() {
	if p.headCompactionTimer != nil {
		p.headCompactionTimer.Stop()
	}
	if p.bodyCompactionTimer != nil {
		p.bodyCompactionTimer.Stop()
	}
	if p.tailCompactionTimer != nil {
		p.tailCompactionTimer.Stop()
	}
	if p.deletionTimer != nil {
		p.deletionTimer.Stop()
	}

	if p.reportDatabasesTimer != nil {
		p.reportDatabasesTimer.Stop()
	}

	// Non-blocking send. Checking len() before sending is racy, since the
	// buffer could fill between the check and the send; a select with a
	// default case never blocks.
	select {
	case p.stopBackgroundOperations <- true:
	default:
	}

	// Wait for any in-flight curation pass to finish before tearing down
	// the storage beneath it. The mutex is deliberately never released.
	p.curationMutex.Lock()

	p.storage.Close()
	close(p.stopBackgroundOperations)
	close(p.curationState)
	close(p.databaseStates)
}

// reportDatabaseState periodically publishes the LevelDB database states so
// the web layer can render them.
func (p *prometheus) reportDatabaseState() {
	for _ = range p.reportDatabasesTimer.C {
		// BUG(matt): Per Julius, ...
		// These channel magic tricks confuse me and seem a bit awkward just to
		// pass a status around. Now that we have Go 1.1, would it maybe be
		// nicer to pass ts.DiskStorage.States as a method value
		// (http://tip.golang.org/ref/spec#Method_values) to the web layer
		// instead of doing this?
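		//
		// A method-value sketch of that idea (hypothetical; DatabasesHandler
		// has no such field in this tree) might look like:
		//
		//	databasesHandler := &web.DatabasesHandler{
		//		States: p.storage.DiskStorage.States,
		//	}
		//
		// with States declared as a func() []leveldb.DatabaseState, letting
		// the web layer pull fresh state on demand instead of receiving it
		// through this channel dance.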
		select {
		case <-p.databaseStates:
			// Reset the future database state if nobody consumes it.
		case p.databaseStates <- p.storage.DiskStorage.States():
			// Set the database state so someone can consume it if they want.
		default:
			// Don't block.
		}
	}
}

func main() {
	// TODO(all): Future additions to main should be, where applicable, lumped
	// into the prometheus struct above---at least where the scoping of the
	// entire server is concerned.
	flag.Parse()

	versionInfoTmpl.Execute(os.Stdout, BuildInfo)

	if *printVersion {
		os.Exit(0)
	}

	conf, err := config.LoadFromFile(*configFile)
	if err != nil {
		log.Fatalf("Error loading configuration from %s: %v", *configFile, err)
	}

	ts, err := metric.NewTieredStorage(uint(*diskAppendQueueCapacity), 100, *arenaFlushInterval, *arenaTTL, *metricsStoragePath)
	if err != nil {
		log.Fatalf("Error opening storage: %s", err)
	}
	if ts == nil {
		log.Fatalln("Nil tiered storage.")
	}

	scrapeResults := make(chan format.Result, *scrapeResultsQueueCapacity)
	ruleResults := make(chan *rules.Result, *ruleResultsQueueCapacity)
	curationState := make(chan metric.CurationState, 1)
	databaseStates := make(chan []leveldb.DatabaseState, 1)
	// Coprime numbers, fool!
	headCompactionTimer := time.NewTicker(*headCompactInterval)
	bodyCompactionTimer := time.NewTicker(*bodyCompactInterval)
	tailCompactionTimer := time.NewTicker(*tailCompactInterval)
	deletionTimer := time.NewTicker(*deleteInterval)
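
	// With the default intervals above, these tick at 30, 50, 70, and 110
	// minutes. The multipliers 3, 5, 7, and 11 are pairwise coprime, so the
	// timers rarely line up: all four coincide only every
	// lcm(30, 50, 70, 110) = 11550 minutes, i.e. roughly once every eight
	// days.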

	// Queue depth will need to be exposed
	targetManager := retrieval.NewTargetManager(scrapeResults, *concurrentRetrievalAllowance)
	targetManager.AddTargetsFromConfig(conf)

	flags := map[string]string{}

	flag.VisitAll(func(f *flag.Flag) {
		flags[f.Name] = f.Value.String()
	})

	statusHandler := &web.StatusHandler{
		PrometheusStatus: &web.PrometheusStatus{
			BuildInfo:   BuildInfo,
			Config:      conf.String(),
			TargetPools: targetManager.Pools(),
			Flags:       flags,
			Birth:       time.Now(),
		},
		CurationState: curationState,
	}

	databasesHandler := &web.DatabasesHandler{
		Incoming: databaseStates,
	}

	metricsService := &api.MetricsService{
		Config:        &conf,
		TargetManager: targetManager,
		Storage:       ts,
	}

	webService := &web.WebService{
		StatusHandler:    statusHandler,
		MetricsHandler:   metricsService,
		DatabasesHandler: databasesHandler,
	}

	prometheus := prometheus{
		bodyCompactionTimer: bodyCompactionTimer,
		headCompactionTimer: headCompactionTimer,
		tailCompactionTimer: tailCompactionTimer,

		deletionTimer: deletionTimer,

		reportDatabasesTimer: time.NewTicker(15 * time.Minute),

		curationState:  curationState,
		databaseStates: databaseStates,

		ruleResults:   ruleResults,
		scrapeResults: scrapeResults,

		stopBackgroundOperations: make(chan bool, 1),

		storage: ts,
	}
	defer prometheus.close()

	go ts.Serve()
	go prometheus.interruptHandler()
	go prometheus.reportDatabaseState()

	go func() {
		for _ = range prometheus.headCompactionTimer.C {
			log.Println("Starting head compaction...")
			err := prometheus.compact(*headAge, *headGroupSize)

			if err != nil {
				log.Printf("could not compact due to %s", err)
			}
			log.Println("Done")
		}
	}()

	go func() {
		for _ = range prometheus.bodyCompactionTimer.C {
			log.Println("Starting body compaction...")
			err := prometheus.compact(*bodyAge, *bodyGroupSize)

			if err != nil {
				log.Printf("could not compact due to %s", err)
			}
			log.Println("Done")
		}
	}()

	go func() {
		for _ = range prometheus.tailCompactionTimer.C {
			log.Println("Starting tail compaction...")
			err := prometheus.compact(*tailAge, *tailGroupSize)

			if err != nil {
				log.Printf("could not compact due to %s", err)
			}
			log.Println("Done")
		}
	}()

	go func() {
		for _ = range prometheus.deletionTimer.C {
			log.Println("Starting deletion of stale values...")
			err := prometheus.delete(*deleteAge, deletionBatchSize)

			if err != nil {
				log.Printf("could not delete due to %s", err)
			}
			log.Println("Done")
		}
	}()
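
	// The four goroutines above share the same shape; a hypothetical helper
	// (sketch only, not part of this tree) could collapse the duplication:
	//
	//	runOnTick := func(c <-chan time.Time, name string, f func() error) {
	//		for _ = range c {
	//			log.Printf("Starting %s...", name)
	//			if err := f(); err != nil {
	//				log.Printf("could not %s due to %s", name, err)
	//			}
	//			log.Println("Done")
	//		}
	//	}
	//	go runOnTick(prometheus.headCompactionTimer.C, "head compaction", func() error {
	//		return prometheus.compact(*headAge, *headGroupSize)
	//	})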

	// Queue depth will need to be exposed
	ruleManager := rules.NewRuleManager(ruleResults, conf.EvaluationInterval(), ts)
	err = ruleManager.AddRulesFromConfig(conf)
	if err != nil {
		log.Fatalf("Error loading rule files: %v", err)
	}

	go func() {
		err := webService.ServeForever()
		if err != nil {
			log.Fatal(err)
		}
	}()

	// TODO(all): Migrate this into prometheus.serve().
	for {
		select {
		case scrapeResult := <-scrapeResults:
			if scrapeResult.Err == nil {
				ts.AppendSamples(scrapeResult.Samples)
			}

		case ruleResult := <-ruleResults:
			if ruleResult.Err == nil {
				ts.AppendSamples(ruleResult.Samples)
			}
		}
	}
}