From d90a47656e7d9fe7352f70c38f8f81b7ab892f95 Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Sat, 17 Oct 2015 12:48:25 -0700
Subject: [PATCH] etcdserver: use Histogram for proposal_durations

---
 Documentation/metrics.md | 14 +++++++-------
 etcdserver/metrics.go    |  5 +++--
 etcdserver/server.go     |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Documentation/metrics.md b/Documentation/metrics.md
index cdb2e5256..8ba409a02 100644
--- a/Documentation/metrics.md
+++ b/Documentation/metrics.md
@@ -15,16 +15,16 @@ etcd now exposes the following metrics:
 
 ### etcdserver
 
-| Name                                    | Description                                       | Type    |
-|-----------------------------------------|---------------------------------------------------|---------|
-| file_descriptors_used_total             | The total number of file descriptors used         | Gauge   |
-| proposal_durations_milliseconds         | The latency distributions of committing proposal  | Summary |
-| pending_proposal_total                  | The total number of pending proposals             | Gauge   |
-| proposal_failed_total                   | The total number of failed proposals              | Counter |
+| Name                                    | Description                                       | Type      |
+|-----------------------------------------|---------------------------------------------------|-----------|
+| file_descriptors_used_total             | The total number of file descriptors used         | Gauge     |
+| proposal_durations_seconds              | The latency distributions of committing proposal  | Histogram |
+| pending_proposal_total                  | The total number of pending proposals             | Gauge     |
+| proposal_failed_total                   | The total number of failed proposals              | Counter   |
 
 High file descriptors (`file_descriptors_used_total`) usage (near the file descriptors limitation of the process) indicates a potential out of file descriptors issue. That might cause etcd fails to create new WAL files and panics.
 
-[Proposal](glossary.md#proposal) durations (`proposal_durations_milliseconds`) give you an summary about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram of the proposal commit latency. Latency can be introduced into this process by network and disk IO.
 
 Pending proposal (`pending_proposal_total`) gives you an idea about how many proposal are in the queue and waiting for commit. An increasing pending number indicates a high client load or an unstable cluster.
 
diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go
index 0544f3f1a..cf87a12d1 100644
--- a/etcdserver/metrics.go
+++ b/etcdserver/metrics.go
@@ -23,11 +23,12 @@ import (
 
 var (
 	// TODO: with label in v3?
-	proposeDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
-		Name:      "proposal_durations_milliseconds",
+		Name:      "proposal_durations_seconds",
 		Help:      "The latency distributions of committing proposal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
diff --git a/etcdserver/server.go b/etcdserver/server.go
index 92cf3a87e..79b808bf7 100644
--- a/etcdserver/server.go
+++ b/etcdserver/server.go
@@ -598,7 +598,7 @@ func (s *EtcdServer) Do(ctx context.Context, r pb.Request) (Response, error) {
 
 		select {
 		case x := <-ch:
-			proposeDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Millisecond)))
+			proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 			resp := x.(Response)
 			return resp, resp.err
 		case <-ctx.Done():
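
For readers unfamiliar with the prometheus Go client, here is a minimal, self-contained sketch of the pattern this patch adopts: a Histogram registered with exponential buckets and observed in seconds. The `demo` namespace, the metric name, and the `timedOperation` helper are illustrative assumptions, not part of etcd. Note that `prometheus.ExponentialBuckets(0.001, 2, 14)` produces 14 bucket boundaries starting at 1ms and doubling each step, topping out at 8.192s, matching the patch.

```go
// A runnable sketch of the pattern introduced by this patch: a Prometheus
// Histogram with exponential buckets, observed in seconds. The package,
// metric, and helper names here are illustrative, not taken from etcd.
package main

import (
	"log"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var opDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
	Namespace: "demo",
	Subsystem: "server",
	Name:      "operation_durations_seconds",
	Help:      "The latency distributions of a demo operation.",
	// Same bucket layout as the patch: 14 buckets starting at 1ms and
	// doubling each step, so the highest bucket boundary is 8.192s.
	Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})

func timedOperation() {
	start := time.Now()
	time.Sleep(5 * time.Millisecond) // stand-in for the real work
	// Duration.Seconds() is equivalent to the patch's
	// float64(time.Since(start)) / float64(time.Second).
	opDurations.Observe(time.Since(start).Seconds())
}

func main() {
	prometheus.MustRegister(opDurations)
	timedOperation()

	// Expose the metric; `curl localhost:8080/metrics` then shows the
	// demo_server_operation_durations_seconds_bucket series.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

The switch from Summary to Histogram also changes where quantiles are computed: a Summary calculates quantiles client-side and cannot be meaningfully aggregated across members, while Histogram buckets can be summed across instances and fed to PromQL's `histogram_quantile()` on the server side.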