Go语言中的Prometheus监控实战

2026-03-30 09:59:01 作者：王码码2035哦

本文主要介绍了Go语言中的Prometheus监控实战，文中通过示例代码介绍得非常详细，对大家的学习或者工作具有一定的参考学习价值，需要的朋友们下面随着小编来一起学习学习吧。

Prometheus是云原生时代最流行的监控系统之一，以其强大的数据模型和查询语言深受开发者喜爱。本文将深入介绍如何在Go语言应用中集成Prometheus监控，从基础指标到高级告警，帮助你构建完善的可观测性体系。

Prometheus核心概念

Metrics（指标）：被监控的数据点，如CPU使用率、请求延迟等
Labels（标签）：用于区分不同维度数据的键值对
Targets（目标）：被监控的应用实例
Exporters（导出器）：将第三方系统数据转换为Prometheus格式
Alertmanager（告警管理器）：处理告警通知

快速开始

安装依赖

go get github.com/prometheus/client_golang/prometheus
go get github.com/prometheus/client_golang/prometheus/promhttp

基础指标暴露

package main

import (
    "log"
    "net/http"
    "sync"

    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// main exposes the promhttp handler on /metrics and then blocks serving HTTP
// on :8080.
func main() {
    metricsHandler := promhttp.Handler()
    http.Handle("/metrics", metricsHandler)

    // ListenAndServe only returns when the server fails, so treat any
    // returned error as fatal.
    err := http.ListenAndServe(":8080", nil)
    log.Fatal(err)
}

指标类型详解

Counter（计数器）

import "github.com/prometheus/client_golang/prometheus"
// requestCounter counts HTTP requests. Each sample is partitioned by the
// "method", "endpoint" and "status" labels, in that order.
var requestCounter = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Help: "Total number of HTTP requests",
        Name: "http_requests_total",
    },
    []string{
        "method",
        "endpoint",
        "status",
    },
)
// init registers requestCounter with the default registerer at program start.
func init() {
    prometheus.DefaultRegisterer.MustRegister(requestCounter)
}
// 使用计数器
func handleRequest(w http.ResponseWriter, r *http.Request) {
    start := time.Now()
    // 处理请求
    status := "200"
    if err := processRequest(r); err != nil {
        status = "500"
        http.Error(w, err.Error(), http.StatusInternalServerError)
    }
    // 记录指标
    requestCounter.WithLabelValues(r.Method, r.URL.Path, status).Inc()
}

Gauge（仪表盘）

// Gauges used by the connection and queue examples.
var (
    // activeConnections is a single unlabelled gauge; handleConnection
    // raises it on entry and lowers it on exit.
    activeConnections = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "active_connections",
        Help: "Number of active connections",
    })

    // queueSize holds one gauge per "queue_name" label value.
    queueSize = prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "queue_size",
            Help: "Current size of the queue",
        },
        []string{"queue_name"},
    )
)
// init registers both gauges, activeConnections first and queueSize second.
func init() {
    prometheus.MustRegister(activeConnections)
    prometheus.MustRegister(queueSize)
}
// handleConnection keeps activeConnections one higher for as long as the
// function is running: the gauge is raised on entry and lowered again by the
// deferred call when the function returns.
func handleConnection(conn net.Conn) {
    activeConnections.Add(1)
    defer activeConnections.Sub(1)

    // Handle the connection here.
}
// updateQueueSize sets the queueSize gauge labelled with name to size.
func updateQueueSize(name string, size int) {
    gauge := queueSize.WithLabelValues(name)
    gauge.Set(float64(size))
}

Histogram（直方图）

// Histograms used by the request and database examples.
var (
    // requestDuration observes HTTP request durations in seconds, labelled
    // by "method" and "endpoint". It uses the library's default buckets:
    // .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10.
    requestDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "HTTP request duration in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "endpoint"},
    )

    // dbQueryDuration observes database query durations in seconds, labelled
    // by "query_type". Its custom buckets run from 1ms up to 1s.
    dbQueryDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "db_query_duration_seconds",
            Help:    "Database query duration in seconds",
            Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1},
        },
        []string{"query_type"},
    )
)

// init registers the two histograms, requestDuration first.
func init() {
    for _, histogram := range []prometheus.Collector{requestDuration, dbQueryDuration} {
        prometheus.MustRegister(histogram)
    }
}

// handleRequest times processRequest and records the elapsed seconds in the
// requestDuration histogram under the request's method and URL path.
func handleRequest(w http.ResponseWriter, r *http.Request) {
    begin := time.Now()

    processRequest(r)

    // Observe only after the work has finished so the sample covers all of it.
    elapsed := time.Since(begin)
    observer := requestDuration.WithLabelValues(r.Method, r.URL.Path)
    observer.Observe(elapsed.Seconds())
}

Summary（摘要）

// requestLatency is a summary of HTTP request latency in seconds, labelled by
// "method" and "endpoint". Objectives maps each tracked quantile to its
// allowed error: p50 ±0.05, p90 ±0.01, p99 ±0.001.
var requestLatency = prometheus.NewSummaryVec(
    prometheus.SummaryOpts{
        Name: "http_request_latency_seconds",
        Help: "HTTP request latency in seconds",
        Objectives: map[float64]float64{
            0.5:  0.05,
            0.9:  0.01,
            0.99: 0.001,
        },
    },
    []string{"method", "endpoint"},
)

// init registers requestLatency with the default registerer.
func init() {
    prometheus.DefaultRegisterer.MustRegister(requestLatency)
}

// handleRequest runs processRequest and feeds the time it took, in seconds,
// into the requestLatency summary for the request's method and URL path.
func handleRequest(w http.ResponseWriter, r *http.Request) {
    startedAt := time.Now()

    processRequest(r)

    seconds := time.Since(startedAt).Seconds()
    summary := requestLatency.WithLabelValues(r.Method, r.URL.Path)
    summary.Observe(seconds)
}

HTTP中间件集成

// PrometheusMiddleware wraps next so that every request it serves is counted
// in requestCounter (method, path, status code) and timed in requestDuration
// (method, path).
func PrometheusMiddleware(next http.Handler) http.Handler {
    instrumented := func(w http.ResponseWriter, r *http.Request) {
        began := time.Now()

        // Wrap the writer so the status code chosen by next can be read
        // back afterwards; it starts at 200 for handlers that never call
        // WriteHeader.
        recorder := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
        next.ServeHTTP(recorder, r)

        elapsed := time.Since(began).Seconds()

        method, path := r.Method, r.URL.Path
        code := strconv.Itoa(recorder.statusCode)
        requestCounter.WithLabelValues(method, path, code).Inc()
        requestDuration.WithLabelValues(method, path).Observe(elapsed)
    }
    return http.HandlerFunc(instrumented)
}

// responseWriter wraps an http.ResponseWriter and remembers the status code
// of the response so middleware can label metrics with it.
//
// The creator must initialise statusCode to the implicit default
// (PrometheusMiddleware uses http.StatusOK), because a handler that never
// calls WriteHeader still sends 200.
type responseWriter struct {
    http.ResponseWriter
    statusCode  int
    wroteHeader bool // true once the final (non-informational) status is fixed
}

// WriteHeader forwards code to the wrapped writer and records it as the
// response status.
//
// Only the first final status is recorded. net/http allows any number of 1xx
// informational headers followed by at most one final header and ignores
// superfluous later calls, so recording a later code would label the metric
// with a status the client never received.
func (rw *responseWriter) WriteHeader(code int) {
    informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
    if !rw.wroteHeader && !informational {
        rw.statusCode = code
        rw.wroteHeader = true
    }
    rw.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. Writing the body commits the
// response header, so the status recorded so far becomes final and a later
// WriteHeader call can no longer change it.
func (rw *responseWriter) Write(b []byte) (int, error) {
    if !rw.wroteHeader {
        rw.wroteHeader = true
        if rw.statusCode == 0 {
            // The creator did not set a default; a bare Write implies 200.
            rw.statusCode = http.StatusOK
        }
    }
    return rw.ResponseWriter.Write(b)
}

// main wires the API routes behind PrometheusMiddleware, exposes /metrics
// without instrumentation, and serves everything on :8080.
func main() {
    apiMux := http.NewServeMux()
    apiMux.HandleFunc("/api/users", handleUsers)
    apiMux.HandleFunc("/api/orders", handleOrders)

    // Only the API mux is wrapped; the metrics endpoint is registered
    // directly on the default mux.
    instrumented := PrometheusMiddleware(apiMux)
    http.Handle("/metrics", promhttp.Handler())
    http.Handle("/", instrumented)

    err := http.ListenAndServe(":8080", nil)
    log.Fatal(err)
}

自定义Collector

// CustomCollector reports CPU and memory usage as two gauge-valued constant
// metrics that are built fresh on every Collect call.
type CustomCollector struct {
    cpuUsage    *prometheus.Desc // descriptor for custom_cpu_usage_percent
    memoryUsage *prometheus.Desc // descriptor for custom_memory_usage_bytes
}

// NewCustomCollector builds a CustomCollector with both descriptors created.
// Neither descriptor has variable or constant labels.
func NewCustomCollector() *CustomCollector {
    cpuDesc := prometheus.NewDesc(
        "custom_cpu_usage_percent",
        "Current CPU usage percentage",
        nil, nil,
    )
    memDesc := prometheus.NewDesc(
        "custom_memory_usage_bytes",
        "Current memory usage in bytes",
        nil, nil,
    )
    return &CustomCollector{cpuUsage: cpuDesc, memoryUsage: memDesc}
}

// Describe sends the CPU descriptor and then the memory descriptor to ch.
func (c *CustomCollector) Describe(ch chan<- *prometheus.Desc) {
    for _, desc := range []*prometheus.Desc{c.cpuUsage, c.memoryUsage} {
        ch <- desc
    }
}

// Collect samples CPU usage and memory usage, in that order, and sends each
// reading to ch as a gauge-valued constant metric.
func (c *CustomCollector) Collect(ch chan<- prometheus.Metric) {
    cpuMetric := prometheus.MustNewConstMetric(c.cpuUsage, prometheus.GaugeValue, getCPUUsage())
    ch <- cpuMetric

    // Memory is sampled only after the CPU metric has been delivered.
    memBytes := float64(getMemoryUsage())
    ch <- prometheus.MustNewConstMetric(c.memoryUsage, prometheus.GaugeValue, memBytes)
}

func init() {
    prometheus.MustRegister(NewCustomCollector())
}

业务指标监控

// Order-related business metrics.

// orderTotal counts orders, labelled by outcome ("status") and
// "payment_method".
var orderTotal = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Name: "orders_total",
        Help: "Total number of orders",
    },
    []string{"status", "payment_method"},
)

// orderAmount records the distribution of order amounts per "category".
// LinearBuckets(0, 100, 20) yields 20 upper bounds: 0, 100, ..., 1900.
var orderAmount = prometheus.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "order_amount_histogram",
        Help:    "Order amount distribution",
        Buckets: prometheus.LinearBuckets(0, 100, 20),
    },
    []string{"category"},
)

// orderProcessingTime summarises how long each processing "step" takes, in
// seconds, tracking p50, p90 and p99.
var orderProcessingTime = prometheus.NewSummaryVec(
    prometheus.SummaryOpts{
        Name: "order_processing_time_seconds",
        Help: "Order processing time",
        Objectives: map[float64]float64{
            0.5:  0.05,
            0.9:  0.01,
            0.99: 0.001,
        },
    },
    []string{"step"},
)

// init registers the three order metrics in declaration order.
func init() {
    for _, metric := range []prometheus.Collector{orderTotal, orderAmount, orderProcessingTime} {
        prometheus.MustRegister(metric)
    }
}

// ProcessOrder validates order and then processes its payment, recording
// business metrics along the way.
//
// A validation failure counts the order as "failed" and a payment failure as
// "payment_failed"; in both cases the error is returned immediately and no
// timing is recorded for the failing step. On success the order is counted
// as "success", its amount is observed, and the total duration is recorded.
func ProcessOrder(order Order) error {
    began := time.Now()

    // countOrder bumps orderTotal for this order's payment method.
    countOrder := func(status string) {
        orderTotal.WithLabelValues(status, order.PaymentMethod).Inc()
    }
    // recordStep observes the seconds elapsed since the given instant.
    recordStep := func(step string, since time.Time) {
        orderProcessingTime.WithLabelValues(step).Observe(time.Since(since).Seconds())
    }

    validationBegan := time.Now()
    if err := validateOrder(order); err != nil {
        countOrder("failed")
        return err
    }
    recordStep("validation", validationBegan)

    paymentBegan := time.Now()
    if err := processPayment(order); err != nil {
        countOrder("payment_failed")
        return err
    }
    recordStep("payment", paymentBegan)

    countOrder("success")
    orderAmount.WithLabelValues(order.Category).Observe(order.Amount)
    recordStep("total", began)

    return nil
}

告警规则配置

# alerts.yml
groups:
  - name: example
    rules:
      # Fires when more than 10% of the requests to an endpoint fail with 5xx.
      - alert: HighErrorRate
        # Ratio of 5xx requests to all requests, per endpoint. A bare
        # rate(...{status=~"5.."}) is errors per second, not a percentage,
        # so it cannot be compared against 0.1 to mean "10%".
        expr: |
          sum by (endpoint) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (endpoint) (rate(http_requests_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 10% for {{ $labels.endpoint }}"

      # Fires when the 95th percentile request latency exceeds 500ms.
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is above 500ms"

      # Fires when a scrape target has been unreachable for one minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.instance }} has been down for more than 1 minute"

Pushgateway推送

import "github.com/prometheus/client_golang/prometheus/push"

// pushMetrics pushes batchDuration and batchSize to the Pushgateway at
// http://pushgateway:9091 under the job name "batch_job" and returns the
// error from the push, if any.
func pushMetrics() error {
    pusher := push.New("http://pushgateway:9091", "batch_job")
    pusher = pusher.Collector(batchDuration)
    pusher = pusher.Collector(batchSize)

    return pusher.Push()
}

// batchJob runs one batch, records how long it took and how many items it
// handled, and then pushes those metrics. A failed push is logged and does
// not abort the job.
func batchJob() {
    began := time.Now()

    processed := processBatch()

    batchDuration.Set(time.Since(began).Seconds())
    batchSize.Set(float64(processed))

    err := pushMetrics()
    if err == nil {
        return
    }
    log.Printf("Failed to push metrics: %v", err)
}

性能优化

指标缓存

// CachedCollector wraps another collector and serves its metrics from a
// cache, re-running the wrapped collector only when the cache is older than
// ttl. It is safe for concurrent use.
type CachedCollector struct {
    collector prometheus.Collector
    ttl       time.Duration

    mu          sync.Mutex // guards cache and lastCollect
    cache       []prometheus.Metric
    lastCollect time.Time
}

// Describe forwards to the wrapped collector, so CachedCollector satisfies
// prometheus.Collector and describes exactly the metrics it will emit.
func (c *CachedCollector) Describe(ch chan<- *prometheus.Desc) {
    c.collector.Describe(ch)
}

// Collect sends the cached metrics to ch, refreshing the cache first if it is
// older than ttl. A registry may call Collect from several goroutines at
// once, so the cache is read through snapshot, which takes the lock.
func (c *CachedCollector) Collect(ch chan<- prometheus.Metric) {
    // Send outside the lock: a slow consumer of ch must not block other
    // scrapes from reading the cache.
    for _, metric := range c.snapshot() {
        ch <- metric
    }
}

// snapshot returns the current cache, refreshing it under the lock when it
// has expired. The returned slice is never modified afterwards (a refresh
// installs a new slice), so callers may range over it without the lock.
func (c *CachedCollector) snapshot() []prometheus.Metric {
    c.mu.Lock()
    defer c.mu.Unlock()

    if time.Since(c.lastCollect) > c.ttl {
        fresh := make([]prometheus.Metric, 0, len(c.cache))
        tempCh := make(chan prometheus.Metric, 100)

        // The wrapped collector writes to a channel, so drain it from this
        // goroutine while it produces in another.
        go func() {
            defer close(tempCh)
            c.collector.Collect(tempCh)
        }()

        for metric := range tempCh {
            fresh = append(fresh, metric)
        }

        c.cache = fresh
        c.lastCollect = time.Now()
    }

    return c.cache
}

指标命名规范

// Good names (the "..." arguments are placeholders, not valid Go)
var (
    httpRequestsTotal    = prometheus.NewCounter(...) // clear: says what is being counted
    httpRequestDuration  = prometheus.NewHistogram(...) // clear quantity; NOTE(review): this identifier omits the unit, so the metric Name should carry it (e.g. http_request_duration_seconds)
    cacheHitsTotal       = prometheus.NewCounter(...) // tied to a concrete business concept
)

// Names to avoid (the "..." arguments are placeholders, not valid Go)
var (
    requests    = prometheus.NewCounter(...) // too vague
    duration    = prometheus.NewHistogram(...) // missing the unit
    myCounter   = prometheus.NewCounter(...) // carries no meaning
)

总结

Prometheus是构建可观测性体系的强大工具，掌握以下要点能帮助你更好地使用Prometheus：

选择合适的指标类型：Counter、Gauge、Histogram、Summary各有适用场景
合理设置标签：标签过多会导致基数爆炸，过少则缺乏维度信息
命名规范：清晰的命名有助于理解和查询
告警策略：避免告警疲劳，设置合理的阈值和持续时间
性能考虑：注意指标收集对应用性能的影响

希望本文能帮助你在Go项目中更好地集成Prometheus监控。

到此这篇关于Go语言中的Prometheus监控实战的文章就介绍到这了，更多相关Go语言 Prometheus监控内容请搜索脚本之家以前的文章或继续浏览下面的相关文章，希望大家以后多多支持脚本之家！