5.7.4 Prometheus 监控体系

5.7.4 Prometheus 监控体系 #

Prometheus 是一个开源的监控和告警系统,专为云原生环境设计。它采用拉取模式收集指标数据,提供强大的查询语言 PromQL,并与 Grafana 等可视化工具完美集成。

Prometheus 架构概览 #

Prometheus 生态系统包含以下核心组件:

  • Prometheus Server:核心服务器,负责数据收集、存储和查询
  • Client Libraries:客户端库,用于应用程序暴露指标
  • Pushgateway:推送网关,用于短生命周期任务的指标收集
  • Exporters:导出器,用于收集第三方系统的指标
  • Alertmanager:告警管理器,处理告警通知

Prometheus 部署 #

Docker Compose 部署 #

创建 docker-compose.yml 文件:

# Monitoring stack: Prometheus, Grafana, Alertmanager and Node Exporter on one
# shared bridge network so the services can reach each other by service name.
version: "3.8"

services:
  # Prometheus server
  prometheus:
    image: prom/prometheus:v2.45.0
    ports:
      - "9090:9090"
    volumes:
      # Main config, alerting rules directory and persistent TSDB storage.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules
      - prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/etc/prometheus/console_libraries"
      - "--web.console.templates=/etc/prometheus/consoles"
      - "--storage.tsdb.retention.time=200h"
      - "--web.enable-lifecycle"
      # NOTE(review): the admin API allows deleting series and taking snapshots
      # over HTTP — confirm it is wanted outside a demo environment.
      - "--web.enable-admin-api"
    networks:
      - monitoring

  # Grafana dashboards
  grafana:
    image: grafana/grafana:10.0.0
    ports:
      - "3000:3000"
    environment:
      # NOTE(review): trivial admin password; change before exposing port 3000.
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    networks:
      - monitoring

  # Alertmanager (alert routing and notification)
  alertmanager:
    image: prom/alertmanager:v0.25.0
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager_data:/alertmanager
    networks:
      - monitoring

  # Node Exporter (host-level metrics)
  node-exporter:
    image: prom/node-exporter:v1.6.0
    ports:
      - "9100:9100"
    volumes:
      # Host filesystems are mounted read-only so the exporter reports on the
      # host rather than on its own container.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - "--path.procfs=/host/proc"
      - "--path.rootfs=/rootfs"
      - "--path.sysfs=/host/sys"
      # "$$" is Compose's escape for a literal "$" in the regular expression.
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
    networks:
      - monitoring

# Named volumes keep data across container restarts.
volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:

networks:
  monitoring:
    driver: bridge

Prometheus 配置 #

创建 prometheus.yml 配置文件:

# Defaults applied to every scrape job and rule group unless overridden.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerting rule files; the docker-compose setup mounts ./rules at
# /etc/prometheus/rules next to this config file.
rule_files:
  - "rules/*.yml"

# Where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus scraping itself
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # Node Exporter host metrics
  - job_name: "node-exporter"
    static_configs:
      - targets: ["node-exporter:9100"]

  # Go application services (scraped more often than the global default)
  - job_name: "user-service"
    static_configs:
      - targets: ["user-service:8080"]
    metrics_path: "/metrics"
    scrape_interval: 10s

  - job_name: "order-service"
    static_configs:
      - targets: ["order-service:8080"]
    metrics_path: "/metrics"
    scrape_interval: 10s

  # Service discovery (Kubernetes environments)
  - job_name: "kubernetes-pods"
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods whose prometheus.io/scrape annotation is "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use the prometheus.io/path annotation, when set, as the metrics path.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Rewrite the scrape address to <pod host>:<prometheus.io/port annotation>.
      - source_labels:
          [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__

Go 应用指标暴露 #

基础指标集成 #

package main

import (
    "context"
    "log"
    "net/http"
    "time"

    "github.com/gin-gonic/gin"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// MetricsCollector groups the Prometheus metrics a service exposes: HTTP
// traffic, business counters and process-level gauges.
type MetricsCollector struct {
    // HTTP request metrics.
    httpRequestsTotal    *prometheus.CounterVec
    httpRequestDuration  *prometheus.HistogramVec
    httpRequestsInFlight prometheus.Gauge

    // Business metrics.
    usersTotal        prometheus.Counter
    ordersTotal       *prometheus.CounterVec
    activeConnections prometheus.Gauge

    // Process / system metrics.
    memoryUsage     prometheus.Gauge
    cpuUsage        prometheus.Gauge
    goroutinesCount prometheus.Gauge
}

// NewMetricsCollector creates all metrics of a MetricsCollector and registers
// them with the default Prometheus registry.
//
// serviceName is attached to every metric as the constant label "service", so
// series from different services can be told apart even when they share a
// metric name. Registration uses prometheus.MustRegister, which panics when a
// collector cannot be registered (for example on a duplicate registration), so
// call this once per service name and process.
func NewMetricsCollector(serviceName string) *MetricsCollector {
    constLabels := prometheus.Labels{"service": serviceName}

    // Label sets shared by the HTTP counter and histogram.
    httpLabels := []string{"method", "path", "status_code"}

    collector := &MetricsCollector{
        // Total number of HTTP requests.
        httpRequestsTotal: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name:        "http_requests_total",
                Help:        "Total number of HTTP requests",
                ConstLabels: constLabels,
            },
            httpLabels,
        ),

        // HTTP request latency.
        httpRequestDuration: prometheus.NewHistogramVec(
            prometheus.HistogramOpts{
                Name:        "http_request_duration_seconds",
                Help:        "HTTP request duration in seconds",
                Buckets:     prometheus.DefBuckets,
                ConstLabels: constLabels,
            },
            httpLabels,
        ),

        // HTTP requests currently being handled.
        httpRequestsInFlight: prometheus.NewGauge(
            prometheus.GaugeOpts{
                Name:        "http_requests_in_flight",
                Help:        "Number of HTTP requests currently being processed",
                ConstLabels: constLabels,
            },
        ),

        // Users created.
        usersTotal: prometheus.NewCounter(
            prometheus.CounterOpts{
                Name:        "users_created_total",
                Help:        "Total number of users created",
                ConstLabels: constLabels,
            },
        ),

        // Orders, partitioned by status.
        ordersTotal: prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name:        "orders_total",
                Help:        "Total number of orders",
                ConstLabels: constLabels,
            },
            []string{"status"},
        ),

        // Active connections.
        activeConnections: prometheus.NewGauge(
            prometheus.GaugeOpts{
                Name:        "active_connections",
                Help:        "Number of active connections",
                ConstLabels: constLabels,
            },
        ),

        // Memory usage in bytes.
        memoryUsage: prometheus.NewGauge(
            prometheus.GaugeOpts{
                Name:        "memory_usage_bytes",
                Help:        "Current memory usage in bytes",
                ConstLabels: constLabels,
            },
        ),

        // CPU usage in percent.
        cpuUsage: prometheus.NewGauge(
            prometheus.GaugeOpts{
                Name:        "cpu_usage_percent",
                Help:        "Current CPU usage percentage",
                ConstLabels: constLabels,
            },
        ),

        // Number of goroutines.
        goroutinesCount: prometheus.NewGauge(
            prometheus.GaugeOpts{
                Name:        "goroutines_count",
                Help:        "Number of goroutines",
                ConstLabels: constLabels,
            },
        ),
    }

    // Register everything with the default registry.
    prometheus.MustRegister(
        collector.httpRequestsTotal,
        collector.httpRequestDuration,
        collector.httpRequestsInFlight,
        collector.usersTotal,
        collector.ordersTotal,
        collector.activeConnections,
        collector.memoryUsage,
        collector.cpuUsage,
        collector.goroutinesCount,
    )

    return collector
}

// RecordHTTPRequest counts one HTTP request and observes its duration, in
// seconds, on the latency histogram. Both series use the same method, path and
// status-code label values.
func (m *MetricsCollector) RecordHTTPRequest(method, path, statusCode string, duration time.Duration) {
    labelValues := []string{method, path, statusCode}

    m.httpRequestsTotal.WithLabelValues(labelValues...).Inc()
    m.httpRequestDuration.WithLabelValues(labelValues...).Observe(duration.Seconds())
}

// IncHTTPRequestsInFlight raises the in-flight HTTP request gauge by one.
func (m *MetricsCollector) IncHTTPRequestsInFlight() {
    inFlight := m.httpRequestsInFlight
    inFlight.Inc()
}

// DecHTTPRequestsInFlight lowers the in-flight HTTP request gauge by one.
func (m *MetricsCollector) DecHTTPRequestsInFlight() {
    inFlight := m.httpRequestsInFlight
    inFlight.Dec()
}

// IncUsersTotal adds one to the created-users counter.
func (m *MetricsCollector) IncUsersTotal() {
    created := m.usersTotal
    created.Inc()
}

// IncOrdersTotal adds one to the orders counter for the given status label.
func (m *MetricsCollector) IncOrdersTotal(status string) {
    ordersForStatus := m.ordersTotal.WithLabelValues(status)
    ordersForStatus.Inc()
}

// SetActiveConnections overwrites the active-connections gauge with count.
func (m *MetricsCollector) SetActiveConnections(count float64) {
    connections := m.activeConnections
    connections.Set(count)
}

// UpdateSystemMetrics overwrites the three process gauges with the supplied
// readings: memory usage, CPU usage and goroutine count.
func (m *MetricsCollector) UpdateSystemMetrics(memUsage, cpuUsage float64, goroutines int) {
    goroutineReading := float64(goroutines)

    m.memoryUsage.Set(memUsage)
    m.cpuUsage.Set(cpuUsage)
    m.goroutinesCount.Set(goroutineReading)
}

HTTP 中间件集成 #

package middleware

import (
    "strconv"
    "time"

    "github.com/gin-gonic/gin"
)

// PrometheusMiddleware returns a Gin middleware that tracks in-flight requests
// and, once the remaining handlers have run, records the request count and
// duration labelled by method, route template and status code.
//
// The route template (c.FullPath) is used instead of the raw URL so that
// path parameters do not create one series per distinct URL. Requests that
// match no route have an empty template; they are recorded under the fixed
// path "unmatched" so the label is never empty.
func PrometheusMiddleware(collector *MetricsCollector) gin.HandlerFunc {
    return func(c *gin.Context) {
        start := time.Now()

        // Track the request as in flight for the whole handler chain.
        collector.IncHTTPRequestsInFlight()
        defer collector.DecHTTPRequestsInFlight()

        // Run the remaining handlers.
        c.Next()

        path := c.FullPath()
        if path == "" {
            path = "unmatched"
        }

        collector.RecordHTTPRequest(
            c.Request.Method,
            path,
            strconv.Itoa(c.Writer.Status()),
            time.Since(start),
        )
    }
}

业务指标集成 #

package handlers

import (
    "net/http"
    "runtime"
    "time"

    "github.com/gin-gonic/gin"
)

// UserHandler serves the user endpoints and reports business metrics through
// its collector.
type UserHandler struct {
    // collector receives the metrics recorded by the handler methods.
    collector *MetricsCollector
    // Other dependencies...
}

// NewUserHandler returns a UserHandler that reports to collector.
func NewUserHandler(collector *MetricsCollector) *UserHandler {
    handler := new(UserHandler)
    handler.collector = collector
    return handler
}

// CreateUser binds the JSON request body, builds the new user and increments
// the created-users counter before answering 201 Created. A body that fails
// to bind is answered with 400 Bad Request and the binding error message.
func (h *UserHandler) CreateUser(c *gin.Context) {
    var payload CreateUserRequest
    bindErr := c.ShouldBindJSON(&payload)
    if bindErr != nil {
        c.JSON(http.StatusBadRequest, gin.H{"error": bindErr.Error()})
        return
    }

    // User creation logic...
    created := &User{
        ID:    generateUserID(),
        Name:  payload.Name,
        Email: payload.Email,
    }

    // Record the business metric.
    h.collector.IncUsersTotal()

    c.JSON(http.StatusCreated, created)
}

// GetSystemMetrics refreshes the process gauges from the Go runtime and
// returns the same readings as JSON, together with the current Unix timestamp.
//
// The goroutine count is sampled once so that the gauge and the JSON response
// always report the same value.
func (h *UserHandler) GetSystemMetrics(c *gin.Context) {
    var mem runtime.MemStats
    runtime.ReadMemStats(&mem)
    goroutines := runtime.NumGoroutine()

    // Push the readings into the Prometheus gauges.
    h.collector.UpdateSystemMetrics(
        float64(mem.Alloc),   // memory usage
        getCurrentCPUUsage(), // CPU usage
        goroutines,           // goroutine count
    )

    c.JSON(http.StatusOK, gin.H{
        "memory_usage":     mem.Alloc,
        "goroutines_count": goroutines,
        "timestamp":        time.Now().Unix(),
    })
}

// getCurrentCPUUsage is a placeholder for a CPU usage reading and always
// returns zero; a real implementation should measure CPU usage properly.
func getCurrentCPUUsage() float64 {
    const placeholderUsage float64 = 0.0
    return placeholderUsage
}

自定义指标收集器 #

package metrics

import (
    "context"
    "database/sql"
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

// DatabaseMetrics holds the Prometheus metrics describing database usage.
type DatabaseMetrics struct {
    // Connection pool gauges.
    dbConnections    *prometheus.GaugeVec
    dbConnectionsMax prometheus.Gauge

    // Query counters and latency histogram.
    dbQueryDuration *prometheus.HistogramVec
    dbQueriesTotal  *prometheus.CounterVec
}

// NewDatabaseMetrics creates the database metrics and registers them with the
// default Prometheus registry via prometheus.MustRegister.
func NewDatabaseMetrics() *DatabaseMetrics {
    // Connections by state: open, idle, in_use.
    connections := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "database_connections",
            Help: "Number of database connections",
        },
        []string{"state"},
    )

    // Query latency by operation and table.
    queryDuration := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "database_query_duration_seconds",
            Help:    "Database query duration in seconds",
            Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
        },
        []string{"operation", "table"},
    )

    // Query count by operation, table and outcome.
    queriesTotal := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "database_queries_total",
            Help: "Total number of database queries",
        },
        []string{"operation", "table", "status"},
    )

    // Configured connection limit.
    connectionsMax := prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "database_connections_max",
            Help: "Maximum number of database connections",
        },
    )

    prometheus.MustRegister(connections, queryDuration, queriesTotal, connectionsMax)

    return &DatabaseMetrics{
        dbConnections:    connections,
        dbQueryDuration:  queryDuration,
        dbQueriesTotal:   queriesTotal,
        dbConnectionsMax: connectionsMax,
    }
}

// RecordQuery observes the duration of one database query and counts it under
// status "error" when err is non-nil, otherwise under "success".
func (m *DatabaseMetrics) RecordQuery(operation, table string, duration time.Duration, err error) {
    var status string
    switch {
    case err != nil:
        status = "error"
    default:
        status = "success"
    }

    elapsedSeconds := duration.Seconds()
    m.dbQueryDuration.WithLabelValues(operation, table).Observe(elapsedSeconds)
    m.dbQueriesTotal.WithLabelValues(operation, table, status).Inc()
}

// UpdateConnectionStats copies the connection pool statistics reported by
// db.Stats() into the per-state connection gauges and the max-connections gauge.
func (m *DatabaseMetrics) UpdateConnectionStats(db *sql.DB) {
    snapshot := db.Stats()

    perState := []struct {
        state string
        count int
    }{
        {"open", snapshot.OpenConnections},
        {"idle", snapshot.Idle},
        {"in_use", snapshot.InUse},
    }
    for _, entry := range perState {
        m.dbConnections.WithLabelValues(entry.state).Set(float64(entry.count))
    }

    m.dbConnectionsMax.Set(float64(snapshot.MaxOpenConnections))
}

// MetricsDB wraps a *sql.DB so that statements run through it are recorded in
// DatabaseMetrics.
type MetricsDB struct {
    // db is the underlying database handle.
    db *sql.DB
    // metrics receives the per-query measurements.
    metrics *DatabaseMetrics
}

// NewMetricsDB wraps db so that its queries are reported to metrics.
func NewMetricsDB(db *sql.DB, metrics *DatabaseMetrics) *MetricsDB {
    wrapped := new(MetricsDB)
    wrapped.db = db
    wrapped.metrics = metrics
    return wrapped
}

// Query runs a row-returning statement through the wrapped database and
// records its duration and outcome.
//
// The operation label is derived from the statement's leading SQL verb rather
// than being fixed to "SELECT".
// NOTE(review): the table label is still hard-coded to "users"; it cannot be
// derived reliably from the SQL text and should come from the caller.
func (mdb *MetricsDB) Query(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) {
    start := time.Now()

    rows, err := mdb.db.QueryContext(ctx, query, args...)

    mdb.metrics.RecordQuery(sqlOperation(query), "users", time.Since(start), err)

    return rows, err
}

// Exec runs a statement that returns no rows through the wrapped database and
// records its duration and outcome.
//
// The operation label is derived from the statement's leading SQL verb, so
// UPDATE and DELETE statements are no longer counted as "INSERT".
// NOTE(review): the table label is still hard-coded to "users"; it cannot be
// derived reliably from the SQL text and should come from the caller.
func (mdb *MetricsDB) Exec(ctx context.Context, query string, args ...interface{}) (sql.Result, error) {
    start := time.Now()

    result, err := mdb.db.ExecContext(ctx, query, args...)

    mdb.metrics.RecordQuery(sqlOperation(query), "users", time.Since(start), err)

    return result, err
}

// sqlOperation returns the upper-cased leading verb of query when it is one of
// SELECT, INSERT, UPDATE or DELETE, and "OTHER" for anything else. Mapping
// unknown verbs to a single value keeps the label's cardinality bounded.
func sqlOperation(query string) string {
    // Skip leading ASCII whitespace.
    begin := 0
    for begin < len(query) {
        ch := query[begin]
        if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' {
            break
        }
        begin++
    }

    // Collect the leading run of ASCII letters, upper-casing as we go.
    verb := make([]byte, 0, 6)
    for i := begin; i < len(query); i++ {
        ch := query[i]
        switch {
        case ch >= 'a' && ch <= 'z':
            ch -= 'a' - 'A'
        case ch >= 'A' && ch <= 'Z':
        default:
            i = len(query) // stop at the first non-letter
            continue
        }
        verb = append(verb, ch)
    }

    switch op := string(verb); op {
    case "SELECT", "INSERT", "UPDATE", "DELETE":
        return op
    default:
        return "OTHER"
    }
}

告警规则配置 #

创建告警规则 #

创建 rules/alerts.yml 文件:

# Alerting rules for the Go services (application.rules) and for host/process
# health (system.rules).
groups:
  - name: application.rules
    rules:
      # HTTP error rate: more than 5% of requests answered with a 5xx status.
      - alert: HighHTTPErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (job)
            /
            sum(rate(http_requests_total[5m])) by (job)
          ) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High HTTP error rate detected"
          description: "HTTP error rate is {{ $value | humanizePercentage }} for {{ $labels.job }}"

      # Response time: 95th percentile above one second.
      - alert: HighResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)
          ) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}s for {{ $labels.job }}"

      # Memory usage above 1 GiB.
      # humanize1024 formats with binary prefixes (Ki, Mi, Gi); Prometheus has
      # no "humanizeBytes" template function.
      - alert: HighMemoryUsage
        expr: memory_usage_bytes > 1073741824 # 1GB
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is {{ $value | humanize1024 }}B for {{ $labels.job }}"

      # Database connections: more than 80% of the pool in use.
      # ignoring(state) is required because only the left-hand series carries a
      # "state" label; without it the two sides never match. The "> 0" filter
      # skips pools with no configured limit, which would otherwise divide by zero.
      - alert: DatabaseConnectionsHigh
        expr: |
          database_connections{state="in_use"}
          / ignoring(state)
          (database_connections_max > 0)
          > 0.8
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Database connections usage high"
          description: "Database connections usage is {{ $value | humanizePercentage }}"

      # Scrape target unreachable.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.job }} service is down"

  - name: system.rules
    rules:
      # CPU usage above 80%.
      - alert: HighCPUUsage
        expr: cpu_usage_percent > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% for {{ $labels.instance }}"

      # Goroutine count above 1000.
      - alert: HighGoroutineCount
        expr: goroutines_count > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High goroutine count detected"
          description: "Goroutine count is {{ $value }} for {{ $labels.job }}"

      # Less than 10% of the root filesystem available.
      - alert: DiskSpaceLow
        expr: |
          (
            node_filesystem_avail_bytes{mountpoint="/"}
            /
            node_filesystem_size_bytes{mountpoint="/"}
          ) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space low"
          description: "Disk space is {{ $value | humanizePercentage }} available"

Alertmanager 配置 #

创建 alertmanager.yml 文件:

# Alertmanager routing: critical and warning alerts go to their own receivers,
# everything else falls through to the default webhook.
global:
  smtp_smarthost: "localhost:587"
  # Placeholder sender address; replace with a real mailbox.
  smtp_from: "alertmanager@example.com"

route:
  group_by: ["alertname"]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: "web.hook"
  routes:
    - match:
        severity: critical
      receiver: "critical-alerts"
    - match:
        severity: warning
      receiver: "warning-alerts"

receivers:
  - name: "web.hook"
    webhook_configs:
      - url: "http://localhost:5001/webhook"

  - name: "critical-alerts"
    email_configs:
      # Placeholder recipient; replace with the on-call mailbox.
      - to: "oncall@example.com"
        # email_config has no "subject" or "body" field: the subject is set
        # through headers and the plain-text body through "text".
        headers:
          Subject: "Critical Alert: {{ .GroupLabels.alertname }}"
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
    slack_configs:
      # Replace with the real Slack incoming-webhook URL before use.
      - api_url: "YOUR_SLACK_WEBHOOK_URL"
        channel: "#alerts"
        title: "Critical Alert"
        text: "{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}"

  - name: "warning-alerts"
    email_configs:
      # Placeholder recipient; replace with the team mailbox.
      - to: "team@example.com"
        headers:
          Subject: "Warning Alert: {{ .GroupLabels.alertname }}"
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}

# A firing critical alert mutes the warning alert with the same alertname and
# instance.
inhibit_rules:
  - source_match:
      severity: "critical"
    target_match:
      severity: "warning"
    equal: ["alertname", "instance"]

Grafana 可视化展示 #

数据源配置 #

创建 grafana/datasources/prometheus.yml 文件:

# Grafana datasource provisioning file.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    # Service name and port of the Prometheus container in docker-compose.yml.
    url: http://prometheus:9090
    # Used by panels that do not name a datasource explicitly.
    isDefault: true
    editable: true

仪表板配置 #

创建 grafana/dashboards/go-application.json 文件(注意:该目录下还需要一个 dashboard provider 的 YAML 配置文件,Grafana 才会加载此 JSON):

{
  "id": null,
  "title": "Go Application Metrics",
  "tags": ["go", "application"],
  "timezone": "browser",
  "panels": [
    {
      "id": 1,
      "title": "HTTP Request Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total[5m])) by (job)",
          "legendFormat": "{{ job }}"
        }
      ],
      "yAxes": [
        {
          "label": "Requests/sec"
        }
      ]
    },
    {
      "id": 2,
      "title": "HTTP Error Rate",
      "type": "graph",
      "targets": [
        {
          "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) by (job) / sum(rate(http_requests_total[5m])) by (job)",
          "legendFormat": "{{ job }}"
        }
      ],
      "yAxes": [
        {
          "label": "Error Rate",
          "max": 1,
          "min": 0
        }
      ]
    },
    {
      "id": 3,
      "title": "Response Time",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))",
          "legendFormat": "95th percentile - {{ job }}"
        },
        {
          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))",
          "legendFormat": "50th percentile - {{ job }}"
        }
      ],
      "yAxes": [
        {
          "label": "Seconds"
        }
      ]
    },
    {
      "id": 4,
      "title": "Memory Usage",
      "type": "graph",
      "targets": [
        {
          "expr": "memory_usage_bytes",
          "legendFormat": "{{ job }}"
        }
      ],
      "yAxes": [
        {
          "label": "Bytes"
        }
      ]
    }
  ],
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "refresh": "5s"
}

完整的应用示例 #

package main

import (
    "context"
    "log"
    "net/http"
    "os"
    "os/signal"
    "runtime"
    "syscall"
    "time"

    "github.com/gin-gonic/gin"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// main wires the metrics collector, middleware and routes into a Gin engine,
// serves it on :8080 and shuts down gracefully on SIGINT or SIGTERM.
func main() {
    // Create the metrics collector.
    collector := NewMetricsCollector("user-service")

    // Create the Gin engine.
    r := gin.New()

    // Install the Prometheus middleware. It is registered before any route,
    // so scrapes of /metrics are recorded as well.
    r.Use(PrometheusMiddleware(collector))

    // Expose the metrics endpoint scraped by Prometheus.
    r.GET("/metrics", gin.WrapH(promhttp.Handler()))

    // Create the handler.
    userHandler := NewUserHandler(collector)

    // Register routes.
    // NOTE(review): GetUser is not defined in the handler snippet shown in
    // this document — confirm it exists elsewhere.
    r.POST("/user", userHandler.CreateUser)
    r.GET("/user/:id", userHandler.GetUser)
    r.GET("/system/metrics", userHandler.GetSystemMetrics)

    // Start the background system-metrics sampler.
    go startSystemMetricsCollection(collector)

    // Create the HTTP server. ReadHeaderTimeout bounds how long a client may
    // take to send its request headers, so slow clients cannot hold
    // connections open indefinitely.
    server := &http.Server{
        Addr:              ":8080",
        Handler:           r,
        ReadHeaderTimeout: 10 * time.Second,
    }

    // Serve in the background so main can wait for a shutdown signal.
    go func() {
        log.Println("Server starting on :8080")
        if err := server.ListenAndServe(); err != http.ErrServerClosed {
            log.Fatalf("Server failed to start: %v", err)
        }
    }()

    // Graceful shutdown: block until SIGINT or SIGTERM arrives.
    quit := make(chan os.Signal, 1)
    signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
    <-quit

    log.Println("Shutting down server...")

    // Give in-flight requests up to 30 seconds to finish.
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    if err := server.Shutdown(ctx); err != nil {
        log.Printf("Server shutdown error: %v", err)
    }

    log.Println("Server stopped")
}

// startSystemMetricsCollection samples runtime statistics into collector once
// immediately and then every 15 seconds. It never returns, so run it in its
// own goroutine.
//
// The immediate first sample keeps the gauges from reporting zero until the
// first tick.
func startSystemMetricsCollection(collector *MetricsCollector) {
    sample := func() {
        var mem runtime.MemStats
        runtime.ReadMemStats(&mem)

        collector.UpdateSystemMetrics(
            float64(mem.Alloc),
            getCurrentCPUUsage(),
            runtime.NumGoroutine(),
        )
    }

    sample()

    ticker := time.NewTicker(15 * time.Second)
    defer ticker.Stop()

    for range ticker.C {
        sample()
    }
}

小结 #

Prometheus 为 Go 应用提供了完整的监控解决方案。通过合理的指标设计、告警配置和可视化展示,我们可以构建一个全面的应用监控体系。结合 Grafana 的强大可视化能力和 Alertmanager 的灵活告警机制,Prometheus 成为云原生环境中监控系统的首选方案。

通过本章的学习,我们掌握了分布式追踪与监控的完整技术栈,为构建可观测的微服务架构奠定了坚实的基础。