5.7.4 Prometheus 监控体系 #
Prometheus 是一个开源的监控和告警系统,专为云原生环境设计。它采用拉取模式收集指标数据,提供强大的查询语言 PromQL,并与 Grafana 等可视化工具完美集成。
Prometheus 架构概览 #
Prometheus 生态系统包含以下核心组件:
- Prometheus Server:核心服务器,负责数据收集、存储和查询
- Client Libraries:客户端库,用于应用程序暴露指标
- Pushgateway:推送网关,用于短生命周期任务的指标收集
- Exporters:导出器,用于收集第三方系统的指标
- Alertmanager:告警管理器,处理告警通知
Prometheus 部署 #
Docker Compose 部署 #
创建 docker-compose.yml
文件:
version: "3.8"
services:
# Prometheus 服务器
prometheus:
image: prom/prometheus:v2.45.0
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- ./rules:/etc/prometheus/rules
- prometheus_data:/prometheus
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--web.console.libraries=/etc/prometheus/console_libraries"
- "--web.console.templates=/etc/prometheus/consoles"
- "--storage.tsdb.retention.time=200h"
- "--web.enable-lifecycle"
- "--web.enable-admin-api"
networks:
- monitoring
# Grafana 可视化
grafana:
image: grafana/grafana:10.0.0
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/dashboards:/etc/grafana/provisioning/dashboards
- ./grafana/datasources:/etc/grafana/provisioning/datasources
networks:
- monitoring
# Alertmanager 告警管理
alertmanager:
image: prom/alertmanager:v0.25.0
ports:
- "9093:9093"
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
- alertmanager_data:/alertmanager
networks:
- monitoring
# Node Exporter 系统指标
node-exporter:
image: prom/node-exporter:v1.6.0
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- "--path.procfs=/host/proc"
- "--path.rootfs=/rootfs"
- "--path.sysfs=/host/sys"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
networks:
- monitoring
volumes:
prometheus_data:
grafana_data:
alertmanager_data:
networks:
monitoring:
driver: bridge
Prometheus 配置 #
创建 prometheus.yml
配置文件:
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "rules/*.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
# Prometheus 自身监控
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
# Node Exporter 系统监控
- job_name: "node-exporter"
static_configs:
- targets: ["node-exporter:9100"]
# Go 应用监控
- job_name: "user-service"
static_configs:
- targets: ["user-service:8080"]
metrics_path: "/metrics"
scrape_interval: 10s
- job_name: "order-service"
static_configs:
- targets: ["order-service:8080"]
metrics_path: "/metrics"
scrape_interval: 10s
# 服务发现配置(Kubernetes 环境)
- job_name: "kubernetes-pods"
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels:
[__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
Go 应用指标暴露 #
基础指标集成 #
package main
import (
"context"
"log"
"net/http"
"time"
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// MetricsCollector bundles the Prometheus instruments exposed by a service:
// HTTP traffic metrics, business counters, and process-level gauges.
type MetricsCollector struct {
	// HTTP request instruments.
	httpRequestsTotal    *prometheus.CounterVec
	httpRequestDuration  *prometheus.HistogramVec
	httpRequestsInFlight prometheus.Gauge

	// Business instruments.
	usersTotal        prometheus.Counter
	ordersTotal       *prometheus.CounterVec
	activeConnections prometheus.Gauge

	// Process-level gauges, written by UpdateSystemMetrics.
	memoryUsage, cpuUsage, goroutinesCount prometheus.Gauge
}
// NewMetricsCollector builds every instrument of a MetricsCollector, registers
// all of them through prometheus.MustRegister, and returns the populated
// collector.
//
// NOTE(review): serviceName is accepted but not used anywhere in this
// function — confirm whether it was meant to label or prefix the metrics.
func NewMetricsCollector(serviceName string) *MetricsCollector {
	// HTTP instruments; the counter and the histogram share the same label set.
	requests := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total number of HTTP requests",
		},
		[]string{"method", "path", "status_code"},
	)
	latency := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "http_request_duration_seconds",
			Help:    "HTTP request duration in seconds",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"method", "path", "status_code"},
	)
	inFlight := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "http_requests_in_flight",
		Help: "Number of HTTP requests currently being processed",
	})

	// Business instruments.
	users := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "users_created_total",
		Help: "Total number of users created",
	})
	orders := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "orders_total",
			Help: "Total number of orders",
		},
		[]string{"status"},
	)
	connections := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "active_connections",
		Help: "Number of active connections",
	})

	// Process-level gauges.
	memory := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "memory_usage_bytes",
		Help: "Current memory usage in bytes",
	})
	cpu := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "cpu_usage_percent",
		Help: "Current CPU usage percentage",
	})
	goroutines := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "goroutines_count",
		Help: "Number of goroutines",
	})

	// Registration order matches the field order of MetricsCollector.
	prometheus.MustRegister(
		requests,
		latency,
		inFlight,
		users,
		orders,
		connections,
		memory,
		cpu,
		goroutines,
	)

	return &MetricsCollector{
		httpRequestsTotal:    requests,
		httpRequestDuration:  latency,
		httpRequestsInFlight: inFlight,
		usersTotal:           users,
		ordersTotal:          orders,
		activeConnections:    connections,
		memoryUsage:          memory,
		cpuUsage:             cpu,
		goroutinesCount:      goroutines,
	}
}
// RecordHTTPRequest counts one finished HTTP request and observes its
// duration (in seconds) under the given method, path and status-code labels.
func (m *MetricsCollector) RecordHTTPRequest(method, path, statusCode string, duration time.Duration) {
	seconds := duration.Seconds()

	requests := m.httpRequestsTotal.WithLabelValues(method, path, statusCode)
	requests.Inc()

	latency := m.httpRequestDuration.WithLabelValues(method, path, statusCode)
	latency.Observe(seconds)
}
// IncHTTPRequestsInFlight adds one to the in-flight HTTP request gauge.
func (m *MetricsCollector) IncHTTPRequestsInFlight() {
	inFlight := m.httpRequestsInFlight
	inFlight.Inc()
}
// DecHTTPRequestsInFlight subtracts one from the in-flight HTTP request gauge.
func (m *MetricsCollector) DecHTTPRequestsInFlight() {
	inFlight := m.httpRequestsInFlight
	inFlight.Dec()
}
// IncUsersTotal adds one to the created-users counter.
func (m *MetricsCollector) IncUsersTotal() {
	created := m.usersTotal
	created.Inc()
}
// IncOrdersTotal adds one to the orders counter for the given status label.
func (m *MetricsCollector) IncOrdersTotal(status string) {
	byStatus := m.ordersTotal.WithLabelValues(status)
	byStatus.Inc()
}
// SetActiveConnections overwrites the active-connections gauge with count.
func (m *MetricsCollector) SetActiveConnections(count float64) {
	connections := m.activeConnections
	connections.Set(count)
}
// UpdateSystemMetrics overwrites the memory, CPU and goroutine gauges with the
// supplied values; goroutines is converted to float64 before being stored.
func (m *MetricsCollector) UpdateSystemMetrics(memUsage, cpuUsage float64, goroutines int) {
	goroutineCount := float64(goroutines)

	m.memoryUsage.Set(memUsage)
	m.cpuUsage.Set(cpuUsage)
	m.goroutinesCount.Set(goroutineCount)
}
HTTP 中间件集成 #
package middleware
import (
"strconv"
"time"
"github.com/gin-gonic/gin"
)
// PrometheusMiddleware returns a gin handler that instruments every request
// passing through it: it tracks the request as in flight while the rest of the
// chain runs, then records the request count and duration labelled by HTTP
// method, matched route pattern and response status code.
func PrometheusMiddleware(collector *MetricsCollector) gin.HandlerFunc {
	// Label used when no route matched. gin's FullPath returns "" in that
	// case, which would produce a series with an empty path label. A fixed
	// placeholder is used rather than the raw URL path so that arbitrary
	// client-supplied paths cannot inflate label cardinality.
	const unmatchedRoute = "unmatched"

	return func(c *gin.Context) {
		start := time.Now()

		// The deferred decrement keeps the gauge balanced even if a later
		// handler panics.
		collector.IncHTTPRequestsInFlight()
		defer collector.DecHTTPRequestsInFlight()

		// Run the remaining handlers in the chain.
		c.Next()

		route := c.FullPath()
		if route == "" {
			route = unmatchedRoute
		}

		collector.RecordHTTPRequest(
			c.Request.Method,
			route,
			strconv.Itoa(c.Writer.Status()),
			time.Since(start),
		)
	}
}
业务指标集成 #
package handlers
import (
"net/http"
"runtime"
"time"
"github.com/gin-gonic/gin"
)
// UserHandler serves the user endpoints and reports business metrics through
// its MetricsCollector.
type UserHandler struct {
	// collector receives the metrics emitted by the handler methods.
	collector *MetricsCollector

	// Further dependencies go here.
}
// NewUserHandler returns a UserHandler that reports to collector.
func NewUserHandler(collector *MetricsCollector) *UserHandler {
	handler := new(UserHandler)
	handler.collector = collector
	return handler
}
// CreateUser binds the JSON request body, builds a User from it, increments
// the created-users counter and replies with 201 and the new user. A body that
// fails to bind yields a 400 response carrying the bind error message.
func (h *UserHandler) CreateUser(c *gin.Context) {
	var payload CreateUserRequest
	bindErr := c.ShouldBindJSON(&payload)
	if bindErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": bindErr.Error()})
		return
	}

	// User creation logic goes here.
	created := &User{
		ID:    generateUserID(),
		Name:  payload.Name,
		Email: payload.Email,
	}

	// Record the business metric before replying.
	h.collector.IncUsersTotal()
	c.JSON(http.StatusCreated, created)
}
// GetSystemMetrics samples the Go runtime, pushes the sample into the
// collector's system gauges and returns it to the caller as JSON together with
// the current Unix timestamp.
func (h *UserHandler) GetSystemMetrics(c *gin.Context) {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)

	// Sample the goroutine count once so the gauge and the JSON response
	// report the same value.
	goroutines := runtime.NumGoroutine()

	h.collector.UpdateSystemMetrics(
		float64(m.Alloc),     // memory usage in bytes
		getCurrentCPUUsage(), // CPU usage
		goroutines,           // goroutine count
	)

	c.JSON(http.StatusOK, gin.H{
		"memory_usage":     m.Alloc,
		"goroutines_count": goroutines,
		"timestamp":        time.Now().Unix(),
	})
}
// getCurrentCPUUsage is a placeholder that always reports 0.
//
// A real implementation should measure actual CPU usage; until it does, any
// gauge fed from this function stays at zero.
func getCurrentCPUUsage() float64 {
	const placeholderUsage = 0.0
	return placeholderUsage
}
自定义指标收集器 #
package metrics
import (
"context"
"database/sql"
"time"
"github.com/prometheus/client_golang/prometheus"
)
// DatabaseMetrics groups the Prometheus instruments that describe database
// activity: connection-pool gauges plus per-query duration and count.
type DatabaseMetrics struct {
	// Connection-pool gauges.
	dbConnections    *prometheus.GaugeVec
	dbConnectionsMax prometheus.Gauge

	// Per-query instruments.
	dbQueryDuration *prometheus.HistogramVec
	dbQueriesTotal  *prometheus.CounterVec
}
// NewDatabaseMetrics builds the database instruments, registers them through
// prometheus.MustRegister and returns them wrapped in a DatabaseMetrics.
func NewDatabaseMetrics() *DatabaseMetrics {
	connections := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "database_connections",
			Help: "Number of database connections",
		},
		[]string{"state"}, // values used: open, idle, in_use
	)
	queryDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "database_query_duration_seconds",
			Help:    "Database query duration in seconds",
			Buckets: []float64{0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0},
		},
		[]string{"operation", "table"},
	)
	queries := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "database_queries_total",
			Help: "Total number of database queries",
		},
		[]string{"operation", "table", "status"},
	)
	connectionsMax := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "database_connections_max",
		Help: "Maximum number of database connections",
	})

	prometheus.MustRegister(
		connections,
		queryDuration,
		queries,
		connectionsMax,
	)

	return &DatabaseMetrics{
		dbConnections:    connections,
		dbQueryDuration:  queryDuration,
		dbQueriesTotal:   queries,
		dbConnectionsMax: connectionsMax,
	}
}
// RecordQuery observes the duration of one database query and counts it. The
// counter's status label is "error" when err is non-nil and "success"
// otherwise.
func (m *DatabaseMetrics) RecordQuery(operation, table string, duration time.Duration, err error) {
	var status string
	switch {
	case err != nil:
		status = "error"
	default:
		status = "success"
	}

	elapsed := duration.Seconds()
	m.dbQueryDuration.WithLabelValues(operation, table).Observe(elapsed)
	m.dbQueriesTotal.WithLabelValues(operation, table, status).Inc()
}
// UpdateConnectionStats copies the pool statistics reported by db.Stats() into
// the connection gauges: one series per state (open, idle, in_use) plus the
// configured maximum number of open connections.
func (m *DatabaseMetrics) UpdateConnectionStats(db *sql.DB) {
	snapshot := db.Stats()

	byState := []struct {
		state string
		count int
	}{
		{"open", snapshot.OpenConnections},
		{"idle", snapshot.Idle},
		{"in_use", snapshot.InUse},
	}
	for _, entry := range byState {
		m.dbConnections.WithLabelValues(entry.state).Set(float64(entry.count))
	}

	m.dbConnectionsMax.Set(float64(snapshot.MaxOpenConnections))
}
// MetricsDB wraps a *sql.DB so that the statements run through it are reported
// to a DatabaseMetrics.
type MetricsDB struct {
	// db is the underlying connection pool that executes the statements.
	db *sql.DB
	// metrics receives one record per executed statement.
	metrics *DatabaseMetrics
}
// NewMetricsDB returns a MetricsDB that runs statements on db and reports them
// to metrics.
func NewMetricsDB(db *sql.DB, metrics *DatabaseMetrics) *MetricsDB {
	wrapped := new(MetricsDB)
	wrapped.db = db
	wrapped.metrics = metrics
	return wrapped
}
// Query runs query through QueryContext on the wrapped database, records how
// long the call took together with its error, and returns the rows and error
// unchanged.
//
// NOTE(review): the metric is always labelled operation "SELECT" and table
// "users", whatever SQL is passed in — confirm this is intended.
func (mdb *MetricsDB) Query(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) {
	began := time.Now()
	rows, err := mdb.db.QueryContext(ctx, query, args...)
	mdb.metrics.RecordQuery("SELECT", "users", time.Since(began), err)
	return rows, err
}
// Exec runs query through ExecContext on the wrapped database, records how
// long the call took together with its error, and returns the result and error
// unchanged.
//
// NOTE(review): the metric is always labelled operation "INSERT" and table
// "users", whatever SQL is passed in — confirm this is intended.
func (mdb *MetricsDB) Exec(ctx context.Context, query string, args ...interface{}) (sql.Result, error) {
	began := time.Now()
	result, err := mdb.db.ExecContext(ctx, query, args...)
	mdb.metrics.RecordQuery("INSERT", "users", time.Since(began), err)
	return result, err
}
告警规则配置 #
创建告警规则 #
创建 rules/alerts.yml
文件:
groups:
- name: application.rules
rules:
# HTTP 错误率告警
- alert: HighHTTPErrorRate
expr: |
(
sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (job)
/
sum(rate(http_requests_total[5m])) by (job)
) > 0.05
for: 2m
labels:
severity: critical
annotations:
summary: "High HTTP error rate detected"
description: "HTTP error rate is {{ $value | humanizePercentage }} for {{ $labels.job }}"
# 响应时间告警
- alert: HighResponseTime
expr: |
histogram_quantile(0.95,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)
) > 1.0
for: 5m
labels:
severity: warning
annotations:
summary: "High response time detected"
description: "95th percentile response time is {{ $value }}s for {{ $labels.job }}"
# High memory usage alert.
# The description uses humanize1024 (IEC prefixes, e.g. "1.5Gi") followed by a
# literal "B"; Prometheus templates have no humanizeBytes function.
- alert: HighMemoryUsage
  expr: memory_usage_bytes > 1073741824 # 1GB
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High memory usage detected"
    description: "Memory usage is {{ $value | humanize1024 }}B for {{ $labels.job }}"
# Database connection pool usage alert.
# database_connections carries a "state" label that database_connections_max
# does not have, so the division needs ignoring(state) for the two sides to
# match. Pools reporting a maximum of 0 are filtered out first to avoid
# dividing by zero.
- alert: DatabaseConnectionsHigh
  expr: database_connections{state="in_use"} / ignoring(state) (database_connections_max > 0) > 0.8
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Database connections usage high"
    description: "Database connections usage is {{ $value | humanizePercentage }}"
# 服务不可用告警
- alert: ServiceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Service is down"
description: "{{ $labels.job }} service is down"
- name: system.rules
rules:
# CPU 使用率告警
- alert: HighCPUUsage
expr: cpu_usage_percent > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage detected"
description: "CPU usage is {{ $value }}% for {{ $labels.instance }}"
# Goroutine 数量告警
- alert: HighGoroutineCount
expr: goroutines_count > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High goroutine count detected"
description: "Goroutine count is {{ $value }} for {{ $labels.job }}"
# 磁盘空间告警
- alert: DiskSpaceLow
expr: |
(
node_filesystem_avail_bytes{mountpoint="/"}
/
node_filesystem_size_bytes{mountpoint="/"}
) < 0.1
for: 5m
labels:
severity: critical
annotations:
summary: "Disk space low"
description: "Disk space is {{ $value | humanizePercentage }} available"
Alertmanager 配置 #
创建 alertmanager.yml
文件:
global:
smtp_smarthost: "localhost:587"
smtp_from: "alertmanager@example.com"
route:
group_by: ["alertname"]
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: "web.hook"
routes:
- match:
severity: critical
receiver: "critical-alerts"
- match:
severity: warning
receiver: "warning-alerts"
receivers:
- name: "web.hook"
webhook_configs:
- url: "http://localhost:5001/webhook"
- name: "critical-alerts"
email_configs:
- to: "oncall@example.com"
subject: "Critical Alert: {{ .GroupLabels.alertname }}"
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}
slack_configs:
- api_url: "YOUR_SLACK_WEBHOOK_URL"
channel: "#alerts"
title: "Critical Alert"
text: "{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}"
- name: "warning-alerts"
email_configs:
- to: "team@example.com"
subject: "Warning Alert: {{ .GroupLabels.alertname }}"
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
{{ end }}
inhibit_rules:
- source_match:
severity: "critical"
target_match:
severity: "warning"
equal: ["alertname", "instance"]
Grafana 可视化展示 #
数据源配置 #
创建 grafana/datasources/prometheus.yml 文件:
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true
仪表板配置 #
创建 grafana/dashboards/go-application.json 文件:
{
"dashboard": {
"id": null,
"title": "Go Application Metrics",
"tags": ["go", "application"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "HTTP Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total[5m])) by (job)",
"legendFormat": "{{ job }}"
}
],
"yAxes": [
{
"label": "Requests/sec"
}
]
},
{
"id": 2,
"title": "HTTP Error Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) by (job) / sum(rate(http_requests_total[5m])) by (job)",
"legendFormat": "{{ job }}"
}
],
"yAxes": [
{
"label": "Error Rate",
"max": 1,
"min": 0
}
]
},
{
"id": 3,
"title": "Response Time",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))",
"legendFormat": "95th percentile - {{ job }}"
},
{
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))",
"legendFormat": "50th percentile - {{ job }}"
}
],
"yAxes": [
{
"label": "Seconds"
}
]
},
{
"id": 4,
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "memory_usage_bytes",
"legendFormat": "{{ job }}"
}
],
"yAxes": [
{
"label": "Bytes"
}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}
完整的应用示例 #
package main
import (
	"context"
	"log"
	"net/http"
	"os"
	"os/signal"
	"runtime"
	"syscall"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// main wires the example service together: it creates the metrics collector,
// installs the Prometheus middleware, exposes /metrics, registers the user
// routes, starts the periodic system-metrics sampler and serves HTTP on :8080
// until SIGINT or SIGTERM arrives, then shuts down with a 30-second deadline.
func main() {
	// Create the metrics collector.
	collector := NewMetricsCollector("user-service")

	// Create the gin engine and instrument every request.
	r := gin.New()
	r.Use(PrometheusMiddleware(collector))

	// Expose the metrics endpoint.
	r.GET("/metrics", gin.WrapH(promhttp.Handler()))

	// Register the application routes.
	userHandler := NewUserHandler(collector)
	r.POST("/user", userHandler.CreateUser)
	r.GET("/user/:id", userHandler.GetUser)
	r.GET("/system/metrics", userHandler.GetSystemMetrics)

	// Sample system metrics in the background for the life of the process.
	go startSystemMetricsCollection(collector)

	server := &http.Server{
		Addr:    ":8080",
		Handler: r,
		// Bound the time a client may take to send request headers so that
		// slow or stalled connections cannot be held open indefinitely.
		ReadHeaderTimeout: 10 * time.Second,
	}

	// Serve in a goroutine so main can block on the shutdown signal.
	go func() {
		log.Println("Server starting on :8080")
		if err := server.ListenAndServe(); err != http.ErrServerClosed {
			log.Fatalf("Server failed to start: %v", err)
		}
	}()

	// Graceful shutdown: wait for SIGINT or SIGTERM.
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
	<-quit
	log.Println("Shutting down server...")

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	if err := server.Shutdown(ctx); err != nil {
		log.Printf("Server shutdown error: %v", err)
	}
	log.Println("Server stopped")
}
// startSystemMetricsCollection samples the Go runtime every 15 seconds and
// writes the sample into collector's system gauges. It loops forever, so it is
// meant to be run in its own goroutine.
func startSystemMetricsCollection(collector *MetricsCollector) {
	const interval = 15 * time.Second

	tick := time.NewTicker(interval)
	defer tick.Stop()

	var stats runtime.MemStats
	for {
		<-tick.C

		runtime.ReadMemStats(&stats)
		collector.UpdateSystemMetrics(
			float64(stats.Alloc),
			getCurrentCPUUsage(),
			runtime.NumGoroutine(),
		)
	}
}
小结 #
Prometheus 为 Go 应用提供了完整的监控解决方案。通过合理的指标设计、告警配置和可视化展示,我们可以构建一个全面的应用监控体系。结合 Grafana 的强大可视化能力和 Alertmanager 的灵活告警机制,Prometheus 成为云原生环境中监控系统的首选方案。
通过本章的学习,我们掌握了分布式追踪与监控的完整技术栈,为构建可观测的微服务架构奠定了坚实的基础。