5.7.3 Jaeger 链路追踪

5.7.3 Jaeger 链路追踪 #

Jaeger 是 Uber 开源的分布式追踪系统,现已成为 CNCF 的毕业项目。它提供了完整的分布式追踪解决方案,包括数据收集、存储、查询和可视化功能。

Jaeger 架构概览 #

Jaeger 采用微服务架构设计,主要包含以下组件:

  • Jaeger Client:客户端库,负责生成和发送追踪数据
  • Jaeger Agent:部署在应用节点的代理,接收客户端数据
  • Jaeger Collector:收集器,处理和存储追踪数据
  • Jaeger Query:查询服务,提供 API 和 Web UI
  • Storage Backend:存储后端,支持 Cassandra、Elasticsearch 等;Kafka 可作为 Collector 与存储之间的缓冲队列

Jaeger 部署 #

Docker Compose 部署 #

创建 docker-compose.yml 文件:

# Development stack: one Jaeger all-in-one container plus two sample services,
# all attached to the same bridge network.
version: "3.8"

services:
  # Jaeger all-in-one deployment (development environment)
  jaeger-all-in-one:
    image: jaegertracing/all-in-one:1.45
    ports:
      - "16686:16686" # Jaeger UI
      - "14268:14268" # HTTP collector
      - "14250:14250" # gRPC collector
      - "6831:6831/udp" # Jaeger agent UDP
      - "6832:6832/udp" # Jaeger agent UDP
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    networks:
      - jaeger-network

  # Sample applications
  # Both services are pointed at the collector's HTTP endpoint on port 14268.
  user-service:
    build: ./user-service
    ports:
      - "8080:8080"
    environment:
      - JAEGER_ENDPOINT=http://jaeger-all-in-one:14268/api/traces
      - SERVICE_NAME=user-service
    depends_on:
      - jaeger-all-in-one
    networks:
      - jaeger-network

  # order-service listens on container port 8080, published on host port 8081
  # so it does not clash with user-service on the host.
  order-service:
    build: ./order-service
    ports:
      - "8081:8080"
    environment:
      - JAEGER_ENDPOINT=http://jaeger-all-in-one:14268/api/traces
      - SERVICE_NAME=order-service
      - USER_SERVICE_URL=http://user-service:8080
    depends_on:
      - jaeger-all-in-one
      - user-service
    networks:
      - jaeger-network

networks:
  jaeger-network:
    driver: bridge

生产环境部署 #

生产环境建议使用分布式部署:

# Distributed deployment: Elasticsearch storage with separate Jaeger
# collector, query and agent containers.
version: "3.8"

services:
  # Elasticsearch storage
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
    environment:
      - discovery.type=single-node
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ports:
      - "9200:9200"
    volumes:
      - es_data:/usr/share/elasticsearch/data

  # Jaeger Collector
  jaeger-collector:
    image: jaegertracing/jaeger-collector:1.45
    ports:
      - "14268:14268"
      - "14250:14250"
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
      - ES_SERVER_URLS=http://elasticsearch:9200
    # depends_on only orders container start-up; it does not wait for
    # Elasticsearch to accept connections. Restart until storage is reachable.
    restart: on-failure
    depends_on:
      - elasticsearch

  # Jaeger Query
  jaeger-query:
    image: jaegertracing/jaeger-query:1.45
    ports:
      - "16686:16686"
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
      - ES_SERVER_URLS=http://elasticsearch:9200
    # Same start-up race with Elasticsearch as the collector.
    restart: on-failure
    depends_on:
      - elasticsearch

  # Jaeger Agent
  jaeger-agent:
    image: jaegertracing/jaeger-agent:1.45
    ports:
      - "6831:6831/udp"
      - "6832:6832/udp"
    environment:
      - REPORTER_GRPC_HOST_PORT=jaeger-collector:14250
    depends_on:
      - jaeger-collector

volumes:
  es_data:

Go 应用集成 Jaeger #

基础集成 #

package main

import (
    "context"
    "fmt"
    "log"
    "net/http"
    "time"

    "github.com/gin-gonic/gin"
    "go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/propagation"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
)

// JaegerConfig holds the settings InitJaeger uses to build a tracer provider.
type JaegerConfig struct {
    // ServiceName is reported as the service name resource attribute.
    ServiceName string
    // Environment is reported as the deployment environment resource attribute.
    Environment string
    // Endpoint is the collector URL handed to the Jaeger exporter.
    Endpoint    string
    // SampleRate selects the sampler: >= 1.0 samples every trace, <= 0.0 samples
    // none, anything in between is used as a trace-ID ratio.
    SampleRate  float64
}

// InitJaeger builds a tracer provider that exports spans to the Jaeger
// collector at config.Endpoint, installs it as the global provider and
// registers W3C trace-context plus baggage propagation.
//
// The caller owns the returned provider and should call Shutdown on it so
// buffered spans are flushed. An error is returned when the exporter or the
// resource cannot be created.
//
// NOTE(review): the go.opentelemetry.io/otel/exporters/jaeger module appears
// to be deprecated upstream in favour of OTLP exporters; confirm before
// adopting this in new code.
func InitJaeger(config JaegerConfig) (*trace.TracerProvider, error) {
    // Exporter that posts spans to the collector endpoint.
    collector := jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(config.Endpoint))
    exporter, err := jaeger.New(collector)
    if err != nil {
        return nil, fmt.Errorf("failed to create Jaeger exporter: %w", err)
    }

    // Resource attributes attached to every span from this process.
    // Version and instance ID are fixed sample values.
    identity := resource.WithAttributes(
        semconv.ServiceName(config.ServiceName),
        semconv.ServiceVersion("1.0.0"),
        semconv.DeploymentEnvironment(config.Environment),
        semconv.ServiceInstanceID("instance-1"),
    )
    res, err := resource.New(context.Background(), identity)
    if err != nil {
        return nil, fmt.Errorf("failed to create resource: %w", err)
    }

    // Pick the sampler from the configured rate.
    var sampler trace.Sampler
    switch {
    case config.SampleRate >= 1.0:
        sampler = trace.AlwaysSample()
    case config.SampleRate <= 0.0:
        sampler = trace.NeverSample()
    default:
        sampler = trace.TraceIDRatioBased(config.SampleRate)
    }

    // Spans are exported in batches of up to 512, at least every 5 seconds.
    batching := trace.WithBatcher(exporter,
        trace.WithBatchTimeout(5*time.Second),
        trace.WithMaxExportBatchSize(512),
    )
    provider := trace.NewTracerProvider(
        batching,
        trace.WithResource(res),
        trace.WithSampler(sampler),
    )

    // Install the provider and the propagators globally.
    propagator := propagation.NewCompositeTextMapPropagator(
        propagation.TraceContext{},
        propagation.Baggage{},
    )
    otel.SetTracerProvider(provider)
    otel.SetTextMapPropagator(propagator)

    return provider, nil
}

// main runs the user service and exits with a non-zero status on failure.
//
// The work lives in run so that its deferred tracer shutdown executes before
// log.Fatal terminates the process; log.Fatal exits without running defers.
func main() {
    if err := run(); err != nil {
        log.Fatal(err)
    }
}

// run initializes tracing, builds the Gin engine and serves HTTP on :8080.
// It returns when tracing cannot be initialized or when the HTTP server stops.
func run() error {
    // Initialize Jaeger.
    config := JaegerConfig{
        ServiceName: "user-service",
        Environment: "development",
        Endpoint:    "http://localhost:14268/api/traces",
        SampleRate:  1.0,
    }

    tp, err := InitJaeger(config)
    if err != nil {
        return fmt.Errorf("initializing Jaeger: %w", err)
    }
    defer func() {
        // Bound the final flush so a dead collector cannot hang the exit.
        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        defer cancel()
        if err := tp.Shutdown(ctx); err != nil {
            log.Printf("Error shutting down tracer provider: %v", err)
        }
    }()

    // Create the Gin engine.
    r := gin.New()

    // Install the OpenTelemetry middleware.
    r.Use(otelgin.Middleware("user-service"))

    // Register routes.
    setupRoutes(r)

    log.Println("User service starting on :8080")
    return http.ListenAndServe(":8080", r)
}

用户服务实现 #

package handlers

import (
    "context"
    "fmt"
    "net/http"
    "strconv"
    "time"

    "github.com/gin-gonic/gin"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "go.opentelemetry.io/otel/trace"
)

// UserHandler serves the user HTTP endpoints and traces each operation.
type UserHandler struct {
    // tracer starts the spans created by the handler methods.
    tracer trace.Tracer
    // db is the user store handle supplied to NewUserHandler.
    db     *UserDB
}

// NewUserHandler returns a UserHandler backed by db that obtains its tracer
// from the global provider under the name "user-handler".
func NewUserHandler(db *UserDB) *UserHandler {
    handler := new(UserHandler)
    handler.tracer = otel.Tracer("user-handler")
    handler.db = db
    return handler
}

// GetUser looks up the user named by the "id" path parameter.
//
// It answers 400 when the id is empty, 500 when the lookup fails, 404 when no
// user matches and 200 with the user as JSON otherwise. Every outcome is
// mirrored on the handler span.
func (h *UserHandler) GetUser(c *gin.Context) {
    // Open the handler span on top of the request context.
    ctx, span := h.tracer.Start(c.Request.Context(), "UserHandler.GetUser")
    defer span.End()

    id := c.Param("id")

    // Record the request inputs.
    span.SetAttributes(
        attribute.String("user.id", id),
        attribute.String("http.method", c.Request.Method),
        attribute.String("http.url", c.Request.URL.String()),
    )

    // Reject a missing id before querying.
    if id == "" {
        missing := fmt.Errorf("user ID is required")
        span.RecordError(missing)
        span.SetStatus(codes.Error, "invalid input")
        c.JSON(http.StatusBadRequest, gin.H{"error": "user ID is required"})
        return
    }

    // Query the user and handle the two failure outcomes.
    found, err := h.getUserFromDB(ctx, id)
    switch {
    case err != nil:
        span.RecordError(err)
        span.SetStatus(codes.Error, "database query failed")
        c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to get user"})
        return
    case found == nil:
        span.SetAttributes(attribute.Bool("user.found", false))
        span.SetStatus(codes.Error, "user not found")
        c.JSON(http.StatusNotFound, gin.H{"error": "user not found"})
        return
    }

    // Success: describe the user on the span and return it.
    span.SetAttributes(
        attribute.Bool("user.found", true),
        attribute.String("user.name", found.Name),
        attribute.String("user.email", found.Email),
    )
    span.SetStatus(codes.Ok, "user retrieved successfully")

    c.JSON(http.StatusOK, found)
}

// getUserFromDB simulates fetching a user by ID inside a "database.get_user" span.
//
// It returns an error when userID is not an integer, (nil, nil) when the
// numeric ID is outside 1..1000 (treated as "no such user"), and a synthetic
// user otherwise. No real database is contacted; the call sleeps for 50ms to
// imitate query latency.
func (h *UserHandler) getUserFromDB(ctx context.Context, userID string) (*User, error) {
    // The derived context is not needed: nothing below starts a child span.
    _, span := h.tracer.Start(ctx, "database.get_user")
    defer span.End()

    // Describe the simulated database operation.
    span.SetAttributes(
        attribute.String("db.system", "postgresql"),
        attribute.String("db.operation", "SELECT"),
        attribute.String("db.table", "users"),
        attribute.String("db.statement", "SELECT * FROM users WHERE id = $1"),
    )

    // Mark the start of the query.
    span.AddEvent("query.start", trace.WithAttributes(
        attribute.String("query.type", "user_lookup"),
        attribute.String("user.id", userID),
    ))

    // Simulated query latency.
    time.Sleep(50 * time.Millisecond)

    // Simulated result: only integer IDs can exist.
    id, err := strconv.Atoi(userID)
    if err != nil {
        span.RecordError(err)
        // RecordError does not change the span status, so set it explicitly.
        span.SetStatus(codes.Error, "invalid user ID")
        return nil, fmt.Errorf("invalid user ID: %w", err)
    }

    if id <= 0 || id > 1000 {
        span.AddEvent("query.complete", trace.WithAttributes(
            attribute.Int("rows.returned", 0),
        ))
        return nil, nil // user does not exist
    }

    user := &User{
        ID:   userID,
        Name: fmt.Sprintf("User %d", id),
        // The previous format string was garbled by e-mail obfuscation and
        // produced a fmt bad-index error instead of an address.
        Email: fmt.Sprintf("user%d@example.com", id),
    }

    span.AddEvent("query.complete", trace.WithAttributes(
        attribute.Int("rows.returned", 1),
    ))

    return user, nil
}

// CreateUser binds a CreateUserRequest from the JSON body, validates it and
// creates the user.
//
// It answers 400 when binding or validation fails, 500 when creation fails
// and 201 with the new user otherwise. Failures are recorded on the span.
func (h *UserHandler) CreateUser(c *gin.Context) {
    ctx, span := h.tracer.Start(c.Request.Context(), "UserHandler.CreateUser")
    defer span.End()

    // reject records the failure on the span and writes the error response.
    reject := func(cause error, status string, httpCode int, clientMsg string) {
        span.RecordError(cause)
        span.SetStatus(codes.Error, status)
        c.JSON(httpCode, gin.H{"error": clientMsg})
    }

    var body CreateUserRequest
    if err := c.ShouldBindJSON(&body); err != nil {
        reject(err, "invalid request body", http.StatusBadRequest, err.Error())
        return
    }

    // Record the request payload.
    span.SetAttributes(
        attribute.String("user.name", body.Name),
        attribute.String("user.email", body.Email),
    )

    // Validate the request.
    if err := h.validateCreateUserRequest(ctx, &body); err != nil {
        reject(err, "validation failed", http.StatusBadRequest, err.Error())
        return
    }

    // Create the user.
    created, err := h.createUserInDB(ctx, &body)
    if err != nil {
        reject(err, "user creation failed", http.StatusInternalServerError, "failed to create user")
        return
    }

    span.SetAttributes(attribute.String("user.id", created.ID))
    span.SetStatus(codes.Ok, "user created successfully")

    c.JSON(http.StatusCreated, created)
}

// validateCreateUserRequest checks that req has a name and an e-mail address
// and that the address is not already taken.
//
// It returns nil when the request is acceptable and a descriptive error
// otherwise. Every failure is recorded on the "validation.create_user" span
// and the span status is set to Error.
func (h *UserHandler) validateCreateUserRequest(ctx context.Context, req *CreateUserRequest) error {
    ctx, span := h.tracer.Start(ctx, "validation.create_user")
    defer span.End()

    // fail records err and marks the span as failed. RecordError alone leaves
    // the span status unset, which would hide failed validations.
    fail := func(err error) error {
        span.RecordError(err)
        span.SetStatus(codes.Error, err.Error())
        return err
    }

    if req.Name == "" {
        return fail(fmt.Errorf("name is required"))
    }

    if req.Email == "" {
        return fail(fmt.Errorf("email is required"))
    }

    // Check whether the e-mail address is already registered.
    exists, err := h.checkEmailExists(ctx, req.Email)
    if err != nil {
        return fail(err)
    }

    if exists {
        return fail(fmt.Errorf("email already exists"))
    }

    span.SetStatus(codes.Ok, "validation passed")
    return nil
}

// checkEmailExists simulates a lookup that reports whether email is already
// registered, inside a "database.check_email" span.
//
// No database is contacted: the call sleeps for 30ms and treats exactly one
// sample address as taken. The returned error is always nil.
func (h *UserHandler) checkEmailExists(ctx context.Context, email string) (bool, error) {
    // The derived context is not needed: nothing below starts a child span.
    _, span := h.tracer.Start(ctx, "database.check_email")
    defer span.End()

    span.SetAttributes(
        attribute.String("db.operation", "SELECT"),
        attribute.String("db.statement", "SELECT COUNT(*) FROM users WHERE email = $1"),
        attribute.String("email", email),
    )

    // Simulated query latency.
    time.Sleep(30 * time.Millisecond)

    // Simple simulation: a single sample address counts as already taken.
    // The previous literal was garbled by e-mail obfuscation and could never
    // match a valid address.
    // NOTE(review): the original sample address is unknown; confirm this value.
    exists := email == "existing@example.com"

    span.SetAttributes(attribute.Bool("email.exists", exists))
    return exists, nil
}

// createUserInDB simulates inserting a user inside a "database.create_user"
// span. It sleeps for 80ms, builds a User whose ID is the current Unix time in
// seconds, and always returns a nil error.
func (h *UserHandler) createUserInDB(ctx context.Context, req *CreateUserRequest) (*User, error) {
    _, span := h.tracer.Start(ctx, "database.create_user")
    defer span.End()

    span.SetAttributes(
        attribute.String("db.operation", "INSERT"),
        attribute.String("db.statement", "INSERT INTO users (name, email) VALUES ($1, $2) RETURNING id"),
    )

    // Simulated insert latency.
    time.Sleep(80 * time.Millisecond)

    created := new(User)
    created.ID = strconv.FormatInt(time.Now().Unix(), 10)
    created.Name = req.Name
    created.Email = req.Email

    createdAttrs := trace.WithAttributes(attribute.String("user.id", created.ID))
    span.AddEvent("user.created", createdAttrs)

    return created, nil
}

// Data structure definitions.

// User is the JSON representation of a user.
type User struct {
    ID    string `json:"id"`
    Name  string `json:"name"`
    Email string `json:"email"`
}

// CreateUserRequest is the JSON body bound by CreateUser. Both fields carry
// the "required" binding rule; Email additionally carries the "email" rule.
type CreateUserRequest struct {
    Name  string `json:"name" binding:"required"`
    Email string `json:"email" binding:"required,email"`
}

// UserDB is a placeholder for the user data store.
type UserDB struct {
    // Database connection, etc.
}

订单服务调用用户服务 #

package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "net/url"
    "time"

    "github.com/gin-gonic/gin"
    "go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin"
    "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "go.opentelemetry.io/otel/trace"
)

// OrderService serves the order endpoints and calls the user service over HTTP.
type OrderService struct {
    // tracer starts the spans created by the service methods.
    tracer         trace.Tracer
    // httpClient performs the outbound calls to the user service.
    httpClient     *http.Client
    // userServiceURL is the base URL the user lookup path is appended to.
    userServiceURL string
}

// NewOrderService returns an OrderService that reaches the user service at
// userServiceURL. Its HTTP client wraps the default transport with otelhttp
// and uses a 30 second timeout.
func NewOrderService(userServiceURL string) *OrderService {
    // Transport instrumented for tracing.
    tracedTransport := otelhttp.NewTransport(http.DefaultTransport)

    return &OrderService{
        tracer: otel.Tracer("order-service"),
        httpClient: &http.Client{
            Transport: tracedTransport,
            Timeout:   30 * time.Second,
        },
        userServiceURL: userServiceURL,
    }
}

// CreateOrder binds a CreateOrderRequest from the JSON body, checks the user
// through the user service and creates the order.
//
// It answers 400 when binding or the user lookup fails, 500 when order
// creation fails and 201 with the order otherwise. Failures are recorded on
// the span.
func (s *OrderService) CreateOrder(c *gin.Context) {
    ctx, span := s.tracer.Start(c.Request.Context(), "OrderService.CreateOrder")
    defer span.End()

    // reject records the failure on the span and writes the error response.
    reject := func(cause error, status string, httpCode int, clientMsg string) {
        span.RecordError(cause)
        span.SetStatus(codes.Error, status)
        c.JSON(httpCode, gin.H{"error": clientMsg})
    }

    var body CreateOrderRequest
    if err := c.ShouldBindJSON(&body); err != nil {
        reject(err, "invalid request", http.StatusBadRequest, err.Error())
        return
    }

    // Record the order details.
    span.SetAttributes(
        attribute.String("order.user_id", body.UserID),
        attribute.String("order.product_id", body.ProductID),
        attribute.Int("order.quantity", body.Quantity),
        attribute.Float64("order.amount", body.Amount),
    )

    // Make sure the user exists.
    buyer, err := s.getUserInfo(ctx, body.UserID)
    if err != nil {
        reject(err, "user validation failed", http.StatusBadRequest, "invalid user")
        return
    }

    // Create the order.
    placed, err := s.createOrder(ctx, &body, buyer)
    if err != nil {
        reject(err, "order creation failed", http.StatusInternalServerError, "failed to create order")
        return
    }

    span.SetAttributes(attribute.String("order.id", placed.ID))
    span.SetStatus(codes.Ok, "order created successfully")

    c.JSON(http.StatusCreated, placed)
}

// getUserInfo fetches a user from the user service with GET
// {userServiceURL}/user/{userID} inside an "http_client.get_user" span.
//
// userID is path-escaped before being placed in the URL because it comes from
// the request body. It returns an error when the request cannot be built or
// sent, when the service answers with anything other than 200, or when the
// response body cannot be decoded. Every failure is recorded on the span and
// the span status is set to Error.
func (s *OrderService) getUserInfo(ctx context.Context, userID string) (*User, error) {
    ctx, span := s.tracer.Start(ctx, "http_client.get_user")
    defer span.End()

    // fail records err and marks the span as failed. RecordError alone leaves
    // the span status unset.
    fail := func(err error) (*User, error) {
        span.RecordError(err)
        span.SetStatus(codes.Error, err.Error())
        return nil, err
    }

    // Escape the ID so characters such as "/" or "?" cannot alter the path.
    endpoint := fmt.Sprintf("%s/user/%s", s.userServiceURL, url.PathEscape(userID))

    span.SetAttributes(
        attribute.String("http.method", http.MethodGet),
        attribute.String("http.url", endpoint),
        attribute.String("service.name", "user-service"),
    )

    req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
    if err != nil {
        return fail(err)
    }

    // Send the request; the instrumented transport creates the child span.
    resp, err := s.httpClient.Do(req)
    if err != nil {
        return fail(err)
    }
    defer resp.Body.Close()

    span.SetAttributes(attribute.Int("http.status_code", resp.StatusCode))

    switch resp.StatusCode {
    case http.StatusOK:
        // Continue to decoding.
    case http.StatusNotFound:
        return fail(fmt.Errorf("user not found"))
    default:
        return fail(fmt.Errorf("HTTP error: %d", resp.StatusCode))
    }

    var user User
    if err := json.NewDecoder(resp.Body).Decode(&user); err != nil {
        return fail(err)
    }

    span.AddEvent("user.retrieved", trace.WithAttributes(
        attribute.String("user.name", user.Name),
        attribute.String("user.email", user.Email),
    ))

    return &user, nil
}

// createOrder simulates inserting an order inside a "database.create_order"
// span. It sleeps for 100ms, builds an Order with status "created" whose ID is
// "order_" followed by the current Unix time in seconds, and always returns a
// nil error.
func (s *OrderService) createOrder(ctx context.Context, req *CreateOrderRequest, user *User) (*Order, error) {
    _, span := s.tracer.Start(ctx, "database.create_order")
    defer span.End()

    span.SetAttributes(
        attribute.String("db.operation", "INSERT"),
        attribute.String("db.table", "orders"),
    )

    // Simulated order creation latency.
    time.Sleep(100 * time.Millisecond)

    placed := new(Order)
    placed.ID = fmt.Sprintf("order_%d", time.Now().Unix())
    placed.UserID = req.UserID
    placed.UserName = user.Name
    placed.ProductID = req.ProductID
    placed.Quantity = req.Quantity
    placed.Amount = req.Amount
    placed.Status = "created"
    placed.CreatedAt = time.Now()

    createdAttrs := trace.WithAttributes(
        attribute.String("order.id", placed.ID),
        attribute.String("order.status", placed.Status),
    )
    span.AddEvent("order.created", createdAttrs)

    return placed, nil
}

// Data structures.

// CreateOrderRequest is the JSON body bound by CreateOrder. Every field
// carries the "required" binding rule; Quantity also has min=1 and Amount
// min=0.
// NOTE(review): "required" on Amount may reject a zero amount despite min=0;
// confirm against the validator's behaviour.
type CreateOrderRequest struct {
    UserID    string  `json:"user_id" binding:"required"`
    ProductID string  `json:"product_id" binding:"required"`
    Quantity  int     `json:"quantity" binding:"required,min=1"`
    Amount    float64 `json:"amount" binding:"required,min=0"`
}

// Order is the JSON representation of a created order.
type Order struct {
    ID        string    `json:"id"`
    UserID    string    `json:"user_id"`
    UserName  string    `json:"user_name"`
    ProductID string    `json:"product_id"`
    Quantity  int       `json:"quantity"`
    Amount    float64   `json:"amount"`
    Status    string    `json:"status"`
    CreatedAt time.Time `json:"created_at"`
}

// main wires up the order service and serves it on :8080.
func main() {
    // Jaeger initialization is omitted here; it mirrors the user service.

    // Order service that calls the user service.
    svc := NewOrderService("http://user-service:8080")

    // Gin engine with the OpenTelemetry middleware installed.
    router := gin.New()
    router.Use(otelgin.Middleware("order-service"))

    // Routes.
    router.POST("/order", svc.CreateOrder)

    log.Println("Order service starting on :8080")
    log.Fatal(http.ListenAndServe(":8080", router))
}

链路数据分析 #

Jaeger UI 使用 #

  1. 访问 Jaeger UI:打开浏览器访问 http://localhost:16686

  2. 搜索追踪

    • 选择服务名称(如 order-service)
    • 设置时间范围
    • 添加标签过滤(如 user.id=123)
    • 点击 “Find Traces” 搜索
  3. 分析追踪详情

    • 点击具体的 Trace 查看详细信息
    • 查看 Span 的时间线和依赖关系
    • 检查 Span 的标签、日志和错误信息

性能分析 #

package analysis

import (
    "context"
    "fmt"
    "time"

    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
)

// PerformanceAnalyzer records performance observations as spans and span events.
type PerformanceAnalyzer struct {
    // tracer starts the spans created by the analyzer methods.
    tracer trace.Tracer
}

// NewPerformanceAnalyzer returns an analyzer whose tracer comes from the
// global provider under the name "performance-analyzer".
func NewPerformanceAnalyzer() *PerformanceAnalyzer {
    analyzer := new(PerformanceAnalyzer)
    analyzer.tracer = otel.Tracer("performance-analyzer")
    return analyzer
}

// AnalyzeSlowQuery records a query and its duration on a
// "performance.slow_query_analysis" span.
//
// Durations above 100ms are flagged as slow and produce a warning event;
// durations above 500ms produce a critical event instead.
func (p *PerformanceAnalyzer) AnalyzeSlowQuery(ctx context.Context, query string, duration time.Duration) {
    const (
        slowAfter     = 100 * time.Millisecond
        criticalAfter = 500 * time.Millisecond
    )

    _, span := p.tracer.Start(ctx, "performance.slow_query_analysis")
    defer span.End()

    durationMS := float64(duration.Nanoseconds()) / 1e6
    isSlow := duration > slowAfter

    span.SetAttributes(
        attribute.String("db.statement", query),
        attribute.Float64("query.duration_ms", durationMS),
        attribute.Bool("query.slow", isSlow),
    )

    // The critical case is checked first so a query gets exactly one event.
    switch {
    case duration > criticalAfter:
        span.AddEvent("critical_slow_query", trace.WithAttributes(
            attribute.String("severity", "critical"),
            attribute.String("recommendation", "consider adding index or optimizing query"),
        ))
    case isSlow:
        span.AddEvent("slow_query_detected", trace.WithAttributes(
            attribute.String("severity", "warning"),
            attribute.String("recommendation", "monitor query performance"),
        ))
    }
}

// TrackMemoryUsage records the memory footprint before and after operation on
// a "performance.memory_usage" span.
//
// memory.allocated_bytes is the signed difference memAfter - memBefore, so it
// is negative when memory was released. A "high_memory_allocation" event is
// added only when usage grew by more than 10 MiB.
func (p *PerformanceAnalyzer) TrackMemoryUsage(ctx context.Context, operation string, memBefore, memAfter uint64) {
    const highAllocationBytes = 10 * 1024 * 1024 // 10MB

    _, span := p.tracer.Start(ctx, "performance.memory_usage")
    defer span.End()

    // Compute the delta as a signed value. Subtracting the unsigned inputs
    // wraps to a huge number when memAfter < memBefore, which used to trigger
    // the high-allocation warning whenever memory shrank.
    allocated := int64(memAfter) - int64(memBefore)

    span.SetAttributes(
        attribute.String("operation", operation),
        attribute.Int64("memory.before_bytes", int64(memBefore)),
        attribute.Int64("memory.after_bytes", int64(memAfter)),
        attribute.Int64("memory.allocated_bytes", allocated),
    )

    if allocated > highAllocationBytes {
        span.AddEvent("high_memory_allocation", trace.WithAttributes(
            attribute.String("severity", "warning"),
            attribute.String("recommendation", "check for memory leaks"),
        ))
    }
}

错误追踪和告警 #

package monitoring

import (
    "context"
    "fmt"
    "time"

    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "go.opentelemetry.io/otel/trace"
)

// ErrorTracker records errors as dedicated spans.
type ErrorTracker struct {
    // tracer starts the spans created by the tracker methods.
    tracer trace.Tracer
}

// NewErrorTracker returns a tracker whose tracer comes from the global
// provider under the name "error-tracker".
func NewErrorTracker() *ErrorTracker {
    tracker := new(ErrorTracker)
    tracker.tracer = otel.Tracer("error-tracker")
    return tracker
}

// TrackError records err on an "error.tracking" span together with its
// severity and any caller-supplied metadata.
//
// A nil err is ignored and no span is created. Metadata values of type
// string, int, int64, float64 and bool keep their type; any other value is
// stored as its fmt.Sprint representation. The span status is always Error;
// severity "critical" additionally adds a "critical_error" event.
func (e *ErrorTracker) TrackError(ctx context.Context, err error, severity string, metadata map[string]interface{}) {
    // Nothing to track; err.Error() below would panic on a nil error.
    if err == nil {
        return
    }

    _, span := e.tracer.Start(ctx, "error.tracking")
    defer span.End()

    // Basic error information.
    span.SetAttributes(
        attribute.String("error.type", fmt.Sprintf("%T", err)),
        attribute.String("error.message", err.Error()),
        attribute.String("error.severity", severity),
        attribute.String("error.timestamp", time.Now().Format(time.RFC3339)),
    )

    // Attach metadata in a second call so it is applied after the base
    // attributes, as before.
    extra := make([]attribute.KeyValue, 0, len(metadata))
    for key, value := range metadata {
        switch v := value.(type) {
        case string:
            extra = append(extra, attribute.String(key, v))
        case int:
            extra = append(extra, attribute.Int(key, v))
        case int64:
            extra = append(extra, attribute.Int64(key, v))
        case float64:
            extra = append(extra, attribute.Float64(key, v))
        case bool:
            extra = append(extra, attribute.Bool(key, v))
        default:
            // Keep unsupported types visible instead of silently dropping them.
            extra = append(extra, attribute.String(key, fmt.Sprint(v)))
        }
    }
    span.SetAttributes(extra...)

    // Record the error itself.
    span.RecordError(err)

    // Set the status according to severity.
    switch severity {
    case "critical":
        span.SetStatus(codes.Error, "critical error occurred")
        span.AddEvent("critical_error", trace.WithAttributes(
            attribute.String("action", "immediate_attention_required"),
        ))
    case "warning":
        span.SetStatus(codes.Error, "warning level error")
    default:
        span.SetStatus(codes.Error, "error occurred")
    }
}

// TrackBusinessError records a business-rule failure for userID on a
// "business_error.tracking" span: the code and message become span attributes
// and a "business_error_occurred" event is added. The span status is left
// untouched.
func (e *ErrorTracker) TrackBusinessError(ctx context.Context, errorCode, errorMessage string, userID string) {
    _, span := e.tracer.Start(ctx, "business_error.tracking")
    defer span.End()

    span.SetAttributes(
        attribute.String("business_error.code", errorCode),
        attribute.String("business_error.message", errorMessage),
        attribute.String("user.id", userID),
        attribute.String("error.category", "business_logic"),
    )

    occurred := trace.WithAttributes(
        attribute.String("error_code", errorCode),
        attribute.String("user_id", userID),
    )
    span.AddEvent("business_error_occurred", occurred)
}

故障排查实践 #

分布式追踪故障排查流程 #

package troubleshooting

import (
    "context"
    "fmt"
    "time"

    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
)

// TroubleshootingGuide walks through diagnostic steps and records each one as a span.
type TroubleshootingGuide struct {
    // tracer starts the spans created by the diagnostic methods.
    tracer trace.Tracer
}

// NewTroubleshootingGuide returns a guide whose tracer comes from the global
// provider under the name "troubleshooting".
func NewTroubleshootingGuide() *TroubleshootingGuide {
    guide := new(TroubleshootingGuide)
    guide.tracer = otel.Tracer("troubleshooting")
    return guide
}

// DiagnoseSlowRequest records a slow-request investigation for traceID on a
// "troubleshooting.slow_request" span. It notes the threshold in milliseconds
// and then analyzes a fixed list of services, each in its own child span.
func (t *TroubleshootingGuide) DiagnoseSlowRequest(ctx context.Context, traceID string, threshold time.Duration) {
    ctx, span := t.tracer.Start(ctx, "troubleshooting.slow_request")
    defer span.End()

    thresholdMS := float64(threshold.Nanoseconds()) / 1e6
    span.SetAttributes(
        attribute.String("trace.id", traceID),
        attribute.Float64("threshold_ms", thresholdMS),
    )

    // Simulated analysis.
    span.AddEvent("analysis.start", trace.WithAttributes(
        attribute.String("analysis.type", "slow_request"),
    ))

    // Check the response time of each service in turn.
    for _, name := range [...]string{"user-service", "order-service", "payment-service"} {
        t.analyzeServicePerformance(ctx, name, traceID)
    }

    span.AddEvent("analysis.complete")
}

// analyzeServicePerformance simulates analyzing one service for traceID in a
// span named "analysis.<serviceName>". It sleeps for 10ms and reports a
// canned database-timeout issue when serviceName is "payment-service".
func (t *TroubleshootingGuide) analyzeServicePerformance(ctx context.Context, serviceName, traceID string) {
    spanName := fmt.Sprintf("analysis.%s", serviceName)
    _, span := t.tracer.Start(ctx, spanName)
    defer span.End()

    span.SetAttributes(
        attribute.String("service.name", serviceName),
        attribute.String("trace.id", traceID),
    )

    // Simulated analysis time.
    time.Sleep(10 * time.Millisecond)

    // Only the payment service has a simulated finding.
    if serviceName != "payment-service" {
        return
    }

    span.AddEvent("performance_issue_detected", trace.WithAttributes(
        attribute.String("issue.type", "database_timeout"),
        attribute.String("issue.description", "payment database connection timeout"),
        attribute.String("recommendation", "check database connection pool settings"),
    ))
}

// DiagnoseErrorRate records serviceName's error rate on a
// "troubleshooting.error_rate" span. When the rate is above 5% it adds a
// critical event and analyzes the error patterns in a child span.
func (t *TroubleshootingGuide) DiagnoseErrorRate(ctx context.Context, serviceName string, errorRate float64) {
    // Error-rate threshold: 5%.
    const maxErrorRate = 0.05

    ctx, span := t.tracer.Start(ctx, "troubleshooting.error_rate")
    defer span.End()

    span.SetAttributes(
        attribute.String("service.name", serviceName),
        attribute.Float64("error.rate", errorRate),
    )

    if errorRate > maxErrorRate {
        alert := trace.WithAttributes(
            attribute.String("severity", "critical"),
            attribute.String("action", "investigate_error_patterns"),
        )
        span.AddEvent("high_error_rate_detected", alert)

        // Analyze the error patterns.
        t.analyzeErrorPatterns(ctx, serviceName)
    }
}

// analyzeErrorPatterns simulates an error-pattern analysis for serviceName on
// an "analysis.error_patterns" span, adding one event per canned error type
// with a fixed count of 10.
func (t *TroubleshootingGuide) analyzeErrorPatterns(ctx context.Context, serviceName string) {
    _, span := t.tracer.Start(ctx, "analysis.error_patterns")
    defer span.End()

    span.SetAttributes(attribute.String("service.name", serviceName))

    // Simulated error types.
    kinds := [...]string{"database_connection", "timeout", "validation_error"}
    for i := 0; i < len(kinds); i++ {
        analyzed := trace.WithAttributes(
            attribute.String("error.type", kinds[i]),
            attribute.Int("error.count", 10), // simulated error count
        )
        span.AddEvent("error_pattern_analyzed", analyzed)
    }
}

小结 #

Jaeger 为分布式系统提供了强大的链路追踪能力。通过合理的部署架构和 Go 应用集成,我们可以获得完整的请求生命周期视图,快速定位性能瓶颈和故障根因。结合性能分析和错误追踪功能,Jaeger 成为微服务架构中不可或缺的可观测性工具。

在下一节中,我们将学习如何使用 Prometheus 构建完整的监控体系。