5.7.3 Jaeger 链路追踪 #
Jaeger 是 Uber 开源的分布式追踪系统,现已成为 CNCF 的毕业项目。它提供了完整的分布式追踪解决方案,包括数据收集、存储、查询和可视化功能。
Jaeger 架构概览 #
Jaeger 采用微服务架构设计,主要包含以下组件:
- Jaeger Client:客户端库,负责生成和发送追踪数据
- Jaeger Agent:部署在应用节点的代理,接收客户端数据
- Jaeger Collector:收集器,处理和存储追踪数据
- Jaeger Query:查询服务,提供 API 和 Web UI
- Storage Backend:存储后端,支持 Cassandra、Elasticsearch 等;Kafka 并不是最终存储,而是可以部署在 Collector 与存储后端之间的缓冲队列
Jaeger 部署 #
Docker Compose 部署 #
创建 docker-compose.yml 文件:
# Development stack: Jaeger all-in-one plus the two sample services,
# all attached to one bridge network.
version: "3.8"

services:
  # Jaeger all-in-one deployment (development only)
  jaeger-all-in-one:
    image: jaegertracing/all-in-one:1.45
    ports:
      - "16686:16686" # Jaeger UI
      - "14268:14268" # HTTP collector
      - "14250:14250" # gRPC collector
      - "6831:6831/udp" # Jaeger agent UDP
      - "6832:6832/udp" # Jaeger agent UDP
      # OTLP receivers are switched on by COLLECTOR_OTLP_ENABLED below; publish
      # them so processes running on the host can reach them as well.
      - "4317:4317" # OTLP gRPC
      - "4318:4318" # OTLP HTTP
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    networks:
      - jaeger-network

  # Sample applications
  user-service:
    build: ./user-service
    ports:
      - "8080:8080"
    environment:
      - JAEGER_ENDPOINT=http://jaeger-all-in-one:14268/api/traces
      - SERVICE_NAME=user-service
    depends_on:
      - jaeger-all-in-one
    networks:
      - jaeger-network

  order-service:
    build: ./order-service
    ports:
      - "8081:8080"
    environment:
      - JAEGER_ENDPOINT=http://jaeger-all-in-one:14268/api/traces
      - SERVICE_NAME=order-service
      - USER_SERVICE_URL=http://user-service:8080
    depends_on:
      - jaeger-all-in-one
      - user-service
    networks:
      - jaeger-network

networks:
  jaeger-network:
    driver: bridge
生产环境部署 #
生产环境建议使用分布式部署:
# Distributed deployment: Elasticsearch storage with separate Jaeger
# collector, query and agent containers.
version: "3.8"

services:
  # Elasticsearch storage
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
    environment:
      - discovery.type=single-node
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ports:
      - "9200:9200"
    volumes:
      - es_data:/usr/share/elasticsearch/data

  # Jaeger Collector
  jaeger-collector:
    image: jaegertracing/jaeger-collector:1.45
    ports:
      - "14268:14268"
      - "14250:14250"
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
      - ES_SERVER_URLS=http://elasticsearch:9200
    depends_on:
      - elasticsearch
    # depends_on only orders container start-up; it does not wait for
    # Elasticsearch to accept connections. Restart until storage is reachable.
    restart: on-failure

  # Jaeger Query
  jaeger-query:
    image: jaegertracing/jaeger-query:1.45
    ports:
      - "16686:16686"
    environment:
      - SPAN_STORAGE_TYPE=elasticsearch
      - ES_SERVER_URLS=http://elasticsearch:9200
    depends_on:
      - elasticsearch
    # Same start-up race with Elasticsearch as the collector.
    restart: on-failure

  # Jaeger Agent
  jaeger-agent:
    image: jaegertracing/jaeger-agent:1.45
    ports:
      - "6831:6831/udp"
      - "6832:6832/udp"
    environment:
      - REPORTER_GRPC_HOST_PORT=jaeger-collector:14250
    depends_on:
      - jaeger-collector

volumes:
  es_data:
Go 应用集成 Jaeger #
基础集成 #
package main
import (
"context"
"fmt"
"log"
"net/http"
"time"
"github.com/gin-gonic/gin"
"go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/exporters/jaeger"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
"go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
)
// JaegerConfig holds the settings InitJaeger uses to build a tracer provider.
type JaegerConfig struct {
	ServiceName string  // reported as the service name resource attribute
	Environment string  // reported as the deployment environment resource attribute
	Endpoint    string  // collector endpoint handed to the Jaeger exporter
	SampleRate  float64 // >= 1.0 selects always-sample, <= 0.0 never-sample, otherwise trace-ID ratio sampling
}
// InitJaeger builds a TracerProvider that batches spans to the Jaeger
// collector at config.Endpoint and installs it, together with the
// TraceContext and Baggage propagators, as the global OpenTelemetry default.
//
// It returns the provider so the caller can shut it down (which flushes
// buffered spans). On error nothing has been installed globally.
func InitJaeger(config JaegerConfig) (*trace.TracerProvider, error) {
	// Build the resource before the exporter so that a resource failure can
	// never leave an already-created exporter behind without a shutdown.
	res, err := resource.New(context.Background(),
		resource.WithAttributes(
			semconv.ServiceName(config.ServiceName),
			semconv.ServiceVersion("1.0.0"),
			semconv.DeploymentEnvironment(config.Environment),
			semconv.ServiceInstanceID("instance-1"),
		),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create resource: %w", err)
	}

	// Exporter that posts spans to the Jaeger collector endpoint.
	exp, err := jaeger.New(jaeger.WithCollectorEndpoint(
		jaeger.WithEndpoint(config.Endpoint),
	))
	if err != nil {
		return nil, fmt.Errorf("failed to create Jaeger exporter: %w", err)
	}

	// Pick the sampler applied to root spans from the configured rate.
	var root trace.Sampler
	switch {
	case config.SampleRate >= 1.0:
		root = trace.AlwaysSample()
	case config.SampleRate <= 0.0:
		root = trace.NeverSample()
	default:
		root = trace.TraceIDRatioBased(config.SampleRate)
	}
	// Honour the caller's sampling decision for spans that have a parent, so
	// a trace crossing several services is kept or dropped as a whole instead
	// of each service deciding independently.
	sampler := trace.ParentBased(root)

	tp := trace.NewTracerProvider(
		trace.WithBatcher(exp,
			trace.WithBatchTimeout(5*time.Second),
			trace.WithMaxExportBatchSize(512),
		),
		trace.WithResource(res),
		trace.WithSampler(sampler),
	)

	// Register the provider and propagators globally.
	otel.SetTracerProvider(tp)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))
	return tp, nil
}
// main runs the user service and exits with a non-zero status on failure.
func main() {
	if err := runUserService(); err != nil {
		log.Fatal(err)
	}
}

// runUserService initialises tracing, builds the Gin router and serves on
// :8080 until the listener fails.
//
// It returns the error instead of calling log.Fatal itself: log.Fatal exits
// the process without running deferred functions, which would skip the
// tracer-provider shutdown below and drop any spans still in the batch buffer.
func runUserService() error {
	config := JaegerConfig{
		ServiceName: "user-service",
		Environment: "development",
		Endpoint:    "http://localhost:14268/api/traces",
		SampleRate:  1.0,
	}
	tp, err := InitJaeger(config)
	if err != nil {
		return fmt.Errorf("failed to initialize Jaeger: %w", err)
	}
	defer func() {
		// Bound the final flush so an unreachable collector cannot hang exit.
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := tp.Shutdown(ctx); err != nil {
			log.Printf("Error shutting down tracer provider: %v", err)
		}
	}()

	r := gin.New()
	// OpenTelemetry middleware for incoming requests.
	r.Use(otelgin.Middleware("user-service"))
	setupRoutes(r)

	srv := &http.Server{
		Addr:    ":8080",
		Handler: r,
		// Limit how long a client may take to send request headers.
		ReadHeaderTimeout: 10 * time.Second,
	}
	log.Println("User service starting on :8080")
	return srv.ListenAndServe()
}
用户服务实现 #
package handlers
import (
"context"
"fmt"
"net/http"
"strconv"
"time"
"github.com/gin-gonic/gin"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)
// UserHandler serves the user HTTP endpoints and creates a span for each
// step it performs.
type UserHandler struct {
	tracer trace.Tracer // used to start every span created by the handler
	db     *UserDB      // data store handle supplied to NewUserHandler
}
// NewUserHandler returns a UserHandler that stores db and obtains its tracer
// from the global provider under the name "user-handler".
func NewUserHandler(db *UserDB) *UserHandler {
	handler := new(UserHandler)
	handler.tracer = otel.Tracer("user-handler")
	handler.db = db
	return handler
}
// GetUser handles a lookup of the user named by the "id" path parameter.
//
// It replies 400 when the id is empty, 500 when the lookup returns an error,
// 404 when no user is found and 200 with the user as JSON otherwise. Each
// outcome is reflected in the span's attributes and status.
func (h *UserHandler) GetUser(c *gin.Context) {
	userID := c.Param("id")

	ctx, span := h.tracer.Start(c.Request.Context(), "UserHandler.GetUser")
	defer span.End()

	// Request inputs.
	span.SetAttributes(
		attribute.String("user.id", userID),
		attribute.String("http.method", c.Request.Method),
		attribute.String("http.url", c.Request.URL.String()),
	)

	// fail marks the span as failed (recording cause when there is one) and
	// writes the JSON error response.
	fail := func(cause error, status string, httpCode int, msg string) {
		if cause != nil {
			span.RecordError(cause)
		}
		span.SetStatus(codes.Error, status)
		c.JSON(httpCode, gin.H{"error": msg})
	}

	if userID == "" {
		fail(fmt.Errorf("user ID is required"), "invalid input",
			http.StatusBadRequest, "user ID is required")
		return
	}

	user, err := h.getUserFromDB(ctx, userID)
	switch {
	case err != nil:
		fail(err, "database query failed",
			http.StatusInternalServerError, "failed to get user")
	case user == nil:
		span.SetAttributes(attribute.Bool("user.found", false))
		fail(nil, "user not found", http.StatusNotFound, "user not found")
	default:
		span.SetAttributes(
			attribute.Bool("user.found", true),
			attribute.String("user.name", user.Name),
			attribute.String("user.email", user.Email),
		)
		span.SetStatus(codes.Ok, "user retrieved successfully")
		c.JSON(http.StatusOK, user)
	}
}
// getUserFromDB simulates a user lookup inside a "database.get_user" span.
//
// userID must parse as an integer; otherwise an error wrapping the parse
// failure is returned. IDs outside 1..1000 yield (nil, nil), meaning "no such
// user". Any other ID yields a synthetic user derived from the number.
func (h *UserHandler) getUserFromDB(ctx context.Context, userID string) (*User, error) {
	_, span := h.tracer.Start(ctx, "database.get_user")
	defer span.End()

	// Describe the (simulated) database operation.
	span.SetAttributes(
		attribute.String("db.system", "postgresql"),
		attribute.String("db.operation", "SELECT"),
		attribute.String("db.table", "users"),
		attribute.String("db.statement", "SELECT * FROM users WHERE id = $1"),
	)

	span.AddEvent("query.start", trace.WithAttributes(
		attribute.String("query.type", "user_lookup"),
		attribute.String("user.id", userID),
	))

	// Simulated query latency.
	time.Sleep(50 * time.Millisecond)

	id, err := strconv.Atoi(userID)
	if err != nil {
		span.RecordError(err)
		// RecordError alone leaves the span status unset; mark the failure.
		span.SetStatus(codes.Error, "invalid user ID")
		return nil, fmt.Errorf("invalid user ID: %w", err)
	}

	if id <= 0 || id > 1000 {
		span.AddEvent("query.complete", trace.WithAttributes(
			attribute.Int("rows.returned", 0),
		))
		return nil, nil // user does not exist
	}

	user := &User{
		ID:   userID,
		Name: fmt.Sprintf("User %d", id),
		// The previous format string was mangled into a malformed directive;
		// the address is the numeric id at the reserved example.com domain.
		Email: fmt.Sprintf("user%d@example.com", id),
	}
	span.AddEvent("query.complete", trace.WithAttributes(
		attribute.Int("rows.returned", 1),
	))
	return user, nil
}
// CreateUser handles creation of a user from a JSON request body.
//
// It replies 400 when the body cannot be bound or validation fails, 500 when
// the insert fails, and 201 with the new user otherwise. Failures are
// recorded on the span together with an Error status.
func (h *UserHandler) CreateUser(c *gin.Context) {
	ctx, span := h.tracer.Start(c.Request.Context(), "UserHandler.CreateUser")
	defer span.End()

	// fail records cause on the span, marks it failed and writes the JSON
	// error response.
	fail := func(cause error, status string, httpCode int, msg string) {
		span.RecordError(cause)
		span.SetStatus(codes.Error, status)
		c.JSON(httpCode, gin.H{"error": msg})
	}

	var req CreateUserRequest
	if bindErr := c.ShouldBindJSON(&req); bindErr != nil {
		fail(bindErr, "invalid request body", http.StatusBadRequest, bindErr.Error())
		return
	}

	// Request payload.
	span.SetAttributes(
		attribute.String("user.name", req.Name),
		attribute.String("user.email", req.Email),
	)

	if validationErr := h.validateCreateUserRequest(ctx, &req); validationErr != nil {
		fail(validationErr, "validation failed", http.StatusBadRequest, validationErr.Error())
		return
	}

	created, createErr := h.createUserInDB(ctx, &req)
	if createErr != nil {
		fail(createErr, "user creation failed",
			http.StatusInternalServerError, "failed to create user")
		return
	}

	span.SetAttributes(attribute.String("user.id", created.ID))
	span.SetStatus(codes.Ok, "user created successfully")
	c.JSON(http.StatusCreated, created)
}
// validateCreateUserRequest checks a create-user request inside a
// "validation.create_user" span.
//
// It returns an error when the name or e-mail is empty, when the e-mail
// lookup fails, or when the e-mail is already taken; nil otherwise. Every
// failure is recorded on the span and marks it as Error, so a failed
// validation is distinguishable from one whose status was never set.
func (h *UserHandler) validateCreateUserRequest(ctx context.Context, req *CreateUserRequest) error {
	ctx, span := h.tracer.Start(ctx, "validation.create_user")
	defer span.End()

	// reject records err, flags the span as failed and hands err back.
	reject := func(err error) error {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return err
	}

	if req.Name == "" {
		return reject(fmt.Errorf("name is required"))
	}
	if req.Email == "" {
		return reject(fmt.Errorf("email is required"))
	}

	// Reject e-mail addresses that are already registered.
	exists, err := h.checkEmailExists(ctx, req.Email)
	if err != nil {
		return reject(err)
	}
	if exists {
		return reject(fmt.Errorf("email already exists"))
	}

	span.SetStatus(codes.Ok, "validation passed")
	return nil
}
// checkEmailExists simulates a lookup of email inside a
// "database.check_email" span and reports whether it is already registered.
// The simulation treats exactly one fixed address as taken and never returns
// an error.
func (h *UserHandler) checkEmailExists(ctx context.Context, email string) (bool, error) {
	_, span := h.tracer.Start(ctx, "database.check_email")
	defer span.End()

	span.SetAttributes(
		attribute.String("db.operation", "SELECT"),
		attribute.String("db.statement", "SELECT COUNT(*) FROM users WHERE email = $1"),
		attribute.String("email", email),
	)

	// Simulated query latency.
	time.Sleep(30 * time.Millisecond)

	// NOTE(review): the previous literal was an e-mail obfuscation artifact
	// that no syntactically valid address could equal, so the duplicate
	// branch could never fire. This address is a stand-in; confirm the
	// intended value.
	exists := email == "existing@example.com"
	span.SetAttributes(attribute.Bool("email.exists", exists))
	return exists, nil
}
// createUserInDB simulates inserting a user inside a "database.create_user"
// span. The new user's ID is the current Unix time in seconds rendered as a
// string; name and e-mail are copied from req. It never returns an error.
func (h *UserHandler) createUserInDB(ctx context.Context, req *CreateUserRequest) (*User, error) {
	_, span := h.tracer.Start(ctx, "database.create_user")
	defer span.End()

	span.SetAttributes(
		attribute.String("db.operation", "INSERT"),
		attribute.String("db.statement", "INSERT INTO users (name, email) VALUES ($1, $2) RETURNING id"),
	)

	// Simulated insert latency.
	time.Sleep(80 * time.Millisecond)

	created := new(User)
	created.ID = fmt.Sprintf("%d", time.Now().Unix())
	created.Name = req.Name
	created.Email = req.Email

	createdEvent := trace.WithAttributes(attribute.String("user.id", created.ID))
	span.AddEvent("user.created", createdEvent)
	return created, nil
}
// Data structure definitions.

// User is the user record returned by the handlers and serialised as JSON.
type User struct {
	ID    string `json:"id"`
	Name  string `json:"name"`
	Email string `json:"email"`
}
// CreateUserRequest is the JSON body accepted by CreateUser. The binding tags
// require both fields and require Email to have e-mail syntax.
type CreateUserRequest struct {
	Name  string `json:"name" binding:"required"`
	Email string `json:"email" binding:"required,email"`
}
// UserDB is a placeholder for the user data store; it declares no fields yet.
type UserDB struct {
	// Database connection and related state would go here.
}
订单服务调用用户服务 #
package main
import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"time"

	"github.com/gin-gonic/gin"
	"go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin"
	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/trace"
)
// OrderService serves the order endpoint and calls the user service over
// HTTP to look up the ordering user.
type OrderService struct {
	tracer         trace.Tracer // used to start every span created by the service
	httpClient     *http.Client // client used for calls to the user service
	userServiceURL string       // base URL the "/user/<id>" path is appended to
}
// NewOrderService returns an OrderService that talks to the user service at
// userServiceURL. Its HTTP client has a 30-second timeout and sends requests
// through an otelhttp transport wrapped around http.DefaultTransport.
func NewOrderService(userServiceURL string) *OrderService {
	// Tracing-aware transport for outgoing requests.
	tracedTransport := otelhttp.NewTransport(http.DefaultTransport)

	return &OrderService{
		userServiceURL: userServiceURL,
		tracer:         otel.Tracer("order-service"),
		httpClient: &http.Client{
			Timeout:   30 * time.Second,
			Transport: tracedTransport,
		},
	}
}
// CreateOrder handles creation of an order from a JSON request body.
//
// It replies 400 when the body cannot be bound or the user lookup fails, 500
// when the order cannot be created, and 201 with the order otherwise.
// Failures are recorded on the span together with an Error status.
func (s *OrderService) CreateOrder(c *gin.Context) {
	ctx, span := s.tracer.Start(c.Request.Context(), "OrderService.CreateOrder")
	defer span.End()

	// fail records cause on the span, marks it failed and writes the JSON
	// error response.
	fail := func(cause error, status string, httpCode int, msg string) {
		span.RecordError(cause)
		span.SetStatus(codes.Error, status)
		c.JSON(httpCode, gin.H{"error": msg})
	}

	var req CreateOrderRequest
	if bindErr := c.ShouldBindJSON(&req); bindErr != nil {
		fail(bindErr, "invalid request", http.StatusBadRequest, bindErr.Error())
		return
	}

	// Order payload.
	span.SetAttributes(
		attribute.String("order.user_id", req.UserID),
		attribute.String("order.product_id", req.ProductID),
		attribute.Int("order.quantity", req.Quantity),
		attribute.Float64("order.amount", req.Amount),
	)

	// The ordering user must be retrievable from the user service.
	user, lookupErr := s.getUserInfo(ctx, req.UserID)
	if lookupErr != nil {
		fail(lookupErr, "user validation failed", http.StatusBadRequest, "invalid user")
		return
	}

	order, createErr := s.createOrder(ctx, &req, user)
	if createErr != nil {
		fail(createErr, "order creation failed",
			http.StatusInternalServerError, "failed to create order")
		return
	}

	span.SetAttributes(attribute.String("order.id", order.ID))
	span.SetStatus(codes.Ok, "order created successfully")
	c.JSON(http.StatusCreated, order)
}
// getUserInfo fetches a user from the user service with
// GET <userServiceURL>/user/<userID> inside an "http_client.get_user" span.
//
// It returns an error when the request cannot be built or sent, when the
// response status is not 200 (404 is reported as "user not found"), or when
// the body cannot be decoded. Every failure is recorded on the span and marks
// it as Error.
func (s *OrderService) getUserInfo(ctx context.Context, userID string) (*User, error) {
	ctx, span := s.tracer.Start(ctx, "http_client.get_user")
	defer span.End()

	// userID originates from the order request body. Escape it so characters
	// such as '/', '?' or '#' cannot change the path or query of the call.
	target := fmt.Sprintf("%s/user/%s", s.userServiceURL, url.PathEscape(userID))

	span.SetAttributes(
		attribute.String("http.method", "GET"),
		attribute.String("http.url", target),
		attribute.String("service.name", "user-service"),
	)

	// fail records err, flags the span as failed and hands err back.
	fail := func(err error) (*User, error) {
		span.RecordError(err)
		span.SetStatus(codes.Error, err.Error())
		return nil, err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
	if err != nil {
		return fail(err)
	}

	// The client's transport is wrapped by otelhttp in NewOrderService.
	resp, err := s.httpClient.Do(req)
	if err != nil {
		return fail(err)
	}
	defer resp.Body.Close()

	span.SetAttributes(attribute.Int("http.status_code", resp.StatusCode))

	switch resp.StatusCode {
	case http.StatusOK:
		// Fall through to decoding below.
	case http.StatusNotFound:
		return fail(fmt.Errorf("user not found"))
	default:
		return fail(fmt.Errorf("HTTP error: %d", resp.StatusCode))
	}

	var user User
	if err := json.NewDecoder(resp.Body).Decode(&user); err != nil {
		return fail(err)
	}

	span.AddEvent("user.retrieved", trace.WithAttributes(
		attribute.String("user.name", user.Name),
		attribute.String("user.email", user.Email),
	))
	return &user, nil
}
// createOrder simulates inserting an order inside a "database.create_order"
// span. The order ID is "order_" followed by the current Unix time in
// seconds; the remaining fields come from req and user, with status
// "created". It never returns an error.
func (s *OrderService) createOrder(ctx context.Context, req *CreateOrderRequest, user *User) (*Order, error) {
	_, span := s.tracer.Start(ctx, "database.create_order")
	defer span.End()

	span.SetAttributes(
		attribute.String("db.operation", "INSERT"),
		attribute.String("db.table", "orders"),
	)

	// Simulated insert latency.
	time.Sleep(100 * time.Millisecond)

	placed := new(Order)
	placed.ID = fmt.Sprintf("order_%d", time.Now().Unix())
	placed.UserID = req.UserID
	placed.UserName = user.Name
	placed.ProductID = req.ProductID
	placed.Quantity = req.Quantity
	placed.Amount = req.Amount
	placed.Status = "created"
	placed.CreatedAt = time.Now()

	createdEvent := trace.WithAttributes(
		attribute.String("order.id", placed.ID),
		attribute.String("order.status", placed.Status),
	)
	span.AddEvent("order.created", createdEvent)
	return placed, nil
}
// Data structures.

// CreateOrderRequest is the JSON body accepted by CreateOrder. The binding
// tags require every field, a quantity of at least 1 and a non-negative
// amount.
// NOTE(review): `required` on a non-pointer number presumably rejects the
// zero value, so an amount of 0 may fail despite min=0 — confirm intent.
type CreateOrderRequest struct {
	UserID    string  `json:"user_id" binding:"required"`
	ProductID string  `json:"product_id" binding:"required"`
	Quantity  int     `json:"quantity" binding:"required,min=1"`
	Amount    float64 `json:"amount" binding:"required,min=0"`
}
// Order is the order record returned by CreateOrder and serialised as JSON.
type Order struct {
	ID        string    `json:"id"`
	UserID    string    `json:"user_id"`
	UserName  string    `json:"user_name"`
	ProductID string    `json:"product_id"`
	Quantity  int       `json:"quantity"`
	Amount    float64   `json:"amount"`
	Status    string    `json:"status"`
	CreatedAt time.Time `json:"created_at"`
}
// main wires the order service into a Gin router and serves it on :8080.
func main() {
	// Jaeger initialisation is omitted here; it mirrors the user service.

	orderService := NewOrderService("http://user-service:8080")

	r := gin.New()
	r.Use(otelgin.Middleware("order-service"))
	r.POST("/order", orderService.CreateOrder)

	srv := &http.Server{
		Addr:    ":8080",
		Handler: r,
		// Limit how long a client may take to send request headers.
		ReadHeaderTimeout: 10 * time.Second,
	}
	log.Println("Order service starting on :8080")
	log.Fatal(srv.ListenAndServe())
}
链路数据分析 #
Jaeger UI 使用 #
- 访问 Jaeger UI:打开浏览器访问 `http://localhost:16686`
- 搜索追踪:
  - 选择服务名称(如 `order-service`)
  - 设置时间范围
  - 添加标签过滤(如 `user.id=123`)
  - 点击 “Find Traces” 搜索
- 分析追踪详情:
  - 点击具体的 Trace 查看详细信息
  - 查看 Span 的时间线和依赖关系
  - 检查 Span 的标签、日志和错误信息
性能分析 #
package analysis
import (
"context"
"fmt"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// PerformanceAnalyzer records performance observations as spans.
type PerformanceAnalyzer struct {
	tracer trace.Tracer // used to start every span created by the analyzer
}
// NewPerformanceAnalyzer returns a PerformanceAnalyzer whose tracer comes
// from the global provider under the name "performance-analyzer".
func NewPerformanceAnalyzer() *PerformanceAnalyzer {
	analyzer := new(PerformanceAnalyzer)
	analyzer.tracer = otel.Tracer("performance-analyzer")
	return analyzer
}
// AnalyzeSlowQuery records a query and its duration on a
// "performance.slow_query_analysis" span. Durations above 100ms are flagged
// as slow and add a warning event; durations above 500ms add a critical
// event instead.
func (p *PerformanceAnalyzer) AnalyzeSlowQuery(ctx context.Context, query string, duration time.Duration) {
	const (
		slowThreshold     = 100 * time.Millisecond
		criticalThreshold = 500 * time.Millisecond
	)

	_, span := p.tracer.Start(ctx, "performance.slow_query_analysis")
	defer span.End()

	durationMs := float64(duration.Nanoseconds()) / 1e6
	span.SetAttributes(
		attribute.String("db.statement", query),
		attribute.Float64("query.duration_ms", durationMs),
		attribute.Bool("query.slow", duration > slowThreshold),
	)

	switch {
	case duration > criticalThreshold:
		span.AddEvent("critical_slow_query", trace.WithAttributes(
			attribute.String("severity", "critical"),
			attribute.String("recommendation", "consider adding index or optimizing query"),
		))
	case duration > slowThreshold:
		span.AddEvent("slow_query_detected", trace.WithAttributes(
			attribute.String("severity", "warning"),
			attribute.String("recommendation", "monitor query performance"),
		))
	}
}
// TrackMemoryUsage records the memory readings taken before and after
// operation on a "performance.memory_usage" span, together with their
// difference. A growth of more than 10 MiB adds a "high_memory_allocation"
// warning event.
func (p *PerformanceAnalyzer) TrackMemoryUsage(ctx context.Context, operation string, memBefore, memAfter uint64) {
	const highAllocationBytes = 10 * 1024 * 1024 // 10MB

	_, span := p.tracer.Start(ctx, "performance.memory_usage")
	defer span.End()

	// Compute the change as a signed value. An unsigned subtraction wraps to
	// a huge number whenever memory shrank (memAfter < memBefore), which used
	// to trip the high-allocation warning for operations that freed memory.
	delta := int64(memAfter) - int64(memBefore)

	span.SetAttributes(
		attribute.String("operation", operation),
		attribute.Int64("memory.before_bytes", int64(memBefore)),
		attribute.Int64("memory.after_bytes", int64(memAfter)),
		attribute.Int64("memory.allocated_bytes", delta),
	)

	if delta > highAllocationBytes {
		span.AddEvent("high_memory_allocation", trace.WithAttributes(
			attribute.String("severity", "warning"),
			attribute.String("recommendation", "check for memory leaks"),
		))
	}
}
错误追踪和告警 #
package monitoring
import (
"context"
"fmt"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)
// ErrorTracker records errors as spans.
type ErrorTracker struct {
	tracer trace.Tracer // used to start every span created by the tracker
}
// NewErrorTracker returns an ErrorTracker whose tracer comes from the global
// provider under the name "error-tracker".
func NewErrorTracker() *ErrorTracker {
	tracker := new(ErrorTracker)
	tracker.tracer = otel.Tracer("error-tracker")
	return tracker
}
// TrackError records err on an "error.tracking" span together with its
// dynamic type, message, severity, the current time and the supplied
// metadata.
//
// A nil err is ignored. Metadata values of type string, int, int64, float64
// and bool keep their type; any other value is stored as its default string
// formatting rather than being dropped. The span status is always Error;
// severity "critical" additionally adds a "critical_error" event.
func (e *ErrorTracker) TrackError(ctx context.Context, err error, severity string, metadata map[string]interface{}) {
	if err == nil {
		// Nothing to track; calling err.Error() below would panic.
		return
	}

	_, span := e.tracer.Start(ctx, "error.tracking")
	defer span.End()

	// Basic error information.
	span.SetAttributes(
		attribute.String("error.type", fmt.Sprintf("%T", err)),
		attribute.String("error.message", err.Error()),
		attribute.String("error.severity", severity),
		attribute.String("error.timestamp", time.Now().Format(time.RFC3339)),
	)

	// Caller-supplied metadata, set after the basic attributes so that a
	// colliding key still overrides them as before.
	extra := make([]attribute.KeyValue, 0, len(metadata))
	for key, value := range metadata {
		switch v := value.(type) {
		case string:
			extra = append(extra, attribute.String(key, v))
		case int:
			extra = append(extra, attribute.Int(key, v))
		case int64:
			extra = append(extra, attribute.Int64(key, v))
		case float64:
			extra = append(extra, attribute.Float64(key, v))
		case bool:
			extra = append(extra, attribute.Bool(key, v))
		default:
			extra = append(extra, attribute.String(key, fmt.Sprint(v)))
		}
	}
	span.SetAttributes(extra...)

	span.RecordError(err)

	// The status description depends on the severity.
	switch severity {
	case "critical":
		span.SetStatus(codes.Error, "critical error occurred")
		span.AddEvent("critical_error", trace.WithAttributes(
			attribute.String("action", "immediate_attention_required"),
		))
	case "warning":
		span.SetStatus(codes.Error, "warning level error")
	default:
		span.SetStatus(codes.Error, "error occurred")
	}
}
// TrackBusinessError records a business-rule failure on a
// "business_error.tracking" span: the error code, message and user ID become
// attributes, and a "business_error_occurred" event carries the code and
// user ID.
func (e *ErrorTracker) TrackBusinessError(ctx context.Context, errorCode, errorMessage string, userID string) {
	_, span := e.tracer.Start(ctx, "business_error.tracking")
	defer span.End()

	details := []attribute.KeyValue{
		attribute.String("business_error.code", errorCode),
		attribute.String("business_error.message", errorMessage),
		attribute.String("user.id", userID),
		attribute.String("error.category", "business_logic"),
	}
	span.SetAttributes(details...)

	occurred := trace.WithAttributes(
		attribute.String("error_code", errorCode),
		attribute.String("user_id", userID),
	)
	span.AddEvent("business_error_occurred", occurred)
}
故障排查实践 #
分布式追踪故障排查流程 #
package troubleshooting
import (
"context"
"fmt"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// TroubleshootingGuide records troubleshooting analyses as spans.
type TroubleshootingGuide struct {
	tracer trace.Tracer // used to start every span created by the guide
}
// NewTroubleshootingGuide returns a TroubleshootingGuide whose tracer comes
// from the global provider under the name "troubleshooting".
func NewTroubleshootingGuide() *TroubleshootingGuide {
	guide := new(TroubleshootingGuide)
	guide.tracer = otel.Tracer("troubleshooting")
	return guide
}
// DiagnoseSlowRequest records a slow-request analysis for traceID on a
// "troubleshooting.slow_request" span and runs the per-service analysis for
// user-service, order-service and payment-service, in that order, between an
// "analysis.start" and an "analysis.complete" event.
func (t *TroubleshootingGuide) DiagnoseSlowRequest(ctx context.Context, traceID string, threshold time.Duration) {
	ctx, span := t.tracer.Start(ctx, "troubleshooting.slow_request")
	defer span.End()

	thresholdMs := float64(threshold.Nanoseconds()) / 1e6
	span.SetAttributes(
		attribute.String("trace.id", traceID),
		attribute.Float64("threshold_ms", thresholdMs),
	)

	// Simulated analysis.
	span.AddEvent("analysis.start", trace.WithAttributes(
		attribute.String("analysis.type", "slow_request"),
	))

	// Inspect each service in turn.
	for _, svc := range [...]string{"user-service", "order-service", "payment-service"} {
		t.analyzeServicePerformance(ctx, svc, traceID)
	}

	span.AddEvent("analysis.complete")
}
// analyzeServicePerformance simulates analysing one service on a span named
// "analysis.<serviceName>". For "payment-service" it adds a
// "performance_issue_detected" event describing a database timeout.
func (t *TroubleshootingGuide) analyzeServicePerformance(ctx context.Context, serviceName, traceID string) {
	_, span := t.tracer.Start(ctx, "analysis."+serviceName)
	defer span.End()

	span.SetAttributes(
		attribute.String("service.name", serviceName),
		attribute.String("trace.id", traceID),
	)

	// Simulated analysis time.
	time.Sleep(10 * time.Millisecond)

	// Only the payment service has a simulated finding.
	if serviceName != "payment-service" {
		return
	}
	span.AddEvent("performance_issue_detected", trace.WithAttributes(
		attribute.String("issue.type", "database_timeout"),
		attribute.String("issue.description", "payment database connection timeout"),
		attribute.String("recommendation", "check database connection pool settings"),
	))
}
// DiagnoseErrorRate records a service's error rate on a
// "troubleshooting.error_rate" span. When the rate exceeds 0.05 (5%) it adds
// a "high_error_rate_detected" event and runs the error-pattern analysis.
func (t *TroubleshootingGuide) DiagnoseErrorRate(ctx context.Context, serviceName string, errorRate float64) {
	const errorRateThreshold = 0.05 // 5% error-rate threshold

	ctx, span := t.tracer.Start(ctx, "troubleshooting.error_rate")
	defer span.End()

	span.SetAttributes(
		attribute.String("service.name", serviceName),
		attribute.Float64("error.rate", errorRate),
	)

	// Written as a negated ">" so that a NaN rate is treated as not exceeding
	// the threshold.
	if !(errorRate > errorRateThreshold) {
		return
	}

	span.AddEvent("high_error_rate_detected", trace.WithAttributes(
		attribute.String("severity", "critical"),
		attribute.String("action", "investigate_error_patterns"),
	))
	t.analyzeErrorPatterns(ctx, serviceName)
}
// analyzeErrorPatterns simulates an error-pattern analysis on an
// "analysis.error_patterns" span, adding one "error_pattern_analyzed" event
// per error type, each with a fixed count of 10.
func (t *TroubleshootingGuide) analyzeErrorPatterns(ctx context.Context, serviceName string) {
	_, span := t.tracer.Start(ctx, "analysis.error_patterns")
	defer span.End()

	span.SetAttributes(attribute.String("service.name", serviceName))

	// Simulated error types, reported in this order.
	patterns := [...]string{"database_connection", "timeout", "validation_error"}
	for i := range patterns {
		span.AddEvent("error_pattern_analyzed", trace.WithAttributes(
			attribute.String("error.type", patterns[i]),
			attribute.Int("error.count", 10), // simulated count
		))
	}
}
小结 #
Jaeger 为分布式系统提供了强大的链路追踪能力。通过合理的部署架构和 Go 应用集成,我们可以获得完整的请求生命周期视图,快速定位性能瓶颈和故障根因。结合性能分析和错误追踪功能,Jaeger 成为微服务架构中不可或缺的可观测性工具。
在下一节中,我们将学习如何使用 Prometheus 构建完整的监控体系。