4.6.1 系统资源监控

4.6.1 系统资源监控 #

系统资源监控是确保系统稳定运行的基础。通过实时监控 CPU、内存、磁盘、网络等关键资源,我们可以及时发现性能瓶颈和潜在问题。本节将详细介绍如何使用 Go 语言实现全面的系统资源监控。

监控架构设计 #

监控系统组件 #

一个完整的监控系统通常包含以下核心组件:

  • 数据采集器(Collector):负责收集各种系统指标
  • 数据存储(Storage):存储历史监控数据
  • 数据处理器(Processor):对原始数据进行清洗和聚合
  • 告警引擎(Alerting):根据规则触发告警
  • 可视化界面(Dashboard):展示监控数据和趋势

监控指标分类 #

系统监控指标可以分为以下几类:

  1. 系统级指标:CPU 使用率、内存使用率、磁盘 I/O、网络流量
  2. 应用级指标:请求响应时间、错误率、吞吐量
  3. 业务级指标:用户活跃度、交易量、转化率

CPU 监控实现 #

基础 CPU 信息获取 #

package monitoring

import (
    "bufio"
    "fmt"
    "os"
    "strconv"
    "strings"
    "time"
)

// CPUInfo holds CPU time counters parsed from one "cpu" line of /proc/stat.
// The values are cumulative readings; usage percentages are derived by
// diffing two CPUInfo samples.
type CPUInfo struct {
    User   float64 `json:"user"`   // user + nice time
    System float64 `json:"system"` // system + irq + softirq time
    Idle   float64 `json:"idle"`   // idle time
    IOWait float64 `json:"iowait"` // time spent waiting for I/O
    Total  float64 `json:"total"`  // sum of all parsed counters
}

// CPUMonitor computes CPU usage percentages from successive /proc/stat
// samples of the aggregate "cpu" line.
type CPUMonitor struct {
    interval time.Duration // wait between the two samples taken on the first call
    lastStat *CPUInfo // previous sample; nil until the first reading
}

// NewCPUMonitor returns a CPUMonitor configured with the given sampling
// interval. No sample is taken until one of the usage methods is called.
func NewCPUMonitor(interval time.Duration) *CPUMonitor {
    monitor := new(CPUMonitor)
    monitor.interval = interval
    return monitor
}

// getCPUStat reads the aggregate "cpu" line (the first line of /proc/stat)
// and returns its time counters.
//
// User combines the user and nice columns, System combines system, irq and
// softirq, and Total is the sum of the seven columns that are parsed. Any
// columns after softirq are ignored.
//
// It returns an error if the file cannot be opened or read, if the first
// line is not the aggregate "cpu" line with at least seven counters, or if a
// counter is not a valid number.
func (m *CPUMonitor) getCPUStat() (*CPUInfo, error) {
    file, err := os.Open("/proc/stat")
    if err != nil {
        return nil, err
    }
    defer file.Close()

    scanner := bufio.NewScanner(file)
    if !scanner.Scan() {
        // Scan reports false for both EOF and read errors; surface the
        // underlying cause when there is one.
        if err := scanner.Err(); err != nil {
            return nil, fmt.Errorf("failed to read /proc/stat: %w", err)
        }
        return nil, fmt.Errorf("failed to read /proc/stat")
    }

    fields := strings.Fields(scanner.Text())
    if len(fields) < 8 || fields[0] != "cpu" {
        return nil, fmt.Errorf("invalid /proc/stat format")
    }

    // Columns 1..7: user, nice, system, idle, iowait, irq, softirq.
    var counters [7]float64
    for i := range counters {
        v, err := strconv.ParseFloat(fields[i+1], 64)
        if err != nil {
            // A malformed counter must not silently become 0: that would
            // corrupt every usage figure derived from this sample.
            return nil, fmt.Errorf("invalid /proc/stat format: field %d %q: %w", i+1, fields[i+1], err)
        }
        counters[i] = v
    }
    user, nice, system := counters[0], counters[1], counters[2]
    idle, iowait := counters[3], counters[4]
    irq, softirq := counters[5], counters[6]

    return &CPUInfo{
        User:   user + nice,
        System: system + irq + softirq,
        Idle:   idle,
        IOWait: iowait,
        Total:  user + nice + system + idle + iowait + irq + softirq,
    }, nil
}

// GetCPUUsage returns the percentage of CPU time that was not idle between
// the previous sample and now. I/O wait time counts as busy, because only
// the Idle counter is subtracted.
//
// On the first call there is no previous sample, so the method records one,
// blocks for the configured interval and samples again. If the total counter
// did not advance it returns 0 and keeps the old baseline.
func (m *CPUMonitor) GetCPUUsage() (float64, error) {
    sample, err := m.getCPUStat()
    if err != nil {
        return 0, err
    }

    // No baseline yet: store this reading, wait, and take the reading that
    // will be diffed against it.
    if m.lastStat == nil {
        m.lastStat = sample
        time.Sleep(m.interval)
        if sample, err = m.getCPUStat(); err != nil {
            return 0, err
        }
    }

    prev := m.lastStat
    elapsed := sample.Total - prev.Total
    if elapsed == 0 {
        return 0, nil
    }

    busy := elapsed - (sample.Idle - prev.Idle)
    m.lastStat = sample
    return busy / elapsed * 100, nil
}

// GetDetailedCPUUsage returns the share of CPU time, in percent, spent in
// each category since the previous sample. The result has the keys "user",
// "system", "idle" and "iowait".
//
// On the first call there is no previous sample, so the method records one,
// blocks for the configured interval and samples again. If the total counter
// did not advance it reports 100% idle and keeps the old baseline.
func (m *CPUMonitor) GetDetailedCPUUsage() (map[string]float64, error) {
    sample, err := m.getCPUStat()
    if err != nil {
        return nil, err
    }

    // No baseline yet: store this reading, wait, and take the reading that
    // will be diffed against it.
    if m.lastStat == nil {
        m.lastStat = sample
        time.Sleep(m.interval)
        if sample, err = m.getCPUStat(); err != nil {
            return nil, err
        }
    }

    prev := m.lastStat
    elapsed := sample.Total - prev.Total
    breakdown := make(map[string]float64, 4)

    if elapsed == 0 {
        breakdown["user"] = 0
        breakdown["system"] = 0
        breakdown["idle"] = 100
        breakdown["iowait"] = 0
        return breakdown, nil
    }

    // share converts one counter's delta into a percentage of the total delta.
    share := func(cur, old float64) float64 {
        return (cur - old) / elapsed * 100
    }
    breakdown["user"] = share(sample.User, prev.User)
    breakdown["system"] = share(sample.System, prev.System)
    breakdown["idle"] = share(sample.Idle, prev.Idle)
    breakdown["iowait"] = share(sample.IOWait, prev.IOWait)

    m.lastStat = sample
    return breakdown, nil
}

多核 CPU 监控 #

// MultiCoreCPUMonitor computes CPU usage for every "cpu*" line of /proc/stat
// from successive samples.
type MultiCoreCPUMonitor struct {
    interval time.Duration // wait between the two samples taken on the first call
    lastStats map[string]*CPUInfo // previous sample, keyed by the /proc/stat line name
}

// NewMultiCoreCPUMonitor returns a MultiCoreCPUMonitor with the given
// sampling interval and an empty, non-nil baseline map.
func NewMultiCoreCPUMonitor(interval time.Duration) *MultiCoreCPUMonitor {
    monitor := &MultiCoreCPUMonitor{interval: interval}
    monitor.lastStats = map[string]*CPUInfo{}
    return monitor
}

// getAllCPUStats reads the leading "cpu*" lines of /proc/stat and returns
// their time counters keyed by line name. The aggregate line is stored under
// "cpu"; the remaining keys are whatever names the file uses for each core.
//
// Parsing stops at the first line with at least eight fields whose name does
// not start with "cpu". For each entry, User combines user and nice, System
// combines system, irq and softirq, and Total is the sum of the seven parsed
// columns.
//
// It returns an error if the file cannot be opened, if a counter is not a
// valid number, or if reading fails before any cpu line was parsed.
func (m *MultiCoreCPUMonitor) getAllCPUStats() (map[string]*CPUInfo, error) {
    file, err := os.Open("/proc/stat")
    if err != nil {
        return nil, err
    }
    defer file.Close()

    stats := make(map[string]*CPUInfo)
    scanner := bufio.NewScanner(file)

    for scanner.Scan() {
        fields := strings.Fields(scanner.Text())
        if len(fields) < 8 {
            continue
        }

        cpuName := fields[0]
        if !strings.HasPrefix(cpuName, "cpu") {
            break
        }

        // Columns 1..7: user, nice, system, idle, iowait, irq, softirq.
        var counters [7]float64
        for i := range counters {
            v, err := strconv.ParseFloat(fields[i+1], 64)
            if err != nil {
                // A malformed counter must not silently become 0.
                return nil, fmt.Errorf("invalid /proc/stat format: %s field %d %q: %w", cpuName, i+1, fields[i+1], err)
            }
            counters[i] = v
        }
        user, nice, system := counters[0], counters[1], counters[2]
        idle, iowait := counters[3], counters[4]
        irq, softirq := counters[5], counters[6]

        stats[cpuName] = &CPUInfo{
            User:   user + nice,
            System: system + irq + softirq,
            Idle:   idle,
            IOWait: iowait,
            Total:  user + nice + system + idle + iowait + irq + softirq,
        }
    }

    // Report a scan error only when nothing was parsed. The line after the
    // cpu block may be long enough to exceed bufio.Scanner's default token
    // limit; that ends the loop but must not discard the cpu lines already
    // collected.
    if err := scanner.Err(); err != nil && len(stats) == 0 {
        return nil, fmt.Errorf("failed to read /proc/stat: %w", err)
    }

    return stats, nil
}

// GetAllCPUUsage returns the non-idle CPU percentage for every entry
// reported by getAllCPUStats, keyed by the same names. Entries that were not
// present in the previous sample are omitted.
//
// When there is no previous sample the method records one, blocks for the
// configured interval and samples exactly once more. An entry whose total
// counter did not advance (or went backwards) is reported as 0.
func (m *MultiCoreCPUMonitor) GetAllCPUUsage() (map[string]float64, error) {
    currentStats, err := m.getAllCPUStats()
    if err != nil {
        return nil, err
    }

    // Take the second sample inline rather than by recursing: if the sample
    // is empty the baseline stays empty too, and a recursive retry would
    // sleep and recurse without ever terminating.
    if len(m.lastStats) == 0 {
        m.lastStats = currentStats
        time.Sleep(m.interval)
        if currentStats, err = m.getAllCPUStats(); err != nil {
            return nil, err
        }
    }

    usage := make(map[string]float64, len(currentStats))
    for cpuName, currentStat := range currentStats {
        lastStat, exists := m.lastStats[cpuName]
        if !exists {
            continue
        }

        totalDiff := currentStat.Total - lastStat.Total
        if totalDiff <= 0 {
            usage[cpuName] = 0
            continue
        }
        idleDiff := currentStat.Idle - lastStat.Idle
        usage[cpuName] = (totalDiff - idleDiff) / totalDiff * 100
    }

    m.lastStats = currentStats
    return usage, nil
}

内存监控实现 #

内存信息结构 #

// MemoryInfo describes physical memory usage as derived from /proc/meminfo.
// Sizes are in bytes (GetMemoryInfo multiplies the file's values by 1024).
type MemoryInfo struct {
    Total     uint64  `json:"total"`      // total memory (MemTotal)
    Free      uint64  `json:"free"`       // free memory (MemFree)
    Available uint64  `json:"available"`  // available memory (MemAvailable)
    Used      uint64  `json:"used"`       // used memory: Total - Free - Buffers - Cached
    Buffers   uint64  `json:"buffers"`    // buffers (Buffers)
    Cached    uint64  `json:"cached"`     // page cache (Cached)
    UsageRate float64 `json:"usage_rate"` // Used as a percentage of Total
}

// MemoryMonitor reads memory and swap statistics from /proc/meminfo. It has
// no fields; every call reads the file afresh.
type MemoryMonitor struct{}

// NewMemoryMonitor returns a ready-to-use MemoryMonitor.
func NewMemoryMonitor() *MemoryMonitor {
    return new(MemoryMonitor)
}

// GetMemoryInfo reads /proc/meminfo and returns the physical memory figures
// in bytes.
//
// Only the MemTotal, MemFree, MemAvailable, Buffers and Cached entries are
// used; every other line is ignored. Used is computed as
// Total - Free - Buffers - Cached (clamped at 0), and UsageRate is Used as a
// percentage of Total.
//
// It returns an error if the file cannot be opened or read, or if one of the
// entries above does not carry a valid unsigned number.
func (m *MemoryMonitor) GetMemoryInfo() (*MemoryInfo, error) {
    file, err := os.Open("/proc/meminfo")
    if err != nil {
        return nil, err
    }
    defer file.Close()

    memInfo := &MemoryInfo{}
    scanner := bufio.NewScanner(file)

    for scanner.Scan() {
        fields := strings.Fields(scanner.Text())
        if len(fields) < 2 {
            continue
        }

        // Select the destination first so that unrelated lines are never
        // parsed at all.
        var target *uint64
        switch strings.TrimSuffix(fields[0], ":") {
        case "MemTotal":
            target = &memInfo.Total
        case "MemFree":
            target = &memInfo.Free
        case "MemAvailable":
            target = &memInfo.Available
        case "Buffers":
            target = &memInfo.Buffers
        case "Cached":
            target = &memInfo.Cached
        default:
            continue
        }

        kb, err := strconv.ParseUint(fields[1], 10, 64)
        if err != nil {
            return nil, fmt.Errorf("parsing /proc/meminfo entry %q value %q: %w", fields[0], fields[1], err)
        }

        // The original code treats these values as KB and scales them to
        // bytes; that conversion is kept unchanged.
        *target = kb * 1024
    }
    if err := scanner.Err(); err != nil {
        // Without this check a read failure would return zeroed statistics
        // as if they were valid.
        return nil, fmt.Errorf("reading /proc/meminfo: %w", err)
    }

    // Guard the unsigned subtraction: if the parts ever add up to more than
    // Total, the difference would wrap around to an enormous value.
    if notUsed := memInfo.Free + memInfo.Buffers + memInfo.Cached; notUsed < memInfo.Total {
        memInfo.Used = memInfo.Total - notUsed
    }
    if memInfo.Total > 0 {
        memInfo.UsageRate = float64(memInfo.Used) / float64(memInfo.Total) * 100
    }

    return memInfo, nil
}

// GetMemoryUsage returns the UsageRate computed by GetMemoryInfo, i.e. used
// memory as a percentage of total memory. On failure it returns 0 together
// with the error from GetMemoryInfo.
func (m *MemoryMonitor) GetMemoryUsage() (float64, error) {
    var rate float64

    info, err := m.GetMemoryInfo()
    if err != nil {
        return rate, err
    }

    rate = info.UsageRate
    return rate, nil
}

交换分区监控 #

// SwapInfo describes swap usage as derived from /proc/meminfo. Sizes are in
// bytes (GetSwapInfo multiplies the file's values by 1024).
type SwapInfo struct {
    Total     uint64  `json:"total"` // total swap (SwapTotal)
    Used      uint64  `json:"used"` // Total - Free
    Free      uint64  `json:"free"` // free swap (SwapFree)
    UsageRate float64 `json:"usage_rate"` // Used as a percentage of Total; 0 when Total is 0
}

// GetSwapInfo reads /proc/meminfo and returns the swap figures in bytes.
//
// Only the SwapTotal and SwapFree entries are used. Used is Total - Free
// (clamped at 0) and UsageRate is Used as a percentage of Total; it stays 0
// when no swap is configured.
//
// It returns an error if the file cannot be opened or read, or if one of the
// two entries does not carry a valid unsigned number.
func (m *MemoryMonitor) GetSwapInfo() (*SwapInfo, error) {
    file, err := os.Open("/proc/meminfo")
    if err != nil {
        return nil, err
    }
    defer file.Close()

    swapInfo := &SwapInfo{}
    scanner := bufio.NewScanner(file)

    for scanner.Scan() {
        fields := strings.Fields(scanner.Text())
        if len(fields) < 2 {
            continue
        }

        // Select the destination first so that unrelated lines are never
        // parsed at all.
        var target *uint64
        switch strings.TrimSuffix(fields[0], ":") {
        case "SwapTotal":
            target = &swapInfo.Total
        case "SwapFree":
            target = &swapInfo.Free
        default:
            continue
        }

        kb, err := strconv.ParseUint(fields[1], 10, 64)
        if err != nil {
            return nil, fmt.Errorf("parsing /proc/meminfo entry %q value %q: %w", fields[0], fields[1], err)
        }

        // Scale to bytes, as the original code does.
        *target = kb * 1024
    }
    if err := scanner.Err(); err != nil {
        // Without this check a read failure would return zeroed statistics
        // as if they were valid.
        return nil, fmt.Errorf("reading /proc/meminfo: %w", err)
    }

    // Guard the unsigned subtraction against wrap-around.
    if swapInfo.Free < swapInfo.Total {
        swapInfo.Used = swapInfo.Total - swapInfo.Free
    }
    if swapInfo.Total > 0 {
        swapInfo.UsageRate = float64(swapInfo.Used) / float64(swapInfo.Total) * 100
    }

    return swapInfo, nil
}

磁盘监控实现 #

磁盘使用情况监控 #

import (
    "syscall"
    "unsafe"
)

// DiskInfo describes the space usage of the filesystem containing Path.
// Sizes are in bytes (block counts multiplied by the block size in
// GetDiskUsage).
type DiskInfo struct {
    Path      string  `json:"path"` // path that was queried
    Total     uint64  `json:"total"` // filesystem size
    Used      uint64  `json:"used"` // Total - Free
    Free      uint64  `json:"free"` // space computed from the Bavail block count
    UsageRate float64 `json:"usage_rate"` // Used as a percentage of Total
}

// DiskMonitor reports filesystem space usage. It has no fields; every call
// queries the system afresh.
type DiskMonitor struct{}

// NewDiskMonitor returns a ready-to-use DiskMonitor.
func NewDiskMonitor() *DiskMonitor {
    return new(DiskMonitor)
}

// GetDiskUsage returns the space usage of the filesystem that contains path,
// as reported by statfs.
//
// Total is Blocks * Bsize and Free is Bavail * Bsize. Used is Total - Free,
// so it includes every block that Bavail does not count. UsageRate is Used
// as a percentage of Total and stays 0 for a zero-sized filesystem.
//
// The error from syscall.Statfs is returned unchanged.
func (m *DiskMonitor) GetDiskUsage(path string) (*DiskInfo, error) {
    var fs syscall.Statfs_t
    if err := syscall.Statfs(path, &fs); err != nil {
        return nil, err
    }

    blockSize := uint64(fs.Bsize)

    info := &DiskInfo{Path: path}
    info.Total = fs.Blocks * blockSize
    info.Free = fs.Bavail * blockSize
    info.Used = info.Total - info.Free

    if info.Total > 0 {
        info.UsageRate = float64(info.Used) / float64(info.Total) * 100
    }

    return info, nil
}

// GetAllDiskUsage returns the space usage of every mount listed in
// /proc/mounts whose device starts with "/dev/" and whose filesystem type
// does not contain "tmpfs" (which also excludes devtmpfs).
//
// Each mount point is reported at most once. Mount points for which
// GetDiskUsage fails are skipped on purpose, so one inaccessible mount does
// not fail the whole listing. The result is nil when nothing matched.
//
// It returns an error if /proc/mounts cannot be opened or read.
func (m *DiskMonitor) GetAllDiskUsage() ([]*DiskInfo, error) {
    file, err := os.Open("/proc/mounts")
    if err != nil {
        return nil, err
    }
    defer file.Close()

    var diskInfos []*DiskInfo
    seen := make(map[string]bool)
    scanner := bufio.NewScanner(file)

    for scanner.Scan() {
        fields := strings.Fields(scanner.Text())
        if len(fields) < 3 {
            continue
        }

        device, fsType := fields[0], fields[2]
        // Mount points are octal-escaped in /proc/mounts (a space is written
        // as \040); decode them, otherwise statfs fails on such paths and
        // the filesystem is silently missing from the result.
        mountPoint := unescapeMountField(fields[1])

        // Filter out virtual filesystems and duplicates.
        if !strings.HasPrefix(device, "/dev/") ||
            strings.Contains(fsType, "tmpfs") ||
            seen[mountPoint] {
            continue
        }

        diskInfo, err := m.GetDiskUsage(mountPoint)
        if err != nil {
            continue // best effort: skip mounts that cannot be queried
        }
        diskInfos = append(diskInfos, diskInfo)
        seen[mountPoint] = true
    }
    if err := scanner.Err(); err != nil {
        return nil, fmt.Errorf("reading /proc/mounts: %w", err)
    }

    return diskInfos, nil
}

// unescapeMountField decodes three-digit octal escapes such as \040 (space),
// \011 (tab), \012 (newline) and \134 (backslash) in a /proc/mounts field.
// Backslashes not followed by a valid escape are kept literally.
func unescapeMountField(s string) string {
    if !strings.Contains(s, `\`) {
        return s
    }

    var b strings.Builder
    b.Grow(len(s))
    for i := 0; i < len(s); i++ {
        if s[i] == '\\' && i+3 < len(s) {
            if v, err := strconv.ParseUint(s[i+1:i+4], 8, 8); err == nil {
                b.WriteByte(byte(v))
                i += 3 // skip the three digits just consumed
                continue
            }
        }
        b.WriteByte(s[i])
    }
    return b.String()
}

磁盘 I/O 监控 #

// DiskIOInfo holds the I/O counters of one block device as parsed from a
// line of /proc/diskstats by getDiskIOStats. Rates are derived by diffing
// two samples.
type DiskIOInfo struct {
    Device        string  `json:"device"` // device name (third column)
    ReadBytes     uint64  `json:"read_bytes"` // sectors read * 512
    WriteBytes    uint64  `json:"write_bytes"` // sectors written * 512
    ReadOps       uint64  `json:"read_ops"` // read operation count
    WriteOps      uint64  `json:"write_ops"` // write operation count
    ReadTime      uint64  `json:"read_time"` // time spent reading; presumably milliseconds — confirm against the kernel docs
    WriteTime     uint64  `json:"write_time"` // time spent writing; presumably milliseconds — confirm
    IOTime        uint64  `json:"io_time"` // time spent doing I/O; presumably milliseconds — confirm
    WeightedIOTime uint64 `json:"weighted_io_time"` // weighted time spent doing I/O; presumably milliseconds — confirm
}

// DiskIOMonitor computes per-device I/O rates from successive
// /proc/diskstats samples.
type DiskIOMonitor struct {
    lastStats map[string]*DiskIOInfo // previous sample, keyed by device name
}

// NewDiskIOMonitor returns a DiskIOMonitor with an empty, non-nil baseline
// map.
func NewDiskIOMonitor() *DiskIOMonitor {
    monitor := new(DiskIOMonitor)
    monitor.lastStats = map[string]*DiskIOInfo{}
    return monitor
}

// getDiskIOStats reads /proc/diskstats and returns the counters of every
// device whose name starts with "sd", "hd" or "nvme", keyed by device name.
// The prefix match is purely textual, so any entry with such a name is
// included. Lines with fewer than 14 fields are skipped.
//
// Sector counts are converted to bytes by multiplying by 512.
//
// It returns an error if the file cannot be opened or read, or if a counter
// of a monitored device is not a valid unsigned number.
func (m *DiskIOMonitor) getDiskIOStats() (map[string]*DiskIOInfo, error) {
    // sectorSize is the unit used by the original conversion to bytes.
    const sectorSize = 512

    file, err := os.Open("/proc/diskstats")
    if err != nil {
        return nil, err
    }
    defer file.Close()

    stats := make(map[string]*DiskIOInfo)
    scanner := bufio.NewScanner(file)

    for scanner.Scan() {
        fields := strings.Fields(scanner.Text())
        if len(fields) < 14 {
            continue
        }

        device := fields[2]

        // Only monitor devices with a known disk name prefix.
        if !strings.HasPrefix(device, "sd") &&
            !strings.HasPrefix(device, "hd") &&
            !strings.HasPrefix(device, "nvme") {
            continue
        }

        // counter parses one column and remembers the first failure, so a
        // malformed value is reported instead of silently becoming 0 (which
        // would later make the rate computation see a counter going
        // backwards).
        var parseErr error
        counter := func(idx int) uint64 {
            v, err := strconv.ParseUint(fields[idx], 10, 64)
            if err != nil && parseErr == nil {
                parseErr = fmt.Errorf("parsing /proc/diskstats for %s: field %d %q: %w", device, idx, fields[idx], err)
            }
            return v
        }

        info := &DiskIOInfo{
            Device:         device,
            ReadOps:        counter(3),
            ReadBytes:      counter(5) * sectorSize, // sectors to bytes
            ReadTime:       counter(6),
            WriteOps:       counter(7),
            WriteBytes:     counter(9) * sectorSize, // sectors to bytes
            WriteTime:      counter(10),
            IOTime:         counter(12),
            WeightedIOTime: counter(13),
        }
        if parseErr != nil {
            return nil, parseErr
        }
        stats[device] = info
    }
    if err := scanner.Err(); err != nil {
        return nil, fmt.Errorf("reading /proc/diskstats: %w", err)
    }

    return stats, nil
}

// GetDiskIORate returns per-device I/O rates computed from the change in
// counters since the previous sample. The outer map is keyed by device name;
// each inner map has the keys "read_bytes_per_sec", "write_bytes_per_sec",
// "read_ops_per_sec" and "write_ops_per_sec".
//
// interval is used as the elapsed time between the two samples, so callers
// are expected to invoke the method once per interval. When there is no
// previous sample the method records one, blocks for interval and samples
// exactly once more. Devices missing from the previous sample are omitted.
//
// A counter that went backwards (for example after a reset) and a
// non-positive interval both yield a rate of 0 rather than a wrapped-around
// or infinite value.
func (m *DiskIOMonitor) GetDiskIORate(interval time.Duration) (map[string]map[string]float64, error) {
    currentStats, err := m.getDiskIOStats()
    if err != nil {
        return nil, err
    }

    // Take the second sample inline rather than by recursing: on a host with
    // no matching device the sample (and therefore the baseline) stays
    // empty, and a recursive retry would sleep and recurse forever.
    if len(m.lastStats) == 0 {
        m.lastStats = currentStats
        time.Sleep(interval)
        if currentStats, err = m.getDiskIOStats(); err != nil {
            return nil, err
        }
    }

    rates := make(map[string]map[string]float64, len(currentStats))
    intervalSec := interval.Seconds()

    // perSec converts a counter delta into a per-second rate, guarding the
    // unsigned subtraction and the division.
    perSec := func(cur, prev uint64) float64 {
        if intervalSec <= 0 || cur < prev {
            return 0
        }
        return float64(cur-prev) / intervalSec
    }

    for device, current := range currentStats {
        last, exists := m.lastStats[device]
        if !exists {
            continue
        }

        rates[device] = map[string]float64{
            "read_bytes_per_sec":  perSec(current.ReadBytes, last.ReadBytes),
            "write_bytes_per_sec": perSec(current.WriteBytes, last.WriteBytes),
            "read_ops_per_sec":    perSec(current.ReadOps, last.ReadOps),
            "write_ops_per_sec":   perSec(current.WriteOps, last.WriteOps),
        }
    }

    m.lastStats = currentStats
    return rates, nil
}

网络监控实现 #

网络接口监控 #

// NetworkInfo holds the traffic counters of one network interface as parsed
// from a line of /proc/net/dev by getNetworkStats. Rates are derived by
// diffing two samples.
type NetworkInfo struct {
    Interface   string  `json:"interface"` // interface name (text before the colon)
    RxBytes     uint64  `json:"rx_bytes"` // bytes received
    TxBytes     uint64  `json:"tx_bytes"` // bytes transmitted
    RxPackets   uint64  `json:"rx_packets"` // packets received
    TxPackets   uint64  `json:"tx_packets"` // packets transmitted
    RxErrors    uint64  `json:"rx_errors"` // receive errors
    TxErrors    uint64  `json:"tx_errors"` // transmit errors
    RxDropped   uint64  `json:"rx_dropped"` // received packets dropped
    TxDropped   uint64  `json:"tx_dropped"` // transmitted packets dropped
}

// NetworkMonitor computes per-interface traffic rates from successive
// /proc/net/dev samples.
type NetworkMonitor struct {
    lastStats map[string]*NetworkInfo // previous sample, keyed by interface name
}

// NewNetworkMonitor returns a NetworkMonitor with an empty, non-nil baseline
// map.
func NewNetworkMonitor() *NetworkMonitor {
    monitor := new(NetworkMonitor)
    monitor.lastStats = map[string]*NetworkInfo{}
    return monitor
}

// getNetworkStats reads /proc/net/dev and returns the counters of every
// interface, keyed by interface name.
//
// The first two lines are skipped as headers. A data line is used only when
// it contains exactly one colon and at least 16 counters after it. Receive
// counters are taken from columns 0-3 and transmit counters from columns
// 8-11 of the part after the colon.
//
// It returns an error if the file cannot be opened or read, or if a counter
// is not a valid unsigned number.
func (m *NetworkMonitor) getNetworkStats() (map[string]*NetworkInfo, error) {
    file, err := os.Open("/proc/net/dev")
    if err != nil {
        return nil, err
    }
    defer file.Close()

    stats := make(map[string]*NetworkInfo)
    scanner := bufio.NewScanner(file)

    // Skip the two header lines.
    scanner.Scan()
    scanner.Scan()

    for scanner.Scan() {
        parts := strings.Split(scanner.Text(), ":")
        if len(parts) != 2 {
            continue
        }

        iface := strings.TrimSpace(parts[0])
        fields := strings.Fields(parts[1])
        if len(fields) < 16 {
            continue
        }

        // counter parses one column and remembers the first failure, so a
        // malformed value is reported instead of silently becoming 0 (which
        // would later make the rate computation see a counter going
        // backwards).
        var parseErr error
        counter := func(idx int) uint64 {
            v, err := strconv.ParseUint(fields[idx], 10, 64)
            if err != nil && parseErr == nil {
                parseErr = fmt.Errorf("parsing /proc/net/dev for %s: field %d %q: %w", iface, idx, fields[idx], err)
            }
            return v
        }

        info := &NetworkInfo{
            Interface: iface,
            RxBytes:   counter(0),
            RxPackets: counter(1),
            RxErrors:  counter(2),
            RxDropped: counter(3),
            TxBytes:   counter(8),
            TxPackets: counter(9),
            TxErrors:  counter(10),
            TxDropped: counter(11),
        }
        if parseErr != nil {
            return nil, parseErr
        }
        stats[iface] = info
    }
    if err := scanner.Err(); err != nil {
        return nil, fmt.Errorf("reading /proc/net/dev: %w", err)
    }

    return stats, nil
}

// GetNetworkRate returns per-interface transfer rates computed from the
// change in counters since the previous sample. The outer map is keyed by
// interface name; each inner map has the keys "rx_bytes_per_sec",
// "tx_bytes_per_sec", "rx_packets_per_sec" and "tx_packets_per_sec".
//
// interval is used as the elapsed time between the two samples, so callers
// are expected to invoke the method once per interval. When there is no
// previous sample the method records one, blocks for interval and samples
// exactly once more. Interfaces missing from the previous sample are
// omitted.
//
// A counter that went backwards (for example after an interface reset) and
// a non-positive interval both yield a rate of 0 rather than a wrapped-around
// or infinite value.
func (m *NetworkMonitor) GetNetworkRate(interval time.Duration) (map[string]map[string]float64, error) {
    currentStats, err := m.getNetworkStats()
    if err != nil {
        return nil, err
    }

    // Take the second sample inline rather than by recursing: if no
    // interface is reported the baseline stays empty, and a recursive retry
    // would sleep and recurse forever.
    if len(m.lastStats) == 0 {
        m.lastStats = currentStats
        time.Sleep(interval)
        if currentStats, err = m.getNetworkStats(); err != nil {
            return nil, err
        }
    }

    rates := make(map[string]map[string]float64, len(currentStats))
    intervalSec := interval.Seconds()

    // perSec converts a counter delta into a per-second rate, guarding the
    // unsigned subtraction and the division.
    perSec := func(cur, prev uint64) float64 {
        if intervalSec <= 0 || cur < prev {
            return 0
        }
        return float64(cur-prev) / intervalSec
    }

    for iface, current := range currentStats {
        last, exists := m.lastStats[iface]
        if !exists {
            continue
        }

        rates[iface] = map[string]float64{
            "rx_bytes_per_sec":   perSec(current.RxBytes, last.RxBytes),
            "tx_bytes_per_sec":   perSec(current.TxBytes, last.TxBytes),
            "rx_packets_per_sec": perSec(current.RxPackets, last.RxPackets),
            "tx_packets_per_sec": perSec(current.TxPackets, last.TxPackets),
        }
    }

    m.lastStats = currentStats
    return rates, nil
}

综合监控系统 #

系统监控器 #

// SystemMonitor bundles the individual resource monitors and collects all of
// their metrics in one call.
type SystemMonitor struct {
    cpuMonitor     *CPUMonitor // source of the CPU breakdown
    memoryMonitor  *MemoryMonitor // source of memory and swap figures
    diskMonitor    *DiskMonitor // source of filesystem space usage
    diskIOMonitor  *DiskIOMonitor // source of disk I/O rates
    networkMonitor *NetworkMonitor // source of network transfer rates
    interval       time.Duration // ticker period; also passed to the rate monitors as the sample interval
}

// SystemMetrics is one snapshot of all metrics gathered by
// SystemMonitor.CollectMetrics.
type SystemMetrics struct {
    Timestamp time.Time                      `json:"timestamp"` // time at which collection started
    CPU       map[string]float64             `json:"cpu"` // result of GetDetailedCPUUsage
    Memory    *MemoryInfo                    `json:"memory"` // result of GetMemoryInfo
    Swap      *SwapInfo                      `json:"swap"` // result of GetSwapInfo
    Disk      []*DiskInfo                    `json:"disk"` // result of GetAllDiskUsage
    DiskIO    map[string]map[string]float64  `json:"disk_io"` // result of GetDiskIORate: device -> rate name -> value
    Network   map[string]map[string]float64  `json:"network"` // result of GetNetworkRate: interface -> rate name -> value
}

// NewSystemMonitor builds a SystemMonitor with a fresh instance of every
// sub-monitor. interval is stored for the collection loop and is also handed
// to the CPU monitor.
func NewSystemMonitor(interval time.Duration) *SystemMonitor {
    sm := new(SystemMonitor)
    sm.interval = interval

    sm.cpuMonitor = NewCPUMonitor(interval)
    sm.memoryMonitor = NewMemoryMonitor()
    sm.diskMonitor = NewDiskMonitor()
    sm.diskIOMonitor = NewDiskIOMonitor()
    sm.networkMonitor = NewNetworkMonitor()

    return sm
}

// CollectMetrics gathers one snapshot from every sub-monitor, in this order:
// CPU breakdown, memory, swap, disk usage, disk I/O rates and network rates.
//
// Collection is fail-fast: the first sub-monitor error aborts the call, and
// the returned error wraps the cause (usable with errors.Is / errors.As). No
// partial snapshot is returned.
//
// The CPU, disk I/O and network monitors each block for one interval when
// they have no previous sample, so the first call can take considerably
// longer than later ones.
func (sm *SystemMonitor) CollectMetrics() (*SystemMetrics, error) {
    metrics := &SystemMetrics{
        Timestamp: time.Now(),
    }

    // CPU breakdown.
    cpuUsage, err := sm.cpuMonitor.GetDetailedCPUUsage()
    if err != nil {
        return nil, fmt.Errorf("failed to get CPU usage: %w", err)
    }
    metrics.CPU = cpuUsage

    // Physical memory.
    memInfo, err := sm.memoryMonitor.GetMemoryInfo()
    if err != nil {
        return nil, fmt.Errorf("failed to get memory info: %w", err)
    }
    metrics.Memory = memInfo

    // Swap.
    swapInfo, err := sm.memoryMonitor.GetSwapInfo()
    if err != nil {
        return nil, fmt.Errorf("failed to get swap info: %w", err)
    }
    metrics.Swap = swapInfo

    // Filesystem space usage.
    diskInfos, err := sm.diskMonitor.GetAllDiskUsage()
    if err != nil {
        return nil, fmt.Errorf("failed to get disk usage: %w", err)
    }
    metrics.Disk = diskInfos

    // Disk I/O rates.
    diskIORate, err := sm.diskIOMonitor.GetDiskIORate(sm.interval)
    if err != nil {
        return nil, fmt.Errorf("failed to get disk I/O rate: %w", err)
    }
    metrics.DiskIO = diskIORate

    // Network transfer rates.
    networkRate, err := sm.networkMonitor.GetNetworkRate(sm.interval)
    if err != nil {
        return nil, fmt.Errorf("failed to get network rate: %w", err)
    }
    metrics.Network = networkRate

    return metrics, nil
}

// StartMonitoring collects metrics once per interval and passes each
// snapshot to callback until ctx is done. It blocks for the whole run.
//
// It returns ctx.Err() when the context is cancelled or its deadline passes.
// It returns an error immediately, without starting the loop, if the
// monitor's interval is not positive or callback is nil. A failed collection
// is printed to standard output and the loop continues with the next tick.
//
// callback runs on the calling goroutine, so a slow callback delays the next
// collection. Cancellation is only observed between collections.
func (sm *SystemMonitor) StartMonitoring(ctx context.Context, callback func(*SystemMetrics)) error {
    // time.NewTicker panics on a non-positive duration; report it as an
    // error instead.
    if sm.interval <= 0 {
        return fmt.Errorf("monitoring interval must be positive, got %v", sm.interval)
    }
    // A nil callback would panic on the first successful collection.
    if callback == nil {
        return fmt.Errorf("monitoring callback must not be nil")
    }

    ticker := time.NewTicker(sm.interval)
    defer ticker.Stop()

    for {
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-ticker.C:
            metrics, err := sm.CollectMetrics()
            if err != nil {
                fmt.Printf("Error collecting metrics: %v\n", err)
                continue
            }
            callback(metrics)
        }
    }
}

使用示例 #

package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"
    "time"
)

// main runs the system monitor for one minute with a five-second sampling
// interval, printing every snapshot as indented JSON and checking it against
// the alert thresholds.
func main() {
    // Create the system monitor.
    monitor := NewSystemMonitor(5 * time.Second)

    // The context bounds the run to one minute.
    ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
    defer cancel()

    // Start monitoring; this blocks until the context is done.
    err := monitor.StartMonitoring(ctx, func(metrics *SystemMetrics) {
        // Print the snapshot. An encoding failure is logged but must not
        // prevent the alert check below.
        data, err := json.MarshalIndent(metrics, "", "  ")
        if err != nil {
            log.Printf("Failed to encode metrics: %v", err)
        } else {
            fmt.Printf("System Metrics at %s:\n%s\n\n",
                metrics.Timestamp.Format("2006-01-02 15:04:05"), string(data))
        }

        // Check alert conditions.
        checkAlerts(metrics)
    })

    // Reaching the one-minute deadline is the normal way for this example to
    // finish, not a failure. StartMonitoring returns ctx.Err() unwrapped, so
    // a direct comparison is sufficient here.
    if err != nil && err != context.DeadlineExceeded {
        cancel() // log.Fatalf exits without running deferred calls
        log.Fatalf("Monitoring failed: %v", err)
    }
}

// checkAlerts prints an ALERT line to standard output for every threshold
// that the snapshot exceeds:
//
//   - CPU usage above 80%. Usage is the non-idle share (100 - "idle"), the
//     same definition GetCPUUsage uses, so system and I/O-wait load count.
//     If the "idle" key is absent, the "user" share is used instead.
//   - Memory usage above 90%.
//   - Disk usage above 85%, checked per filesystem.
//
// A nil snapshot, a nil Memory field and nil disk entries are skipped.
func checkAlerts(metrics *SystemMetrics) {
    if metrics == nil {
        return
    }

    // CPU usage alert.
    if idle, ok := metrics.CPU["idle"]; ok {
        if busy := 100 - idle; busy > 80 {
            fmt.Printf("ALERT: High CPU usage: %.2f%%\n", busy)
        }
    } else if cpuUsage, ok := metrics.CPU["user"]; ok && cpuUsage > 80 {
        fmt.Printf("ALERT: High CPU usage: %.2f%%\n", cpuUsage)
    }

    // Memory usage alert.
    if metrics.Memory != nil && metrics.Memory.UsageRate > 90 {
        fmt.Printf("ALERT: High memory usage: %.2f%%\n", metrics.Memory.UsageRate)
    }

    // Disk usage alert.
    for _, disk := range metrics.Disk {
        if disk == nil {
            continue
        }
        if disk.UsageRate > 85 {
            fmt.Printf("ALERT: High disk usage on %s: %.2f%%\n",
                disk.Path, disk.UsageRate)
        }
    }
}

性能优化建议 #

监控数据采集优化 #

  1. 采样频率控制:根据实际需求调整采样间隔,避免过度采集
  2. 批量处理:将多个指标的采集合并到一次文件读取或系统调用中。例如内存信息与交换分区信息都来自 /proc/meminfo,可以只读取一次并同时解析,而不是分别打开两次
  3. 缓存机制:对变化不频繁的数据进行缓存
  4. 异步处理:使用 goroutine 并行采集不同类型的指标

内存使用优化 #

// metricsPool recycles SystemMetrics values to reduce allocations. New
// supplies a zero-valued *SystemMetrics whenever the pool is empty.
var metricsPool = sync.Pool{
    New: func() interface{} {
        return new(SystemMetrics)
    },
}

// CollectMetricsOptimized is a variant of CollectMetrics that takes its
// result object from metricsPool instead of allocating a new one.
//
// The returned *SystemMetrics is owned by the caller. When the caller is
// finished with it, it should hand the value back with ReleaseMetrics and
// must not touch it afterwards. The object must NOT be returned to the pool
// inside this method: it would then be handed out again, and overwritten,
// while the caller is still reading it.
func (sm *SystemMonitor) CollectMetricsOptimized() (*SystemMetrics, error) {
    metrics := metricsPool.Get().(*SystemMetrics)

    // Reset the recycled struct so no data from a previous use leaks through.
    *metrics = SystemMetrics{
        Timestamp: time.Now(),
    }

    // ... metric collection code goes here. On failure, call
    // metricsPool.Put(metrics) before returning the error so that the object
    // is not lost to the pool.

    return metrics, nil
}

// ReleaseMetrics returns a snapshot obtained from CollectMetricsOptimized to
// the pool. The caller must not use metrics after this call. A nil argument
// is ignored.
func (sm *SystemMonitor) ReleaseMetrics(metrics *SystemMetrics) {
    if metrics == nil {
        return
    }
    // Drop references to maps and slices so that the pooled object does not
    // keep them alive.
    *metrics = SystemMetrics{}
    metricsPool.Put(metrics)
}

小结 #

本节详细介绍了系统资源监控的实现方法,包括:

  1. CPU 监控:实现了整机汇总(/proc/stat 的 cpu 行)和各核心(cpu0、cpu1 等)的 CPU 使用率监控
  2. 内存监控:包括物理内存和交换分区的监控
  3. 磁盘监控:磁盘使用情况和 I/O 性能监控
  4. 网络监控:网络接口流量和错误统计
  5. 综合监控:整合各种监控指标的系统监控器

通过这些监控组件,我们可以构建一个完整的系统资源监控系统,为系统的稳定运行和性能优化提供数据支持。在实际应用中,还需要考虑数据存储、可视化展示和告警通知等功能。