4.5.3 系统服务管理

4.5.3 系统服务管理 #

现代 Linux 系统主要使用 systemd 作为系统和服务管理器。systemd 提供了强大的服务管理功能,包括服务启动、停止、重启、状态监控、日志管理等。本节将详细介绍如何将 Go 程序集成到 systemd 中,创建专业的系统服务。

systemd 基础概念 #

systemd 单元类型 #

systemd 使用单元(Unit)来管理系统资源,常见的单元类型包括:

package main

import (
    "fmt"
    "os/exec"
    "strings"
)

// SystemdUnitType describes one kind of systemd unit for display purposes.
type SystemdUnitType struct {
    // Name is the unit type label, Extension the unit-file suffix (including
    // the leading dot) and Description a one-line summary of the type.
    Name, Extension, Description string
    // Examples lists sample unit names of this type.
    Examples []string
}

// main prints a catalogue of common systemd unit types and then a short
// listing of the services currently running on this machine.
func main() {
    catalogue := []SystemdUnitType{
        {Name: "Service", Extension: ".service", Description: "系统服务和应用程序",
            Examples: []string{"nginx.service", "mysql.service", "sshd.service"}},
        {Name: "Socket", Extension: ".socket", Description: "套接字激活的服务",
            Examples: []string{"docker.socket", "systemd-journald.socket"}},
        {Name: "Target", Extension: ".target", Description: "单元组,类似于运行级别",
            Examples: []string{"multi-user.target", "graphical.target"}},
        {Name: "Timer", Extension: ".timer", Description: "定时器,类似于 cron",
            Examples: []string{"logrotate.timer", "systemd-tmpfiles-clean.timer"}},
        {Name: "Mount", Extension: ".mount", Description: "文件系统挂载点",
            Examples: []string{"tmp.mount", "home.mount"}},
        {Name: "Path", Extension: ".path", Description: "路径监控",
            Examples: []string{"systemd-ask-password-console.path"}},
    }

    fmt.Println("=== systemd 单元类型 ===")
    for i := range catalogue {
        entry := &catalogue[i]
        examples := strings.Join(entry.Examples, ", ")

        fmt.Printf("\n%s (%s):\n", entry.Name, entry.Extension)
        fmt.Printf("  描述: %s\n", entry.Description)
        fmt.Printf("  示例: %s\n", examples)
    }

    // Finish with a live sample taken from the local systemd instance.
    fmt.Println("\n=== 当前系统服务状态 ===")
    showSystemdStatus()
}

// showSystemdStatus prints the unit name and the fourth column (SUB in
// systemctl's UNIT/LOAD/ACTIVE/SUB layout) of up to maxShown running
// services, followed by an ellipsis line only when further services were
// actually left out.
//
// If systemctl cannot be run, the failure is reported on stdout and nothing
// else is printed.
func showSystemdStatus() {
    // maxShown caps how many services are listed before the output is truncated.
    const maxShown = 5

    // --no-legend drops the header and footer, so every non-blank line is a unit row.
    output, err := exec.Command("systemctl", "list-units", "--type=service", "--state=running", "--no-pager", "--no-legend").Output()
    if err != nil {
        fmt.Printf("获取服务列表失败: %v\n", err)
        return
    }

    shown := 0
    for _, line := range strings.Split(string(output), "\n") {
        fields := strings.Fields(line)
        if len(fields) < 4 {
            // Blank or malformed row: nothing to show.
            continue
        }
        if shown == maxShown {
            // A further valid row exists, so the truncation marker is accurate.
            fmt.Println("  ... (更多服务)")
            break
        }
        fmt.Printf("  %s - %s\n", fields[0], fields[3])
        shown++
    }
}

systemd 服务状态 #

systemd 服务具有多种状态,了解这些状态对于服务管理很重要:

package main

import (
    "fmt"
    "os/exec"
    "strings"
)

// ServiceState is one row of the service-state reference table.
type ServiceState struct {
    // Name is the state as systemctl displays it, Description explains it,
    // and Color names the colour associated with the state in this table.
    Name, Description, Color string
}

// main prints a reference table of systemd service states and then queries
// a few well-known services as a live example.
func main() {
    table := []ServiceState{
        {Name: "active (running)", Description: "服务正在运行", Color: "绿色"},
        {Name: "active (exited)", Description: "服务已成功执行并退出", Color: "绿色"},
        {Name: "active (waiting)", Description: "服务正在等待事件", Color: "绿色"},
        {Name: "inactive (dead)", Description: "服务未运行", Color: "白色"},
        {Name: "activating (start)", Description: "服务正在启动", Color: "黄色"},
        {Name: "deactivating (stop)", Description: "服务正在停止", Color: "黄色"},
        {Name: "failed", Description: "服务启动失败", Color: "红色"},
        {Name: "maintenance", Description: "服务处于维护模式", Color: "红色"},
    }

    fmt.Println("=== systemd 服务状态 ===")
    for i := 0; i < len(table); i++ {
        row := table[i]
        fmt.Printf("%-25s: %s (%s)\n", row.Name, row.Description, row.Color)
    }

    // Follow the static table with real queries against systemctl.
    fmt.Println("\n=== 服务状态查询示例 ===")
    demonstrateServiceStatus()
}

// demonstrateServiceStatus prints the active state and the boot-time
// enablement of a fixed set of services, one line per service.
func demonstrateServiceStatus() {
    services := []string{"sshd", "systemd-journald", "dbus"}

    for _, service := range services {
        status := querySystemctl("is-active", service)
        enabled := querySystemctl("is-enabled", service)

        fmt.Printf("服务: %-20s 状态: %-10s 开机启动: %s\n", service, status, enabled)
    }
}

// querySystemctl runs `systemctl <verb> <service>` and returns the trimmed
// text it printed on stdout.
//
// A non-zero exit status is not treated as a failure on its own, because
// systemctl uses it to report states such as "inactive" or "disabled" while
// still printing the state. Only when the command produced no output at all
// (for example because systemctl could not be executed) is "unknown"
// returned, so the caller never prints an empty column.
func querySystemctl(verb, service string) string {
    output, err := exec.Command("systemctl", verb, service).Output()
    state := strings.TrimSpace(string(output))
    if state == "" && err != nil {
        return "unknown"
    }
    return state
}

创建 systemd 服务 #

基本服务配置 #

让我们创建一个完整的 Go 应用程序并将其配置为 systemd 服务:

// main.go - Go 应用程序
package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"
    "net/http"
    "os"
    "os/signal"
    "syscall"
    "time"
)

// Config holds the service settings decoded from the JSON configuration file.
// Fields left at their zero value are replaced with defaults by loadConfig.
type Config struct {
    // Port is the TCP port the HTTP server listens on.
    Port     int    `json:"port"`
    // LogLevel is stored and reported via /status; the code shown does not
    // otherwise act on it.
    LogLevel string `json:"log_level"`
    // DataDir is stored and reported via /status; the code shown does not
    // otherwise act on it.
    DataDir  string `json:"data_dir"`
}

// Application bundles the loaded configuration, the HTTP server and the
// logger that make up the running service.
type Application struct {
    // config is set by NewApplication.
    config *Config
    // server is nil until Start creates it.
    server *http.Server
    // logger writes to stdout with the "[myservice] " prefix.
    logger *log.Logger
}

// NewApplication loads the configuration at configFile and returns an
// Application that logs to stdout with the "[myservice] " prefix.
// The HTTP server is not created here; Start does that.
// Any error from loading the configuration is returned unchanged.
func NewApplication(configFile string) (*Application, error) {
    cfg, err := loadConfig(configFile)
    if err != nil {
        return nil, err
    }

    app := new(Application)
    app.config = cfg
    app.logger = log.New(os.Stdout, "[myservice] ", log.LstdFlags)
    return app, nil
}

// loadConfig reads the JSON file at configFile, decodes it into a Config and
// fills in defaults for fields left at their zero value (port 8080, log
// level "INFO", data directory /var/lib/myservice).
//
// It returns an error, wrapping the underlying cause, when the file cannot
// be read or parsed, or when the resulting port is outside 1-65535. Rejecting
// a bad port here surfaces the problem before the server is started instead
// of inside the listener goroutine.
func loadConfig(configFile string) (*Config, error) {
    data, err := os.ReadFile(configFile)
    if err != nil {
        return nil, fmt.Errorf("读取配置文件失败: %w", err)
    }

    var config Config
    if err := json.Unmarshal(data, &config); err != nil {
        return nil, fmt.Errorf("解析配置文件 %s 失败: %w", configFile, err)
    }

    // Apply defaults for anything the file left unset.
    if config.Port == 0 {
        config.Port = 8080
    }
    if config.LogLevel == "" {
        config.LogLevel = "INFO"
    }
    if config.DataDir == "" {
        config.DataDir = "/var/lib/myservice"
    }

    if config.Port < 1 || config.Port > 65535 {
        return nil, fmt.Errorf("配置文件 %s 中的端口无效: %d", configFile, config.Port)
    }

    return &config, nil
}

// Start registers the HTTP handlers, launches the server in a background
// goroutine and then blocks until a termination signal has been handled.
// The returned error is whatever the graceful shutdown produced.
//
// Note that ListenAndServe runs inside the goroutine, so a failure to bind
// the port is reported there (via Fatalf) rather than through the return
// value of Start.
func (app *Application) Start() error {
    app.logger.Printf("启动服务,端口: %d", app.config.Port)

    mux := http.NewServeMux()
    mux.HandleFunc("/", app.handleRoot)
    mux.HandleFunc("/health", app.handleHealth)
    mux.HandleFunc("/status", app.handleStatus)

    // ReadHeaderTimeout bounds how long a client may take to send its request
    // headers, so slow or idle connections cannot be held open indefinitely.
    app.server = &http.Server{
        Addr:              fmt.Sprintf(":%d", app.config.Port),
        Handler:           mux,
        ReadHeaderTimeout: 10 * time.Second,
    }

    go func() {
        // ErrServerClosed is the normal result of Shutdown and is not a failure.
        if err := app.server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
            app.logger.Fatalf("服务器启动失败: %v", err)
        }
    }()

    app.logger.Println("服务启动成功")

    return app.waitForSignal()
}

// handleRoot answers with a short banner followed by the current local time.
func (app *Application) handleRoot(w http.ResponseWriter, r *http.Request) {
    const layout = "2006-01-02 15:04:05"
    now := time.Now().Format(layout)
    fmt.Fprintf(w, "MyService is running!\nTime: %s\n", now)
}

// handleHealth is the liveness endpoint: it always replies 200 with body "OK".
func (app *Application) handleHealth(w http.ResponseWriter, r *http.Request) {
    w.WriteHeader(http.StatusOK)
    fmt.Fprint(w, "OK")
}

// handleStatus replies with a JSON document containing the service name and
// version, the uptime since startTime, the process ID and the active
// configuration. A failure to write the response is logged.
func (app *Application) handleStatus(w http.ResponseWriter, r *http.Request) {
    status := map[string]interface{}{
        "service": "myservice",
        "version": "1.0.0",
        "uptime":  time.Since(startTime).String(),
        "pid":     os.Getpid(),
        "config":  app.config,
    }

    w.Header().Set("Content-Type", "application/json")
    if err := json.NewEncoder(w).Encode(status); err != nil {
        // Part of the response may already be on the wire, so the failure can
        // only be recorded, not turned into an HTTP error.
        app.logger.Printf("写入状态响应失败: %v", err)
    }
}

// waitForSignal blocks until SIGINT or SIGTERM arrives and then returns the
// result of the graceful shutdown. SIGHUP is only logged; no reload logic is
// implemented yet, so the loop keeps waiting afterwards.
func (app *Application) waitForSignal() error {
    signals := make(chan os.Signal, 1)
    signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)

    for {
        received := <-signals
        app.logger.Printf("接收到信号: %s", received)

        if received == syscall.SIGHUP {
            app.logger.Println("重新加载配置...")
            // Placeholder: configuration reload logic can be added here.
            continue
        }
        if received == syscall.SIGINT || received == syscall.SIGTERM {
            app.logger.Println("开始优雅关闭...")
            return app.shutdown()
        }
    }
}

// shutdown stops the HTTP server gracefully, allowing up to 30 seconds for
// the shutdown to complete, and returns the server's shutdown error, if any.
func (app *Application) shutdown() error {
    deadline, release := context.WithTimeout(context.Background(), 30*time.Second)
    defer release()

    app.logger.Println("关闭 HTTP 服务器...")
    err := app.server.Shutdown(deadline)
    if err == nil {
        app.logger.Println("服务已关闭")
        return nil
    }

    app.logger.Printf("服务器关闭失败: %v", err)
    return err
}

// startTime is captured at package initialisation; handleStatus reports the
// uptime relative to it.
var startTime = time.Now()

// main expects the configuration file path as its only argument, builds the
// application and runs it until it shuts down.
func main() {
    args := os.Args
    if len(args) < 2 {
        fmt.Printf("用法: %s <config-file>\n", args[0])
        os.Exit(1)
    }

    app, err := NewApplication(args[1])
    if err != nil {
        log.Fatalf("创建应用程序失败: %v", err)
    }

    if runErr := app.Start(); runErr != nil {
        log.Fatalf("应用程序运行失败: %v", runErr)
    }
}

systemd 服务单元文件 #

创建 systemd 服务单元文件:

# /etc/systemd/system/myservice.service
# Basic unit: runs the Go binary as an unprivileged user and restarts it
# whenever it exits.
[Unit]
Description=My Go Service
Documentation=https://github.com/mycompany/myservice
# Order this unit after network.target and pull that target in.
After=network.target
Wants=network.target

[Service]
# Type=simple: the process started by ExecStart is the main service process.
Type=simple
User=myservice
Group=myservice
WorkingDirectory=/opt/myservice
# The single argument is the JSON configuration file read by the program.
ExecStart=/opt/myservice/bin/myservice /etc/myservice/config.json
# "systemctl reload" sends SIGHUP to the main process.
ExecReload=/bin/kill -HUP $MAINPID
Restart=always
RestartSec=5
# Send stdout and stderr to the journal, tagged with the identifier below.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=myservice

# Sandboxing
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
# Directories that stay writable under ProtectSystem=strict.
ReadWritePaths=/var/lib/myservice /var/log/myservice

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096

# Environment variables passed to the process
Environment=GO_ENV=production
Environment=LOG_LEVEL=info

[Install]
WantedBy=multi-user.target

服务安装脚本 #

创建服务安装和管理脚本:

#!/bin/bash
# install-service.sh
# Installs, uninstalls, or reports the status of the myservice systemd unit.

# Abort as soon as any command exits with a non-zero status.
set -e

# Service identity and filesystem locations used throughout the script.
SERVICE_NAME="myservice"
SERVICE_USER="myservice"
SERVICE_GROUP="myservice"
INSTALL_DIR="/opt/myservice"
CONFIG_DIR="/etc/myservice"
DATA_DIR="/var/lib/myservice"
LOG_DIR="/var/log/myservice"

# ANSI colour escape sequences used by the log_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Logging helpers. Each prints its first argument to stdout behind a
# colour-coded severity tag.

# _log_tagged COLOUR TAG MESSAGE - shared formatter for the helpers below.
_log_tagged() {
    echo -e "${1}[${2}]${NC} ${3}"
}

log_info() {
    _log_tagged "$GREEN" "INFO" "$1"
}

log_warn() {
    _log_tagged "$YELLOW" "WARN" "$1"
}

log_error() {
    _log_tagged "$RED" "ERROR" "$1"
}

# Exit with status 1 unless the script is running as root (effective UID 0).
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_error "此脚本需要 root 权限运行"
    exit 1
}

# Create the service group and the service account if they do not exist yet.
#
# The group is created explicitly because the rest of the script runs
# chown "$SERVICE_USER:$SERVICE_GROUP"; whether useradd creates a same-named
# group by itself depends on the system's login.defs settings.
create_user() {
    if ! getent group "$SERVICE_GROUP" &>/dev/null; then
        log_info "创建用户组: $SERVICE_GROUP"
        groupadd --system "$SERVICE_GROUP"
    fi

    if ! id "$SERVICE_USER" &>/dev/null; then
        log_info "创建用户: $SERVICE_USER"
        # System account with no home directory and no login shell.
        useradd --system --gid "$SERVICE_GROUP" --no-create-home --shell /bin/false "$SERVICE_USER"
    else
        log_info "用户 $SERVICE_USER 已存在"
    fi
}

# Create the install, config, data and log directories, hand the install,
# data and log trees to the service account, and set directory modes
# (755 for install/config, 750 for data/log).
create_directories() {
    local dir

    log_info "创建目录结构..."

    for dir in "$INSTALL_DIR/bin" "$CONFIG_DIR" "$DATA_DIR" "$LOG_DIR"; do
        mkdir -p "$dir"
    done

    # The config directory is deliberately not in this list; its ownership is
    # left as created.
    for dir in "$INSTALL_DIR" "$DATA_DIR" "$LOG_DIR"; do
        chown -R "$SERVICE_USER:$SERVICE_GROUP" "$dir"
    done

    for dir in "$INSTALL_DIR" "$CONFIG_DIR"; do
        chmod 755 "$dir"
    done
    for dir in "$DATA_DIR" "$LOG_DIR"; do
        chmod 750 "$dir"
    done
}

# Copy the "myservice" binary from the current working directory into
# $INSTALL_DIR/bin, make it executable and owned by the service account.
# Exits with status 1 when the binary is not present.
install_binary() {
    local target="$INSTALL_DIR/bin/myservice"

    [[ -f "myservice" ]] || {
        log_error "找不到二进制文件 myservice"
        exit 1
    }

    log_info "安装二进制文件..."
    cp myservice "$INSTALL_DIR/bin/"
    chmod 755 "$target"
    chown "$SERVICE_USER:$SERVICE_GROUP" "$target"
}

# Write the default config.json unless one already exists, then make sure it
# is root-owned with mode 644.
#
# An existing file is kept so that re-running the installer does not discard
# local edits to the configuration.
install_config() {
    local config_file="$CONFIG_DIR/config.json"

    if [[ -f "$config_file" ]]; then
        log_warn "配置文件已存在,保留现有配置: $config_file"
    else
        log_info "安装配置文件..."

        # Unquoted EOF: $DATA_DIR is expanded into the generated JSON.
        cat > "$config_file" << EOF
{
    "port": 8080,
    "log_level": "info",
    "data_dir": "$DATA_DIR"
}
EOF
    fi

    chmod 644 "$config_file"
    chown root:root "$config_file"
}

# Generate /etc/systemd/system/$SERVICE_NAME.service from the script settings
# and make systemd re-read its unit files.
install_systemd_service() {
    log_info "安装 systemd 服务文件..."

    # The heredoc delimiter is unquoted, so $SERVICE_USER, $INSTALL_DIR etc.
    # are expanded now; \$MAINPID is escaped so the literal text $MAINPID
    # reaches the unit file.
    cat > "/etc/systemd/system/$SERVICE_NAME.service" << EOF
[Unit]
Description=My Go Service
Documentation=https://github.com/mycompany/myservice
After=network.target
Wants=network.target

[Service]
Type=simple
User=$SERVICE_USER
Group=$SERVICE_GROUP
WorkingDirectory=$INSTALL_DIR
ExecStart=$INSTALL_DIR/bin/myservice $CONFIG_DIR/config.json
ExecReload=/bin/kill -HUP \$MAINPID
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=$SERVICE_NAME

# 安全设置
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$DATA_DIR $LOG_DIR

# 资源限制
LimitNOFILE=65536
LimitNPROC=4096

# 环境变量
Environment=GO_ENV=production
Environment=LOG_LEVEL=info

[Install]
WantedBy=multi-user.target
EOF

    # Make systemd pick up the new or changed unit file.
    systemctl daemon-reload
}

# Enable the unit at boot, start it, and verify after a short delay that it
# is still active. Prints the unit status either way and exits with status 1
# when the service is not running.
enable_service() {
    log_info "启用服务..."
    systemctl enable "$SERVICE_NAME"

    log_info "启动服务..."
    systemctl start "$SERVICE_NAME"

    # Give the process a moment to fail before judging the start.
    sleep 2
    if ! systemctl is-active --quiet "$SERVICE_NAME"; then
        log_error "服务启动失败"
        # "systemctl status" exits non-zero for a unit that is not running;
        # "|| true" stops set -e from ending the script here, so the explicit
        # exit status below is the one the caller sees.
        systemctl status "$SERVICE_NAME" --no-pager || true
        exit 1
    fi

    log_info "服务启动成功"
    systemctl status "$SERVICE_NAME" --no-pager
}

# Stop and disable the service, remove its unit file, delete the install,
# config, data and log directories, and remove the service account.
uninstall_service() {
    local unit_file="/etc/systemd/system/$SERVICE_NAME.service"
    local dir

    log_info "卸载服务..."

    # Only stop/disable when systemd reports the unit as active/enabled.
    if systemctl is-active --quiet "$SERVICE_NAME"; then
        systemctl stop "$SERVICE_NAME"
    fi
    if systemctl is-enabled --quiet "$SERVICE_NAME"; then
        systemctl disable "$SERVICE_NAME"
    fi

    rm -f "$unit_file"
    systemctl daemon-reload

    # This removes the data and log directories as well.
    for dir in "$INSTALL_DIR" "$CONFIG_DIR" "$DATA_DIR" "$LOG_DIR"; do
        rm -rf "$dir"
    done

    if id "$SERVICE_USER" &>/dev/null; then
        userdel "$SERVICE_USER"
    fi

    log_info "服务卸载完成"
}

# Print the unit status followed by the last 10 journal lines of the service.
show_status() {
    echo "=== 服务状态 ==="
    # "systemctl status" exits non-zero when the unit is stopped or failed.
    # Without "|| true", set -e would end the script right here and the
    # journal excerpt below would never be shown for a broken service.
    systemctl status "$SERVICE_NAME" --no-pager || true

    echo -e "\n=== 服务日志 (最近10行) ==="
    journalctl -u "$SERVICE_NAME" -n 10 --no-pager
}

# Entry point: dispatch on the first argument (default "install").
# Unknown actions print the usage line and exit with status 1.
main() {
    local action="${1:-install}"

    if [[ "$action" == "install" ]]; then
        check_root
        create_user
        create_directories
        install_binary
        install_config
        install_systemd_service
        enable_service
        log_info "安装完成!"
    elif [[ "$action" == "uninstall" ]]; then
        check_root
        uninstall_service
    elif [[ "$action" == "status" ]]; then
        show_status
    else
        echo "用法: $0 {install|uninstall|status}"
        exit 1
    fi
}

main "$@"

高级服务配置 #

服务依赖和启动顺序 #

# /etc/systemd/system/myservice.service
# Advanced unit: dependencies, start rate limiting, sandboxing and limits.
[Unit]
Description=My Go Service with Dependencies
Documentation=https://github.com/mycompany/myservice

# Network: order after the network targets and pull in network-online.target.
After=network.target network-online.target
Wants=network-online.target

# Databases: order after both; only MySQL is pulled in (weak dependency).
After=mysql.service postgresql.service
Wants=mysql.service

# Redis: hard dependency, ordered before this unit.
After=redis.service
Requires=redis.service

# Order this unit before multi-user.target.
Before=multi-user.target

# Start rate limiting: at most 3 starts within 60 seconds. These settings
# belong to the [Unit] section; StartLimitIntervalSec= is the current name of
# the older StartLimitInterval=.
StartLimitIntervalSec=60
StartLimitBurst=3

[Service]
# Type=notify: the service reports readiness itself via sd_notify.
Type=notify
User=myservice
Group=myservice
WorkingDirectory=/opt/myservice

# Main command
ExecStart=/opt/myservice/bin/myservice /etc/myservice/config.json

# Pre-start check; the binary must support the --check-config flag.
ExecStartPre=/opt/myservice/bin/myservice --check-config /etc/myservice/config.json

# Reload: send SIGHUP to the main process.
ExecReload=/bin/kill -HUP $MAINPID

# Stop: send SIGTERM to the main process.
ExecStop=/bin/kill -TERM $MAINPID

# Restart policy
Restart=always
RestartSec=10

# Timeouts
TimeoutStartSec=60
TimeoutStopSec=30

# Output
StandardOutput=journal
StandardError=journal
SyslogIdentifier=myservice

# Sandboxing
NoNewPrivileges=true
PrivateTmp=true
PrivateDevices=true
ProtectSystem=strict
ProtectHome=true
ProtectKernelTunables=true
ProtectControlGroups=true
RestrictRealtime=true
RestrictSUIDSGID=true

# Networking
PrivateNetwork=false
RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX

# File system access
ReadWritePaths=/var/lib/myservice /var/log/myservice /tmp
ReadOnlyPaths=/etc/myservice

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096
LimitCORE=0

# Environment; the leading "-" makes a missing environment file non-fatal.
Environment=GO_ENV=production
Environment=LOG_LEVEL=info
EnvironmentFile=-/etc/myservice/environment

[Install]
WantedBy=multi-user.target
# Enabling this unit also enables the timer below; that unit must exist.
Also=myservice-backup.timer

支持 systemd 通知的 Go 服务 #

package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"
    "net"
    "net/http"
    "os"
    "os/signal"
    "runtime"
    "syscall"
    "time"

    "github.com/coreos/go-systemd/v22/daemon"
    "github.com/coreos/go-systemd/v22/journal"
)

// SystemdService is an HTTP service that integrates with systemd's
// readiness notification and watchdog protocols.
type SystemdService struct {
    // server is nil until Start creates it.
    server *http.Server
    // logger writes plain lines to stdout.
    logger *log.Logger
    // startTime is when the service value was constructed; uptime figures
    // are measured from it.
    startTime time.Time
    // watchdog drives the keep-alive pings; it stays nil when the watchdog
    // is not enabled.
    watchdog *time.Ticker
}

// NewSystemdService returns a service whose logger writes unprefixed lines
// to stdout and whose start time is the moment of construction.
func NewSystemdService() *SystemdService {
    svc := new(SystemdService)
    // No prefix and no timestamp flags are set on the logger.
    svc.logger = log.New(os.Stdout, "", 0)
    svc.startTime = time.Now()
    return svc
}

// Start binds port 8080, serves HTTP in a background goroutine, tells
// systemd the service is ready, starts the watchdog pinger and then blocks
// until a termination signal has been handled.
//
// It returns an error wrapping the cause when the port cannot be bound;
// otherwise the result of the graceful shutdown.
func (s *SystemdService) Start() error {
    s.logInfo("服务启动中...")

    mux := http.NewServeMux()
    mux.HandleFunc("/", s.handleRoot)
    mux.HandleFunc("/health", s.handleHealth)
    mux.HandleFunc("/metrics", s.handleMetrics)

    // ReadHeaderTimeout bounds how long a client may take to send its request
    // headers, so slow or idle connections cannot be held open indefinitely.
    s.server = &http.Server{
        Addr:              ":8080",
        Handler:           mux,
        ReadHeaderTimeout: 10 * time.Second,
    }

    // Listen synchronously so a bind failure is returned to the caller before
    // readiness is announced.
    listener, err := net.Listen("tcp", s.server.Addr)
    if err != nil {
        return fmt.Errorf("监听端口失败: %w", err)
    }

    go func() {
        // ErrServerClosed is the normal result of Shutdown and is not a failure.
        if err := s.server.Serve(listener); err != nil && err != http.ErrServerClosed {
            s.logError("服务器运行失败: %v", err)
        }
    }()

    // Readiness notification; "supported" is false when no notification
    // socket is configured, in which case there is nothing to report.
    if supported, err := daemon.SdNotify(false, daemon.SdNotifyReady); err != nil {
        s.logError("通知 systemd 失败: %v", err)
    } else if supported {
        s.logInfo("已通知 systemd 服务就绪")
    }

    s.startWatchdog()

    s.logInfo("服务启动完成,监听端口 :8080")

    return s.waitForSignal()
}

// startWatchdog starts a goroutine that sends a watchdog keep-alive to
// systemd at half the configured watchdog interval, but only while the
// service passes its own health check. It does nothing when the watchdog is
// not enabled for this process.
//
// The goroutine ranges over the ticker channel; stopping the ticker does not
// close that channel, so the goroutine simply stays idle after shutdown
// until the process exits.
func (s *SystemdService) startWatchdog() {
    interval, err := daemon.SdWatchdogEnabled(false)
    if err != nil {
        s.logError("获取 watchdog 间隔失败: %v", err)
        return
    }

    if interval == 0 {
        s.logInfo("Watchdog 未启用")
        return
    }

    s.logInfo("启动 watchdog,间隔: %v", interval)

    // Ping at half the interval so one delayed tick does not trip the watchdog.
    s.watchdog = time.NewTicker(interval / 2)

    go func() {
        for range s.watchdog.C {
            if !s.isHealthy() {
                // Withhold the keep-alive on purpose so that systemd's
                // watchdog handling takes over.
                s.logError("服务健康检查失败")
                continue
            }
            if _, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog); err != nil {
                s.logError("发送 watchdog 通知失败: %v", err)
            }
        }
    }()
}

// isHealthy reports whether the service's own /health endpoint on
// localhost:8080 answers with 200 within five seconds.
// Further checks (database connectivity, key components, ...) can be added here.
func (s *SystemdService) isHealthy() bool {
    probe := http.Client{Timeout: 5 * time.Second}

    resp, err := probe.Get("http://localhost:8080/health")
    if err != nil {
        return false
    }

    healthy := resp.StatusCode == http.StatusOK
    resp.Body.Close()
    return healthy
}

// handleRoot answers with a short banner and the uptime since startTime.
func (s *SystemdService) handleRoot(w http.ResponseWriter, r *http.Request) {
    uptime := time.Since(s.startTime)
    fmt.Fprintf(w, "SystemdService is running!\nUptime: %v\n", uptime)
}

// handleHealth is the liveness endpoint: it always replies 200 with body "OK".
func (s *SystemdService) handleHealth(w http.ResponseWriter, r *http.Request) {
    w.WriteHeader(http.StatusOK)
    fmt.Fprint(w, "OK")
}

// handleMetrics replies with a JSON document containing the uptime in
// seconds, the process ID and the current number of goroutines.
// A failure to write the response is logged.
func (s *SystemdService) handleMetrics(w http.ResponseWriter, r *http.Request) {
    metrics := map[string]interface{}{
        "uptime_seconds": time.Since(s.startTime).Seconds(),
        "pid":            os.Getpid(),
        "goroutines":     runtime.NumGoroutine(),
    }

    w.Header().Set("Content-Type", "application/json")
    if err := json.NewEncoder(w).Encode(metrics); err != nil {
        // Part of the response may already be on the wire, so the failure can
        // only be recorded, not turned into an HTTP error.
        s.logError("写入指标响应失败: %v", err)
    }
}

// waitForSignal blocks until SIGINT or SIGTERM arrives and then returns the
// result of the graceful shutdown. SIGHUP triggers a configuration reload
// and the loop keeps waiting.
func (s *SystemdService) waitForSignal() error {
    signals := make(chan os.Signal, 1)
    signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)

    for {
        received := <-signals
        s.logInfo("接收到信号: %s", received)

        if received == syscall.SIGHUP {
            s.logInfo("重新加载配置...")
            s.reloadConfig()
            continue
        }
        if received == syscall.SIGINT || received == syscall.SIGTERM {
            s.logInfo("开始优雅关闭...")
            return s.shutdown()
        }
    }
}

// shutdown tells systemd the service is stopping, halts the watchdog ticker
// if one was started, and shuts the HTTP server down gracefully with a
// 30-second limit. It returns the server's shutdown error, if any.
func (s *SystemdService) shutdown() error {
    // Best-effort notification; its result is not inspected.
    daemon.SdNotify(false, daemon.SdNotifyStopping)

    if ticker := s.watchdog; ticker != nil {
        ticker.Stop()
    }

    deadline, release := context.WithTimeout(context.Background(), 30*time.Second)
    defer release()

    s.logInfo("关闭 HTTP 服务器...")
    err := s.server.Shutdown(deadline)
    if err == nil {
        s.logInfo("服务已关闭")
        return nil
    }

    s.logError("服务器关闭失败: %v", err)
    return err
}

// reloadConfig brackets a configuration reload with systemd notifications:
// RELOADING before the work starts and READY once it is done. The reload
// itself is still a placeholder simulated by a one-second sleep.
// Notification failures are logged and do not abort the reload.
func (s *SystemdService) reloadConfig() {
    s.logInfo("重新加载配置...")

    if _, err := daemon.SdNotify(false, daemon.SdNotifyReloading); err != nil {
        s.logError("通知 systemd 失败: %v", err)
    }

    // Placeholder: the real reload logic belongs here, between the two
    // notifications.
    time.Sleep(1 * time.Second)

    if _, err := daemon.SdNotify(false, daemon.SdNotifyReady); err != nil {
        s.logError("通知 systemd 失败: %v", err)
    }
}

// logInfo formats a message and records it at informational priority.
func (s *SystemdService) logInfo(format string, args ...interface{}) {
    s.emit(journal.PriInfo, "INFO", fmt.Sprintf(format, args...))
}

// logError formats a message and records it at error priority.
func (s *SystemdService) logError(format string, args ...interface{}) {
    s.emit(journal.PriErr, "ERROR", fmt.Sprintf(format, args...))
}

// emit writes msg to exactly one sink.
//
// When the JOURNAL_STREAM environment variable is set (systemd sets it when
// stdout or stderr is connected to the journal), the message is sent to the
// journal natively so that its priority is preserved. Printing it to stdout
// as well would store the same message a second time. In every other case,
// or when the native send fails, the message goes to the stdout logger with
// a "[LEVEL] " prefix.
func (s *SystemdService) emit(priority journal.Priority, level, msg string) {
    if os.Getenv("JOURNAL_STREAM") != "" && journal.Enabled() {
        if err := journal.Send(msg, priority, nil); err == nil {
            return
        }
    }
    s.logger.Printf("[%s] %s", level, msg)
}

// main builds the service and runs it until it shuts down; a start-up or
// shutdown error terminates the process via log.Fatalf.
func main() {
    err := NewSystemdService().Start()
    if err != nil {
        log.Fatalf("服务启动失败: %v", err)
    }
}

带 Watchdog 的服务配置 #

# /etc/systemd/system/myservice.service
# Unit with readiness notification and a watchdog.
[Unit]
Description=My Go Service with Watchdog
After=network.target

# Start rate limiting: at most 3 starts within 60 seconds, so a service that
# keeps failing is not restarted forever. These settings belong to the [Unit]
# section; StartLimitIntervalSec= is the current name of StartLimitInterval=.
StartLimitIntervalSec=60
StartLimitBurst=3

[Service]
# Type=notify: the service reports readiness itself via sd_notify.
Type=notify
User=myservice
Group=myservice
WorkingDirectory=/opt/myservice
ExecStart=/opt/myservice/bin/myservice
Restart=always
RestartSec=10

# Watchdog: the service must send a keep-alive at least every 30 seconds.
# Only the main process may send notifications.
WatchdogSec=30
NotifyAccess=main

[Install]
WantedBy=multi-user.target

服务监控和日志 #

日志管理 #

package main

import (
    "fmt"
    "log/syslog"
    "os"
    "os/exec"
    "strings"
)

// LogManager writes messages for one service to syslog and reads that
// service's entries back through journalctl.
type LogManager struct {
    // serviceName is used both as the syslog tag and as the unit name passed
    // to journalctl/systemctl.
    serviceName string
    // syslogger is the open syslog connection; the caller closes it.
    syslogger   *syslog.Writer
}

// NewLogManager opens a syslog connection tagged with serviceName (daemon
// facility, info default priority) and returns a manager wrapping it.
// The connection error, if any, is returned unchanged.
func NewLogManager(serviceName string) (*LogManager, error) {
    const priority = syslog.LOG_INFO | syslog.LOG_DAEMON

    writer, err := syslog.New(priority, serviceName)
    if err != nil {
        return nil, err
    }

    lm := &LogManager{serviceName: serviceName, syslogger: writer}
    return lm, nil
}

// LogInfo writes message to syslog through the writer's Info method.
// The error returned by the syslog writer is discarded.
func (lm *LogManager) LogInfo(message string) {
    lm.syslogger.Info(message)
}

// LogError writes message to syslog through the writer's Err method.
// The error returned by the syslog writer is discarded.
func (lm *LogManager) LogError(message string) {
    lm.syslogger.Err(message)
}

// LogWarning writes message to syslog through the writer's Warning method.
// The error returned by the syslog writer is discarded.
func (lm *LogManager) LogWarning(message string) {
    lm.syslogger.Warning(message)
}

// GetLogs returns the last `lines` journal entries of the service, one
// string per output line of journalctl.
//
// The trailing newline of the command output is removed before splitting,
// so the result carries no spurious empty last element; when journalctl
// prints nothing the result is nil. A failure to run journalctl is returned
// wrapped with the unit name for context.
func (lm *LogManager) GetLogs(lines int) ([]string, error) {
    cmd := exec.Command("journalctl", "-u", lm.serviceName, "-n", fmt.Sprintf("%d", lines), "--no-pager")
    output, err := cmd.Output()
    if err != nil {
        return nil, fmt.Errorf("journalctl -u %s: %w", lm.serviceName, err)
    }

    text := strings.TrimRight(string(output), "\n")
    if text == "" {
        return nil, nil
    }
    return strings.Split(text, "\n"), nil
}

// FollowLogs streams the service's journal to this process's stdout/stderr
// using `journalctl -f`. It blocks until journalctl exits and returns its
// result.
func (lm *LogManager) FollowLogs() error {
    follow := exec.Command("journalctl", "-u", lm.serviceName, "-f")
    follow.Stdout, follow.Stderr = os.Stdout, os.Stderr
    return follow.Run()
}

// RotateLogs asks the service to rotate its logs by sending it SIGUSR1 via
// `systemctl kill`, and returns the command's result.
//
// NOTE(review): this only has an effect if the target service handles
// SIGUSR1. The sample services in this document subscribe only to SIGINT,
// SIGTERM and SIGHUP - confirm before relying on it.
func (lm *LogManager) RotateLogs() error {
    return exec.Command("systemctl", "kill", "-s", "USR1", lm.serviceName).Run()
}

// main demonstrates the LogManager: it writes one message per severity to
// syslog and then prints the last ten journal lines of the service,
// skipping blank ones.
func main() {
    lm, err := NewLogManager("myservice")
    if err != nil {
        fmt.Printf("创建日志管理器失败: %v\n", err)
        return
    }
    defer lm.syslogger.Close()

    fmt.Println("=== 日志管理示例 ===")

    // One entry per severity level.
    lm.LogInfo("服务启动")
    lm.LogWarning("这是一个警告")
    lm.LogError("这是一个错误")

    recent, err := lm.GetLogs(10)
    if err != nil {
        fmt.Printf("获取日志失败: %v\n", err)
        return
    }

    fmt.Println("\n最近的日志:")
    for i := 0; i < len(recent); i++ {
        if entry := recent[i]; strings.TrimSpace(entry) != "" {
            fmt.Println(entry)
        }
    }
}

服务监控脚本 #

#!/bin/bash
# monitor-service.sh
# Watches a systemd service, restarts it when it is down or unhealthy, and
# sends e-mail alerts when recovery fails.

# Unit to watch, alert recipient and the monitor's own log file.
SERVICE_NAME="myservice"
# Placeholder recipient - replace with a real mailbox before use.
ALERT_EMAIL="admin@example.com"
LOG_FILE="/var/log/service-monitor.log"

# Append a timestamped line ("YYYY-MM-DD HH:MM:SS - message") to $LOG_FILE.
log_message() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s - %s\n' "$stamp" "$1" >> "$LOG_FILE"
}

# Return 0 when systemd reports the service as active, 1 otherwise.
check_service_status() {
    systemctl is-active --quiet "$SERVICE_NAME" || return 1
    return 0
}

# Return 0 when the service's /health endpoint answers with HTTP 200,
# 1 otherwise (including connection failures and timeouts).
check_service_health() {
    local health_url="http://localhost:8080/health"
    local response

    # --max-time bounds the whole request so that a service which accepts the
    # connection but never answers cannot stall the monitor.
    response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$health_url" 2>/dev/null)

    [[ "$response" == "200" ]]
}

# Print the body of the service's /metrics endpoint on stdout; prints
# nothing when the request fails.
get_service_metrics() {
    local metrics_url="http://localhost:8080/metrics"

    # --max-time keeps a hung service from blocking the caller.
    curl -s --max-time 5 "$metrics_url" 2>/dev/null
}

# Restart the service, wait five seconds, and return 0 when it is active
# afterwards, 1 otherwise. The outcome is written to the monitor log.
restart_service() {
    log_message "重启服务: $SERVICE_NAME"
    systemctl restart "$SERVICE_NAME"

    # Allow the service some time to come up before checking.
    sleep 5

    if ! check_service_status; then
        log_message "服务重启失败"
        return 1
    fi

    log_message "服务重启成功"
    return 0
}

# send_alert SUBJECT MESSAGE - mail an alert to $ALERT_EMAIL.
# Returns 0 when the mail command accepted the message and 1 when the mail
# command is missing or fails. The outcome is written to the monitor log
# either way, so a lost alert is visible there.
send_alert() {
    local subject="$1"
    local message="$2"

    if ! command -v mail &>/dev/null; then
        log_message "无法发送告警 (未找到 mail 命令): $subject"
        return 1
    fi

    if echo "$message" | mail -s "$subject" "$ALERT_EMAIL"; then
        log_message "发送告警: $subject"
    else
        log_message "发送告警失败: $subject"
        return 1
    fi
}

# Main monitoring loop: every 60 seconds check that the service is active
# and healthy, and attempt a restart when it is not.
#
# restart_count counts consecutive recovery attempts and is reset only when
# a check finds the service both active and healthy. A service that restarts
# fine but stays unhealthy therefore also runs into the limit, instead of
# being restarted forever. Once max_restarts attempts have been used up, an
# alert is sent and the loop ends, because manual intervention is needed.
monitor_service() {
    local restart_count=0
    local max_restarts=3
    local problem metrics

    while true; do
        # Classify the current state; an empty $problem means all is well.
        problem=""
        if ! check_service_status; then
            problem="未运行"
        elif ! check_service_health; then
            problem="健康检查失败"
        fi

        if [[ -z "$problem" ]]; then
            restart_count=0

            # Record the current metrics when the endpoint returned any.
            metrics=$(get_service_metrics)
            if [[ -n "$metrics" ]]; then
                log_message "服务指标: $metrics"
            fi
        else
            log_message "服务 $SERVICE_NAME $problem"

            if (( restart_count >= max_restarts )); then
                send_alert "服务持续失败" "服务 $SERVICE_NAME 已达到最大重启次数,需要人工干预"
                break
            fi

            restart_count=$((restart_count + 1))
            if ! restart_service; then
                send_alert "服务重启失败" "服务 $SERVICE_NAME 重启失败,尝试次数: $restart_count"
            fi
        fi

        # Wait for the next check.
        sleep 60
    done
}

# Print a four-part report: systemd unit status, health-check result,
# current metrics and the last 10 journal lines.
show_status() {
    echo "=== 服务状态 ==="
    systemctl status "$SERVICE_NAME" --no-pager

    echo -e "\n=== 健康检查 ==="
    if ! check_service_health; then
        echo "健康状态: 异常"
    else
        echo "健康状态: 正常"
    fi

    echo -e "\n=== 服务指标 ==="
    get_service_metrics

    echo -e "\n=== 最近日志 ==="
    journalctl -u "$SERVICE_NAME" -n 10 --no-pager
}

# Entry point: dispatch on the first argument (default "monitor").
# Unknown actions print the usage line and exit with status 1.
main() {
    local action="${1:-monitor}"

    if [[ "$action" == "monitor" ]]; then
        log_message "开始监控服务: $SERVICE_NAME"
        monitor_service
    elif [[ "$action" == "status" ]]; then
        show_status
    elif [[ "$action" == "restart" ]]; then
        restart_service
    else
        echo "用法: $0 {monitor|status|restart}"
        exit 1
    fi
}

main "$@"

小结 #

系统服务管理是现代应用部署的重要环节。通过本节学习,我们掌握了:

  1. systemd 基础:了解了 systemd 的基本概念和服务状态
  2. 服务配置:学会了创建和配置 systemd 服务单元文件
  3. 高级特性:掌握了服务依赖、安全设置、资源限制等高级配置
  4. 监控管理:实现了服务监控、日志管理和自动化运维

这些技术使我们能够将 Go 应用程序专业地部署为系统服务,确保服务的稳定性和可维护性。