性能监控

本指南涵盖 Swit 中的内置性能监控系统，包括指标收集、分析、阈值监控以及与外部监控系统的集成。

概述

Swit 框架通过 PerformanceMonitor 系统包含全面的性能监控，提供指标收集、事件驱动的监控钩子、操作分析和基于阈值的警报。

核心组件

PerformanceMetrics 结构

框架收集全面的性能指标：

type PerformanceMetrics struct {
    StartupTime    time.Duration // 服务器启动时间
    ShutdownTime   time.Duration // 服务器关闭时间
    Uptime         time.Duration // 当前运行时间
    MemoryUsage    uint64        // 当前内存使用（字节）
    GoroutineCount int           // 活跃协程数
    ServiceCount   int           // 注册服务数
    TransportCount int           // 活跃传输数
    StartCount     int64         // 启动操作计数
    RequestCount   int64         // 请求计数器
    ErrorCount     int64         // 错误计数器
}

PerformanceMonitor 接口

事件驱动的性能监控：

type PerformanceMonitor interface {
    AddHook(hook PerformanceHook)
    RecordEvent(event string)
    ProfileOperation(name string, operation func() error) error
    StartPeriodicCollection(ctx context.Context, interval time.Duration)
    GetSnapshot() *PerformanceMetrics
}

PerformanceHook

用于扩展功能的自定义监控钩子：

type PerformanceHook func(event string, metrics *PerformanceMetrics)

基本性能监控

访问性能指标

// 获取服务器实例
srv, err := server.NewBusinessServerCore(config, registrar, deps)
if err != nil {
    return err
}

// 访问性能监控
monitor := srv.(server.BusinessServerWithPerformance).GetPerformanceMonitor()
metrics := srv.(server.BusinessServerWithPerformance).GetPerformanceMetrics()

// 打印当前指标
fmt.Printf("运行时间: %v\n", srv.GetUptime())
fmt.Printf("内存使用: %d 字节\n", metrics.MemoryUsage)
fmt.Printf("协程数: %d\n", metrics.GoroutineCount)
fmt.Printf("服务数: %d\n", metrics.ServiceCount)

记录自定义事件

// 记录自定义事件
monitor.RecordEvent("user_registration")
monitor.RecordEvent("payment_processed")
monitor.RecordEvent("cache_miss")
monitor.RecordEvent("external_api_call")

// 事件会自动添加时间戳并可触发钩子

分析操作

// 分析数据库操作
err := monitor.ProfileOperation("user_query", func() error {
    users, err := userRepository.GetAllUsers(ctx)
    return err
})

// 分析外部 API 调用
err = monitor.ProfileOperation("payment_api", func() error {
    result, err := paymentClient.ProcessPayment(ctx, payment)
    return err
})

// 分析复杂业务操作
err = monitor.ProfileOperation("order_processing", func() error {
    return orderService.ProcessOrder(ctx, order)
})

内置性能钩子

日志钩子

记录性能事件和指标：

func PerformanceLoggingHook(event string, metrics *PerformanceMetrics) {
    logger.Info("性能事件",
        zap.String("event", event),
        zap.Duration("uptime", metrics.Uptime),
        zap.Uint64("memory_mb", metrics.MemoryUsage/1024/1024),
        zap.Int("goroutines", metrics.GoroutineCount),
        zap.Int64("requests", metrics.RequestCount),
        zap.Int64("errors", metrics.ErrorCount),
    )
}

// 注册钩子
monitor.AddHook(PerformanceLoggingHook)

阈值违规钩子

当超出性能阈值时发出警报：

func PerformanceThresholdViolationHook(event string, metrics *PerformanceMetrics) {
    const (
        MaxMemoryMB     = 512
        MaxGoroutines   = 1000
        MaxErrorRate    = 0.05 // 5%
    )
    
    memoryMB := metrics.MemoryUsage / 1024 / 1024
    if memoryMB > MaxMemoryMB {
        alerting.SendAlert(fmt.Sprintf("高内存使用: %d MB", memoryMB))
    }
    
    if metrics.GoroutineCount > MaxGoroutines {
        alerting.SendAlert(fmt.Sprintf("高协程数: %d", metrics.GoroutineCount))
    }
    
    if metrics.RequestCount > 0 {
        errorRate := float64(metrics.ErrorCount) / float64(metrics.RequestCount)
        if errorRate > MaxErrorRate {
            alerting.SendAlert(fmt.Sprintf("高错误率: %.2f%%", errorRate*100))
        }
    }
}

monitor.AddHook(PerformanceThresholdViolationHook)

指标收集钩子

定期触发指标收集：

func PerformanceMetricsCollectionHook(event string, metrics *PerformanceMetrics) {
    // 发送到 Prometheus
    prometheusMetrics.SetGauge("server_uptime_seconds", metrics.Uptime.Seconds())
    prometheusMetrics.SetGauge("server_memory_bytes", float64(metrics.MemoryUsage))
    prometheusMetrics.SetGauge("server_goroutines", float64(metrics.GoroutineCount))
    prometheusMetrics.IncrementCounter("server_requests_total", float64(metrics.RequestCount))
    prometheusMetrics.IncrementCounter("server_errors_total", float64(metrics.ErrorCount))
    
    // 发送到 InfluxDB
    influxPoint := influxdb2.NewPoint("server_metrics",
        map[string]string{"service": "my-service"},
        map[string]interface{}{
            "uptime_seconds":   metrics.Uptime.Seconds(),
            "memory_bytes":     metrics.MemoryUsage,
            "goroutine_count":  metrics.GoroutineCount,
            "service_count":    metrics.ServiceCount,
            "transport_count":  metrics.TransportCount,
            "request_count":    metrics.RequestCount,
            "error_count":      metrics.ErrorCount,
        },
        time.Now(),
    )
    influxWriter.WritePoint(point)
}

monitor.AddHook(PerformanceMetricsCollectionHook)

自定义性能监控

自定义指标钩子

type CustomMetrics struct {
    DatabaseConnections int
    CacheHitRate       float64
    ExternalAPILatency time.Duration
    ActiveSessions     int
}

func CustomMetricsHook(event string, metrics *PerformanceMetrics) {
    custom := &CustomMetrics{
        DatabaseConnections: getDatabaseConnectionCount(),
        CacheHitRate:       getCacheHitRate(),
        ExternalAPILatency: getAverageAPILatency(),
        ActiveSessions:     getActiveSessionCount(),
    }
    
    // 发送自定义指标到监控系统
    monitoring.RecordMetric("db_connections", float64(custom.DatabaseConnections))
    monitoring.RecordMetric("cache_hit_rate", custom.CacheHitRate)
    monitoring.RecordMetric("api_latency_ms", custom.ExternalAPILatency.Milliseconds())
    monitoring.RecordMetric("active_sessions", float64(custom.ActiveSessions))
}

monitor.AddHook(CustomMetricsHook)

定期指标收集

启动定期收集

// 每 30 秒启动一次定期指标收集
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

monitor.StartPeriodicCollection(ctx, 30*time.Second)

// 定期收集将使用 "metrics_collection" 事件触发钩子

自定义定期收集器

func startCustomPeriodicCollection(ctx context.Context, monitor *server.PerformanceMonitor) {
    ticker := time.NewTicker(1 * time.Minute)
    defer ticker.Stop()
    
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            // 记录自定义事件
            monitor.RecordEvent("periodic_health_check")
            
            // 执行自定义指标收集
            collectCustomMetrics()
            
            // 检查系统健康
            checkSystemHealth()
        }
    }
}

func collectCustomMetrics() {
    // 收集数据库指标
    dbStats := database.Stats()
    prometheusMetrics.SetGauge("db_open_connections", float64(dbStats.OpenConnections))
    prometheusMetrics.SetGauge("db_in_use", float64(dbStats.InUse))
    prometheusMetrics.SetGauge("db_idle", float64(dbStats.Idle))
    
    // 收集缓存指标
    cacheStats := cache.Stats()
    prometheusMetrics.SetGauge("cache_hit_rate", cacheStats.HitRate)
    prometheusMetrics.SetGauge("cache_memory_usage", float64(cacheStats.MemoryUsage))
    
    // 收集业务指标
    prometheusMetrics.SetGauge("active_users", float64(getUserCount()))
    prometheusMetrics.SetGauge("pending_orders", float64(getPendingOrderCount()))
}

与外部监控集成

Prometheus 集成

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// 定义 Prometheus 指标
var (
    serverUptime = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "server_uptime_seconds",
        Help: "服务器运行时间（秒）",
    })
    
    memoryUsage = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "server_memory_bytes",
        Help: "服务器内存使用（字节）",
    })
    
    requestCounter = promauto.NewCounter(prometheus.CounterOpts{
        Name: "server_requests_total",
        Help: "处理的请求总数",
    })
)

// Prometheus 钩子
func PrometheusHook(event string, metrics *PerformanceMetrics) {
    serverUptime.Set(metrics.Uptime.Seconds())
    memoryUsage.Set(float64(metrics.MemoryUsage))
    requestCounter.Add(float64(metrics.RequestCount))
}

// 启动 Prometheus 指标端点
go func() {
    http.Handle("/metrics", promhttp.Handler())
    log.Fatal(http.ListenAndServe(":2112", nil))
}()

monitor.AddHook(PrometheusHook)

性能分析

CPU 分析

import (
    _ "net/http/pprof"
    "runtime/pprof"
)

// 启用 pprof 端点
go func() {
    log.Println(http.ListenAndServe("localhost:6060", nil))
}()

// 程序化 CPU 分析
func profileCPU(monitor *server.PerformanceMonitor) {
    monitor.ProfileOperation("cpu_intensive_task", func() error {
        f, err := os.Create("cpu.prof")
        if err != nil {
            return err
        }
        defer f.Close()
        
        if err := pprof.StartCPUProfile(f); err != nil {
            return err
        }
        defer pprof.StopCPUProfile()
        
        // 执行 CPU 密集型操作
        performCPUIntensiveTask()
        
        return nil
    })
}

内存分析

import (
    "runtime"
    "runtime/pprof"
)

func profileMemory(monitor *server.PerformanceMonitor) {
    monitor.ProfileOperation("memory_intensive_task", func() error {
        // 分析前强制垃圾收集
        runtime.GC()
        
        f, err := os.Create("mem.prof")
        if err != nil {
            return err
        }
        defer f.Close()
        
        // 执行内存密集型操作
        performMemoryIntensiveTask()
        
        // 写入堆分析
        if err := pprof.WriteHeapProfile(f); err != nil {
            return err
        }
        
        return nil
    })
}

测试性能监控

单元测试性能钩子

func TestPerformanceHooks(t *testing.T) {
    var capturedEvents []string
    var capturedMetrics []*server.PerformanceMetrics
    
    testHook := func(event string, metrics *server.PerformanceMetrics) {
        capturedEvents = append(capturedEvents, event)
        capturedMetrics = append(capturedMetrics, metrics)
    }
    
    monitor := server.NewPerformanceMonitor()
    monitor.AddHook(testHook)
    
    // 记录测试事件
    monitor.RecordEvent("test_event_1")
    monitor.RecordEvent("test_event_2")
    
    // 验证钩子被调用
    assert.Equal(t, 2, len(capturedEvents))
    assert.Contains(t, capturedEvents, "test_event_1")
    assert.Contains(t, capturedEvents, "test_event_2")
    
    // 验证指标被捕获
    assert.Equal(t, 2, len(capturedMetrics))
    assert.NotNil(t, capturedMetrics[0])
    assert.NotNil(t, capturedMetrics[1])
}

最佳实践

性能监控

选择性监控 - 监控关键指标而不使系统过载
阈值设置 - 根据系统正常行为设置现实的阈值
事件粒度 - 在太多和太少事件之间取得平衡
资源影响 - 确保监控不会显著影响性能
数据保留 - 为指标配置适当的数据保留策略

警报策略

防止警报疲劳 - 避免太多误报
升级级别 - 使用不同的警报级别（信息、警告、严重）
上下文信息 - 在警报中包含相关上下文
可操作的警报 - 确保警报提供可操作的信息
测试警报 - 定期测试警报机制

这个性能监控指南提供了内置性能监控功能、自定义监控模式、与外部系统集成以及生产部署最佳实践的全面覆盖。

性能监控 ​

概述 ​

核心组件 ​

PerformanceMetrics 结构 ​

PerformanceMonitor 接口 ​

PerformanceHook ​

基本性能监控 ​

访问性能指标 ​

记录自定义事件 ​

分析操作 ​

内置性能钩子 ​

日志钩子 ​

阈值违规钩子 ​

指标收集钩子 ​

自定义性能监控 ​

自定义指标钩子 ​

定期指标收集 ​

启动定期收集 ​

自定义定期收集器 ​

与外部监控集成 ​

Prometheus 集成 ​

性能分析 ​

CPU 分析 ​

内存分析 ​

测试性能监控 ​

单元测试性能钩子 ​

最佳实践 ​

性能监控 ​

警报策略 ​

性能监控

概述

核心组件

PerformanceMetrics 结构

PerformanceMonitor 接口

PerformanceHook

基本性能监控

访问性能指标

记录自定义事件

分析操作

内置性能钩子

日志钩子

阈值违规钩子

指标收集钩子

自定义性能监控

自定义指标钩子

定期指标收集

启动定期收集

自定义定期收集器

与外部监控集成

Prometheus 集成

性能分析

CPU 分析

内存分析

测试性能监控

单元测试性能钩子

最佳实践

性能监控

警报策略