package monitoring

import (
	"bufio"
	"context"
	"database/sql"
	"fmt"
	"os"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
)

// Metrics is the full snapshot returned by MetricsService.CollectMetrics.
// Each field is filled by its own collector; a section whose collector failed
// is left at its zero value.
type Metrics struct {
	System      SystemMetrics  `json:"system"`
	Storage     StorageMetrics `json:"storage"`
	SCST        SCSTMetrics    `json:"scst"`
	Tape        TapeMetrics    `json:"tape"`
	VTL         VTLMetrics     `json:"vtl"`
	Tasks       TaskMetrics    `json:"tasks"`
	API         APIMetrics     `json:"api"`
	CollectedAt time.Time      `json:"collected_at"`
}

// SystemMetrics holds host-level metrics read from /proc. The Disk* fields
// are not collected yet and are always zero.
type SystemMetrics struct {
	CPUUsagePercent float64 `json:"cpu_usage_percent"`
	MemoryUsed      int64   `json:"memory_used_bytes"`
	MemoryTotal     int64   `json:"memory_total_bytes"`
	MemoryPercent   float64 `json:"memory_usage_percent"`
	DiskUsed        int64   `json:"disk_used_bytes"`
	DiskTotal       int64   `json:"disk_total_bytes"`
	DiskPercent     float64 `json:"disk_usage_percent"`
	UptimeSeconds   int64   `json:"uptime_seconds"`
}

// StorageMetrics holds counts and capacity totals aggregated from the
// physical_disks and disk_repositories tables (active rows only).
type StorageMetrics struct {
	TotalDisks         int     `json:"total_disks"`
	TotalRepositories  int     `json:"total_repositories"`
	TotalCapacityBytes int64   `json:"total_capacity_bytes"`
	UsedCapacityBytes  int64   `json:"used_capacity_bytes"`
	AvailableBytes     int64   `json:"available_bytes"`
	UsagePercent       float64 `json:"usage_percent"`
}

// SCSTMetrics holds row counts from the scst_* tables.
type SCSTMetrics struct {
	TotalTargets    int `json:"total_targets"`
	TotalLUNs       int `json:"total_luns"`
	TotalInitiators int `json:"total_initiators"`
	ActiveTargets   int `json:"active_targets"`
}

// TapeMetrics holds row counts from the physical_tape_* tables.
type TapeMetrics struct {
	TotalLibraries int `json:"total_libraries"`
	TotalDrives    int `json:"total_drives"`
	TotalSlots     int `json:"total_slots"`
	OccupiedSlots  int `json:"occupied_slots"`
}

// VTLMetrics holds row counts from the virtual tape library tables.
type VTLMetrics struct {
	TotalLibraries int `json:"total_libraries"`
	TotalDrives    int `json:"total_drives"`
	TotalTapes     int `json:"total_tapes"`
	ActiveDrives   int `json:"active_drives"`
	LoadedTapes    int `json:"loaded_tapes"`
}

// TaskMetrics holds per-status task counts and the average duration of
// completed tasks, all derived from the tasks table.
type TaskMetrics struct {
	TotalTasks     int     `json:"total_tasks"`
	PendingTasks   int     `json:"pending_tasks"`
	RunningTasks   int     `json:"running_tasks"`
	CompletedTasks int     `json:"completed_tasks"`
	FailedTasks    int     `json:"failed_tasks"`
	AvgDurationSec float64 `json:"avg_duration_seconds"`
}

// APIMetrics describes API traffic. CollectMetrics only emits the zero value
// as a placeholder.
type APIMetrics struct {
	TotalRequests     int64   `json:"total_requests"`
	RequestsPerSec    float64 `json:"requests_per_second"`
	ErrorRate         float64 `json:"error_rate"`
	AvgLatencyMs      float64 `json:"avg_latency_ms"`
	ActiveConnections int     `json:"active_connections"`
}

// MetricsService collects and provides system metrics.
//
// NOTE(review): the CPU sampling fields below are read and written by
// getCPUUsage without any locking; confirm that CollectMetrics is never called
// concurrently on the same service, or guard them.
type MetricsService struct {
	db        *database.DB
	logger    *logger.Logger
	startTime time.Time

	// CPU usage is a delta between two /proc/stat samples, so the previous
	// sample, its timestamp and the last computed percentage are retained.
	lastCPU      *cpuStats
	lastCPUTime  time.Time
	lastCPUUsage float64
}

// cpuStats holds the counters parsed from the aggregate "cpu" line of
// /proc/stat, in the order they appear on that line.
type cpuStats struct {
	user    uint64
	nice    uint64
	system  uint64
	idle    uint64
	iowait  uint64
	irq     uint64
	softirq uint64
	steal   uint64
	guest   uint64
}

// total returns the sum of all accounted CPU time. guest is deliberately left
// out: the kernel already accounts guest time inside user, so adding it again
// would inflate the total and under-report usage.
func (c *cpuStats) total() uint64 {
	return c.user + c.nice + c.system + c.idle + c.iowait + c.irq + c.softirq + c.steal
}

// idleTotal returns the time the CPUs spent idle or waiting for I/O.
func (c *cpuStats) idleTotal() uint64 {
	return c.idle + c.iowait
}

// NewMetricsService creates a metrics service backed by db and logging to log.
// The creation time is recorded and used as the uptime fallback.
func NewMetricsService(db *database.DB, log *logger.Logger) *MetricsService {
	return &MetricsService{
		db:        db,
		logger:    log,
		startTime: time.Now(),
	}
}

// CollectMetrics runs every collector and returns the combined snapshot.
//
// Collection is best-effort: a failing collector is logged and its section is
// left at the zero value, so the returned error is always nil.
func (s *MetricsService) CollectMetrics(ctx context.Context) (*Metrics, error) {
	metrics := &Metrics{
		CollectedAt: time.Now(),
	}

	if sys, err := s.collectSystemMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect system metrics", "error", err)
	} else {
		metrics.System = *sys
	}

	if storage, err := s.collectStorageMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect storage metrics", "error", err)
	} else {
		metrics.Storage = *storage
	}

	if scst, err := s.collectSCSTMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect SCST metrics", "error", err)
	} else {
		metrics.SCST = *scst
	}

	if tape, err := s.collectTapeMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect tape metrics", "error", err)
	} else {
		metrics.Tape = *tape
	}

	if vtl, err := s.collectVTLMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect VTL metrics", "error", err)
	} else {
		metrics.VTL = *vtl
	}

	if tasks, err := s.collectTaskMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect task metrics", "error", err)
	} else {
		metrics.Tasks = *tasks
	}

	// API metrics are collected separately via middleware; this is a placeholder.
	metrics.API = APIMetrics{}

	return metrics, nil
}

// collectSystemMetrics gathers memory, CPU and uptime figures from /proc.
// The helpers it calls handle their own failures, so it never returns an
// error. ctx is currently unused.
func (s *MetricsService) collectSystemMetrics(ctx context.Context) (*SystemMetrics, error) {
	memoryTotal, memoryUsed, memoryPercent := s.getSystemMemory()
	cpuUsage := s.getCPUUsage()
	uptime := s.getSystemUptime()

	return &SystemMetrics{
		CPUUsagePercent: cpuUsage,
		MemoryUsed:      memoryUsed,
		MemoryTotal:     memoryTotal,
		MemoryPercent:   memoryPercent,
		DiskUsed:        0, // Would need to read from df
		DiskTotal:       0,
		DiskPercent:     0,
		UptimeSeconds:   int64(uptime),
	}, nil
}

// collectStorageMetrics counts active disks and repositories and sums the
// repositories' capacity. Any query failure aborts the collector.
func (s *MetricsService) collectStorageMetrics(ctx context.Context) (*StorageMetrics, error) {
	diskQuery := `SELECT COUNT(*) FROM physical_disks WHERE is_active = true`
	var totalDisks int
	if err := s.db.QueryRowContext(ctx, diskQuery).Scan(&totalDisks); err != nil {
		return nil, fmt.Errorf("failed to count disks: %w", err)
	}

	// COALESCE keeps the sums scannable into int64 when there are no rows.
	repoQuery := `
		SELECT COUNT(*), COALESCE(SUM(total_bytes), 0), COALESCE(SUM(used_bytes), 0)
		FROM disk_repositories
		WHERE is_active = true
	`
	var totalRepos int
	var totalCapacity, usedCapacity int64
	if err := s.db.QueryRowContext(ctx, repoQuery).Scan(&totalRepos, &totalCapacity, &usedCapacity); err != nil {
		return nil, fmt.Errorf("failed to query repositories: %w", err)
	}

	return &StorageMetrics{
		TotalDisks:         totalDisks,
		TotalRepositories:  totalRepos,
		TotalCapacityBytes: totalCapacity,
		UsedCapacityBytes:  usedCapacity,
		AvailableBytes:     totalCapacity - usedCapacity,
		UsagePercent:       percentOf(usedCapacity, totalCapacity),
	}, nil
}

// collectSCSTMetrics counts SCST targets, LUNs and initiators. The
// active-target count is best-effort: on failure it is logged and reported
// as zero instead of failing the whole collector.
func (s *MetricsService) collectSCSTMetrics(ctx context.Context) (*SCSTMetrics, error) {
	targetQuery := `SELECT COUNT(*) FROM scst_targets`
	var totalTargets int
	if err := s.db.QueryRowContext(ctx, targetQuery).Scan(&totalTargets); err != nil {
		return nil, fmt.Errorf("failed to count targets: %w", err)
	}

	lunQuery := `SELECT COUNT(*) FROM scst_luns`
	var totalLUNs int
	if err := s.db.QueryRowContext(ctx, lunQuery).Scan(&totalLUNs); err != nil {
		return nil, fmt.Errorf("failed to count LUNs: %w", err)
	}

	initQuery := `SELECT COUNT(*) FROM scst_initiators`
	var totalInitiators int
	if err := s.db.QueryRowContext(ctx, initQuery).Scan(&totalInitiators); err != nil {
		return nil, fmt.Errorf("failed to count initiators: %w", err)
	}

	// Active targets are targets with at least one LUN.
	activeQuery := `
		SELECT COUNT(DISTINCT target_id)
		FROM scst_luns
	`
	var activeTargets int
	if err := s.db.QueryRowContext(ctx, activeQuery).Scan(&activeTargets); err != nil {
		// Not critical: report zero, but leave a trace instead of hiding the failure.
		s.logger.Warn("Failed to count active SCST targets", "error", err)
		activeTargets = 0
	}

	return &SCSTMetrics{
		TotalTargets:    totalTargets,
		TotalLUNs:       totalLUNs,
		TotalInitiators: totalInitiators,
		ActiveTargets:   activeTargets,
	}, nil
}

// collectTapeMetrics counts physical tape libraries, drives and slots. A slot
// counts as occupied when its tape_barcode is not NULL.
func (s *MetricsService) collectTapeMetrics(ctx context.Context) (*TapeMetrics, error) {
	libQuery := `SELECT COUNT(*) FROM physical_tape_libraries`
	var totalLibraries int
	if err := s.db.QueryRowContext(ctx, libQuery).Scan(&totalLibraries); err != nil {
		return nil, fmt.Errorf("failed to count libraries: %w", err)
	}

	driveQuery := `SELECT COUNT(*) FROM physical_tape_drives`
	var totalDrives int
	if err := s.db.QueryRowContext(ctx, driveQuery).Scan(&totalDrives); err != nil {
		return nil, fmt.Errorf("failed to count drives: %w", err)
	}

	slotQuery := `
		SELECT COUNT(*), COUNT(CASE WHEN tape_barcode IS NOT NULL THEN 1 END)
		FROM physical_tape_slots
	`
	var totalSlots, occupiedSlots int
	if err := s.db.QueryRowContext(ctx, slotQuery).Scan(&totalSlots, &occupiedSlots); err != nil {
		return nil, fmt.Errorf("failed to count slots: %w", err)
	}

	return &TapeMetrics{
		TotalLibraries: totalLibraries,
		TotalDrives:    totalDrives,
		TotalSlots:     totalSlots,
		OccupiedSlots:  occupiedSlots,
	}, nil
}

// collectVTLMetrics counts virtual tape libraries, drives and tapes. The
// active-drive and loaded-tape counts are best-effort: on failure they are
// logged and reported as zero.
func (s *MetricsService) collectVTLMetrics(ctx context.Context) (*VTLMetrics, error) {
	libQuery := `SELECT COUNT(*) FROM virtual_tape_libraries`
	var totalLibraries int
	if err := s.db.QueryRowContext(ctx, libQuery).Scan(&totalLibraries); err != nil {
		return nil, fmt.Errorf("failed to count VTL libraries: %w", err)
	}

	driveQuery := `SELECT COUNT(*) FROM virtual_tape_drives`
	var totalDrives int
	if err := s.db.QueryRowContext(ctx, driveQuery).Scan(&totalDrives); err != nil {
		return nil, fmt.Errorf("failed to count VTL drives: %w", err)
	}

	tapeQuery := `SELECT COUNT(*) FROM virtual_tapes`
	var totalTapes int
	if err := s.db.QueryRowContext(ctx, tapeQuery).Scan(&totalTapes); err != nil {
		return nil, fmt.Errorf("failed to count VTL tapes: %w", err)
	}

	// Active drives are drives with a loaded tape.
	activeQuery := `
		SELECT COUNT(*)
		FROM virtual_tape_drives
		WHERE loaded_tape_id IS NOT NULL
	`
	var activeDrives int
	if err := s.db.QueryRowContext(ctx, activeQuery).Scan(&activeDrives); err != nil {
		s.logger.Warn("Failed to count active VTL drives", "error", err)
		activeDrives = 0
	}

	loadedQuery := `
		SELECT COUNT(*)
		FROM virtual_tapes
		WHERE is_loaded = true
	`
	var loadedTapes int
	if err := s.db.QueryRowContext(ctx, loadedQuery).Scan(&loadedTapes); err != nil {
		s.logger.Warn("Failed to count loaded VTL tapes", "error", err)
		loadedTapes = 0
	}

	return &VTLMetrics{
		TotalLibraries: totalLibraries,
		TotalDrives:    totalDrives,
		TotalTapes:     totalTapes,
		ActiveDrives:   activeDrives,
		LoadedTapes:    loadedTapes,
	}, nil
}

// collectTaskMetrics counts tasks per status and computes the average
// duration of completed tasks. The average is best-effort: a query failure is
// logged and reported as zero, as is a NULL result (no completed tasks).
func (s *MetricsService) collectTaskMetrics(ctx context.Context) (*TaskMetrics, error) {
	query := `
		SELECT
			COUNT(*) as total,
			COUNT(*) FILTER (WHERE status = 'pending') as pending,
			COUNT(*) FILTER (WHERE status = 'running') as running,
			COUNT(*) FILTER (WHERE status = 'completed') as completed,
			COUNT(*) FILTER (WHERE status = 'failed') as failed
		FROM tasks
	`
	var total, pending, running, completed, failed int
	if err := s.db.QueryRowContext(ctx, query).Scan(&total, &pending, &running, &completed, &failed); err != nil {
		return nil, fmt.Errorf("failed to count tasks: %w", err)
	}

	avgDurationQuery := `
		SELECT AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
		FROM tasks
		WHERE status = 'completed' AND started_at IS NOT NULL AND completed_at IS NOT NULL
	`
	// AVG over zero rows yields NULL, hence the nullable scan target.
	var avgDuration sql.NullFloat64
	if err := s.db.QueryRowContext(ctx, avgDurationQuery).Scan(&avgDuration); err != nil {
		s.logger.Warn("Failed to calculate average task duration", "error", err)
		avgDuration = sql.NullFloat64{Valid: false}
	}

	avgDurationSec := 0.0
	if avgDuration.Valid {
		avgDurationSec = avgDuration.Float64
	}

	return &TaskMetrics{
		TotalTasks:     total,
		PendingTasks:   pending,
		RunningTasks:   running,
		CompletedTasks: completed,
		FailedTasks:    failed,
		AvgDurationSec: avgDurationSec,
	}, nil
}

// getSystemUptime returns the first field of /proc/uptime as seconds. If the
// file cannot be opened, read or parsed, it logs a warning and returns the
// time elapsed since the service was created instead.
func (s *MetricsService) getSystemUptime() float64 {
	file, err := os.Open("/proc/uptime")
	if err != nil {
		s.logger.Warn("Failed to read /proc/uptime, using service uptime", "error", err)
		return time.Since(s.startTime).Seconds()
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	if !scanner.Scan() {
		s.logger.Warn("Failed to read /proc/uptime content, using service uptime")
		return time.Since(s.startTime).Seconds()
	}

	fields := strings.Fields(scanner.Text())
	if len(fields) == 0 {
		s.logger.Warn("No data in /proc/uptime, using service uptime")
		return time.Since(s.startTime).Seconds()
	}

	// Only the first field is used; any further fields are ignored.
	uptimeSeconds, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		s.logger.Warn("Failed to parse /proc/uptime, using service uptime", "error", err)
		return time.Since(s.startTime).Seconds()
	}

	return uptimeSeconds
}

// getSystemMemory reads /proc/meminfo and returns total bytes, used bytes and
// the usage percentage, in that order.
//
// Used memory is MemTotal - MemAvailable when MemAvailable is reported, and
// MemTotal - MemFree - Buffers - Cached otherwise. If the file cannot be
// opened or has no MemTotal, the Go runtime's own memory statistics are
// returned instead.
func (s *MetricsService) getSystemMemory() (int64, int64, float64) {
	file, err := os.Open("/proc/meminfo")
	if err != nil {
		s.logger.Warn("Failed to read /proc/meminfo, using Go runtime memory", "error", err)
		return runtimeMemoryUsage()
	}
	defer file.Close()

	var memTotal, memAvailable, memFree, buffers, cached int64

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}

		// Lines look like "MemTotal: 16375596 kB"; the unit may be absent.
		colonIdx := strings.Index(line, ":")
		if colonIdx == -1 {
			continue
		}
		key := strings.TrimSpace(line[:colonIdx])

		// Keep only the number and ignore a trailing unit such as "kB".
		fields := strings.Fields(line[colonIdx+1:])
		if len(fields) == 0 {
			continue
		}
		value, err := strconv.ParseInt(fields[0], 10, 64)
		if err != nil {
			continue
		}

		// Values in /proc/meminfo are in KB, convert to bytes.
		valueBytes := value * 1024

		switch key {
		case "MemTotal":
			memTotal = valueBytes
		case "MemAvailable":
			memAvailable = valueBytes
		case "MemFree":
			memFree = valueBytes
		case "Buffers":
			buffers = valueBytes
		case "Cached":
			cached = valueBytes
		}
	}
	if err := scanner.Err(); err != nil {
		s.logger.Warn("Error scanning /proc/meminfo", "error", err)
	}

	if memTotal == 0 {
		s.logger.Warn("Failed to get MemTotal from /proc/meminfo, using Go runtime memory", "memTotal", memTotal)
		return runtimeMemoryUsage()
	}

	var memoryUsed int64
	if memAvailable > 0 {
		memoryUsed = memTotal - memAvailable
	} else {
		memoryUsed = memTotal - memFree - buffers - cached
		if memoryUsed < 0 {
			// Inconsistent figures; fall back to the simplest estimate.
			memoryUsed = memTotal - memFree
		}
	}

	memoryPercent := percentOf(memoryUsed, memTotal)

	s.logger.Debug("System memory stats",
		"memTotal", memTotal,
		"memAvailable", memAvailable,
		"memoryUsed", memoryUsed,
		"memoryPercent", memoryPercent)

	return memTotal, memoryUsed, memoryPercent
}

// runtimeMemoryUsage reports the Go runtime's own memory (Alloc as used, Sys
// as total) in the same (total, used, percent) order as getSystemMemory. It
// is the fallback when /proc/meminfo is unusable.
func runtimeMemoryUsage() (int64, int64, float64) {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)

	used := int64(m.Alloc)
	total := int64(m.Sys)
	return total, used, percentOf(used, total)
}

// percentOf returns part as a percentage of whole, or 0 when whole is not
// positive. The guard avoids NaN/Inf results, which encoding/json refuses to
// marshal.
func percentOf(part, whole int64) float64 {
	if whole <= 0 {
		return 0
	}
	return float64(part) / float64(whole) * 100
}

// getCPUUsage returns the CPU usage percentage between the previous
// /proc/stat sample and the current one.
//
// The first call only stores a baseline and returns 0. A call made less than
// 0.1s after the previous sample returns the last computed value without
// re-sampling. A read or parse failure is logged and returns 0.
func (s *MetricsService) getCPUUsage() float64 {
	current, err := s.readCPUStats()
	if err != nil {
		s.logger.Warn("Failed to read CPU stats", "error", err)
		return 0.0
	}

	// First reading: nothing to diff against yet.
	if s.lastCPU == nil {
		s.lastCPU = current
		s.lastCPUTime = time.Now()
		return 0.0
	}

	// Too soon for a meaningful delta; report the previous value.
	if time.Since(s.lastCPUTime).Seconds() < 0.1 {
		return s.lastCPUUsage
	}

	usage, ok := cpuUsagePercent(s.lastCPU, current)

	// Always advance the baseline so a bad sample cannot poison later deltas.
	s.lastCPU = current
	s.lastCPUTime = time.Now()
	if ok {
		s.lastCPUUsage = usage
	}

	return s.lastCPUUsage
}

// cpuUsagePercent computes the busy percentage between two samples. It
// returns false when the total did not advance (identical samples, or
// counters that went backwards), because unsigned subtraction would otherwise
// underflow. The result is always within [0, 100].
func cpuUsagePercent(prev, curr *cpuStats) (float64, bool) {
	prevTotal, currTotal := prev.total(), curr.total()
	if currTotal <= prevTotal {
		return 0, false
	}
	totalDiff := currTotal - prevTotal

	// The idle sum may decrease between samples; treat that as no idle time
	// rather than letting the unsigned subtraction wrap around.
	var idleDiff uint64
	if prevIdle, currIdle := prev.idleTotal(), curr.idleTotal(); currIdle > prevIdle {
		idleDiff = currIdle - prevIdle
	}
	if idleDiff > totalDiff {
		idleDiff = totalDiff
	}

	return 100.0 * (1.0 - float64(idleDiff)/float64(totalDiff)), true
}

// readCPUStats reads the first line of /proc/stat and parses it as the
// aggregate "cpu" line.
func (s *MetricsService) readCPUStats() (*cpuStats, error) {
	file, err := os.Open("/proc/stat")
	if err != nil {
		return nil, fmt.Errorf("failed to open /proc/stat: %w", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	if !scanner.Scan() {
		// Scan returns false both on EOF and on error; surface the cause if any.
		if err := scanner.Err(); err != nil {
			return nil, fmt.Errorf("failed to read /proc/stat: %w", err)
		}
		return nil, fmt.Errorf("failed to read /proc/stat")
	}

	return parseCPUStatLine(scanner.Text())
}

// parseCPUStatLine parses an aggregate "cpu ..." line from /proc/stat. The
// first seven counters (user through softirq) are required; steal and guest
// are read when present. A counter that is not a valid unsigned integer is an
// error rather than a silent zero, which would corrupt the usage delta.
func parseCPUStatLine(line string) (*cpuStats, error) {
	line = strings.TrimSpace(line)
	if !strings.HasPrefix(line, "cpu ") {
		return nil, fmt.Errorf("invalid /proc/stat format")
	}

	fields := strings.Fields(line)
	if len(fields) < 8 {
		return nil, fmt.Errorf("insufficient CPU stats fields")
	}

	stats := &cpuStats{}
	// Destinations in on-disk order; fields[0] is the "cpu" label.
	targets := []*uint64{
		&stats.user,
		&stats.nice,
		&stats.system,
		&stats.idle,
		&stats.iowait,
		&stats.irq,
		&stats.softirq,
		&stats.steal,
		&stats.guest,
	}
	for i, dst := range targets {
		idx := i + 1
		if idx >= len(fields) {
			break // steal and guest are missing from this line
		}
		value, err := strconv.ParseUint(fields[idx], 10, 64)
		if err != nil {
			return nil, fmt.Errorf("invalid /proc/stat value %q in field %d: %w", fields[idx], idx, err)
		}
		*dst = value
	}

	return stats, nil
}