package monitoring

import (
	"context"
	"database/sql"
	"fmt"
	"runtime"
	"time"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
)

// Metrics is a point-in-time snapshot of every metric group gathered by
// MetricsService.CollectMetrics.
type Metrics struct {
	System      SystemMetrics  `json:"system"`
	Storage     StorageMetrics `json:"storage"`
	SCST        SCSTMetrics    `json:"scst"`
	Tape        TapeMetrics    `json:"tape"`
	VTL         VTLMetrics     `json:"vtl"`
	Tasks       TaskMetrics    `json:"tasks"`
	API         APIMetrics     `json:"api"`
	CollectedAt time.Time      `json:"collected_at"`
}

// SystemMetrics represents system-level metrics.
//
// The memory fields are filled from runtime.MemStats of the current process;
// the CPU and disk fields are currently always zero (see collectSystemMetrics).
type SystemMetrics struct {
	CPUUsagePercent float64 `json:"cpu_usage_percent"`
	MemoryUsed      int64   `json:"memory_used_bytes"`
	MemoryTotal     int64   `json:"memory_total_bytes"`
	MemoryPercent   float64 `json:"memory_usage_percent"`
	DiskUsed        int64   `json:"disk_used_bytes"`
	DiskTotal       int64   `json:"disk_total_bytes"`
	DiskPercent     float64 `json:"disk_usage_percent"`
	UptimeSeconds   int64   `json:"uptime_seconds"`
}

// StorageMetrics represents storage metrics aggregated over active disks and
// active disk repositories.
type StorageMetrics struct {
	TotalDisks         int     `json:"total_disks"`
	TotalRepositories  int     `json:"total_repositories"`
	TotalCapacityBytes int64   `json:"total_capacity_bytes"`
	UsedCapacityBytes  int64   `json:"used_capacity_bytes"`
	AvailableBytes     int64   `json:"available_bytes"`
	UsagePercent       float64 `json:"usage_percent"`
}

// SCSTMetrics represents SCST metrics.
type SCSTMetrics struct {
	TotalTargets    int `json:"total_targets"`
	TotalLUNs       int `json:"total_luns"`
	TotalInitiators int `json:"total_initiators"`
	ActiveTargets   int `json:"active_targets"`
}

// TapeMetrics represents physical tape metrics.
type TapeMetrics struct {
	TotalLibraries int `json:"total_libraries"`
	TotalDrives    int `json:"total_drives"`
	TotalSlots     int `json:"total_slots"`
	OccupiedSlots  int `json:"occupied_slots"`
}

// VTLMetrics represents virtual tape library metrics.
type VTLMetrics struct {
	TotalLibraries int `json:"total_libraries"`
	TotalDrives    int `json:"total_drives"`
	TotalTapes     int `json:"total_tapes"`
	ActiveDrives   int `json:"active_drives"`
	LoadedTapes    int `json:"loaded_tapes"`
}

// TaskMetrics represents task execution metrics.
type TaskMetrics struct {
	TotalTasks     int     `json:"total_tasks"`
	PendingTasks   int     `json:"pending_tasks"`
	RunningTasks   int     `json:"running_tasks"`
	CompletedTasks int     `json:"completed_tasks"`
	FailedTasks    int     `json:"failed_tasks"`
	AvgDurationSec float64 `json:"avg_duration_seconds"`
}

// APIMetrics represents API metrics. CollectMetrics leaves it zero-valued.
type APIMetrics struct {
	TotalRequests     int64   `json:"total_requests"`
	RequestsPerSec    float64 `json:"requests_per_second"`
	ErrorRate         float64 `json:"error_rate"`
	AvgLatencyMs      float64 `json:"avg_latency_ms"`
	ActiveConnections int     `json:"active_connections"`
}

// MetricsService collects and provides system metrics.
type MetricsService struct {
	db        *database.DB
	logger    *logger.Logger
	startTime time.Time // set at construction; basis for UptimeSeconds
}

// NewMetricsService creates a new metrics service. The moment of construction
// is recorded and later used to compute the reported uptime.
func NewMetricsService(db *database.DB, log *logger.Logger) *MetricsService {
	return &MetricsService{
		db:        db,
		logger:    log,
		startTime: time.Now(),
	}
}

// CollectMetrics gathers every metric group into a single snapshot.
//
// Collection is best-effort: when a group fails, the failure is logged and
// that group is left at its zero value while the remaining groups are still
// collected. The returned error is therefore always nil.
func (s *MetricsService) CollectMetrics(ctx context.Context) (*Metrics, error) {
	metrics := &Metrics{
		CollectedAt: time.Now(),
	}

	if sysMetrics, err := s.collectSystemMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect system metrics", "error", err)
	} else {
		metrics.System = *sysMetrics
	}

	if storageMetrics, err := s.collectStorageMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect storage metrics", "error", err)
	} else {
		metrics.Storage = *storageMetrics
	}

	if scstMetrics, err := s.collectSCSTMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect SCST metrics", "error", err)
	} else {
		metrics.SCST = *scstMetrics
	}

	if tapeMetrics, err := s.collectTapeMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect tape metrics", "error", err)
	} else {
		metrics.Tape = *tapeMetrics
	}

	if vtlMetrics, err := s.collectVTLMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect VTL metrics", "error", err)
	} else {
		metrics.VTL = *vtlMetrics
	}

	if taskMetrics, err := s.collectTaskMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect task metrics", "error", err)
	} else {
		metrics.Tasks = *taskMetrics
	}

	// API metrics are collected separately via middleware; this is a placeholder.
	metrics.API = APIMetrics{}

	return metrics, nil
}

// collectSystemMetrics collects system-level metrics.
//
// Memory figures come from runtime.ReadMemStats (Alloc as "used", Sys as
// "total"), so they describe this process's Go runtime. CPU and disk values
// are placeholders and always reported as zero. ctx is currently unused.
func (s *MetricsService) collectSystemMetrics(ctx context.Context) (*SystemMetrics, error) {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)

	memoryUsed := int64(m.Alloc)
	memoryTotal := int64(m.Sys)

	// Guard the division the same way collectStorageMetrics does: a zero
	// denominator would yield NaN, which encoding/json cannot serialize.
	memoryPercent := 0.0
	if memoryTotal > 0 {
		memoryPercent = float64(memoryUsed) / float64(memoryTotal) * 100
	}

	uptime := time.Since(s.startTime).Seconds()

	// CPU and disk would require external tools or system calls
	// (e.g. /proc/stat, df); placeholders are reported for now.
	return &SystemMetrics{
		CPUUsagePercent: 0.0,
		MemoryUsed:      memoryUsed,
		MemoryTotal:     memoryTotal,
		MemoryPercent:   memoryPercent,
		DiskUsed:        0,
		DiskTotal:       0,
		DiskPercent:     0,
		UptimeSeconds:   int64(uptime),
	}, nil
}

// collectStorageMetrics collects storage metrics: the number of active
// physical disks plus the count and summed capacity of active repositories.
// It returns an error if either query fails.
func (s *MetricsService) collectStorageMetrics(ctx context.Context) (*StorageMetrics, error) {
	// Count disks.
	diskQuery := `SELECT COUNT(*) FROM physical_disks WHERE is_active = true`
	var totalDisks int
	if err := s.db.QueryRowContext(ctx, diskQuery).Scan(&totalDisks); err != nil {
		return nil, fmt.Errorf("failed to count disks: %w", err)
	}

	// Count repositories and sum their capacity. COALESCE keeps the sums at
	// zero instead of NULL when no repository matches.
	repoQuery := `
		SELECT
			COUNT(*),
			COALESCE(SUM(total_bytes), 0),
			COALESCE(SUM(used_bytes), 0)
		FROM disk_repositories
		WHERE is_active = true
	`
	var totalRepos int
	var totalCapacity, usedCapacity int64
	if err := s.db.QueryRowContext(ctx, repoQuery).Scan(&totalRepos, &totalCapacity, &usedCapacity); err != nil {
		return nil, fmt.Errorf("failed to query repositories: %w", err)
	}

	availableBytes := totalCapacity - usedCapacity
	usagePercent := 0.0
	if totalCapacity > 0 {
		usagePercent = float64(usedCapacity) / float64(totalCapacity) * 100
	}

	return &StorageMetrics{
		TotalDisks:         totalDisks,
		TotalRepositories:  totalRepos,
		TotalCapacityBytes: totalCapacity,
		UsedCapacityBytes:  usedCapacity,
		AvailableBytes:     availableBytes,
		UsagePercent:       usagePercent,
	}, nil
}

// collectSCSTMetrics collects SCST metrics.
//
// The target, LUN and initiator counts are mandatory and any failure is
// returned. The active-target count is best-effort: on failure it is logged
// and reported as zero.
func (s *MetricsService) collectSCSTMetrics(ctx context.Context) (*SCSTMetrics, error) {
	// Count targets.
	targetQuery := `SELECT COUNT(*) FROM scst_targets`
	var totalTargets int
	if err := s.db.QueryRowContext(ctx, targetQuery).Scan(&totalTargets); err != nil {
		return nil, fmt.Errorf("failed to count targets: %w", err)
	}

	// Count LUNs.
	lunQuery := `SELECT COUNT(*) FROM scst_luns`
	var totalLUNs int
	if err := s.db.QueryRowContext(ctx, lunQuery).Scan(&totalLUNs); err != nil {
		return nil, fmt.Errorf("failed to count LUNs: %w", err)
	}

	// Count initiators.
	initQuery := `SELECT COUNT(*) FROM scst_initiators`
	var totalInitiators int
	if err := s.db.QueryRowContext(ctx, initQuery).Scan(&totalInitiators); err != nil {
		return nil, fmt.Errorf("failed to count initiators: %w", err)
	}

	// Active targets (targets with at least one LUN). Not critical, so a
	// failure is logged rather than returned.
	activeQuery := `
		SELECT COUNT(DISTINCT target_id)
		FROM scst_luns
	`
	var activeTargets int
	if err := s.db.QueryRowContext(ctx, activeQuery).Scan(&activeTargets); err != nil {
		s.logger.Error("Failed to count active SCST targets", "error", err)
		activeTargets = 0
	}

	return &SCSTMetrics{
		TotalTargets:    totalTargets,
		TotalLUNs:       totalLUNs,
		TotalInitiators: totalInitiators,
		ActiveTargets:   activeTargets,
	}, nil
}

// collectTapeMetrics collects physical tape metrics: library, drive and slot
// counts, where a slot counts as occupied when its tape_barcode is not NULL.
// It returns an error if any query fails.
func (s *MetricsService) collectTapeMetrics(ctx context.Context) (*TapeMetrics, error) {
	// Count libraries.
	libQuery := `SELECT COUNT(*) FROM physical_tape_libraries`
	var totalLibraries int
	if err := s.db.QueryRowContext(ctx, libQuery).Scan(&totalLibraries); err != nil {
		return nil, fmt.Errorf("failed to count libraries: %w", err)
	}

	// Count drives.
	driveQuery := `SELECT COUNT(*) FROM physical_tape_drives`
	var totalDrives int
	if err := s.db.QueryRowContext(ctx, driveQuery).Scan(&totalDrives); err != nil {
		return nil, fmt.Errorf("failed to count drives: %w", err)
	}

	// Count slots, total and occupied, in a single pass.
	slotQuery := `
		SELECT
			COUNT(*),
			COUNT(CASE WHEN tape_barcode IS NOT NULL THEN 1 END)
		FROM physical_tape_slots
	`
	var totalSlots, occupiedSlots int
	if err := s.db.QueryRowContext(ctx, slotQuery).Scan(&totalSlots, &occupiedSlots); err != nil {
		return nil, fmt.Errorf("failed to count slots: %w", err)
	}

	return &TapeMetrics{
		TotalLibraries: totalLibraries,
		TotalDrives:    totalDrives,
		TotalSlots:     totalSlots,
		OccupiedSlots:  occupiedSlots,
	}, nil
}

// collectVTLMetrics collects VTL metrics.
//
// The library, drive and tape counts are mandatory and any failure is
// returned. The active-drive and loaded-tape counts are best-effort: on
// failure they are logged and reported as zero.
func (s *MetricsService) collectVTLMetrics(ctx context.Context) (*VTLMetrics, error) {
	// Count libraries.
	libQuery := `SELECT COUNT(*) FROM virtual_tape_libraries`
	var totalLibraries int
	if err := s.db.QueryRowContext(ctx, libQuery).Scan(&totalLibraries); err != nil {
		return nil, fmt.Errorf("failed to count VTL libraries: %w", err)
	}

	// Count drives.
	driveQuery := `SELECT COUNT(*) FROM virtual_tape_drives`
	var totalDrives int
	if err := s.db.QueryRowContext(ctx, driveQuery).Scan(&totalDrives); err != nil {
		return nil, fmt.Errorf("failed to count VTL drives: %w", err)
	}

	// Count tapes.
	tapeQuery := `SELECT COUNT(*) FROM virtual_tapes`
	var totalTapes int
	if err := s.db.QueryRowContext(ctx, tapeQuery).Scan(&totalTapes); err != nil {
		return nil, fmt.Errorf("failed to count VTL tapes: %w", err)
	}

	// Count active drives (drives with a loaded tape); best-effort.
	activeQuery := `
		SELECT COUNT(*)
		FROM virtual_tape_drives
		WHERE loaded_tape_id IS NOT NULL
	`
	var activeDrives int
	if err := s.db.QueryRowContext(ctx, activeQuery).Scan(&activeDrives); err != nil {
		s.logger.Error("Failed to count active VTL drives", "error", err)
		activeDrives = 0
	}

	// Count loaded tapes; best-effort.
	loadedQuery := `
		SELECT COUNT(*)
		FROM virtual_tapes
		WHERE is_loaded = true
	`
	var loadedTapes int
	if err := s.db.QueryRowContext(ctx, loadedQuery).Scan(&loadedTapes); err != nil {
		s.logger.Error("Failed to count loaded VTL tapes", "error", err)
		loadedTapes = 0
	}

	return &VTLMetrics{
		TotalLibraries: totalLibraries,
		TotalDrives:    totalDrives,
		TotalTapes:     totalTapes,
		ActiveDrives:   activeDrives,
		LoadedTapes:    loadedTapes,
	}, nil
}

// collectTaskMetrics collects task execution metrics.
//
// The per-status counts are mandatory and a failure is returned. The average
// duration of completed tasks is best-effort: on failure it is logged and
// reported as zero, as it also is when no completed task has both timestamps.
func (s *MetricsService) collectTaskMetrics(ctx context.Context) (*TaskMetrics, error) {
	// Count tasks by status in a single pass.
	query := `
		SELECT
			COUNT(*) as total,
			COUNT(*) FILTER (WHERE status = 'pending') as pending,
			COUNT(*) FILTER (WHERE status = 'running') as running,
			COUNT(*) FILTER (WHERE status = 'completed') as completed,
			COUNT(*) FILTER (WHERE status = 'failed') as failed
		FROM tasks
	`
	var total, pending, running, completed, failed int
	if err := s.db.QueryRowContext(ctx, query).Scan(&total, &pending, &running, &completed, &failed); err != nil {
		return nil, fmt.Errorf("failed to count tasks: %w", err)
	}

	// Average duration for completed tasks. AVG yields NULL when no row
	// matches, hence the nullable scan target.
	avgDurationQuery := `
		SELECT AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
		FROM tasks
		WHERE status = 'completed'
			AND started_at IS NOT NULL
			AND completed_at IS NOT NULL
	`
	var avgDuration sql.NullFloat64
	if err := s.db.QueryRowContext(ctx, avgDurationQuery).Scan(&avgDuration); err != nil {
		s.logger.Error("Failed to compute average task duration", "error", err)
		avgDuration = sql.NullFloat64{Valid: false}
	}

	avgDurationSec := 0.0
	if avgDuration.Valid {
		avgDurationSec = avgDuration.Float64
	}

	return &TaskMetrics{
		TotalTasks:     total,
		PendingTasks:   pending,
		RunningTasks:   running,
		CompletedTasks: completed,
		FailedTasks:    failed,
		AvgDurationSec: avgDurationSec,
	}, nil
}