// calypso/backend/internal/monitoring/metrics.go

package monitoring

import (
	"bufio"
	"context"
	"database/sql"
	"fmt"
	"os"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
)

// Metrics is the aggregate snapshot produced by MetricsService.CollectMetrics.
//
// Each section holds the result of one collector. When a collector fails,
// CollectMetrics logs the error and leaves that section at its zero value.
// API is always left at its zero value by CollectMetrics (placeholder), and
// CollectedAt is set from time.Now() at the start of collection.
type Metrics struct {
	System      SystemMetrics  `json:"system"`
	Storage     StorageMetrics `json:"storage"`
	SCST        SCSTMetrics    `json:"scst"`
	Tape        TapeMetrics    `json:"tape"`
	VTL         VTLMetrics     `json:"vtl"`
	Tasks       TaskMetrics    `json:"tasks"`
	API         APIMetrics     `json:"api"`
	CollectedAt time.Time      `json:"collected_at"`
}

// SystemMetrics represents host-level metrics gathered by collectSystemMetrics.
//
// CPUUsagePercent comes from getCPUUsage (derived from /proc/stat), the
// Memory* fields from getSystemMemory (/proc/meminfo, with a Go runtime
// fallback), and UptimeSeconds from getSystemUptime (/proc/uptime, with a
// service-uptime fallback). Memory values are in bytes. The Disk* fields are
// not collected yet: collectSystemMetrics always sets them to 0.
type SystemMetrics struct {
	CPUUsagePercent float64 `json:"cpu_usage_percent"`
	MemoryUsed      int64   `json:"memory_used_bytes"`
	MemoryTotal     int64   `json:"memory_total_bytes"`
	MemoryPercent   float64 `json:"memory_usage_percent"`
	DiskUsed        int64   `json:"disk_used_bytes"`
	DiskTotal       int64   `json:"disk_total_bytes"`
	DiskPercent     float64 `json:"disk_usage_percent"`
	UptimeSeconds   int64   `json:"uptime_seconds"`
}

// StorageMetrics represents storage metrics gathered by collectStorageMetrics.
//
// TotalDisks counts physical_disks rows with is_active = true. The repository
// count and the capacity sums come from disk_repositories rows with
// is_active = true. AvailableBytes is TotalCapacityBytes - UsedCapacityBytes,
// and UsagePercent is used/total*100, or 0 when the total capacity is not
// positive.
type StorageMetrics struct {
	TotalDisks         int     `json:"total_disks"`
	TotalRepositories  int     `json:"total_repositories"`
	TotalCapacityBytes int64   `json:"total_capacity_bytes"`
	UsedCapacityBytes  int64   `json:"used_capacity_bytes"`
	AvailableBytes     int64   `json:"available_bytes"`
	UsagePercent       float64 `json:"usage_percent"`
}

// SCSTMetrics represents SCST metrics gathered by collectSCSTMetrics.
//
// TotalTargets, TotalLUNs and TotalInitiators are row counts of the
// scst_targets, scst_luns and scst_initiators tables. ActiveTargets is the
// number of distinct target_id values in scst_luns (targets with at least one
// LUN); it is reported as 0 when that query fails.
type SCSTMetrics struct {
	TotalTargets    int `json:"total_targets"`
	TotalLUNs       int `json:"total_luns"`
	TotalInitiators int `json:"total_initiators"`
	ActiveTargets   int `json:"active_targets"`
}

// TapeMetrics represents physical tape metrics gathered by collectTapeMetrics.
//
// TotalLibraries, TotalDrives and TotalSlots are row counts of the
// physical_tape_libraries, physical_tape_drives and physical_tape_slots
// tables. OccupiedSlots counts the physical_tape_slots rows whose
// tape_barcode is not NULL.
type TapeMetrics struct {
	TotalLibraries int `json:"total_libraries"`
	TotalDrives    int `json:"total_drives"`
	TotalSlots     int `json:"total_slots"`
	OccupiedSlots  int `json:"occupied_slots"`
}

// VTLMetrics represents virtual tape library metrics gathered by
// collectVTLMetrics.
//
// TotalLibraries, TotalDrives and TotalTapes are row counts of the
// virtual_tape_libraries, virtual_tape_drives and virtual_tapes tables.
// ActiveDrives counts virtual_tape_drives rows with a non-NULL loaded_tape_id,
// and LoadedTapes counts virtual_tapes rows with is_loaded = true; each of
// these two is reported as 0 when its query fails.
type VTLMetrics struct {
	TotalLibraries int `json:"total_libraries"`
	TotalDrives    int `json:"total_drives"`
	TotalTapes     int `json:"total_tapes"`
	ActiveDrives   int `json:"active_drives"`
	LoadedTapes    int `json:"loaded_tapes"`
}

// TaskMetrics represents task execution metrics gathered by
// collectTaskMetrics.
//
// TotalTasks counts every row of the tasks table; the Pending/Running/
// Completed/Failed counters count rows whose status equals 'pending',
// 'running', 'completed' and 'failed' respectively. AvgDurationSec is the
// average of completed_at - started_at, in seconds, over completed tasks that
// have both timestamps; it is 0 when there are no such tasks or the query
// fails.
type TaskMetrics struct {
	TotalTasks     int     `json:"total_tasks"`
	PendingTasks   int     `json:"pending_tasks"`
	RunningTasks   int     `json:"running_tasks"`
	CompletedTasks int     `json:"completed_tasks"`
	FailedTasks    int     `json:"failed_tasks"`
	AvgDurationSec float64 `json:"avg_duration_seconds"`
}

// APIMetrics represents API request metrics.
//
// Nothing in this file populates it: CollectMetrics leaves it at its zero
// value as a placeholder, with a note that API metrics are collected
// separately via middleware.
// NOTE(review): units are only implied by the field names and JSON tags
// (per-second rate, milliseconds) — confirm against the middleware that
// fills these in.
type APIMetrics struct {
	TotalRequests     int64   `json:"total_requests"`
	RequestsPerSec    float64 `json:"requests_per_second"`
	ErrorRate         float64 `json:"error_rate"`
	AvgLatencyMs      float64 `json:"avg_latency_ms"`
	ActiveConnections int     `json:"active_connections"`
}

// MetricsService collects and provides system metrics.
//
// db is used for the storage, SCST, tape, VTL and task queries; logger
// receives collection errors and warnings. startTime is set when the service
// is constructed and is used as the uptime fallback when /proc/uptime cannot
// be read. lastCPU and lastCPUTime hold the previous /proc/stat sample and
// when it was taken; lastCPU is nil until getCPUUsage has run once.
//
// NOTE(review): lastCPU and lastCPUTime are read and written by getCPUUsage
// without any locking — confirm that CollectMetrics is never called
// concurrently on the same service.
type MetricsService struct {
	db          *database.DB
	logger      *logger.Logger
	startTime   time.Time
	lastCPU     *cpuStats // Previous sample, for CPU usage calculation
	lastCPUTime time.Time
}

// cpuStats represents CPU statistics from /proc/stat.
//
// readCPUStats fills it from the first line of the file (the one starting
// with "cpu "), taking the fields in column order: user, nice, system, idle,
// iowait, irq, softirq, then steal and guest when those columns are present.
// The values are the raw counters as read; getCPUUsage works on the difference
// between two samples.
type cpuStats struct {
	user    uint64
	nice    uint64
	system  uint64
	idle    uint64
	iowait  uint64
	irq     uint64
	softirq uint64
	steal   uint64
	guest   uint64
}

// NewMetricsService builds a MetricsService around the given database handle
// and logger. The construction time is recorded as the service start time,
// which getSystemUptime uses as a fallback when /proc/uptime is unavailable.
func NewMetricsService(db *database.DB, log *logger.Logger) *MetricsService {
	svc := new(MetricsService)
	svc.db = db
	svc.logger = log
	svc.startTime = time.Now()
	return svc
}

// CollectMetrics gathers every metric section into a single snapshot.
//
// Collection is best-effort: a failing collector is logged at error level and
// its section is left at the zero value, so the returned error is always nil.
// The API section is never filled in here; API metrics are collected
// separately via middleware.
func (s *MetricsService) CollectMetrics(ctx context.Context) (*Metrics, error) {
	snapshot := &Metrics{CollectedAt: time.Now()}

	// Host-level metrics (CPU, memory, uptime).
	if sys, err := s.collectSystemMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect system metrics", "error", err)
	} else {
		snapshot.System = *sys
	}

	// Disk and repository capacity.
	if storage, err := s.collectStorageMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect storage metrics", "error", err)
	} else {
		snapshot.Storage = *storage
	}

	// SCST targets, LUNs and initiators.
	if scst, err := s.collectSCSTMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect SCST metrics", "error", err)
	} else {
		snapshot.SCST = *scst
	}

	// Physical tape libraries.
	if tape, err := s.collectTapeMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect tape metrics", "error", err)
	} else {
		snapshot.Tape = *tape
	}

	// Virtual tape libraries.
	if vtl, err := s.collectVTLMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect VTL metrics", "error", err)
	} else {
		snapshot.VTL = *vtl
	}

	// Task execution statistics.
	if tasks, err := s.collectTaskMetrics(ctx); err != nil {
		s.logger.Error("Failed to collect task metrics", "error", err)
	} else {
		snapshot.Tasks = *tasks
	}

	// snapshot.API intentionally stays at its zero value (placeholder).
	return snapshot, nil
}

// collectSystemMetrics gathers host-level metrics: memory from
// getSystemMemory, CPU usage from getCPUUsage and uptime from
// getSystemUptime. The disk fields are not collected yet and stay at 0.
// ctx is currently unused and the returned error is always nil.
func (s *MetricsService) collectSystemMetrics(ctx context.Context) (*SystemMetrics, error) {
	var sys SystemMetrics

	// Memory figures come from /proc/meminfo.
	sys.MemoryTotal, sys.MemoryUsed, sys.MemoryPercent = s.getSystemMemory()

	// CPU usage is derived from /proc/stat.
	sys.CPUUsagePercent = s.getCPUUsage()

	// Uptime is read from /proc/uptime; fractional seconds are truncated.
	sys.UptimeSeconds = int64(s.getSystemUptime())

	// DiskUsed, DiskTotal and DiskPercent remain 0 (would need to read from df).
	return &sys, nil
}

// collectStorageMetrics counts active physical disks and aggregates the
// capacity of active disk repositories. It returns an error if either query
// fails. The usage percentage is 0 when the total capacity is not positive.
func (s *MetricsService) collectStorageMetrics(ctx context.Context) (*StorageMetrics, error) {
	result := &StorageMetrics{}

	// Active physical disks.
	disksSQL := `SELECT COUNT(*) FROM physical_disks WHERE is_active = true`
	err := s.db.QueryRowContext(ctx, disksSQL).Scan(&result.TotalDisks)
	if err != nil {
		return nil, fmt.Errorf("failed to count disks: %w", err)
	}

	// Active repositories with their summed total and used capacity.
	reposSQL := `
		SELECT COUNT(*), COALESCE(SUM(total_bytes), 0), COALESCE(SUM(used_bytes), 0)
		FROM disk_repositories
		WHERE is_active = true
	`
	err = s.db.QueryRowContext(ctx, reposSQL).Scan(
		&result.TotalRepositories,
		&result.TotalCapacityBytes,
		&result.UsedCapacityBytes,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to query repositories: %w", err)
	}

	result.AvailableBytes = result.TotalCapacityBytes - result.UsedCapacityBytes
	if result.TotalCapacityBytes > 0 {
		result.UsagePercent = float64(result.UsedCapacityBytes) / float64(result.TotalCapacityBytes) * 100
	}

	return result, nil
}

// collectSCSTMetrics collects SCST metrics from the database.
//
// The target, LUN and initiator counts are required: a failure of any of
// those queries is returned as an error. The active-target count (targets
// with at least one LUN) is best-effort: if that query fails the failure is
// logged as a warning and ActiveTargets is reported as 0.
func (s *MetricsService) collectSCSTMetrics(ctx context.Context) (*SCSTMetrics, error) {
	// Count targets
	targetQuery := `SELECT COUNT(*) FROM scst_targets`
	var totalTargets int
	if err := s.db.QueryRowContext(ctx, targetQuery).Scan(&totalTargets); err != nil {
		return nil, fmt.Errorf("failed to count targets: %w", err)
	}

	// Count LUNs
	lunQuery := `SELECT COUNT(*) FROM scst_luns`
	var totalLUNs int
	if err := s.db.QueryRowContext(ctx, lunQuery).Scan(&totalLUNs); err != nil {
		return nil, fmt.Errorf("failed to count LUNs: %w", err)
	}

	// Count initiators
	initQuery := `SELECT COUNT(*) FROM scst_initiators`
	var totalInitiators int
	if err := s.db.QueryRowContext(ctx, initQuery).Scan(&totalInitiators); err != nil {
		return nil, fmt.Errorf("failed to count initiators: %w", err)
	}

	// Active targets (targets with at least one LUN)
	activeQuery := `
		SELECT COUNT(DISTINCT target_id)
		FROM scst_luns
	`
	var activeTargets int
	if err := s.db.QueryRowContext(ctx, activeQuery).Scan(&activeTargets); err != nil {
		// Not critical: keep the other counts, but do not hide the failure.
		s.logger.Warn("Failed to count active SCST targets", "error", err)
		activeTargets = 0
	}

	return &SCSTMetrics{
		TotalTargets:    totalTargets,
		TotalLUNs:       totalLUNs,
		TotalInitiators: totalInitiators,
		ActiveTargets:   activeTargets,
	}, nil
}

// collectTapeMetrics counts physical tape libraries, drives and slots.
// A slot counts as occupied when its tape_barcode is not NULL. Any query
// failure is returned as an error.
func (s *MetricsService) collectTapeMetrics(ctx context.Context) (*TapeMetrics, error) {
	result := &TapeMetrics{}

	// Libraries.
	librariesSQL := `SELECT COUNT(*) FROM physical_tape_libraries`
	err := s.db.QueryRowContext(ctx, librariesSQL).Scan(&result.TotalLibraries)
	if err != nil {
		return nil, fmt.Errorf("failed to count libraries: %w", err)
	}

	// Drives.
	drivesSQL := `SELECT COUNT(*) FROM physical_tape_drives`
	err = s.db.QueryRowContext(ctx, drivesSQL).Scan(&result.TotalDrives)
	if err != nil {
		return nil, fmt.Errorf("failed to count drives: %w", err)
	}

	// Slots: total and occupied in a single pass.
	slotsSQL := `
		SELECT COUNT(*), COUNT(CASE WHEN tape_barcode IS NOT NULL THEN 1 END)
		FROM physical_tape_slots
	`
	err = s.db.QueryRowContext(ctx, slotsSQL).Scan(&result.TotalSlots, &result.OccupiedSlots)
	if err != nil {
		return nil, fmt.Errorf("failed to count slots: %w", err)
	}

	return result, nil
}

// collectVTLMetrics collects virtual tape library metrics from the database.
//
// The library, drive and tape counts are required: a failure of any of those
// queries is returned as an error. The active-drive and loaded-tape counts
// are best-effort: if either query fails the failure is logged as a warning
// and the corresponding value is reported as 0.
func (s *MetricsService) collectVTLMetrics(ctx context.Context) (*VTLMetrics, error) {
	// Count libraries
	libQuery := `SELECT COUNT(*) FROM virtual_tape_libraries`
	var totalLibraries int
	if err := s.db.QueryRowContext(ctx, libQuery).Scan(&totalLibraries); err != nil {
		return nil, fmt.Errorf("failed to count VTL libraries: %w", err)
	}

	// Count drives
	driveQuery := `SELECT COUNT(*) FROM virtual_tape_drives`
	var totalDrives int
	if err := s.db.QueryRowContext(ctx, driveQuery).Scan(&totalDrives); err != nil {
		return nil, fmt.Errorf("failed to count VTL drives: %w", err)
	}

	// Count tapes
	tapeQuery := `SELECT COUNT(*) FROM virtual_tapes`
	var totalTapes int
	if err := s.db.QueryRowContext(ctx, tapeQuery).Scan(&totalTapes); err != nil {
		return nil, fmt.Errorf("failed to count VTL tapes: %w", err)
	}

	// Count active drives (drives with loaded tape)
	activeQuery := `
		SELECT COUNT(*)
		FROM virtual_tape_drives
		WHERE loaded_tape_id IS NOT NULL
	`
	var activeDrives int
	if err := s.db.QueryRowContext(ctx, activeQuery).Scan(&activeDrives); err != nil {
		// Not critical: keep the other counts, but do not hide the failure.
		s.logger.Warn("Failed to count active VTL drives", "error", err)
		activeDrives = 0
	}

	// Count loaded tapes
	loadedQuery := `
		SELECT COUNT(*)
		FROM virtual_tapes
		WHERE is_loaded = true
	`
	var loadedTapes int
	if err := s.db.QueryRowContext(ctx, loadedQuery).Scan(&loadedTapes); err != nil {
		// Not critical: keep the other counts, but do not hide the failure.
		s.logger.Warn("Failed to count loaded VTL tapes", "error", err)
		loadedTapes = 0
	}

	return &VTLMetrics{
		TotalLibraries: totalLibraries,
		TotalDrives:    totalDrives,
		TotalTapes:     totalTapes,
		ActiveDrives:   activeDrives,
		LoadedTapes:    loadedTapes,
	}, nil
}

// collectTaskMetrics collects task execution metrics from the tasks table.
//
// The per-status counts are required: a failure of that query is returned as
// an error. The average duration of completed tasks is best-effort: if the
// query fails the failure is logged as a warning and AvgDurationSec is
// reported as 0. AvgDurationSec is also 0 when no completed task has both a
// start and a completion timestamp (the average is NULL).
func (s *MetricsService) collectTaskMetrics(ctx context.Context) (*TaskMetrics, error) {
	// Count tasks by status
	query := `
		SELECT
			COUNT(*) as total,
			COUNT(*) FILTER (WHERE status = 'pending') as pending,
			COUNT(*) FILTER (WHERE status = 'running') as running,
			COUNT(*) FILTER (WHERE status = 'completed') as completed,
			COUNT(*) FILTER (WHERE status = 'failed') as failed
		FROM tasks
	`
	var total, pending, running, completed, failed int
	if err := s.db.QueryRowContext(ctx, query).Scan(&total, &pending, &running, &completed, &failed); err != nil {
		return nil, fmt.Errorf("failed to count tasks: %w", err)
	}

	// Calculate average duration for completed tasks
	avgDurationQuery := `
		SELECT AVG(EXTRACT(EPOCH FROM (completed_at - started_at)))
		FROM tasks
		WHERE status = 'completed' AND started_at IS NOT NULL AND completed_at IS NOT NULL
	`
	var avgDuration sql.NullFloat64
	if err := s.db.QueryRowContext(ctx, avgDurationQuery).Scan(&avgDuration); err != nil {
		// Not critical: keep the counts, but do not hide the failure.
		s.logger.Warn("Failed to calculate average task duration", "error", err)
		avgDuration = sql.NullFloat64{Valid: false}
	}

	// A NULL average (no matching rows) or a failed query both report 0.
	avgDurationSec := 0.0
	if avgDuration.Valid {
		avgDurationSec = avgDuration.Float64
	}

	return &TaskMetrics{
		TotalTasks:     total,
		PendingTasks:   pending,
		RunningTasks:   running,
		CompletedTasks: completed,
		FailedTasks:    failed,
		AvgDurationSec: avgDurationSec,
	}, nil
}

// getSystemUptime returns the system uptime in seconds, taken from the first
// field of the first line of /proc/uptime. If the file cannot be opened, is
// empty, or cannot be parsed, a warning is logged and the time elapsed since
// the service was constructed is returned instead.
func (s *MetricsService) getSystemUptime() float64 {
	// serviceUptime is the fallback used on every failure path below.
	serviceUptime := func() float64 {
		return time.Since(s.startTime).Seconds()
	}

	f, err := os.Open("/proc/uptime")
	if err != nil {
		s.logger.Warn("Failed to read /proc/uptime, using service uptime", "error", err)
		return serviceUptime()
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	if ok := sc.Scan(); !ok {
		s.logger.Warn("Failed to read /proc/uptime content, using service uptime")
		return serviceUptime()
	}

	// strings.Fields already ignores leading and trailing whitespace.
	parts := strings.Fields(sc.Text())
	if len(parts) < 1 {
		s.logger.Warn("No data in /proc/uptime, using service uptime")
		return serviceUptime()
	}

	// The first field is the system uptime in seconds.
	secs, err := strconv.ParseFloat(parts[0], 64)
	if err != nil {
		s.logger.Warn("Failed to parse /proc/uptime, using service uptime", "error", err)
		return serviceUptime()
	}

	return secs
}

// getSystemMemory reports total memory, used memory (both in bytes) and the
// usage percentage, read from /proc/meminfo.
//
// Used memory is MemTotal - MemAvailable when MemAvailable is positive;
// otherwise it is MemTotal - MemFree - Buffers - Cached, or MemTotal - MemFree
// if that comes out negative. If the file cannot be opened or yields no
// MemTotal, a warning is logged and the Go runtime's own figures are returned
// instead (Sys as total, Alloc as used).
func (s *MetricsService) getSystemMemory() (int64, int64, float64) {
	// runtimeFallback substitutes this process's Go runtime memory statistics.
	runtimeFallback := func() (int64, int64, float64) {
		var ms runtime.MemStats
		runtime.ReadMemStats(&ms)
		used, total := int64(ms.Alloc), int64(ms.Sys)
		return total, used, float64(used) / float64(total) * 100
	}

	f, err := os.Open("/proc/meminfo")
	if err != nil {
		s.logger.Warn("Failed to read /proc/meminfo, using Go runtime memory", "error", err)
		return runtimeFallback()
	}
	defer f.Close()

	var memTotal, memAvailable, memFree, buffers, cached int64

	// Keys of interest, each mapped to the variable that receives its value.
	wanted := map[string]*int64{
		"MemTotal":     &memTotal,
		"MemAvailable": &memAvailable,
		"MemFree":      &memFree,
		"Buffers":      &buffers,
		"Cached":       &cached,
	}

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		// Lines look like "MemTotal:       16375596 kB"; the unit may be absent.
		parts := strings.SplitN(sc.Text(), ":", 2)
		if len(parts) < 2 {
			continue
		}

		// Only the leading number matters; a trailing unit such as "kB" is ignored.
		tokens := strings.Fields(parts[1])
		if len(tokens) == 0 {
			continue
		}

		kb, parseErr := strconv.ParseInt(tokens[0], 10, 64)
		if parseErr != nil {
			continue
		}

		// Values are treated as KB and converted to bytes.
		if dst, ok := wanted[strings.TrimSpace(parts[0])]; ok {
			*dst = kb * 1024
		}
	}

	if scanErr := sc.Err(); scanErr != nil {
		s.logger.Warn("Error scanning /proc/meminfo", "error", scanErr)
	}

	if memTotal == 0 {
		s.logger.Warn("Failed to get MemTotal from /proc/meminfo, using Go runtime memory", "memTotal", memTotal)
		return runtimeFallback()
	}

	// Prefer MemAvailable; fall back to the free/buffers/cached estimate.
	memoryUsed := memTotal - memAvailable
	if memAvailable <= 0 {
		memoryUsed = memTotal - memFree - buffers - cached
		if memoryUsed < 0 {
			memoryUsed = memTotal - memFree
		}
	}

	memoryPercent := float64(memoryUsed) / float64(memTotal) * 100

	s.logger.Debug("System memory stats",
		"memTotal", memTotal,
		"memAvailable", memAvailable,
		"memoryUsed", memoryUsed,
		"memoryPercent", memoryPercent)

	return memTotal, memoryUsed, memoryPercent
}

// getCPUUsage returns the CPU usage percentage since the previous sample,
// derived from two readings of /proc/stat.
//
// It returns 0 when the counters cannot be read (a warning is logged), on the
// first call (which only records the baseline), when less than 0.1s has
// passed since the stored sample, and when no CPU time has elapsed between
// the two samples. The stored sample is replaced only when a percentage is
// actually computed. The result is always within [0, 100].
//
// The method reads and writes s.lastCPU and s.lastCPUTime without locking.
func (s *MetricsService) getCPUUsage() float64 {
	currentCPU, err := s.readCPUStats()
	if err != nil {
		s.logger.Warn("Failed to read CPU stats", "error", err)
		return 0.0
	}

	now := time.Now()

	// If this is the first reading, store it and return 0
	if s.lastCPU == nil {
		s.lastCPU = currentCPU
		s.lastCPUTime = now
		return 0.0
	}

	// Too soon after the stored sample for a meaningful delta.
	if now.Sub(s.lastCPUTime).Seconds() < 0.1 {
		return 0.0
	}

	// delta is a saturating subtraction: the counters are unsigned, and a
	// plain curr-prev would wrap around to a huge value if a counter ever
	// decreased (iowait is documented as being able to do so).
	delta := func(curr, prev uint64) uint64 {
		if curr < prev {
			return 0
		}
		return curr - prev
	}

	prev := s.lastCPU
	idleDiff := delta(currentCPU.idle, prev.idle) + delta(currentCPU.iowait, prev.iowait)

	// Guest time is deliberately left out: the kernel already accounts it as
	// part of user time, so adding it again would count it twice.
	busyDiff := delta(currentCPU.user, prev.user) +
		delta(currentCPU.nice, prev.nice) +
		delta(currentCPU.system, prev.system) +
		delta(currentCPU.irq, prev.irq) +
		delta(currentCPU.softirq, prev.softirq) +
		delta(currentCPU.steal, prev.steal)

	totalDiff := idleDiff + busyDiff
	if totalDiff == 0 {
		return 0.0
	}

	// idleDiff <= totalDiff by construction, so this stays within [0, 100].
	usagePercent := 100.0 * (1.0 - float64(idleDiff)/float64(totalDiff))

	// Update last CPU stats
	s.lastCPU = currentCPU
	s.lastCPUTime = now

	return usagePercent
}

// readCPUStats reads the aggregate CPU counters from the first line of
// /proc/stat.
//
// The line must start with "cpu " and carry at least the seven counters
// user, nice, system, idle, iowait, irq and softirq; steal and guest are read
// when present and left at 0 otherwise. An error is returned if the file
// cannot be opened or read, if the line has an unexpected shape, or if any
// counter is not a valid unsigned integer.
func (s *MetricsService) readCPUStats() (*cpuStats, error) {
	file, err := os.Open("/proc/stat")
	if err != nil {
		return nil, fmt.Errorf("failed to open /proc/stat: %w", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	if !scanner.Scan() {
		// Scan returns false both on a read error and on an empty file;
		// report the underlying error when there is one.
		if err := scanner.Err(); err != nil {
			return nil, fmt.Errorf("failed to read /proc/stat: %w", err)
		}
		return nil, fmt.Errorf("failed to read /proc/stat")
	}

	line := strings.TrimSpace(scanner.Text())
	if !strings.HasPrefix(line, "cpu ") {
		return nil, fmt.Errorf("invalid /proc/stat format")
	}

	// fields[0] is the "cpu" label, so seven counters need eight fields.
	fields := strings.Fields(line)
	if len(fields) < 8 {
		return nil, fmt.Errorf("insufficient CPU stats fields")
	}

	stats := &cpuStats{}

	// Destinations in column order; counter i lives in fields[i+1].
	counters := []*uint64{
		&stats.user,
		&stats.nice,
		&stats.system,
		&stats.idle,
		&stats.iowait,
		&stats.irq,
		&stats.softirq,
		&stats.steal,
		&stats.guest,
	}
	for i, dst := range counters {
		col := i + 1
		if col >= len(fields) {
			// steal and guest are optional trailing columns.
			break
		}
		value, err := strconv.ParseUint(fields[col], 10, 64)
		if err != nil {
			// A silently zeroed counter would corrupt the usage delta.
			return nil, fmt.Errorf("failed to parse /proc/stat field %d (%q): %w", col, fields[col], err)
		}
		*dst = value
	}

	return stats, nil
}