working on the backup management parts

This commit is contained in:
Warp Agent
2025-12-29 02:44:52 +07:00
parent f1448d512c
commit fc64391cfb
20 changed files with 4897 additions and 249 deletions

View File

@@ -1,10 +1,14 @@
package monitoring
import (
"bufio"
"context"
"database/sql"
"fmt"
"os"
"runtime"
"strconv"
"strings"
"time"
"github.com/atlasos/calypso/internal/common/database"
@@ -13,14 +17,14 @@ import (
// Metrics represents system metrics
type Metrics struct {
System SystemMetrics `json:"system"`
Storage StorageMetrics `json:"storage"`
SCST SCSTMetrics `json:"scst"`
Tape TapeMetrics `json:"tape"`
VTL VTLMetrics `json:"vtl"`
Tasks TaskMetrics `json:"tasks"`
API APIMetrics `json:"api"`
CollectedAt time.Time `json:"collected_at"`
System SystemMetrics `json:"system"`
Storage StorageMetrics `json:"storage"`
SCST SCSTMetrics `json:"scst"`
Tape TapeMetrics `json:"tape"`
VTL VTLMetrics `json:"vtl"`
Tasks TaskMetrics `json:"tasks"`
API APIMetrics `json:"api"`
CollectedAt time.Time `json:"collected_at"`
}
// SystemMetrics represents system-level metrics
@@ -37,11 +41,11 @@ type SystemMetrics struct {
// StorageMetrics represents storage metrics
type StorageMetrics struct {
TotalDisks int `json:"total_disks"`
TotalRepositories int `json:"total_repositories"`
TotalCapacityBytes int64 `json:"total_capacity_bytes"`
UsedCapacityBytes int64 `json:"used_capacity_bytes"`
AvailableBytes int64 `json:"available_bytes"`
TotalDisks int `json:"total_disks"`
TotalRepositories int `json:"total_repositories"`
TotalCapacityBytes int64 `json:"total_capacity_bytes"`
UsedCapacityBytes int64 `json:"used_capacity_bytes"`
AvailableBytes int64 `json:"available_bytes"`
UsagePercent float64 `json:"usage_percent"`
}
@@ -72,28 +76,43 @@ type VTLMetrics struct {
// TaskMetrics represents task execution metrics
type TaskMetrics struct {
TotalTasks int `json:"total_tasks"`
PendingTasks int `json:"pending_tasks"`
RunningTasks int `json:"running_tasks"`
CompletedTasks int `json:"completed_tasks"`
FailedTasks int `json:"failed_tasks"`
AvgDurationSec float64 `json:"avg_duration_seconds"`
TotalTasks int `json:"total_tasks"`
PendingTasks int `json:"pending_tasks"`
RunningTasks int `json:"running_tasks"`
CompletedTasks int `json:"completed_tasks"`
FailedTasks int `json:"failed_tasks"`
AvgDurationSec float64 `json:"avg_duration_seconds"`
}
// APIMetrics represents API metrics
type APIMetrics struct {
TotalRequests int64 `json:"total_requests"`
RequestsPerSec float64 `json:"requests_per_second"`
ErrorRate float64 `json:"error_rate"`
AvgLatencyMs float64 `json:"avg_latency_ms"`
ActiveConnections int `json:"active_connections"`
TotalRequests int64 `json:"total_requests"`
RequestsPerSec float64 `json:"requests_per_second"`
ErrorRate float64 `json:"error_rate"`
AvgLatencyMs float64 `json:"avg_latency_ms"`
ActiveConnections int `json:"active_connections"`
}
// MetricsService collects and provides system metrics
type MetricsService struct {
db *database.DB
logger *logger.Logger
startTime time.Time
db *database.DB
logger *logger.Logger
startTime time.Time
lastCPU *cpuStats // For CPU usage calculation
lastCPUTime time.Time
}
// cpuStats represents CPU statistics from /proc/stat
type cpuStats struct {
user uint64
nice uint64
system uint64
idle uint64
iowait uint64
irq uint64
softirq uint64
steal uint64
guest uint64
}
// NewMetricsService creates a new metrics service
@@ -115,6 +134,8 @@ func (s *MetricsService) CollectMetrics(ctx context.Context) (*Metrics, error) {
sysMetrics, err := s.collectSystemMetrics(ctx)
if err != nil {
s.logger.Error("Failed to collect system metrics", "error", err)
// Set default/zero values if collection fails
metrics.System = SystemMetrics{}
} else {
metrics.System = *sysMetrics
}
@@ -167,21 +188,17 @@ func (s *MetricsService) CollectMetrics(ctx context.Context) (*Metrics, error) {
// collectSystemMetrics collects system-level metrics
func (s *MetricsService) collectSystemMetrics(ctx context.Context) (*SystemMetrics, error) {
var m runtime.MemStats
runtime.ReadMemStats(&m)
// Get system memory from /proc/meminfo
memoryTotal, memoryUsed, memoryPercent := s.getSystemMemory()
// Get memory info
memoryUsed := int64(m.Alloc)
memoryTotal := int64(m.Sys)
memoryPercent := float64(memoryUsed) / float64(memoryTotal) * 100
// Get CPU usage from /proc/stat
cpuUsage := s.getCPUUsage()
// Uptime
uptime := time.Since(s.startTime).Seconds()
// Get system uptime from /proc/uptime
uptime := s.getSystemUptime()
// CPU and disk would require external tools or system calls
// For now, we'll use placeholders
metrics := &SystemMetrics{
CPUUsagePercent: 0.0, // Would need to read from /proc/stat
CPUUsagePercent: cpuUsage,
MemoryUsed: memoryUsed,
MemoryTotal: memoryTotal,
MemoryPercent: memoryPercent,
@@ -268,7 +285,7 @@ func (s *MetricsService) collectSCSTMetrics(ctx context.Context) (*SCSTMetrics,
TotalTargets: totalTargets,
TotalLUNs: totalLUNs,
TotalInitiators: totalInitiators,
ActiveTargets: activeTargets,
ActiveTargets: activeTargets,
}, nil
}
@@ -403,3 +420,232 @@ func (s *MetricsService) collectTaskMetrics(ctx context.Context) (*TaskMetrics,
}, nil
}
// getSystemUptime reads system uptime from /proc/uptime
// Returns uptime in seconds, or service uptime as fallback
func (s *MetricsService) getSystemUptime() float64 {
file, err := os.Open("/proc/uptime")
if err != nil {
// Fallback to service uptime if /proc/uptime is not available
s.logger.Warn("Failed to read /proc/uptime, using service uptime", "error", err)
return time.Since(s.startTime).Seconds()
}
defer file.Close()
scanner := bufio.NewScanner(file)
if !scanner.Scan() {
// Fallback to service uptime if file is empty
s.logger.Warn("Failed to read /proc/uptime content, using service uptime")
return time.Since(s.startTime).Seconds()
}
line := strings.TrimSpace(scanner.Text())
fields := strings.Fields(line)
if len(fields) == 0 {
// Fallback to service uptime if no data
s.logger.Warn("No data in /proc/uptime, using service uptime")
return time.Since(s.startTime).Seconds()
}
// First field is system uptime in seconds
uptimeSeconds, err := strconv.ParseFloat(fields[0], 64)
if err != nil {
// Fallback to service uptime if parsing fails
s.logger.Warn("Failed to parse /proc/uptime, using service uptime", "error", err)
return time.Since(s.startTime).Seconds()
}
return uptimeSeconds
}
// getSystemMemory reads system memory from /proc/meminfo
// Returns total, used (in bytes), and usage percentage
func (s *MetricsService) getSystemMemory() (int64, int64, float64) {
file, err := os.Open("/proc/meminfo")
if err != nil {
s.logger.Warn("Failed to read /proc/meminfo, using Go runtime memory", "error", err)
var m runtime.MemStats
runtime.ReadMemStats(&m)
memoryUsed := int64(m.Alloc)
memoryTotal := int64(m.Sys)
memoryPercent := float64(memoryUsed) / float64(memoryTotal) * 100
return memoryTotal, memoryUsed, memoryPercent
}
defer file.Close()
var memTotal, memAvailable, memFree, buffers, cached int64
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" {
continue
}
// Parse line like "MemTotal: 16375596 kB"
// or "MemTotal: 16375596" (some systems don't have unit)
colonIdx := strings.Index(line, ":")
if colonIdx == -1 {
continue
}
key := strings.TrimSpace(line[:colonIdx])
valuePart := strings.TrimSpace(line[colonIdx+1:])
// Split value part to get number (ignore unit like "kB")
fields := strings.Fields(valuePart)
if len(fields) == 0 {
continue
}
value, err := strconv.ParseInt(fields[0], 10, 64)
if err != nil {
continue
}
// Values in /proc/meminfo are in KB, convert to bytes
valueBytes := value * 1024
switch key {
case "MemTotal":
memTotal = valueBytes
case "MemAvailable":
memAvailable = valueBytes
case "MemFree":
memFree = valueBytes
case "Buffers":
buffers = valueBytes
case "Cached":
cached = valueBytes
}
}
if err := scanner.Err(); err != nil {
s.logger.Warn("Error scanning /proc/meminfo", "error", err)
}
if memTotal == 0 {
s.logger.Warn("Failed to get MemTotal from /proc/meminfo, using Go runtime memory", "memTotal", memTotal)
var m runtime.MemStats
runtime.ReadMemStats(&m)
memoryUsed := int64(m.Alloc)
memoryTotal := int64(m.Sys)
memoryPercent := float64(memoryUsed) / float64(memoryTotal) * 100
return memoryTotal, memoryUsed, memoryPercent
}
// Calculate used memory
// If MemAvailable exists (kernel 3.14+), use it for more accurate calculation
var memoryUsed int64
if memAvailable > 0 {
memoryUsed = memTotal - memAvailable
} else {
// Fallback: MemTotal - MemFree - Buffers - Cached
memoryUsed = memTotal - memFree - buffers - cached
if memoryUsed < 0 {
memoryUsed = memTotal - memFree
}
}
memoryPercent := float64(memoryUsed) / float64(memTotal) * 100
s.logger.Debug("System memory stats",
"memTotal", memTotal,
"memAvailable", memAvailable,
"memoryUsed", memoryUsed,
"memoryPercent", memoryPercent)
return memTotal, memoryUsed, memoryPercent
}
// getCPUUsage reads CPU usage from /proc/stat
// Requires two readings to calculate percentage
func (s *MetricsService) getCPUUsage() float64 {
currentCPU, err := s.readCPUStats()
if err != nil {
s.logger.Warn("Failed to read CPU stats", "error", err)
return 0.0
}
// If this is the first reading, store it and return 0
if s.lastCPU == nil {
s.lastCPU = currentCPU
s.lastCPUTime = time.Now()
return 0.0
}
// Calculate time difference
timeDiff := time.Since(s.lastCPUTime).Seconds()
if timeDiff < 0.1 {
// Too soon, return previous value or 0
return 0.0
}
// Calculate total CPU time
prevTotal := s.lastCPU.user + s.lastCPU.nice + s.lastCPU.system + s.lastCPU.idle +
s.lastCPU.iowait + s.lastCPU.irq + s.lastCPU.softirq + s.lastCPU.steal + s.lastCPU.guest
currTotal := currentCPU.user + currentCPU.nice + currentCPU.system + currentCPU.idle +
currentCPU.iowait + currentCPU.irq + currentCPU.softirq + currentCPU.steal + currentCPU.guest
// Calculate idle time
prevIdle := s.lastCPU.idle + s.lastCPU.iowait
currIdle := currentCPU.idle + currentCPU.iowait
// Calculate used time
totalDiff := currTotal - prevTotal
idleDiff := currIdle - prevIdle
if totalDiff == 0 {
return 0.0
}
// Calculate CPU usage percentage
usagePercent := 100.0 * (1.0 - float64(idleDiff)/float64(totalDiff))
// Update last CPU stats
s.lastCPU = currentCPU
s.lastCPUTime = time.Now()
return usagePercent
}
// readCPUStats reads CPU statistics from /proc/stat
func (s *MetricsService) readCPUStats() (*cpuStats, error) {
file, err := os.Open("/proc/stat")
if err != nil {
return nil, fmt.Errorf("failed to open /proc/stat: %w", err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
if !scanner.Scan() {
return nil, fmt.Errorf("failed to read /proc/stat")
}
line := strings.TrimSpace(scanner.Text())
if !strings.HasPrefix(line, "cpu ") {
return nil, fmt.Errorf("invalid /proc/stat format")
}
fields := strings.Fields(line)
if len(fields) < 8 {
return nil, fmt.Errorf("insufficient CPU stats fields")
}
stats := &cpuStats{}
stats.user, _ = strconv.ParseUint(fields[1], 10, 64)
stats.nice, _ = strconv.ParseUint(fields[2], 10, 64)
stats.system, _ = strconv.ParseUint(fields[3], 10, 64)
stats.idle, _ = strconv.ParseUint(fields[4], 10, 64)
stats.iowait, _ = strconv.ParseUint(fields[5], 10, 64)
stats.irq, _ = strconv.ParseUint(fields[6], 10, 64)
stats.softirq, _ = strconv.ParseUint(fields[7], 10, 64)
if len(fields) > 8 {
stats.steal, _ = strconv.ParseUint(fields[8], 10, 64)
}
if len(fields) > 9 {
stats.guest, _ = strconv.ParseUint(fields[9], 10, 64)
}
return stats, nil
}