working on the backup management parts
This commit is contained in:
@@ -1,10 +1,14 @@
|
||||
package monitoring
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/atlasos/calypso/internal/common/database"
|
||||
@@ -13,14 +17,14 @@ import (
|
||||
|
||||
// Metrics represents system metrics
|
||||
type Metrics struct {
|
||||
System SystemMetrics `json:"system"`
|
||||
Storage StorageMetrics `json:"storage"`
|
||||
SCST SCSTMetrics `json:"scst"`
|
||||
Tape TapeMetrics `json:"tape"`
|
||||
VTL VTLMetrics `json:"vtl"`
|
||||
Tasks TaskMetrics `json:"tasks"`
|
||||
API APIMetrics `json:"api"`
|
||||
CollectedAt time.Time `json:"collected_at"`
|
||||
System SystemMetrics `json:"system"`
|
||||
Storage StorageMetrics `json:"storage"`
|
||||
SCST SCSTMetrics `json:"scst"`
|
||||
Tape TapeMetrics `json:"tape"`
|
||||
VTL VTLMetrics `json:"vtl"`
|
||||
Tasks TaskMetrics `json:"tasks"`
|
||||
API APIMetrics `json:"api"`
|
||||
CollectedAt time.Time `json:"collected_at"`
|
||||
}
|
||||
|
||||
// SystemMetrics represents system-level metrics
|
||||
@@ -37,11 +41,11 @@ type SystemMetrics struct {
|
||||
|
||||
// StorageMetrics represents storage metrics: disk and repository counts,
// capacity figures in bytes, and the used share as a percentage.
//
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type StorageMetrics struct {
	TotalDisks         int     `json:"total_disks"`
	TotalRepositories  int     `json:"total_repositories"`
	TotalCapacityBytes int64   `json:"total_capacity_bytes"`
	UsedCapacityBytes  int64   `json:"used_capacity_bytes"`
	AvailableBytes     int64   `json:"available_bytes"`
	UsagePercent       float64 `json:"usage_percent"`
}
|
||||
|
||||
@@ -72,28 +76,43 @@ type VTLMetrics struct {
|
||||
|
||||
// TaskMetrics represents task execution metrics: task counts broken down by
// state and the average task duration in seconds.
//
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type TaskMetrics struct {
	TotalTasks     int     `json:"total_tasks"`
	PendingTasks   int     `json:"pending_tasks"`
	RunningTasks   int     `json:"running_tasks"`
	CompletedTasks int     `json:"completed_tasks"`
	FailedTasks    int     `json:"failed_tasks"`
	AvgDurationSec float64 `json:"avg_duration_seconds"`
}
|
||||
|
||||
// APIMetrics represents API metrics: request volume and rate, error rate,
// average latency in milliseconds and the number of active connections.
//
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type APIMetrics struct {
	TotalRequests     int64   `json:"total_requests"`
	RequestsPerSec    float64 `json:"requests_per_second"`
	ErrorRate         float64 `json:"error_rate"`
	AvgLatencyMs      float64 `json:"avg_latency_ms"`
	ActiveConnections int     `json:"active_connections"`
}
|
||||
|
||||
// MetricsService collects and provides system metrics
|
||||
type MetricsService struct {
|
||||
db *database.DB
|
||||
logger *logger.Logger
|
||||
startTime time.Time
|
||||
db *database.DB
|
||||
logger *logger.Logger
|
||||
startTime time.Time
|
||||
lastCPU *cpuStats // For CPU usage calculation
|
||||
lastCPUTime time.Time
|
||||
}
|
||||
|
||||
// cpuStats holds the counters taken from the aggregate "cpu" line of
// /proc/stat. Fields are listed in the order readCPUStats fills them from
// that line's columns.
type cpuStats struct {
	user, nice, system uint64 // columns 1-3
	idle, iowait       uint64 // columns 4-5
	irq, softirq       uint64 // columns 6-7
	steal, guest       uint64 // columns 8-9; stay zero when the line is shorter
}
|
||||
|
||||
// NewMetricsService creates a new metrics service
|
||||
@@ -115,6 +134,8 @@ func (s *MetricsService) CollectMetrics(ctx context.Context) (*Metrics, error) {
|
||||
sysMetrics, err := s.collectSystemMetrics(ctx)
|
||||
if err != nil {
|
||||
s.logger.Error("Failed to collect system metrics", "error", err)
|
||||
// Set default/zero values if collection fails
|
||||
metrics.System = SystemMetrics{}
|
||||
} else {
|
||||
metrics.System = *sysMetrics
|
||||
}
|
||||
@@ -167,21 +188,17 @@ func (s *MetricsService) CollectMetrics(ctx context.Context) (*Metrics, error) {
|
||||
|
||||
// collectSystemMetrics collects system-level metrics
|
||||
func (s *MetricsService) collectSystemMetrics(ctx context.Context) (*SystemMetrics, error) {
|
||||
var m runtime.MemStats
|
||||
runtime.ReadMemStats(&m)
|
||||
// Get system memory from /proc/meminfo
|
||||
memoryTotal, memoryUsed, memoryPercent := s.getSystemMemory()
|
||||
|
||||
// Get memory info
|
||||
memoryUsed := int64(m.Alloc)
|
||||
memoryTotal := int64(m.Sys)
|
||||
memoryPercent := float64(memoryUsed) / float64(memoryTotal) * 100
|
||||
// Get CPU usage from /proc/stat
|
||||
cpuUsage := s.getCPUUsage()
|
||||
|
||||
// Uptime
|
||||
uptime := time.Since(s.startTime).Seconds()
|
||||
// Get system uptime from /proc/uptime
|
||||
uptime := s.getSystemUptime()
|
||||
|
||||
// CPU and disk would require external tools or system calls
|
||||
// For now, we'll use placeholders
|
||||
metrics := &SystemMetrics{
|
||||
CPUUsagePercent: 0.0, // Would need to read from /proc/stat
|
||||
CPUUsagePercent: cpuUsage,
|
||||
MemoryUsed: memoryUsed,
|
||||
MemoryTotal: memoryTotal,
|
||||
MemoryPercent: memoryPercent,
|
||||
@@ -268,7 +285,7 @@ func (s *MetricsService) collectSCSTMetrics(ctx context.Context) (*SCSTMetrics,
|
||||
TotalTargets: totalTargets,
|
||||
TotalLUNs: totalLUNs,
|
||||
TotalInitiators: totalInitiators,
|
||||
ActiveTargets: activeTargets,
|
||||
ActiveTargets: activeTargets,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -403,3 +420,232 @@ func (s *MetricsService) collectTaskMetrics(ctx context.Context) (*TaskMetrics,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// getSystemUptime reads system uptime from /proc/uptime
|
||||
// Returns uptime in seconds, or service uptime as fallback
|
||||
func (s *MetricsService) getSystemUptime() float64 {
|
||||
file, err := os.Open("/proc/uptime")
|
||||
if err != nil {
|
||||
// Fallback to service uptime if /proc/uptime is not available
|
||||
s.logger.Warn("Failed to read /proc/uptime, using service uptime", "error", err)
|
||||
return time.Since(s.startTime).Seconds()
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
if !scanner.Scan() {
|
||||
// Fallback to service uptime if file is empty
|
||||
s.logger.Warn("Failed to read /proc/uptime content, using service uptime")
|
||||
return time.Since(s.startTime).Seconds()
|
||||
}
|
||||
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) == 0 {
|
||||
// Fallback to service uptime if no data
|
||||
s.logger.Warn("No data in /proc/uptime, using service uptime")
|
||||
return time.Since(s.startTime).Seconds()
|
||||
}
|
||||
|
||||
// First field is system uptime in seconds
|
||||
uptimeSeconds, err := strconv.ParseFloat(fields[0], 64)
|
||||
if err != nil {
|
||||
// Fallback to service uptime if parsing fails
|
||||
s.logger.Warn("Failed to parse /proc/uptime, using service uptime", "error", err)
|
||||
return time.Since(s.startTime).Seconds()
|
||||
}
|
||||
|
||||
return uptimeSeconds
|
||||
}
|
||||
|
||||
// getSystemMemory reads system memory from /proc/meminfo
|
||||
// Returns total, used (in bytes), and usage percentage
|
||||
func (s *MetricsService) getSystemMemory() (int64, int64, float64) {
|
||||
file, err := os.Open("/proc/meminfo")
|
||||
if err != nil {
|
||||
s.logger.Warn("Failed to read /proc/meminfo, using Go runtime memory", "error", err)
|
||||
var m runtime.MemStats
|
||||
runtime.ReadMemStats(&m)
|
||||
memoryUsed := int64(m.Alloc)
|
||||
memoryTotal := int64(m.Sys)
|
||||
memoryPercent := float64(memoryUsed) / float64(memoryTotal) * 100
|
||||
return memoryTotal, memoryUsed, memoryPercent
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var memTotal, memAvailable, memFree, buffers, cached int64
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse line like "MemTotal: 16375596 kB"
|
||||
// or "MemTotal: 16375596" (some systems don't have unit)
|
||||
colonIdx := strings.Index(line, ":")
|
||||
if colonIdx == -1 {
|
||||
continue
|
||||
}
|
||||
|
||||
key := strings.TrimSpace(line[:colonIdx])
|
||||
valuePart := strings.TrimSpace(line[colonIdx+1:])
|
||||
|
||||
// Split value part to get number (ignore unit like "kB")
|
||||
fields := strings.Fields(valuePart)
|
||||
if len(fields) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
value, err := strconv.ParseInt(fields[0], 10, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Values in /proc/meminfo are in KB, convert to bytes
|
||||
valueBytes := value * 1024
|
||||
|
||||
switch key {
|
||||
case "MemTotal":
|
||||
memTotal = valueBytes
|
||||
case "MemAvailable":
|
||||
memAvailable = valueBytes
|
||||
case "MemFree":
|
||||
memFree = valueBytes
|
||||
case "Buffers":
|
||||
buffers = valueBytes
|
||||
case "Cached":
|
||||
cached = valueBytes
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
s.logger.Warn("Error scanning /proc/meminfo", "error", err)
|
||||
}
|
||||
|
||||
if memTotal == 0 {
|
||||
s.logger.Warn("Failed to get MemTotal from /proc/meminfo, using Go runtime memory", "memTotal", memTotal)
|
||||
var m runtime.MemStats
|
||||
runtime.ReadMemStats(&m)
|
||||
memoryUsed := int64(m.Alloc)
|
||||
memoryTotal := int64(m.Sys)
|
||||
memoryPercent := float64(memoryUsed) / float64(memoryTotal) * 100
|
||||
return memoryTotal, memoryUsed, memoryPercent
|
||||
}
|
||||
|
||||
// Calculate used memory
|
||||
// If MemAvailable exists (kernel 3.14+), use it for more accurate calculation
|
||||
var memoryUsed int64
|
||||
if memAvailable > 0 {
|
||||
memoryUsed = memTotal - memAvailable
|
||||
} else {
|
||||
// Fallback: MemTotal - MemFree - Buffers - Cached
|
||||
memoryUsed = memTotal - memFree - buffers - cached
|
||||
if memoryUsed < 0 {
|
||||
memoryUsed = memTotal - memFree
|
||||
}
|
||||
}
|
||||
|
||||
memoryPercent := float64(memoryUsed) / float64(memTotal) * 100
|
||||
|
||||
s.logger.Debug("System memory stats",
|
||||
"memTotal", memTotal,
|
||||
"memAvailable", memAvailable,
|
||||
"memoryUsed", memoryUsed,
|
||||
"memoryPercent", memoryPercent)
|
||||
|
||||
return memTotal, memoryUsed, memoryPercent
|
||||
}
|
||||
|
||||
// getCPUUsage reads CPU usage from /proc/stat
|
||||
// Requires two readings to calculate percentage
|
||||
func (s *MetricsService) getCPUUsage() float64 {
|
||||
currentCPU, err := s.readCPUStats()
|
||||
if err != nil {
|
||||
s.logger.Warn("Failed to read CPU stats", "error", err)
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// If this is the first reading, store it and return 0
|
||||
if s.lastCPU == nil {
|
||||
s.lastCPU = currentCPU
|
||||
s.lastCPUTime = time.Now()
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// Calculate time difference
|
||||
timeDiff := time.Since(s.lastCPUTime).Seconds()
|
||||
if timeDiff < 0.1 {
|
||||
// Too soon, return previous value or 0
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// Calculate total CPU time
|
||||
prevTotal := s.lastCPU.user + s.lastCPU.nice + s.lastCPU.system + s.lastCPU.idle +
|
||||
s.lastCPU.iowait + s.lastCPU.irq + s.lastCPU.softirq + s.lastCPU.steal + s.lastCPU.guest
|
||||
currTotal := currentCPU.user + currentCPU.nice + currentCPU.system + currentCPU.idle +
|
||||
currentCPU.iowait + currentCPU.irq + currentCPU.softirq + currentCPU.steal + currentCPU.guest
|
||||
|
||||
// Calculate idle time
|
||||
prevIdle := s.lastCPU.idle + s.lastCPU.iowait
|
||||
currIdle := currentCPU.idle + currentCPU.iowait
|
||||
|
||||
// Calculate used time
|
||||
totalDiff := currTotal - prevTotal
|
||||
idleDiff := currIdle - prevIdle
|
||||
|
||||
if totalDiff == 0 {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// Calculate CPU usage percentage
|
||||
usagePercent := 100.0 * (1.0 - float64(idleDiff)/float64(totalDiff))
|
||||
|
||||
// Update last CPU stats
|
||||
s.lastCPU = currentCPU
|
||||
s.lastCPUTime = time.Now()
|
||||
|
||||
return usagePercent
|
||||
}
|
||||
|
||||
// readCPUStats reads CPU statistics from /proc/stat
|
||||
func (s *MetricsService) readCPUStats() (*cpuStats, error) {
|
||||
file, err := os.Open("/proc/stat")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open /proc/stat: %w", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
if !scanner.Scan() {
|
||||
return nil, fmt.Errorf("failed to read /proc/stat")
|
||||
}
|
||||
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if !strings.HasPrefix(line, "cpu ") {
|
||||
return nil, fmt.Errorf("invalid /proc/stat format")
|
||||
}
|
||||
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 8 {
|
||||
return nil, fmt.Errorf("insufficient CPU stats fields")
|
||||
}
|
||||
|
||||
stats := &cpuStats{}
|
||||
stats.user, _ = strconv.ParseUint(fields[1], 10, 64)
|
||||
stats.nice, _ = strconv.ParseUint(fields[2], 10, 64)
|
||||
stats.system, _ = strconv.ParseUint(fields[3], 10, 64)
|
||||
stats.idle, _ = strconv.ParseUint(fields[4], 10, 64)
|
||||
stats.iowait, _ = strconv.ParseUint(fields[5], 10, 64)
|
||||
stats.irq, _ = strconv.ParseUint(fields[6], 10, 64)
|
||||
stats.softirq, _ = strconv.ParseUint(fields[7], 10, 64)
|
||||
|
||||
if len(fields) > 8 {
|
||||
stats.steal, _ = strconv.ParseUint(fields[8], 10, 64)
|
||||
}
|
||||
if len(fields) > 9 {
|
||||
stats.guest, _ = strconv.ParseUint(fields[9], 10, 64)
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user