Files
2025-12-24 19:53:45 +00:00

202 lines
5.3 KiB
Go

package monitoring
import (
"context"
"time"
"github.com/atlasos/calypso/internal/common/database"
"github.com/atlasos/calypso/internal/common/logger"
)
// HealthStatus is the string label attached to a component (or to the service
// as a whole) to describe its current health.
type HealthStatus string

// The set of HealthStatus labels used by the checks in this package.
const (
	// HealthStatusHealthy is reported when a check completes without finding a problem.
	HealthStatusHealthy = HealthStatus("healthy")
	// HealthStatusDegraded is reported when a check finds a non-fatal problem.
	HealthStatusDegraded = HealthStatus("degraded")
	// HealthStatusUnhealthy is reported when a component is not usable.
	HealthStatusUnhealthy = HealthStatus("unhealthy")
	// HealthStatusUnknown is reported when the state could not be determined.
	HealthStatusUnknown = HealthStatus("unknown")
)
// ComponentHealth is the result of checking one system component. It is
// serialised to JSON using the field tags below.
type ComponentHealth struct {
	// Name identifies the component that was checked (e.g. "database").
	Name string `json:"name"`
	// Status is the health label assigned to the component.
	Status HealthStatus `json:"status"`
	// Message optionally explains a non-healthy status; it is omitted from
	// the JSON output when empty.
	Message string `json:"message,omitempty"`
	// Timestamp records when this result was produced.
	Timestamp time.Time `json:"timestamp"`
}
// EnhancedHealth is the full health-check response: an overall status plus
// the individual result of every component that was checked.
type EnhancedHealth struct {
	// Status is the overall health label, stored as a plain string.
	Status string `json:"status"`
	// Service is the name of the service producing the report.
	Service string `json:"service"`
	// Version is the service version; it is omitted from JSON when empty.
	Version string `json:"version,omitempty"`
	// Uptime is the service uptime, serialised as "uptime_seconds".
	Uptime int64 `json:"uptime_seconds"`
	// Components holds one entry per component check.
	Components []ComponentHealth `json:"components"`
	// Timestamp records when the report was produced.
	Timestamp time.Time `json:"timestamp"`
}
// HealthService runs the component checks and assembles them into an
// EnhancedHealth report. Construct it with NewHealthService.
type HealthService struct {
	// db is the database handle every check queries through.
	db *database.DB
	// logger is the logger handed to the service at construction time.
	logger *logger.Logger
	// startTime is when the service was constructed; uptime is measured from it.
	startTime time.Time
	// metricsService is the metrics service handed in at construction time.
	metricsService *MetricsService
}
// NewHealthService builds a HealthService around the given database handle,
// logger and metrics service. The moment of construction is recorded as the
// service start time and is later used to compute uptime.
func NewHealthService(db *database.DB, log *logger.Logger, metricsService *MetricsService) *HealthService {
	svc := new(HealthService)
	svc.db = db
	svc.logger = log
	svc.startTime = time.Now()
	svc.metricsService = metricsService
	return svc
}
// CheckHealth runs the database, storage and SCST checks, in that order, and
// returns a report containing each component result plus an overall status.
//
// The overall status is "unhealthy" if any component is unhealthy, otherwise
// "degraded" if any component is degraded, otherwise "healthy". A component
// reporting "unknown" does not change the overall status.
func (s *HealthService) CheckHealth(ctx context.Context) *EnhancedHealth {
	report := &EnhancedHealth{
		Status:     string(HealthStatusHealthy),
		Service:    "calypso-api",
		Uptime:     int64(time.Since(s.startTime).Seconds()),
		Timestamp:  time.Now(),
		Components: []ComponentHealth{},
	}

	// Run every probe sequentially; the order here is the order in the report.
	probes := []func(context.Context) ComponentHealth{
		s.checkDatabase,
		s.checkStorage,
		s.checkSCST,
	}
	for _, probe := range probes {
		report.Components = append(report.Components, probe(ctx))
	}

	// Fold the component results into the worst status seen. Unhealthy
	// always wins; degraded only applies while nothing is unhealthy.
	overall := HealthStatusHealthy
	for _, component := range report.Components {
		switch component.Status {
		case HealthStatusUnhealthy:
			overall = HealthStatusUnhealthy
		case HealthStatusDegraded:
			if overall != HealthStatusUnhealthy {
				overall = HealthStatusDegraded
			}
		}
	}
	report.Status = string(overall)

	return report
}
// checkDatabase probes the database in two steps under a shared 5-second
// deadline: a ping, then a trivial "SELECT 1" query.
//
// A failed ping yields an unhealthy result, a failed query yields a degraded
// result, and in both cases the underlying error text is appended to the
// message. If both steps succeed the result is healthy.
func (s *HealthService) checkDatabase(ctx context.Context) ComponentHealth {
	probeCtx, stop := context.WithTimeout(ctx, 5*time.Second)
	defer stop()

	// report builds the "database" result, stamped at the time it is produced.
	report := func(status HealthStatus, detail string) ComponentHealth {
		return ComponentHealth{
			Name:      "database",
			Status:    status,
			Message:   detail,
			Timestamp: time.Now(),
		}
	}

	pingErr := s.db.PingContext(probeCtx)
	if pingErr != nil {
		return report(HealthStatusUnhealthy, "Database connection failed: "+pingErr.Error())
	}

	// The connection answers pings; make sure it can also execute a statement.
	var one int
	row := s.db.QueryRowContext(probeCtx, "SELECT 1")
	if scanErr := row.Scan(&one); scanErr != nil {
		return report(HealthStatusDegraded, "Database query failed: "+scanErr.Error())
	}

	return report(HealthStatusHealthy, "")
}
// checkStorage reports the health of the storage layer as recorded in the
// disk_repositories table.
//
// The result is degraded when:
//   - the active-repository count cannot be queried,
//   - no repository is active,
//   - the aggregate capacity cannot be queried, or
//   - active repositories are, in aggregate, more than 95% full.
//
// Otherwise the result is healthy. Both queries share a 5-second deadline
// (matching checkDatabase) so that a stalled database cannot block the health
// check indefinitely when the caller's context carries no deadline.
func (s *HealthService) checkStorage(ctx context.Context) ComponentHealth {
	const (
		queryTimeout    = 5 * time.Second
		nearFullPercent = 95
	)

	ctx, cancel := context.WithTimeout(ctx, queryTimeout)
	defer cancel()

	// degraded builds a degraded "storage" result carrying the given message.
	degraded := func(msg string) ComponentHealth {
		return ComponentHealth{
			Name:      "storage",
			Status:    HealthStatusDegraded,
			Message:   msg,
			Timestamp: time.Now(),
		}
	}

	// Check if we have any active repositories.
	var count int
	if err := s.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM disk_repositories WHERE is_active = true").Scan(&count); err != nil {
		return degraded("Failed to query storage repositories")
	}
	if count == 0 {
		return degraded("No active storage repositories configured")
	}

	// Check repository capacity. NULLIF guards against division by zero when
	// the total is 0, and COALESCE turns the resulting NULL into 0 so the scan
	// into a float64 always succeeds.
	var usagePercent float64
	query := `
SELECT COALESCE(
SUM(used_bytes)::float / NULLIF(SUM(total_bytes), 0) * 100,
0
)
FROM disk_repositories
WHERE is_active = true
`
	if err := s.db.QueryRowContext(ctx, query).Scan(&usagePercent); err != nil {
		// A failed capacity query means fullness is unknown; surface that
		// instead of silently reporting the storage as healthy.
		return degraded("Failed to query storage capacity")
	}
	if usagePercent > nearFullPercent {
		return degraded("Storage repositories are nearly full")
	}

	return ComponentHealth{
		Name:      "storage",
		Status:    HealthStatusHealthy,
		Timestamp: time.Now(),
	}
}
// checkSCST reports the health of the SCST component.
//
// The check only verifies that the scst_targets table can be queried; it does
// not probe the SCST subsystem itself. A failed query yields an "unknown"
// result, and a successful one yields "healthy" regardless of how many
// targets exist.
//
// The query runs under a 5-second deadline (matching checkDatabase) so that a
// stalled database cannot block the health check indefinitely when the
// caller's context carries no deadline.
func (s *HealthService) checkSCST(ctx context.Context) ComponentHealth {
	const queryTimeout = 5 * time.Second

	ctx, cancel := context.WithTimeout(ctx, queryTimeout)
	defer cancel()

	// The count itself is not inspected; only the success of the query matters.
	var count int
	if err := s.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM scst_targets").Scan(&count); err != nil {
		return ComponentHealth{
			Name:      "scst",
			Status:    HealthStatusUnknown,
			Message:   "Failed to query SCST targets",
			Timestamp: time.Now(),
		}
	}

	// SCST is healthy if we can query it (even if no targets exist).
	return ComponentHealth{
		Name:      "scst",
		Status:    HealthStatusHealthy,
		Timestamp: time.Now(),
	}
}