202 lines
5.3 KiB
Go
202 lines
5.3 KiB
Go
package monitoring
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/atlasos/calypso/internal/common/database"
|
|
"github.com/atlasos/calypso/internal/common/logger"
|
|
)
|
|
|
|
// HealthStatus is a string label describing the condition of a component.
type HealthStatus string

// Recognized HealthStatus values.
const (
	// HealthStatusHealthy marks a component whose checks all passed.
	HealthStatusHealthy HealthStatus = "healthy"
	// HealthStatusDegraded marks a component that works but has a problem.
	HealthStatusDegraded HealthStatus = "degraded"
	// HealthStatusUnhealthy marks a component that is not working.
	HealthStatusUnhealthy HealthStatus = "unhealthy"
	// HealthStatusUnknown marks a component whose state could not be determined.
	HealthStatusUnknown HealthStatus = "unknown"
)
|
|
|
|
// ComponentHealth represents the health of a system component
|
|
type ComponentHealth struct {
|
|
Name string `json:"name"`
|
|
Status HealthStatus `json:"status"`
|
|
Message string `json:"message,omitempty"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
}
|
|
|
|
// EnhancedHealth represents enhanced health check response
|
|
type EnhancedHealth struct {
|
|
Status string `json:"status"`
|
|
Service string `json:"service"`
|
|
Version string `json:"version,omitempty"`
|
|
Uptime int64 `json:"uptime_seconds"`
|
|
Components []ComponentHealth `json:"components"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
}
|
|
|
|
// HealthService provides enhanced health checking
|
|
type HealthService struct {
|
|
db *database.DB
|
|
logger *logger.Logger
|
|
startTime time.Time
|
|
metricsService *MetricsService
|
|
}
|
|
|
|
// NewHealthService creates a new health service
|
|
func NewHealthService(db *database.DB, log *logger.Logger, metricsService *MetricsService) *HealthService {
|
|
return &HealthService{
|
|
db: db,
|
|
logger: log,
|
|
startTime: time.Now(),
|
|
metricsService: metricsService,
|
|
}
|
|
}
|
|
|
|
// CheckHealth performs a comprehensive health check
|
|
func (s *HealthService) CheckHealth(ctx context.Context) *EnhancedHealth {
|
|
health := &EnhancedHealth{
|
|
Status: string(HealthStatusHealthy),
|
|
Service: "calypso-api",
|
|
Uptime: int64(time.Since(s.startTime).Seconds()),
|
|
Timestamp: time.Now(),
|
|
Components: []ComponentHealth{},
|
|
}
|
|
|
|
// Check database
|
|
dbHealth := s.checkDatabase(ctx)
|
|
health.Components = append(health.Components, dbHealth)
|
|
|
|
// Check storage
|
|
storageHealth := s.checkStorage(ctx)
|
|
health.Components = append(health.Components, storageHealth)
|
|
|
|
// Check SCST
|
|
scstHealth := s.checkSCST(ctx)
|
|
health.Components = append(health.Components, scstHealth)
|
|
|
|
// Determine overall status
|
|
hasUnhealthy := false
|
|
hasDegraded := false
|
|
for _, comp := range health.Components {
|
|
if comp.Status == HealthStatusUnhealthy {
|
|
hasUnhealthy = true
|
|
} else if comp.Status == HealthStatusDegraded {
|
|
hasDegraded = true
|
|
}
|
|
}
|
|
|
|
if hasUnhealthy {
|
|
health.Status = string(HealthStatusUnhealthy)
|
|
} else if hasDegraded {
|
|
health.Status = string(HealthStatusDegraded)
|
|
}
|
|
|
|
return health
|
|
}
|
|
|
|
// checkDatabase checks database health
|
|
func (s *HealthService) checkDatabase(ctx context.Context) ComponentHealth {
|
|
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
defer cancel()
|
|
|
|
if err := s.db.PingContext(ctx); err != nil {
|
|
return ComponentHealth{
|
|
Name: "database",
|
|
Status: HealthStatusUnhealthy,
|
|
Message: "Database connection failed: " + err.Error(),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
// Check if we can query
|
|
var count int
|
|
if err := s.db.QueryRowContext(ctx, "SELECT 1").Scan(&count); err != nil {
|
|
return ComponentHealth{
|
|
Name: "database",
|
|
Status: HealthStatusDegraded,
|
|
Message: "Database query failed: " + err.Error(),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
return ComponentHealth{
|
|
Name: "database",
|
|
Status: HealthStatusHealthy,
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
// checkStorage checks storage component health
|
|
func (s *HealthService) checkStorage(ctx context.Context) ComponentHealth {
|
|
// Check if we have any active repositories
|
|
var count int
|
|
if err := s.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM disk_repositories WHERE is_active = true").Scan(&count); err != nil {
|
|
return ComponentHealth{
|
|
Name: "storage",
|
|
Status: HealthStatusDegraded,
|
|
Message: "Failed to query storage repositories",
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
if count == 0 {
|
|
return ComponentHealth{
|
|
Name: "storage",
|
|
Status: HealthStatusDegraded,
|
|
Message: "No active storage repositories configured",
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
// Check repository capacity
|
|
var usagePercent float64
|
|
query := `
|
|
SELECT COALESCE(
|
|
SUM(used_bytes)::float / NULLIF(SUM(total_bytes), 0) * 100,
|
|
0
|
|
)
|
|
FROM disk_repositories
|
|
WHERE is_active = true
|
|
`
|
|
if err := s.db.QueryRowContext(ctx, query).Scan(&usagePercent); err == nil {
|
|
if usagePercent > 95 {
|
|
return ComponentHealth{
|
|
Name: "storage",
|
|
Status: HealthStatusDegraded,
|
|
Message: "Storage repositories are nearly full",
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
}
|
|
|
|
return ComponentHealth{
|
|
Name: "storage",
|
|
Status: HealthStatusHealthy,
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
// checkSCST checks SCST component health
|
|
func (s *HealthService) checkSCST(ctx context.Context) ComponentHealth {
|
|
// Check if SCST targets exist
|
|
var count int
|
|
if err := s.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM scst_targets").Scan(&count); err != nil {
|
|
return ComponentHealth{
|
|
Name: "scst",
|
|
Status: HealthStatusUnknown,
|
|
Message: "Failed to query SCST targets",
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
// SCST is healthy if we can query it (even if no targets exist)
|
|
return ComponentHealth{
|
|
Name: "scst",
|
|
Status: HealthStatusHealthy,
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|