start working on the frontend side
backend/internal/monitoring/rules.go (new file, 233 lines)
@@ -0,0 +1,233 @@
package monitoring

import (
	"context"
	"fmt"
	"time"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
)

// AlertRule represents a rule that can trigger alerts
type AlertRule struct {
	ID          string
	Name        string
	Source      AlertSource
	Condition   AlertCondition
	Severity    AlertSeverity
	Enabled     bool
	Description string
}

// NewAlertRule creates a new alert rule (helper function)
func NewAlertRule(id, name string, source AlertSource, condition AlertCondition, severity AlertSeverity, enabled bool, description string) *AlertRule {
	return &AlertRule{
		ID:          id,
		Name:        name,
		Source:      source,
		Condition:   condition,
		Severity:    severity,
		Enabled:     enabled,
		Description: description,
	}
}

// AlertCondition represents a condition that triggers an alert.
// Evaluate reports whether the condition fired; when it returns true,
// the accompanying *Alert carries the title, message, and metadata of
// the alert to raise.
type AlertCondition interface {
	Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error)
}

// AlertRuleEngine manages alert rules and evaluation
type AlertRuleEngine struct {
	db       *database.DB
	logger   *logger.Logger
	service  *AlertService
	rules    []*AlertRule
	interval time.Duration
	stopCh   chan struct{}
}

// NewAlertRuleEngine creates a new alert rule engine
func NewAlertRuleEngine(db *database.DB, log *logger.Logger, service *AlertService) *AlertRuleEngine {
	return &AlertRuleEngine{
		db:       db,
		logger:   log,
		service:  service,
		rules:    []*AlertRule{},
		interval: 30 * time.Second, // Check every 30 seconds
		stopCh:   make(chan struct{}),
	}
}

// RegisterRule registers an alert rule. The rules slice is not guarded
// by a mutex, so rules should be registered before Start is called.
func (e *AlertRuleEngine) RegisterRule(rule *AlertRule) {
	e.rules = append(e.rules, rule)
	e.logger.Info("Alert rule registered", "rule_id", rule.ID, "name", rule.Name)
}

// Start starts the alert rule engine background monitoring. It blocks
// until the context is cancelled or Stop is called, so it is typically
// run in its own goroutine.
func (e *AlertRuleEngine) Start(ctx context.Context) {
	e.logger.Info("Starting alert rule engine", "interval", e.interval)
	ticker := time.NewTicker(e.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			e.logger.Info("Alert rule engine stopped")
			return
		case <-e.stopCh:
			e.logger.Info("Alert rule engine stopped")
			return
		case <-ticker.C:
			e.evaluateRules(ctx)
		}
	}
}

// Stop stops the alert rule engine. It must be called at most once:
// closing an already-closed channel panics.
func (e *AlertRuleEngine) Stop() {
	close(e.stopCh)
}

// evaluateRules evaluates all registered rules
func (e *AlertRuleEngine) evaluateRules(ctx context.Context) {
	for _, rule := range e.rules {
		if !rule.Enabled {
			continue
		}

		triggered, alert, err := rule.Condition.Evaluate(ctx, e.db, e.logger)
		if err != nil {
			e.logger.Error("Error evaluating alert rule",
				"rule_id", rule.ID,
				"rule_name", rule.Name,
				"error", err,
			)
			continue
		}

		if triggered && alert != nil {
			alert.Severity = rule.Severity
			alert.Source = rule.Source
			if err := e.service.CreateAlert(ctx, alert); err != nil {
				e.logger.Error("Failed to create alert from rule",
					"rule_id", rule.ID,
					"error", err,
				)
			}
		}
	}
}

// Built-in alert conditions

// StorageCapacityCondition checks whether storage usage has reached a
// threshold percentage
type StorageCapacityCondition struct {
	ThresholdPercent float64
}

func (c *StorageCapacityCondition) Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error) {
	query := `
		SELECT id, name, used_bytes, total_bytes
		FROM disk_repositories
		WHERE is_active = true
	`

	rows, err := db.QueryContext(ctx, query)
	if err != nil {
		return false, nil, fmt.Errorf("failed to query repositories: %w", err)
	}
	defer rows.Close()

	for rows.Next() {
		var id, name string
		var usedBytes, totalBytes int64

		if err := rows.Scan(&id, &name, &usedBytes, &totalBytes); err != nil {
			logger.Error("Failed to scan repository row", "error", err)
			continue
		}

		if totalBytes == 0 {
			continue
		}

		usagePercent := float64(usedBytes) / float64(totalBytes) * 100

		if usagePercent >= c.ThresholdPercent {
			alert := &Alert{
				Title:        fmt.Sprintf("Storage repository %s is %d%% full", name, int(usagePercent)),
				Message:      fmt.Sprintf("Repository %s has used %d%% of its capacity (%d/%d bytes)", name, int(usagePercent), usedBytes, totalBytes),
				ResourceType: "repository",
				ResourceID:   id,
				Metadata: map[string]interface{}{
					"usage_percent": usagePercent,
					"used_bytes":    usedBytes,
					"total_bytes":   totalBytes,
				},
			}
			return true, alert, nil
		}
	}

	return false, nil, rows.Err()
}

// TaskFailureCondition checks for recently failed tasks
type TaskFailureCondition struct {
	LookbackMinutes int
}

func (c *TaskFailureCondition) Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error) {
	// The lookback window is spliced into the query with Sprintf. This is
	// injection-safe only because LookbackMinutes is an int; a bound
	// parameter would still be preferable.
	query := `
		SELECT id, type, error_message, created_at
		FROM tasks
		WHERE status = 'failed'
		AND created_at > NOW() - INTERVAL '%d minutes'
		ORDER BY created_at DESC
		LIMIT 1
	`

	rows, err := db.QueryContext(ctx, fmt.Sprintf(query, c.LookbackMinutes))
	if err != nil {
		return false, nil, fmt.Errorf("failed to query failed tasks: %w", err)
	}
	defer rows.Close()

	if rows.Next() {
		var id, taskType, errorMsg string
		var createdAt time.Time

		if err := rows.Scan(&id, &taskType, &errorMsg, &createdAt); err != nil {
			return false, nil, err
		}

		alert := &Alert{
			Title:        fmt.Sprintf("Task %s failed", taskType),
			Message:      errorMsg,
			ResourceType: "task",
			ResourceID:   id,
			Metadata: map[string]interface{}{
				"task_type":  taskType,
				"created_at": createdAt,
			},
		}
		return true, alert, nil
	}

	return false, nil, rows.Err()
}

// SystemServiceDownCondition checks if critical services are down
type SystemServiceDownCondition struct {
	CriticalServices []string
}

func (c *SystemServiceDownCondition) Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error) {
	// This would check systemd service status. For now, we return false,
	// as this requires systemd integration; it is a placeholder for a
	// future implementation.
	return false, nil, nil
}
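
As a usage note on the AlertCondition interface above: a condition implements a single Evaluate method and returns (true, alert, nil) when it fires. A minimal sketch of a custom condition follows; StaleCheckinCondition and its LastCheckin field are hypothetical illustrations, not part of this commit.

package monitoring

import (
	"context"
	"fmt"
	"time"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
)

// StaleCheckinCondition is a hypothetical custom condition illustrating
// the Evaluate contract: return (true, alert, nil) to fire an alert,
// (false, nil, nil) otherwise.
type StaleCheckinCondition struct {
	MaxAge      time.Duration
	LastCheckin func() time.Time // injected timestamp source (hypothetical)
}

func (c *StaleCheckinCondition) Evaluate(ctx context.Context, db *database.DB, log *logger.Logger) (bool, *Alert, error) {
	age := time.Since(c.LastCheckin())
	if age < c.MaxAge {
		return false, nil, nil
	}
	return true, &Alert{
		Title:        "Agent check-in is stale",
		Message:      fmt.Sprintf("Last check-in was %s ago (max %s)", age.Round(time.Second), c.MaxAge),
		ResourceType: "agent",
		ResourceID:   "heartbeat", // illustrative
		Metadata:     map[string]interface{}{"age_seconds": age.Seconds()},
	}, nil
}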
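
And a sketch of how the engine might be wired up at startup. The constants monitoring.SourceStorage and monitoring.SeverityWarning are assumed stand-ins for whatever AlertSource/AlertSeverity values the package actually defines; the rule ID and description are illustrative.

package main

import (
	"context"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
	"github.com/atlasos/calypso/internal/monitoring"
)

// startRuleEngine wires up the rule engine; the constant names used
// below are assumptions, not confirmed package identifiers.
func startRuleEngine(ctx context.Context, db *database.DB, log *logger.Logger, svc *monitoring.AlertService) *monitoring.AlertRuleEngine {
	engine := monitoring.NewAlertRuleEngine(db, log, svc)

	engine.RegisterRule(monitoring.NewAlertRule(
		"storage-capacity-90",        // rule ID (illustrative)
		"Storage capacity above 90%", // name
		monitoring.SourceStorage,     // assumed AlertSource value
		&monitoring.StorageCapacityCondition{ThresholdPercent: 90},
		monitoring.SeverityWarning, // assumed AlertSeverity value
		true,                       // enabled
		"Fires when an active repository reaches 90% usage",
	))

	// Start blocks until ctx is cancelled or Stop is called, so run it
	// in its own goroutine.
	go engine.Start(ctx)
	return engine
}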