start working on the frontend side
backend/internal/monitoring/rules.go (new file, 233 lines)
@@ -0,0 +1,233 @@
package monitoring

import (
	"context"
	"fmt"
	"time"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
)

// AlertRule represents a rule that can trigger alerts
type AlertRule struct {
	ID          string
	Name        string
	Source      AlertSource
	Condition   AlertCondition
	Severity    AlertSeverity
	Enabled     bool
	Description string
}

// NewAlertRule creates a new alert rule (helper function)
func NewAlertRule(id, name string, source AlertSource, condition AlertCondition, severity AlertSeverity, enabled bool, description string) *AlertRule {
	return &AlertRule{
		ID:          id,
		Name:        name,
		Source:      source,
		Condition:   condition,
		Severity:    severity,
		Enabled:     enabled,
		Description: description,
	}
}

// AlertCondition represents a condition that triggers an alert.
// Evaluate reports whether the condition fired; when it returns true,
// the accompanying *Alert carries the title, message, and metadata of
// the alert to raise.
type AlertCondition interface {
	Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error)
}

// AlertRuleEngine manages alert rules and evaluation
type AlertRuleEngine struct {
	db       *database.DB
	logger   *logger.Logger
	service  *AlertService
	rules    []*AlertRule
	interval time.Duration
	stopCh   chan struct{}
}

// NewAlertRuleEngine creates a new alert rule engine
func NewAlertRuleEngine(db *database.DB, log *logger.Logger, service *AlertService) *AlertRuleEngine {
	return &AlertRuleEngine{
		db:       db,
		logger:   log,
		service:  service,
		rules:    []*AlertRule{},
		interval: 30 * time.Second, // Check every 30 seconds
		stopCh:   make(chan struct{}),
	}
}

// RegisterRule registers an alert rule. The rules slice is not guarded
// by a mutex, so rules should be registered before Start is called.
func (e *AlertRuleEngine) RegisterRule(rule *AlertRule) {
	e.rules = append(e.rules, rule)
	e.logger.Info("Alert rule registered", "rule_id", rule.ID, "name", rule.Name)
}

// Start starts the alert rule engine background monitoring. It blocks
// until the context is cancelled or Stop is called, so it is typically
// run in its own goroutine.
func (e *AlertRuleEngine) Start(ctx context.Context) {
	e.logger.Info("Starting alert rule engine", "interval", e.interval)
	ticker := time.NewTicker(e.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			e.logger.Info("Alert rule engine stopped")
			return
		case <-e.stopCh:
			e.logger.Info("Alert rule engine stopped")
			return
		case <-ticker.C:
			e.evaluateRules(ctx)
		}
	}
}

// Stop stops the alert rule engine. It must be called at most once:
// closing an already-closed channel panics.
func (e *AlertRuleEngine) Stop() {
	close(e.stopCh)
}

// evaluateRules evaluates all registered rules
func (e *AlertRuleEngine) evaluateRules(ctx context.Context) {
	for _, rule := range e.rules {
		if !rule.Enabled {
			continue
		}

		triggered, alert, err := rule.Condition.Evaluate(ctx, e.db, e.logger)
		if err != nil {
			e.logger.Error("Error evaluating alert rule",
				"rule_id", rule.ID,
				"rule_name", rule.Name,
				"error", err,
			)
			continue
		}

		if triggered && alert != nil {
			alert.Severity = rule.Severity
			alert.Source = rule.Source
			if err := e.service.CreateAlert(ctx, alert); err != nil {
				e.logger.Error("Failed to create alert from rule",
					"rule_id", rule.ID,
					"error", err,
				)
			}
		}
	}
}

// Built-in alert conditions

// StorageCapacityCondition checks whether storage usage has reached a
// threshold percentage
type StorageCapacityCondition struct {
	ThresholdPercent float64
}

func (c *StorageCapacityCondition) Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error) {
	query := `
		SELECT id, name, used_bytes, total_bytes
		FROM disk_repositories
		WHERE is_active = true
	`

	rows, err := db.QueryContext(ctx, query)
	if err != nil {
		return false, nil, fmt.Errorf("failed to query repositories: %w", err)
	}
	defer rows.Close()

	for rows.Next() {
		var id, name string
		var usedBytes, totalBytes int64

		if err := rows.Scan(&id, &name, &usedBytes, &totalBytes); err != nil {
			logger.Error("Failed to scan repository row", "error", err)
			continue
		}

		if totalBytes == 0 {
			continue
		}

		usagePercent := float64(usedBytes) / float64(totalBytes) * 100

		if usagePercent >= c.ThresholdPercent {
			alert := &Alert{
				Title:        fmt.Sprintf("Storage repository %s is %d%% full", name, int(usagePercent)),
				Message:      fmt.Sprintf("Repository %s has used %d%% of its capacity (%d/%d bytes)", name, int(usagePercent), usedBytes, totalBytes),
				ResourceType: "repository",
				ResourceID:   id,
				Metadata: map[string]interface{}{
					"usage_percent": usagePercent,
					"used_bytes":    usedBytes,
					"total_bytes":   totalBytes,
				},
			}
			return true, alert, nil
		}
	}

	return false, nil, rows.Err()
}

// TaskFailureCondition checks for recently failed tasks
type TaskFailureCondition struct {
	LookbackMinutes int
}

func (c *TaskFailureCondition) Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error) {
	// The lookback window is spliced into the query with Sprintf. This is
	// injection-safe only because LookbackMinutes is an int; a bound
	// parameter would still be preferable.
	query := `
		SELECT id, type, error_message, created_at
		FROM tasks
		WHERE status = 'failed'
		AND created_at > NOW() - INTERVAL '%d minutes'
		ORDER BY created_at DESC
		LIMIT 1
	`

	rows, err := db.QueryContext(ctx, fmt.Sprintf(query, c.LookbackMinutes))
	if err != nil {
		return false, nil, fmt.Errorf("failed to query failed tasks: %w", err)
	}
	defer rows.Close()

	if rows.Next() {
		var id, taskType, errorMsg string
		var createdAt time.Time

		if err := rows.Scan(&id, &taskType, &errorMsg, &createdAt); err != nil {
			return false, nil, err
		}

		alert := &Alert{
			Title:        fmt.Sprintf("Task %s failed", taskType),
			Message:      errorMsg,
			ResourceType: "task",
			ResourceID:   id,
			Metadata: map[string]interface{}{
				"task_type":  taskType,
				"created_at": createdAt,
			},
		}
		return true, alert, nil
	}

	return false, nil, rows.Err()
}

// SystemServiceDownCondition checks if critical services are down
type SystemServiceDownCondition struct {
	CriticalServices []string
}

func (c *SystemServiceDownCondition) Evaluate(ctx context.Context, db *database.DB, logger *logger.Logger) (bool, *Alert, error) {
	// This would check systemd service status. For now, we return false,
	// as this requires systemd integration; it is a placeholder for a
	// future implementation.
	return false, nil, nil
}
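
As a usage note on the AlertCondition interface above: a condition implements a single Evaluate method and returns (true, alert, nil) when it fires. A minimal sketch of a custom condition follows; StaleCheckinCondition and its LastCheckin field are hypothetical illustrations, not part of this commit.

package monitoring

import (
	"context"
	"fmt"
	"time"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
)

// StaleCheckinCondition is a hypothetical custom condition illustrating
// the Evaluate contract: return (true, alert, nil) to fire an alert,
// (false, nil, nil) otherwise.
type StaleCheckinCondition struct {
	MaxAge      time.Duration
	LastCheckin func() time.Time // injected timestamp source (hypothetical)
}

func (c *StaleCheckinCondition) Evaluate(ctx context.Context, db *database.DB, log *logger.Logger) (bool, *Alert, error) {
	age := time.Since(c.LastCheckin())
	if age < c.MaxAge {
		return false, nil, nil
	}
	return true, &Alert{
		Title:        "Agent check-in is stale",
		Message:      fmt.Sprintf("Last check-in was %s ago (max %s)", age.Round(time.Second), c.MaxAge),
		ResourceType: "agent",
		ResourceID:   "heartbeat", // illustrative
		Metadata:     map[string]interface{}{"age_seconds": age.Seconds()},
	}, nil
}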
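
And a sketch of how the engine might be wired up at startup. The constants monitoring.SourceStorage and monitoring.SeverityWarning are assumed stand-ins for whatever AlertSource/AlertSeverity values the package actually defines; the rule ID and description are illustrative.

package main

import (
	"context"

	"github.com/atlasos/calypso/internal/common/database"
	"github.com/atlasos/calypso/internal/common/logger"
	"github.com/atlasos/calypso/internal/monitoring"
)

// startRuleEngine wires up the rule engine; the constant names used
// below are assumptions, not confirmed package identifiers.
func startRuleEngine(ctx context.Context, db *database.DB, log *logger.Logger, svc *monitoring.AlertService) *monitoring.AlertRuleEngine {
	engine := monitoring.NewAlertRuleEngine(db, log, svc)

	engine.RegisterRule(monitoring.NewAlertRule(
		"storage-capacity-90",        // rule ID (illustrative)
		"Storage capacity above 90%", // name
		monitoring.SourceStorage,     // assumed AlertSource value
		&monitoring.StorageCapacityCondition{ThresholdPercent: 90},
		monitoring.SeverityWarning, // assumed AlertSeverity value
		true,                       // enabled
		"Fires when an active repository reaches 90% usage",
	))

	// Start blocks until ctx is cancelled or Stop is called, so run it
	// in its own goroutine.
	go engine.Start(ctx)
	return engine
}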