Files
storage-appliance/internal/monitoring/collectors.go

439 lines
11 KiB
Go

package monitoring
import (
"context"
"fmt"
"os"
"strconv"
"strings"
"time"
"github.com/example/storage-appliance/internal/infra/osexec"
"github.com/example/storage-appliance/internal/service"
)
// DefaultTimeout is the deadline each collector in this file applies to the
// context it receives in Collect.
const DefaultTimeout = 5 * time.Second
// MetricValue is one sample produced by a collector: a metric name, the
// labels identifying the series, the numeric value and the metric type.
type MetricValue struct {
	// Name is the metric name, e.g. "zfs_pool_health".
	Name string
	// Labels identifies the series the sample belongs to (pool, device, ...).
	Labels map[string]string
	// Value is the sampled value.
	Value float64
	// Type is the metric kind: "gauge" or "counter".
	Type string
}
// MetricCollection is the result of one Collect call: the samples that were
// gathered plus human-readable messages for anything that could not be read.
type MetricCollection struct {
	// Metrics holds the samples gathered during the call.
	Metrics []MetricValue
	// Errors holds one message per failure encountered during the call.
	Errors []string
}
// Collector is implemented by every metric source in this package.
type Collector interface {
	// Name returns a short identifier for the collector.
	Name() string
	// Collect gathers the collector's metrics. Failures are reported through
	// the returned collection's Errors field rather than a separate error.
	Collect(ctx context.Context) MetricCollection
}
// ZFSCollector reports ZFS pool health and whether a scrub is running.
type ZFSCollector struct {
	// ZFSSvc is used to enumerate the pools.
	ZFSSvc service.ZFSService
	// Runner executes the "zpool status" command used for scrub detection.
	Runner osexec.Runner
}
// NewZFSCollector returns a ZFSCollector that uses zfsSvc to list pools and
// runner to execute zpool commands.
func NewZFSCollector(zfsSvc service.ZFSService, runner osexec.Runner) *ZFSCollector {
	collector := new(ZFSCollector)
	collector.ZFSSvc = zfsSvc
	collector.Runner = runner
	return collector
}
// Name returns the identifier of this collector, "zfs".
func (c *ZFSCollector) Name() string {
	const name = "zfs"
	return name
}
// Collect emits two gauges per ZFS pool, both labelled with the pool name:
//
//   - zfs_pool_health: 1 for ONLINE, 0.5 for DEGRADED, 0 for every other
//     state (FAULTED, OFFLINE, UNAVAIL, or anything unrecognised).
//   - zfs_pool_scrub_in_progress: 1 while a scrub is running, otherwise 0.
//
// If the pool list cannot be obtained, the collection carries a single error
// and no metrics. If the scrub status of one pool cannot be read, an error is
// recorded for that pool, its health gauge is still emitted, and collection
// moves on to the next pool. The whole call is bounded by DefaultTimeout.
func (c *ZFSCollector) Collect(ctx context.Context) MetricCollection {
	ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	// Both slices start out empty but non-nil.
	result := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	// gauge appends one per-pool gauge sample to the result.
	gauge := func(name, pool string, value float64) {
		result.Metrics = append(result.Metrics, MetricValue{
			Name:   name,
			Labels: map[string]string{"pool": pool},
			Value:  value,
			Type:   "gauge",
		})
	}

	pools, listErr := c.ZFSSvc.ListPools(ctx)
	if listErr != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("failed to list pools: %v", listErr))
		return result
	}

	for _, p := range pools {
		// Any state other than ONLINE or DEGRADED keeps the zero value.
		var health float64
		switch strings.ToUpper(p.Health) {
		case "ONLINE":
			health = 1.0
		case "DEGRADED":
			health = 0.5
		}
		gauge("zfs_pool_health", p.Name, health)

		scanLine, scanErr := c.getScrubStatus(ctx, p.Name)
		if scanErr != nil {
			result.Errors = append(result.Errors, fmt.Sprintf("failed to get scrub status for %s: %v", p.Name, scanErr))
			continue
		}

		var scrubbing float64
		if strings.Contains(scanLine, "scan: scrub in progress") {
			scrubbing = 1.0
		}
		gauge("zfs_pool_scrub_in_progress", p.Name, scrubbing)
	}

	return result
}
// getScrubStatus runs "zpool status <pool>" and returns the first output line
// containing "scan:", with surrounding whitespace removed. When no such line
// exists it returns the placeholder "no-scan". An error from running the
// command is returned unchanged.
func (c *ZFSCollector) getScrubStatus(ctx context.Context, pool string) (string, error) {
	stdout, _, _, err := osexec.ExecWithRunner(c.Runner, ctx, "zpool", "status", pool)
	if err != nil {
		return "", err
	}

	lines := strings.Split(stdout, "\n")
	for i := range lines {
		if !strings.Contains(lines[i], "scan:") {
			continue
		}
		return strings.TrimSpace(lines[i]), nil
	}
	return "no-scan", nil
}
// SMARTCollector reports the SMART health verdict of local disks.
type SMARTCollector struct {
	// Runner executes the smartctl command.
	Runner osexec.Runner
}
// NewSMARTCollector returns a SMARTCollector that runs smartctl through runner.
func NewSMARTCollector(runner osexec.Runner) *SMARTCollector {
	collector := new(SMARTCollector)
	collector.Runner = runner
	return collector
}
// Name returns the identifier of this collector, "smart".
func (c *SMARTCollector) Name() string {
	const name = "smart"
	return name
}
// Collect emits one smart_health gauge per probed disk, labelled with the
// device path: 1 when smartctl reports the disk as healthy, 0 otherwise.
//
// A disk counts as healthy when the smartctl output contains either
// "PASSED" (the ATA/NVMe verdict line) or "SMART Health Status: OK" (the
// SCSI/SAS verdict line); both checks are case-insensitive. Previously only
// "PASSED" was recognised, so healthy SCSI/SAS disks were reported as failed.
//
// Only a fixed list of common device names is probed. Devices for which
// smartctl cannot be queried are skipped silently on purpose, because most
// hosts will not have every name in the list. The whole call is bounded by
// DefaultTimeout.
func (c *SMARTCollector) Collect(ctx context.Context) MetricCollection {
	ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	collection := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	// Simplified discovery: try common device names rather than scanning
	// /dev or calling lsblk.
	candidates := []string{"sda", "sdb", "sdc", "nvme0n1", "nvme1n1"}

	for _, name := range candidates {
		disk := "/dev/" + name

		report, err := c.getSMARTHealth(ctx, disk)
		if err != nil {
			// Device is absent or could not be queried: best-effort skip.
			continue
		}

		upper := strings.ToUpper(report)
		healthy := strings.Contains(upper, "PASSED") ||
			strings.Contains(upper, "SMART HEALTH STATUS: OK")

		healthValue := 0.0
		if healthy {
			healthValue = 1.0
		}
		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "smart_health",
			Labels: map[string]string{"device": disk},
			Value:  healthValue,
			Type:   "gauge",
		})
	}

	return collection
}
// getSMARTHealth runs "smartctl -H <device>" and returns its standard output
// when that output carries a health verdict.
//
// smartctl's exit status is a bitmask, not a plain success/failure flag:
// bits 0-2 mean the command line was invalid, the device could not be opened,
// or a SMART command failed, so no verdict is available; bit 3 means the
// health check returned "DISK FAILING"; bits 4-7 flag other conditions that
// do not invalidate the verdict. Treating every non-zero status as an error,
// as this function used to, dropped exactly the disks that are failing.
//
// The output is returned when bit 3 is set, or when the command ran without
// error and bits 0-2 are clear. Everything else yields an error.
//
// NOTE(review): this assumes ExecWithRunner reports the process exit status in
// its third result even when it also returns a non-nil error — confirm.
func (c *SMARTCollector) getSMARTHealth(ctx context.Context, device string) (string, error) {
	const (
		exitNoVerdict   = 0x07 // bits 0-2: no usable health verdict
		exitDiskFailing = 0x08 // bit 3: SMART status check returned "DISK FAILING"
	)

	out, _, code, err := osexec.ExecWithRunner(c.Runner, ctx, "smartctl", "-H", device)
	switch {
	case code > 0 && code&exitDiskFailing != 0:
		// The disk is reported as failing; the caller must see this output.
		return out, nil
	case err != nil:
		return "", fmt.Errorf("smartctl -H %s: %w", device, err)
	case code < 0 || code&exitNoVerdict != 0:
		return "", fmt.Errorf("smartctl -H %s: exit status %d", device, code)
	}
	return out, nil
}
// ServiceCollector reports whether a fixed set of services is running.
type ServiceCollector struct {
	// Runner executes the systemctl and pgrep commands.
	Runner osexec.Runner
}
// NewServiceCollector returns a ServiceCollector that runs its commands
// through runner.
func NewServiceCollector(runner osexec.Runner) *ServiceCollector {
	collector := new(ServiceCollector)
	collector.Runner = runner
	return collector
}
// Name returns the identifier of this collector, "services".
func (c *ServiceCollector) Name() string {
	const name = "services"
	return name
}
// Collect emits one service_state gauge per monitored service, labelled with
// the service name: 1 when the service is active or running, 0 otherwise.
//
// The status is compared for equality after trimming whitespace and
// lower-casing. A substring test is not used because "inactive" contains
// "active", which made every stopped service report as running.
//
// A service whose status cannot be determined gets an entry in Errors and no
// metric. The whole call is bounded by DefaultTimeout.
func (c *ServiceCollector) Collect(ctx context.Context) MetricCollection {
	ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	collection := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	services := []string{"nfs-server", "smbd", "iscsid", "iscsi", "minio"}
	for _, svc := range services {
		status, err := c.getServiceStatus(ctx, svc)
		if err != nil {
			collection.Errors = append(collection.Errors, fmt.Sprintf("failed to check %s: %v", svc, err))
			continue
		}

		// Exact match only: "inactive" must not be mistaken for "active".
		stateValue := 0.0
		switch strings.ToLower(strings.TrimSpace(status)) {
		case "active", "running":
			stateValue = 1.0
		}

		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "service_state",
			Labels: map[string]string{"service": svc},
			Value:  stateValue,
			Type:   "gauge",
		})
	}

	return collection
}
// getServiceStatus reports the state of the named service.
//
// It first runs "systemctl is-active <service>" and, when that command
// succeeds with exit status 0, returns its output verbatim. Otherwise it runs
// "pgrep -f <service>" and returns "running" when that succeeds with
// non-blank output. In every remaining case it returns "inactive". The
// returned error is always nil.
func (c *ServiceCollector) getServiceStatus(ctx context.Context, service string) (string, error) {
	unitOut, _, unitCode, unitErr := osexec.ExecWithRunner(c.Runner, ctx, "systemctl", "is-active", service)
	if unitErr == nil && unitCode == 0 {
		return unitOut, nil
	}

	// systemctl did not report the unit as active; look for a matching process.
	procOut, _, procCode, procErr := osexec.ExecWithRunner(c.Runner, ctx, "pgrep", "-f", service)
	if procErr != nil || procCode != 0 || strings.TrimSpace(procOut) == "" {
		return "inactive", nil
	}
	return "running", nil
}
// HostCollector reports host-level metrics (load average, memory and disk
// I/O counters) read from files under /proc. It carries no state.
type HostCollector struct{}
// NewHostCollector returns a ready-to-use HostCollector.
func NewHostCollector() *HostCollector {
	return new(HostCollector)
}
// Name returns the identifier of this collector, "host".
func (c *HostCollector) Name() string {
	const name = "host"
	return name
}
// Collect gathers host metrics from /proc in three independent steps:
//
//   - load average: gauges host_load1, host_load5, host_load15
//   - memory: gauges host_memory_total_bytes, host_memory_free_bytes,
//     host_memory_available_bytes
//   - disk I/O: counters host_disk_reads_completed and
//     host_disk_writes_completed, one pair per device, labelled "device"
//
// A step that fails adds one message to Errors and contributes no metrics;
// the other steps still run. Devices are emitted in map iteration order, so
// the order of the disk metrics is not stable between calls.
func (c *HostCollector) Collect(ctx context.Context) MetricCollection {
	// The derived context is not passed on: the /proc readers below take no
	// context. It is still created and cancelled as before.
	_, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	result := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	// emit appends a single sample to the result.
	emit := func(name, kind string, labels map[string]string, value float64) {
		result.Metrics = append(result.Metrics, MetricValue{
			Name:   name,
			Labels: labels,
			Value:  value,
			Type:   kind,
		})
	}

	type sample struct {
		name  string
		value float64
	}

	// Load average.
	if load, err := c.readLoadAvg(); err != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("failed to read loadavg: %v", err))
	} else {
		for _, s := range []sample{
			{"host_load1", load.Load1},
			{"host_load5", load.Load5},
			{"host_load15", load.Load15},
		} {
			emit(s.name, "gauge", map[string]string{}, s.value)
		}
	}

	// Memory.
	if mem, err := c.readMemInfo(); err != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("failed to read meminfo: %v", err))
	} else {
		for _, s := range []sample{
			{"host_memory_total_bytes", mem.MemTotal},
			{"host_memory_free_bytes", mem.MemFree},
			{"host_memory_available_bytes", mem.MemAvailable},
		} {
			emit(s.name, "gauge", map[string]string{}, s.value)
		}
	}

	// Disk I/O counters from /proc/diskstats.
	if disks, err := c.readDiskIO(); err != nil {
		result.Errors = append(result.Errors, fmt.Sprintf("failed to read disk IO: %v", err))
	} else {
		for device, stat := range disks {
			emit("host_disk_reads_completed", "counter", map[string]string{"device": device}, stat.ReadsCompleted)
			emit("host_disk_writes_completed", "counter", map[string]string{"device": device}, stat.WritesCompleted)
		}
	}

	return result
}
// LoadAvg holds the first three fields of /proc/loadavg, in file order.
type LoadAvg struct {
	Load1, Load5, Load15 float64
}
// readLoadAvg reads /proc/loadavg and returns its first three
// whitespace-separated fields as Load1, Load5 and Load15.
//
// It returns an error when the file cannot be read, when it has fewer than
// three fields, or when one of those fields is not a valid float. Parse
// failures used to be ignored, which turned malformed data into a reported
// load of 0.
func (c *HostCollector) readLoadAvg() (LoadAvg, error) {
	data, err := os.ReadFile("/proc/loadavg")
	if err != nil {
		return LoadAvg{}, err
	}

	fields := strings.Fields(string(data))
	if len(fields) < 3 {
		return LoadAvg{}, fmt.Errorf("invalid loadavg format")
	}

	var loads [3]float64
	for i := range loads {
		v, err := strconv.ParseFloat(fields[i], 64)
		if err != nil {
			return LoadAvg{}, fmt.Errorf("invalid loadavg field %q: %w", fields[i], err)
		}
		loads[i] = v
	}

	return LoadAvg{Load1: loads[0], Load5: loads[1], Load15: loads[2]}, nil
}
// MemInfo holds the MemTotal, MemFree and MemAvailable entries of
// /proc/meminfo, converted to bytes.
type MemInfo struct {
	MemTotal, MemFree, MemAvailable float64
}
// readMemInfo reads /proc/meminfo and returns the MemTotal, MemFree and
// MemAvailable entries converted from KB to bytes.
//
// Only those three keys are parsed; all other lines are ignored. It returns
// an error when the file cannot be read, when one of the three values is not
// a valid number, or when MemTotal is missing altogether. Previously such
// input was silently reported as 0 bytes. A missing MemFree or MemAvailable
// entry is tolerated and left at 0.
func (c *HostCollector) readMemInfo() (MemInfo, error) {
	data, err := os.ReadFile("/proc/meminfo")
	if err != nil {
		return MemInfo{}, err
	}

	var (
		info      MemInfo
		haveTotal bool
	)
	for _, line := range strings.Split(string(data), "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}

		key := strings.TrimSuffix(fields[0], ":")

		// Pick the destination first so unrelated lines are never parsed.
		var dst *float64
		switch key {
		case "MemTotal":
			dst = &info.MemTotal
			haveTotal = true
		case "MemFree":
			dst = &info.MemFree
		case "MemAvailable":
			dst = &info.MemAvailable
		default:
			continue
		}

		kb, err := strconv.ParseFloat(fields[1], 64)
		if err != nil {
			return MemInfo{}, fmt.Errorf("invalid meminfo value for %s: %w", key, err)
		}
		// Values are in KB, convert to bytes.
		*dst = kb * 1024
	}

	if !haveTotal {
		return MemInfo{}, fmt.Errorf("invalid meminfo format: MemTotal not found")
	}
	return info, nil
}
// DiskIO holds the completed-read and completed-write counters of one
// device as read from /proc/diskstats.
type DiskIO struct {
	ReadsCompleted, WritesCompleted float64
}
// readDiskIO reads /proc/diskstats and returns the I/O counters keyed by
// device name.
//
// Each line is split on whitespace; lines with fewer than 14 columns are
// skipped. Column 2 is used as the device name, column 3 as the number of
// reads completed and column 7 as the number of writes completed
// (zero-based). Parse errors on the counters are ignored, so the counter
// keeps whatever value ParseFloat returned. An error is returned only when
// the file cannot be read.
func (c *HostCollector) readDiskIO() (map[string]DiskIO, error) {
	raw, err := os.ReadFile("/proc/diskstats")
	if err != nil {
		return nil, err
	}

	stats := make(map[string]DiskIO)
	for _, row := range strings.Split(string(raw), "\n") {
		cols := strings.Fields(row)
		if len(cols) >= 14 {
			var entry DiskIO
			entry.ReadsCompleted, _ = strconv.ParseFloat(cols[3], 64)
			entry.WritesCompleted, _ = strconv.ParseFloat(cols[7], 64)
			stats[cols[2]] = entry
		}
	}
	return stats, nil
}