package monitoring

import (
	"context"
	"fmt"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/example/storage-appliance/internal/infra/osexec"
	"github.com/example/storage-appliance/internal/service"
)

const (
	DefaultTimeout = 5 * time.Second
)

// MetricValue represents a single metric value
type MetricValue struct {
	Name   string
	Labels map[string]string
	Value  float64
	Type   string // "gauge" or "counter"
}

// MetricCollection represents a collection of metrics
type MetricCollection struct {
	Metrics []MetricValue
	Errors  []string
}
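
// The Type field follows the Prometheus convention of "gauge" and "counter".
// As an illustrative sketch (not part of the original file), a MetricValue
// could be rendered in the Prometheus text exposition format roughly like
// this; the helper name formatPrometheus is assumed, and label order is
// nondeterministic because Go maps are unordered.
func formatPrometheus(m MetricValue) string {
	var b strings.Builder
	b.WriteString(m.Name)
	if len(m.Labels) > 0 {
		pairs := make([]string, 0, len(m.Labels))
		for k, v := range m.Labels {
			pairs = append(pairs, fmt.Sprintf("%s=%q", k, v))
		}
		b.WriteString("{" + strings.Join(pairs, ",") + "}")
	}
	b.WriteString(fmt.Sprintf(" %g\n", m.Value))
	return b.String()
}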

// Collector is the interface implemented by every metric collector.
type Collector interface {
	Collect(ctx context.Context) MetricCollection
	Name() string
}
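
// A minimal sketch of how these collectors might be driven (the CollectAll
// helper is assumed, not part of the original file): each collector runs in
// turn and its metrics and errors are merged, with each error prefixed by
// the collector's name.
func CollectAll(ctx context.Context, collectors []Collector) MetricCollection {
	merged := MetricCollection{Metrics: []MetricValue{}, Errors: []string{}}
	for _, c := range collectors {
		res := c.Collect(ctx)
		merged.Metrics = append(merged.Metrics, res.Metrics...)
		for _, e := range res.Errors {
			merged.Errors = append(merged.Errors, fmt.Sprintf("%s: %s", c.Name(), e))
		}
	}
	return merged
}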

// ZFSCollector collects ZFS pool health and scrub status
type ZFSCollector struct {
	ZFSSvc service.ZFSService
	Runner osexec.Runner
}

func NewZFSCollector(zfsSvc service.ZFSService, runner osexec.Runner) *ZFSCollector {
	return &ZFSCollector{ZFSSvc: zfsSvc, Runner: runner}
}

func (c *ZFSCollector) Name() string {
	return "zfs"
}

func (c *ZFSCollector) Collect(ctx context.Context) MetricCollection {
	ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	collection := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	// Get pool list
	pools, err := c.ZFSSvc.ListPools(ctx)
	if err != nil {
		collection.Errors = append(collection.Errors, fmt.Sprintf("failed to list pools: %v", err))
		return collection
	}

	for _, pool := range pools {
		// Pool health metric (1 = ONLINE, 0.5 = DEGRADED, 0 = FAULTED/OFFLINE)
		healthValue := 0.0
		switch strings.ToUpper(pool.Health) {
		case "ONLINE":
			healthValue = 1.0
		case "DEGRADED":
			healthValue = 0.5
		case "FAULTED", "OFFLINE", "UNAVAIL":
			healthValue = 0.0
		}

		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "zfs_pool_health",
			Labels: map[string]string{"pool": pool.Name},
			Value:  healthValue,
			Type:   "gauge",
		})

		// Get scrub status
		scrubStatus, err := c.getScrubStatus(ctx, pool.Name)
		if err != nil {
			collection.Errors = append(collection.Errors, fmt.Sprintf("failed to get scrub status for %s: %v", pool.Name, err))
			continue
		}

		// Scrub in progress (1 = yes, 0 = no)
		scrubInProgress := 0.0
		if strings.Contains(scrubStatus, "scan: scrub in progress") {
			scrubInProgress = 1.0
		}

		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "zfs_pool_scrub_in_progress",
			Labels: map[string]string{"pool": pool.Name},
			Value:  scrubInProgress,
			Type:   "gauge",
		})
	}

	return collection
}

func (c *ZFSCollector) getScrubStatus(ctx context.Context, pool string) (string, error) {
	out, _, _, err := osexec.ExecWithRunner(c.Runner, ctx, "zpool", "status", pool)
	if err != nil {
		return "", err
	}
	for _, line := range strings.Split(out, "\n") {
		if strings.Contains(line, "scan:") {
			return strings.TrimSpace(line), nil
		}
	}
	return "no-scan", nil
}

// SMARTCollector collects SMART health status
type SMARTCollector struct {
	Runner osexec.Runner
}

func NewSMARTCollector(runner osexec.Runner) *SMARTCollector {
	return &SMARTCollector{Runner: runner}
}

func (c *SMARTCollector) Name() string {
	return "smart"
}

func (c *SMARTCollector) Collect(ctx context.Context) MetricCollection {
	ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	collection := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	// List all disks (simplified: probe a fixed set of common device names).
	// A real implementation would enumerate /dev/ or use lsblk; see the
	// listBlockDevices sketch below getSMARTHealth.
	commonDisks := []string{"sda", "sdb", "sdc", "nvme0n1", "nvme1n1"}
	disks := []string{}
	for _, d := range commonDisks {
		disks = append(disks, fmt.Sprintf("/dev/%s", d))
	}

	// Check SMART health for each disk
	for _, disk := range disks {
		health, err := c.getSMARTHealth(ctx, disk)
		if err != nil {
			// Skip devices that don't exist or don't support SMART
			continue
		}

		// SMART health: 1 = PASSED, 0 = FAILED
		healthValue := 0.0
		if strings.Contains(strings.ToUpper(health), "PASSED") {
			healthValue = 1.0
		}

		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "smart_health",
			Labels: map[string]string{"device": disk},
			Value:  healthValue,
			Type:   "gauge",
		})
	}

	return collection
}

func (c *SMARTCollector) getSMARTHealth(ctx context.Context, device string) (string, error) {
	// Use smartctl -H to get health status
	out, _, code, err := osexec.ExecWithRunner(c.Runner, ctx, "smartctl", "-H", device)
	if err != nil {
		return "", fmt.Errorf("smartctl failed: %w", err)
	}
	if code != 0 {
		return "", fmt.Errorf("smartctl exited with code %d", code)
	}
	return out, nil
}
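
// A minimal sketch of the device scan mentioned in Collect above (assumed,
// not part of the original file): `lsblk -dn -o NAME` prints one whole-disk
// device name per line, which could replace the hard-coded commonDisks list.
// No filtering of loop or virtual devices is attempted here.
func (c *SMARTCollector) listBlockDevices(ctx context.Context) ([]string, error) {
	out, _, code, err := osexec.ExecWithRunner(c.Runner, ctx, "lsblk", "-dn", "-o", "NAME")
	if err != nil {
		return nil, fmt.Errorf("lsblk failed: %w", err)
	}
	if code != 0 {
		return nil, fmt.Errorf("lsblk exited with code %d", code)
	}
	disks := []string{}
	for _, line := range strings.Split(out, "\n") {
		name := strings.TrimSpace(line)
		if name == "" {
			continue
		}
		disks = append(disks, "/dev/"+name)
	}
	return disks, nil
}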

// ServiceCollector collects service states
type ServiceCollector struct {
	Runner osexec.Runner
}

func NewServiceCollector(runner osexec.Runner) *ServiceCollector {
	return &ServiceCollector{Runner: runner}
}

func (c *ServiceCollector) Name() string {
	return "services"
}

func (c *ServiceCollector) Collect(ctx context.Context) MetricCollection {
	ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	collection := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	services := []string{"nfs-server", "smbd", "iscsid", "iscsi", "minio"}

	for _, svc := range services {
		status, err := c.getServiceStatus(ctx, svc)
		if err != nil {
			collection.Errors = append(collection.Errors, fmt.Sprintf("failed to check %s: %v", svc, err))
			continue
		}

		// Service state: 1 = active/running, 0 = inactive/stopped.
		// Compare the whole status string rather than using Contains, since
		// "inactive" would otherwise match a substring check for "active".
		stateValue := 0.0
		switch strings.TrimSpace(strings.ToLower(status)) {
		case "active", "running":
			stateValue = 1.0
		}

		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "service_state",
			Labels: map[string]string{"service": svc},
			Value:  stateValue,
			Type:   "gauge",
		})
	}

	return collection
}

func (c *ServiceCollector) getServiceStatus(ctx context.Context, service string) (string, error) {
	// Try systemctl first
	out, _, code, err := osexec.ExecWithRunner(c.Runner, ctx, "systemctl", "is-active", service)
	if err == nil && code == 0 {
		return out, nil
	}

	// Fallback to checking process
	out, _, code, err = osexec.ExecWithRunner(c.Runner, ctx, "pgrep", "-f", service)
	if err == nil && code == 0 && strings.TrimSpace(out) != "" {
		return "running", nil
	}

	return "inactive", nil
}

// HostCollector collects host metrics from /proc
type HostCollector struct{}

func NewHostCollector() *HostCollector {
	return &HostCollector{}
}

func (c *HostCollector) Name() string {
	return "host"
}

func (c *HostCollector) Collect(ctx context.Context) MetricCollection {
	ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
	defer cancel()

	collection := MetricCollection{
		Metrics: []MetricValue{},
		Errors:  []string{},
	}

	// Load average
	loadavg, err := c.readLoadAvg()
	if err != nil {
		collection.Errors = append(collection.Errors, fmt.Sprintf("failed to read loadavg: %v", err))
	} else {
		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "host_load1",
			Labels: map[string]string{},
			Value:  loadavg.Load1,
			Type:   "gauge",
		})
		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "host_load5",
			Labels: map[string]string{},
			Value:  loadavg.Load5,
			Type:   "gauge",
		})
		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "host_load15",
			Labels: map[string]string{},
			Value:  loadavg.Load15,
			Type:   "gauge",
		})
	}

	// Memory info
	meminfo, err := c.readMemInfo()
	if err != nil {
		collection.Errors = append(collection.Errors, fmt.Sprintf("failed to read meminfo: %v", err))
	} else {
		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "host_memory_total_bytes",
			Labels: map[string]string{},
			Value:  meminfo.MemTotal,
			Type:   "gauge",
		})
		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "host_memory_free_bytes",
			Labels: map[string]string{},
			Value:  meminfo.MemFree,
			Type:   "gauge",
		})
		collection.Metrics = append(collection.Metrics, MetricValue{
			Name:   "host_memory_available_bytes",
			Labels: map[string]string{},
			Value:  meminfo.MemAvailable,
			Type:   "gauge",
		})
	}

	// Disk IO (simplified - read from /proc/diskstats)
	diskIO, err := c.readDiskIO()
	if err != nil {
		collection.Errors = append(collection.Errors, fmt.Sprintf("failed to read disk IO: %v", err))
	} else {
		for device, io := range diskIO {
			collection.Metrics = append(collection.Metrics, MetricValue{
				Name:   "host_disk_reads_completed",
				Labels: map[string]string{"device": device},
				Value:  io.ReadsCompleted,
				Type:   "counter",
			})
			collection.Metrics = append(collection.Metrics, MetricValue{
				Name:   "host_disk_writes_completed",
				Labels: map[string]string{"device": device},
				Value:  io.WritesCompleted,
				Type:   "counter",
			})
		}
	}

	return collection
}

type LoadAvg struct {
	Load1  float64
	Load5  float64
	Load15 float64
}

func (c *HostCollector) readLoadAvg() (LoadAvg, error) {
	data, err := os.ReadFile("/proc/loadavg")
	if err != nil {
		return LoadAvg{}, err
	}

	fields := strings.Fields(string(data))
	if len(fields) < 3 {
		return LoadAvg{}, fmt.Errorf("invalid loadavg format")
	}

	load1, _ := strconv.ParseFloat(fields[0], 64)
	load5, _ := strconv.ParseFloat(fields[1], 64)
	load15, _ := strconv.ParseFloat(fields[2], 64)

	return LoadAvg{Load1: load1, Load5: load5, Load15: load15}, nil
}

type MemInfo struct {
	MemTotal     float64
	MemFree      float64
	MemAvailable float64
}

func (c *HostCollector) readMemInfo() (MemInfo, error) {
	data, err := os.ReadFile("/proc/meminfo")
	if err != nil {
		return MemInfo{}, err
	}

	info := MemInfo{}
	lines := strings.Split(string(data), "\n")
	for _, line := range lines {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		key := strings.TrimSuffix(fields[0], ":")
		value, _ := strconv.ParseFloat(fields[1], 64)
		// Values are in kB, convert to bytes
		valueBytes := value * 1024

		switch key {
		case "MemTotal":
			info.MemTotal = valueBytes
		case "MemFree":
			info.MemFree = valueBytes
		case "MemAvailable":
			info.MemAvailable = valueBytes
		}
	}

	return info, nil
}

type DiskIO struct {
	ReadsCompleted  float64
	WritesCompleted float64
}

func (c *HostCollector) readDiskIO() (map[string]DiskIO, error) {
	data, err := os.ReadFile("/proc/diskstats")
	if err != nil {
		return nil, err
	}

	result := make(map[string]DiskIO)
	lines := strings.Split(string(data), "\n")
	for _, line := range lines {
		fields := strings.Fields(line)
		if len(fields) < 14 {
			continue
		}
		// /proc/diskstats layout: major, minor, device name, then the I/O
		// counters; field 3 is reads completed, field 7 is writes completed.
		device := fields[2]
		reads, _ := strconv.ParseFloat(fields[3], 64)
		writes, _ := strconv.ParseFloat(fields[7], 64)

		result[device] = DiskIO{
			ReadsCompleted:  reads,
			WritesCompleted: writes,
		}
	}

	return result, nil
}