Files
atlas/internal/metrics/collector.go
othman.suseno df475bc85e
Some checks failed
CI / test-build (push) Failing after 2m11s
logging and diagnostic features added
2025-12-15 00:45:14 +07:00

218 lines
6.5 KiB
Go

package metrics
import (
"fmt"
"sync"
"time"
"gitea.avt.data-center.id/othman.suseno/atlas/internal/models"
)
// Collector holds the most recently reported ZFS, storage-service, job and
// uptime figures so that Collect can render them on demand.
type Collector struct {
	// mu guards every field below: the Update*/Set* methods take the write
	// lock and Collect takes the read lock.
	mu sync.RWMutex

	// ZFS object counts, taken from the lengths of the slices handed to
	// UpdateZFSMetrics.
	poolCount, datasetCount, zvolCount, snapshotCount int

	// ZFS space totals, summed across all pools passed to UpdateZFSMetrics.
	totalCapacity, totalAllocated, totalFree uint64

	// Numbers of SMB shares, NFS exports and iSCSI targets as last reported.
	smbSharesCount, nfsExportsCount, iscsiTargetsCount int

	// Per-service state flags: 1 = running, 0 = stopped.
	smbServiceStatus, nfsServiceStatus, iscsiServiceStatus int

	// Job figures as last supplied to UpdateJobMetrics.
	jobsTotal, jobsRunning, jobsCompleted, jobsFailed int

	// uptimeSeconds is the value last passed to SetUptime.
	uptimeSeconds int64

	// lastUpdate is stamped by NewCollector and by every Update* method.
	lastUpdate time.Time
}
// NewCollector returns a Collector whose metrics are all zero and whose
// lastUpdate is set to the current time.
func NewCollector() *Collector {
	c := new(Collector)
	c.lastUpdate = time.Now()
	return c
}
// UpdateZFSMetrics replaces the stored ZFS figures. The four counts are the
// lengths of the given slices; capacity, allocated and free space are the sums
// of Size, Allocated and Free over pools. lastUpdate is refreshed.
func (c *Collector) UpdateZFSMetrics(pools []models.Pool, datasets []models.Dataset, zvols []models.ZVOL, snapshots []models.Snapshot) {
	// Sum into locals first; this touches only the arguments, so it does not
	// need the lock.
	var capacity, allocated, free uint64
	for i := range pools {
		capacity += pools[i].Size
		allocated += pools[i].Allocated
		free += pools[i].Free
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	c.poolCount, c.datasetCount = len(pools), len(datasets)
	c.zvolCount, c.snapshotCount = len(zvols), len(snapshots)
	c.totalCapacity, c.totalAllocated, c.totalFree = capacity, allocated, free
	c.lastUpdate = time.Now()
}
// UpdateServiceMetrics stores the share/export/target counts and the running
// state of the SMB, NFS and iSCSI services, then refreshes lastUpdate. Each
// boolean status is stored as 1 (running) or 0 (stopped).
func (c *Collector) UpdateServiceMetrics(smbShares, nfsExports, iscsiTargets int, smbStatus, nfsStatus, iscsiStatus bool) {
	// The status fields are ints, so map each flag to 1 or 0.
	asGauge := func(running bool) int {
		if running {
			return 1
		}
		return 0
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	c.smbSharesCount, c.smbServiceStatus = smbShares, asGauge(smbStatus)
	c.nfsExportsCount, c.nfsServiceStatus = nfsExports, asGauge(nfsStatus)
	c.iscsiTargetsCount, c.iscsiServiceStatus = iscsiTargets, asGauge(iscsiStatus)
	c.lastUpdate = time.Now()
}
// UpdateJobMetrics overwrites the stored job figures with the given total,
// running, completed and failed values and refreshes lastUpdate.
func (c *Collector) UpdateJobMetrics(total, running, completed, failed int) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.jobsTotal, c.jobsRunning, c.jobsCompleted, c.jobsFailed = total, running, completed, failed
	c.lastUpdate = time.Now()
}
// SetUptime records the uptime, in seconds, that Collect reports as
// atlas_uptime_seconds. It does not modify lastUpdate.
func (c *Collector) SetUptime(seconds int64) {
	c.mu.Lock()
	c.uptimeSeconds = seconds
	c.mu.Unlock()
}
// Collect renders every metric held by the collector in the Prometheus text
// exposition format and returns the result as one string.
//
// Each metric family is written as a "# HELP" line, a "# TYPE" line and a
// single sample line. Families are separated by one blank line and the output
// ends with a single newline. The read lock is held for the whole call, so
// all values come from one consistent snapshot.
func (c *Collector) Collect() string {
	c.mu.RLock()
	defer c.mu.RUnlock()

	// Unlabelled single-sample families, in exposition order.
	//
	// NOTE(review): the two job *_total families are typed "counter" but just
	// mirror whatever UpdateJobMetrics last stored. If those values can ever
	// decrease, "gauge" would be the safer TYPE. Confirm with the caller
	// before changing it, since dashboards may depend on the current type.
	families := []struct {
		name, kind, help string
		value            any
	}{
		// System
		{"atlas_uptime_seconds", "gauge", "System uptime in seconds", c.uptimeSeconds},

		// ZFS
		{"atlas_zfs_pools_total", "gauge", "Total number of ZFS pools", c.poolCount},
		{"atlas_zfs_datasets_total", "gauge", "Total number of ZFS datasets", c.datasetCount},
		{"atlas_zfs_zvols_total", "gauge", "Total number of ZFS ZVOLs", c.zvolCount},
		{"atlas_zfs_snapshots_total", "gauge", "Total number of ZFS snapshots", c.snapshotCount},
		{"atlas_zfs_capacity_bytes", "gauge", "Total ZFS pool capacity in bytes", c.totalCapacity},
		{"atlas_zfs_allocated_bytes", "gauge", "Total ZFS pool allocated space in bytes", c.totalAllocated},
		{"atlas_zfs_free_bytes", "gauge", "Total ZFS pool free space in bytes", c.totalFree},

		// Storage services
		{"atlas_smb_shares_total", "gauge", "Total number of SMB shares", c.smbSharesCount},
		{"atlas_nfs_exports_total", "gauge", "Total number of NFS exports", c.nfsExportsCount},
		{"atlas_iscsi_targets_total", "gauge", "Total number of iSCSI targets", c.iscsiTargetsCount},
		{"atlas_smb_service_status", "gauge", "SMB service status (1=running, 0=stopped)", c.smbServiceStatus},
		{"atlas_nfs_service_status", "gauge", "NFS service status (1=running, 0=stopped)", c.nfsServiceStatus},
		{"atlas_iscsi_service_status", "gauge", "iSCSI service status (1=running, 0=stopped)", c.iscsiServiceStatus},

		// Jobs
		{"atlas_jobs_total", "gauge", "Total number of jobs", c.jobsTotal},
		{"atlas_jobs_running", "gauge", "Number of running jobs", c.jobsRunning},
		{"atlas_jobs_completed_total", "counter", "Total number of completed jobs", c.jobsCompleted},
		{"atlas_jobs_failed_total", "counter", "Total number of failed jobs", c.jobsFailed},

		// API liveness: always 1 when this code is running.
		{"atlas_up", "gauge", "Whether the atlas-api process is up", 1},
	}

	// Build info carries its version as a label, so it is written literally
	// rather than through the table above.
	const buildInfo = "# HELP atlas_build_info Build information\n" +
		"# TYPE atlas_build_info gauge\n" +
		`atlas_build_info{version="v0.1.0-dev"} 1` + "\n"

	// Append into one pre-sized buffer. Repeated string concatenation would
	// re-copy the whole output on every step.
	buf := make([]byte, 0, 4096)
	buf = append(buf, buildInfo...)

	for _, f := range families {
		// The leading "\n" produces the blank line that separates this family
		// from the previous one; the final family thus ends with one newline.
		buf = append(buf, fmt.Sprintf("\n# HELP %s %s\n# TYPE %s %s\n%s %d\n",
			f.name, f.help, f.name, f.kind, f.name, f.value)...)
	}

	return string(buf)
}