// Package metrics collects storage, service, and job statistics and renders
// them in the Prometheus text exposition format.
package metrics

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"gitea.avt.data-center.id/othman.suseno/atlas/internal/models"
)

// Collector gathers system metrics.
//
// All fields are guarded by mu. The Update* and Set* methods take the write
// lock; Collect takes the read lock. A Collector contains a mutex and must
// therefore be used through a pointer and never copied.
type Collector struct {
	mu sync.RWMutex

	// ZFS metrics
	poolCount      int
	datasetCount   int
	zvolCount      int
	snapshotCount  int
	totalCapacity  uint64 // sum of Size over all pools
	totalAllocated uint64 // sum of Allocated over all pools
	totalFree      uint64 // sum of Free over all pools

	// Service metrics
	smbSharesCount     int
	nfsExportsCount    int
	iscsiTargetsCount  int
	smbServiceStatus   int // 1 = running, 0 = stopped
	nfsServiceStatus   int // 1 = running, 0 = stopped
	iscsiServiceStatus int // 1 = running, 0 = stopped

	// Job metrics
	jobsTotal     int
	jobsRunning   int
	jobsCompleted int
	jobsFailed    int

	// System metrics
	uptimeSeconds int64
	// lastUpdate is refreshed by the Update* methods (not by SetUptime).
	// It is not exported by Collect.
	lastUpdate time.Time
}

// NewCollector creates a new metrics collector with every metric at zero and
// lastUpdate set to the current time.
func NewCollector() *Collector {
	return &Collector{
		lastUpdate: time.Now(),
	}
}

// UpdateZFSMetrics updates ZFS-related metrics.
//
// The object counts are taken from the lengths of the supplied slices, and the
// capacity totals are recomputed from scratch as the sum over pools, so the
// previous totals are discarded on every call.
func (c *Collector) UpdateZFSMetrics(pools []models.Pool, datasets []models.Dataset, zvols []models.ZVOL, snapshots []models.Snapshot) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.poolCount = len(pools)
	c.datasetCount = len(datasets)
	c.zvolCount = len(zvols)
	c.snapshotCount = len(snapshots)

	// Accumulate in locals and index the slice so each Pool is not copied
	// per iteration.
	var capacity, allocated, free uint64
	for i := range pools {
		capacity += pools[i].Size
		allocated += pools[i].Allocated
		free += pools[i].Free
	}
	c.totalCapacity = capacity
	c.totalAllocated = allocated
	c.totalFree = free

	c.lastUpdate = time.Now()
}

// UpdateServiceMetrics updates storage service metrics.
//
// smbShares, nfsExports and iscsiTargets are stored as given. Each status flag
// is stored as 1 when true (running) and 0 when false (stopped).
func (c *Collector) UpdateServiceMetrics(smbShares, nfsExports, iscsiTargets int, smbStatus, nfsStatus, iscsiStatus bool) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// statusValue maps a running flag to the 1/0 value exported by Collect.
	statusValue := func(running bool) int {
		if running {
			return 1
		}
		return 0
	}

	c.smbSharesCount = smbShares
	c.nfsExportsCount = nfsExports
	c.iscsiTargetsCount = iscsiTargets

	c.smbServiceStatus = statusValue(smbStatus)
	c.nfsServiceStatus = statusValue(nfsStatus)
	c.iscsiServiceStatus = statusValue(iscsiStatus)

	c.lastUpdate = time.Now()
}

// UpdateJobMetrics updates job-related metrics.
//
// The four values overwrite the stored ones; nothing is accumulated.
func (c *Collector) UpdateJobMetrics(total, running, completed, failed int) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.jobsTotal = total
	c.jobsRunning = running
	c.jobsCompleted = completed
	c.jobsFailed = failed

	c.lastUpdate = time.Now()
}

// SetUptime sets the system uptime in seconds. Unlike the Update* methods it
// does not refresh lastUpdate.
func (c *Collector) SetUptime(seconds int64) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.uptimeSeconds = seconds
}

// Collect returns metrics in Prometheus format.
//
// The read lock is held for the whole call, so the returned text reflects a
// single consistent view of the stored values. Every metric is written as a
// "# HELP" line, a "# TYPE" line and one sample line, followed by a blank
// line; the final atlas_up sample ends the output with a single newline.
func (c *Collector) Collect() string {
	c.mu.RLock()
	defer c.mu.RUnlock()

	// Build into a pre-sized strings.Builder instead of repeated string
	// concatenation, which would reallocate and recopy the growing output on
	// every append. 4 KiB comfortably holds the fixed set of metrics below.
	var b strings.Builder
	b.Grow(4096)

	// emit writes one unlabeled integer sample with its HELP and TYPE header.
	// Writes to a strings.Builder never fail, so the Fprintf result is not
	// checked.
	emit := func(name, kind, help string, value interface{}) {
		fmt.Fprintf(&b, "# HELP %s %s\n# TYPE %s %s\n%s %d\n\n", name, help, name, kind, name, value)
	}

	// Build info (carries a label, so it is written out literally).
	b.WriteString("# HELP atlas_build_info Build information\n")
	b.WriteString("# TYPE atlas_build_info gauge\n")
	b.WriteString(`atlas_build_info{version="v0.1.0-dev"} 1` + "\n\n")

	// System uptime
	emit("atlas_uptime_seconds", "gauge", "System uptime in seconds", c.uptimeSeconds)

	// ZFS metrics
	emit("atlas_zfs_pools_total", "gauge", "Total number of ZFS pools", c.poolCount)
	emit("atlas_zfs_datasets_total", "gauge", "Total number of ZFS datasets", c.datasetCount)
	emit("atlas_zfs_zvols_total", "gauge", "Total number of ZFS ZVOLs", c.zvolCount)
	emit("atlas_zfs_snapshots_total", "gauge", "Total number of ZFS snapshots", c.snapshotCount)
	emit("atlas_zfs_capacity_bytes", "gauge", "Total ZFS pool capacity in bytes", c.totalCapacity)
	emit("atlas_zfs_allocated_bytes", "gauge", "Total ZFS pool allocated space in bytes", c.totalAllocated)
	emit("atlas_zfs_free_bytes", "gauge", "Total ZFS pool free space in bytes", c.totalFree)

	// Service metrics
	emit("atlas_smb_shares_total", "gauge", "Total number of SMB shares", c.smbSharesCount)
	emit("atlas_nfs_exports_total", "gauge", "Total number of NFS exports", c.nfsExportsCount)
	emit("atlas_iscsi_targets_total", "gauge", "Total number of iSCSI targets", c.iscsiTargetsCount)
	emit("atlas_smb_service_status", "gauge", "SMB service status (1=running, 0=stopped)", c.smbServiceStatus)
	emit("atlas_nfs_service_status", "gauge", "NFS service status (1=running, 0=stopped)", c.nfsServiceStatus)
	emit("atlas_iscsi_service_status", "gauge", "iSCSI service status (1=running, 0=stopped)", c.iscsiServiceStatus)

	// Job metrics
	emit("atlas_jobs_total", "gauge", "Total number of jobs", c.jobsTotal)
	emit("atlas_jobs_running", "gauge", "Number of running jobs", c.jobsRunning)
	// NOTE(review): the next two are declared as counters, but their values
	// are simply overwritten by UpdateJobMetrics and could decrease; confirm
	// that callers only ever pass non-decreasing totals.
	emit("atlas_jobs_completed_total", "counter", "Total number of completed jobs", c.jobsCompleted)
	emit("atlas_jobs_failed_total", "counter", "Total number of failed jobs", c.jobsFailed)

	// API status: constant 1, and the last sample, so no trailing blank line.
	b.WriteString("# HELP atlas_up Whether the atlas-api process is up\n")
	b.WriteString("# TYPE atlas_up gauge\n")
	b.WriteString("atlas_up 1\n")

	return b.String()
}