package config

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
	"sync"
	"time"

	"gopkg.in/yaml.v3"
)

// PeerInfo identifies a cluster member as known to all peers.
//
// CertPEM rides along so the daemon can populate trust.yaml when a
// new node joins: a follower receiving an updated cluster.yaml from
// the master trusts the master, and therefore trusts the peer
// certificates it forwards. Without this, mTLS between new and old
// peers would never succeed because neither would have the other in
// its trust store.
type PeerInfo struct {
	NodeID      string `yaml:"node_id"`
	Advertise   string `yaml:"advertise"`
	Fingerprint string `yaml:"fingerprint"`
	CertPEM     string `yaml:"cert_pem,omitempty"`
}

// CheckType enumerates the supported probe kinds.
type CheckType string

const (
	CheckHTTP CheckType = "http"
	CheckTCP  CheckType = "tcp"
	CheckICMP CheckType = "icmp"
)

// Check describes a single monitored target.
type Check struct {
	ID       string        `yaml:"id"`
	Name     string        `yaml:"name"`
	Type     CheckType     `yaml:"type"`
	Target   string        `yaml:"target"`   // URL, host:port, or host
	Interval time.Duration `yaml:"interval"` // default 30s
	Timeout  time.Duration `yaml:"timeout"`  // default 10s

	// HTTP-only options.
	ExpectStatus int    `yaml:"expect_status,omitempty"`
	BodyMatch    string `yaml:"body_match,omitempty"`

	// AlertIDs lists which configured alerts fire when this check
	// transitions state.
	AlertIDs []string `yaml:"alert_ids,omitempty"`
}

// AlertType enumerates supported notifier kinds.
type AlertType string

const (
	AlertSMTP    AlertType = "smtp"
	AlertDiscord AlertType = "discord"
)

// Alert describes a single notifier destination.
type Alert struct {
	ID   string    `yaml:"id"`
	Name string    `yaml:"name"`
	Type AlertType `yaml:"type"`

	// SMTP options.
	SMTPHost     string   `yaml:"smtp_host,omitempty"`
	SMTPPort     int      `yaml:"smtp_port,omitempty"`
	SMTPUser     string   `yaml:"smtp_user,omitempty"`
	SMTPPassword string   `yaml:"smtp_password,omitempty"`
	SMTPFrom     string   `yaml:"smtp_from,omitempty"`
	SMTPTo       []string `yaml:"smtp_to,omitempty"`
	SMTPStartTLS bool     `yaml:"smtp_starttls,omitempty"`

	// Discord options.
	DiscordWebhook string `yaml:"discord_webhook,omitempty"`
}

// ClusterConfig is the replicated cluster state. The Version field
// strictly increases on every mutation; the master is the only node
// that bumps it.
//
// All exported methods are safe for concurrent use; mu guards every
// field below it. Unexported fields are ignored by yaml.Marshal, so
// neither the mutex nor the callback list is ever serialized.
type ClusterConfig struct {
	Version   uint64     `yaml:"version"`
	UpdatedAt time.Time  `yaml:"updated_at"`
	UpdatedBy string     `yaml:"updated_by"`
	Peers     []PeerInfo `yaml:"peers"`
	Checks    []Check    `yaml:"checks"`
	Alerts    []Alert    `yaml:"alerts"`

	mu       sync.RWMutex `yaml:"-"`
	onChange []func() // fired after any successful Mutate/Replace
}

// OnChange registers a callback fired after every successful Mutate
// or Replace. Callbacks run synchronously on the mutating goroutine
// AFTER the lock is released — they may safely call back into the
// config to read snapshots.
func (c *ClusterConfig) OnChange(fn func()) {
	c.mu.Lock()
	c.onChange = append(c.onChange, fn)
	c.mu.Unlock()
}

// fireOnChange invokes every registered callback. The slice is copied
// under the read lock and the callbacks run after it is released, so
// a callback may re-enter the config (e.g. Snapshot) without
// deadlocking.
func (c *ClusterConfig) fireOnChange() {
	c.mu.RLock()
	cbs := append([]func(){}, c.onChange...)
	c.mu.RUnlock()
	for _, fn := range cbs {
		fn()
	}
}

// LoadClusterConfig reads cluster.yaml. A missing file returns an
// empty (version 0) config — callers should treat that as the
// pre-bootstrap state.
func LoadClusterConfig() (*ClusterConfig, error) {
	raw, err := os.ReadFile(ClusterFilePath())
	if err != nil {
		// Absence is the expected pre-bootstrap state, not an error.
		if errors.Is(err, fs.ErrNotExist) {
			return &ClusterConfig{}, nil
		}
		return nil, fmt.Errorf("read cluster.yaml: %w", err)
	}
	cfg := &ClusterConfig{}
	if err := yaml.Unmarshal(raw, cfg); err != nil {
		return nil, fmt.Errorf("parse cluster.yaml: %w", err)
	}
	return cfg, nil
}

// Save writes cluster.yaml atomically. Caller is responsible for
// having already taken any external locks.
func (c *ClusterConfig) Save() error { c.mu.RLock() defer c.mu.RUnlock() out, err := yaml.Marshal(c) if err != nil { return err } return AtomicWrite(ClusterFilePath(), out, 0o600) } // Snapshot returns a deep-enough copy of the config that can be // safely serialized while the original continues to mutate. func (c *ClusterConfig) Snapshot() *ClusterConfig { c.mu.RLock() defer c.mu.RUnlock() cp := &ClusterConfig{ Version: c.Version, UpdatedAt: c.UpdatedAt, UpdatedBy: c.UpdatedBy, Peers: append([]PeerInfo(nil), c.Peers...), Checks: append([]Check(nil), c.Checks...), Alerts: append([]Alert(nil), c.Alerts...), } return cp } // Mutate runs fn under the config write lock, bumps Version on // success, and writes the file. Only the master should call this. func (c *ClusterConfig) Mutate(byNode string, fn func(*ClusterConfig) error) error { c.mu.Lock() if err := fn(c); err != nil { c.mu.Unlock() return err } c.Version++ c.UpdatedAt = time.Now().UTC() c.UpdatedBy = byNode out, err := yaml.Marshal(c) if err != nil { c.mu.Unlock() return err } if err := AtomicWrite(ClusterFilePath(), out, 0o600); err != nil { c.mu.Unlock() return err } c.mu.Unlock() c.fireOnChange() return nil } // Replace overwrites the local config with an incoming snapshot if // that snapshot has a strictly greater version. Returns true if // applied. func (c *ClusterConfig) Replace(incoming *ClusterConfig) (bool, error) { c.mu.Lock() if incoming.Version <= c.Version { c.mu.Unlock() return false, nil } c.Version = incoming.Version c.UpdatedAt = incoming.UpdatedAt c.UpdatedBy = incoming.UpdatedBy c.Peers = append([]PeerInfo(nil), incoming.Peers...) c.Checks = append([]Check(nil), incoming.Checks...) c.Alerts = append([]Alert(nil), incoming.Alerts...) 
out, err := yaml.Marshal(c) if err != nil { c.mu.Unlock() return false, err } if err := AtomicWrite(ClusterFilePath(), out, 0o600); err != nil { c.mu.Unlock() return false, err } c.mu.Unlock() c.fireOnChange() return true, nil } // FindAlert returns the alert with the given ID or name, or nil if // no entry matches. func (c *ClusterConfig) FindAlert(idOrName string) *Alert { c.mu.RLock() defer c.mu.RUnlock() for i := range c.Alerts { if c.Alerts[i].ID == idOrName || c.Alerts[i].Name == idOrName { cp := c.Alerts[i] return &cp } } return nil } // QuorumSize returns the minimum number of live nodes required for // the cluster to make progress: floor(N/2) + 1. func (c *ClusterConfig) QuorumSize() int { c.mu.RLock() defer c.mu.RUnlock() n := len(c.Peers) if n == 0 { return 1 } return n/2 + 1 }