diff --git a/internal/alerts/dispatcher.go b/internal/alerts/dispatcher.go
index 3163fa4..49329a3 100644
--- a/internal/alerts/dispatcher.go
+++ b/internal/alerts/dispatcher.go
@@ -27,7 +27,7 @@ func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Disp
 
 // OnTransition is wired as checks.TransitionFn.
 func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
-	if to == checks.StateUnknown {
+	if !shouldAlert(from, to) {
 		return
 	}
 	alerts := d.cluster.EffectiveAlertsFor(check)
@@ -77,6 +77,29 @@ func (d *Dispatcher) Test(alertID string) error {
 	return d.dispatchOne(alert, msg)
 }
 
+// shouldAlert reports whether a committed from→to state transition
+// should fire the configured alert channels.
+//
+// Every check on a freshly started master's aggregator begins at
+// StateUnknown, so its first successful evaluation always commits
+// Unknown→Up. Alerting on that would send an "is now UP" message for
+// every healthy check after each master failover or daemon restart,
+// so Unknown→Up is treated as a silent cold start. Transitions into
+// StateUnknown never alert; real recoveries (Down→Up) and any
+// transition to Down, including Unknown→Down, still do.
+func shouldAlert(from, to checks.State) bool {
+	switch to {
+	case checks.StateUnknown:
+		// Never alert on a transition into Unknown.
+		return false
+	case checks.StateUp:
+		// Up only alerts when it follows a known state.
+		return from != checks.StateUnknown
+	default:
+		return true
+	}
+}
+
 func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
 	switch a.Type {
 	case config.AlertSMTP:
diff --git a/internal/alerts/dispatcher_test.go b/internal/alerts/dispatcher_test.go
new file mode 100644
index 0000000..a0bda76
--- /dev/null
+++ b/internal/alerts/dispatcher_test.go
@@ -0,0 +1,33 @@
+package alerts
+
+import (
+	"testing"
+
+	"git.cer.sh/axodouble/quptime/internal/checks"
+)
+
+// TestShouldAlertFiltersColdStartUp pins the shouldAlert decision for
+// each transition listed in the table, one subtest per row.
+func TestShouldAlertFiltersColdStartUp(t *testing.T) {
+	type transition struct {
+		name     string
+		from, to checks.State
+		want     bool
+	}
+	tests := []transition{
+		{name: "cold start to up (master failover / daemon restart)", from: checks.StateUnknown, to: checks.StateUp, want: false},
+		{name: "cold start to down still alerts", from: checks.StateUnknown, to: checks.StateDown, want: true},
+		{name: "real recovery alerts", from: checks.StateDown, to: checks.StateUp, want: true},
+		{name: "regression alerts", from: checks.StateUp, to: checks.StateDown, want: true},
+		{name: "stale (up to unknown) suppressed", from: checks.StateUp, to: checks.StateUnknown, want: false},
+		{name: "stale (down to unknown) suppressed", from: checks.StateDown, to: checks.StateUnknown, want: false},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := shouldAlert(tc.from, tc.to)
+			if got != tc.want {
+				t.Errorf("shouldAlert(%s→%s) = %v, want %v", tc.from, tc.to, got, tc.want)
+			}
+		})
+	}
+}