Fix: previously-up services are alerted as going back up when the master goes down #1
Container image / image (push) Successful in 1m45s
Release / release (push) Successful in 1m44s

This gets rid of the alert on unknown -> up; an unknown -> down transition will still alert, by design.
This commit is contained in:
2026-05-15 07:01:29 +00:00
parent 8638ab5432
commit 3c85caabcf
2 changed files with 50 additions and 1 deletions
+20 -1
View File
@@ -27,7 +27,7 @@ func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Disp
// OnTransition is wired as checks.TransitionFn.
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
if to == checks.StateUnknown {
if !shouldAlert(from, to) {
return
}
alerts := d.cluster.EffectiveAlertsFor(check)
@@ -77,6 +77,25 @@ func (d *Dispatcher) Test(alertID string) error {
return d.dispatchOne(alert, msg)
}
// shouldAlert reports whether a committed transition from one check
// state to another should fire the configured alert channels.
//
// Two kinds of transition are suppressed:
//
//   - anything that lands on StateUnknown;
//   - Unknown→Up, which is treated as a silent cold start. A fresh
//     master's aggregator begins every check at StateUnknown, so the
//     first successful evaluation always commits Unknown→Up; alerting
//     on it would send an "is now UP" message for every healthy check
//     after each master failover or daemon restart.
//
// Every other transition alerts, including real recoveries (Down→Up)
// and any move to Down (Unknown→Down included).
func shouldAlert(from, to checks.State) bool {
	switch {
	case to == checks.StateUnknown:
		// Landing on Unknown is never alert-worthy.
		return false
	case to == checks.StateUp && from == checks.StateUnknown:
		// Cold start: the first observation of a healthy check.
		return false
	default:
		return true
	}
}
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
switch a.Type {
case config.AlertSMTP: