Fix: previously-up services are alerted as going back up if the master goes down (#1)
This gets rid of the alert on unknown -> up; unknown -> down will still alert, by design.
This commit is contained in:
@@ -27,7 +27,7 @@ func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Disp
|
|||||||
|
|
||||||
// OnTransition is wired as checks.TransitionFn.
|
// OnTransition is wired as checks.TransitionFn.
|
||||||
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
|
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
|
||||||
if to == checks.StateUnknown {
|
if !shouldAlert(from, to) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
alerts := d.cluster.EffectiveAlertsFor(check)
|
alerts := d.cluster.EffectiveAlertsFor(check)
|
||||||
@@ -77,6 +77,25 @@ func (d *Dispatcher) Test(alertID string) error {
|
|||||||
return d.dispatchOne(alert, msg)
|
return d.dispatchOne(alert, msg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// shouldAlert decides whether a committed state transition warrants
|
||||||
|
// firing the configured alert channels.
|
||||||
|
//
|
||||||
|
// A fresh master's aggregator starts every check at StateUnknown, so
|
||||||
|
// the first successful evaluation always commits Unknown→Up. Without
|
||||||
|
// filtering, every master failover (or daemon restart) would spam an
|
||||||
|
// "is now UP" alert for every healthy check. We treat Unknown→Up as a
|
||||||
|
// silent cold start; real recoveries (Down→Up) and any transition to
|
||||||
|
// Down still alert.
|
||||||
|
func shouldAlert(from, to checks.State) bool {
|
||||||
|
if to == checks.StateUnknown {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if from == checks.StateUnknown && to == checks.StateUp {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
|
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
|
||||||
switch a.Type {
|
switch a.Type {
|
||||||
case config.AlertSMTP:
|
case config.AlertSMTP:
|
||||||
|
|||||||
@@ -0,0 +1,30 @@
|
|||||||
|
package alerts
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"git.cer.sh/axodouble/quptime/internal/checks"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestShouldAlertFiltersColdStartUp(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
from checks.State
|
||||||
|
to checks.State
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{"cold start to up (master failover / daemon restart)", checks.StateUnknown, checks.StateUp, false},
|
||||||
|
{"cold start to down still alerts", checks.StateUnknown, checks.StateDown, true},
|
||||||
|
{"real recovery alerts", checks.StateDown, checks.StateUp, true},
|
||||||
|
{"regression alerts", checks.StateUp, checks.StateDown, true},
|
||||||
|
{"stale (up to unknown) suppressed", checks.StateUp, checks.StateUnknown, false},
|
||||||
|
{"stale (down to unknown) suppressed", checks.StateDown, checks.StateUnknown, false},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.name, func(t *testing.T) {
|
||||||
|
if got := shouldAlert(c.from, c.to); got != c.want {
|
||||||
|
t.Errorf("shouldAlert(%s→%s) = %v, want %v", c.from, c.to, got, c.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user