Fix Previously up services are alerted as going back up if the master goes down #1
This removes the alert on unknown -> up; unknown -> down still alerts, by design.
This commit is contained in:
@@ -27,7 +27,7 @@ func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Disp
|
||||
|
||||
// OnTransition is wired as checks.TransitionFn.
|
||||
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
|
||||
if to == checks.StateUnknown {
|
||||
if !shouldAlert(from, to) {
|
||||
return
|
||||
}
|
||||
alerts := d.cluster.EffectiveAlertsFor(check)
|
||||
@@ -77,6 +77,25 @@ func (d *Dispatcher) Test(alertID string) error {
|
||||
return d.dispatchOne(alert, msg)
|
||||
}
|
||||
|
||||
// shouldAlert decides whether a committed state transition warrants
|
||||
// firing the configured alert channels.
|
||||
//
|
||||
// A fresh master's aggregator starts every check at StateUnknown, so
|
||||
// the first successful evaluation always commits Unknown→Up. Without
|
||||
// filtering, every master failover (or daemon restart) would spam an
|
||||
// "is now UP" alert for every healthy check. We treat Unknown→Up as a
|
||||
// silent cold start; real recoveries (Down→Up) and any transition to
|
||||
// Down still alert.
|
||||
func shouldAlert(from, to checks.State) bool {
|
||||
if to == checks.StateUnknown {
|
||||
return false
|
||||
}
|
||||
if from == checks.StateUnknown && to == checks.StateUp {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
|
||||
switch a.Type {
|
||||
case config.AlertSMTP:
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.cer.sh/axodouble/quptime/internal/checks"
|
||||
)
|
||||
|
||||
func TestShouldAlertFiltersColdStartUp(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
from checks.State
|
||||
to checks.State
|
||||
want bool
|
||||
}{
|
||||
{"cold start to up (master failover / daemon restart)", checks.StateUnknown, checks.StateUp, false},
|
||||
{"cold start to down still alerts", checks.StateUnknown, checks.StateDown, true},
|
||||
{"real recovery alerts", checks.StateDown, checks.StateUp, true},
|
||||
{"regression alerts", checks.StateUp, checks.StateDown, true},
|
||||
{"stale (up to unknown) suppressed", checks.StateUp, checks.StateUnknown, false},
|
||||
{"stale (down to unknown) suppressed", checks.StateDown, checks.StateUnknown, false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
if got := shouldAlert(c.from, c.to); got != c.want {
|
||||
t.Errorf("shouldAlert(%s→%s) = %v, want %v", c.from, c.to, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user