QUptime/internal/alerts/dispatcher.go
Commit 3c85caabcf by Axodouble
Fix: previously-up services are alerted as going back up if the master goes down (#1)
This gets rid of the alert on unknown -> up; unknown -> down will still alert, by design.
2026-05-15 07:01:29 +00:00


package alerts

import (
	"fmt"
	"log"

	"git.cer.sh/axodouble/quptime/internal/checks"
	"git.cer.sh/axodouble/quptime/internal/config"
)

// Dispatcher fans an aggregator transition out to every alert listed
// on the check. Errors are logged but never propagated: alerting must
// not block the aggregation pipeline.
type Dispatcher struct {
	cluster *config.ClusterConfig
	selfID  string
	logger  *log.Logger
}

// New constructs a Dispatcher.
func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Dispatcher {
	if logger == nil {
		logger = log.Default()
	}
	return &Dispatcher{cluster: cluster, selfID: selfID, logger: logger}
}

// OnTransition is wired as checks.TransitionFn.
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
	if !shouldAlert(from, to) {
		return
	}
	alerts := d.cluster.EffectiveAlertsFor(check)
	if len(alerts) == 0 && len(check.AlertIDs) > 0 {
		d.logger.Printf("alerts: check %q references alerts but none resolved", check.Name)
	}
	for i := range alerts {
		alert := alerts[i]
		msg, err := RenderFor(&alert, d.selfID, check, from, to, snap)
		if err != nil {
			d.logger.Printf("alerts: %q template: %v — falling back to default", alert.Name, err)
		}
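		// Note the log line above mentions falling back to the default
		// message: msg is still dispatched below even after a template error.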
		if err := d.dispatchOne(&alert, msg); err != nil {
			d.logger.Printf("alerts: %q via %s: %v", alert.Name, alert.Type, err)
		}
	}
}
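
// Example wiring from a consumer of this package (illustrative sketch only):
// OnTransition satisfies checks.TransitionFn, so a Dispatcher can be handed
// to whatever component accepts that callback. clusterCfg and "node-1" are
// placeholder names.
//
//	d := alerts.New(clusterCfg, "node-1", log.Default())
//	var onTransition checks.TransitionFn = d.OnTransition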

// Test sends a one-shot test message to the named alert. It returns an
// error so the CLI can surface failures interactively. If the alert
// carries custom templates, they are exercised against a synthetic
// Up -> Down transition for a placeholder HTTP check so the operator
// can confirm the template renders before a real outage.
func (d *Dispatcher) Test(alertID string) error {
	alert := d.cluster.FindAlert(alertID)
	if alert == nil {
		return fmt.Errorf("alert %q not found", alertID)
	}
	if alert.SubjectTemplate == "" && alert.BodyTemplate == "" {
		msg := Message{
			Subject: "[quptime] test alert",
			Body:    fmt.Sprintf("This is a test of alert %q from node %s.\nIf you see this, the alert channel is wired correctly.\n", alert.Name, d.selfID),
		}
		return d.dispatchOne(alert, msg)
	}
	sample := &config.Check{
		ID:     "test-check",
		Name:   "test-check",
		Type:   config.CheckHTTP,
		Target: "https://example.com",
	}
	snap := checks.Snapshot{Reports: 3, OKCount: 0, NotOK: 3, Detail: "synthetic test failure"}
	msg, err := RenderFor(alert, d.selfID, sample, checks.StateUp, checks.StateDown, snap)
	if err != nil {
		return fmt.Errorf("render template: %w", err)
	}
	return d.dispatchOne(alert, msg)
}
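
// Example (illustrative): a CLI handler could surface a failed test send like
// this; the alert ID "ops-email" is a placeholder.
//
//	if err := d.Test("ops-email"); err != nil {
//		fmt.Fprintf(os.Stderr, "alert test failed: %v\n", err)
//	}
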
// shouldAlert decides whether a committed state transition warrants
// firing the configured alert channels.
//
// A fresh master's aggregator starts every check at StateUnknown, so
// the first successful evaluation always commits Unknown→Up. Without
// filtering, every master failover (or daemon restart) would spam an
// "is now UP" alert for every healthy check. We treat Unknown→Up as a
// silent cold start; real recoveries (Down→Up) and any transition to
// Down still alert.
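//
// Concretely, the rule below yields:
//
//	Unknown -> Up       no alert (cold start)
//	Unknown -> Down     alert
//	Up      -> Down     alert
//	Down    -> Up       alert
//	any     -> Unknown  no alert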
func shouldAlert(from, to checks.State) bool {
	if to == checks.StateUnknown {
		return false
	}
	if from == checks.StateUnknown && to == checks.StateUp {
		return false
	}
	return true
}

// dispatchOne hands a rendered message to the transport matching the alert's
// configured type.
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
	switch a.Type {
	case config.AlertSMTP:
		return sendSMTP(a, msg)
	case config.AlertDiscord:
		return sendDiscord(a, msg)
	default:
		return fmt.Errorf("unknown alert type %q", a.Type)
	}
}