109 lines
3.5 KiB
Go
109 lines
3.5 KiB
Go
package alerts
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
|
|
"git.cer.sh/axodouble/quptime/internal/checks"
|
|
"git.cer.sh/axodouble/quptime/internal/config"
|
|
)
|
|
|
|
// Dispatcher fans an aggregator transition out to every alert listed
|
|
// on the check. Errors are logged but never propagated: alerting must
|
|
// not block the aggregation pipeline.
|
|
type Dispatcher struct {
|
|
cluster *config.ClusterConfig
|
|
selfID string
|
|
logger *log.Logger
|
|
}
|
|
|
|
// New constructs a Dispatcher.
|
|
func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Dispatcher {
|
|
if logger == nil {
|
|
logger = log.Default()
|
|
}
|
|
return &Dispatcher{cluster: cluster, selfID: selfID, logger: logger}
|
|
}
|
|
|
|
// OnTransition is wired as checks.TransitionFn.
|
|
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
|
|
if !shouldAlert(from, to) {
|
|
return
|
|
}
|
|
alerts := d.cluster.EffectiveAlertsFor(check)
|
|
if len(alerts) == 0 && len(check.AlertIDs) > 0 {
|
|
d.logger.Printf("alerts: check %q references alerts but none resolved", check.Name)
|
|
}
|
|
for i := range alerts {
|
|
alert := alerts[i]
|
|
msg, err := RenderFor(&alert, d.selfID, check, from, to, snap)
|
|
if err != nil {
|
|
d.logger.Printf("alerts: %q template: %v — falling back to default", alert.Name, err)
|
|
}
|
|
if err := d.dispatchOne(&alert, msg); err != nil {
|
|
d.logger.Printf("alerts: %q via %s: %v", alert.Name, alert.Type, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Test sends a one-shot test message to the named alert. Returns an
|
|
// error so the CLI can surface failures interactively. If the alert
|
|
// carries custom templates they are exercised against a synthetic
|
|
// "homepage going DOWN" transition so the operator can confirm the
|
|
// template renders before a real outage.
|
|
func (d *Dispatcher) Test(alertID string) error {
|
|
alert := d.cluster.FindAlert(alertID)
|
|
if alert == nil {
|
|
return fmt.Errorf("alert %q not found", alertID)
|
|
}
|
|
if alert.SubjectTemplate == "" && alert.BodyTemplate == "" {
|
|
msg := Message{
|
|
Subject: "[quptime] test alert",
|
|
Body: fmt.Sprintf("This is a test of alert %q from node %s.\nIf you see this, the alert channel is wired correctly.\n", alert.Name, d.selfID),
|
|
}
|
|
return d.dispatchOne(alert, msg)
|
|
}
|
|
sample := &config.Check{
|
|
ID: "test-check",
|
|
Name: "test-check",
|
|
Type: config.CheckHTTP,
|
|
Target: "https://example.com",
|
|
}
|
|
snap := checks.Snapshot{Reports: 3, OKCount: 0, NotOK: 3, Detail: "synthetic test failure"}
|
|
msg, err := RenderFor(alert, d.selfID, sample, checks.StateUp, checks.StateDown, snap)
|
|
if err != nil {
|
|
return fmt.Errorf("render template: %w", err)
|
|
}
|
|
return d.dispatchOne(alert, msg)
|
|
}
|
|
|
|
// shouldAlert decides whether a committed state transition warrants
|
|
// firing the configured alert channels.
|
|
//
|
|
// A fresh master's aggregator starts every check at StateUnknown, so
|
|
// the first successful evaluation always commits Unknown→Up. Without
|
|
// filtering, every master failover (or daemon restart) would spam an
|
|
// "is now UP" alert for every healthy check. We treat Unknown→Up as a
|
|
// silent cold start; real recoveries (Down→Up) and any transition to
|
|
// Down still alert.
|
|
func shouldAlert(from, to checks.State) bool {
|
|
if to == checks.StateUnknown {
|
|
return false
|
|
}
|
|
if from == checks.StateUnknown && to == checks.StateUp {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
|
|
switch a.Type {
|
|
case config.AlertSMTP:
|
|
return sendSMTP(a, msg)
|
|
case config.AlertDiscord:
|
|
return sendDiscord(a, msg)
|
|
default:
|
|
return fmt.Errorf("unknown alert type %q", a.Type)
|
|
}
|
|
}
|