Added custom messages for uptime alerts
This commit is contained in:
@@ -188,6 +188,48 @@ specific default by adding the alert's ID or name to its
|
||||
`suppress_alert_ids` list in `cluster.yaml` (see "Edit cluster.yaml
|
||||
directly" below).
|
||||
|
||||
## Custom alert messages
|
||||
|
||||
Each alert can carry its own `subject_template` and `body_template`
|
||||
(Go `text/template` syntax). When set, they override the built-in
|
||||
formatting for that one alert; the default renderer is used otherwise.
|
||||
Discord ignores the subject template (it has no subject line).
|
||||
|
||||
```sh
|
||||
qu alert add discord oncall --webhook https://... \
|
||||
--body ':rotating_light: **{{.Check.Name}}** is now {{.Verb}}
|
||||
target: `{{.Check.Target}}`
|
||||
detail: {{.Snapshot.Detail}}'
|
||||
|
||||
# multi-line templates are easier from a file
|
||||
qu alert add smtp ops --host ... --from ... --to ... \
|
||||
--subject-file /etc/quptime/templates/ops.subject \
|
||||
--body-file /etc/quptime/templates/ops.body
|
||||
```
|
||||
|
||||
Available template variables:
|
||||
|
||||
| Variable | Meaning |
|
||||
|---|---|
|
||||
| `{{.Check.Name}}` | check name |
|
||||
| `{{.Check.Type}}` | `http` / `tcp` / `icmp` |
|
||||
| `{{.Check.Target}}` | URL or host:port being probed |
|
||||
| `{{.Check.ID}}` | UUID |
|
||||
| `{{.From}}` | previous state (`up` / `down` / `unknown`) |
|
||||
| `{{.To}}` | new state |
|
||||
| `{{.Verb}}` | `UP` / `DOWN` / `RECOVERED` |
|
||||
| `{{.Snapshot.Reports}}` | total per-node reports counted |
|
||||
| `{{.Snapshot.OKCount}}` | how many reported OK |
|
||||
| `{{.Snapshot.NotOK}}` | how many reported failure |
|
||||
| `{{.Snapshot.Detail}}` | first failure detail string |
|
||||
| `{{.NodeID}}` | master that dispatched |
|
||||
| `{{.When}}` | RFC3339 timestamp |
|
||||
|
||||
`qu alert test <name>` exercises the template against a synthetic
|
||||
DOWN transition of a sample check named `test-check`, so you can verify rendering before
|
||||
production traffic depends on it. A template parse or execution error
|
||||
falls back to the built-in format and is logged during normal dispatch; `qu alert test` reports the error instead of sending.
|
||||
|
||||
## Edit cluster.yaml directly
|
||||
|
||||
Anything you can do through the CLI you can also do by editing
|
||||
@@ -258,8 +300,8 @@ qu check add tcp <name> <host:port>
|
||||
qu check add icmp <name> <host>
|
||||
qu check list
|
||||
qu check remove <id-or-name>
|
||||
qu alert add smtp <name> --host … --port … --from … --to … [--user --password --starttls] [--default]
|
||||
qu alert add discord <name> --webhook … [--default]
|
||||
qu alert add smtp <name> --host … --port … --from … --to … [--user --password --starttls] [--default] [--subject … --body …]
|
||||
qu alert add discord <name> --webhook … [--default] [--body …]
|
||||
qu alert list / remove / test <id-or-name>
|
||||
qu alert default <id-or-name> on|off toggle default attachment to every check
|
||||
qu trust list / remove <node-id>
|
||||
|
||||
@@ -30,13 +30,16 @@ func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, sn
|
||||
if to == checks.StateUnknown {
|
||||
return
|
||||
}
|
||||
msg := Render(d.selfID, check, from, to, snap)
|
||||
alerts := d.cluster.EffectiveAlertsFor(check)
|
||||
if len(alerts) == 0 && len(check.AlertIDs) > 0 {
|
||||
d.logger.Printf("alerts: check %q references alerts but none resolved", check.Name)
|
||||
}
|
||||
for i := range alerts {
|
||||
alert := alerts[i]
|
||||
msg, err := RenderFor(&alert, d.selfID, check, from, to, snap)
|
||||
if err != nil {
|
||||
d.logger.Printf("alerts: %q template: %v — falling back to default", alert.Name, err)
|
||||
}
|
||||
if err := d.dispatchOne(&alert, msg); err != nil {
|
||||
d.logger.Printf("alerts: %q via %s: %v", alert.Name, alert.Type, err)
|
||||
}
|
||||
@@ -44,18 +47,35 @@ func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, sn
|
||||
}
|
||||
|
||||
// Test sends a one-shot test message to the named alert. Returns an
|
||||
// error so the CLI can surface failures interactively.
|
||||
// error so the CLI can surface failures interactively. If the alert
|
||||
// carries custom templates they are exercised against a synthetic
|
||||
// "homepage going DOWN" transition so the operator can confirm the
|
||||
// template renders before a real outage.
|
||||
func (d *Dispatcher) Test(alertID string) error {
|
||||
alert := d.cluster.FindAlert(alertID)
|
||||
if alert == nil {
|
||||
return fmt.Errorf("alert %q not found", alertID)
|
||||
}
|
||||
if alert.SubjectTemplate == "" && alert.BodyTemplate == "" {
|
||||
msg := Message{
|
||||
Subject: "[quptime] test alert",
|
||||
Body: fmt.Sprintf("This is a test of alert %q from node %s.\nIf you see this, the alert channel is wired correctly.\n", alert.Name, d.selfID),
|
||||
}
|
||||
return d.dispatchOne(alert, msg)
|
||||
}
|
||||
sample := &config.Check{
|
||||
ID: "test-check",
|
||||
Name: "test-check",
|
||||
Type: config.CheckHTTP,
|
||||
Target: "https://example.com",
|
||||
}
|
||||
snap := checks.Snapshot{Reports: 3, OKCount: 0, NotOK: 3, Detail: "synthetic test failure"}
|
||||
msg, err := RenderFor(alert, d.selfID, sample, checks.StateUp, checks.StateDown, snap)
|
||||
if err != nil {
|
||||
return fmt.Errorf("render template: %w", err)
|
||||
}
|
||||
return d.dispatchOne(alert, msg)
|
||||
}
|
||||
|
||||
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
|
||||
switch a.Type {
|
||||
|
||||
@@ -4,14 +4,29 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"git.cer.sh/axodouble/quptime/internal/checks"
|
||||
"git.cer.sh/axodouble/quptime/internal/config"
|
||||
)
|
||||
|
||||
// TemplateContext is what user-provided subject/body templates see. It
|
||||
// is also the shape the default renderer fills in, so changing one
|
||||
// place keeps the two paths consistent.
|
||||
type TemplateContext struct {
|
||||
Check *config.Check
|
||||
From string // previous state name
|
||||
To string // new state name
|
||||
Verb string // "UP" | "DOWN" | "RECOVERED"
|
||||
Snapshot checks.Snapshot // aggregate counts and detail
|
||||
NodeID string // master that rendered the message
|
||||
When string // RFC3339 timestamp
|
||||
}
|
||||
|
||||
// Message is the rendered notification ready to ship across any
|
||||
// channel. Channels may format Subject + Body differently (SMTP uses
|
||||
// both; Discord renders a single string).
|
||||
@@ -20,25 +35,82 @@ type Message struct {
|
||||
Body string
|
||||
}
|
||||
|
||||
// Render produces a human-readable message from one state transition.
|
||||
// Render produces a human-readable message from one state transition
|
||||
// using the built-in format. Used as the fallback when no custom
|
||||
// template is configured (or when a custom template fails to render).
|
||||
func Render(nodeID string, check *config.Check, from, to checks.State, snap checks.Snapshot) Message {
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
verb := transitionVerb(from, to)
|
||||
subject := fmt.Sprintf("[quptime] %s %s — %s", check.Name, verb, check.Target)
|
||||
ctx := newContext(nodeID, check, from, to, snap)
|
||||
subject := fmt.Sprintf("[quptime] %s %s — %s", check.Name, ctx.Verb, check.Target)
|
||||
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Check %q is now %s.\n", check.Name, strings.ToUpper(string(to)))
|
||||
fmt.Fprintf(&b, "Previous state: %s\n", from)
|
||||
fmt.Fprintf(&b, "Check %q is now %s.\n", check.Name, strings.ToUpper(ctx.To))
|
||||
fmt.Fprintf(&b, "Previous state: %s\n", ctx.From)
|
||||
fmt.Fprintf(&b, "Target: %s (%s)\n", check.Target, check.Type)
|
||||
fmt.Fprintf(&b, "Reports: %d (ok=%d, fail=%d)\n", snap.Reports, snap.OKCount, snap.NotOK)
|
||||
if snap.Detail != "" {
|
||||
fmt.Fprintf(&b, "Detail: %s\n", snap.Detail)
|
||||
}
|
||||
fmt.Fprintf(&b, "Master: %s\n", nodeID)
|
||||
fmt.Fprintf(&b, "When: %s\n", now)
|
||||
fmt.Fprintf(&b, "When: %s\n", ctx.When)
|
||||
return Message{Subject: subject, Body: b.String()}
|
||||
}
|
||||
|
||||
// RenderFor produces a message for one specific alert. If the alert
|
||||
// defines SubjectTemplate or BodyTemplate, those override the
|
||||
// corresponding field from the default render. A template error falls
|
||||
// back to the default for that field and is reported via the returned
|
||||
// error (the caller is expected to log but still ship the message).
|
||||
func RenderFor(alert *config.Alert, nodeID string, check *config.Check, from, to checks.State, snap checks.Snapshot) (Message, error) {
|
||||
def := Render(nodeID, check, from, to, snap)
|
||||
if alert == nil || (alert.SubjectTemplate == "" && alert.BodyTemplate == "") {
|
||||
return def, nil
|
||||
}
|
||||
ctx := newContext(nodeID, check, from, to, snap)
|
||||
msg := def
|
||||
var firstErr error
|
||||
if alert.SubjectTemplate != "" {
|
||||
s, err := execTemplate("subject", alert.SubjectTemplate, ctx)
|
||||
if err != nil {
|
||||
firstErr = err
|
||||
} else {
|
||||
msg.Subject = s
|
||||
}
|
||||
}
|
||||
if alert.BodyTemplate != "" {
|
||||
s, err := execTemplate("body", alert.BodyTemplate, ctx)
|
||||
if err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
} else if err == nil {
|
||||
msg.Body = s
|
||||
}
|
||||
}
|
||||
return msg, firstErr
|
||||
}
|
||||
|
||||
func newContext(nodeID string, check *config.Check, from, to checks.State, snap checks.Snapshot) TemplateContext {
|
||||
return TemplateContext{
|
||||
Check: check,
|
||||
From: string(from),
|
||||
To: string(to),
|
||||
Verb: transitionVerb(from, to),
|
||||
Snapshot: snap,
|
||||
NodeID: nodeID,
|
||||
When: time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
}
|
||||
|
||||
func execTemplate(name, src string, ctx TemplateContext) (string, error) {
|
||||
tmpl, err := template.New(name).Option("missingkey=zero").Parse(src)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse %s template: %w", name, err)
|
||||
}
|
||||
var b bytes.Buffer
|
||||
if err := tmpl.Execute(&b, ctx); err != nil {
|
||||
return "", fmt.Errorf("execute %s template: %w", name, err)
|
||||
}
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
func transitionVerb(from, to checks.State) string {
|
||||
switch to {
|
||||
case checks.StateDown:
|
||||
|
||||
@@ -50,3 +50,61 @@ func TestRenderUpInitialTransition(t *testing.T) {
|
||||
t.Error("first-time UP should not be tagged RECOVERED")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderForUsesAlertTemplates(t *testing.T) {
|
||||
check := &config.Check{Name: "homepage", Target: "https://example.com", Type: config.CheckHTTP}
|
||||
snap := checks.Snapshot{Reports: 3, OKCount: 0, NotOK: 3, Detail: "connection refused"}
|
||||
alert := &config.Alert{
|
||||
SubjectTemplate: "{{.Check.Name}} is {{.Verb}}",
|
||||
BodyTemplate: "{{.Check.Target}} :: {{.Snapshot.Detail}}",
|
||||
}
|
||||
msg, err := RenderFor(alert, "master", check, checks.StateUp, checks.StateDown, snap)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if msg.Subject != "homepage is DOWN" {
|
||||
t.Errorf("subject = %q", msg.Subject)
|
||||
}
|
||||
if msg.Body != "https://example.com :: connection refused" {
|
||||
t.Errorf("body = %q", msg.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderForFallsBackToDefaultPerField(t *testing.T) {
|
||||
check := &config.Check{Name: "homepage", Target: "https://example.com", Type: config.CheckHTTP}
|
||||
snap := checks.Snapshot{Reports: 3, OKCount: 0, NotOK: 3}
|
||||
// only body overridden; subject should match default.
|
||||
alert := &config.Alert{BodyTemplate: "custom body"}
|
||||
msg, err := RenderFor(alert, "master", check, checks.StateUp, checks.StateDown, snap)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !strings.Contains(msg.Subject, "DOWN") {
|
||||
t.Errorf("subject should be default rendering, got %q", msg.Subject)
|
||||
}
|
||||
if msg.Body != "custom body" {
|
||||
t.Errorf("body = %q", msg.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderForReportsTemplateError(t *testing.T) {
|
||||
check := &config.Check{Name: "homepage", Target: "https://example.com"}
|
||||
snap := checks.Snapshot{}
|
||||
alert := &config.Alert{BodyTemplate: "{{.Check.MissingField"} // unbalanced
|
||||
_, err := RenderFor(alert, "master", check, checks.StateUp, checks.StateDown, snap)
|
||||
if err == nil {
|
||||
t.Fatal("expected parse error for malformed template")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderForNilAlertReturnsDefault(t *testing.T) {
|
||||
check := &config.Check{Name: "homepage", Target: "https://example.com"}
|
||||
snap := checks.Snapshot{Reports: 1, OKCount: 1}
|
||||
msg, err := RenderFor(nil, "master", check, checks.StateUp, checks.StateUp, snap)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !strings.Contains(msg.Subject, "homepage") {
|
||||
t.Errorf("default subject should mention check, got %q", msg.Subject)
|
||||
}
|
||||
}
|
||||
|
||||
+54
-4
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"text/tabwriter"
|
||||
"time"
|
||||
|
||||
@@ -15,6 +16,41 @@ import (
|
||||
"git.cer.sh/axodouble/quptime/internal/transport"
|
||||
)
|
||||
|
||||
// bindTemplateFlags attaches --subject / --subject-file / --body /
|
||||
// --body-file to a cobra command. resolveTemplateFlags reads the file
|
||||
// variants (if non-empty) and returns the effective subject + body
|
||||
// template strings. Inline flags take precedence over file flags.
|
||||
func bindTemplateFlags(cmd *cobra.Command) {
|
||||
cmd.Flags().String("subject", "", "subject template (text/template syntax — SMTP only)")
|
||||
cmd.Flags().String("subject-file", "", "path to a file containing the subject template")
|
||||
cmd.Flags().String("body", "", "body template (text/template syntax)")
|
||||
cmd.Flags().String("body-file", "", "path to a file containing the body template")
|
||||
}
|
||||
|
||||
func resolveTemplateFlags(cmd *cobra.Command) (subject, body string, err error) {
|
||||
subject, _ = cmd.Flags().GetString("subject")
|
||||
body, _ = cmd.Flags().GetString("body")
|
||||
if subject == "" {
|
||||
if p, _ := cmd.Flags().GetString("subject-file"); p != "" {
|
||||
raw, e := os.ReadFile(p)
|
||||
if e != nil {
|
||||
return "", "", fmt.Errorf("read --subject-file %s: %w", p, e)
|
||||
}
|
||||
subject = string(raw)
|
||||
}
|
||||
}
|
||||
if body == "" {
|
||||
if p, _ := cmd.Flags().GetString("body-file"); p != "" {
|
||||
raw, e := os.ReadFile(p)
|
||||
if e != nil {
|
||||
return "", "", fmt.Errorf("read --body-file %s: %w", p, e)
|
||||
}
|
||||
body = string(raw)
|
||||
}
|
||||
}
|
||||
return subject, body, nil
|
||||
}
|
||||
|
||||
func addAlertCmd(root *cobra.Command) {
|
||||
alert := &cobra.Command{
|
||||
Use: "alert",
|
||||
@@ -136,11 +172,17 @@ func buildSMTPAddCmd() *cobra.Command {
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Second)
|
||||
defer cancel()
|
||||
subj, body, err := resolveTemplateFlags(cmd)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
a := config.Alert{
|
||||
ID: uuid.NewString(),
|
||||
Name: args[0],
|
||||
Type: config.AlertSMTP,
|
||||
Default: makeDefault,
|
||||
SubjectTemplate: subj,
|
||||
BodyTemplate: body,
|
||||
SMTPHost: host,
|
||||
SMTPPort: port,
|
||||
SMTPUser: user,
|
||||
@@ -150,8 +192,8 @@ func buildSMTPAddCmd() *cobra.Command {
|
||||
SMTPStartTLS: startTLS,
|
||||
}
|
||||
payload, _ := json.Marshal(a)
|
||||
body := daemon.MutateBody{Kind: transport.MutationAddAlert, Payload: payload}
|
||||
raw, err := callDaemon(ctx, daemon.CtrlMutate, body)
|
||||
mb := daemon.MutateBody{Kind: transport.MutationAddAlert, Payload: payload}
|
||||
raw, err := callDaemon(ctx, daemon.CtrlMutate, mb)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -170,6 +212,7 @@ func buildSMTPAddCmd() *cobra.Command {
|
||||
cmd.Flags().StringSliceVar(&to, "to", nil, "recipient address (repeat or comma-separate)")
|
||||
cmd.Flags().BoolVar(&startTLS, "starttls", true, "negotiate STARTTLS")
|
||||
cmd.Flags().BoolVar(&makeDefault, "default", false, "attach this alert to every check automatically")
|
||||
bindTemplateFlags(cmd)
|
||||
_ = cmd.MarkFlagRequired("host")
|
||||
_ = cmd.MarkFlagRequired("from")
|
||||
_ = cmd.MarkFlagRequired("to")
|
||||
@@ -186,16 +229,22 @@ func buildDiscordAddCmd() *cobra.Command {
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Second)
|
||||
defer cancel()
|
||||
subj, body, err := resolveTemplateFlags(cmd)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
a := config.Alert{
|
||||
ID: uuid.NewString(),
|
||||
Name: args[0],
|
||||
Type: config.AlertDiscord,
|
||||
Default: makeDefault,
|
||||
SubjectTemplate: subj,
|
||||
BodyTemplate: body,
|
||||
DiscordWebhook: webhook,
|
||||
}
|
||||
payload, _ := json.Marshal(a)
|
||||
body := daemon.MutateBody{Kind: transport.MutationAddAlert, Payload: payload}
|
||||
raw, err := callDaemon(ctx, daemon.CtrlMutate, body)
|
||||
mb := daemon.MutateBody{Kind: transport.MutationAddAlert, Payload: payload}
|
||||
raw, err := callDaemon(ctx, daemon.CtrlMutate, mb)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -208,6 +257,7 @@ func buildDiscordAddCmd() *cobra.Command {
|
||||
}
|
||||
cmd.Flags().StringVar(&webhook, "webhook", "", "discord webhook URL")
|
||||
cmd.Flags().BoolVar(&makeDefault, "default", false, "attach this alert to every check automatically")
|
||||
bindTemplateFlags(cmd)
|
||||
_ = cmd.MarkFlagRequired("webhook")
|
||||
return cmd
|
||||
}
|
||||
|
||||
@@ -85,6 +85,16 @@ type Alert struct {
|
||||
|
||||
// Discord options.
|
||||
DiscordWebhook string `yaml:"discord_webhook,omitempty"`
|
||||
|
||||
// SubjectTemplate / BodyTemplate are optional text/template strings
|
||||
// that override the default rendering. Empty means use the built-in
|
||||
// format. Discord ignores SubjectTemplate (it has no subject line);
|
||||
// SMTP uses both. Available variables: {{.Check.Name}},
|
||||
// {{.Check.Type}}, {{.Check.Target}}, {{.Check.ID}}, {{.From}},
|
||||
// {{.To}}, {{.Verb}}, {{.Snapshot.Reports}}, {{.Snapshot.OKCount}},
|
||||
// {{.Snapshot.NotOK}}, {{.Snapshot.Detail}}, {{.NodeID}}, {{.When}}.
|
||||
SubjectTemplate string `yaml:"subject_template,omitempty"`
|
||||
BodyTemplate string `yaml:"body_template,omitempty"`
|
||||
}
|
||||
|
||||
// ClusterConfig is the replicated cluster state. The Version field
|
||||
|
||||
Reference in New Issue
Block a user