Initial structure
@@ -0,0 +1,253 @@
package checks

import (
	"sync"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// State is the aggregate verdict on one check.
type State string

const (
	StateUnknown State = "unknown"
	StateUp      State = "up"
	StateDown    State = "down"
)

// HysteresisCount is how many consecutive evaluations a candidate
// state must hold before becoming the new committed state. 2 matches
// the design doc ("down for ≥2 consecutive aggregate evaluations").
const HysteresisCount = 2

// TransitionFn is fired when a check's committed state flips. The
// alert dispatcher is the production consumer.
type TransitionFn func(check *config.Check, from, to State, snap Snapshot)

// Snapshot summarises one check's current aggregate state.
type Snapshot struct {
	CheckID  string
	State    State
	Reports  int // number of fresh per-node results
	OKCount  int // number reporting OK
	NotOK    int // number reporting not-OK
	Detail   string
	UpdateAt time.Time
}

// Aggregator runs on the master only. Other nodes ship their probe
// results to it and it decides the cluster-wide truth.
type Aggregator struct {
	cluster    *config.ClusterConfig
	transition TransitionFn

	mu       sync.Mutex
	perCheck map[string]*checkState
}

type checkState struct {
	// nodeID → most recent result we've received from that node.
	latest map[string]nodeResult

	committed   State // last announced state (held through hysteresis)
	candidate   State // state being considered for promotion
	consecutive int   // ticks the candidate has persisted
}

type nodeResult struct {
	OK     bool
	Detail string
	At     time.Time
}

// NewAggregator returns an empty aggregator. fn may be nil during
// startup; SetTransition can wire it later.
func NewAggregator(cluster *config.ClusterConfig, fn TransitionFn) *Aggregator {
	return &Aggregator{
		cluster:    cluster,
		transition: fn,
		perCheck:   map[string]*checkState{},
	}
}

// SetTransition wires (or rewires) the transition callback.
func (a *Aggregator) SetTransition(fn TransitionFn) {
	a.mu.Lock()
	a.transition = fn
	a.mu.Unlock()
}

// Submit records one node's result for one check and immediately
// re-evaluates that check's aggregate state.
func (a *Aggregator) Submit(nodeID string, r Result) {
	if r.CheckID == "" {
		return
	}
	a.mu.Lock()
	st, ok := a.perCheck[r.CheckID]
	if !ok {
		st = &checkState{
			latest:    map[string]nodeResult{},
			committed: StateUnknown,
			candidate: StateUnknown,
		}
		a.perCheck[r.CheckID] = st
	}
	st.latest[nodeID] = nodeResult{OK: r.OK, Detail: r.Detail, At: r.Timestamp}
	a.mu.Unlock()
	a.evaluate(r.CheckID)
}

// SnapshotAll returns the current aggregate view of every known check.
func (a *Aggregator) SnapshotAll() map[string]Snapshot {
	a.mu.Lock()
	defer a.mu.Unlock()
	out := make(map[string]Snapshot, len(a.perCheck))
	for id, st := range a.perCheck {
		out[id] = a.snapshotLocked(id, st)
	}
	return out
}

// SnapshotFor returns the aggregate for a single check.
func (a *Aggregator) SnapshotFor(checkID string) (Snapshot, bool) {
	a.mu.Lock()
	defer a.mu.Unlock()
	st, ok := a.perCheck[checkID]
	if !ok {
		return Snapshot{}, false
	}
	return a.snapshotLocked(checkID, st), true
}

func (a *Aggregator) evaluate(checkID string) {
	check := a.lookupCheck(checkID)
	if check == nil {
		// Check was removed from cluster.yaml — drop its state so we
		// don't keep alerting on something the operator deleted.
		a.mu.Lock()
		delete(a.perCheck, checkID)
		a.mu.Unlock()
		return
	}

	a.mu.Lock()
	st, ok := a.perCheck[checkID]
	if !ok {
		a.mu.Unlock()
		return
	}

	freshWindow := freshWindowFor(check.Interval)
	cutoff := time.Now().Add(-freshWindow)

	var okCount, notOK int
	var lastDetail string
	for _, nr := range st.latest {
		if nr.At.Before(cutoff) {
			continue
		}
		if nr.OK {
			okCount++
		} else {
			notOK++
			if lastDetail == "" {
				lastDetail = nr.Detail
			}
		}
	}

	var candidate State
	switch {
	case okCount+notOK == 0:
		candidate = StateUnknown
	case notOK > okCount:
		candidate = StateDown
	default:
		candidate = StateUp
	}

	if candidate == st.candidate {
		st.consecutive++
	} else {
		st.candidate = candidate
		st.consecutive = 1
	}

	var fireFrom, fireTo State
	var fired bool
	if candidate != st.committed && st.consecutive >= HysteresisCount {
		fireFrom = st.committed
		fireTo = candidate
		st.committed = candidate
		fired = true
	}

	snap := a.snapshotLocked(checkID, st)
	fn := a.transition
	a.mu.Unlock()

	if fired && fn != nil {
		fn(check, fireFrom, fireTo, snap)
	}
}

func (a *Aggregator) snapshotLocked(checkID string, st *checkState) Snapshot {
	check := a.lookupCheck(checkID)
	freshWindow := 60 * time.Second
	if check != nil {
		freshWindow = freshWindowFor(check.Interval)
	}
	cutoff := time.Now().Add(-freshWindow)

	var okCount, notOK int
	var detail string
	for _, nr := range st.latest {
		if nr.At.Before(cutoff) {
			continue
		}
		if nr.OK {
			okCount++
		} else {
			notOK++
			if detail == "" {
				detail = nr.Detail
			}
		}
	}
	return Snapshot{
		CheckID:  checkID,
		State:    st.committed,
		Reports:  okCount + notOK,
		OKCount:  okCount,
		NotOK:    notOK,
		Detail:   detail,
		UpdateAt: time.Now().UTC(),
	}
}

func (a *Aggregator) lookupCheck(id string) *config.Check {
	snap := a.cluster.Snapshot()
	for i := range snap.Checks {
		if snap.Checks[i].ID == id {
			c := snap.Checks[i]
			return &c
		}
	}
	return nil
}

// freshWindowFor returns the staleness threshold for a check given
// its configured interval. Anything older than this is considered too
// stale to count.
func freshWindowFor(interval time.Duration) time.Duration {
	if interval <= 0 {
		return 60 * time.Second
	}
	w := interval * 3
	if w < 30*time.Second {
		w = 30 * time.Second
	}
	return w
}
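A minimal wiring sketch, not part of this commit: how a daemon might construct the master-side aggregator with a logging TransitionFn. It assumes a package that imports "log" plus the internal checks and config packages, and that a populated *config.ClusterConfig already exists (its loader is outside this diff); only NewAggregator, TransitionFn, and Snapshot from the file above are used.

// Illustrative only; newMasterAggregator is a hypothetical helper name.
func newMasterAggregator(cluster *config.ClusterConfig) *checks.Aggregator {
	return checks.NewAggregator(cluster, func(c *config.Check, from, to checks.State, snap checks.Snapshot) {
		log.Printf("check %s: %s -> %s (%d of %d fresh reports not OK): %s",
			c.ID, from, to, snap.NotOK, snap.Reports, snap.Detail)
	})
}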
@@ -0,0 +1,62 @@
package checks

import (
	"context"
	"crypto/tls"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// maxBodyRead is the cap on how much body a check will pull when
// BodyMatch is non-empty. Anything beyond is ignored.
const maxBodyRead = 1 << 20 // 1 MiB

type httpProber struct{}

func (httpProber) Probe(ctx context.Context, c *config.Check) Result {
	client := &http.Client{
		Timeout: c.Timeout,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
		},
	}
	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.Target, nil)
	if err != nil {
		return Result{OK: false, Detail: "build request: " + err.Error()}
	}
	req.Header.Set("User-Agent", "quptime/1.0")
	resp, err := client.Do(req)
	if err != nil {
		return Result{OK: false, Detail: err.Error(), Latency: time.Since(start)}
	}
	defer resp.Body.Close()

	expected := c.ExpectStatus
	if expected == 0 {
		expected = 200
	}
	if resp.StatusCode != expected {
		return Result{
			OK:      false,
			Detail:  "status " + resp.Status,
			Latency: time.Since(start),
		}
	}

	if c.BodyMatch != "" {
		body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyRead))
		if err != nil {
			return Result{OK: false, Detail: "read body: " + err.Error(), Latency: time.Since(start)}
		}
		if !strings.Contains(string(body), c.BodyMatch) {
			return Result{OK: false, Detail: "body match miss", Latency: time.Since(start)}
		}
	}

	return Result{OK: true, Latency: time.Since(start)}
}
@@ -0,0 +1,48 @@
package checks

import (
	"context"
	"runtime"
	"time"

	probing "github.com/prometheus-community/pro-bing"

	"github.com/jasper/quptime/internal/config"
)

type icmpProber struct{}

// Probe sends a single ICMP echo. On Linux we use unprivileged "udp"
// mode so the daemon does not require CAP_NET_RAW; on other platforms
// the prober uses privileged raw sockets.
func (icmpProber) Probe(ctx context.Context, c *config.Check) Result {
	start := time.Now()
	pinger, err := probing.NewPinger(c.Target)
	if err != nil {
		return Result{OK: false, Detail: "resolve: " + err.Error()}
	}
	pinger.Count = 1
	pinger.Timeout = c.Timeout
	if pinger.Timeout <= 0 {
		pinger.Timeout = 5 * time.Second
	}
	pinger.SetPrivileged(runtime.GOOS != "linux")

	doneCh := make(chan struct{})
	go func() {
		_ = pinger.RunWithContext(ctx)
		close(doneCh)
	}()
	select {
	case <-doneCh:
	case <-ctx.Done():
		pinger.Stop()
		return Result{OK: false, Detail: "timeout", Latency: time.Since(start)}
	}

	stats := pinger.Statistics()
	if stats.PacketsRecv == 0 {
		return Result{OK: false, Detail: "no reply", Latency: time.Since(start)}
	}
	return Result{OK: true, Latency: stats.AvgRtt}
}
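One operational consequence of the unprivileged mode above (a deployment note, not something this commit enforces): on Linux, "udp" ping only works when the daemon's group falls inside the net.ipv4.ping_group_range sysctl. A hedged startup-time sketch that warns when the range is empty, assuming the standard library imports log, os, runtime, strconv, and strings:

// Sketch only: warnIfUnprivilegedPingDisabled is a hypothetical helper.
// The /proc path and the "low high" range format are standard Linux.
func warnIfUnprivilegedPingDisabled() {
	if runtime.GOOS != "linux" {
		return
	}
	b, err := os.ReadFile("/proc/sys/net/ipv4/ping_group_range")
	if err != nil {
		return
	}
	f := strings.Fields(string(b))
	if len(f) != 2 {
		return
	}
	lo, err1 := strconv.Atoi(f[0])
	hi, err2 := strconv.Atoi(f[1])
	// An inverted range (the kernel default "1 0") means no group may open
	// unprivileged ICMP sockets, so icmp checks would always report "no reply".
	if err1 == nil && err2 == nil && lo > hi {
		log.Println("icmp checks: unprivileged ping disabled; widen net.ipv4.ping_group_range")
	}
}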
@@ -0,0 +1,65 @@
// Package checks runs the configured HTTP/TCP/ICMP probes on every
// node and aggregates per-check results on the master.
//
// Probers are the small "do one round of work" units. The Scheduler
// drives them on a per-check timer and ships each result back to the
// master via the inter-node transport. The Aggregator (master only)
// folds incoming results into per-check sliding windows and decides
// when a check has crossed UP↔DOWN.
package checks

import (
	"context"
	"fmt"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// Result is the outcome of a single probe.
type Result struct {
	CheckID   string
	OK        bool
	Detail    string
	Latency   time.Duration
	Timestamp time.Time
}

// Prober runs one probe of a configured check.
type Prober interface {
	Probe(ctx context.Context, c *config.Check) Result
}

// Run dispatches to the right Prober for the given check type. It
// returns an error result instead of failing when a check has an
// unknown type.
func Run(ctx context.Context, c *config.Check) Result {
	deadline := c.Timeout
	if deadline <= 0 {
		deadline = 10 * time.Second
	}
	pctx, cancel := context.WithTimeout(ctx, deadline)
	defer cancel()

	var p Prober
	switch c.Type {
	case config.CheckHTTP:
		p = httpProber{}
	case config.CheckTCP:
		p = tcpProber{}
	case config.CheckICMP:
		p = icmpProber{}
	default:
		return Result{
			CheckID:   c.ID,
			OK:        false,
			Detail:    fmt.Sprintf("unknown check type %q", c.Type),
			Timestamp: time.Now().UTC(),
		}
	}
	res := p.Probe(pctx, c)
	res.CheckID = c.ID
	if res.Timestamp.IsZero() {
		res.Timestamp = time.Now().UTC()
	}
	return res
}
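A quick usage sketch of Run, test-style inside this package. The field names come from the code above; the check ID and target URL are made up, and the YAML shape of cluster.yaml is not defined by this commit.

// Illustrative one-off probe.
c := &config.Check{
	ID:           "web-home",
	Type:         config.CheckHTTP,
	Target:       "https://example.com/healthz",
	Interval:     30 * time.Second,
	Timeout:      5 * time.Second,
	ExpectStatus: 200,
	BodyMatch:    "ok",
}
res := Run(context.Background(), c)
fmt.Printf("%s ok=%v latency=%s detail=%q\n", res.CheckID, res.OK, res.Latency, res.Detail)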
@@ -0,0 +1,147 @@
package checks

import (
	"context"
	"sync"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// ReconcileInterval is how often the scheduler reconciles its set of
// running probes against cluster.yaml.
const ReconcileInterval = 5 * time.Second

// Sink is the abstraction the scheduler uses to report results.
// Implemented by the daemon: results go straight to the local
// aggregator when self is the master, otherwise they ship to the
// master over the RPC channel.
type Sink interface {
	Submit(Result)
}

// Scheduler keeps a goroutine alive per configured check. On each
// reconcile pass it starts probes for new checks, stops probes for
// removed checks, and restarts probes whose definition changed.
type Scheduler struct {
	cluster *config.ClusterConfig
	sink    Sink

	mu      sync.Mutex
	running map[string]*probeWorker
}

type probeWorker struct {
	check  config.Check
	cancel context.CancelFunc
}

// NewScheduler creates a scheduler bound to the given cluster config.
func NewScheduler(cluster *config.ClusterConfig, sink Sink) *Scheduler {
	return &Scheduler{
		cluster: cluster,
		sink:    sink,
		running: map[string]*probeWorker{},
	}
}

// Start runs the reconcile loop until ctx is cancelled. Reconcile is
// also called immediately on entry so checks start without waiting
// for the first tick.
func (s *Scheduler) Start(ctx context.Context) {
	s.reconcile(ctx)
	t := time.NewTicker(ReconcileInterval)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			s.stopAll()
			return
		case <-t.C:
			s.reconcile(ctx)
		}
	}
}

func (s *Scheduler) reconcile(ctx context.Context) {
	snap := s.cluster.Snapshot()
	want := map[string]config.Check{}
	for _, c := range snap.Checks {
		if c.ID == "" {
			continue
		}
		want[c.ID] = c
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	for id, w := range s.running {
		desired, stillThere := want[id]
		if !stillThere || !sameCheck(desired, w.check) {
			w.cancel()
			delete(s.running, id)
		}
	}
	for id, c := range want {
		if _, exists := s.running[id]; exists {
			continue
		}
		wctx, cancel := context.WithCancel(ctx)
		s.running[id] = &probeWorker{check: c, cancel: cancel}
		go s.run(wctx, c)
	}
}

func (s *Scheduler) run(ctx context.Context, c config.Check) {
	interval := c.Interval
	if interval <= 0 {
		interval = 30 * time.Second
	}
	// Stagger startup so a freshly-loaded scheduler doesn't burst
	// hundreds of probes simultaneously.
	jitter := time.Duration(int64(interval) / 10)
	if jitter > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(time.Duration(time.Now().UnixNano() % int64(jitter))):
		}
	}
	t := time.NewTicker(interval)
	defer t.Stop()

	// Fire one immediate probe so state populates without delay.
	s.sink.Submit(Run(ctx, &c))

	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			s.sink.Submit(Run(ctx, &c))
		}
	}
}

func (s *Scheduler) stopAll() {
	s.mu.Lock()
	for id, w := range s.running {
		w.cancel()
		delete(s.running, id)
	}
	s.mu.Unlock()
}

// sameCheck returns true when two Check structs would produce
// identical probing behaviour, so the scheduler can leave the worker
// running across a no-op config push.
func sameCheck(a, b config.Check) bool {
	return a.ID == b.ID &&
		a.Type == b.Type &&
		a.Target == b.Target &&
		a.Interval == b.Interval &&
		a.Timeout == b.Timeout &&
		a.ExpectStatus == b.ExpectStatus &&
		a.BodyMatch == b.BodyMatch
}
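A minimal Sink sketch for the master path described above (an assumption about how the daemon wires things, not part of this commit): the local Aggregator can be fed directly by adapting Aggregator.Submit, which also needs the submitting node's ID.

// Hypothetical adapter; the node ID would come from the daemon's own identity.
type localSink struct {
	nodeID string
	agg    *Aggregator
}

func (s localSink) Submit(r Result) { s.agg.Submit(s.nodeID, r) }

// Wiring on the master might then look like:
//	sched := NewScheduler(cluster, localSink{nodeID: "node-1", agg: agg})
//	go sched.Start(ctx)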
@@ -0,0 +1,22 @@
package checks

import (
	"context"
	"net"
	"time"

	"github.com/jasper/quptime/internal/config"
)

type tcpProber struct{}

func (tcpProber) Probe(ctx context.Context, c *config.Check) Result {
	start := time.Now()
	d := net.Dialer{Timeout: c.Timeout}
	conn, err := d.DialContext(ctx, "tcp", c.Target)
	if err != nil {
		return Result{OK: false, Detail: err.Error(), Latency: time.Since(start)}
	}
	_ = conn.Close()
	return Result{OK: true, Latency: time.Since(start)}
}