Initial structure

2026-05-12 06:07:16 +00:00
commit 7e85bb0fcc
38 changed files with 4594 additions and 0 deletions
@@ -0,0 +1,253 @@
package checks
import (
"sync"
"time"
"github.com/jasper/quptime/internal/config"
)
// State is the aggregate verdict on one check.
type State string
const (
StateUnknown State = "unknown"
StateUp State = "up"
StateDown State = "down"
)
// HysteresisCount is how many consecutive evaluations a candidate
// state must hold before becoming the new committed state. 2 matches
// the design doc ("down for ≥2 consecutive aggregate evaluations").
const HysteresisCount = 2
// TransitionFn is fired when a check's committed state flips. The
// alert dispatcher is the production consumer.
type TransitionFn func(check *config.Check, from, to State, snap Snapshot)
// Snapshot summarises one check's current aggregate state.
type Snapshot struct {
CheckID string
State State
Reports int // number of fresh per-node results
OKCount int // number reporting OK
NotOK int // number reporting not-OK
Detail string
UpdateAt time.Time
}
// Aggregator runs on the master only. Other nodes ship their probe
// results to it and it decides the cluster-wide truth.
type Aggregator struct {
cluster *config.ClusterConfig
transition TransitionFn
mu sync.Mutex
perCheck map[string]*checkState
}
type checkState struct {
// nodeID → most recent result we've received from that node.
latest map[string]nodeResult
committed State // last announced state (held through hysteresis)
candidate State // state being considered for promotion
consecutive int // ticks the candidate has persisted
}
type nodeResult struct {
OK bool
Detail string
At time.Time
}
// NewAggregator returns an empty aggregator. fn may be nil during
// startup; SetTransition can wire it later.
func NewAggregator(cluster *config.ClusterConfig, fn TransitionFn) *Aggregator {
return &Aggregator{
cluster: cluster,
transition: fn,
perCheck: map[string]*checkState{},
}
}
// SetTransition wires (or rewires) the transition callback.
func (a *Aggregator) SetTransition(fn TransitionFn) {
a.mu.Lock()
a.transition = fn
a.mu.Unlock()
}
// Submit records one node's result for one check and immediately
// re-evaluates that check's aggregate state.
func (a *Aggregator) Submit(nodeID string, r Result) {
if r.CheckID == "" {
return
}
a.mu.Lock()
st, ok := a.perCheck[r.CheckID]
if !ok {
st = &checkState{
latest: map[string]nodeResult{},
committed: StateUnknown,
candidate: StateUnknown,
}
a.perCheck[r.CheckID] = st
}
st.latest[nodeID] = nodeResult{OK: r.OK, Detail: r.Detail, At: r.Timestamp}
a.mu.Unlock()
a.evaluate(r.CheckID)
}
// SnapshotAll returns the current aggregate view of every known check.
func (a *Aggregator) SnapshotAll() map[string]Snapshot {
a.mu.Lock()
defer a.mu.Unlock()
out := make(map[string]Snapshot, len(a.perCheck))
for id, st := range a.perCheck {
out[id] = a.snapshotLocked(id, st)
}
return out
}
// SnapshotFor returns the aggregate for a single check.
func (a *Aggregator) SnapshotFor(checkID string) (Snapshot, bool) {
a.mu.Lock()
defer a.mu.Unlock()
st, ok := a.perCheck[checkID]
if !ok {
return Snapshot{}, false
}
return a.snapshotLocked(checkID, st), true
}
func (a *Aggregator) evaluate(checkID string) {
check := a.lookupCheck(checkID)
if check == nil {
// Check was removed from cluster.yaml — drop its state so we
// don't keep alerting on something the operator deleted.
a.mu.Lock()
delete(a.perCheck, checkID)
a.mu.Unlock()
return
}
a.mu.Lock()
st, ok := a.perCheck[checkID]
if !ok {
a.mu.Unlock()
return
}
freshWindow := freshWindowFor(check.Interval)
cutoff := time.Now().Add(-freshWindow)
var ok_, notOK int
for _, nr := range st.latest {
if nr.At.Before(cutoff) {
continue
}
if nr.OK {
ok_++
} else {
notOK++
}
}
var candidate State
switch {
case ok_+notOK == 0:
candidate = StateUnknown
case notOK > ok_:
candidate = StateDown
default:
candidate = StateUp
}
if candidate == st.candidate {
st.consecutive++
} else {
st.candidate = candidate
st.consecutive = 1
}
var fireFrom, fireTo State
var fired bool
if candidate != st.committed && st.consecutive >= HysteresisCount {
fireFrom = st.committed
fireTo = candidate
st.committed = candidate
fired = true
}
snap := a.snapshotLocked(checkID, st)
fn := a.transition
a.mu.Unlock()
if fired && fn != nil {
fn(check, fireFrom, fireTo, snap)
}
}
func (a *Aggregator) snapshotLocked(checkID string, st *checkState) Snapshot {
check := a.lookupCheck(checkID)
freshWindow := 60 * time.Second
if check != nil {
freshWindow = freshWindowFor(check.Interval)
}
cutoff := time.Now().Add(-freshWindow)
var ok_, notOK int
var detail string
for _, nr := range st.latest {
if nr.At.Before(cutoff) {
continue
}
if nr.OK {
ok_++
} else {
notOK++
if detail == "" {
detail = nr.Detail
}
}
}
return Snapshot{
CheckID: checkID,
State: st.committed,
Reports: ok_ + notOK,
OKCount: ok_,
NotOK: notOK,
Detail: detail,
UpdateAt: time.Now().UTC(),
}
}
func (a *Aggregator) lookupCheck(id string) *config.Check {
snap := a.cluster.Snapshot()
for i := range snap.Checks {
if snap.Checks[i].ID == id {
c := snap.Checks[i]
return &c
}
}
return nil
}
// freshWindowFor returns the staleness threshold for a check given
// its configured interval. Anything older than this is considered too
// stale to count.
func freshWindowFor(interval time.Duration) time.Duration {
if interval <= 0 {
return 60 * time.Second
}
w := interval * 3
if w < 30*time.Second {
w = 30 * time.Second
}
return w
}
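
Not part of the commit: a minimal in-package sketch of how the master side might wire the aggregator and what it takes to trip the hysteresis rule. It assumes a *config.ClusterConfig has already been loaded containing a check whose ID is "web-home" (the config-loading API is not shown here); the helper name and check ID are hypothetical.

package checks

import (
    "fmt"
    "time"

    "github.com/jasper/quptime/internal/config"
)

// demoHysteresis is a hypothetical helper. cluster must already
// contain a check with ID "web-home"; otherwise evaluate drops the
// state as soon as the first result arrives.
func demoHysteresis(cluster *config.ClusterConfig) {
    agg := NewAggregator(cluster, func(c *config.Check, from, to State, snap Snapshot) {
        fmt.Printf("check %s: %s -> %s (%d of %d fresh reports OK)\n",
            c.ID, from, to, snap.OKCount, snap.Reports)
    })

    fail := Result{CheckID: "web-home", OK: false, Detail: "status 503", Timestamp: time.Now()}

    agg.Submit("node-a", fail) // candidate becomes down, consecutive = 1: nothing fires yet
    agg.Submit("node-a", fail) // consecutive = 2 >= HysteresisCount: callback reports unknown -> down
}

A second node reporting OK in the same freshness window would hold the verdict back, since evaluate only picks down when not-OK reports strictly outnumber OK ones among fresh results.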
@@ -0,0 +1,62 @@
package checks
import (
"context"
"crypto/tls"
"io"
"net/http"
"strings"
"time"
"github.com/jasper/quptime/internal/config"
)
// maxBodyRead is the cap on how much body a check will pull when
// BodyMatch is non-empty. Anything beyond is ignored.
const maxBodyRead = 1 << 20 // 1 MiB
type httpProber struct{}
func (httpProber) Probe(ctx context.Context, c *config.Check) Result {
client := &http.Client{
Timeout: c.Timeout,
Transport: &http.Transport{
TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
},
}
start := time.Now()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.Target, nil)
if err != nil {
return Result{OK: false, Detail: "build request: " + err.Error()}
}
req.Header.Set("User-Agent", "quptime/1.0")
resp, err := client.Do(req)
if err != nil {
return Result{OK: false, Detail: err.Error(), Latency: time.Since(start)}
}
defer resp.Body.Close()
expected := c.ExpectStatus
if expected == 0 {
expected = 200
}
if resp.StatusCode != expected {
return Result{
OK: false,
Detail: "status " + resp.Status,
Latency: time.Since(start),
}
}
if c.BodyMatch != "" {
body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyRead))
if err != nil {
return Result{OK: false, Detail: "read body: " + err.Error(), Latency: time.Since(start)}
}
if !strings.Contains(string(body), c.BodyMatch) {
return Result{OK: false, Detail: "body match miss", Latency: time.Since(start)}
}
}
return Result{OK: true, Latency: time.Since(start)}
}
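
A hedged usage sketch, not part of the commit: constructing an HTTP check by hand and running it once through Run from this package. The check ID and URL are placeholders; the field names come from this commit's probers and sameCheck.

package checks

import (
    "context"
    "fmt"
    "time"

    "github.com/jasper/quptime/internal/config"
)

// runHTTPOnce is a hypothetical helper showing the HTTP prober's
// defaults: ExpectStatus 0 would mean "expect 200", and an empty
// BodyMatch skips the (1 MiB-capped) body read entirely.
func runHTTPOnce(ctx context.Context) {
    c := &config.Check{
        ID:           "web-home",                    // placeholder
        Type:         config.CheckHTTP,
        Target:       "https://example.com/healthz", // placeholder
        Timeout:      5 * time.Second,
        ExpectStatus: 200,
        BodyMatch:    "ok",
    }
    res := Run(ctx, c)
    fmt.Println(res.OK, res.Detail, res.Latency)
}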
@@ -0,0 +1,48 @@
package checks
import (
"context"
"runtime"
"time"
probing "github.com/prometheus-community/pro-bing"
"github.com/jasper/quptime/internal/config"
)
type icmpProber struct{}
// Probe sends a single ICMP echo. On Linux the prober always uses
// unprivileged "udp" mode so the daemon does not require CAP_NET_RAW;
// on other platforms it uses privileged (raw-socket) ICMP.
func (icmpProber) Probe(ctx context.Context, c *config.Check) Result {
start := time.Now()
pinger, err := probing.NewPinger(c.Target)
if err != nil {
return Result{OK: false, Detail: "resolve: " + err.Error()}
}
pinger.Count = 1
pinger.Timeout = c.Timeout
if pinger.Timeout <= 0 {
pinger.Timeout = 5 * time.Second
}
pinger.SetPrivileged(runtime.GOOS != "linux")
doneCh := make(chan struct{})
go func() {
_ = pinger.RunWithContext(ctx)
close(doneCh)
}()
select {
case <-doneCh:
case <-ctx.Done():
pinger.Stop()
return Result{OK: false, Detail: "timeout", Latency: time.Since(start)}
}
stats := pinger.Statistics()
if stats.PacketsRecv == 0 {
return Result{OK: false, Detail: "no reply", Latency: time.Since(start)}
}
return Result{OK: true, Latency: stats.AvgRtt}
}
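
One operational note, hedged rather than verified against every distro: with SetPrivileged(false) the library opens unprivileged datagram ICMP sockets, and on many Linux systems those only work when the net.ipv4.ping_group_range sysctl includes the daemon's group; granting CAP_NET_RAW (or running as root) and flipping SetPrivileged(true) is the raw-socket alternative. Below is a minimal sketch, not part of the commit, of running one ICMP check through Run; the ID and target are placeholders.

package checks

import (
    "context"
    "fmt"

    "github.com/jasper/quptime/internal/config"
)

// runICMPOnce is a hypothetical helper. Timeout is left zero, so the
// icmpProber falls back to its 5-second default (inside Run's own
// 10-second context deadline).
func runICMPOnce(ctx context.Context) {
    c := &config.Check{
        ID:     "ping-gw",   // placeholder
        Type:   config.CheckICMP,
        Target: "192.0.2.1", // placeholder (TEST-NET-1)
    }
    res := Run(ctx, c)
    fmt.Println(res.OK, res.Detail)
}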
@@ -0,0 +1,65 @@
// Package checks runs the configured HTTP/TCP/ICMP probes on every
// node and aggregates per-check results on the master.
//
// Probers are the small "do one round of work" units. The Scheduler
// drives them on a per-check timer and ships each result back to the
// master via the inter-node transport. The Aggregator (master only)
// folds incoming results into per-check sliding windows and decides
// when a check has crossed UP↔DOWN.
package checks
import (
"context"
"fmt"
"time"
"github.com/jasper/quptime/internal/config"
)
// Result is the outcome of a single probe.
type Result struct {
CheckID string
OK bool
Detail string
Latency time.Duration
Timestamp time.Time
}
// Prober runs one probe of a configured check.
type Prober interface {
Probe(ctx context.Context, c *config.Check) Result
}
// Run dispatches to the right Prober for the given check type. An
// unknown check type yields a not-OK Result rather than an error.
func Run(ctx context.Context, c *config.Check) Result {
deadline := c.Timeout
if deadline <= 0 {
deadline = 10 * time.Second
}
pctx, cancel := context.WithTimeout(ctx, deadline)
defer cancel()
var p Prober
switch c.Type {
case config.CheckHTTP:
p = httpProber{}
case config.CheckTCP:
p = tcpProber{}
case config.CheckICMP:
p = icmpProber{}
default:
return Result{
CheckID: c.ID,
OK: false,
Detail: fmt.Sprintf("unknown check type %q", c.Type),
Timestamp: time.Now().UTC(),
}
}
res := p.Probe(pctx, c)
res.CheckID = c.ID
if res.Timestamp.IsZero() {
res.Timestamp = time.Now().UTC()
}
return res
}
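
A similar sketch, not part of the commit, for the dispatcher's own fallback: when Timeout is zero, Run caps the probe at 10 seconds via the context it hands to the prober. The TCP prober needs only a host:port target; the ID and target below are placeholders.

package checks

import (
    "context"
    "fmt"

    "github.com/jasper/quptime/internal/config"
)

// runTCPOnce is a hypothetical helper: no Timeout is set, so the dial
// is bounded only by Run's 10-second default deadline.
func runTCPOnce() {
    c := &config.Check{
        ID:     "db-port",          // placeholder
        Type:   config.CheckTCP,
        Target: "db.internal:5432", // placeholder host:port
    }
    fmt.Println(Run(context.Background(), c).OK)
}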
@@ -0,0 +1,147 @@
package checks
import (
"context"
"sync"
"time"
"github.com/jasper/quptime/internal/config"
)
// ReconcileInterval is how often the scheduler reconciles its set of
// running probes against cluster.yaml.
const ReconcileInterval = 5 * time.Second
// Sink is the abstraction the scheduler uses to report results.
// Implemented by the daemon: results go straight to the local
// aggregator when this node is the master, otherwise they ship to
// the master over the RPC channel.
type Sink interface {
Submit(Result)
}
// Scheduler keeps a goroutine alive per configured check. On each
// reconcile pass it starts probes for new checks, stops probes for
// removed checks, and restarts probes whose definition changed (see
// sameCheck).
type Scheduler struct {
cluster *config.ClusterConfig
sink Sink
mu sync.Mutex
running map[string]*probeWorker
}
type probeWorker struct {
check config.Check
cancel context.CancelFunc
}
// NewScheduler creates a scheduler bound to the given cluster config.
func NewScheduler(cluster *config.ClusterConfig, sink Sink) *Scheduler {
return &Scheduler{
cluster: cluster,
sink: sink,
running: map[string]*probeWorker{},
}
}
// Start runs the reconcile loop until ctx is cancelled. Reconcile is
// also called immediately on entry so checks start without waiting
// for the first tick.
func (s *Scheduler) Start(ctx context.Context) {
s.reconcile(ctx)
t := time.NewTicker(ReconcileInterval)
defer t.Stop()
for {
select {
case <-ctx.Done():
s.stopAll()
return
case <-t.C:
s.reconcile(ctx)
}
}
}
func (s *Scheduler) reconcile(ctx context.Context) {
snap := s.cluster.Snapshot()
want := map[string]config.Check{}
for _, c := range snap.Checks {
if c.ID == "" {
continue
}
want[c.ID] = c
}
s.mu.Lock()
defer s.mu.Unlock()
for id, w := range s.running {
desired, stillThere := want[id]
if !stillThere || !sameCheck(desired, w.check) {
w.cancel()
delete(s.running, id)
}
}
for id, c := range want {
if _, exists := s.running[id]; exists {
continue
}
wctx, cancel := context.WithCancel(ctx)
s.running[id] = &probeWorker{check: c, cancel: cancel}
go s.run(wctx, c)
}
}
func (s *Scheduler) run(ctx context.Context, c config.Check) {
interval := c.Interval
if interval <= 0 {
interval = 30 * time.Second
}
// stagger startup so a freshly-loaded scheduler doesn't burst
// hundreds of probes simultaneously.
jitter := time.Duration(int64(interval) / 10)
if jitter > 0 {
select {
case <-ctx.Done():
return
case <-time.After(time.Duration(time.Now().UnixNano() % int64(jitter))):
}
}
t := time.NewTicker(interval)
defer t.Stop()
// fire one immediate probe so state populates without delay.
s.sink.Submit(Run(ctx, &c))
for {
select {
case <-ctx.Done():
return
case <-t.C:
s.sink.Submit(Run(ctx, &c))
}
}
}
func (s *Scheduler) stopAll() {
s.mu.Lock()
for id, w := range s.running {
w.cancel()
delete(s.running, id)
}
s.mu.Unlock()
}
// sameCheck returns true when two Check structs would produce
// identical probing behaviour, so the scheduler can leave the worker
// running across a no-op config push.
func sameCheck(a, b config.Check) bool {
return a.ID == b.ID &&
a.Type == b.Type &&
a.Target == b.Target &&
a.Interval == b.Interval &&
a.Timeout == b.Timeout &&
a.ExpectStatus == b.ExpectStatus &&
a.BodyMatch == b.BodyMatch
}
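
The Sink comment above describes a split this commit does not yet implement: submit locally when master, over RPC otherwise. Below is a hedged sketch of the master-side half only; localSink, runMaster, and nodeID are hypothetical names, and the non-master (RPC) path is omitted because its transport API is not part of this commit.

package checks

import (
    "context"

    "github.com/jasper/quptime/internal/config"
)

// localSink is a hypothetical adapter: the Aggregator wants a node ID
// with every Result, the Sink interface does not carry one, so the
// daemon closes over its own ID.
type localSink struct {
    nodeID string
    agg    *Aggregator
}

func (s localSink) Submit(r Result) { s.agg.Submit(s.nodeID, r) }

// runMaster sketches the master wiring: one aggregator fed by the
// local scheduler. Non-master nodes would substitute a Sink that
// forwards results over the inter-node transport instead.
func runMaster(ctx context.Context, cluster *config.ClusterConfig, nodeID string, fn TransitionFn) {
    agg := NewAggregator(cluster, fn)
    sched := NewScheduler(cluster, localSink{nodeID: nodeID, agg: agg})
    sched.Start(ctx) // blocks until ctx is cancelled
}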
@@ -0,0 +1,22 @@
package checks
import (
"context"
"net"
"time"
"github.com/jasper/quptime/internal/config"
)
type tcpProber struct{}
func (tcpProber) Probe(ctx context.Context, c *config.Check) Result {
start := time.Now()
d := net.Dialer{Timeout: c.Timeout}
conn, err := d.DialContext(ctx, "tcp", c.Target)
if err != nil {
return Result{OK: false, Detail: err.Error(), Latency: time.Since(start)}
}
_ = conn.Close()
return Result{OK: true, Latency: time.Since(start)}
}