Initial structure
@@ -0,0 +1,253 @@
package checks

import (
	"sync"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// State is the aggregate verdict on one check.
type State string

const (
	StateUnknown State = "unknown"
	StateUp      State = "up"
	StateDown    State = "down"
)

// HysteresisCount is how many consecutive evaluations a candidate
// state must hold before becoming the new committed state. 2 matches
// the design doc ("down for ≥2 consecutive aggregate evaluations").
const HysteresisCount = 2

// TransitionFn is fired when a check's committed state flips. The
// alert dispatcher is the production consumer.
type TransitionFn func(check *config.Check, from, to State, snap Snapshot)

// Snapshot summarises one check's current aggregate state.
type Snapshot struct {
	CheckID  string
	State    State
	Reports  int // number of fresh per-node results
	OKCount  int // number reporting OK
	NotOK    int // number reporting not-OK
	Detail   string
	UpdateAt time.Time
}

// Aggregator runs on the master only. Other nodes ship their probe
// results to it and it decides the cluster-wide truth.
type Aggregator struct {
	cluster    *config.ClusterConfig
	transition TransitionFn

	mu       sync.Mutex
	perCheck map[string]*checkState
}

type checkState struct {
	// nodeID → most recent result we've received from that node.
	latest map[string]nodeResult

	committed   State // last announced state (held through hysteresis)
	candidate   State // state being considered for promotion
	consecutive int   // ticks the candidate has persisted
}

type nodeResult struct {
	OK     bool
	Detail string
	At     time.Time
}

// NewAggregator returns an empty aggregator. fn may be nil during
// startup; SetTransition can wire it later.
func NewAggregator(cluster *config.ClusterConfig, fn TransitionFn) *Aggregator {
	return &Aggregator{
		cluster:    cluster,
		transition: fn,
		perCheck:   map[string]*checkState{},
	}
}

// SetTransition wires (or rewires) the transition callback.
func (a *Aggregator) SetTransition(fn TransitionFn) {
	a.mu.Lock()
	a.transition = fn
	a.mu.Unlock()
}

// Submit records one node's result for one check and immediately
// re-evaluates that check's aggregate state.
func (a *Aggregator) Submit(nodeID string, r Result) {
	if r.CheckID == "" {
		return
	}
	a.mu.Lock()
	st, ok := a.perCheck[r.CheckID]
	if !ok {
		st = &checkState{
			latest:    map[string]nodeResult{},
			committed: StateUnknown,
			candidate: StateUnknown,
		}
		a.perCheck[r.CheckID] = st
	}
	st.latest[nodeID] = nodeResult{OK: r.OK, Detail: r.Detail, At: r.Timestamp}
	a.mu.Unlock()
	a.evaluate(r.CheckID)
}

// SnapshotAll returns the current aggregate view of every known check.
func (a *Aggregator) SnapshotAll() map[string]Snapshot {
	a.mu.Lock()
	defer a.mu.Unlock()
	out := make(map[string]Snapshot, len(a.perCheck))
	for id, st := range a.perCheck {
		out[id] = a.snapshotLocked(id, st)
	}
	return out
}

// SnapshotFor returns the aggregate for a single check.
func (a *Aggregator) SnapshotFor(checkID string) (Snapshot, bool) {
	a.mu.Lock()
	defer a.mu.Unlock()
	st, ok := a.perCheck[checkID]
	if !ok {
		return Snapshot{}, false
	}
	return a.snapshotLocked(checkID, st), true
}

func (a *Aggregator) evaluate(checkID string) {
	check := a.lookupCheck(checkID)
	if check == nil {
		// Check was removed from cluster.yaml — drop its state so we
		// don't keep alerting on something the operator deleted.
		a.mu.Lock()
		delete(a.perCheck, checkID)
		a.mu.Unlock()
		return
	}

	a.mu.Lock()
	st, ok := a.perCheck[checkID]
	if !ok {
		a.mu.Unlock()
		return
	}

	freshWindow := freshWindowFor(check.Interval)
	cutoff := time.Now().Add(-freshWindow)

	var okCount, notOK int
	var lastDetail string
	for _, nr := range st.latest {
		if nr.At.Before(cutoff) {
			continue
		}
		if nr.OK {
			okCount++
		} else {
			notOK++
			if lastDetail == "" {
				lastDetail = nr.Detail
			}
		}
	}

	var candidate State
	switch {
	case okCount+notOK == 0:
		candidate = StateUnknown
	case notOK > okCount:
		candidate = StateDown
	default:
		candidate = StateUp
	}

	if candidate == st.candidate {
		st.consecutive++
	} else {
		st.candidate = candidate
		st.consecutive = 1
	}

	var fireFrom, fireTo State
	var fired bool
	if candidate != st.committed && st.consecutive >= HysteresisCount {
		fireFrom = st.committed
		fireTo = candidate
		st.committed = candidate
		fired = true
	}

	snap := a.snapshotLocked(checkID, st)
	fn := a.transition
	a.mu.Unlock()

	if fired && fn != nil {
		fn(check, fireFrom, fireTo, snap)
	}
}

func (a *Aggregator) snapshotLocked(checkID string, st *checkState) Snapshot {
	check := a.lookupCheck(checkID)
	freshWindow := 60 * time.Second
	if check != nil {
		freshWindow = freshWindowFor(check.Interval)
	}
	cutoff := time.Now().Add(-freshWindow)

	var okCount, notOK int
	var detail string
	for _, nr := range st.latest {
		if nr.At.Before(cutoff) {
			continue
		}
		if nr.OK {
			okCount++
		} else {
			notOK++
			if detail == "" {
				detail = nr.Detail
			}
		}
	}
	return Snapshot{
		CheckID:  checkID,
		State:    st.committed,
		Reports:  okCount + notOK,
		OKCount:  okCount,
		NotOK:    notOK,
		Detail:   detail,
		UpdateAt: time.Now().UTC(),
	}
}

func (a *Aggregator) lookupCheck(id string) *config.Check {
	snap := a.cluster.Snapshot()
	for i := range snap.Checks {
		if snap.Checks[i].ID == id {
			c := snap.Checks[i]
			return &c
		}
	}
	return nil
}

// freshWindowFor returns the staleness threshold for a check given
// its configured interval. Anything older than this is considered too
// stale to count.
func freshWindowFor(interval time.Duration) time.Duration {
	if interval <= 0 {
		return 60 * time.Second
	}
	w := interval * 3
	if w < 30*time.Second {
		w = 30 * time.Second
	}
	return w
}
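A minimal wiring sketch, not part of this commit: how a daemon might construct the master-side aggregator with a logging TransitionFn. It assumes a package that imports "log" plus the internal checks and config packages, and that a populated *config.ClusterConfig already exists (its loader is outside this diff); only NewAggregator, TransitionFn, and Snapshot from the file above are used.

// Illustrative only; newMasterAggregator is a hypothetical helper name.
func newMasterAggregator(cluster *config.ClusterConfig) *checks.Aggregator {
	return checks.NewAggregator(cluster, func(c *config.Check, from, to checks.State, snap checks.Snapshot) {
		log.Printf("check %s: %s -> %s (%d of %d fresh reports not OK): %s",
			c.ID, from, to, snap.NotOK, snap.Reports, snap.Detail)
	})
}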
@@ -0,0 +1,62 @@
package checks

import (
	"context"
	"crypto/tls"
	"io"
	"net/http"
	"strings"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// maxBodyRead is the cap on how much body a check will pull when
// BodyMatch is non-empty. Anything beyond is ignored.
const maxBodyRead = 1 << 20 // 1 MiB

type httpProber struct{}

func (httpProber) Probe(ctx context.Context, c *config.Check) Result {
	client := &http.Client{
		Timeout: c.Timeout,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
		},
	}
	start := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.Target, nil)
	if err != nil {
		return Result{OK: false, Detail: "build request: " + err.Error()}
	}
	req.Header.Set("User-Agent", "quptime/1.0")
	resp, err := client.Do(req)
	if err != nil {
		return Result{OK: false, Detail: err.Error(), Latency: time.Since(start)}
	}
	defer resp.Body.Close()

	expected := c.ExpectStatus
	if expected == 0 {
		expected = 200
	}
	if resp.StatusCode != expected {
		return Result{
			OK:      false,
			Detail:  "status " + resp.Status,
			Latency: time.Since(start),
		}
	}

	if c.BodyMatch != "" {
		body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyRead))
		if err != nil {
			return Result{OK: false, Detail: "read body: " + err.Error(), Latency: time.Since(start)}
		}
		if !strings.Contains(string(body), c.BodyMatch) {
			return Result{OK: false, Detail: "body match miss", Latency: time.Since(start)}
		}
	}

	return Result{OK: true, Latency: time.Since(start)}
}
@@ -0,0 +1,48 @@
package checks

import (
	"context"
	"runtime"
	"time"

	probing "github.com/prometheus-community/pro-bing"

	"github.com/jasper/quptime/internal/config"
)

type icmpProber struct{}

// Probe sends a single ICMP echo. On Linux we use unprivileged "udp"
// mode so the daemon does not require CAP_NET_RAW; on other platforms
// the prober uses privileged raw sockets.
func (icmpProber) Probe(ctx context.Context, c *config.Check) Result {
	start := time.Now()
	pinger, err := probing.NewPinger(c.Target)
	if err != nil {
		return Result{OK: false, Detail: "resolve: " + err.Error()}
	}
	pinger.Count = 1
	pinger.Timeout = c.Timeout
	if pinger.Timeout <= 0 {
		pinger.Timeout = 5 * time.Second
	}
	pinger.SetPrivileged(runtime.GOOS != "linux")

	doneCh := make(chan struct{})
	go func() {
		_ = pinger.RunWithContext(ctx)
		close(doneCh)
	}()
	select {
	case <-doneCh:
	case <-ctx.Done():
		pinger.Stop()
		return Result{OK: false, Detail: "timeout", Latency: time.Since(start)}
	}

	stats := pinger.Statistics()
	if stats.PacketsRecv == 0 {
		return Result{OK: false, Detail: "no reply", Latency: time.Since(start)}
	}
	return Result{OK: true, Latency: stats.AvgRtt}
}
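One operational consequence of the unprivileged mode above (a deployment note, not something this commit enforces): on Linux, "udp" ping only works when the daemon's group falls inside the net.ipv4.ping_group_range sysctl. A hedged startup-time sketch that warns when the range is empty, assuming the standard library imports log, os, runtime, strconv, and strings:

// Sketch only: warnIfUnprivilegedPingDisabled is a hypothetical helper.
// The /proc path and the "low high" range format are standard Linux.
func warnIfUnprivilegedPingDisabled() {
	if runtime.GOOS != "linux" {
		return
	}
	b, err := os.ReadFile("/proc/sys/net/ipv4/ping_group_range")
	if err != nil {
		return
	}
	f := strings.Fields(string(b))
	if len(f) != 2 {
		return
	}
	lo, err1 := strconv.Atoi(f[0])
	hi, err2 := strconv.Atoi(f[1])
	// An inverted range (the kernel default "1 0") means no group may open
	// unprivileged ICMP sockets, so icmp checks would always report "no reply".
	if err1 == nil && err2 == nil && lo > hi {
		log.Println("icmp checks: unprivileged ping disabled; widen net.ipv4.ping_group_range")
	}
}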
@@ -0,0 +1,65 @@
// Package checks runs the configured HTTP/TCP/ICMP probes on every
// node and aggregates per-check results on the master.
//
// Probers are the small "do one round of work" units. The Scheduler
// drives them on a per-check timer and ships each result back to the
// master via the inter-node transport. The Aggregator (master only)
// folds incoming results into per-check sliding windows and decides
// when a check has crossed UP↔DOWN.
package checks

import (
	"context"
	"fmt"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// Result is the outcome of a single probe.
type Result struct {
	CheckID   string
	OK        bool
	Detail    string
	Latency   time.Duration
	Timestamp time.Time
}

// Prober runs one probe of a configured check.
type Prober interface {
	Probe(ctx context.Context, c *config.Check) Result
}

// Run dispatches to the right Prober for the given check type. It
// returns an error result instead of failing when a check has an
// unknown type.
func Run(ctx context.Context, c *config.Check) Result {
	deadline := c.Timeout
	if deadline <= 0 {
		deadline = 10 * time.Second
	}
	pctx, cancel := context.WithTimeout(ctx, deadline)
	defer cancel()

	var p Prober
	switch c.Type {
	case config.CheckHTTP:
		p = httpProber{}
	case config.CheckTCP:
		p = tcpProber{}
	case config.CheckICMP:
		p = icmpProber{}
	default:
		return Result{
			CheckID:   c.ID,
			OK:        false,
			Detail:    fmt.Sprintf("unknown check type %q", c.Type),
			Timestamp: time.Now().UTC(),
		}
	}
	res := p.Probe(pctx, c)
	res.CheckID = c.ID
	if res.Timestamp.IsZero() {
		res.Timestamp = time.Now().UTC()
	}
	return res
}
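A quick usage sketch of Run, test-style inside this package. The field names come from the code above; the check ID and target URL are made up, and the YAML shape of cluster.yaml is not defined by this commit.

// Illustrative one-off probe.
c := &config.Check{
	ID:           "web-home",
	Type:         config.CheckHTTP,
	Target:       "https://example.com/healthz",
	Interval:     30 * time.Second,
	Timeout:      5 * time.Second,
	ExpectStatus: 200,
	BodyMatch:    "ok",
}
res := Run(context.Background(), c)
fmt.Printf("%s ok=%v latency=%s detail=%q\n", res.CheckID, res.OK, res.Latency, res.Detail)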
@@ -0,0 +1,147 @@
package checks

import (
	"context"
	"sync"
	"time"

	"github.com/jasper/quptime/internal/config"
)

// ReconcileInterval is how often the scheduler reconciles its set of
// running probes against cluster.yaml.
const ReconcileInterval = 5 * time.Second

// Sink is the abstraction the scheduler uses to report results.
// Implemented by the daemon: results go straight to the local
// aggregator when self is the master, otherwise they ship to the
// master over the RPC channel.
type Sink interface {
	Submit(Result)
}

// Scheduler keeps a goroutine alive per configured check. On each
// reconcile pass it starts probes for new checks, stops probes for
// removed checks, and restarts probes whose definition changed.
type Scheduler struct {
	cluster *config.ClusterConfig
	sink    Sink

	mu      sync.Mutex
	running map[string]*probeWorker
}

type probeWorker struct {
	check  config.Check
	cancel context.CancelFunc
}

// NewScheduler creates a scheduler bound to the given cluster config.
func NewScheduler(cluster *config.ClusterConfig, sink Sink) *Scheduler {
	return &Scheduler{
		cluster: cluster,
		sink:    sink,
		running: map[string]*probeWorker{},
	}
}

// Start runs the reconcile loop until ctx is cancelled. Reconcile is
// also called immediately on entry so checks start without waiting
// for the first tick.
func (s *Scheduler) Start(ctx context.Context) {
	s.reconcile(ctx)
	t := time.NewTicker(ReconcileInterval)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			s.stopAll()
			return
		case <-t.C:
			s.reconcile(ctx)
		}
	}
}

func (s *Scheduler) reconcile(ctx context.Context) {
	snap := s.cluster.Snapshot()
	want := map[string]config.Check{}
	for _, c := range snap.Checks {
		if c.ID == "" {
			continue
		}
		want[c.ID] = c
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	for id, w := range s.running {
		desired, stillThere := want[id]
		if !stillThere || !sameCheck(desired, w.check) {
			w.cancel()
			delete(s.running, id)
		}
	}
	for id, c := range want {
		if _, exists := s.running[id]; exists {
			continue
		}
		wctx, cancel := context.WithCancel(ctx)
		s.running[id] = &probeWorker{check: c, cancel: cancel}
		go s.run(wctx, c)
	}
}

func (s *Scheduler) run(ctx context.Context, c config.Check) {
	interval := c.Interval
	if interval <= 0 {
		interval = 30 * time.Second
	}
	// Stagger startup so a freshly-loaded scheduler doesn't burst
	// hundreds of probes simultaneously.
	jitter := time.Duration(int64(interval) / 10)
	if jitter > 0 {
		select {
		case <-ctx.Done():
			return
		case <-time.After(time.Duration(time.Now().UnixNano() % int64(jitter))):
		}
	}
	t := time.NewTicker(interval)
	defer t.Stop()

	// Fire one immediate probe so state populates without delay.
	s.sink.Submit(Run(ctx, &c))

	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			s.sink.Submit(Run(ctx, &c))
		}
	}
}

func (s *Scheduler) stopAll() {
	s.mu.Lock()
	for id, w := range s.running {
		w.cancel()
		delete(s.running, id)
	}
	s.mu.Unlock()
}

// sameCheck returns true when two Check structs would produce
// identical probing behaviour, so the scheduler can leave the worker
// running across a no-op config push.
func sameCheck(a, b config.Check) bool {
	return a.ID == b.ID &&
		a.Type == b.Type &&
		a.Target == b.Target &&
		a.Interval == b.Interval &&
		a.Timeout == b.Timeout &&
		a.ExpectStatus == b.ExpectStatus &&
		a.BodyMatch == b.BodyMatch
}
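A minimal Sink sketch for the master path described above (an assumption about how the daemon wires things, not part of this commit): the local Aggregator can be fed directly by adapting Aggregator.Submit, which also needs the submitting node's ID.

// Hypothetical adapter; the node ID would come from the daemon's own identity.
type localSink struct {
	nodeID string
	agg    *Aggregator
}

func (s localSink) Submit(r Result) { s.agg.Submit(s.nodeID, r) }

// Wiring on the master might then look like:
//	sched := NewScheduler(cluster, localSink{nodeID: "node-1", agg: agg})
//	go sched.Start(ctx)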
@@ -0,0 +1,22 @@
package checks

import (
	"context"
	"net"
	"time"

	"github.com/jasper/quptime/internal/config"
)

type tcpProber struct{}

func (tcpProber) Probe(ctx context.Context, c *config.Check) Result {
	start := time.Now()
	d := net.Dialer{Timeout: c.Timeout}
	conn, err := d.DialContext(ctx, "tcp", c.Target)
	if err != nil {
		return Result{OK: false, Detail: err.Error(), Latency: time.Since(start)}
	}
	_ = conn.Close()
	return Result{OK: true, Latency: time.Since(start)}
}