Initial structure
@@ -0,0 +1,383 @@
package daemon

import (
	"context"
	"encoding/binary"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"path/filepath"
	"sync"

	"github.com/jasper/quptime/internal/config"
	"github.com/jasper/quptime/internal/crypto"
	"github.com/jasper/quptime/internal/transport"
	"github.com/jasper/quptime/internal/trust"
)

// controlMaxFrame caps unix-socket request/response frames. Generous
// because cluster.yaml snapshots travel over this channel too.
const controlMaxFrame = 16 * 1024 * 1024

// Control method names. Defined as constants so the CLI side cannot
// drift out of sync with the daemon.
const (
	CtrlStatus      = "status"
	CtrlMutate      = "mutate"
	CtrlNodeProbe   = "node.probe"
	CtrlNodeAdd     = "node.add"
	CtrlNodeRemove  = "node.remove"
	CtrlTrustList   = "trust.list"
	CtrlTrustRemove = "trust.remove"
	CtrlAlertTest   = "alert.test"
)

// CtrlRequest is the wire envelope for a CLI ↔ daemon message.
type CtrlRequest struct {
	Method string          `json:"method"`
	Body   json.RawMessage `json:"body,omitempty"`
}

// CtrlResponse carries either an error or a result body.
type CtrlResponse struct {
	Error string          `json:"error,omitempty"`
	Body  json.RawMessage `json:"body,omitempty"`
}
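
// Illustrative wire format (not an addition to the API): a trust.remove
// exchange, with "node-b" standing in for a real node ID, travels as JSON
// inside the length-prefixed frames defined at the bottom of this file:
//
//	request:  {"method":"trust.remove","body":{"node_id":"node-b"}}
//	response: {"body":{"removed":true}}   or   {"error":"<message>"}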

// MutateBody is the payload of CtrlMutate.
type MutateBody struct {
	Kind    transport.MutationKind `json:"kind"`
	Payload json.RawMessage        `json:"payload"`
}

// MutateResult reports the new cluster version after a successful
// mutation.
type MutateResult struct {
	Version uint64 `json:"version"`
}

// NodeProbeBody is the payload of CtrlNodeProbe.
type NodeProbeBody struct {
	Address string `json:"address"`
}

// NodeProbeResult lets the CLI show the operator what they're about
// to trust.
type NodeProbeResult struct {
	NodeID      string `json:"node_id"`
	Fingerprint string `json:"fingerprint"`
	CertPEM     string `json:"cert_pem"`
}

// NodeAddBody captures everything the daemon needs once the operator
// has confirmed the fingerprint.
type NodeAddBody struct {
	Address     string `json:"address"`
	Fingerprint string `json:"fingerprint"`
}

// NodeAddResult is returned when a peer has been trusted, joined, and
// added to the cluster config.
type NodeAddResult struct {
	NodeID  string `json:"node_id"`
	Version uint64 `json:"version"`
}

// AlertTestBody is the payload of CtrlAlertTest.
type AlertTestBody struct {
	AlertID string `json:"alert_id"`
}

// NodeRemoveBody is the payload of both CtrlNodeRemove and
// CtrlTrustRemove; the two methods share the same shape.
type NodeRemoveBody struct {
	NodeID string `json:"node_id"`
}

// controlServer accepts CLI commands over a unix socket.
type controlServer struct {
	d *Daemon

	mu    sync.Mutex
	ln    net.Listener
	conns map[net.Conn]struct{}
}

func newControlServer(d *Daemon) *controlServer {
	return &controlServer{d: d, conns: map[net.Conn]struct{}{}}
}

// Serve binds the unix socket and dispatches commands until ctx is
// cancelled.
func (c *controlServer) Serve(ctx context.Context) error {
	sockPath := config.SocketPath()
	if err := os.MkdirAll(filepath.Dir(sockPath), 0o700); err != nil {
		return fmt.Errorf("control socket dir: %w", err)
	}
	// stale socket from a previous crash — unlink before binding
	if fi, err := os.Stat(sockPath); err == nil && fi.Mode()&os.ModeSocket != 0 {
		_ = os.Remove(sockPath)
	}
	ln, err := net.Listen("unix", sockPath)
	if err != nil {
		return fmt.Errorf("listen %s: %w", sockPath, err)
	}
	if err := os.Chmod(sockPath, 0o600); err != nil {
		_ = ln.Close()
		return fmt.Errorf("chmod %s: %w", sockPath, err)
	}
	c.mu.Lock()
	c.ln = ln
	c.mu.Unlock()

	go func() {
		<-ctx.Done()
		_ = ln.Close()
	}()

	for {
		conn, err := ln.Accept()
		if err != nil {
			if errors.Is(err, net.ErrClosed) {
				return nil
			}
			return err
		}
		go c.handleConn(ctx, conn)
	}
}

// Stop closes the listener and any in-flight connections.
func (c *controlServer) Stop() {
	c.mu.Lock()
	if c.ln != nil {
		_ = c.ln.Close()
	}
	for cn := range c.conns {
		_ = cn.Close()
	}
	c.conns = map[net.Conn]struct{}{}
	c.mu.Unlock()
}

func (c *controlServer) handleConn(ctx context.Context, conn net.Conn) {
	c.mu.Lock()
	c.conns[conn] = struct{}{}
	c.mu.Unlock()
	defer func() {
		c.mu.Lock()
		delete(c.conns, conn)
		c.mu.Unlock()
		_ = conn.Close()
	}()

	body, err := readCtrlFrame(conn)
	if err != nil {
		return
	}
	var req CtrlRequest
	if err := json.Unmarshal(body, &req); err != nil {
		_ = writeCtrlResponse(conn, CtrlResponse{Error: "decode: " + err.Error()})
		return
	}
	resp := c.dispatch(ctx, req)
	_ = writeCtrlResponse(conn, resp)
}

func (c *controlServer) dispatch(ctx context.Context, req CtrlRequest) CtrlResponse {
	switch req.Method {
	case CtrlStatus:
		return ok(c.d.buildStatus())

	case CtrlMutate:
		var body MutateBody
		if err := json.Unmarshal(req.Body, &body); err != nil {
			return fail(err)
		}
		ver, err := c.d.replicator.LocalMutate(ctx, body.Kind, body.Payload)
		if err != nil {
			return fail(err)
		}
		return ok(MutateResult{Version: ver})

	case CtrlNodeProbe:
		var body NodeProbeBody
		if err := json.Unmarshal(req.Body, &body); err != nil {
			return fail(err)
		}
		sample, err := transport.FetchPeerCert(ctx, c.d.assets, body.Address)
		if err != nil {
			return fail(err)
		}
		return ok(NodeProbeResult{
			NodeID:      sample.Cert.Subject.CommonName,
			Fingerprint: sample.Fingerprint,
			CertPEM:     string(sample.CertPEM),
		})

	case CtrlNodeAdd:
		var body NodeAddBody
		if err := json.Unmarshal(req.Body, &body); err != nil {
			return fail(err)
		}
		result, err := c.d.nodeAdd(ctx, body)
		if err != nil {
			return fail(err)
		}
		return ok(result)

	case CtrlNodeRemove:
		var body NodeRemoveBody
		if err := json.Unmarshal(req.Body, &body); err != nil {
			return fail(err)
		}
		ver, err := c.d.replicator.LocalMutate(ctx, transport.MutationRemovePeer, body.NodeID)
		if err != nil {
			return fail(err)
		}
		if _, err := c.d.trust.Remove(body.NodeID); err != nil {
			return fail(err)
		}
		return ok(MutateResult{Version: ver})

	case CtrlTrustList:
		return ok(c.d.trust.List())

	case CtrlTrustRemove:
		var body NodeRemoveBody
		if err := json.Unmarshal(req.Body, &body); err != nil {
			return fail(err)
		}
		removed, err := c.d.trust.Remove(body.NodeID)
		if err != nil {
			return fail(err)
		}
		return ok(map[string]bool{"removed": removed})

	case CtrlAlertTest:
		var body AlertTestBody
		if err := json.Unmarshal(req.Body, &body); err != nil {
			return fail(err)
		}
		if err := c.d.dispatcher.Test(body.AlertID); err != nil {
			return fail(err)
		}
		return ok(map[string]string{"status": "sent"})

	default:
		return CtrlResponse{Error: "unknown method: " + req.Method}
	}
}

// nodeAdd is the daemon-side TOFU completion path: it probes the peer
// for its current cert (verifying the fingerprint matches what the
// operator approved), records a trust entry, swaps trust with the
// peer via the Join RPC, and finally proposes a cluster mutation to
// list the peer in cluster.yaml.
func (d *Daemon) nodeAdd(ctx context.Context, body NodeAddBody) (NodeAddResult, error) {
	sample, err := transport.FetchPeerCert(ctx, d.assets, body.Address)
	if err != nil {
		return NodeAddResult{}, fmt.Errorf("re-probe: %w", err)
	}
	if sample.Fingerprint != body.Fingerprint {
		return NodeAddResult{}, fmt.Errorf("fingerprint changed since probe: got %s want %s",
			sample.Fingerprint, body.Fingerprint)
	}
	peerID := sample.Cert.Subject.CommonName
	if peerID == "" {
		return NodeAddResult{}, errors.New("peer cert has no CommonName / NodeID")
	}

	if err := d.trust.Add(trust.Entry{
		NodeID:      peerID,
		Address:     body.Address,
		Fingerprint: sample.Fingerprint,
		CertPEM:     string(sample.CertPEM),
	}); err != nil {
		return NodeAddResult{}, fmt.Errorf("trust add: %w", err)
	}

	// Ask the peer to record us symmetrically.
	myFP, err := crypto.FingerprintFromCertPEM(d.assets.Cert)
	if err != nil {
		return NodeAddResult{}, fmt.Errorf("own fingerprint: %w", err)
	}
	joinReq := transport.JoinRequest{
		NodeID:      d.node.NodeID,
		Advertise:   d.node.AdvertiseAddr(),
		Fingerprint: myFP,
		CertPEM:     string(d.assets.Cert),
	}
	var joinResp transport.JoinResponse
	if err := d.client.Call(ctx, peerID, body.Address, transport.MethodJoin, joinReq, &joinResp); err != nil {
		return NodeAddResult{}, fmt.Errorf("join %s: %w", peerID, err)
	}
	if !joinResp.Accepted {
		return NodeAddResult{}, fmt.Errorf("peer rejected join: %s", joinResp.Error)
	}

	// Propose the cluster-config addition. Routed to master via the
	// replicator; if we are the master, applied directly.
	peerInfo := config.PeerInfo{
		NodeID:      peerID,
		Advertise:   body.Address,
		Fingerprint: sample.Fingerprint,
	}
	ver, err := d.replicator.LocalMutate(ctx, transport.MutationAddPeer, peerInfo)
	if err != nil {
		return NodeAddResult{}, fmt.Errorf("propose peer: %w", err)
	}
	return NodeAddResult{NodeID: peerID, Version: ver}, nil
}
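
// From the CLI side this is a two-step flow over the control socket
// (illustrative payloads; the address, fingerprint and node IDs below
// are example placeholders, not values from this commit):
//
//	1. {"method":"node.probe","body":{"address":"10.0.0.7:7443"}}
//	   -> operator inspects node_id and fingerprint in the response
//	2. {"method":"node.add","body":{"address":"10.0.0.7:7443","fingerprint":"<confirmed fingerprint>"}}
//	   -> {"body":{"node_id":"<peer id>","version":<new cluster version>}}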

func ok(v any) CtrlResponse {
	raw, err := json.Marshal(v)
	if err != nil {
		return CtrlResponse{Error: err.Error()}
	}
	return CtrlResponse{Body: raw}
}

func fail(err error) CtrlResponse {
	return CtrlResponse{Error: err.Error()}
}

func writeCtrlResponse(w io.Writer, resp CtrlResponse) error {
	body, err := json.Marshal(resp)
	if err != nil {
		return err
	}
	return writeCtrlFrame(w, body)
}

func writeCtrlFrame(w io.Writer, body []byte) error {
	if len(body) > controlMaxFrame {
		return errors.New("control frame too large")
	}
	var hdr [4]byte
	binary.BigEndian.PutUint32(hdr[:], uint32(len(body)))
	if _, err := w.Write(hdr[:]); err != nil {
		return err
	}
	_, err := w.Write(body)
	return err
}

func readCtrlFrame(r io.Reader) ([]byte, error) {
	var hdr [4]byte
	if _, err := io.ReadFull(r, hdr[:]); err != nil {
		return nil, err
	}
	n := binary.BigEndian.Uint32(hdr[:])
	if n > controlMaxFrame {
		return nil, errors.New("control frame too large")
	}
	buf := make([]byte, n)
	if _, err := io.ReadFull(r, buf); err != nil {
		return nil, err
	}
	return buf, nil
}
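
For context, here is a minimal sketch of the CLI side of this protocol. It is not part of the commit: the helper name ctrlCall is made up, and it assumes the CLI can reuse config.SocketPath() plus the framing helpers and envelope types shown above (a real CLI would need exported equivalents).

// ctrlCall is an illustrative client-side round trip over the control
// socket: one length-prefixed request frame out, one response frame back.
func ctrlCall(method string, body any) (CtrlResponse, error) {
	var resp CtrlResponse
	var raw json.RawMessage
	if body != nil {
		b, err := json.Marshal(body)
		if err != nil {
			return resp, err
		}
		raw = b
	}
	conn, err := net.Dial("unix", config.SocketPath())
	if err != nil {
		return resp, err
	}
	defer conn.Close()

	req, err := json.Marshal(CtrlRequest{Method: method, Body: raw})
	if err != nil {
		return resp, err
	}
	if err := writeCtrlFrame(conn, req); err != nil {
		return resp, err
	}
	frame, err := readCtrlFrame(conn)
	if err != nil {
		return resp, err
	}
	if err := json.Unmarshal(frame, &resp); err != nil {
		return resp, err
	}
	if resp.Error != "" {
		return resp, errors.New(resp.Error)
	}
	return resp, nil
}

A status query would then be ctrlCall(CtrlStatus, nil), and a probe ctrlCall(CtrlNodeProbe, NodeProbeBody{Address: addr}).
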
@@ -0,0 +1,223 @@
// Package daemon ties every long-running component together.
//
// Lifecycle
//
//   - Load node identity, cluster config, trust store, and key material.
//   - Build a transport.Client + transport.Server, share TLS assets.
//   - Construct the quorum manager, replicator, aggregator and alert
//     dispatcher; wire transport handlers; wire the version observer
//     to the replicator's pull path; gate alert dispatch on
//     "I am the master".
//   - Start the inter-node listener, the local unix-socket control
//     plane, the heartbeat loop and the check scheduler.
//   - On ctx cancel, gracefully tear everything down.
package daemon

import (
	"context"
	"errors"
	"fmt"
	"log"
	"os"
	"sync"
	"time"

	"github.com/jasper/quptime/internal/alerts"
	"github.com/jasper/quptime/internal/checks"
	"github.com/jasper/quptime/internal/config"
	"github.com/jasper/quptime/internal/crypto"
	"github.com/jasper/quptime/internal/quorum"
	"github.com/jasper/quptime/internal/replicate"
	"github.com/jasper/quptime/internal/transport"
	"github.com/jasper/quptime/internal/trust"
)

// Daemon is the live process: every long-running component lives here.
type Daemon struct {
	logger *log.Logger

	node    *config.NodeConfig
	cluster *config.ClusterConfig
	trust   *trust.Store

	assets *transport.TLSAssets
	client *transport.Client
	server *transport.Server

	quorum     *quorum.Manager
	replicator *replicate.Replicator
	aggregator *checks.Aggregator
	dispatcher *alerts.Dispatcher
	scheduler  *checks.Scheduler

	control *controlServer
	wg      sync.WaitGroup
}

// New loads every persistent piece of state and assembles the daemon.
// It does not start any goroutines.
func New(logger *log.Logger) (*Daemon, error) {
	if logger == nil {
		logger = log.New(os.Stderr, "quptime: ", log.LstdFlags|log.Lmsgprefix)
	}

	node, err := config.LoadNodeConfig()
	if err != nil {
		return nil, fmt.Errorf("load node.yaml: %w", err)
	}
	if node.NodeID == "" {
		return nil, errors.New("node.yaml has empty node_id — run `qu init` first")
	}

	cluster, err := config.LoadClusterConfig()
	if err != nil {
		return nil, fmt.Errorf("load cluster.yaml: %w", err)
	}

	store, err := trust.Load()
	if err != nil {
		return nil, fmt.Errorf("load trust.yaml: %w", err)
	}

	priv, err := crypto.LoadPrivateKey()
	if err != nil {
		return nil, fmt.Errorf("load private key: %w", err)
	}
	certPEM, err := crypto.LoadCertPEM()
	if err != nil {
		return nil, fmt.Errorf("load cert: %w", err)
	}

	assets := &transport.TLSAssets{Cert: certPEM, Key: priv, Trust: store}
	client := transport.NewClient(assets)
	server := transport.NewServer(assets)

	d := &Daemon{
		logger:  logger,
		node:    node,
		cluster: cluster,
		trust:   store,
		assets:  assets,
		client:  client,
		server:  server,
	}

	d.quorum = quorum.New(node.NodeID, cluster, client)
	d.replicator = replicate.New(node.NodeID, cluster, client, d.quorum)
	d.aggregator = checks.NewAggregator(cluster, nil)
	d.dispatcher = alerts.New(cluster, node.NodeID, logger)

	d.aggregator.SetTransition(func(check *config.Check, from, to checks.State, snap checks.Snapshot) {
		if !d.quorum.IsMaster() {
			return
		}
		d.dispatcher.OnTransition(check, from, to, snap)
	})

	d.quorum.SetVersionObserver(func(peerID, peerAddr string, peerVer uint64) {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		if err := d.replicator.PullFrom(ctx, peerID, peerAddr); err != nil {
			d.logger.Printf("replicate: pull from %s: %v", peerID, err)
		}
	})

	d.scheduler = checks.NewScheduler(cluster, &sink{d: d})
	d.control = newControlServer(d)
	d.registerHandlers()
	return d, nil
}

// Run binds the inter-node listener and the local control socket,
// starts the quorum loop and the scheduler, and blocks until ctx is
// cancelled.
func (d *Daemon) Run(ctx context.Context) error {
	addr := fmt.Sprintf("%s:%d", d.node.BindAddr, d.node.BindPort)
	d.logger.Printf("listening on %s as node %s", addr, d.node.NodeID)

	servErr := make(chan error, 1)
	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		servErr <- d.server.Serve(ctx, addr)
	}()

	ctrlErr := make(chan error, 1)
	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		ctrlErr <- d.control.Serve(ctx)
	}()

	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		d.quorum.Start(ctx)
	}()

	d.wg.Add(1)
	go func() {
		defer d.wg.Done()
		d.scheduler.Start(ctx)
	}()

	select {
	case <-ctx.Done():
	case err := <-servErr:
		if err != nil {
			d.logger.Printf("transport server exited: %v", err)
		}
	case err := <-ctrlErr:
		if err != nil {
			d.logger.Printf("control server exited: %v", err)
		}
	}

	d.server.Stop()
	d.control.Stop()
	d.client.Close()
	d.wg.Wait()
	return nil
}

// sink routes scheduled probe results either into the local
// aggregator (when self is master) or to the current master over
// RPC. Implements checks.Sink.
type sink struct{ d *Daemon }

func (s *sink) Submit(r checks.Result) {
	if s.d.quorum.IsMaster() {
		s.d.aggregator.Submit(s.d.node.NodeID, r)
		return
	}
	masterID := s.d.quorum.Master()
	if masterID == "" {
		return // no master right now — drop; we'll probe again next interval
	}
	addr := s.d.addressOf(masterID)
	if addr == "" {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	req := transport.ReportResultRequest{
		FromNodeID: s.d.node.NodeID,
		CheckID:    r.CheckID,
		OK:         r.OK,
		Detail:     r.Detail,
		LatencyMS:  r.Latency.Milliseconds(),
		At:         r.Timestamp,
	}
	if err := s.d.client.Call(ctx, masterID, addr, transport.MethodReportResult, req, nil); err != nil {
		s.d.logger.Printf("report to master %s: %v", masterID, err)
	}
}

func (d *Daemon) addressOf(nodeID string) string {
	for _, p := range d.cluster.Snapshot().Peers {
		if p.NodeID == nodeID {
			return p.Advertise
		}
	}
	return ""
}
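
For orientation, a minimal sketch of how a main package might drive this lifecycle. The import path of the daemon package and the signal handling are assumptions, not part of this commit.

package main

import (
	"context"
	"log"
	"os"
	"os/signal"
	"syscall"

	"github.com/jasper/quptime/internal/daemon" // assumed location of this package
)

func main() {
	// Cancel the daemon's context on SIGINT/SIGTERM so Run can tear
	// everything down gracefully.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	d, err := daemon.New(nil) // nil logger: New installs a stderr default
	if err != nil {
		log.Fatalf("quptime: %v", err)
	}
	if err := d.Run(ctx); err != nil {
		log.Fatalf("quptime: %v", err)
	}
}
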
@@ -0,0 +1,150 @@
package daemon

import (
	"context"
	"encoding/json"
	"time"

	"github.com/jasper/quptime/internal/checks"
	"github.com/jasper/quptime/internal/crypto"
	"github.com/jasper/quptime/internal/transport"
	"github.com/jasper/quptime/internal/trust"
)

// registerHandlers wires every inter-node RPC method that the daemon
// understands onto the transport server. Each method delegates to the
// owning subsystem (quorum, replicator, etc.) so this file stays a
// thin dispatch table.
func (d *Daemon) registerHandlers() {
	d.server.Handle(transport.MethodPing, func(_ context.Context, _ string, _ json.RawMessage) (any, error) {
		return transport.PingResponse{NodeID: d.node.NodeID, Now: time.Now().UTC()}, nil
	})

	d.server.Handle(transport.MethodWhoAmI, func(_ context.Context, _ string, _ json.RawMessage) (any, error) {
		fp, err := crypto.FingerprintFromCertPEM(d.assets.Cert)
		if err != nil {
			return nil, err
		}
		return transport.WhoAmIResponse{
			NodeID:      d.node.NodeID,
			Advertise:   d.node.AdvertiseAddr(),
			Fingerprint: fp,
			CertPEM:     string(d.assets.Cert),
		}, nil
	})

	d.server.Handle(transport.MethodJoin, func(_ context.Context, _ string, raw json.RawMessage) (any, error) {
		var req transport.JoinRequest
		if err := json.Unmarshal(raw, &req); err != nil {
			return transport.JoinResponse{Error: err.Error()}, nil
		}
		fp, err := crypto.FingerprintFromCertPEM([]byte(req.CertPEM))
		if err != nil {
			return transport.JoinResponse{Error: "parse cert: " + err.Error()}, nil
		}
		if fp != req.Fingerprint {
			return transport.JoinResponse{Error: "fingerprint mismatch"}, nil
		}
		// Outbound join (the proposing node already accepted our cert
		// out of band). Symmetric trust is required for mTLS to work,
		// so we accept the join automatically. Operators who need
		// stricter onboarding can disable the listener and use the
		// CLI flow exclusively.
		if err := d.trust.Add(trust.Entry{
			NodeID:      req.NodeID,
			Address:     req.Advertise,
			Fingerprint: req.Fingerprint,
			CertPEM:     req.CertPEM,
		}); err != nil {
			return transport.JoinResponse{Error: err.Error()}, nil
		}
		return transport.JoinResponse{Accepted: true}, nil
	})

	d.server.Handle(transport.MethodHeartbeat, func(_ context.Context, _ string, raw json.RawMessage) (any, error) {
		var req transport.HeartbeatRequest
		if err := json.Unmarshal(raw, &req); err != nil {
			return nil, err
		}
		return d.quorum.HandleHeartbeat(req), nil
	})

	d.server.Handle(transport.MethodGetClusterCfg, func(_ context.Context, _ string, _ json.RawMessage) (any, error) {
		return d.replicator.HandleGetClusterCfg(), nil
	})

	d.server.Handle(transport.MethodApplyClusterCfg, func(_ context.Context, _ string, raw json.RawMessage) (any, error) {
		var req transport.ApplyClusterCfgRequest
		if err := json.Unmarshal(raw, &req); err != nil {
			return nil, err
		}
		return d.replicator.HandleApplyClusterCfg(req), nil
	})

	d.server.Handle(transport.MethodProposeMutation, func(ctx context.Context, _ string, raw json.RawMessage) (any, error) {
		var req transport.ProposeMutationRequest
		if err := json.Unmarshal(raw, &req); err != nil {
			return nil, err
		}
		return d.replicator.HandleProposeMutation(ctx, req), nil
	})

	d.server.Handle(transport.MethodReportResult, func(_ context.Context, _ string, raw json.RawMessage) (any, error) {
		var req transport.ReportResultRequest
		if err := json.Unmarshal(raw, &req); err != nil {
			return nil, err
		}
		res := checks.Result{
			CheckID:   req.CheckID,
			OK:        req.OK,
			Detail:    req.Detail,
			Latency:   time.Duration(req.LatencyMS) * time.Millisecond,
			Timestamp: req.At,
		}
		d.aggregator.Submit(req.FromNodeID, res)
		return transport.ReportResultResponse{}, nil
	})

	d.server.Handle(transport.MethodStatus, func(_ context.Context, _ string, _ json.RawMessage) (any, error) {
		return d.buildStatus(), nil
	})
}

// buildStatus is shared by both the inter-node Status RPC handler and
// the local control plane's "status" command.
func (d *Daemon) buildStatus() transport.StatusResponse {
	snap := d.cluster.Snapshot()
	liveness := d.quorum.Liveness()
	live := map[string]bool{}
	for _, id := range d.quorum.LiveSet() {
		live[id] = true
	}

	out := transport.StatusResponse{
		NodeID:     d.node.NodeID,
		Term:       d.quorum.Term(),
		MasterID:   d.quorum.Master(),
		Version:    snap.Version,
		HasQuorum:  d.quorum.HasQuorum(),
		QuorumSize: snap.QuorumSize(),
	}
	for _, p := range snap.Peers {
		out.Peers = append(out.Peers, transport.PeerLiveness{
			NodeID:    p.NodeID,
			Advertise: p.Advertise,
			Live:      live[p.NodeID],
			LastSeen:  liveness[p.NodeID],
		})
	}
	for _, c := range snap.Checks {
		cs := transport.CheckSnapshot{CheckID: c.ID, Name: c.Name, State: "unknown"}
		if agg, ok := d.aggregator.SnapshotFor(c.ID); ok {
			cs.State = string(agg.State)
			cs.OKCount = agg.OKCount
			cs.Total = agg.Reports
			cs.Detail = agg.Detail
		}
		out.Checks = append(out.Checks, cs)
	}
	return out
}