From c90ce244b019481be8bb13ee2d7b014433d655c5 Mon Sep 17 00:00:00 2001 From: Axodouble Date: Tue, 12 May 2026 07:25:04 +0000 Subject: [PATCH] Fixed logic error; multi-host chicken-and-egg fingerprinting causes hosts to never join --- internal/cli/init.go | 27 ++++++++++++++ internal/transport/rpc.go | 40 +++++++++++++++++++- internal/transport/rpc_test.go | 67 ++++++++++++++++++++++++++++++++++ internal/transport/tls.go | 19 ++++++---- 4 files changed, 145 insertions(+), 8 deletions(-) diff --git a/internal/cli/init.go b/internal/cli/init.go index 00b55ce..59de50c 100644 --- a/internal/cli/init.go +++ b/internal/cli/init.go @@ -45,6 +45,33 @@ overwritten. Re-run only after wiping the data directory.`, if _, err := crypto.GenerateKeyPair(nodeID); err != nil { return fmt.Errorf("generate keys: %w", err) } + + // Seed cluster.yaml with this node as its own first peer. + // Without this the math in `quorum` would treat a one-node + // cluster as "0 peers, fallback quorum=1, master=self" — + // which works in isolation but breaks the moment another + // node joins, because the replicated peers list would lack + // the inviter, leading to split-brain elections. 
+ certPEM, err := crypto.LoadCertPEM() + if err != nil { + return fmt.Errorf("load cert: %w", err) + } + fp, err := crypto.FingerprintFromCertPEM(certPEM) + if err != nil { + return fmt.Errorf("fingerprint own cert: %w", err) + } + cluster := &config.ClusterConfig{} + if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error { + c.Peers = []config.PeerInfo{{ + NodeID: nodeID, + Advertise: n.AdvertiseAddr(), + Fingerprint: fp, + }} + return nil + }); err != nil { + return fmt.Errorf("seed cluster.yaml: %w", err) + } + fmt.Fprintf(cmd.OutOrStdout(), "initialised node %s\n", nodeID) fmt.Fprintf(cmd.OutOrStdout(), "data dir: %s\n", config.DataDir()) fmt.Fprintf(cmd.OutOrStdout(), "advertise: %s\n", n.AdvertiseAddr()) diff --git a/internal/transport/rpc.go b/internal/transport/rpc.go index ab8138d..1ba0909 100644 --- a/internal/transport/rpc.go +++ b/internal/transport/rpc.go @@ -107,7 +107,10 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) { if err := tlsConn.HandshakeContext(ctx); err != nil { return } - peerID := peerNodeIDFromConnState(tlsConn.ConnectionState()) + state := tlsConn.ConnectionState() + peerID := peerNodeIDFromConnState(state) + peerFP := peerFingerprintFromConnState(state) + trusted := s.peerTrusted(peerFP) for { body, err := readFrame(tlsConn) @@ -120,6 +123,14 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) { return } + // Until the peer is trusted, only the bootstrap call (Join) is + // allowed through. Everything else gets a clear error so the + // caller knows to re-run `qu node add`. 
+ if !trusted && req.Method != MethodJoin { + _ = writeError(tlsConn, req.ID, "peer not trusted; run `qu node add` first") + continue + } + fn, exists := s.handlers[req.Method] if !exists { _ = writeError(tlsConn, req.ID, "unknown method: "+req.Method) @@ -134,9 +145,26 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) { if err := writeResult(tlsConn, req.ID, result); err != nil { return } + + // A successful Join writes the caller into our trust store; + // re-check so later calls on this same connection flow through + // normally (a new connection checks trust after its handshake). + if req.Method == MethodJoin && !trusted { + trusted = s.peerTrusted(peerFP) + } } } +// peerTrusted reports whether peerFP is in our trust store. Returns +// false on empty input so a missing/parse-failed cert is never trusted. +func (s *Server) peerTrusted(peerFP string) bool { + if peerFP == "" { + return false + } + _, ok := s.assets.Trust.LookupByFingerprint(peerFP) + return ok +} + // Client opens and pools one mTLS connection per peer node ID. Each // connection serialises outstanding calls under a mutex; concurrent // calls to different peers proceed in parallel. @@ -321,3 +349,13 @@ func peerNodeIDFromConnState(cs tls.ConnectionState) string { } return cs.PeerCertificates[0].Subject.CommonName } + +// peerFingerprintFromConnState computes the SPKI fingerprint of the +// peer's leaf cert, matching the format the trust store stores. An +// empty result means the peer presented no cert. 
+func peerFingerprintFromConnState(cs tls.ConnectionState) string { + if len(cs.PeerCertificates) == 0 { + return "" + } + return fingerprintOf(cs.PeerCertificates[0]) +} diff --git a/internal/transport/rpc_test.go b/internal/transport/rpc_test.go index eb3622c..7516d6f 100644 --- a/internal/transport/rpc_test.go +++ b/internal/transport/rpc_test.go @@ -170,6 +170,73 @@ func TestRPCRejectsUntrustedPeer(t *testing.T) { } } +func TestJoinAllowedFromUntrustedPeer(t *testing.T) { + // Verifies the bootstrap path: B has not yet been added to A's + // trust store. A must still accept the Join RPC; subsequent + // non-Join calls on the same connection should succeed once Join + // has populated trust. + a := makeNode(t, t.TempDir(), "node-a") + b := makeNode(t, t.TempDir(), "node-b") + + tmpLn, _ := net.Listen("tcp", "127.0.0.1:0") + addr := tmpLn.Addr().String() + tmpLn.Close() + + // B must trust A so B's client-side handshake passes. + t.Setenv("QUPTIME_DIR", b.dir) + _ = b.assets.Trust.Add(trust.Entry{NodeID: a.id, Address: addr, Fingerprint: a.fp}) + + srv := NewServer(a.assets) + srv.Handle(MethodJoin, func(_ context.Context, _ string, raw json.RawMessage) (any, error) { + var req JoinRequest + if err := json.Unmarshal(raw, &req); err != nil { + return JoinResponse{Error: err.Error()}, nil + } + // A pretends to accept and records B in its trust store. 
+ t.Setenv("QUPTIME_DIR", a.dir) + if err := a.assets.Trust.Add(trust.Entry{ + NodeID: req.NodeID, Address: req.Advertise, Fingerprint: req.Fingerprint, + }); err != nil { + return JoinResponse{Error: err.Error()}, nil + } + return JoinResponse{Accepted: true}, nil + }) + srv.Handle(MethodPing, func(_ context.Context, _ string, _ json.RawMessage) (any, error) { + return PingResponse{NodeID: a.id, Now: time.Now()}, nil + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go srv.Serve(ctx, addr) + defer srv.Stop() + if !waitForDial(addr, 2*time.Second) { + t.Fatal("server not up") + } + + cli := NewClient(b.assets) + defer cli.Close() + + // Pre-Join: Ping must be rejected with the "not trusted" error. + if err := cli.Call(ctx, a.id, addr, MethodPing, nil, nil); err == nil { + t.Error("Ping was allowed without prior trust") + } + + // Join must succeed even though B is untrusted. + joinReq := JoinRequest{NodeID: b.id, Advertise: addr, Fingerprint: b.fp, CertPEM: string(b.assets.Cert)} + var joinResp JoinResponse + if err := cli.Call(ctx, a.id, addr, MethodJoin, joinReq, &joinResp); err != nil { + t.Fatalf("Join: %v", err) + } + if !joinResp.Accepted { + t.Fatalf("Join not accepted: %s", joinResp.Error) + } + + // Post-Join on the SAME pooled connection: Ping should now flow. + if err := cli.Call(ctx, a.id, addr, MethodPing, nil, nil); err != nil { + t.Errorf("post-Join Ping failed: %v", err) + } +} + // waitForDial polls a TCP listener until it accepts a plain TCP // connection, signalling that Serve has begun listening. func waitForDial(addr string, max time.Duration) bool { diff --git a/internal/transport/tls.go b/internal/transport/tls.go index fe0a398..7482877 100644 --- a/internal/transport/tls.go +++ b/internal/transport/tls.go @@ -41,19 +41,24 @@ func (a *TLSAssets) tlsCert() (tls.Certificate, error) { } // ServerConfig produces a tls.Config suitable for an inter-node -// listener. 
Peers must present a certificate, and that certificate's -// fingerprint must already be present in the trust store. +// listener. +// +// We accept any client certificate at the TLS layer (no CA verification +// and no fingerprint pinning here). Trust is enforced one layer up by +// the RPC dispatcher: untrusted peers may only invoke MethodJoin, which +// is the protocol's bootstrap step. This avoids the chicken-and-egg +// where Join itself would need pre-existing symmetric trust to complete +// the handshake. func (a *TLSAssets) ServerConfig() (*tls.Config, error) { cert, err := a.tlsCert() if err != nil { return nil, err } return &tls.Config{ - Certificates: []tls.Certificate{cert}, - MinVersion: MinTLS, - ClientAuth: tls.RequireAnyClientCert, - InsecureSkipVerify: true, // we do our own pinning via VerifyPeerCertificate - VerifyPeerCertificate: a.Trust.VerifyPeerCert, + Certificates: []tls.Certificate{cert}, + MinVersion: MinTLS, + ClientAuth: tls.RequireAnyClientCert, + InsecureSkipVerify: true, // trust is gated per-method by the RPC dispatcher }, nil }