Fixed logic error: multi-host chicken-and-egg fingerprinting caused hosts to never join

This commit is contained in:
2026-05-12 07:25:04 +00:00
parent 7ebed0f0aa
commit c90ce244b0
4 changed files with 145 additions and 8 deletions
+27
View File
@@ -45,6 +45,33 @@ overwritten. Re-run only after wiping the data directory.`,
if _, err := crypto.GenerateKeyPair(nodeID); err != nil {
return fmt.Errorf("generate keys: %w", err)
}
// Seed cluster.yaml with this node as its own first peer.
// Without this the math in `quorum` would treat a one-node
// cluster as "0 peers, fallback quorum=1, master=self" —
// which works in isolation but breaks the moment another
// node joins, because the replicated peers list would lack
// the inviter, leading to split-brain elections.
certPEM, err := crypto.LoadCertPEM()
if err != nil {
return fmt.Errorf("load cert: %w", err)
}
fp, err := crypto.FingerprintFromCertPEM(certPEM)
if err != nil {
return fmt.Errorf("fingerprint own cert: %w", err)
}
cluster := &config.ClusterConfig{}
if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error {
c.Peers = []config.PeerInfo{{
NodeID: nodeID,
Advertise: n.AdvertiseAddr(),
Fingerprint: fp,
}}
return nil
}); err != nil {
return fmt.Errorf("seed cluster.yaml: %w", err)
}
fmt.Fprintf(cmd.OutOrStdout(), "initialised node %s\n", nodeID)
fmt.Fprintf(cmd.OutOrStdout(), "data dir: %s\n", config.DataDir())
fmt.Fprintf(cmd.OutOrStdout(), "advertise: %s\n", n.AdvertiseAddr())
+39 -1
View File
@@ -107,7 +107,10 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) {
if err := tlsConn.HandshakeContext(ctx); err != nil {
return
}
peerID := peerNodeIDFromConnState(tlsConn.ConnectionState())
state := tlsConn.ConnectionState()
peerID := peerNodeIDFromConnState(state)
peerFP := peerFingerprintFromConnState(state)
trusted := s.peerTrusted(peerFP)
for {
body, err := readFrame(tlsConn)
@@ -120,6 +123,14 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) {
return
}
// Until the peer is trusted, only the bootstrap call (Join) is
// allowed through. Everything else gets a clear error so the
// caller knows to re-run `qu node add`.
if !trusted && req.Method != MethodJoin {
_ = writeError(tlsConn, req.ID, "peer not trusted; run `qu node add` first")
continue
}
fn, exists := s.handlers[req.Method]
if !exists {
_ = writeError(tlsConn, req.ID, "unknown method: "+req.Method)
@@ -134,9 +145,26 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) {
if err := writeResult(tlsConn, req.ID, result); err != nil {
return
}
// A successful Join writes the caller into our trust store;
// re-check so subsequent calls on this same connection (or
// after reconnect) flow through normally.
if req.Method == MethodJoin && !trusted {
trusted = s.peerTrusted(peerFP)
}
}
}
// peerTrusted reports whether the fingerprint peerFP has an entry in
// the server's trust store. An empty fingerprint is rejected up front,
// without consulting the store, so a peer whose fingerprint could not
// be derived is never treated as trusted.
func (s *Server) peerTrusted(peerFP string) bool {
	if peerFP != "" {
		if _, found := s.assets.Trust.LookupByFingerprint(peerFP); found {
			return true
		}
	}
	return false
}
// Client opens and pools one mTLS connection per peer node ID. Each
// connection serialises outstanding calls under a mutex; concurrent
// calls to different peers proceed in parallel.
@@ -321,3 +349,13 @@ func peerNodeIDFromConnState(cs tls.ConnectionState) string {
}
return cs.PeerCertificates[0].Subject.CommonName
}
// peerFingerprintFromConnState returns the fingerprint of the leaf
// (first) certificate the peer presented during the handshake, as
// computed by fingerprintOf. When the peer presented no certificate
// at all the result is the empty string.
func peerFingerprintFromConnState(cs tls.ConnectionState) string {
	if certs := cs.PeerCertificates; len(certs) > 0 {
		return fingerprintOf(certs[0])
	}
	return ""
}
+67
View File
@@ -170,6 +170,73 @@ func TestRPCRejectsUntrustedPeer(t *testing.T) {
}
}
// TestJoinAllowedFromUntrustedPeer verifies the bootstrap path: B has
// not yet been added to A's trust store. A must still accept the Join
// RPC; subsequent non-Join calls on the same connection should succeed
// once Join has populated trust.
func TestJoinAllowedFromUntrustedPeer(t *testing.T) {
	a := makeNode(t, t.TempDir(), "node-a")
	b := makeNode(t, t.TempDir(), "node-b")

	// Reserve a free loopback port by listening on :0 and closing
	// straight away; the server below re-binds the same address.
	tmpLn, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		// Without this check a failed Listen would surface as a nil
		// pointer panic on tmpLn.Addr() instead of a readable failure.
		t.Fatalf("reserve listen address: %v", err)
	}
	addr := tmpLn.Addr().String()
	_ = tmpLn.Close() // best effort: the listener only existed to pick a port

	// B must trust A so B's client-side handshake passes.
	t.Setenv("QUPTIME_DIR", b.dir)
	if err := b.assets.Trust.Add(trust.Entry{NodeID: a.id, Address: addr, Fingerprint: a.fp}); err != nil {
		t.Fatalf("seed B's trust store with A: %v", err)
	}

	srv := NewServer(a.assets)
	srv.Handle(MethodJoin, func(_ context.Context, _ string, raw json.RawMessage) (any, error) {
		var req JoinRequest
		if err := json.Unmarshal(raw, &req); err != nil {
			return JoinResponse{Error: err.Error()}, nil
		}
		// A pretends to accept and records B in its trust store.
		// NOTE(review): this t.Setenv runs on the server's goroutine,
		// not the test goroutine — confirm that is intended.
		t.Setenv("QUPTIME_DIR", a.dir)
		if err := a.assets.Trust.Add(trust.Entry{
			NodeID: req.NodeID, Address: req.Advertise, Fingerprint: req.Fingerprint,
		}); err != nil {
			return JoinResponse{Error: err.Error()}, nil
		}
		return JoinResponse{Accepted: true}, nil
	})
	srv.Handle(MethodPing, func(_ context.Context, _ string, _ json.RawMessage) (any, error) {
		return PingResponse{NodeID: a.id, Now: time.Now()}, nil
	})

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go srv.Serve(ctx, addr)
	defer srv.Stop()
	if !waitForDial(addr, 2*time.Second) {
		t.Fatal("server not up")
	}

	cli := NewClient(b.assets)
	defer cli.Close()

	// Pre-Join: Ping must be rejected. Only the presence of an error
	// is asserted here, not its text.
	if err := cli.Call(ctx, a.id, addr, MethodPing, nil, nil); err == nil {
		t.Error("Ping was allowed without prior trust")
	}

	// Join must succeed even though B is untrusted.
	joinReq := JoinRequest{NodeID: b.id, Advertise: addr, Fingerprint: b.fp, CertPEM: string(b.assets.Cert)}
	var joinResp JoinResponse
	if err := cli.Call(ctx, a.id, addr, MethodJoin, joinReq, &joinResp); err != nil {
		t.Fatalf("Join: %v", err)
	}
	if !joinResp.Accepted {
		t.Fatalf("Join not accepted: %s", joinResp.Error)
	}

	// Post-Join on the SAME pooled connection: Ping should now flow.
	if err := cli.Call(ctx, a.id, addr, MethodPing, nil, nil); err != nil {
		t.Errorf("post-Join Ping failed: %v", err)
	}
}
// waitForDial polls a TCP listener until it accepts a plain TCP
// connection, signalling that Serve has begun listening.
func waitForDial(addr string, max time.Duration) bool {
+12 -7
View File
@@ -41,19 +41,24 @@ func (a *TLSAssets) tlsCert() (tls.Certificate, error) {
}
// ServerConfig produces a tls.Config suitable for an inter-node
// listener.
//
// We accept any client certificate at the TLS layer (no CA verification
// and no fingerprint pinning here). Trust is enforced one layer up by
// the RPC dispatcher: untrusted peers may only invoke MethodJoin, which
// is the protocol's bootstrap step. This avoids the chicken-and-egg
// where Join itself would need pre-existing symmetric trust to complete
// the handshake.
//
// It returns an error only when the node's own certificate cannot be
// loaded via tlsCert.
func (a *TLSAssets) ServerConfig() (*tls.Config, error) {
	cert, err := a.tlsCert()
	if err != nil {
		return nil, err
	}
	return &tls.Config{
		Certificates: []tls.Certificate{cert},
		MinVersion:   MinTLS,
		// RequireAnyClientCert demands a client certificate but does
		// not verify its chain; the RPC dispatcher gates trust
		// per-method using the presented cert's fingerprint.
		ClientAuth: tls.RequireAnyClientCert,
		// NOTE: crypto/tls documents InsecureSkipVerify as governing
		// how a *client* verifies the server's certificate, so it has
		// no effect on this listener-side config. It is retained only
		// to keep the config unchanged from the previous revision.
		InsecureSkipVerify: true,
	}, nil
}