Fixed logic error: a chicken-and-egg problem in multi-host fingerprinting caused new hosts to never join
This commit is contained in:
@@ -45,6 +45,33 @@ overwritten. Re-run only after wiping the data directory.`,
|
|||||||
if _, err := crypto.GenerateKeyPair(nodeID); err != nil {
|
if _, err := crypto.GenerateKeyPair(nodeID); err != nil {
|
||||||
return fmt.Errorf("generate keys: %w", err)
|
return fmt.Errorf("generate keys: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Seed cluster.yaml with this node as its own first peer.
|
||||||
|
// Without this the math in `quorum` would treat a one-node
|
||||||
|
// cluster as "0 peers, fallback quorum=1, master=self" —
|
||||||
|
// which works in isolation but breaks the moment another
|
||||||
|
// node joins, because the replicated peers list would lack
|
||||||
|
// the inviter, leading to split-brain elections.
|
||||||
|
certPEM, err := crypto.LoadCertPEM()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("load cert: %w", err)
|
||||||
|
}
|
||||||
|
fp, err := crypto.FingerprintFromCertPEM(certPEM)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("fingerprint own cert: %w", err)
|
||||||
|
}
|
||||||
|
cluster := &config.ClusterConfig{}
|
||||||
|
if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error {
|
||||||
|
c.Peers = []config.PeerInfo{{
|
||||||
|
NodeID: nodeID,
|
||||||
|
Advertise: n.AdvertiseAddr(),
|
||||||
|
Fingerprint: fp,
|
||||||
|
}}
|
||||||
|
return nil
|
||||||
|
}); err != nil {
|
||||||
|
return fmt.Errorf("seed cluster.yaml: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Fprintf(cmd.OutOrStdout(), "initialised node %s\n", nodeID)
|
fmt.Fprintf(cmd.OutOrStdout(), "initialised node %s\n", nodeID)
|
||||||
fmt.Fprintf(cmd.OutOrStdout(), "data dir: %s\n", config.DataDir())
|
fmt.Fprintf(cmd.OutOrStdout(), "data dir: %s\n", config.DataDir())
|
||||||
fmt.Fprintf(cmd.OutOrStdout(), "advertise: %s\n", n.AdvertiseAddr())
|
fmt.Fprintf(cmd.OutOrStdout(), "advertise: %s\n", n.AdvertiseAddr())
|
||||||
|
|||||||
@@ -107,7 +107,10 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) {
|
|||||||
if err := tlsConn.HandshakeContext(ctx); err != nil {
|
if err := tlsConn.HandshakeContext(ctx); err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
peerID := peerNodeIDFromConnState(tlsConn.ConnectionState())
|
state := tlsConn.ConnectionState()
|
||||||
|
peerID := peerNodeIDFromConnState(state)
|
||||||
|
peerFP := peerFingerprintFromConnState(state)
|
||||||
|
trusted := s.peerTrusted(peerFP)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
body, err := readFrame(tlsConn)
|
body, err := readFrame(tlsConn)
|
||||||
@@ -120,6 +123,14 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Until the peer is trusted, only the bootstrap call (Join) is
|
||||||
|
// allowed through. Everything else gets a clear error so the
|
||||||
|
// caller knows to re-run `qu node add`.
|
||||||
|
if !trusted && req.Method != MethodJoin {
|
||||||
|
_ = writeError(tlsConn, req.ID, "peer not trusted; run `qu node add` first")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
fn, exists := s.handlers[req.Method]
|
fn, exists := s.handlers[req.Method]
|
||||||
if !exists {
|
if !exists {
|
||||||
_ = writeError(tlsConn, req.ID, "unknown method: "+req.Method)
|
_ = writeError(tlsConn, req.ID, "unknown method: "+req.Method)
|
||||||
@@ -134,7 +145,24 @@ func (s *Server) handleConn(ctx context.Context, raw net.Conn) {
|
|||||||
if err := writeResult(tlsConn, req.ID, result); err != nil {
|
if err := writeResult(tlsConn, req.ID, result); err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A successful Join writes the caller into our trust store;
|
||||||
|
// re-check so subsequent calls on this same connection (or
|
||||||
|
// after reconnect) flow through normally.
|
||||||
|
if req.Method == MethodJoin && !trusted {
|
||||||
|
trusted = s.peerTrusted(peerFP)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// peerTrusted reports whether peerFP is in our trust store. Returns
|
||||||
|
// false on empty input so a missing/parse-failed cert is never trusted.
|
||||||
|
func (s *Server) peerTrusted(peerFP string) bool {
|
||||||
|
if peerFP == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
_, ok := s.assets.Trust.LookupByFingerprint(peerFP)
|
||||||
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
// Client opens and pools one mTLS connection per peer node ID. Each
|
// Client opens and pools one mTLS connection per peer node ID. Each
|
||||||
@@ -321,3 +349,13 @@ func peerNodeIDFromConnState(cs tls.ConnectionState) string {
|
|||||||
}
|
}
|
||||||
return cs.PeerCertificates[0].Subject.CommonName
|
return cs.PeerCertificates[0].Subject.CommonName
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// peerFingerprintFromConnState computes the SPKI fingerprint of the
|
||||||
|
// peer's leaf cert, matching the format the trust store stores. An
|
||||||
|
// empty result means the peer presented no cert.
|
||||||
|
func peerFingerprintFromConnState(cs tls.ConnectionState) string {
|
||||||
|
if len(cs.PeerCertificates) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fingerprintOf(cs.PeerCertificates[0])
|
||||||
|
}
|
||||||
|
|||||||
@@ -170,6 +170,73 @@ func TestRPCRejectsUntrustedPeer(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestJoinAllowedFromUntrustedPeer(t *testing.T) {
|
||||||
|
// Verifies the bootstrap path: B has not yet been added to A's
|
||||||
|
// trust store. A must still accept the Join RPC; subsequent
|
||||||
|
// non-Join calls on the same connection should succeed once Join
|
||||||
|
// has populated trust.
|
||||||
|
a := makeNode(t, t.TempDir(), "node-a")
|
||||||
|
b := makeNode(t, t.TempDir(), "node-b")
|
||||||
|
|
||||||
|
tmpLn, _ := net.Listen("tcp", "127.0.0.1:0")
|
||||||
|
addr := tmpLn.Addr().String()
|
||||||
|
tmpLn.Close()
|
||||||
|
|
||||||
|
// B must trust A so B's client-side handshake passes.
|
||||||
|
t.Setenv("QUPTIME_DIR", b.dir)
|
||||||
|
_ = b.assets.Trust.Add(trust.Entry{NodeID: a.id, Address: addr, Fingerprint: a.fp})
|
||||||
|
|
||||||
|
srv := NewServer(a.assets)
|
||||||
|
srv.Handle(MethodJoin, func(_ context.Context, _ string, raw json.RawMessage) (any, error) {
|
||||||
|
var req JoinRequest
|
||||||
|
if err := json.Unmarshal(raw, &req); err != nil {
|
||||||
|
return JoinResponse{Error: err.Error()}, nil
|
||||||
|
}
|
||||||
|
// A pretends to accept and records B in its trust store.
|
||||||
|
t.Setenv("QUPTIME_DIR", a.dir)
|
||||||
|
if err := a.assets.Trust.Add(trust.Entry{
|
||||||
|
NodeID: req.NodeID, Address: req.Advertise, Fingerprint: req.Fingerprint,
|
||||||
|
}); err != nil {
|
||||||
|
return JoinResponse{Error: err.Error()}, nil
|
||||||
|
}
|
||||||
|
return JoinResponse{Accepted: true}, nil
|
||||||
|
})
|
||||||
|
srv.Handle(MethodPing, func(_ context.Context, _ string, _ json.RawMessage) (any, error) {
|
||||||
|
return PingResponse{NodeID: a.id, Now: time.Now()}, nil
|
||||||
|
})
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
go srv.Serve(ctx, addr)
|
||||||
|
defer srv.Stop()
|
||||||
|
if !waitForDial(addr, 2*time.Second) {
|
||||||
|
t.Fatal("server not up")
|
||||||
|
}
|
||||||
|
|
||||||
|
cli := NewClient(b.assets)
|
||||||
|
defer cli.Close()
|
||||||
|
|
||||||
|
// Pre-Join: Ping must be rejected with the "not trusted" error.
|
||||||
|
if err := cli.Call(ctx, a.id, addr, MethodPing, nil, nil); err == nil {
|
||||||
|
t.Error("Ping was allowed without prior trust")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Join must succeed even though B is untrusted.
|
||||||
|
joinReq := JoinRequest{NodeID: b.id, Advertise: addr, Fingerprint: b.fp, CertPEM: string(b.assets.Cert)}
|
||||||
|
var joinResp JoinResponse
|
||||||
|
if err := cli.Call(ctx, a.id, addr, MethodJoin, joinReq, &joinResp); err != nil {
|
||||||
|
t.Fatalf("Join: %v", err)
|
||||||
|
}
|
||||||
|
if !joinResp.Accepted {
|
||||||
|
t.Fatalf("Join not accepted: %s", joinResp.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Post-Join on the SAME pooled connection: Ping should now flow.
|
||||||
|
if err := cli.Call(ctx, a.id, addr, MethodPing, nil, nil); err != nil {
|
||||||
|
t.Errorf("post-Join Ping failed: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// waitForDial polls a TCP listener until it accepts a plain TCP
|
// waitForDial polls a TCP listener until it accepts a plain TCP
|
||||||
// connection, signalling that Serve has begun listening.
|
// connection, signalling that Serve has begun listening.
|
||||||
func waitForDial(addr string, max time.Duration) bool {
|
func waitForDial(addr string, max time.Duration) bool {
|
||||||
|
|||||||
@@ -41,8 +41,14 @@ func (a *TLSAssets) tlsCert() (tls.Certificate, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ServerConfig produces a tls.Config suitable for an inter-node
|
// ServerConfig produces a tls.Config suitable for an inter-node
|
||||||
// listener. Peers must present a certificate, and that certificate's
|
// listener.
|
||||||
// fingerprint must already be present in the trust store.
|
//
|
||||||
|
// We accept any client certificate at the TLS layer (no CA verification
|
||||||
|
// and no fingerprint pinning here). Trust is enforced one layer up by
|
||||||
|
// the RPC dispatcher: untrusted peers may only invoke MethodJoin, which
|
||||||
|
// is the protocol's bootstrap step. This avoids the chicken-and-egg
|
||||||
|
// where Join itself would need pre-existing symmetric trust to complete
|
||||||
|
// the handshake.
|
||||||
func (a *TLSAssets) ServerConfig() (*tls.Config, error) {
|
func (a *TLSAssets) ServerConfig() (*tls.Config, error) {
|
||||||
cert, err := a.tlsCert()
|
cert, err := a.tlsCert()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -52,8 +58,7 @@ func (a *TLSAssets) ServerConfig() (*tls.Config, error) {
|
|||||||
Certificates: []tls.Certificate{cert},
|
Certificates: []tls.Certificate{cert},
|
||||||
MinVersion: MinTLS,
|
MinVersion: MinTLS,
|
||||||
ClientAuth: tls.RequireAnyClientCert,
|
ClientAuth: tls.RequireAnyClientCert,
|
||||||
InsecureSkipVerify: true, // we do our own pinning via VerifyPeerCertificate
|
InsecureSkipVerify: true, // trust is gated per-method by the RPC dispatcher
|
||||||
VerifyPeerCertificate: a.Trust.VerifyPeerCert,
|
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user