Auto init via environment variables support, qu init for systemd
Container image / image (push) Successful in 1m38s

This commit is contained in:
2026-05-15 04:41:45 +00:00
parent 6953709574
commit e11b3f4547
9 changed files with 475 additions and 113 deletions
+122 -61
View File
@@ -5,6 +5,7 @@ import (
"encoding/base64"
"errors"
"fmt"
"io"
"os"
"github.com/google/uuid"
@@ -30,78 +31,50 @@ Pass --secret on every subsequent node so they share the same
cluster join secret. If --secret is omitted on the very first node, a
random secret is generated and printed for the operator to copy.
Every flag may also be supplied via its QUPTIME_* environment variable
(see docs/configuration.md). Explicit flags win over env values, which
in turn win over the compiled defaults.
Idempotent in one direction only: existing key material is never
overwritten. Re-run only after wiping the data directory.`,
RunE: func(cmd *cobra.Command, args []string) error {
if err := config.EnsureDataDir(); err != nil {
return err
}
if _, err := os.Stat(config.NodeFilePath()); err == nil {
return errors.New("node.yaml already exists in data dir — refusing to overwrite")
}
secret := clusterSecret
generated := false
if secret == "" {
s, err := generateSecret()
if err != nil {
return fmt.Errorf("generate cluster secret: %w", err)
}
secret = s
generated = true
// Only let env fill fields the operator did NOT pass on the
// command line; explicit flags must win over env.
n := &config.NodeConfig{}
if cmd.Flags().Changed("bind") {
n.BindAddr = bindAddr
}
if cmd.Flags().Changed("port") {
n.BindPort = bindPort
}
if cmd.Flags().Changed("advertise") {
n.Advertise = advertise
}
if cmd.Flags().Changed("secret") {
n.ClusterSecret = clusterSecret
}
if err := n.ApplyEnvOverrides(); err != nil {
return err
}
// Cobra defaults (bind=0.0.0.0, port=9901) are still
// available as fallbacks for fields neither flag nor env
// touched.
if n.BindAddr == "" {
n.BindAddr = bindAddr
}
if n.BindPort == 0 {
n.BindPort = bindPort
}
nodeID := uuid.NewString()
n := &config.NodeConfig{
NodeID: nodeID,
BindAddr: bindAddr,
BindPort: bindPort,
Advertise: advertise,
ClusterSecret: secret,
}
if err := n.Save(); err != nil {
return fmt.Errorf("save node.yaml: %w", err)
}
if _, err := crypto.GenerateKeyPair(nodeID); err != nil {
return fmt.Errorf("generate keys: %w", err)
}
// Seed cluster.yaml with this node as its own first peer.
// Without this the math in `quorum` would treat a one-node
// cluster as "0 peers, fallback quorum=1, master=self" —
// which works in isolation but breaks the moment another
// node joins, because the replicated peers list would lack
// the inviter, leading to split-brain elections.
certPEM, err := crypto.LoadCertPEM()
_, generated, err := bootstrapNode(n)
if err != nil {
return fmt.Errorf("load cert: %w", err)
}
fp, err := crypto.FingerprintFromCertPEM(certPEM)
if err != nil {
return fmt.Errorf("fingerprint own cert: %w", err)
}
cluster := &config.ClusterConfig{}
if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error {
c.Peers = []config.PeerInfo{{
NodeID: nodeID,
Advertise: n.AdvertiseAddr(),
Fingerprint: fp,
CertPEM: string(certPEM),
}}
return nil
}); err != nil {
return fmt.Errorf("seed cluster.yaml: %w", err)
}
out := cmd.OutOrStdout()
fmt.Fprintf(out, "initialised node %s\n", nodeID)
fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())
if generated {
fmt.Fprintln(out)
fmt.Fprintln(out, "cluster secret (copy to every other node via --secret):")
fmt.Fprintln(out, " "+secret)
return err
}
printBootstrapResult(cmd.OutOrStdout(), n, generated)
return nil
},
}
@@ -112,6 +85,94 @@ overwritten. Re-run only after wiping the data directory.`,
root.AddCommand(cmd)
}
// bootstrapNode performs the first-run setup of a node: it ensures the
// data dir exists, fills in any identity fields the caller left blank,
// persists node.yaml, generates the keypair, and seeds cluster.yaml
// with this node listed as its own first peer.
//
// cfg may be partially populated. An empty NodeID is replaced with a
// fresh UUID, an empty BindAddr with "0.0.0.0", a zero BindPort with
// 9901, and an empty ClusterSecret with a newly generated one.
//
// On success it returns cfg itself (the same pointer, now fully
// populated) together with a flag that is true only when the cluster
// secret was generated here, so the caller knows to show it to the
// operator for out-of-band distribution to the other nodes. On failure
// it returns a nil config, false, and the error.
//
// The caller must check beforehand that node.yaml does not already
// exist: this function does not guard against overwriting it.
// crypto.GenerateKeyPair is expected to refuse to replace an existing
// keypair, but that check runs only after node.yaml has been written.
func bootstrapNode(cfg *config.NodeConfig) (*config.NodeConfig, bool, error) {
	if err := config.EnsureDataDir(); err != nil {
		return nil, false, err
	}

	// Fill in whatever the caller left unset.
	if cfg.NodeID == "" {
		cfg.NodeID = uuid.NewString()
	}
	if cfg.BindAddr == "" {
		cfg.BindAddr = "0.0.0.0"
	}
	if cfg.BindPort == 0 {
		cfg.BindPort = 9901
	}
	secretGenerated := cfg.ClusterSecret == ""
	if secretGenerated {
		secret, err := generateSecret()
		if err != nil {
			return nil, false, fmt.Errorf("generate cluster secret: %w", err)
		}
		cfg.ClusterSecret = secret
	}

	// Persist identity first, then create the key material for it.
	if err := cfg.Save(); err != nil {
		return nil, false, fmt.Errorf("save node.yaml: %w", err)
	}
	if _, err := crypto.GenerateKeyPair(cfg.NodeID); err != nil {
		return nil, false, fmt.Errorf("generate keys: %w", err)
	}

	// cluster.yaml must list this node as a peer from the start.
	// A one-node cluster with an empty peers list would fall back to
	// "quorum=1, master=self", which works alone but leaves the
	// inviter out of the replicated peers list once a second node
	// joins — a recipe for split-brain elections.
	pemBytes, err := crypto.LoadCertPEM()
	if err != nil {
		return nil, false, fmt.Errorf("load cert: %w", err)
	}
	fingerprint, err := crypto.FingerprintFromCertPEM(pemBytes)
	if err != nil {
		return nil, false, fmt.Errorf("fingerprint own cert: %w", err)
	}
	self := config.PeerInfo{
		NodeID:      cfg.NodeID,
		Advertise:   cfg.AdvertiseAddr(),
		Fingerprint: fingerprint,
		CertPEM:     string(pemBytes),
	}
	seed := func(c *config.ClusterConfig) error {
		c.Peers = []config.PeerInfo{self}
		return nil
	}
	if err := new(config.ClusterConfig).Mutate(cfg.NodeID, seed); err != nil {
		return nil, false, fmt.Errorf("seed cluster.yaml: %w", err)
	}

	return cfg, secretGenerated, nil
}
// printBootstrapResult writes the post-bootstrap summary to out: the
// node ID, the data dir and the advertise address, followed — only
// when secretGenerated is true — by a blank line and the cluster
// secret with a hint on how to hand it to the other nodes.
//
// Both `qu init` and the serve auto-init path call this, so the way
// the secret is disclosed stays identical across the two entry points.
func printBootstrapResult(out io.Writer, n *config.NodeConfig, secretGenerated bool) {
	fmt.Fprintf(out, "initialised node %s\n", n.NodeID)
	fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
	fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())

	// An operator-supplied secret is never echoed back.
	if !secretGenerated {
		return
	}
	fmt.Fprint(out, "\n")
	fmt.Fprintln(out, "cluster secret (copy to every other node via --secret or QUPTIME_CLUSTER_SECRET):")
	fmt.Fprintf(out, " %s\n", n.ClusterSecret)
}
// generateSecret produces 32 bytes of crypto-random data and returns
// it base64-encoded. Long enough that brute force isn't a concern;
// short enough that operators can copy-paste it without pagination.
+50 -1
View File
@@ -2,6 +2,9 @@ package cli
import (
"context"
"errors"
"fmt"
"io/fs"
"log"
"os"
"os/signal"
@@ -9,6 +12,7 @@ import (
"github.com/spf13/cobra"
"git.cer.sh/axodouble/quptime/internal/config"
"git.cer.sh/axodouble/quptime/internal/daemon"
)
@@ -18,9 +22,18 @@ func addServeCmd(root *cobra.Command) {
Short: "Run the qu daemon in the foreground",
Long: `Run the qu daemon: starts the inter-node listener, the local
control socket for the CLI, the heartbeat loop and the check
scheduler. Stops cleanly on SIGINT or SIGTERM.`,
scheduler. Stops cleanly on SIGINT or SIGTERM.
If node.yaml does not exist yet, serve will bootstrap it using values
from the QUPTIME_* environment variables (see docs/configuration.md).
This makes a single ` + "`docker compose up`" + ` enough to launch a new node —
no separate ` + "`qu init`" + ` step is required when the data volume is
fresh.`,
RunE: func(cmd *cobra.Command, args []string) error {
logger := log.New(os.Stderr, "quptime: ", log.LstdFlags|log.Lmsgprefix)
if err := autoInitIfNeeded(cmd, logger); err != nil {
return err
}
d, err := daemon.New(logger)
if err != nil {
return err
@@ -32,3 +45,39 @@ scheduler. Stops cleanly on SIGINT or SIGTERM.`,
}
root.AddCommand(cmd)
}
// autoInitIfNeeded bootstraps the node on first launch of `serve`.
//
// Why it exists: container deploys previously needed a one-off
// `docker compose run --rm quptime init …` before the service could
// start, which fits poorly with `restart: unless-stopped` and forces
// an out-of-band step onto every fresh volume. When node.yaml is
// absent, serve now runs the same bootstrap path as `qu init`, taking
// its values from the QUPTIME_* environment variables.
//
// Behaviour:
//   - node.yaml exists: nothing is touched, nil is returned.
//   - stat fails for any reason other than "does not exist"
//     (permission denied, broken symlink, …): the error is returned so
//     the operator sees the real problem instead of an auto-init
//     attempt running on top of unknown state.
//   - node.yaml is missing: a NodeConfig is built from the environment,
//     bootstrapNode is run, and the summary (including a freshly
//     generated cluster secret, if any) is printed to cmd's stderr.
//
// Errors from the environment parsing and from the bootstrap itself
// are both wrapped with an "auto-init:" prefix.
func autoInitIfNeeded(cmd *cobra.Command, logger *log.Logger) error {
	nodePath := config.NodeFilePath()

	_, err := os.Stat(nodePath)
	if err == nil {
		return nil // already initialised
	}
	if !errors.Is(err, fs.ErrNotExist) {
		return fmt.Errorf("stat node.yaml: %w", err)
	}

	logger.Printf("node.yaml not found at %s — bootstrapping from environment", nodePath)

	n := &config.NodeConfig{}
	if err := n.ApplyEnvOverrides(); err != nil {
		return fmt.Errorf("auto-init: %w", err)
	}

	// bootstrapNode returns the pointer it was given, so n is already
	// the fully populated config; only the flag and error are needed.
	_, generated, err := bootstrapNode(n)
	if err != nil {
		return fmt.Errorf("auto-init: %w", err)
	}

	printBootstrapResult(cmd.OutOrStderr(), n, generated)
	return nil
}
+47
View File
@@ -3,10 +3,26 @@ package config
import (
"fmt"
"os"
"strconv"
"gopkg.in/yaml.v3"
)
// Environment variable names that override fields on NodeConfig at
// load time. Intended to let `docker compose` setups drive a node's
// identity and listener configuration without having to bake a
// node.yaml into the image or run `qu init` manually first.
//
// Empty values are ignored — they do not clear a field. The override
// order is therefore: env (non-empty) > file > compiled default.
const (
// EnvNodeID, when non-empty, replaces NodeConfig.NodeID.
EnvNodeID = "QUPTIME_NODE_ID"
// EnvBindAddr, when non-empty, replaces NodeConfig.BindAddr.
EnvBindAddr = "QUPTIME_BIND_ADDR"
// EnvBindPort, when non-empty, replaces NodeConfig.BindPort. The
// value must parse as an integer; otherwise ApplyEnvOverrides
// returns an error.
EnvBindPort = "QUPTIME_BIND_PORT"
// EnvAdvertise, when non-empty, replaces NodeConfig.Advertise.
EnvAdvertise = "QUPTIME_ADVERTISE"
// EnvClusterSecret, when non-empty, replaces NodeConfig.ClusterSecret.
EnvClusterSecret = "QUPTIME_CLUSTER_SECRET"
)
// NodeConfig is the per-node, never-replicated identity file.
type NodeConfig struct {
// NodeID is a stable UUID generated at `qu init`. Used by all peers
@@ -45,6 +61,34 @@ func (n *NodeConfig) AdvertiseAddr() string {
return fmt.Sprintf("%s:%d", bind, n.BindPort)
}
// ApplyEnvOverrides folds the QUPTIME_* environment variables onto n.
// A non-empty env value replaces the field's current value; an empty
// or unset variable leaves the field alone. It is called by
// LoadNodeConfig as well as by the `qu init` and serve auto-init
// paths, so the same precedence rules hold whether the config is read
// from a persisted node.yaml or built from scratch.
//
// QUPTIME_BIND_PORT must be an integer in the range 0-65535. Zero is
// accepted because the callers treat a zero BindPort as "unset" and
// substitute the compiled default. The port is validated before any
// field is written, so on error n is left exactly as it was.
//
// Returns a non-nil error only when QUPTIME_BIND_PORT is set to
// something that is not an integer or lies outside the valid range.
func (n *NodeConfig) ApplyEnvOverrides() error {
	// Parse and validate the one fallible value first so that a bad
	// port cannot leave n half-updated.
	port, havePort := 0, false
	if v := os.Getenv(EnvBindPort); v != "" {
		p, err := strconv.Atoi(v)
		if err != nil {
			return fmt.Errorf("%s=%q: not an integer: %w", EnvBindPort, v, err)
		}
		if p < 0 || p > 65535 {
			return fmt.Errorf("%s=%q: port out of range 0-65535", EnvBindPort, v)
		}
		port, havePort = p, true
	}

	if v := os.Getenv(EnvNodeID); v != "" {
		n.NodeID = v
	}
	if v := os.Getenv(EnvBindAddr); v != "" {
		n.BindAddr = v
	}
	if havePort {
		n.BindPort = port
	}
	if v := os.Getenv(EnvAdvertise); v != "" {
		n.Advertise = v
	}
	if v := os.Getenv(EnvClusterSecret); v != "" {
		n.ClusterSecret = v
	}
	return nil
}
// LoadNodeConfig reads node.yaml from the data dir.
func LoadNodeConfig() (*NodeConfig, error) {
raw, err := os.ReadFile(NodeFilePath())
@@ -55,6 +99,9 @@ func LoadNodeConfig() (*NodeConfig, error) {
if err := yaml.Unmarshal(raw, cfg); err != nil {
return nil, fmt.Errorf("parse node.yaml: %w", err)
}
if err := cfg.ApplyEnvOverrides(); err != nil {
return nil, err
}
if cfg.BindPort == 0 {
cfg.BindPort = 9901
}
+95 -3
View File
@@ -4,9 +4,9 @@ import "testing"
func TestAdvertiseAddrFallback(t *testing.T) {
cases := []struct {
name string
cfg NodeConfig
want string
name string
cfg NodeConfig
want string
}{
{"explicit advertise wins", NodeConfig{Advertise: "host:1234", BindAddr: "0.0.0.0", BindPort: 9901}, "host:1234"},
{"empty bind falls back to loopback", NodeConfig{BindPort: 9901}, "127.0.0.1:9901"},
@@ -56,3 +56,95 @@ func TestLoadNodeConfigAppliesDefaults(t *testing.T) {
t.Errorf("BindAddr=%q want 0.0.0.0", loaded.BindAddr)
}
}
// TestApplyEnvOverrides checks that every QUPTIME_* variable, when set
// to a non-empty value, replaces the corresponding NodeConfig field.
func TestApplyEnvOverrides(t *testing.T) {
	env := map[string]string{
		EnvNodeID:        "node-from-env",
		EnvBindAddr:      "1.2.3.4",
		EnvBindPort:      "9999",
		EnvAdvertise:     "public.example.com:9999",
		EnvClusterSecret: "shh-secret",
	}
	for name, value := range env {
		t.Setenv(name, value)
	}

	// Start from a fully populated config so an override that fails to
	// apply shows up as a stale field.
	got := NodeConfig{
		NodeID:        "original-id",
		BindAddr:      "0.0.0.0",
		BindPort:      9901,
		Advertise:     "old.example.com:9901",
		ClusterSecret: "old-secret",
	}
	if err := got.ApplyEnvOverrides(); err != nil {
		t.Fatal(err)
	}

	want := NodeConfig{
		NodeID:        "node-from-env",
		BindAddr:      "1.2.3.4",
		BindPort:      9999,
		Advertise:     "public.example.com:9999",
		ClusterSecret: "shh-secret",
	}
	if got != want {
		t.Errorf("got %+v want %+v", got, want)
	}
}
// TestApplyEnvOverridesEmptyValuesIgnored checks that variables which
// are set but empty leave the config untouched. Without that rule,
// something like `docker run -e QUPTIME_ADVERTISE=` would silently
// wipe a previously persisted advertise address.
func TestApplyEnvOverridesEmptyValuesIgnored(t *testing.T) {
	for _, name := range []string{
		EnvNodeID,
		EnvBindAddr,
		EnvBindPort,
		EnvAdvertise,
		EnvClusterSecret,
	} {
		t.Setenv(name, "")
	}

	orig := NodeConfig{
		NodeID:        "keep-me",
		BindAddr:      "10.0.0.1",
		BindPort:      9901,
		Advertise:     "keep.example.com:9901",
		ClusterSecret: "keep-secret",
	}
	n := orig // work on a copy so orig stays available for comparison
	err := n.ApplyEnvOverrides()
	if err != nil {
		t.Fatal(err)
	}
	if n != orig {
		t.Errorf("empty env vars mutated config: got %+v want %+v", n, orig)
	}
}
// TestApplyEnvOverridesBadPort checks that a QUPTIME_BIND_PORT value
// which is not an integer is reported as an error.
func TestApplyEnvOverridesBadPort(t *testing.T) {
	t.Setenv(EnvBindPort, "not-an-int")

	var cfg NodeConfig
	err := cfg.ApplyEnvOverrides()
	if err == nil {
		t.Fatal("expected error for non-integer port")
	}
}
func TestLoadNodeConfigEnvOverridesFile(t *testing.T) {
t.Setenv("QUPTIME_DIR", t.TempDir())
// Persist a file with one bind addr; env should win on load.
n := &NodeConfig{NodeID: "abc", BindAddr: "127.0.0.1", BindPort: 9901, Advertise: "file.example.com:9901"}
if err := n.Save(); err != nil {
t.Fatal(err)
}
t.Setenv(EnvBindAddr, "0.0.0.0")
t.Setenv(EnvAdvertise, "env.example.com:9001")
t.Setenv(EnvBindPort, "9001")
loaded, err := LoadNodeConfig()
if err != nil {
t.Fatal(err)
}
if loaded.BindAddr != "0.0.0.0" {
t.Errorf("BindAddr=%q want 0.0.0.0 (env override)", loaded.BindAddr)
}
if loaded.BindPort != 9001 {
t.Errorf("BindPort=%d want 9001 (env override)", loaded.BindPort)
}
if loaded.Advertise != "env.example.com:9001" {
t.Errorf("Advertise=%q want env.example.com:9001 (env override)", loaded.Advertise)
}
if loaded.NodeID != "abc" {
t.Errorf("NodeID=%q want abc (unchanged)", loaded.NodeID)
}
}