From e11b3f4547513513549bd60b07d143f969f44873 Mon Sep 17 00:00:00 2001 From: Axodouble Date: Fri, 15 May 2026 04:41:45 +0000 Subject: [PATCH] Auto init via environment variables support, qu init for systemd --- docker/docker-compose-tailscale.yml | 33 +++-- docs/configuration.md | 45 +++++++ docs/deployment/docker.md | 57 ++++++--- docs/deployment/tailscale.md | 51 ++++---- docs/troubleshooting.md | 23 +++- internal/cli/init.go | 183 ++++++++++++++++++---------- internal/cli/serve.go | 51 +++++++- internal/config/node.go | 47 +++++++ internal/config/node_test.go | 98 ++++++++++++++- 9 files changed, 475 insertions(+), 113 deletions(-) diff --git a/docker/docker-compose-tailscale.yml b/docker/docker-compose-tailscale.yml index c7c6a7b..b7f5c9d 100644 --- a/docker/docker-compose-tailscale.yml +++ b/docker/docker-compose-tailscale.yml @@ -1,5 +1,14 @@ # An example of a docker compose with Tailscale & QUptime. -# This setup is specifically intended for hosts that may not be able to reach each other directly or have a public IP address. +# This setup is specifically intended for hosts that may not be able to +# reach each other directly or have a public IP address. +# +# Bring it up with `docker compose -f docker-compose-tailscale.yml up -d`. +# QUptime auto-initialises on first start using the QUPTIME_* env vars +# below — no separate `qu init` step is required. +# +# On the first node, omit QUPTIME_CLUSTER_SECRET to have one generated +# for you. Read it out of the logs (`docker logs quptime`) and copy it +# into the .env of every other node before bringing them up. services: tailscale: @@ -18,20 +27,28 @@ services: quptime: image: git.cer.sh/axodouble/quptime:master container_name: quptime + environment: + # host:port other QUptime nodes use to reach this one. Use the + # Tailscale IP / MagicDNS name of this host. Required behind NAT. + - QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE} + # Shared cluster join secret. Set on every node. 
Leave unset on + # the very first node — one will be generated and logged for you + # to copy to the others. Followers MUST set this before starting. + - QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-} + # Optional: pin a port other than the default 9901. + # - QUPTIME_BIND_PORT=9901 volumes: - quptime:/etc/quptime ports: - "9901:9901" depends_on: - tailscale - # No restart directive, user needs to init quptime first - # Run `docker compose -f docker-compose-tailscale.yml run --rm quptime init` to initialize - # the data volume before starting the service - # If this is not the master node, use - # `docker compose -f docker-compose-tailscale.yml run --rm quptime --advertise :9901 --secret ` - # And add the individual nodes to the cluster with `docker compose -f docker-compose-tailscale.yml run --rm quptime node add :9901` network_mode: "service:tailscale" # Use the Tailscale network stack + restart: unless-stopped + # After this node is up, add peers from the master with: + # docker compose -f docker-compose-tailscale.yml exec quptime \ + # qu node add :9901 volumes: tailscale: - quptime: \ No newline at end of file + quptime: diff --git a/docs/configuration.md b/docs/configuration.md index 750635f..95bf92a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -35,6 +35,8 @@ Override the socket path with `QUPTIME_SOCKET=/run/foo.sock`. ## Environment variables +### Paths + | Variable | Purpose | | ----------------- | ------------------------------------------------------------------------------------------------------------------------- | | `QUPTIME_DIR` | Data directory. Defaults to `/etc/quptime` (root) or `$XDG_CONFIG_HOME/quptime`. | @@ -42,9 +44,52 @@ Override the socket path with `QUPTIME_SOCKET=/run/foo.sock`. | `XDG_CONFIG_HOME` | Honored when running as non-root and `QUPTIME_DIR` is unset. | | `XDG_RUNTIME_DIR` | Honored when running as non-root and `QUPTIME_SOCKET` is unset. 
| +### `node.yaml` field overrides + +Every field in `node.yaml` can also be supplied via an environment +variable. This is the recommended way to drive Docker / Compose +deployments: drop the env vars into the compose file and the daemon +will bootstrap on first start without a separate `qu init` step. + +| Variable | `node.yaml` field | Notes | +| ------------------------ | ----------------- | -------------------------------------------------------------------------------------------------------------- | +| `QUPTIME_NODE_ID` | `node_id` | Pin a specific UUID. Leave unset to let `qu init` / auto-init generate one. | +| `QUPTIME_BIND_ADDR` | `bind_addr` | Defaults to `0.0.0.0`. | +| `QUPTIME_BIND_PORT` | `bind_port` | Integer. Defaults to `9901`. | +| `QUPTIME_ADVERTISE` | `advertise` | `host:port` other peers use to reach this node. Required when bound to a wildcard or behind NAT. | +| `QUPTIME_CLUSTER_SECRET` | `cluster_secret` | Pre-shared join secret. Set the same value on every node. If unset on the very first node, one is generated. | + +Precedence is **env > file > compiled default**. Non-empty env values +win over whatever is stored in `node.yaml` at load time, so changing a +variable in `docker-compose.yml` and restarting the container is +enough to roll out new bind/advertise values — no on-disk edit +required. Empty env values are ignored (they will not clear a +previously persisted field). + +For `qu init` specifically, explicit command-line flags take +precedence over env values; env values fill in only the fields the +operator did not pass on the command line. + The daemon does not read any other environment variables. SMTP, Discord, and HTTP probe targets are configured exclusively in `cluster.yaml`. 
+## Auto-init on `qu serve` + +If `node.yaml` does not exist when `qu serve` starts, the daemon +bootstraps it in-place using the `QUPTIME_*` env vars above: a fresh +UUID is generated (or `QUPTIME_NODE_ID` is honored if set), an RSA +keypair and self-signed cert are written under `keys/`, and +`cluster.yaml` is seeded with this node as its sole peer. If no +`QUPTIME_CLUSTER_SECRET` was provided, a random one is generated and +printed to stderr — copy it to every follower node's +`QUPTIME_CLUSTER_SECRET` (or `--secret` flag) before they start. + +This is what makes the docker-compose flow `docker compose up`-only +on a fresh volume. To opt out (e.g. so a misconfigured deployment +crashes loudly instead of silently generating a new identity), run +`qu init` against the volume yourself before letting `qu serve` ever +see it. + ## `node.yaml` — local identity Never replicated. One file per host. Generated by `qu init`. diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 7f22607..d86884b 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -27,6 +27,14 @@ services: image: git.cer.sh/axodouble/quptime:v0.1.0 container_name: quptime restart: unless-stopped + environment: + # host:port other nodes use to reach this one. Must be reachable + # from every peer — the loopback inside the container is useless. + - QUPTIME_ADVERTISE=:9901 + # Pre-shared join secret. Omit on the very first node and read + # the generated value out of `docker logs quptime`, then set + # this env var on every follower before bringing them up. + - QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-} ports: - "9901:9901" volumes: @@ -41,17 +49,25 @@ volumes: quptime-data: ``` -You must **`qu init` before the daemon will start**. With this compose -file: +`qu serve` auto-initialises the data volume on first start using the +`QUPTIME_*` env vars (see [configuration.md](../configuration.md) for +the full list). 
One command brings everything up: ```sh -docker compose run --rm quptime init --advertise :9901 docker compose up -d docker compose exec quptime qu status ``` -`` must be reachable from every other node — the loopback -address inside the container is useless to peers. +On the very first node, capture the auto-generated cluster secret: + +```sh +docker compose logs quptime | grep -A1 'cluster secret' +``` + +Copy that value into the `QUPTIME_CLUSTER_SECRET` env var of every +follower before starting them, otherwise their join RPCs will be +rejected. The full list of accepted env vars lives in +[configuration.md](../configuration.md#nodeyaml-field-overrides). ## Three-node compose on a single host @@ -69,18 +85,27 @@ services: alpha: <<: *quptime container_name: alpha + environment: + - QUPTIME_ADVERTISE=alpha:9901 + # First node: leave secret unset and read it from `docker logs`. ports: ["9901:9901"] volumes: ["alpha-data:/etc/quptime"] bravo: <<: *quptime container_name: bravo + environment: + - QUPTIME_ADVERTISE=bravo:9901 + - QUPTIME_CLUSTER_SECRET=${SECRET} ports: ["9902:9901"] volumes: ["bravo-data:/etc/quptime"] charlie: <<: *quptime container_name: charlie + environment: + - QUPTIME_ADVERTISE=charlie:9901 + - QUPTIME_CLUSTER_SECRET=${SECRET} ports: ["9903:9901"] volumes: ["charlie-data:/etc/quptime"] @@ -93,15 +118,12 @@ volumes: Bootstrap: ```sh -# First node: prints the secret to stdout. -docker compose run --rm alpha init --advertise alpha:9901 -# Capture the secret (or read it back from alpha-data). -SECRET=$(docker compose exec alpha cat /etc/quptime/node.yaml | grep cluster_secret | awk '{print $2}') - -docker compose run --rm bravo init --advertise bravo:9901 --secret "$SECRET" -docker compose run --rm charlie init --advertise charlie:9901 --secret "$SECRET" - -docker compose up -d +# 1. Start alpha first to mint the cluster secret. +docker compose up -d alpha +# 2. Read the secret off alpha's stdout. 
+export SECRET=$(docker compose logs --no-log-prefix alpha | awk '/cluster secret/{getline; print $1}') +# 3. Bring up the followers — they pick up the secret from $SECRET. +docker compose up -d bravo charlie # Invite from alpha. The hostnames resolve over the compose network. docker compose exec alpha qu node add bravo:9901 @@ -127,6 +149,9 @@ services: image: git.cer.sh/axodouble/quptime:v0.1.0 container_name: quptime restart: unless-stopped + environment: + - QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE} # host:9901 reachable from peers + - QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET} ports: - "9901:9901" volumes: @@ -135,6 +160,10 @@ services: - NET_RAW ``` +Put the per-host values (`QUPTIME_ADVERTISE`, `QUPTIME_CLUSTER_SECRET`) +in a sibling `.env` file or a config-management secret so the compose +file itself is identical across hosts. + Persistence is a bind-mount under `/srv/quptime/data` so backups and upgrades hit a known path. See [operations.md](../operations.md) for the backup recipe. diff --git a/docs/deployment/tailscale.md b/docs/deployment/tailscale.md index 1b6ae43..1b26be7 100644 --- a/docs/deployment/tailscale.md +++ b/docs/deployment/tailscale.md @@ -53,12 +53,21 @@ services: quptime: image: git.cer.sh/axodouble/quptime:v0.1.0 container_name: quptime + environment: + # host:port other QUptime nodes use to reach this one. Should be + # this node's tailnet IP / MagicDNS name. Auto-init reads this on + # first start. + - QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE} + # Shared cluster join secret. Omit on the very first node to have + # it generated and logged for you, then copy it into every + # follower's .env. + - QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-} volumes: - quptime:/etc/quptime network_mode: "service:tailscale" depends_on: [tailscale] cap_add: [NET_RAW] - # No restart directive yet — needs `qu init` first. 
+ restart: unless-stopped volumes: tailscale: @@ -67,43 +76,41 @@ volumes: ### One-time bootstrap -Each host runs the same script with different `HOST` and `TAILSCALE_AUTHKEY`: +Each host runs the same compose file with a per-host `.env`: ```sh -# .env +# .env (alpha — the first node) HOST=alpha TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx +QUPTIME_ADVERTISE=100.64.1.1:9901 # this node's tailnet IP +# QUPTIME_CLUSTER_SECRET left unset — will be generated on first boot. ``` -Start Tailscale alone first so it gets an IP: +Start the stack on the first host. `qu serve` auto-initialises the +volume using the env vars above, so a single `docker compose up` +brings everything up: ```sh -docker compose up -d tailscale -sleep 5 -TSIP=$(docker compose exec tailscale tailscale ip --4) -echo "this node's tailnet IP: $TSIP" +docker compose up -d +docker compose logs quptime | grep -A1 'cluster secret' +# Pipe the secret through your password manager. ``` -On the **first** host, init without `--secret`: +On every **other** host, write the same `.env` plus the captured +secret: ```sh -docker compose run --rm quptime init --advertise "$TSIP:9901" -# Grab the printed secret; pipe through your password manager. +# .env (bravo, charlie, …) +HOST=bravo +TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx +QUPTIME_ADVERTISE=100.64.1.2:9901 +QUPTIME_CLUSTER_SECRET= ``` -On every **other** host, paste the secret: +Bring them up and invite them from the first node: ```sh -docker compose run --rm quptime init \ - --advertise "$TSIP:9901" \ - --secret "$CLUSTER_SECRET" -``` - -Then bring up `qu` on every node and invite from the first: - -```sh -# Each host -docker compose up -d quptime +docker compose up -d # From alpha docker compose exec quptime qu node add 100.64.1.2:9901 diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index c0e6e6d..f9025ab 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -146,15 +146,26 @@ both call this out. 
load node.yaml: open ...: no such file or directory ``` -Run `qu init` before `qu serve`. The daemon does not auto-init — -silently generating identities and secrets would be a worse failure -mode than crashing. +`qu serve` normally auto-bootstraps a missing `node.yaml` using the +`QUPTIME_*` env vars (see +[configuration.md](configuration.md#auto-init-on-qu-serve)). If you +still see this error, the most likely causes are: + +- The data directory is read-only or owned by a different user — the + bootstrap can't write `node.yaml`. Fix permissions on + `$QUPTIME_DIR`. +- Something else removed `node.yaml` mid-run (a config-management + tool, a misconfigured volume). Existing keys are never overwritten, + so wipe the rest of the data dir first; then re-run `qu serve` to + rebuild from env, or run `qu init` with the flags you want. ``` node.yaml has empty node_id — run `qu init` first ``` -Same fix. +`node.yaml` exists but lacks a `node_id`. Wipe the data dir (existing +keys are never overwritten, so deleting only `node.yaml` is not enough) +and let auto-init regenerate it, or run `qu init` against the wiped dir. ``` listen tcp :9901: bind: address already in use @@ -197,3 +208,7 @@ sudo systemctl start quptime The data directory is the only state. Wipe it and you're back to a fresh node. + +Under Docker (or any env-driven deploy), the explicit `qu init` step +isn't needed — wiping the data volume and restarting the container is +enough; `qu serve` will re-bootstrap from the `QUPTIME_*` env vars. diff --git a/internal/cli/init.go b/internal/cli/init.go index 085d28c..1604e7d 100644 --- a/internal/cli/init.go +++ b/internal/cli/init.go @@ -5,6 +5,7 @@ import ( "encoding/base64" "errors" "fmt" + "io" "os" "github.com/google/uuid" @@ -30,78 +31,50 @@ Pass --secret on every subsequent node so they share the same cluster join secret. If --secret is omitted on the very first node, a random secret is generated and printed for the operator to copy. +Every flag may also be supplied via its QUPTIME_* environment variable +(see docs/configuration.md). 
Explicit flags win over env values, which +in turn win over the compiled defaults. + Idempotent in one direction only: existing key material is never overwritten. Re-run only after wiping the data directory.`, RunE: func(cmd *cobra.Command, args []string) error { - if err := config.EnsureDataDir(); err != nil { - return err - } if _, err := os.Stat(config.NodeFilePath()); err == nil { return errors.New("node.yaml already exists in data dir — refusing to overwrite") } - secret := clusterSecret - generated := false - if secret == "" { - s, err := generateSecret() - if err != nil { - return fmt.Errorf("generate cluster secret: %w", err) - } - secret = s - generated = true + // Only let env fill fields the operator did NOT pass on the + // command line; explicit flags must win over env. + n := &config.NodeConfig{} + if cmd.Flags().Changed("bind") { + n.BindAddr = bindAddr + } + if cmd.Flags().Changed("port") { + n.BindPort = bindPort + } + if cmd.Flags().Changed("advertise") { + n.Advertise = advertise + } + if cmd.Flags().Changed("secret") { + n.ClusterSecret = clusterSecret + } + if err := n.ApplyEnvOverrides(); err != nil { + return err + } + // Cobra defaults (bind=0.0.0.0, port=9901) are still + // available as fallbacks for fields neither flag nor env + // touched. + if n.BindAddr == "" { + n.BindAddr = bindAddr + } + if n.BindPort == 0 { + n.BindPort = bindPort } - nodeID := uuid.NewString() - n := &config.NodeConfig{ - NodeID: nodeID, - BindAddr: bindAddr, - BindPort: bindPort, - Advertise: advertise, - ClusterSecret: secret, - } - if err := n.Save(); err != nil { - return fmt.Errorf("save node.yaml: %w", err) - } - if _, err := crypto.GenerateKeyPair(nodeID); err != nil { - return fmt.Errorf("generate keys: %w", err) - } - - // Seed cluster.yaml with this node as its own first peer. 
- // Without this the math in `quorum` would treat a one-node - // cluster as "0 peers, fallback quorum=1, master=self" — - // which works in isolation but breaks the moment another - // node joins, because the replicated peers list would lack - // the inviter, leading to split-brain elections. - certPEM, err := crypto.LoadCertPEM() + _, generated, err := bootstrapNode(n) if err != nil { - return fmt.Errorf("load cert: %w", err) - } - fp, err := crypto.FingerprintFromCertPEM(certPEM) - if err != nil { - return fmt.Errorf("fingerprint own cert: %w", err) - } - cluster := &config.ClusterConfig{} - if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error { - c.Peers = []config.PeerInfo{{ - NodeID: nodeID, - Advertise: n.AdvertiseAddr(), - Fingerprint: fp, - CertPEM: string(certPEM), - }} - return nil - }); err != nil { - return fmt.Errorf("seed cluster.yaml: %w", err) - } - - out := cmd.OutOrStdout() - fmt.Fprintf(out, "initialised node %s\n", nodeID) - fmt.Fprintf(out, "data dir: %s\n", config.DataDir()) - fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr()) - if generated { - fmt.Fprintln(out) - fmt.Fprintln(out, "cluster secret (copy to every other node via --secret):") - fmt.Fprintln(out, " "+secret) + return err } + printBootstrapResult(cmd.OutOrStdout(), n, generated) return nil }, } @@ -112,6 +85,94 @@ overwritten. Re-run only after wiping the data directory.`, root.AddCommand(cmd) } +// bootstrapNode creates the data dir, writes node.yaml, generates the +// keypair, and seeds cluster.yaml with this node as its own first +// peer. cfg may arrive with any subset of fields populated; missing +// NodeID and ClusterSecret are auto-generated, missing BindAddr / +// BindPort get the compiled defaults. +// +// Returns the populated config (the same pointer that was passed in) +// and a flag indicating whether ClusterSecret was generated here. 
The +// flag exists so the caller can print the secret for the operator — +// it must be copied to every follower node out-of-band. +// +// Caller is responsible for checking that node.yaml does not yet +// exist; bootstrapNode itself will refuse to overwrite an existing +// keypair (crypto.GenerateKeyPair errors out) but does not guard +// against clobbering node.yaml. +func bootstrapNode(cfg *config.NodeConfig) (*config.NodeConfig, bool, error) { + if err := config.EnsureDataDir(); err != nil { + return nil, false, err + } + if cfg.NodeID == "" { + cfg.NodeID = uuid.NewString() + } + if cfg.BindAddr == "" { + cfg.BindAddr = "0.0.0.0" + } + if cfg.BindPort == 0 { + cfg.BindPort = 9901 + } + generated := false + if cfg.ClusterSecret == "" { + s, err := generateSecret() + if err != nil { + return nil, false, fmt.Errorf("generate cluster secret: %w", err) + } + cfg.ClusterSecret = s + generated = true + } + if err := cfg.Save(); err != nil { + return nil, false, fmt.Errorf("save node.yaml: %w", err) + } + if _, err := crypto.GenerateKeyPair(cfg.NodeID); err != nil { + return nil, false, fmt.Errorf("generate keys: %w", err) + } + + // Seed cluster.yaml with this node as its own first peer. + // Without this the math in `quorum` would treat a one-node + // cluster as "0 peers, fallback quorum=1, master=self" — which + // works in isolation but breaks the moment another node joins, + // because the replicated peers list would lack the inviter, + // leading to split-brain elections. 
+ certPEM, err := crypto.LoadCertPEM() + if err != nil { + return nil, false, fmt.Errorf("load cert: %w", err) + } + fp, err := crypto.FingerprintFromCertPEM(certPEM) + if err != nil { + return nil, false, fmt.Errorf("fingerprint own cert: %w", err) + } + cluster := &config.ClusterConfig{} + if err := cluster.Mutate(cfg.NodeID, func(c *config.ClusterConfig) error { + c.Peers = []config.PeerInfo{{ + NodeID: cfg.NodeID, + Advertise: cfg.AdvertiseAddr(), + Fingerprint: fp, + CertPEM: string(certPEM), + }} + return nil + }); err != nil { + return nil, false, fmt.Errorf("seed cluster.yaml: %w", err) + } + return cfg, generated, nil +} + +// printBootstrapResult emits the human-readable summary both `qu init` +// and the serve auto-init path print after bootstrapping. Kept in one +// place so the secret-disclosure format stays identical across the two +// entry points. +func printBootstrapResult(out io.Writer, n *config.NodeConfig, secretGenerated bool) { + fmt.Fprintf(out, "initialised node %s\n", n.NodeID) + fmt.Fprintf(out, "data dir: %s\n", config.DataDir()) + fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr()) + if secretGenerated { + fmt.Fprintln(out) + fmt.Fprintln(out, "cluster secret (copy to every other node via --secret or QUPTIME_CLUSTER_SECRET):") + fmt.Fprintln(out, " "+n.ClusterSecret) + } +} + // generateSecret produces 32 bytes of crypto-random data and returns // it base64-encoded. Long enough that brute force isn't a concern; // short enough that operators can copy-paste it without pagination. 
diff --git a/internal/cli/serve.go b/internal/cli/serve.go index bb929fc..39559d3 100644 --- a/internal/cli/serve.go +++ b/internal/cli/serve.go @@ -2,6 +2,9 @@ package cli import ( "context" + "errors" + "fmt" + "io/fs" "log" "os" "os/signal" @@ -9,6 +12,7 @@ import ( "github.com/spf13/cobra" + "git.cer.sh/axodouble/quptime/internal/config" "git.cer.sh/axodouble/quptime/internal/daemon" ) @@ -18,9 +22,18 @@ func addServeCmd(root *cobra.Command) { Short: "Run the qu daemon in the foreground", Long: `Run the qu daemon: starts the inter-node listener, the local control socket for the CLI, the heartbeat loop and the check -scheduler. Stops cleanly on SIGINT or SIGTERM.`, +scheduler. Stops cleanly on SIGINT or SIGTERM. + +If node.yaml does not exist yet, serve will bootstrap it using values +from the QUPTIME_* environment variables (see docs/configuration.md). +This makes a single ` + "`docker compose up`" + ` enough to launch a new node — +no separate ` + "`qu init`" + ` step is required when the data volume is +fresh.`, RunE: func(cmd *cobra.Command, args []string) error { logger := log.New(os.Stderr, "quptime: ", log.LstdFlags|log.Lmsgprefix) + if err := autoInitIfNeeded(cmd, logger); err != nil { + return err + } d, err := daemon.New(logger) if err != nil { return err @@ -32,3 +45,39 @@ scheduler. Stops cleanly on SIGINT or SIGTERM.`, } root.AddCommand(cmd) } + +// autoInitIfNeeded bootstraps the node on first launch. +// +// Friction this removes for container deploys: before, the operator +// had to `docker compose run --rm quptime init …` once before the +// service could come up, which makes `restart: unless-stopped` +// awkward and forces an out-of-band step into every fresh volume. +// Now serve auto-runs the same bootstrap path using QUPTIME_* env +// vars when node.yaml is absent, so the compose file can come up on +// the first try. +// +// Pre-existing node.yaml is left untouched — we only bootstrap when +// the file is genuinely missing. 
Any other stat error (permission +// denied, broken symlink) is surfaced so the operator sees the real +// problem instead of a confused auto-init attempt clobbering state. +func autoInitIfNeeded(cmd *cobra.Command, logger *log.Logger) error { + _, err := os.Stat(config.NodeFilePath()) + if err == nil { + return nil + } + if !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("stat node.yaml: %w", err) + } + + logger.Printf("node.yaml not found at %s — bootstrapping from environment", config.NodeFilePath()) + n := &config.NodeConfig{} + if err := n.ApplyEnvOverrides(); err != nil { + return fmt.Errorf("auto-init: %w", err) + } + _, generated, err := bootstrapNode(n) + if err != nil { + return fmt.Errorf("auto-init: %w", err) + } + printBootstrapResult(cmd.OutOrStderr(), n, generated) + return nil +} diff --git a/internal/config/node.go b/internal/config/node.go index 41993f5..7a50ee6 100644 --- a/internal/config/node.go +++ b/internal/config/node.go @@ -3,10 +3,26 @@ package config import ( "fmt" "os" + "strconv" "gopkg.in/yaml.v3" ) +// Environment variable names that override fields on NodeConfig at +// load time. Intended to let `docker compose` setups drive a node's +// identity and listener configuration without having to bake a +// node.yaml into the image or run `qu init` manually first. +// +// Empty values are ignored — they do not clear a field. The override +// order is therefore: env (non-empty) > file > compiled default. +const ( + EnvNodeID = "QUPTIME_NODE_ID" + EnvBindAddr = "QUPTIME_BIND_ADDR" + EnvBindPort = "QUPTIME_BIND_PORT" + EnvAdvertise = "QUPTIME_ADVERTISE" + EnvClusterSecret = "QUPTIME_CLUSTER_SECRET" +) + // NodeConfig is the per-node, never-replicated identity file. type NodeConfig struct { // NodeID is a stable UUID generated at `qu init`. Used by all peers @@ -45,6 +61,34 @@ func (n *NodeConfig) AdvertiseAddr() string { return fmt.Sprintf("%s:%d", bind, n.BindPort) } +// ApplyEnvOverrides folds QUPTIME_* environment variables onto n. 
+// Non-empty env values win over the existing field value; a +// QUPTIME_BIND_PORT outside 0-65535 is rejected. Called by +// LoadNodeConfig and by the `qu init` / serve auto-init paths so the +// same precedence applies to a persisted or freshly built node.yaml. +func (n *NodeConfig) ApplyEnvOverrides() error { + if v := os.Getenv(EnvNodeID); v != "" { + n.NodeID = v + } + if v := os.Getenv(EnvBindAddr); v != "" { + n.BindAddr = v + } + if v := os.Getenv(EnvBindPort); v != "" { + p, err := strconv.ParseUint(v, 10, 16) + if err != nil { + return fmt.Errorf("%s=%q: not a valid port (0-65535): %w", EnvBindPort, v, err) + } + n.BindPort = int(p) + } + if v := os.Getenv(EnvAdvertise); v != "" { + n.Advertise = v + } + if v := os.Getenv(EnvClusterSecret); v != "" { + n.ClusterSecret = v + } + return nil +} + // LoadNodeConfig reads node.yaml from the data dir. func LoadNodeConfig() (*NodeConfig, error) { raw, err := os.ReadFile(NodeFilePath()) @@ -55,6 +99,9 @@ func LoadNodeConfig() (*NodeConfig, error) { if err := yaml.Unmarshal(raw, cfg); err != nil { return nil, fmt.Errorf("parse node.yaml: %w", err) } + if err := cfg.ApplyEnvOverrides(); err != nil { + return nil, err + } if cfg.BindPort == 0 { cfg.BindPort = 9901 } diff --git a/internal/config/node_test.go b/internal/config/node_test.go index 9c37fad..25ef9e6 100644 --- a/internal/config/node_test.go +++ b/internal/config/node_test.go @@ -4,9 +4,9 @@ import "testing" func TestAdvertiseAddrFallback(t *testing.T) { cases := []struct { - name string - cfg NodeConfig - want string + name string + cfg NodeConfig + want string }{ {"explicit advertise wins", NodeConfig{Advertise: "host:1234", BindAddr: "0.0.0.0", BindPort: 9901}, "host:1234"}, {"empty bind falls back to loopback", NodeConfig{BindPort: 9901}, "127.0.0.1:9901"}, @@ -56,3 +56,95 @@ func TestLoadNodeConfigAppliesDefaults(t *testing.T) { t.Errorf("BindAddr=%q want 0.0.0.0", loaded.BindAddr) } } + +func TestApplyEnvOverrides(t *testing.T) { + t.Setenv(EnvNodeID, "node-from-env") + 
t.Setenv(EnvBindAddr, "1.2.3.4") + t.Setenv(EnvBindPort, "9999") + t.Setenv(EnvAdvertise, "public.example.com:9999") + t.Setenv(EnvClusterSecret, "shh-secret") + + n := &NodeConfig{ + NodeID: "original-id", + BindAddr: "0.0.0.0", + BindPort: 9901, + Advertise: "old.example.com:9901", + ClusterSecret: "old-secret", + } + if err := n.ApplyEnvOverrides(); err != nil { + t.Fatal(err) + } + want := NodeConfig{ + NodeID: "node-from-env", + BindAddr: "1.2.3.4", + BindPort: 9999, + Advertise: "public.example.com:9999", + ClusterSecret: "shh-secret", + } + if *n != want { + t.Errorf("got %+v want %+v", *n, want) + } +} + +func TestApplyEnvOverridesEmptyValuesIgnored(t *testing.T) { + // Explicitly empty env vars must NOT clobber existing fields — + // otherwise `docker run -e QUPTIME_ADVERTISE=` would silently + // erase a previously-persisted advertise address. + t.Setenv(EnvNodeID, "") + t.Setenv(EnvBindAddr, "") + t.Setenv(EnvBindPort, "") + t.Setenv(EnvAdvertise, "") + t.Setenv(EnvClusterSecret, "") + + orig := NodeConfig{ + NodeID: "keep-me", + BindAddr: "10.0.0.1", + BindPort: 9901, + Advertise: "keep.example.com:9901", + ClusterSecret: "keep-secret", + } + n := orig + if err := n.ApplyEnvOverrides(); err != nil { + t.Fatal(err) + } + if n != orig { + t.Errorf("empty env vars mutated config: got %+v want %+v", n, orig) + } +} + +func TestApplyEnvOverridesBadPort(t *testing.T) { + t.Setenv(EnvBindPort, "not-an-int") + n := &NodeConfig{} + if err := n.ApplyEnvOverrides(); err == nil { + t.Fatal("expected error for non-integer port") + } +} + +func TestLoadNodeConfigEnvOverridesFile(t *testing.T) { + t.Setenv("QUPTIME_DIR", t.TempDir()) + // Persist a file with one bind addr; env should win on load. 
+ n := &NodeConfig{NodeID: "abc", BindAddr: "127.0.0.1", BindPort: 9901, Advertise: "file.example.com:9901"} + if err := n.Save(); err != nil { + t.Fatal(err) + } + t.Setenv(EnvBindAddr, "0.0.0.0") + t.Setenv(EnvAdvertise, "env.example.com:9001") + t.Setenv(EnvBindPort, "9001") + + loaded, err := LoadNodeConfig() + if err != nil { + t.Fatal(err) + } + if loaded.BindAddr != "0.0.0.0" { + t.Errorf("BindAddr=%q want 0.0.0.0 (env override)", loaded.BindAddr) + } + if loaded.BindPort != 9001 { + t.Errorf("BindPort=%d want 9001 (env override)", loaded.BindPort) + } + if loaded.Advertise != "env.example.com:9001" { + t.Errorf("Advertise=%q want env.example.com:9001 (env override)", loaded.Advertise) + } + if loaded.NodeID != "abc" { + t.Errorf("NodeID=%q want abc (unchanged)", loaded.NodeID) + } +}