Auto init via environment variables support, qu init for systemd
Container image / image (push) Successful in 1m38s
Container image / image (push) Successful in 1m38s
This commit is contained in:
@@ -1,5 +1,14 @@
|
||||
# An example of a docker compose with Tailscale & QUptime.
|
||||
# This setup is specifically intended for hosts that may not be able to reach each other directly or have a public IP address.
|
||||
# This setup is specifically intended for hosts that may not be able to
|
||||
# reach each other directly or have a public IP address.
|
||||
#
|
||||
# Bring it up with `docker compose -f docker-compose-tailscale.yml up -d`.
|
||||
# QUptime auto-initialises on first start using the QUPTIME_* env vars
|
||||
# below — no separate `qu init` step is required.
|
||||
#
|
||||
# On the first node, omit QUPTIME_CLUSTER_SECRET to have one generated
|
||||
# for you. Read it out of the logs (`docker logs quptime`) and copy it
|
||||
# into the .env of every other node before bringing them up.
|
||||
|
||||
services:
|
||||
tailscale:
|
||||
@@ -18,19 +27,27 @@ services:
|
||||
quptime:
|
||||
image: git.cer.sh/axodouble/quptime:master
|
||||
container_name: quptime
|
||||
environment:
|
||||
# host:port other QUptime nodes use to reach this one. Use the
|
||||
# Tailscale IP / MagicDNS name of this host. Required behind NAT.
|
||||
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE}
|
||||
# Shared cluster join secret. Set on every node. Leave unset on
|
||||
# the very first node — one will be generated and logged for you
|
||||
# to copy to the others. Followers MUST set this before starting.
|
||||
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
|
||||
# Optional: pin a port other than the default 9901.
|
||||
# - QUPTIME_BIND_PORT=9901
|
||||
volumes:
|
||||
- quptime:/etc/quptime
|
||||
ports:
|
||||
- "9901:9901"
|
||||
depends_on:
|
||||
- tailscale
|
||||
# No restart directive, user needs to init quptime first
|
||||
# Run `docker compose -f docker-compose-tailscale.yml run --rm quptime init` to initialize
|
||||
# the data volume before starting the service
|
||||
# If this is not the master node, use
|
||||
# `docker compose -f docker-compose-tailscale.yml run --rm quptime --advertise <TAILSCALE_IP>:9901 --secret <SECRET>`
|
||||
# And add the individual nodes to the cluster with `docker compose -f docker-compose-tailscale.yml run --rm quptime node add <OTHER_NODE_IP>:9901`
|
||||
network_mode: "service:tailscale" # Use the Tailscale network stack
|
||||
restart: unless-stopped
|
||||
# After this node is up, add peers from the master with:
|
||||
# docker compose -f docker-compose-tailscale.yml exec quptime \
|
||||
# qu node add <OTHER_NODE_TAILSCALE_IP>:9901
|
||||
|
||||
volumes:
|
||||
tailscale:
|
||||
|
||||
@@ -35,6 +35,8 @@ Override the socket path with `QUPTIME_SOCKET=/run/foo.sock`.
|
||||
|
||||
## Environment variables
|
||||
|
||||
### Paths
|
||||
|
||||
| Variable | Purpose |
|
||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `QUPTIME_DIR` | Data directory. Defaults to `/etc/quptime` (root) or `$XDG_CONFIG_HOME/quptime`. |
|
||||
@@ -42,9 +44,52 @@ Override the socket path with `QUPTIME_SOCKET=/run/foo.sock`.
|
||||
| `XDG_CONFIG_HOME` | Honored when running as non-root and `QUPTIME_DIR` is unset. |
|
||||
| `XDG_RUNTIME_DIR` | Honored when running as non-root and `QUPTIME_SOCKET` is unset. |
|
||||
|
||||
### `node.yaml` field overrides
|
||||
|
||||
Every field in `node.yaml` can also be supplied via an environment
|
||||
variable. This is the recommended way to drive Docker / Compose
|
||||
deployments: drop the env vars into the compose file and the daemon
|
||||
will bootstrap on first start without a separate `qu init` step.
|
||||
|
||||
| Variable | `node.yaml` field | Notes |
|
||||
| ------------------------ | ----------------- | -------------------------------------------------------------------------------------------------------------- |
|
||||
| `QUPTIME_NODE_ID` | `node_id` | Pin a specific UUID. Leave unset to let `qu init` / auto-init generate one. |
|
||||
| `QUPTIME_BIND_ADDR` | `bind_addr` | Defaults to `0.0.0.0`. |
|
||||
| `QUPTIME_BIND_PORT` | `bind_port` | Integer. Defaults to `9901`. |
|
||||
| `QUPTIME_ADVERTISE` | `advertise` | `host:port` other peers use to reach this node. Required when bound to a wildcard or behind NAT. |
|
||||
| `QUPTIME_CLUSTER_SECRET` | `cluster_secret` | Pre-shared join secret. Set the same value on every node. If unset on the very first node, one is generated. |
|
||||
|
||||
Precedence is **env > file > compiled default**. Non-empty env values
|
||||
win over whatever is stored in `node.yaml` at load time, so changing a
|
||||
variable in `docker-compose.yml` and restarting the container is
|
||||
enough to roll out new bind/advertise values — no on-disk edit
|
||||
required. Empty env values are ignored (they will not clear a
|
||||
previously persisted field).
|
||||
|
||||
For `qu init` specifically, explicit command-line flags take
|
||||
precedence over env values; env values fill in only the fields the
|
||||
operator did not pass on the command line.
|
||||
|
||||
The daemon does not read any other environment variables. SMTP, Discord,
|
||||
and HTTP probe targets are configured exclusively in `cluster.yaml`.
|
||||
|
||||
## Auto-init on `qu serve`
|
||||
|
||||
If `node.yaml` does not exist when `qu serve` starts, the daemon
|
||||
bootstraps it in-place using the `QUPTIME_*` env vars above: a fresh
|
||||
UUID is generated (or `QUPTIME_NODE_ID` is honored if set), an RSA
|
||||
keypair and self-signed cert are written under `keys/`, and
|
||||
`cluster.yaml` is seeded with this node as its sole peer. If no
|
||||
`QUPTIME_CLUSTER_SECRET` was provided, a random one is generated and
|
||||
printed to stderr — copy it to every follower node's
|
||||
`QUPTIME_CLUSTER_SECRET` (or `--secret` flag) before they start.
|
||||
|
||||
This is what makes the docker-compose flow `docker compose up`-only
|
||||
on a fresh volume. To opt out (e.g. so a misconfigured deployment
|
||||
crashes loudly instead of silently generating a new identity), run
|
||||
`qu init` against the volume yourself before letting `qu serve` ever
|
||||
see it.
|
||||
|
||||
## `node.yaml` — local identity
|
||||
|
||||
Never replicated. One file per host. Generated by `qu init`.
|
||||
|
||||
+43
-14
@@ -27,6 +27,14 @@ services:
|
||||
image: git.cer.sh/axodouble/quptime:v0.1.0
|
||||
container_name: quptime
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
# host:port other nodes use to reach this one. Must be reachable
|
||||
# from every peer — the loopback inside the container is useless.
|
||||
- QUPTIME_ADVERTISE=<host-ip>:9901
|
||||
# Pre-shared join secret. Omit on the very first node and read
|
||||
# the generated value out of `docker logs quptime`, then set
|
||||
# this env var on every follower before bringing them up.
|
||||
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
|
||||
ports:
|
||||
- "9901:9901"
|
||||
volumes:
|
||||
@@ -41,17 +49,25 @@ volumes:
|
||||
quptime-data:
|
||||
```
|
||||
|
||||
You must **`qu init` before the daemon will start**. With this compose
|
||||
file:
|
||||
`qu serve` auto-initialises the data volume on first start using the
|
||||
`QUPTIME_*` env vars (see [configuration.md](../configuration.md) for
|
||||
the full list). One command brings everything up:
|
||||
|
||||
```sh
|
||||
docker compose run --rm quptime init --advertise <host-ip>:9901
|
||||
docker compose up -d
|
||||
docker compose exec quptime qu status
|
||||
```
|
||||
|
||||
`<host-ip>` must be reachable from every other node — the loopback
|
||||
address inside the container is useless to peers.
|
||||
On the very first node, capture the auto-generated cluster secret:
|
||||
|
||||
```sh
|
||||
docker compose logs quptime | grep -A1 'cluster secret'
|
||||
```
|
||||
|
||||
Copy that value into the `QUPTIME_CLUSTER_SECRET` env var of every
|
||||
follower before starting them, otherwise their join RPCs will be
|
||||
rejected. The full list of accepted env vars lives in
|
||||
[configuration.md](../configuration.md#nodeyaml-field-overrides).
|
||||
|
||||
## Three-node compose on a single host
|
||||
|
||||
@@ -69,18 +85,27 @@ services:
|
||||
alpha:
|
||||
<<: *quptime
|
||||
container_name: alpha
|
||||
environment:
|
||||
- QUPTIME_ADVERTISE=alpha:9901
|
||||
# First node: leave secret unset and read it from `docker logs`.
|
||||
ports: ["9901:9901"]
|
||||
volumes: ["alpha-data:/etc/quptime"]
|
||||
|
||||
bravo:
|
||||
<<: *quptime
|
||||
container_name: bravo
|
||||
environment:
|
||||
- QUPTIME_ADVERTISE=bravo:9901
|
||||
- QUPTIME_CLUSTER_SECRET=${SECRET}
|
||||
ports: ["9902:9901"]
|
||||
volumes: ["bravo-data:/etc/quptime"]
|
||||
|
||||
charlie:
|
||||
<<: *quptime
|
||||
container_name: charlie
|
||||
environment:
|
||||
- QUPTIME_ADVERTISE=charlie:9901
|
||||
- QUPTIME_CLUSTER_SECRET=${SECRET}
|
||||
ports: ["9903:9901"]
|
||||
volumes: ["charlie-data:/etc/quptime"]
|
||||
|
||||
@@ -93,15 +118,12 @@ volumes:
|
||||
Bootstrap:
|
||||
|
||||
```sh
|
||||
# First node: prints the secret to stdout.
|
||||
docker compose run --rm alpha init --advertise alpha:9901
|
||||
# Capture the secret (or read it back from alpha-data).
|
||||
SECRET=$(docker compose exec alpha cat /etc/quptime/node.yaml | grep cluster_secret | awk '{print $2}')
|
||||
|
||||
docker compose run --rm bravo init --advertise bravo:9901 --secret "$SECRET"
|
||||
docker compose run --rm charlie init --advertise charlie:9901 --secret "$SECRET"
|
||||
|
||||
docker compose up -d
|
||||
# 1. Start alpha first to mint the cluster secret.
|
||||
docker compose up -d alpha
|
||||
# 2. Read the secret off alpha's stdout.
|
||||
export SECRET=$(docker compose logs alpha | awk '/cluster secret/{getline; print $1}')
|
||||
# 3. Bring up the followers — they pick up the secret from $SECRET.
|
||||
docker compose up -d bravo charlie
|
||||
|
||||
# Invite from alpha. The hostnames resolve over the compose network.
|
||||
docker compose exec alpha qu node add bravo:9901
|
||||
@@ -127,6 +149,9 @@ services:
|
||||
image: git.cer.sh/axodouble/quptime:v0.1.0
|
||||
container_name: quptime
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE} # host:9901 reachable from peers
|
||||
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET}
|
||||
ports:
|
||||
- "9901:9901"
|
||||
volumes:
|
||||
@@ -135,6 +160,10 @@ services:
|
||||
- NET_RAW
|
||||
```
|
||||
|
||||
Put the per-host values (`QUPTIME_ADVERTISE`, `QUPTIME_CLUSTER_SECRET`)
|
||||
in a sibling `.env` file or a config-management secret so the compose
|
||||
file itself is identical across hosts.
|
||||
|
||||
Persistence is a bind-mount under `/srv/quptime/data` so backups and
|
||||
upgrades hit a known path. See [operations.md](../operations.md) for
|
||||
the backup recipe.
|
||||
|
||||
@@ -53,12 +53,21 @@ services:
|
||||
quptime:
|
||||
image: git.cer.sh/axodouble/quptime:v0.1.0
|
||||
container_name: quptime
|
||||
environment:
|
||||
# host:port other QUptime nodes use to reach this one. Should be
|
||||
# this node's tailnet IP / MagicDNS name. Auto-init reads this on
|
||||
# first start.
|
||||
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE}
|
||||
# Shared cluster join secret. Omit on the very first node to have
|
||||
# it generated and logged for you, then copy it into every
|
||||
# follower's .env.
|
||||
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
|
||||
volumes:
|
||||
- quptime:/etc/quptime
|
||||
network_mode: "service:tailscale"
|
||||
depends_on: [tailscale]
|
||||
cap_add: [NET_RAW]
|
||||
# No restart directive yet — needs `qu init` first.
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
tailscale:
|
||||
@@ -67,43 +76,41 @@ volumes:
|
||||
|
||||
### One-time bootstrap
|
||||
|
||||
Each host runs the same script with different `HOST` and `TAILSCALE_AUTHKEY`:
|
||||
Each host runs the same compose file with a per-host `.env`:
|
||||
|
||||
```sh
|
||||
# .env
|
||||
# .env (alpha — the first node)
|
||||
HOST=alpha
|
||||
TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx
|
||||
QUPTIME_ADVERTISE=100.64.1.1:9901 # this node's tailnet IP
|
||||
# QUPTIME_CLUSTER_SECRET left unset — will be generated on first boot.
|
||||
```
|
||||
|
||||
Start Tailscale alone first so it gets an IP:
|
||||
Start the stack on the first host. `qu serve` auto-initialises the
|
||||
volume using the env vars above, so a single `docker compose up`
|
||||
brings everything up:
|
||||
|
||||
```sh
|
||||
docker compose up -d tailscale
|
||||
sleep 5
|
||||
TSIP=$(docker compose exec tailscale tailscale ip --4)
|
||||
echo "this node's tailnet IP: $TSIP"
|
||||
docker compose up -d
|
||||
docker compose logs quptime | grep -A1 'cluster secret'
|
||||
# Pipe the secret through your password manager.
|
||||
```
|
||||
|
||||
On the **first** host, init without `--secret`:
|
||||
On every **other** host, write the same `.env` plus the captured
|
||||
secret:
|
||||
|
||||
```sh
|
||||
docker compose run --rm quptime init --advertise "$TSIP:9901"
|
||||
# Grab the printed secret; pipe through your password manager.
|
||||
# .env (bravo, charlie, …)
|
||||
HOST=bravo
|
||||
TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx
|
||||
QUPTIME_ADVERTISE=100.64.1.2:9901
|
||||
QUPTIME_CLUSTER_SECRET=<paste from alpha>
|
||||
```
|
||||
|
||||
On every **other** host, paste the secret:
|
||||
Bring them up and invite them from the first node:
|
||||
|
||||
```sh
|
||||
docker compose run --rm quptime init \
|
||||
--advertise "$TSIP:9901" \
|
||||
--secret "$CLUSTER_SECRET"
|
||||
```
|
||||
|
||||
Then bring up `qu` on every node and invite from the first:
|
||||
|
||||
```sh
|
||||
# Each host
|
||||
docker compose up -d quptime
|
||||
docker compose up -d
|
||||
|
||||
# From alpha
|
||||
docker compose exec quptime qu node add 100.64.1.2:9901
|
||||
|
||||
+19
-4
@@ -146,15 +146,26 @@ both call this out.
|
||||
load node.yaml: open ...: no such file or directory
|
||||
```
|
||||
|
||||
Run `qu init` before `qu serve`. The daemon does not auto-init —
|
||||
silently generating identities and secrets would be a worse failure
|
||||
mode than crashing.
|
||||
`qu serve` normally auto-bootstraps a missing `node.yaml` using the
|
||||
`QUPTIME_*` env vars (see
|
||||
[configuration.md](configuration.md#auto-init-on-qu-serve)). If you
|
||||
still see this error, the most likely causes are:
|
||||
|
||||
- The data directory is read-only or owned by a different user — the
|
||||
bootstrap can't write `node.yaml`. Fix permissions on
|
||||
`$QUPTIME_DIR`.
|
||||
- Something else removed `node.yaml` mid-run (a config-management
|
||||
tool, a misconfigured volume). Re-run `qu serve` and it will
|
||||
rebuild from env, or run `qu init` manually with the flags you
|
||||
want.
|
||||
|
||||
```
|
||||
node.yaml has empty node_id — run `qu init` first
|
||||
```
|
||||
|
||||
Same fix.
|
||||
`node.yaml` exists but lacks a `node_id`. Either delete the file and
|
||||
let auto-init regenerate it, or run `qu init` against a wiped data
|
||||
dir.
|
||||
|
||||
```
|
||||
listen tcp :9901: bind: address already in use
|
||||
@@ -197,3 +208,7 @@ sudo systemctl start quptime
|
||||
|
||||
The data directory is the only state. Wipe it and you're back to a
|
||||
fresh node.
|
||||
|
||||
Under Docker (or any env-driven deploy), the explicit `qu init` step
|
||||
isn't needed — wiping the data volume and restarting the container is
|
||||
enough; `qu serve` will re-bootstrap from the `QUPTIME_*` env vars.
|
||||
|
||||
+122
-61
@@ -5,6 +5,7 @@ import (
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@@ -30,78 +31,50 @@ Pass --secret on every subsequent node so they share the same
|
||||
cluster join secret. If --secret is omitted on the very first node, a
|
||||
random secret is generated and printed for the operator to copy.
|
||||
|
||||
Every flag may also be supplied via its QUPTIME_* environment variable
|
||||
(see docs/configuration.md). Explicit flags win over env values, which
|
||||
in turn win over the compiled defaults.
|
||||
|
||||
Idempotent in one direction only: existing key material is never
|
||||
overwritten. Re-run only after wiping the data directory.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
if err := config.EnsureDataDir(); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := os.Stat(config.NodeFilePath()); err == nil {
|
||||
return errors.New("node.yaml already exists in data dir — refusing to overwrite")
|
||||
}
|
||||
|
||||
secret := clusterSecret
|
||||
generated := false
|
||||
if secret == "" {
|
||||
s, err := generateSecret()
|
||||
if err != nil {
|
||||
return fmt.Errorf("generate cluster secret: %w", err)
|
||||
}
|
||||
secret = s
|
||||
generated = true
|
||||
// Only let env fill fields the operator did NOT pass on the
|
||||
// command line; explicit flags must win over env.
|
||||
n := &config.NodeConfig{}
|
||||
if cmd.Flags().Changed("bind") {
|
||||
n.BindAddr = bindAddr
|
||||
}
|
||||
if cmd.Flags().Changed("port") {
|
||||
n.BindPort = bindPort
|
||||
}
|
||||
if cmd.Flags().Changed("advertise") {
|
||||
n.Advertise = advertise
|
||||
}
|
||||
if cmd.Flags().Changed("secret") {
|
||||
n.ClusterSecret = clusterSecret
|
||||
}
|
||||
if err := n.ApplyEnvOverrides(); err != nil {
|
||||
return err
|
||||
}
|
||||
// Cobra defaults (bind=0.0.0.0, port=9901) are still
|
||||
// available as fallbacks for fields neither flag nor env
|
||||
// touched.
|
||||
if n.BindAddr == "" {
|
||||
n.BindAddr = bindAddr
|
||||
}
|
||||
if n.BindPort == 0 {
|
||||
n.BindPort = bindPort
|
||||
}
|
||||
|
||||
nodeID := uuid.NewString()
|
||||
n := &config.NodeConfig{
|
||||
NodeID: nodeID,
|
||||
BindAddr: bindAddr,
|
||||
BindPort: bindPort,
|
||||
Advertise: advertise,
|
||||
ClusterSecret: secret,
|
||||
}
|
||||
if err := n.Save(); err != nil {
|
||||
return fmt.Errorf("save node.yaml: %w", err)
|
||||
}
|
||||
if _, err := crypto.GenerateKeyPair(nodeID); err != nil {
|
||||
return fmt.Errorf("generate keys: %w", err)
|
||||
}
|
||||
|
||||
// Seed cluster.yaml with this node as its own first peer.
|
||||
// Without this the math in `quorum` would treat a one-node
|
||||
// cluster as "0 peers, fallback quorum=1, master=self" —
|
||||
// which works in isolation but breaks the moment another
|
||||
// node joins, because the replicated peers list would lack
|
||||
// the inviter, leading to split-brain elections.
|
||||
certPEM, err := crypto.LoadCertPEM()
|
||||
_, generated, err := bootstrapNode(n)
|
||||
if err != nil {
|
||||
return fmt.Errorf("load cert: %w", err)
|
||||
}
|
||||
fp, err := crypto.FingerprintFromCertPEM(certPEM)
|
||||
if err != nil {
|
||||
return fmt.Errorf("fingerprint own cert: %w", err)
|
||||
}
|
||||
cluster := &config.ClusterConfig{}
|
||||
if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error {
|
||||
c.Peers = []config.PeerInfo{{
|
||||
NodeID: nodeID,
|
||||
Advertise: n.AdvertiseAddr(),
|
||||
Fingerprint: fp,
|
||||
CertPEM: string(certPEM),
|
||||
}}
|
||||
return nil
|
||||
}); err != nil {
|
||||
return fmt.Errorf("seed cluster.yaml: %w", err)
|
||||
}
|
||||
|
||||
out := cmd.OutOrStdout()
|
||||
fmt.Fprintf(out, "initialised node %s\n", nodeID)
|
||||
fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
|
||||
fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())
|
||||
if generated {
|
||||
fmt.Fprintln(out)
|
||||
fmt.Fprintln(out, "cluster secret (copy to every other node via --secret):")
|
||||
fmt.Fprintln(out, " "+secret)
|
||||
return err
|
||||
}
|
||||
printBootstrapResult(cmd.OutOrStdout(), n, generated)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
@@ -112,6 +85,94 @@ overwritten. Re-run only after wiping the data directory.`,
|
||||
root.AddCommand(cmd)
|
||||
}
|
||||
|
||||
// bootstrapNode creates the data dir, writes node.yaml, generates the
// keypair, and seeds cluster.yaml with this node as its own first
// peer. cfg may arrive with any subset of fields populated; missing
// NodeID and ClusterSecret are auto-generated, missing BindAddr /
// BindPort get the compiled defaults.
//
// Returns the populated config (the same pointer that was passed in)
// and a flag indicating whether ClusterSecret was generated here. The
// flag exists so the caller can print the secret for the operator —
// it must be copied to every follower node out-of-band.
//
// Caller is responsible for checking that node.yaml does not yet
// exist; bootstrapNode itself will refuse to overwrite an existing
// keypair (crypto.GenerateKeyPair errors out) but does not guard
// against clobbering node.yaml.
func bootstrapNode(cfg *config.NodeConfig) (*config.NodeConfig, bool, error) {
	if err := config.EnsureDataDir(); err != nil {
		return nil, false, err
	}
	// Fill compiled defaults for any field neither flags nor env set.
	if cfg.NodeID == "" {
		cfg.NodeID = uuid.NewString()
	}
	if cfg.BindAddr == "" {
		cfg.BindAddr = "0.0.0.0"
	}
	if cfg.BindPort == 0 {
		cfg.BindPort = 9901
	}
	generated := false
	if cfg.ClusterSecret == "" {
		s, err := generateSecret()
		if err != nil {
			return nil, false, fmt.Errorf("generate cluster secret: %w", err)
		}
		cfg.ClusterSecret = s
		generated = true
	}
	// Persist identity before generating keys so a key-generation
	// failure leaves a node.yaml the operator can inspect.
	if err := cfg.Save(); err != nil {
		return nil, false, fmt.Errorf("save node.yaml: %w", err)
	}
	if _, err := crypto.GenerateKeyPair(cfg.NodeID); err != nil {
		return nil, false, fmt.Errorf("generate keys: %w", err)
	}

	// Seed cluster.yaml with this node as its own first peer.
	// Without this the math in `quorum` would treat a one-node
	// cluster as "0 peers, fallback quorum=1, master=self" — which
	// works in isolation but breaks the moment another node joins,
	// because the replicated peers list would lack the inviter,
	// leading to split-brain elections.
	certPEM, err := crypto.LoadCertPEM()
	if err != nil {
		return nil, false, fmt.Errorf("load cert: %w", err)
	}
	fp, err := crypto.FingerprintFromCertPEM(certPEM)
	if err != nil {
		return nil, false, fmt.Errorf("fingerprint own cert: %w", err)
	}
	cluster := &config.ClusterConfig{}
	if err := cluster.Mutate(cfg.NodeID, func(c *config.ClusterConfig) error {
		c.Peers = []config.PeerInfo{{
			NodeID:      cfg.NodeID,
			Advertise:   cfg.AdvertiseAddr(),
			Fingerprint: fp,
			CertPEM:     string(certPEM),
		}}
		return nil
	}); err != nil {
		return nil, false, fmt.Errorf("seed cluster.yaml: %w", err)
	}
	return cfg, generated, nil
}
|
||||
|
||||
// printBootstrapResult emits the human-readable summary both `qu init`
|
||||
// and the serve auto-init path print after bootstrapping. Kept in one
|
||||
// place so the secret-disclosure format stays identical across the two
|
||||
// entry points.
|
||||
func printBootstrapResult(out io.Writer, n *config.NodeConfig, secretGenerated bool) {
|
||||
fmt.Fprintf(out, "initialised node %s\n", n.NodeID)
|
||||
fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
|
||||
fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())
|
||||
if secretGenerated {
|
||||
fmt.Fprintln(out)
|
||||
fmt.Fprintln(out, "cluster secret (copy to every other node via --secret or QUPTIME_CLUSTER_SECRET):")
|
||||
fmt.Fprintln(out, " "+n.ClusterSecret)
|
||||
}
|
||||
}
|
||||
|
||||
// generateSecret produces 32 bytes of crypto-random data and returns
|
||||
// it base64-encoded. Long enough that brute force isn't a concern;
|
||||
// short enough that operators can copy-paste it without pagination.
|
||||
|
||||
+50
-1
@@ -2,6 +2,9 @@ package cli
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
@@ -9,6 +12,7 @@ import (
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
|
||||
"git.cer.sh/axodouble/quptime/internal/config"
|
||||
"git.cer.sh/axodouble/quptime/internal/daemon"
|
||||
)
|
||||
|
||||
@@ -18,9 +22,18 @@ func addServeCmd(root *cobra.Command) {
|
||||
Short: "Run the qu daemon in the foreground",
|
||||
Long: `Run the qu daemon: starts the inter-node listener, the local
|
||||
control socket for the CLI, the heartbeat loop and the check
|
||||
scheduler. Stops cleanly on SIGINT or SIGTERM.`,
|
||||
scheduler. Stops cleanly on SIGINT or SIGTERM.
|
||||
|
||||
If node.yaml does not exist yet, serve will bootstrap it using values
|
||||
from the QUPTIME_* environment variables (see docs/configuration.md).
|
||||
This makes a single ` + "`docker compose up`" + ` enough to launch a new node —
|
||||
no separate ` + "`qu init`" + ` step is required when the data volume is
|
||||
fresh.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
logger := log.New(os.Stderr, "quptime: ", log.LstdFlags|log.Lmsgprefix)
|
||||
if err := autoInitIfNeeded(cmd, logger); err != nil {
|
||||
return err
|
||||
}
|
||||
d, err := daemon.New(logger)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -32,3 +45,39 @@ scheduler. Stops cleanly on SIGINT or SIGTERM.`,
|
||||
}
|
||||
root.AddCommand(cmd)
|
||||
}
|
||||
|
||||
// autoInitIfNeeded bootstraps the node on first launch.
|
||||
//
|
||||
// Friction this removes for container deploys: before, the operator
|
||||
// had to `docker compose run --rm quptime init …` once before the
|
||||
// service could come up, which makes `restart: unless-stopped`
|
||||
// awkward and forces an out-of-band step into every fresh volume.
|
||||
// Now serve auto-runs the same bootstrap path using QUPTIME_* env
|
||||
// vars when node.yaml is absent, so the compose file can come up on
|
||||
// the first try.
|
||||
//
|
||||
// Pre-existing node.yaml is left untouched — we only bootstrap when
|
||||
// the file is genuinely missing. Any other stat error (permission
|
||||
// denied, broken symlink) is surfaced so the operator sees the real
|
||||
// problem instead of a confused auto-init attempt clobbering state.
|
||||
func autoInitIfNeeded(cmd *cobra.Command, logger *log.Logger) error {
|
||||
_, err := os.Stat(config.NodeFilePath())
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
if !errors.Is(err, fs.ErrNotExist) {
|
||||
return fmt.Errorf("stat node.yaml: %w", err)
|
||||
}
|
||||
|
||||
logger.Printf("node.yaml not found at %s — bootstrapping from environment", config.NodeFilePath())
|
||||
n := &config.NodeConfig{}
|
||||
if err := n.ApplyEnvOverrides(); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, generated, err := bootstrapNode(n); err != nil {
|
||||
return fmt.Errorf("auto-init: %w", err)
|
||||
} else {
|
||||
printBootstrapResult(cmd.OutOrStderr(), n, generated)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -3,10 +3,26 @@ package config
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Environment variable names that override fields on NodeConfig at
|
||||
// load time. Intended to let `docker compose` setups drive a node's
|
||||
// identity and listener configuration without having to bake a
|
||||
// node.yaml into the image or run `qu init` manually first.
|
||||
//
|
||||
// Empty values are ignored — they do not clear a field. The override
|
||||
// order is therefore: env (non-empty) > file > compiled default.
|
||||
const (
|
||||
EnvNodeID = "QUPTIME_NODE_ID"
|
||||
EnvBindAddr = "QUPTIME_BIND_ADDR"
|
||||
EnvBindPort = "QUPTIME_BIND_PORT"
|
||||
EnvAdvertise = "QUPTIME_ADVERTISE"
|
||||
EnvClusterSecret = "QUPTIME_CLUSTER_SECRET"
|
||||
)
|
||||
|
||||
// NodeConfig is the per-node, never-replicated identity file.
|
||||
type NodeConfig struct {
|
||||
// NodeID is a stable UUID generated at `qu init`. Used by all peers
|
||||
@@ -45,6 +61,34 @@ func (n *NodeConfig) AdvertiseAddr() string {
|
||||
return fmt.Sprintf("%s:%d", bind, n.BindPort)
|
||||
}
|
||||
|
||||
// ApplyEnvOverrides folds QUPTIME_* environment variables onto n.
|
||||
// Non-empty env values win over the existing field value. Called both
|
||||
// by LoadNodeConfig and by the `qu init` / serve auto-init paths so
|
||||
// the same precedence rules apply whether the daemon is reading a
|
||||
// persisted node.yaml or constructing one from scratch.
|
||||
func (n *NodeConfig) ApplyEnvOverrides() error {
|
||||
if v := os.Getenv(EnvNodeID); v != "" {
|
||||
n.NodeID = v
|
||||
}
|
||||
if v := os.Getenv(EnvBindAddr); v != "" {
|
||||
n.BindAddr = v
|
||||
}
|
||||
if v := os.Getenv(EnvBindPort); v != "" {
|
||||
p, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s=%q: not an integer: %w", EnvBindPort, v, err)
|
||||
}
|
||||
n.BindPort = p
|
||||
}
|
||||
if v := os.Getenv(EnvAdvertise); v != "" {
|
||||
n.Advertise = v
|
||||
}
|
||||
if v := os.Getenv(EnvClusterSecret); v != "" {
|
||||
n.ClusterSecret = v
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadNodeConfig reads node.yaml from the data dir.
|
||||
func LoadNodeConfig() (*NodeConfig, error) {
|
||||
raw, err := os.ReadFile(NodeFilePath())
|
||||
@@ -55,6 +99,9 @@ func LoadNodeConfig() (*NodeConfig, error) {
|
||||
if err := yaml.Unmarshal(raw, cfg); err != nil {
|
||||
return nil, fmt.Errorf("parse node.yaml: %w", err)
|
||||
}
|
||||
if err := cfg.ApplyEnvOverrides(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if cfg.BindPort == 0 {
|
||||
cfg.BindPort = 9901
|
||||
}
|
||||
|
||||
@@ -4,9 +4,9 @@ import "testing"
|
||||
|
||||
func TestAdvertiseAddrFallback(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
cfg NodeConfig
|
||||
want string
|
||||
name string
|
||||
cfg NodeConfig
|
||||
want string
|
||||
}{
|
||||
{"explicit advertise wins", NodeConfig{Advertise: "host:1234", BindAddr: "0.0.0.0", BindPort: 9901}, "host:1234"},
|
||||
{"empty bind falls back to loopback", NodeConfig{BindPort: 9901}, "127.0.0.1:9901"},
|
||||
@@ -56,3 +56,95 @@ func TestLoadNodeConfigAppliesDefaults(t *testing.T) {
|
||||
t.Errorf("BindAddr=%q want 0.0.0.0", loaded.BindAddr)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyEnvOverrides verifies that every QUPTIME_* variable, when
// set to a non-empty value, overwrites the corresponding field even if
// the config already carries a value — i.e. env beats file contents.
func TestApplyEnvOverrides(t *testing.T) {
	t.Setenv(EnvNodeID, "node-from-env")
	t.Setenv(EnvBindAddr, "1.2.3.4")
	t.Setenv(EnvBindPort, "9999")
	t.Setenv(EnvAdvertise, "public.example.com:9999")
	t.Setenv(EnvClusterSecret, "shh-secret")

	// Pre-populate every field so the test proves overriding, not
	// just filling of zero values.
	n := &NodeConfig{
		NodeID:        "original-id",
		BindAddr:      "0.0.0.0",
		BindPort:      9901,
		Advertise:     "old.example.com:9901",
		ClusterSecret: "old-secret",
	}
	if err := n.ApplyEnvOverrides(); err != nil {
		t.Fatal(err)
	}
	want := NodeConfig{
		NodeID:        "node-from-env",
		BindAddr:      "1.2.3.4",
		BindPort:      9999,
		Advertise:     "public.example.com:9999",
		ClusterSecret: "shh-secret",
	}
	if *n != want {
		t.Errorf("got %+v want %+v", *n, want)
	}
}
|
||||
|
||||
// TestApplyEnvOverridesEmptyValuesIgnored verifies that env vars set
// to the empty string leave the config completely untouched.
func TestApplyEnvOverridesEmptyValuesIgnored(t *testing.T) {
	// Explicitly empty env vars must NOT clobber existing fields —
	// otherwise `docker run -e QUPTIME_ADVERTISE=` would silently
	// erase a previously-persisted advertise address.
	t.Setenv(EnvNodeID, "")
	t.Setenv(EnvBindAddr, "")
	t.Setenv(EnvBindPort, "")
	t.Setenv(EnvAdvertise, "")
	t.Setenv(EnvClusterSecret, "")

	orig := NodeConfig{
		NodeID:        "keep-me",
		BindAddr:      "10.0.0.1",
		BindPort:      9901,
		Advertise:     "keep.example.com:9901",
		ClusterSecret: "keep-secret",
	}
	// Work on a copy so the struct comparison below catches any
	// mutation at all, field by field.
	n := orig
	if err := n.ApplyEnvOverrides(); err != nil {
		t.Fatal(err)
	}
	if n != orig {
		t.Errorf("empty env vars mutated config: got %+v want %+v", n, orig)
	}
}
|
||||
|
||||
// TestApplyEnvOverridesBadPort verifies that a non-integer
// QUPTIME_BIND_PORT is rejected with an error rather than being
// silently ignored or coerced.
func TestApplyEnvOverridesBadPort(t *testing.T) {
	t.Setenv(EnvBindPort, "not-an-int")
	n := &NodeConfig{}
	if err := n.ApplyEnvOverrides(); err == nil {
		t.Fatal("expected error for non-integer port")
	}
}
||||
|
||||
// TestLoadNodeConfigEnvOverridesFile verifies the full load path:
// non-empty env values win over what was persisted in node.yaml,
// while fields with no env override keep their file values.
func TestLoadNodeConfigEnvOverridesFile(t *testing.T) {
	t.Setenv("QUPTIME_DIR", t.TempDir())
	// Persist a file with one bind addr; env should win on load.
	n := &NodeConfig{NodeID: "abc", BindAddr: "127.0.0.1", BindPort: 9901, Advertise: "file.example.com:9901"}
	if err := n.Save(); err != nil {
		t.Fatal(err)
	}
	t.Setenv(EnvBindAddr, "0.0.0.0")
	t.Setenv(EnvAdvertise, "env.example.com:9001")
	t.Setenv(EnvBindPort, "9001")

	loaded, err := LoadNodeConfig()
	if err != nil {
		t.Fatal(err)
	}
	if loaded.BindAddr != "0.0.0.0" {
		t.Errorf("BindAddr=%q want 0.0.0.0 (env override)", loaded.BindAddr)
	}
	if loaded.BindPort != 9001 {
		t.Errorf("BindPort=%d want 9001 (env override)", loaded.BindPort)
	}
	if loaded.Advertise != "env.example.com:9001" {
		t.Errorf("Advertise=%q want env.example.com:9001 (env override)", loaded.Advertise)
	}
	// NodeID had no env override set, so the file value must survive.
	if loaded.NodeID != "abc" {
		t.Errorf("NodeID=%q want abc (unchanged)", loaded.NodeID)
	}
}
|
||||
|
||||
Reference in New Issue
Block a user