12 Commits

Author SHA1 Message Date
Axodouble 005be12dd1 Updated the custom message area to be a text area instead for better text editing
Container image / image (push) Successful in 1m36s
Release / release (push) Successful in 1m41s
2026-05-15 06:41:10 +00:00
Axodouble e48da30240 Added documentation and installer support for the github secondary mirror
Container image / image (push) Successful in 1m42s
2026-05-15 05:32:03 +00:00
Axodouble b46c258e4e Added github workflows next to gitea's workflows
Container image / image (push) Successful in 1m43s
Release / release (push) Successful in 1m45s
2026-05-15 05:16:26 +00:00
Axodouble 7bc33b1837 v0.0.1 release
Container image / image (push) Successful in 1m47s
Release / release (push) Successful in 1m46s
2026-05-15 05:03:06 +00:00
Axodouble 7b6acb20eb Added license
Container image / image (push) Successful in 1m33s
2026-05-15 04:53:33 +00:00
Axodouble e11b3f4547 Auto init via environment variables support, qu init for systemd
Container image / image (push) Successful in 1m38s
2026-05-15 04:41:45 +00:00
Axodouble 6953709574 AI assisted documentation
Container image / image (push) Successful in 1m37s
2026-05-15 04:05:30 +00:00
Axodouble 364ba222e2 Got rid of 2 dead functions 2026-05-15 04:05:02 +00:00
Axodouble b029c0a25d Added example compose for a tailscale deployment
Container image / image (push) Successful in 3m36s
Release / release (push) Successful in 4m7s
2026-05-15 02:01:01 +00:00
Axodouble 3453bf5ec7 Updated action to use a pat due to failure otherwise, fixed cache issue
Container image / image (push) Successful in 3m17s
2026-05-15 01:44:39 +00:00
Axodouble acd55d145c Fixed incorrect shell causing a broken substitution
Container image / image (push) Failing after 9m41s
2026-05-15 01:19:21 +00:00
Axodouble ebbbd8c218 Updated when workflows run and fixed issue with the duplicate mount
Container image / image (push) Failing after 10m21s
2026-05-15 01:11:27 +00:00
27 changed files with 3278 additions and 173 deletions
+60 -22
View File
@@ -1,14 +1,18 @@
 name: Container image
-# Builds the multi-arch container image. On tag push (v*) it logs in
-# to the Gitea registry on this host and publishes the image as
-# git.cer.sh/<owner>/<repo>:<version> plus :latest. On pull requests
-# it builds without pushing — purely a smoke test that the Dockerfile
-# still works.
+# Three modes, all driven by the same job:
+#   - Tag push (v*)  → full release: :v1.2.3, :1.2, :latest, :sha-<short>
+#   - Branch push    → canary: :<branch>, :sha-<short>
+#   - Pull request   → smoke test: build only, nothing pushed
+#
+# metadata-action emits the right subset of tags for each event based
+# on the `tags:` rules below — no manual branching needed.
 
 on:
   push:
+    branches:
+      - "**"
     tags:
-      - 'v*'
+      - "v*"
   pull_request:
 
 permissions:
@@ -19,42 +23,74 @@ jobs:
   image:
     runs-on: ubuntu-latest
     # The default `ubuntu-latest` label on aether-runner maps to
-    # `node:16-bullseye`, which has no docker CLI — so the docker/*
-    # actions fail. Override the job container to catthehacker's
-    # act-compatible image (ships docker CLI + buildx) and mount the
-    # host's docker socket through. The runner already has the socket
-    # bind-mounted from the host (see docker.yml gitea-runner volume),
-    # so this exposes that same daemon to the nested job container.
+    # `node:16-bullseye`, which has no docker CLI. Override to an
+    # act-compatible image that ships docker + buildx. The runner
+    # already bind-mounts /var/run/docker.sock into every job
+    # container, so we do NOT add a `volumes:` entry — doing so
+    # produces a duplicate-mount error from the daemon.
     container:
       image: catthehacker/ubuntu:act-latest
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
+    # aether-runner defaults `run:` blocks to POSIX `sh`, which
+    # chokes on bash-isms like ${var,,} (lowercase) and ${var:0:7}
+    # (substring). Pin bash for the whole job.
+    defaults:
+      run:
+        shell: bash
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
+        with:
+          # Skip the GHA-cache lookup for the binfmt image. The Gitea
+          # runner has no GHA cache server, so the action would
+          # otherwise sit in a ~5-minute TCP timeout before falling
+          # back to a direct docker pull. Going straight to pull
+          # cuts QEMU setup from ~5 min to ~15 s.
+          cache-image: false
 
       - name: Set up Buildx
         uses: docker/setup-buildx-action@v3
 
-      # github.repository is owner/name with the repo's original casing;
-      # registries require lowercase, so normalise once here and reuse
-      # the result in metadata-action below.
+      # Registries want lowercase namespaces, and Gitea's container
+      # registry is case-sensitive on the login username too. Lowercase
+      # both repo path and actor once here and reuse below.
       - name: Resolve image name
         id: img
         run: |
           repo='${{ github.repository }}'
+          actor='${{ github.actor }}'
           echo "ref=git.cer.sh/${repo,,}" >> "$GITHUB_OUTPUT"
+          echo "user=${actor,,}" >> "$GITHUB_OUTPUT"
 
+      # Version stamp baked into the binary via -ldflags. Tag pushes
+      # use the tag name directly; everything else gets a short SHA
+      # suffix so `qu version` on a canary build is debuggable.
+      - name: Compute version
+        id: ver
+        run: |
+          if [[ "$GITHUB_REF" == refs/tags/* ]]; then
+            v="${GITHUB_REF_NAME}"
+          else
+            v="${GITHUB_REF_NAME}-${GITHUB_SHA:0:7}"
+          fi
+          echo "version=$v" >> "$GITHUB_OUTPUT"
+
+      # Prefers a user-provided PAT (repo secret REGISTRY_TOKEN with
+      # `write:package` scope) and falls back to the auto-injected
+      # runner token. The auto-token works on Gitea >= 1.21 when the
+      # workflow declares `packages: write` in permissions, but if
+      # the registry still rejects it (older instance, container
+      # registry gated by config, etc.), REGISTRY_TOKEN takes over
+      # without any workflow edits.
       - name: Login to Gitea registry
         if: github.event_name == 'push'
         uses: docker/login-action@v3
         with:
           registry: git.cer.sh
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          username: ${{ steps.img.outputs.user }}
+          password: ${{ secrets.REGISTRY_TOKEN || secrets.GITHUB_TOKEN }}
 
       - name: Docker metadata
         id: meta
@@ -65,18 +101,20 @@ jobs:
             type=semver,pattern={{version}}
             type=semver,pattern={{major}}.{{minor}}
             type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/v') }}
+            type=ref,event=branch
+            type=sha,prefix=sha-,format=short
 
-      - name: Build (and push on tag)
+      - name: Build (and push on push events)
         uses: docker/build-push-action@v6
         with:
           context: .
-          file: ./Dockerfile
+          file: ./docker/Dockerfile
           platforms: linux/amd64,linux/arm64
           push: ${{ github.event_name == 'push' }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           build-args: |
-            VERSION=${{ github.ref_name }}
+            VERSION=${{ steps.ver.outputs.version }}
           # Inline cache embeds layer metadata into the pushed image
           # itself — no external cache server needed, which keeps the
           # workflow self-contained on the Gitea runner.
+72
View File
@@ -0,0 +1,72 @@
name: Container image
# Mirrors .gitea/workflows/container.yaml — publishes a multi-arch
# (amd64 + arm64) image to the GitHub Container Registry whenever the
# Gitea→GitHub mirror pushes a `v*` tag. Image lands at
# ghcr.io/axodouble/quptime with tags :vX.Y.Z, :X.Y, and :latest.
on:
push:
tags:
- 'v*'
permissions:
contents: read
packages: write
jobs:
image:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Buildx
uses: docker/setup-buildx-action@v3
# GHCR namespaces must be lowercase. Lowercase the repository
# path once and reuse below so a mixed-case org/repo (e.g.
# Axodouble/QUptime) still resolves to a valid image reference.
- name: Resolve image name
id: img
run: |
repo='${{ github.repository }}'
echo "ref=ghcr.io/${repo,,}" >> "$GITHUB_OUTPUT"
- name: Compute version
id: ver
run: |
echo "version=${GITHUB_REF_NAME}" >> "$GITHUB_OUTPUT"
- name: Login to GHCR
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ steps.img.outputs.ref }}
tags: |
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=raw,value=latest
- name: Build and push
uses: docker/build-push-action@v6
with:
context: .
file: ./docker/Dockerfile
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
build-args: |
VERSION=${{ steps.ver.outputs.version }}
cache-from: type=gha
cache-to: type=gha,mode=max
+60
View File
@@ -0,0 +1,60 @@
name: Release
# Mirrors .gitea/workflows/release.yaml — fires when the Gitea→GitHub
# mirror pushes a `v*` tag, builds static Linux binaries for amd64 +
# arm64, and publishes them to GitHub Releases alongside the Gitea
# release the same tag produces upstream.
on:
push:
tags:
- 'v*'
permissions:
contents: write
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.24'
check-latest: false
cache: false
- name: Test
run: go test -race ./...
- name: Build binaries
env:
CGO_ENABLED: '0'
run: |
set -euo pipefail
VERSION="${GITHUB_REF_NAME}"
mkdir -p dist
for arch in amd64 arm64; do
out="dist/qu-${VERSION}-linux-${arch}"
echo "building ${out}"
GOOS=linux GOARCH="${arch}" \
go build \
-trimpath \
-ldflags "-s -w -X main.version=${VERSION}" \
-o "${out}" \
./cmd/qu
done
(cd dist && sha256sum qu-* > SHA256SUMS)
ls -lh dist
- name: Publish release
uses: softprops/action-gh-release@v2
with:
files: |
dist/qu-*
dist/SHA256SUMS
fail_on_unmatched_files: true
generate_release_notes: true
token: ${{ secrets.GITHUB_TOKEN }}
+34
View File
@@ -0,0 +1,34 @@
# Build artifacts
/qu
/qu-*
/dist/
*.exe
*.test
*.out
# Go workspace / module cache (only relevant if vendored)
/vendor/
# Local node state — never commit anything that looks like a data dir
/quptime/
/etc/quptime/
node.yaml
cluster.yaml
trust.yaml
keys/
# Compose / secrets
.env
.env.local
*.local.yml
*.local.yaml
# Editor / OS scratch
*.swp
*.swo
*~
.DS_Store
# Test / coverage
coverage.out
coverage.html
+89
View File
@@ -0,0 +1,89 @@
# Changelog
All notable changes to this project are documented here. The format
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [v0.0.1] — 2026-05-15
Initial public release.
### Added
- **Quorum-based uptime monitoring.** Multiple cooperating nodes run
the same probes (HTTP, TCP, ICMP) and vote on the cluster-wide
truth. A check flips state only after two consecutive aggregate
evaluations agree (hysteresis), so single-node flake doesn't page
anyone.
- **Deterministic master election.** Among the live members of the
quorum the lexicographically smallest NodeID wins — no negotiation
step, no split-brain window.
- **mTLS inter-node transport** with TLS 1.3 minimum, SSH-style
fingerprint pinning, and a pre-shared `cluster_secret` gating the
Join RPC.
- **Replicated `cluster.yaml`** carrying peers, checks, and alerts.
Master is the only writer; followers receive monotonic-versioned
snapshots and converge on the latest. Hand-edits to the file on any
node are picked up by the manual-edit watcher and forwarded through
the master.
- **HTTP, TCP, and ICMP probes** with configurable interval,
timeout, expected status, and optional body-substring match. ICMP
defaults to unprivileged UDP-mode pings so the daemon can run as a
non-root user.
- **SMTP and Discord alerts** with optional Go `text/template`
subject/body overrides per alert, default-attach mode (`default:
true`), and per-check opt-outs via `suppress_alert_ids`.
- **Docker-friendly env-var configuration.** Every field in
`node.yaml` can also be supplied via a `QUPTIME_*` environment
variable; `qu serve` auto-initialises a fresh data volume from
these on first start, so `docker compose up` is enough to launch a
node.
- **Interactive TUI** (`qu tui`) for peers, checks, and alerts with
live refresh.
- **Hardened systemd unit** shipped via `install.sh`: dedicated
`quptime` user, `ProtectSystem=strict`, all capabilities dropped by
default.
- **Multi-arch Docker images** (`linux/amd64`, `linux/arm64`)
published to `git.cer.sh/axodouble/quptime` (primary) and
`ghcr.io/axodouble/quptime` (GitHub push-mirror) on every tag.
- **Static Linux binaries** (`amd64`, `arm64`) published per tag with
a `SHA256SUMS` file to both Gitea Releases (primary) and GitHub
Releases (mirror). The official installer prefers Gitea, falls back
to GitHub on failure, and verifies the checksum before placing the
binary on disk.
### Security
- Cluster secret is compared in constant time
(`crypto/subtle.ConstantTimeCompare`).
- Self-signed RSA certs minted at `qu init`; SPKI SHA-256
fingerprints are what's pinned, matching the canonical OpenSSL
representation.
- Private keys are written with mode `0600`; data and runtime
directories with `0700`/`0750`.
- All `cluster.yaml` writes go through an atomic `tmpfile + rename`.
- `install.sh` downloads the published `SHA256SUMS` and refuses to
install if the downloaded binary doesn't match.
### Known limitations
- **Cluster-wide secret distribution.** SMTP passwords and Discord
webhook URLs configured via `qu alert add …` are stored in
`cluster.yaml`, which is replicated to every node. Treat every node
as having read access to every alert credential. Restrict who can
reach the data directory accordingly. See
[docs/security.md](docs/security.md) for the threat model.
- **No automatic key rotation.** Rolling a node's identity means
wiping its data directory, running `qu init` again, and re-adding
it from another node.
- **No historical metrics.** Only the current aggregate state is kept
in memory. There is no built-in graph store, SLA calculator, or
audit log.
- **Master-flap state.** Aggregator hysteresis state lives in
memory on the current master. When leadership changes the new
master starts from `StateUnknown` and re-accumulates hysteresis —
expect a few seconds of delayed alerting after a master switch.
- **No release signing beyond SHA256SUMS** (no cosign / GPG).
Planned for a future release.
[v0.0.1]: https://git.cer.sh/axodouble/quptime/releases/tag/v0.0.1
+21
View File
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 Jasper V.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+55 -4
View File
@@ -14,12 +14,37 @@ trust — no central CA, no shared secret.
 ### From pre-built binary
 
-This can be done in one step, either by downloading the latest release from
-the [Gitea releases page](https://git.cer.sh/axodouble/quptime/releases) or by running the following script:
+The canonical home is Gitea; the repo is push-mirrored to GitHub on
+every tag. Releases and multi-arch container images are published to
+both.
+
+| Source          | Releases                                        | Container image                |
+| --------------- | ----------------------------------------------- | ------------------------------ |
+| Gitea (primary) | <https://git.cer.sh/axodouble/quptime/releases> | `git.cer.sh/axodouble/quptime` |
+| GitHub (mirror) | <https://github.com/Axodouble/QUptime/releases> | `ghcr.io/axodouble/quptime`    |
+
+One-step install — tries Gitea first, falls back to GitHub automatically:
 
 ```sh
 curl -fsSL https://git.cer.sh/Axodouble/QUptime/raw/branch/master/install.sh | sudo bash
+# or, via the GitHub mirror:
+# curl -fsSL https://raw.githubusercontent.com/Axodouble/QUptime/master/install.sh | sudo bash
 ```
 
+The script verifies the binary against the published `SHA256SUMS`
+before installing and refuses to proceed on a mismatch.
+
+### From Docker
+
+```sh
+docker pull git.cer.sh/axodouble/quptime:latest
+# or, via the GitHub mirror:
+# docker pull ghcr.io/axodouble/quptime:latest
+```
+
+See [docs/deployment/docker.md](docs/deployment/docker.md) for compose
+recipes.
+
 ## Why
 
 Most uptime monitors are either a SaaS or a single box that, by
@@ -27,6 +52,23 @@ definition, can't tell you when it's the one that's down. `qu` solves
 both: run it on a few cheap hosts in different networks and they vote
 on truth. If one of them loses its uplink, the rest keep alerting.
 
+## Documentation
+
+This README is the quick-start. For production use, the longer guides
+live under [`docs/`](docs/README.md):
+
+| If you want to…                                        | Read                                                                      |
+| ------------------------------------------------------ | ------------------------------------------------------------------------- |
+| understand the consensus / replication model           | [docs/architecture.md](docs/architecture.md)                              |
+| reference every field in `node.yaml` / `cluster.yaml`  | [docs/configuration.md](docs/configuration.md)                            |
+| deploy on Linux with systemd hardening                 | [docs/deployment/systemd.md](docs/deployment/systemd.md)                  |
+| deploy with Docker / docker-compose                    | [docs/deployment/docker.md](docs/deployment/docker.md)                    |
+| deploy over Tailscale or WireGuard                     | [docs/deployment/tailscale.md](docs/deployment/tailscale.md)              |
+| expose `qu` on the open internet safely                | [docs/deployment/public-internet.md](docs/deployment/public-internet.md)  |
+| upgrade, back up, or recover from failures             | [docs/operations.md](docs/operations.md)                                  |
+| understand the trust model and rotate identities       | [docs/security.md](docs/security.md)                                      |
+| diagnose a misbehaving cluster                         | [docs/troubleshooting.md](docs/troubleshooting.md)                        |
+
 ## Architecture
 
 ```
@@ -71,7 +113,7 @@ go build -o qu ./cmd/qu
 To stamp the version into the binary:
 
 ```sh
-go build -ldflags "-X main.version=v0.1.0" -o qu ./cmd/qu
+go build -ldflags "-X main.version=v0.0.1" -o qu ./cmd/qu
 qu --version
 ```
@@ -83,7 +125,7 @@ amd64 and arm64, and publishes them as a Gitea release with a
 `SHA256SUMS` file alongside.
 
 ```sh
-git tag v0.1.0
+git tag v0.0.1
 git push --tags
 ```
@@ -149,6 +191,15 @@ c0d4... charlie.example.com:9901 true 2026-05-12T15:01:32Z
 ## Adding checks and alerts
 
+> ⚠️ **Alert credentials are replicated cluster-wide.** SMTP passwords
+> and Discord webhook URLs live in `cluster.yaml`, which is mirrored to
+> every node. Any node that can read its own data directory can read
+> every alert secret. Treat compromising one node as compromising every
+> alert credential, and restrict who can reach `$QUPTIME_DIR` on each
+> host (the hardened systemd unit and the Docker image both default to
+> `0700`/`0750`). See [docs/security.md](docs/security.md) for the full
+> threat model.
+
 ```sh
 # alerts first so checks can reference them
 qu alert add discord oncall --webhook https://discord.com/api/webhooks/...
+54
View File
@@ -0,0 +1,54 @@
# An example of a docker compose with Tailscale & QUptime.
# This setup is specifically intended for hosts that may not be able to
# reach each other directly or have a public IP address.
#
# Bring it up with `docker compose -f docker-compose-tailscale.yml up -d`.
# QUptime auto-initialises on first start using the QUPTIME_* env vars
# below — no separate `qu init` step is required.
#
# On the first node, omit QUPTIME_CLUSTER_SECRET to have one generated
# for you. Read it out of the logs (`docker logs quptime`) and copy it
# into the .env of every other node before bringing them up.
services:
tailscale:
image: tailscale/tailscale:latest
container_name: tailscale
cap_add:
- NET_ADMIN
environment:
- TS_AUTHKEY=${TAILSCALE_AUTHKEY} # Set this in your .env file with a Tailscale auth key
- TS_HOSTNAME=quptime-tailscale
volumes:
- /dev/net/tun:/dev/net/tun
- tailscale:/var/lib/tailscale
restart: unless-stopped
quptime:
image: git.cer.sh/axodouble/quptime:latest
container_name: quptime
environment:
# host:port other QUptime nodes use to reach this one. Use the
# Tailscale IP / MagicDNS name of this host. Required behind NAT.
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE}
# Shared cluster join secret. Set on every node. Leave unset on
# the very first node — one will be generated and logged for you
# to copy to the others. Followers MUST set this before starting.
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
# Optional: pin a port other than the default 9901.
# - QUPTIME_BIND_PORT=9901
volumes:
- quptime:/etc/quptime
ports:
- "9901:9901"
depends_on:
- tailscale
network_mode: "service:tailscale" # Use the Tailscale network stack
restart: unless-stopped
# After this node is up, add peers from the master with:
# docker compose -f docker-compose-tailscale.yml exec quptime \
# qu node add <OTHER_NODE_TAILSCALE_IP>:9901
volumes:
tailscale:
quptime:
+53
View File
@@ -0,0 +1,53 @@
# QUptime documentation
Production-oriented documentation for `qu`, a small distributed uptime
monitor that votes on the health of HTTP/TCP/ICMP targets across a
cluster of cooperating nodes.
The top-level `README.md` is the marketing pitch and quick-start. The
pages here go deeper and are organised by what you're trying to do.
## Getting set up
- [Installation](installation.md) — pre-built binaries, building from
source, verifying release artifacts, what the install script does.
- [Configuration](configuration.md) — `node.yaml`, `cluster.yaml`,
`trust.yaml`, environment variables, file layout, defaults.
## Running it
- [Architecture](architecture.md) — how nodes form quorum, how a master
is elected, how cluster state replicates, what happens during a
partition, and exactly which guarantees the design gives you.
- [Operations](operations.md) — day-2 tasks: upgrades, backups,
recovery from a lost node, recovery from a lost quorum, monitoring
`qu` itself.
- [Security](security.md) — the mTLS / TOFU trust model, what the
cluster secret protects, how to rotate keys, what to put on a public
network and what not to.
- [Troubleshooting](troubleshooting.md) — common failure modes with
the log lines you'll see and the fix.
## Deployment recipes
Pick the one that matches your environment. They share most of the
operational guidance — what differs is how `qu` is packaged and how
the inter-node link is secured at the network layer.
- [systemd on bare metal / VM](deployment/systemd.md) — single static
binary, hardened unit file, `CAP_NET_RAW` for ICMP.
- [Docker / docker-compose](deployment/docker.md) — official image,
single-node and multi-node compose files, persistent volumes.
- [Tailscale / WireGuard overlay](deployment/tailscale.md) — nodes in
separate networks with no public ingress; cluster traffic stays on
the tailnet.
- [Public-internet exposure](deployment/public-internet.md) — when
you have no overlay and `:9901` is reachable from the open
internet: firewalling, rate-limiting, secret hygiene.
## A note on stability
The wire protocol (`internal/transport`) and the on-disk format
(`cluster.yaml`, `node.yaml`, `trust.yaml`) are considered stable
within a minor version. Breaking changes will bump the major version
and ship with a migration note.
+196
View File
@@ -0,0 +1,196 @@
# Architecture
This page is the long-form companion to the diagram in the top-level
README. Read it if you need to reason about partitions, recovery,
upgrade ordering, or the consistency guarantees of `qu`.
## Components
A running `qu serve` is one process containing five long-lived
goroutines plus the listeners:
| Component | Package | Role |
| --------------- | ------------------------ | ------------------------------------------------------------------------ |
| Transport | `internal/transport` | mTLS listener + dialer, length-prefixed JSON-RPC framing. |
| Quorum manager | `internal/quorum` | 1 Hz heartbeats, liveness tracking, deterministic master election. |
| Replicator | `internal/replicate` | Master-routed mutations, version-gated broadcast and pull. |
| Scheduler | `internal/checks` | One goroutine per check; runs HTTP/TCP/ICMP probes on each node. |
| Aggregator | `internal/checks` | Master-only. Folds per-node probe results into a cluster-wide verdict. |
| Alert dispatch | `internal/alerts` | Master-only. Renders templates and ships SMTP / Discord notifications. |
| Control socket | `internal/daemon` | Local-only unix socket; the CLI and TUI talk to the daemon through it. |
Every node runs every component. Whether the master-only ones actually
*do* anything depends on the result of master election.
## Trust and transport
Inter-node traffic is TLS 1.3 with mutual authentication. There is **no
central CA**. Each node generates a self-signed RSA cert at `qu init`
and the SPKI fingerprint of that cert is what other nodes pin against.
Two layers gate access:
1. **TLS layer** accepts any client cert. This avoids a chicken-and-egg problem
during bootstrap — a brand-new node has no entry in anyone's trust
store yet, so a strict TLS check would refuse the very first
handshake.
2. **RPC dispatcher** rejects every method except `Join` for callers
whose presented fingerprint is not in `trust.yaml`. So an untrusted
peer can knock on the door but cannot ask questions.
`Join` itself is gated by the **cluster secret** — a pre-shared base64
string generated at `qu init` on the first node. Without it, an
attacker who can reach `:9901` cannot enrol themselves into the
cluster.
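As a point of illustration, this is the shape of a constant-time secret check in Go using `crypto/subtle`; the function name and signature here are hypothetical, not the actual Join handler.

```go
package main

import (
	"crypto/subtle"
	"fmt"
)

// checkJoinSecret is a hypothetical sketch of the constant-time
// comparison the Join handler is described as using: the comparison
// takes the same time whether the first or the last byte differs, so
// response timing leaks nothing about the secret's contents.
func checkJoinSecret(presented, expected string) bool {
	return subtle.ConstantTimeCompare([]byte(presented), []byte(expected)) == 1
}

func main() {
	fmt.Println(checkJoinSecret("wrong-secret", "4hZqK8vT9...")) // false
}
```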
The local CLI talks to the daemon over a unix socket with `0600`
permissions; filesystem ACLs are the only authentication and no TLS is
used on that channel.
## The replicated state machine
`cluster.yaml` is the single replicated source of truth. It holds three
editable lists — `peers`, `checks`, `alerts` — plus three
server-controlled fields:
```yaml
version: 7 # monotonically increasing
updated_at: 2026-05-15T...
updated_by: <node-id> # master that committed this version
peers: [...]
checks: [...]
alerts: [...]
```
### How mutations flow
1. The CLI (or the manual-edit watcher; see below) issues a mutation
on the local daemon's control socket.
2. The daemon's replicator looks at the current quorum view:
- If there is no quorum, the mutation fails loudly with
`no quorum: refusing mutation`.
- If this node is the master, apply locally and broadcast.
- Otherwise, ship the mutation to the master via the
`ProposeMutation` RPC and wait for the result.
3. The master holds the cluster lock, applies the mutation, bumps
`version`, writes `cluster.yaml` atomically, and broadcasts the new
snapshot to every peer via `ApplyClusterCfg`.
4. Each follower's `Replace` accepts the snapshot **only if**
`incoming.Version > local.Version`. Older or equal versions are
dropped silently.
The mutation kinds are enumerated in `internal/transport/messages.go`:
`add_check`, `remove_check`, `add_alert`, `remove_alert`, `add_peer`,
`remove_peer`, `replace_config`.
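A rough Go sketch of the follower-side version gate (step 4) and the atomic tmpfile + rename write (step 3); the function and its signature are illustrative, not the real `internal/replicate` code.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// replaceClusterCfg applies an incoming snapshot only if it is strictly
// newer, then persists it with the tmpfile + rename pattern so readers
// never observe a half-written cluster.yaml.
func replaceClusterCfg(path string, localVersion, incomingVersion int, incoming []byte) (bool, error) {
	if incomingVersion <= localVersion {
		return false, nil // older or equal snapshot: dropped silently
	}
	tmp := filepath.Join(filepath.Dir(path), ".cluster.yaml.tmp")
	if err := os.WriteFile(tmp, incoming, 0o600); err != nil {
		return false, err
	}
	// rename is atomic on POSIX filesystems: readers see either the old
	// file or the complete new one, never a partial write.
	if err := os.Rename(tmp, path); err != nil {
		return false, err
	}
	return true, nil
}

func main() {
	applied, err := replaceClusterCfg("cluster.yaml", 6, 7, []byte("version: 7\n"))
	fmt.Println(applied, err)
}
```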
### Manual edits to `cluster.yaml`
Operators can `sudoedit /etc/quptime/cluster.yaml` on any node. Every
2 seconds the daemon hashes the file. When the on-disk hash diverges
from the last hash the daemon wrote, the new content is parsed and
forwarded to the master as a `replace_config` mutation. So a hand-edit
on a follower still ends up on the master, version-bumped, and
broadcast everywhere.
If the parse fails (invalid YAML), the daemon logs and pins the bad
hash so it doesn't loop. The operator's next valid save unblocks it.
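A minimal Go sketch of that poll-and-compare loop, assuming a 2-second tick and a SHA-256 content hash; the names are illustrative, not the real watcher code.

```go
package main

import (
	"crypto/sha256"
	"fmt"
	"os"
	"time"
)

// watchClusterFile polls the file every 2 seconds and calls onEdit
// whenever the on-disk content diverges from the last hash it saw
// (initially the hash of the daemon's own last write).
func watchClusterFile(path string, lastWritten [32]byte, onEdit func([]byte)) {
	seen := lastWritten
	for range time.Tick(2 * time.Second) {
		data, err := os.ReadFile(path)
		if err != nil {
			continue // transient read error; try again next tick
		}
		h := sha256.Sum256(data)
		if h != seen {
			seen = h // pin the new hash so a bad parse doesn't loop
			onEdit(data)
		}
	}
}

func main() {
	go watchClusterFile("cluster.yaml", [32]byte{}, func(b []byte) {
		fmt.Printf("manual edit detected (%d bytes); forward as replace_config\n", len(b))
	})
	time.Sleep(5 * time.Second)
}
```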
## Quorum and master election
Every node sends a heartbeat to every peer once per second. A peer is
**live** if a heartbeat (sent or received) was observed within the
last 4 seconds — comfortably more than three missed beats so a one-tick
blip does not unseat the master.
**Quorum** is met when `len(live_peers) >= floor(N/2) + 1` where `N`
is the total peer count in `cluster.yaml`. Below quorum, the cluster
refuses every mutation; existing checks continue probing locally but no
state transitions are committed (the master is the only one who
aggregates, and there is no master).
**Master election** is deterministic with no negotiation step: among
the live members, the master is the one with the lexicographically
smallest `NodeID`. Every node that observes the same live set picks the
same master — so there is no split-brain window even during a partial
partition.
The `term` integer in `qu status` is bumped every time the elected
master changes (including transitions to and from "no master"). Use it
to spot flappy clusters.
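A compact Go sketch of the two rules above, the quorum threshold and the smallest-NodeID election; the names are illustrative, not the real `internal/quorum` API.

```go
package main

import (
	"fmt"
	"sort"
)

// hasQuorum implements len(live_peers) >= floor(N/2) + 1.
func hasQuorum(liveCount, totalPeers int) bool {
	return liveCount >= totalPeers/2+1
}

// electMaster returns the lexicographically smallest NodeID among the
// live peers, or "" when there is no quorum (and therefore no master).
func electMaster(live []string, totalPeers int) string {
	if !hasQuorum(len(live), totalPeers) {
		return ""
	}
	sort.Strings(live)
	return live[0]
}

func main() {
	live := []string{"c0d4-charlie", "7f3a-alpha"}
	fmt.Println(hasQuorum(len(live), 3)) // true: 2 >= floor(3/2)+1
	fmt.Println(electMaster(live, 3))    // "7f3a-alpha", smallest NodeID wins
}
```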
## Catch-up when a node reconnects
This is the scenario most people ask about: node C is offline, the
master commits config version 7, node C comes back online. What
happens?
1. Node C's tick loop fires heartbeats every second regardless of its
previous state. There is no backoff, no give-up.
2. Each heartbeat carries the sender's `Version`. Each response carries
the responder's `Version`.
3. The first time C sees a peer reporting a higher version than its
own, the version-observer fires and calls
`replicator.PullFrom(peerID, addr)`.
4. `PullFrom` does a `GetClusterCfg` RPC against that peer and feeds
the snapshot through `Replace`, which writes `cluster.yaml`
atomically and refreshes the on-disk hash so the manual-edit
watcher doesn't re-fire.
5. Within ~1 heartbeat C is byte-for-byte identical to the master.
The same path catches a stale node up when the partition heals on the
minority side: the minority side cannot mutate, so when it rejoins it
strictly has the older version, and the pull fires.
There is one corner case worth knowing about: the pull only fires when
`peer_version > local_version`. Two nodes at the same version with
different content would silently diverge — but the design forbids
that (only the master mutates, and the master is the only one bumping
the version) unless somebody hand-edits `cluster.yaml` and also
manually sets `version:`. Don't do that.
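Sketched in Go, the trigger in step 3 of the walkthrough is just a version comparison on every heartbeat; `puller` stands in for `replicator.PullFrom` and the other names are illustrative.

```go
package main

import "fmt"

// puller stands in for the replicator's pull entry point.
type puller func(peerID, addr string) error

// onHeartbeat fires a pull exactly when a peer advertises a strictly
// higher cluster.yaml version than the one held locally.
func onHeartbeat(localVersion, peerVersion int, peerID, addr string, pull puller) {
	if peerVersion > localVersion {
		if err := pull(peerID, addr); err != nil {
			fmt.Println("pull failed, will retry on a later heartbeat:", err)
		}
	}
}

func main() {
	onHeartbeat(5, 7, "c0d4-charlie", "charlie.example.com:9901",
		func(peerID, addr string) error {
			fmt.Printf("GetClusterCfg from %s (%s), then Replace locally\n", peerID, addr)
			return nil
		})
}
```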
## Why a check flips state
The aggregator runs on the master only. Followers' probe results are
shipped to the master via the `ReportResult` RPC; the master's own
probe results are submitted directly.
For each check, the aggregator keeps the latest result per node within
a freshness window (3× the check interval, minimum 30s). On each
incoming submission it counts OK vs not-OK across the fresh results:
- 0 fresh reports → `unknown`
- more OK than not-OK → `up`
- more not-OK than OK → `down`
- tie → `up` (the smallest possible tie is one node saying yes and one
saying no; biasing toward `up` avoids false alerts when nodes disagree
transiently).
A state flip is **not** committed immediately. Hysteresis requires the
candidate state to hold for **two consecutive aggregate evaluations**
before the state transition fires and the alert dispatcher is called.
Set in `internal/checks/aggregator.go` as the `HysteresisCount`
constant — change it there if you want a hair-trigger or a slower
alert.
If the master changes, the new master starts the per-check state from
`unknown` and rebuilds it as fresh results arrive. The first few
seconds after a re-election can therefore show `unknown` even for
checks that were `up` a moment ago.
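To make the voting and two-evaluation hysteresis concrete, here is a minimal Go sketch; the type and function names are illustrative and do not match the real `internal/checks` code.

```go
package main

import "fmt"

// aggregate folds fresh per-node results into a candidate verdict.
func aggregate(freshOK, freshNotOK int) string {
	switch {
	case freshOK+freshNotOK == 0:
		return "unknown"
	case freshOK >= freshNotOK: // ties bias toward "up"
		return "up"
	default:
		return "down"
	}
}

type hysteresis struct {
	committed string // last committed state
	candidate string // state waiting to be confirmed
	streak    int    // consecutive evaluations agreeing with candidate
}

// observe commits a flip only after the candidate state has held for
// two consecutive aggregate evaluations.
func (h *hysteresis) observe(state string) (string, bool) {
	if state != h.candidate {
		h.candidate, h.streak = state, 0
	}
	h.streak++
	if h.streak >= 2 && state != h.committed {
		h.committed = state
		return state, true
	}
	return h.committed, false
}

func main() {
	h := &hysteresis{committed: "up", candidate: "up", streak: 2}
	for _, eval := range []string{aggregate(1, 2), aggregate(0, 3)} {
		if s, flipped := h.observe(eval); flipped {
			fmt.Println("state flip committed:", s) // fires on the second "down"
		}
	}
}
```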
## What `qu` does *not* do
These omissions are intentional in v1 and useful to know up front:
- **No persistent history.** Only the current aggregate state lives in
memory. There are no graphs, no SLA reports. Add a sidecar (Prometheus
exporter, SQLite logger) if you need them.
- **No automatic key rotation.** Re-init a node and re-trust if you
need to roll its identity. See [security.md](security.md).
- **No multi-tenant isolation.** One cluster = one set of checks =
one alert tree.
- **No web UI.** Operator surface is `qu` (CLI), `qu tui`, and direct
edits to `cluster.yaml`.
- **No automatic peer eviction on prolonged downtime.** A dead peer
stays in `cluster.yaml` until an operator runs `qu node remove`,
because that decision affects the quorum size and shouldn't happen
silently.
+318
View File
@@ -0,0 +1,318 @@
# Configuration
This page is the canonical reference for the on-disk files, the
environment variables, and every field that `qu` reads. It's
deliberately tedious — when something doesn't behave the way you
expect, this is where the answer lives.
## File layout
When running as **root** (the typical case under systemd):
```
/etc/quptime/
├── node.yaml identity, never replicated
├── cluster.yaml replicated state
├── trust.yaml local fingerprint trust store
└── keys/
├── private.pem RSA private key (0600)
├── public.pem RSA public key
└── cert.pem self-signed X.509 cert
/var/run/quptime/quptime.sock control socket (0600)
```
When running as a **non-root** user (the typical case for `go run` or a
desktop test):
```
~/.config/quptime/... same shape as /etc/quptime
$XDG_RUNTIME_DIR/quptime/quptime.sock control socket
```
Override the data directory with `QUPTIME_DIR=/some/path qu serve`.
Override the socket path with `QUPTIME_SOCKET=/run/foo.sock`.
## Environment variables
### Paths
| Variable | Purpose |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `QUPTIME_DIR` | Data directory. Defaults to `/etc/quptime` (root) or `$XDG_CONFIG_HOME/quptime`. |
| `QUPTIME_SOCKET` | Path to the CLI ↔ daemon unix socket. Defaults to `/var/run/quptime/quptime.sock` (root) or `$XDG_RUNTIME_DIR/quptime/…`. |
| `XDG_CONFIG_HOME` | Honored when running as non-root and `QUPTIME_DIR` is unset. |
| `XDG_RUNTIME_DIR` | Honored when running as non-root and `QUPTIME_SOCKET` is unset. |
### `node.yaml` field overrides
Every field in `node.yaml` can also be supplied via an environment
variable. This is the recommended way to drive Docker / Compose
deployments: drop the env vars into the compose file and the daemon
will bootstrap on first start without a separate `qu init` step.
| Variable | `node.yaml` field | Notes |
| ------------------------ | ----------------- | -------------------------------------------------------------------------------------------------------------- |
| `QUPTIME_NODE_ID` | `node_id` | Pin a specific UUID. Leave unset to let `qu init` / auto-init generate one. |
| `QUPTIME_BIND_ADDR` | `bind_addr` | Defaults to `0.0.0.0`. |
| `QUPTIME_BIND_PORT` | `bind_port` | Integer. Defaults to `9901`. |
| `QUPTIME_ADVERTISE` | `advertise` | `host:port` other peers use to reach this node. Required when bound to a wildcard or behind NAT. |
| `QUPTIME_CLUSTER_SECRET` | `cluster_secret` | Pre-shared join secret. Set the same value on every node. If unset on the very first node, one is generated. |
Precedence is **env > file > compiled default**. Non-empty env values
win over whatever is stored in `node.yaml` at load time, so changing a
variable in `docker-compose.yml` and restarting the container is
enough to roll out new bind/advertise values — no on-disk edit
required. Empty env values are ignored (they will not clear a
previously persisted field).
For `qu init` specifically, explicit command-line flags take
precedence over env values; env values fill in only the fields the
operator did not pass on the command line.
The daemon does not read any other environment variables. SMTP, Discord,
and HTTP probe targets are configured exclusively in `cluster.yaml`.
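A small Go sketch of that precedence rule for a single string-valued field; it is illustrative, not the real loader.

```go
package main

import (
	"fmt"
	"os"
)

// resolve applies env > file > compiled default: a non-empty
// environment value wins, an empty one is ignored rather than
// clearing the persisted field.
func resolve(envKey, fileValue, compiledDefault string) string {
	if v := os.Getenv(envKey); v != "" {
		return v
	}
	if fileValue != "" {
		return fileValue
	}
	return compiledDefault
}

func main() {
	os.Setenv("QUPTIME_BIND_PORT", "9911")
	fmt.Println(resolve("QUPTIME_BIND_PORT", "9901", "9901"))               // env wins: 9911
	fmt.Println(resolve("QUPTIME_ADVERTISE", "alpha.example.com:9901", "")) // file value
}
```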
## Auto-init on `qu serve`
If `node.yaml` does not exist when `qu serve` starts, the daemon
bootstraps it in-place using the `QUPTIME_*` env vars above: a fresh
UUID is generated (or `QUPTIME_NODE_ID` is honored if set), an RSA
keypair and self-signed cert are written under `keys/`, and
`cluster.yaml` is seeded with this node as its sole peer. If no
`QUPTIME_CLUSTER_SECRET` was provided, a random one is generated and
printed to stderr — copy it to every follower node's
`QUPTIME_CLUSTER_SECRET` (or `--secret` flag) before they start.
This is what makes the docker-compose flow `docker compose up`-only
on a fresh volume. To opt out (e.g. so a misconfigured deployment
crashes loudly instead of silently generating a new identity), run
`qu init` against the volume yourself before letting `qu serve` ever
see it.
## `node.yaml` — local identity
Never replicated. One file per host. Generated by `qu init`.
```yaml
node_id: 7f3a5b9e-... # UUIDv4, immutable after init
bind_addr: 0.0.0.0 # listen address for :9901
bind_port: 9901 # listen port
advertise: alpha.example.com:9901 # how peers reach us; may differ from bind
cluster_secret: 4hZqK8vT9... # base64; required to Join, never replicated
```
### Field reference
- `node_id` — UUIDv4 generated at `qu init`. Used by every peer to
refer to this node across IP changes and restarts. Do not edit.
- `bind_addr` — Address the daemon listens on. `0.0.0.0` is the
default. Set to `127.0.0.1` if you only want to expose the daemon
through an overlay (Tailscale, WireGuard) — see
[deployment/tailscale.md](deployment/tailscale.md).
- `bind_port` — Defaults to `9901`. Change here if 9901 is taken; the
cluster does not require port-uniformity, peers just need to know
what to dial via the `advertise` field.
- `advertise` — Host:port other nodes use to reach this one. Must be
routable from every peer. Falls back to `bind_addr:bind_port` if
unset, which is rarely what you want behind NAT.
- `cluster_secret` — Pre-shared base64 string. Required on every
`Join` RPC; constant-time comparison on the receiver. Generate on
the first node, distribute out-of-band, keep out of version
control.
### How `qu init` populates this file
```sh
qu init \
--advertise alpha.example.com:9901 \
--bind 0.0.0.0 \
--port 9901 \
--secret '<paste from first node, or omit on the first node>'
```
Idempotent in one direction only: if `node.yaml` exists, `qu init`
refuses to overwrite. To re-init, delete the data directory entirely.
## `cluster.yaml` — replicated state
This is the file that every node converges on. The master is the only
one allowed to bump `version`; followers `Replace` it whole each time
they receive a higher-versioned snapshot.
```yaml
version: 12
updated_at: 2026-05-15T14:01:00Z
updated_by: 7f3a5b9e-...
peers:
- node_id: 7f3a5b9e-...
advertise: alpha.example.com:9901
fingerprint: SHA256:abcd...
cert_pem: |
-----BEGIN CERTIFICATE-----
...
-----END CERTIFICATE-----
checks:
- id: 0006a1...
name: homepage
type: http
target: https://example.com
interval: 30s
timeout: 10s
expect_status: 200
alert_ids: [oncall]
suppress_alert_ids: []
alerts:
- id: f001ab...
name: oncall
type: discord
default: true
discord_webhook: https://discord.com/api/webhooks/...
body_template: |
:rotating_light: {{.Check.Name}} is {{.Verb}}
```
### Top-level fields
| Field | Owner | Notes |
| ------------ | -------- | ---------------------------------------------------------------------------------- |
| `version` | master | Monotonic. Followers reject snapshots whose version is ≤ their local. |
| `updated_at` | master | UTC RFC3339. Cosmetic — humans use it, no logic depends on it. |
| `updated_by` | master | NodeID of the committing master. |
| `peers` | editable | Cluster members. Edits go through `add_peer` / `remove_peer` mutations. |
| `checks` | editable | Monitored targets. |
| `alerts` | editable | Notifier destinations. |
### `peers[]`
```yaml
- node_id: 7f3a5b9e-... # immutable, the peer's own UUID
advertise: host:port # how anyone dials this peer
fingerprint: SHA256:... # SPKI fingerprint of the peer's cert
cert_pem: | # full PEM so other peers can mTLS without a separate invite
-----BEGIN CERTIFICATE-----
...
```
The `cert_pem` field is what enables N-node clusters without N×(N-1)
manual invites: when peer X is added via the master, every other node
that receives the new `cluster.yaml` learns X's cert at the same time
and adds it to the local trust store. See
`internal/daemon/daemon.go:syncTrustFromCluster`.
### `checks[]`
```yaml
- id: 0006a1... # UUIDv4, generated when the check is created
name: homepage # human-friendly, must be unique within cluster
type: http # http | tcp | icmp
target: https://example.com
interval: 30s # Go duration syntax: 5s, 1m30s, 2h
timeout: 10s # default 10s
expect_status: 200 # http only; 0 = accept anything < 400
body_match: "OK" # http only; substring match on response body
alert_ids: [oncall] # alerts attached explicitly
suppress_alert_ids: [] # opt out of specific default alerts
```
Defaults:
- `interval`: 30s
- `timeout`: 10s
- `expect_status`: 0 → any 2xx is OK; otherwise the configured status
must match exactly.
ICMP checks default to **unprivileged UDP-mode pings** so the daemon
does not need root. For raw ICMP, grant the capability — see
[deployment/systemd.md](deployment/systemd.md).
### `alerts[]`
Two notifier kinds, distinguished by `type`:
```yaml
# Discord
- id: f001ab...
name: oncall
type: discord
default: true # attach to every check automatically
discord_webhook: https://...
body_template: | # optional Go text/template override
{{.Check.Name}} is {{.Verb}}
# SMTP
- id: f002cd...
name: ops
type: smtp
smtp_host: smtp.example.com
smtp_port: 587
smtp_user: mailbot
smtp_password: '...'
smtp_from: monitor@example.com
smtp_to: [ops@example.com]
smtp_starttls: true
subject_template: '[{{.Verb}}] {{.Check.Name}}'
body_template: |
Check {{.Check.Name}} ({{.Check.Target}}) is now {{.Verb}}.
```
If `default: true`, the alert fires for every check unless the check
lists the alert's ID or name in `suppress_alert_ids`. Otherwise the
alert only fires for checks that name it in `alert_ids`.
Templates are Go `text/template`. The full variable list is in the
top-level README under "Custom alert messages" — `qu alert add smtp
--help` and `qu alert add discord --help` print the same table.
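For reference, this is how a `body_template` renders with the standard library's `text/template`; the data struct here is illustrative, the real field set is the one documented in the README table mentioned above.

```go
package main

import (
	"os"
	"text/template"
)

// templateData mirrors the shape used in the examples above
// ({{.Check.Name}}, {{.Check.Target}}, {{.Verb}}); it is not the real
// internal/alerts type.
type templateData struct {
	Check struct {
		Name   string
		Target string
	}
	Verb string
}

func main() {
	tmpl := template.Must(template.New("body").Parse(
		"Check {{.Check.Name}} ({{.Check.Target}}) is now {{.Verb}}.\n"))
	var d templateData
	d.Check.Name, d.Check.Target, d.Verb = "homepage", "https://example.com", "down"
	_ = tmpl.Execute(os.Stdout, d) // Check homepage (https://example.com) is now down.
}
```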
### Suppression precedence
For each check, the dispatcher computes the effective alert list as:
```
( explicit alert_ids ∪ alerts with default=true ) \ suppress_alert_ids
```
de-duplicated by alert ID. So a check can both opt in to specific
alerts and opt out of specific defaults.
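A short Go sketch of that set arithmetic; the types are illustrative, not the real dispatcher code.

```go
package main

import "fmt"

type alert struct {
	ID      string
	Default bool
}

// effectiveAlerts computes (explicit ∪ defaults) minus suppressed,
// de-duplicated by alert ID.
func effectiveAlerts(all []alert, explicitIDs, suppressIDs []string) []string {
	suppressed := map[string]bool{}
	for _, id := range suppressIDs {
		suppressed[id] = true
	}
	want := map[string]bool{}
	for _, id := range explicitIDs {
		want[id] = true
	}
	for _, a := range all {
		if a.Default {
			want[a.ID] = true
		}
	}
	var out []string
	for id := range want {
		if !suppressed[id] {
			out = append(out, id)
		}
	}
	return out
}

func main() {
	all := []alert{{ID: "oncall", Default: true}, {ID: "ops", Default: false}}
	fmt.Println(effectiveAlerts(all, []string{"ops"}, []string{"oncall"})) // [ops]
}
```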
## `trust.yaml` — local trust store
A flat list of fingerprints this node accepts. One entry per peer,
populated by `qu node add` (or pulled in automatically when a peer's
cert arrives via the replicated `cluster.yaml`).
```yaml
entries:
- node_id: 7f3a5b9e-...
address: alpha.example.com:9901
fingerprint: SHA256:...
cert_pem: |
-----BEGIN CERTIFICATE-----
...
```
Never edit this by hand. Use `qu trust list` and `qu trust remove`.
## Key material
`keys/private.pem` is the only secret on disk besides
`node.yaml.cluster_secret`. It's chmod 0600 by default; preserve that.
The public cert at `keys/cert.pem` is what gets fingerprinted and
shipped in `cluster.yaml.peers[].cert_pem`.
There is **no automatic key rotation**. Rolling a node's identity
means wiping its data directory, running `qu init` again, and
re-adding it from another node as a fresh peer.
## Tunables that don't live in YAML
A few values are compiled constants. Change them in source and rebuild
if you need different behaviour.
| Constant | Default | What it does |
| ----------------------------------------------------- | ------- | ------------------------------------------------------------- |
| `quorum.DefaultHeartbeatInterval` | `1s` | How often each node heartbeats every peer. |
| `quorum.DefaultDeadAfter` | `4s` | A peer is dead if no heartbeat is seen within this window. |
| `checks.HysteresisCount` | `2` | Consecutive aggregate evaluations needed before a state flip. |
| `checks.ReconcileInterval` | `5s` | How often the scheduler reconciles its workers vs `checks[]`. |
| `daemon.manualEditPollInterval` (`internal/daemon/watcher.go`) | `2s` | How often the daemon hashes `cluster.yaml` for hand edits. |
+243
View File
@@ -0,0 +1,243 @@
# Deployment: Docker / docker-compose
The published image is a 14 MB distroless static container with the
`qu` binary as the entrypoint. It runs as root by default so the
daemon can bind privileged ports and open ICMP sockets; override with
`--user` if your host doesn't need that.
## Image references
The same multi-arch (amd64 + arm64) image is published to two
registries. **The Gitea registry is the canonical source** — it also
publishes canary `:master` builds on every branch push. GHCR is a
tag-only push-mirror for users who can't reach `git.cer.sh`.
Primary — Gitea registry:
```
git.cer.sh/axodouble/quptime:master # tip of main, multi-arch
git.cer.sh/axodouble/quptime:latest # latest tagged release
git.cer.sh/axodouble/quptime:v0.0.1 # specific tagged release
git.cer.sh/axodouble/quptime:latest-amd64 # single-arch (if you must pin)
```
Fallback — GitHub Container Registry:
```
ghcr.io/axodouble/quptime:latest # latest tagged release
ghcr.io/axodouble/quptime:v0.0.1 # specific tagged release
ghcr.io/axodouble/quptime:0.0 # latest patch in the 0.0 minor line
```
The image embeds `QUPTIME_DIR=/etc/quptime` and declares it a volume —
treat it as the only piece of state worth persisting.
## Single-node, single-container compose
For a development cluster or a single-node smoke test:
```yaml
# compose.yaml
services:
quptime:
image: git.cer.sh/axodouble/quptime:latest
container_name: quptime
restart: unless-stopped
environment:
# host:port other nodes use to reach this one. Must be reachable
# from every peer — the loopback inside the container is useless.
- QUPTIME_ADVERTISE=<host-ip>:9901
# Pre-shared join secret. Omit on the very first node and read
# the generated value out of `docker logs quptime`, then set
# this env var on every follower before bringing them up.
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
ports:
- "9901:9901"
volumes:
- quptime-data:/etc/quptime
# ICMP UDP-mode pings need a permissive sysctl on the host:
# sysctl net.ipv4.ping_group_range="0 2147483647"
# Or grant CAP_NET_RAW (more accurate, raw ICMP).
cap_add:
- NET_RAW
volumes:
quptime-data:
```
`qu serve` auto-initialises the data volume on first start using the
`QUPTIME_*` env vars (see [configuration.md](../configuration.md) for
the full list). One command brings everything up:
```sh
docker compose up -d
docker compose exec quptime qu status
```
On the very first node, capture the auto-generated cluster secret:
```sh
docker compose logs quptime | grep -A1 'cluster secret'
```
Copy that value into the `QUPTIME_CLUSTER_SECRET` env var of every
follower before starting them, otherwise their join RPCs will be
rejected. The full list of accepted env vars lives in
[configuration.md](../configuration.md#nodeyaml-field-overrides).
## Three-node compose on a single host
For local testing of the full quorum machinery without three machines:
```yaml
# compose.yaml
x-quptime: &quptime
image: git.cer.sh/axodouble/quptime:latest
restart: unless-stopped
cap_add:
- NET_RAW
services:
alpha:
<<: *quptime
container_name: alpha
environment:
- QUPTIME_ADVERTISE=alpha:9901
# First node: leave secret unset and read it from `docker logs`.
ports: ["9901:9901"]
volumes: ["alpha-data:/etc/quptime"]
bravo:
<<: *quptime
container_name: bravo
environment:
- QUPTIME_ADVERTISE=bravo:9901
- QUPTIME_CLUSTER_SECRET=${SECRET}
ports: ["9902:9901"]
volumes: ["bravo-data:/etc/quptime"]
charlie:
<<: *quptime
container_name: charlie
environment:
- QUPTIME_ADVERTISE=charlie:9901
- QUPTIME_CLUSTER_SECRET=${SECRET}
ports: ["9903:9901"]
volumes: ["charlie-data:/etc/quptime"]
volumes:
alpha-data:
bravo-data:
charlie-data:
```
Bootstrap:
```sh
# 1. Start alpha first to mint the cluster secret.
docker compose up -d alpha
# 2. Read the secret off alpha's stdout.
export SECRET=$(docker compose logs alpha | awk '/cluster secret/{getline; print $1}')
# 3. Bring up the followers — they pick up the secret from $SECRET.
docker compose up -d bravo charlie
# Invite from alpha. The hostnames resolve over the compose network.
docker compose exec alpha qu node add bravo:9901
sleep 3 # wait for heartbeats before the next add
docker compose exec alpha qu node add charlie:9901
docker compose exec alpha qu status
```
For a cluster on three separate hosts, replicate the compose file on
each box with different `advertise` addresses (the public hostname or
the overlay IP) and bootstrap the same way.
## Multi-host compose
The natural unit is one compose file per host, each running one
`qu` container. The minimum-viable file per host:
```yaml
# /etc/qu-stack/compose.yaml
services:
quptime:
image: git.cer.sh/axodouble/quptime:latest
container_name: quptime
restart: unless-stopped
environment:
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE} # host:9901 reachable from peers
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET}
ports:
- "9901:9901"
volumes:
- /srv/quptime/data:/etc/quptime
cap_add:
- NET_RAW
```
Put the per-host values (`QUPTIME_ADVERTISE`, `QUPTIME_CLUSTER_SECRET`)
in a sibling `.env` file or a config-management secret so the compose
file itself is identical across hosts.
Persistence is a bind-mount under `/srv/quptime/data` so backups and
upgrades hit a known path. See [operations.md](../operations.md) for
the backup recipe.
Inter-host traffic on TCP/9901 must be reachable. If the boxes don't
share a private network, prefer the
[Tailscale recipe](tailscale.md) over exposing 9901 directly — see
[public-internet.md](public-internet.md) for the threat model if you
must expose it.
## Behind a reverse proxy
**Don't.** `qu` is mTLS-pinned at the application layer, so a TLS-
terminating proxy would force the daemon to trust whatever cert the
proxy presents — defeating fingerprint pinning. If you need a single
public address per node, use a Layer 4 TCP proxy (`nginx stream`,
HAProxy `mode tcp`, or a plain firewall NAT) that forwards bytes
without touching them.
## Image internals
Build locally if you want to inspect what you're running:
```sh
docker buildx build \
--build-arg VERSION=$(git describe --tags --always) \
--platform linux/amd64,linux/arm64 \
--file docker/Dockerfile \
--tag quptime:dev \
--load \
.
```
The Dockerfile (see `docker/Dockerfile`) is two stages: a `golang:1.24-alpine`
builder that cross-compiles with `-trimpath -ldflags "-s -w"`, and a
`gcr.io/distroless/static-debian12` runtime. No shell, no package
manager, no SSH; you cannot `docker exec -it sh` into it. Use
`docker exec quptime qu ...` for everything.
## Healthcheck
The container exits non-zero if the daemon crashes, so the default
`restart: unless-stopped` policy is enough for liveness. A more
useful readiness check invokes the `qu` binary inside the container as the healthcheck command:
```yaml
healthcheck:
test: ["CMD", "/usr/local/bin/qu", "status"]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s
```
`qu status` exits 0 when the daemon socket is reachable and the
control RPC succeeds — it does **not** fail on quorum loss. That's
intentional: restarting a quorum-less node won't bring quorum back,
and a healthcheck that flaps a follower in and out of `unhealthy`
state every time the master is briefly unreachable is worse than no
check. If you want a stricter readiness signal, pipe `qu status`
through `grep -q 'quorum true'`.
+180
View File
@@ -0,0 +1,180 @@
# Deployment: public-internet exposure
If your nodes do not share a private network and you can't put an
overlay between them (see [tailscale.md](tailscale.md)), this is the
recipe for exposing TCP/9901 directly to the open internet without
losing sleep.
The short version: `qu` is designed for this — every inbound call is
mTLS-pinned at the application layer and gated by the cluster secret
— but defence in depth is cheap and you should take it.
## Threat model in one paragraph
Anyone on the internet can establish a TLS connection to `:9901`
because the daemon must accept handshakes from currently-untrusted
peers (otherwise no node could ever join). The RPC dispatcher then
rejects every method except `Join` for callers whose fingerprint
isn't in `trust.yaml`. `Join` itself is gated by the **cluster
secret**, compared in constant time. So the realistic attack surface
is:
1. The TLS 1.3 stack accepting handshakes from arbitrary peers.
2. The `Join` handler's secret check and downstream cert ingestion.
3. The blast radius of a leaked cluster secret (an attacker who has
it can enrol themselves as a peer and propose mutations, which is
game over).
What can't trivially happen:
- A random attacker observing or modifying cluster traffic — TLS 1.3
with fingerprint pinning sees to that.
- A random attacker calling any method other than `Join` — the RPC
dispatcher refuses.
What you should still do:
- Treat `node.yaml.cluster_secret` like an SSH host key. Out-of-band
distribution only. Never in git, never in CI logs, never in chat.
- Rate-limit and IP-allowlist where you can. The `Join` handler does
not currently rate-limit at the application layer, so a determined
attacker could try secrets at TLS-handshake rate.
- Run on a non-default port if your operations workflow allows it.
Doesn't add security, but reduces background internet noise in the
logs and makes IDS / WAF rules cleaner.
## Firewall
### nftables (recommended)
A drop-in `/etc/nftables.d/quptime.nft`:
```nft
table inet filter {
set quptime_peers {
type ipv4_addr
elements = { 198.51.100.10, 198.51.100.11, 198.51.100.12 }
}
chain quptime_input {
# Drop everything that didn't come from a known peer.
ip saddr @quptime_peers tcp dport 9901 accept
tcp dport 9901 log prefix "quptime-drop: " level info drop
}
chain input {
type filter hook input priority 0; policy drop;
ct state established,related accept
iif lo accept
jump quptime_input
# ... your other rules
}
}
```
The allowlist is the highest-ROI mitigation by far — if you maintain
fixed IPs for your monitor nodes, use this and move on.
### ufw
```sh
sudo ufw allow from 198.51.100.10 to any port 9901 proto tcp
sudo ufw allow from 198.51.100.11 to any port 9901 proto tcp
sudo ufw allow from 198.51.100.12 to any port 9901 proto tcp
```
### Dynamic peer IPs
If peer IPs aren't fixed (e.g., one node is on a home connection with
a rotating address), you have three options ranked by preference:
1. Use an overlay instead — see [tailscale.md](tailscale.md). This is
the right answer.
2. DNS-based allowlisting (`ipset`-from-DNS or a small reconciler that
re-resolves an allowlist hostname every minute). Beware: a
compromised DNS resolver becomes a compromise of the allowlist.
3. Drop the allowlist and rely solely on the cluster secret + mTLS.
This is what `qu` is designed to survive; just be sure the secret
actually has the entropy `qu init` generated for it (32 random
bytes, base64-encoded).
## Rate-limiting failed handshakes
`qu` does not currently rate-limit `Join` attempts at the application
layer. You can do it at the firewall, which catches both connect
floods and slow brute-force:
```nft
table inet filter {
chain quptime_input {
tcp dport 9901 ct state new \
meter quptime_ratemeter { ip saddr limit rate over 10/second } \
log prefix "quptime-rate: " drop
tcp dport 9901 accept
}
}
```
Or `fail2ban` with a tiny custom filter that watches `journalctl -u
quptime` for repeated `peer rejected join` lines:
```ini
# /etc/fail2ban/filter.d/quptime.conf
[Definition]
failregex = ^.*quptime:.*peer rejected join.*from <ADDR>.*$
```
```ini
# /etc/fail2ban/jail.d/quptime.local
[quptime]
enabled = true
filter = quptime
backend = systemd
journalmatch = _SYSTEMD_UNIT=quptime.service
maxretry = 3
findtime = 600
bantime = 86400
```
Note: the daemon doesn't currently log the *peer address* on rejected
joins. The log filter above is illustrative; check what your version
actually emits before relying on it.
## Secret hygiene
The single most important thing on a public-internet deployment:
- **Generate the secret on the first node.** `qu init` with no
`--secret` produces 32 random bytes from `crypto/rand`, base64-
encoded. Don't replace that with something memorable.
- **Transport out of band.** Paste it into your secret manager
immediately; share via 1Password / Vault / encrypted email.
- **Rotate if anyone with access has left.** Rotation isn't a CLI
command; do it the brute-force way: `qu init` a fresh cluster on
new ports, re-add every check via `cluster.yaml` export, swap DNS.
- **One secret per cluster.** Do not reuse the secret across staging
and prod, or across customers if you run several clusters.
## Non-default ports
```sh
# Each node, in node.yaml — or pass --port on init.
qu init --advertise alpha.example.com:51234 --port 51234
```
Open the corresponding firewall rule, restart the daemon. The
cluster doesn't require uniform ports across nodes; each peer's
`advertise` field tells everyone else what to dial.
## What you should monitor on a public deployment
- `term` from `qu status` — if it's ticking up frequently the master
is flapping, which probably means at least one peer's network is
unstable. Could be benign, could be a probe attempt.
- The firewall drop counter on the `quptime-drop` rule above (a cron
  sketch for reading it follows this list).
- The number of TLS handshakes on `:9901`. A spike in handshakes that
don't progress to a successful RPC is the signature of a brute-force
on the cluster secret.
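A cron sketch for the drop counter, assuming you add a `counter`
statement to the `quptime-drop` rule (e.g. `... log prefix
"quptime-drop: " counter drop`) so there is a number to read; the
threshold and alert endpoint are placeholders:
```sh
#!/bin/sh
# Sum the packet counters in the quptime_input chain and page on a spike.
# Note: nft counters are cumulative; a real deployment would diff against the previous run.
drops=$(nft list chain inet filter quptime_input \
  | grep -o 'counter packets [0-9]*' | awk '{s += $3} END {print s + 0}')
if [ "${drops:-0}" -gt 1000 ]; then
  curl -fsS -X POST -d "quptime firewall drops: $drops on $(hostname)" \
    https://alert.example.com/oncall
fi
```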
For the operational side — backups, upgrades, recovery — see
[operations.md](../operations.md).
+250
View File
@@ -0,0 +1,250 @@
# Deployment: systemd on bare metal / VM
The canonical way to run `qu` on a Linux host. Single static binary,
managed by systemd, with a hardened unit file. Most production users
should start here.
## Audience and assumptions
- You have root (or `sudo`) on the host.
- You have at least three hosts that can reach each other on TCP/9901.
(Three is the minimum for a useful quorum; fewer is fine for
development but a 2-node cluster offers no consensus protection.)
- The hosts have a way to address each other; a direct IP or a
  resolvable hostname is fine. For overlay networks see
[tailscale.md](tailscale.md).
## Install the binary
See [installation.md](../installation.md). The official `install.sh`
script writes a *minimal* unit file that's fine for development. For
production replace it with the hardened version below.
## Create a dedicated user
Running as a dedicated unprivileged user is best practice, but ICMP
support adds a wrinkle — see the next section.
```sh
sudo useradd --system --no-create-home --shell /usr/sbin/nologin quptime
sudo install -d -o quptime -g quptime -m 0750 /etc/quptime
sudo install -d -o quptime -g quptime -m 0750 /var/run/quptime
```
## ICMP capabilities
ICMP probes have two implementations:
1. **Unprivileged UDP pings** — Linux's `dgram` ICMP socket. Works on
any modern kernel without elevated privileges, but only if
`net.ipv4.ping_group_range` includes the daemon's GID. This is the
default in `qu`.
2. **Raw ICMP** — requires `CAP_NET_RAW`; gives more accurate latency
   numbers and works for IPv6 even where unprivileged pings aren't
   available.
The simplest path: stick with unprivileged pings and widen
`ping_group_range`. Sysctl, persistent across reboots:
```sh
# /etc/sysctl.d/10-quptime.conf
net.ipv4.ping_group_range = 0 2147483647
```
```sh
sudo sysctl --system
```
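A quick check that the range now spans the daemon's group:
```sh
sysctl net.ipv4.ping_group_range   # the printed range should include the GID below
id -g quptime
```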
If you need raw ICMP instead, grant the capability on the binary:
```sh
sudo setcap cap_net_raw=+ep /usr/local/bin/qu
```
Note that replacing the binary on upgrade clears the file capability —
bake the `setcap` call into your deploy script, or re-run it after
each upgrade.
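After an upgrade you can confirm the capability survived:
```sh
getcap /usr/local/bin/qu
# expect cap_net_raw in the output (exact formatting varies by libcap version)
```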
## Hardened unit file
Drop this in `/etc/systemd/system/quptime.service`:
```ini
[Unit]
Description=QUptime distributed uptime monitor
Documentation=https://git.cer.sh/axodouble/quptime
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
ExecStart=/usr/local/bin/qu serve
Restart=always
RestartSec=5s
User=quptime
Group=quptime
# Where state lives. RuntimeDirectory creates /var/run/quptime/ each
# boot owned by User:Group with mode 0750.
Environment=QUPTIME_DIR=/etc/quptime
RuntimeDirectory=quptime
RuntimeDirectoryMode=0750
ReadWritePaths=/etc/quptime /var/run/quptime
# Hardening. Comment out individual directives if a probe needs
# something we've revoked.
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
PrivateTmp=true
PrivateDevices=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
ProtectClock=true
ProtectHostname=true
RestrictNamespaces=true
RestrictRealtime=true
RestrictSUIDSGID=true
LockPersonality=true
MemoryDenyWriteExecute=true
# Network access is required (we're a network monitor). Keep address
# families minimal — AF_NETLINK is needed for some libc lookups.
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK
# If you need raw ICMP, *also* uncomment:
# AmbientCapabilities=CAP_NET_RAW
# CapabilityBoundingSet=CAP_NET_RAW
# Otherwise drop all capabilities:
CapabilityBoundingSet=
[Install]
WantedBy=multi-user.target
```
Reload systemd and enable:
```sh
sudo systemctl daemon-reload
sudo systemctl enable quptime.service
```
## Initialise the node
**Don't start the service yet.** `qu init` must run first, and it
must run as the `quptime` user so it creates files with the right
ownership.
On the **first** host (it will print a secret; copy it):
```sh
sudo -u quptime QUPTIME_DIR=/etc/quptime \
qu init --advertise alpha.example.com:9901
```
On every **other** host (paste the secret):
```sh
sudo -u quptime QUPTIME_DIR=/etc/quptime \
qu init --advertise bravo.example.com:9901 --secret '<paste>'
sudo -u quptime QUPTIME_DIR=/etc/quptime \
qu init --advertise charlie.example.com:9901 --secret '<paste>'
```
## Open the firewall
`qu` needs TCP/9901 reachable between cluster members. Adjust to your
firewall:
```sh
# ufw
sudo ufw allow from <peer-ip> to any port 9901 proto tcp
# firewalld
sudo firewall-cmd --permanent --zone=internal \
--add-rich-rule='rule family=ipv4 source address=<peer-ip> port port=9901 protocol=tcp accept'
sudo firewall-cmd --reload
# nftables (drop-in)
table inet filter {
chain input {
ip saddr { 10.0.0.10, 10.0.0.11, 10.0.0.12 } tcp dport 9901 accept
}
}
```
For exposing 9901 to the open internet see
[public-internet.md](public-internet.md).
## Start the daemon
```sh
sudo systemctl start quptime
sudo systemctl status quptime
journalctl -u quptime -f
```
## Invite peers
From one node (typically `alpha`):
```sh
sudo -u quptime qu node add bravo.example.com:9901
# Pause a few seconds so heartbeats reach the new peer before the next add —
# otherwise the "needs ≥2 live to mutate" check rejects the second invite.
sudo -u quptime qu node add charlie.example.com:9901
```
`qu node add` prints each remote's fingerprint and asks for SSH-style
confirmation. Verify the fingerprint over an out-of-band channel (the
remote operator can show theirs with `sudo -u quptime qu status` or by
reading `trust.yaml`).
## Verify
```sh
sudo -u quptime qu status
```
Expect to see all three peers `live=true` and one of them as
`master`.
## Log scraping
`journalctl -u quptime` is the canonical log stream. Notable lines:
| Pattern | Meaning |
| ------------------------------------------------------------- | --------------------------------------------------------- |
| `listening on ... as node ...` | Daemon up. |
| `manual-edit: cluster.yaml changed externally — replicating…` | An operator edited `cluster.yaml` directly. |
| `manual-edit: parse cluster.yaml: ...` | Invalid YAML on disk; the operator must fix and re-save. |
| `report to master ...: <err>` | A follower couldn't ship a probe result to the master. |
| `replicate: pull from ...: <err>` | A follower couldn't pull a higher-version config snapshot. |
## Sample reload / restart drill
After editing the unit file:
```sh
sudo systemctl daemon-reload
sudo systemctl restart quptime
```
After editing `cluster.yaml` by hand:
```sh
sudoedit /etc/quptime/cluster.yaml
# No restart needed — the watcher picks it up within 2s and pushes to master.
```
After upgrading the binary:
```sh
sudo install -m 0755 qu-new /usr/local/bin/qu
sudo setcap cap_net_raw=+ep /usr/local/bin/qu # if you use raw ICMP
sudo systemctl restart quptime
```
Doing rolling upgrades? See [operations.md](../operations.md).
+188
View File
@@ -0,0 +1,188 @@
# Deployment: Tailscale / WireGuard overlay
When your nodes live in different networks — different VPS providers,
different physical sites, a mix of home and cloud — exposing TCP/9901
to the open internet is a poor idea. An overlay network gives every
node a stable private IP regardless of NAT, and `qu` only needs to
listen on that overlay address.
This page focuses on Tailscale because the repo ships an example
compose for it, but everything generalises to WireGuard, Nebula, or a
self-hosted Headscale.
## The big idea
```
+--- host A (VPS, no public ICMP) ----+
| tailscale ←→ overlay ip 100.64.1.1 |
| qu listening on 100.64.1.1:9901 |
+-------------------------------------+
│ mTLS over overlay
+--- host B (homelab behind NAT) -----+
| tailscale ←→ overlay ip 100.64.1.2 |
| qu listening on 100.64.1.2:9901 |
+-------------------------------------+
```
`bind_addr` is set to the tailscale IP, the host's public interface
has no port 9901 open, and the cluster secret + mTLS handshake gate
the link inside the tunnel.
## Compose recipe
The repo ships [`docker/docker-compose-tailscale.yml`](../../docker/docker-compose-tailscale.yml).
The relevant trick is `network_mode: "service:tailscale"` — the
`quptime` container shares the network namespace of the `tailscale`
sidecar so it sees the tailnet as its own interface.
```yaml
services:
tailscale:
image: tailscale/tailscale:latest
container_name: tailscale
cap_add: [NET_ADMIN]
environment:
- TS_AUTHKEY=${TAILSCALE_AUTHKEY} # provision via .env
- TS_HOSTNAME=quptime-${HOST} # name visible in admin
volumes:
- /dev/net/tun:/dev/net/tun
- tailscale:/var/lib/tailscale
restart: unless-stopped
quptime:
image: git.cer.sh/axodouble/quptime:latest
container_name: quptime
environment:
# host:port other QUptime nodes use to reach this one. Should be
# this node's tailnet IP / MagicDNS name. Auto-init reads this on
# first start.
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE}
# Shared cluster join secret. Omit on the very first node to have
# it generated and logged for you, then copy it into every
# follower's .env.
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
volumes:
- quptime:/etc/quptime
network_mode: "service:tailscale"
depends_on: [tailscale]
cap_add: [NET_RAW]
restart: unless-stopped
volumes:
tailscale:
quptime:
```
### One-time bootstrap
Each host runs the same compose file with a per-host `.env`:
```sh
# .env (alpha — the first node)
HOST=alpha
TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx
QUPTIME_ADVERTISE=100.64.1.1:9901 # this node's tailnet IP
# QUPTIME_CLUSTER_SECRET left unset — will be generated on first boot.
```
Start the stack on the first host. `qu serve` auto-initialises the
volume using the env vars above, so a single `docker compose up`
brings everything up:
```sh
docker compose up -d
docker compose logs quptime | grep -A1 'cluster secret'
# Copy the secret straight into your password manager.
```
On every **other** host, write the same `.env` plus the captured
secret:
```sh
# .env (bravo, charlie, …)
HOST=bravo
TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx
QUPTIME_ADVERTISE=100.64.1.2:9901
QUPTIME_CLUSTER_SECRET=<paste from alpha>
```
Bring them up and invite them from the first node:
```sh
docker compose up -d
# From alpha
docker compose exec quptime qu node add 100.64.1.2:9901
sleep 3
docker compose exec quptime qu node add 100.64.1.3:9901
docker compose exec quptime qu status
```
## Tailscale ACLs
Belt and braces — even though mTLS pins identities, lock down the
tailnet itself so only the `qu` nodes can reach each other's :9901.
In the Tailscale admin console:
```jsonc
{
"tagOwners": { "tag:qu-node": ["group:ops"] },
"acls": [
{
"action": "accept",
"src": ["tag:qu-node"],
"dst": ["tag:qu-node:9901"]
}
// ...your other rules
]
}
```
Then tag every `qu` node in its auth key:
```yaml
environment:
- TS_AUTHKEY=${TAILSCALE_AUTHKEY}?ephemeral=false&tags=tag:qu-node
```
## WireGuard / Nebula / Headscale equivalents
The recipe generalises:
1. Provision the overlay interface on each host with a stable
private IP (the tunnel's own address).
2. `qu init --advertise <overlay-ip>:9901`.
3. Set `bind_addr: <overlay-ip>` in `node.yaml` so the daemon does
**not** also listen on the public interface.
4. Open `:9901` only on the overlay interface in your firewall — for
nftables that's something like `iifname "wg0" tcp dport 9901
accept`.
The cluster secret and mTLS fingerprints still apply; the overlay just
removes the open-internet attack surface.
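For example, a WireGuard host whose `wg0` address is 10.8.0.2 ends up
with `node.yaml` entries along these lines (the address is a
placeholder; the file also holds `node_id` and `cluster_secret`, which
you leave alone):
```yaml
# /etc/quptime/node.yaml (excerpt)
bind_addr: 10.8.0.2        # listen only on the overlay interface
advertise: 10.8.0.2:9901   # what the other peers dial, reachable over the overlay
```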
## Why prefer overlay over public exposure
- An extra, independent layer at the network level: an attacker who
  finds an exploit in your overlay client (rare; Tailscale and
  WireGuard are small attack surfaces) still hits the
  application-layer pinning before any cluster-level operation.
- The cluster secret can be lower-entropy when it's already
unreachable from outside. (You should still treat it as a real
secret; "defence in depth" only works if every layer is real.)
- ICMP probes from a homelab to a target on the public internet are
trivial through NAT, but ICMP *into* a homelab usually isn't.
Running `qu` on a tailnet means peers can heartbeat each other
regardless of NAT direction.
## Trade-offs
- One more thing to monitor. If your tailnet is down, your monitor is
down. Counter-measure: run *another* tiny `qu` cluster (or a single
node) on the public internet that watches the overlay's coordinator
health.
- Probe latency includes the overlay's extra hop. Tailscale's WireGuard
  data path is fast (<1 ms LAN, single-digit ms WAN) so this rarely
  matters, but if you're alerting on tight latency thresholds, account
  for it.
+136
View File
@@ -0,0 +1,136 @@
# Installation
`qu` ships as a single static Linux binary. Pick whichever method
matches how you manage software on the host.
> Choosing a deployment recipe instead? Jump to
> [systemd](deployment/systemd.md), [Docker](deployment/docker.md),
> [Tailscale](deployment/tailscale.md), or
> [public-internet](deployment/public-internet.md).
## Pre-built binary (recommended)
Every tag triggers identical builds on both sources, so either one
serves the same artefact set. Gitea is the canonical home; GitHub is a
push-mirror.
Primary — Gitea releases:
<https://git.cer.sh/axodouble/quptime/releases>
Fallback — GitHub releases (mirrored from the same tag):
<https://github.com/Axodouble/QUptime/releases>
Each release ships `qu-${TAG}-linux-amd64`, `qu-${TAG}-linux-arm64`,
and a `SHA256SUMS` file.
```sh
# Always pin to a tag — `latest` resolves on the server side.
TAG=v0.0.1
ARCH=amd64 # or arm64
# Primary: Gitea
curl -fSL -o qu \
"https://git.cer.sh/axodouble/quptime/releases/download/${TAG}/qu-${TAG}-linux-${ARCH}"
curl -fSL -o SHA256SUMS \
"https://git.cer.sh/axodouble/quptime/releases/download/${TAG}/SHA256SUMS"
# (or the GitHub mirror — substitute the host below if Gitea is unreachable)
# https://github.com/Axodouble/QUptime/releases/download/${TAG}/qu-${TAG}-linux-${ARCH}
# https://github.com/Axodouble/QUptime/releases/download/${TAG}/SHA256SUMS
# Verify before installing. Use the SHA256SUMS from the SAME source
# as the binary — never mix.
sha256sum --check --ignore-missing SHA256SUMS
install -m 0755 qu /usr/local/bin/qu
```
## One-line install script
The repo ships an `install.sh` that handles the download, checksum,
shell-completion installation, and a hardened systemd unit. Run it
under `sudo` so it can write to `/usr/local/bin` and
`/etc/systemd/system`.
```sh
curl -fsSL https://git.cer.sh/Axodouble/QUptime/raw/branch/master/install.sh | sudo bash
# or, via the GitHub mirror:
# curl -fsSL https://raw.githubusercontent.com/Axodouble/QUptime/master/install.sh | sudo bash
```
What it does:
1. Looks up the latest release via the Gitea API; falls back to the
GitHub API if Gitea is unreachable.
2. Downloads the per-arch binary and the matching `SHA256SUMS` from
the same source, then verifies the checksum. Refuses to install on
a mismatch.
3. Installs bash / zsh / fish completion if a target directory exists.
4. Creates a dedicated `quptime` system user and writes
`/etc/systemd/system/quptime.service` (hardened — matches the unit
in [systemd.md](deployment/systemd.md)). Enables but does not start
the service, so you can configure identity before first boot.
## Build from source
Requires Go 1.24.2 or newer.
```sh
# Either remote — Gitea is canonical, GitHub is a push-mirror.
git clone https://git.cer.sh/axodouble/quptime.git
# git clone https://github.com/Axodouble/QUptime.git
cd quptime
go build -ldflags "-X main.version=$(git describe --tags --always)" -o qu ./cmd/qu
./qu --version
```
Static binary, no cgo. Go only disables cgo by default when no C
toolchain is present; to guarantee a fully static binary, set it
explicitly:
```sh
CGO_ENABLED=0 go build -trimpath -ldflags "-s -w" -o qu ./cmd/qu
```
## Docker image
The same multi-arch (`amd64` + `arm64`) image is published to two
registries on every tag. The Gitea registry is the canonical source
and also gets canary `:master` builds; GHCR is a tag-only mirror.
Primary — Gitea registry:
```
git.cer.sh/axodouble/quptime:master # tip of master (canary)
git.cer.sh/axodouble/quptime:latest # latest tagged release
git.cer.sh/axodouble/quptime:v0.0.1 # pinned release
```
Fallback — GitHub Container Registry:
```
ghcr.io/axodouble/quptime:latest # latest tagged release
ghcr.io/axodouble/quptime:v0.0.1 # pinned release
ghcr.io/axodouble/quptime:0.0 # latest 0.0.x
```
See the [Docker deployment guide](deployment/docker.md) for compose
files and volume layout.
## Verifying the install
```sh
qu --version
qu --help
```
If completions installed, `qu <tab>` will list subcommands. After
`qu init` you can run `qu status` to confirm the daemon is reachable
over its control socket.
## Next steps
- [Configure the node and the cluster](configuration.md).
- Pick a deployment recipe under [docs/deployment/](deployment/).
- Walk through the [architecture](architecture.md) so the operational
guarantees are clear before you commit to a topology.
+225
View File
@@ -0,0 +1,225 @@
# Operations
Day-2 tasks: keeping `qu` healthy, upgrading without dropping checks,
backing up state, recovering from failures. Pair this with
[troubleshooting.md](troubleshooting.md) for "the cluster is on fire,
what now" specifics.
## Upgrades
### Rolling upgrade (zero alert loss)
`qu` is built to tolerate one node being absent at a time as long as
quorum still holds. The simple recipe for a 3-node cluster:
```sh
# On each node in turn:
sudo systemctl stop quptime
sudo install -m 0755 qu-new /usr/local/bin/qu
sudo setcap cap_net_raw=+ep /usr/local/bin/qu # if you use raw ICMP
sudo systemctl start quptime
# Wait for the node to rejoin before moving on:
sudo -u quptime qu status # should show quorum true, all peers live
```
The first node you upgrade may briefly be a follower with a *higher*
binary version than the master. That's fine as long as no on-disk
format changes; the wire protocol and `cluster.yaml` schema are
stable within a minor version, so minor / patch upgrades freely
interleave.
For major-version upgrades that change the on-disk format, the release
notes will spell out the migration. As of v0 there have been none.
### Downgrades
A node that downgrades to an older binary will refuse to start if
`cluster.yaml` contains fields the older version doesn't know. To
roll back across a schema change, either:
- Take the cluster offline and downgrade all nodes simultaneously.
- Restore a `cluster.yaml` from before the schema change on every node
before starting the downgraded binary.
Within a single minor version, downgrade is symmetrical with upgrade.
### What can go wrong
- **Restarting two nodes at once in a 3-node cluster** loses quorum.
No mutations succeed, no alerts fire. Quorum returns the moment
the second node is back.
- **A node that has been offline for a long time** comes back with a
stale `cluster.yaml`. It will pull the master's higher version
within ~1 heartbeat. Don't pre-emptively delete its `cluster.yaml`
— let the catch-up path handle it.
## Backups
Three files matter, in descending order of "pain if lost":
| File | Why back it up |
| ---------------------- | -------------------------------------------------------------------- |
| `node.yaml` | Holds the cluster secret. Lose it and the node can't rejoin. |
| `keys/private.pem` | Lose it and you must `qu init` a fresh identity and re-trust. |
| `cluster.yaml` | Resyncs from any other live peer, so per-node backup is optional. |
### Per-host backup
```sh
#!/bin/sh
# /etc/cron.daily/quptime-backup (must be executable)
set -eu
dst=/var/backups/quptime/$(date +%Y%m%d)
mkdir -p "$dst"
cp -a /etc/quptime/node.yaml "$dst/"
cp -a /etc/quptime/keys "$dst/keys"
cp -a /etc/quptime/cluster.yaml "$dst/cluster.yaml"
chmod -R go-rwx "$dst"
```
### Cluster-wide backup
The cluster state (`peers`, `checks`, `alerts`) is identical across
every node. Back up one healthy node's `cluster.yaml` and you have
the canonical copy. To restore:
```sh
# Stop the daemon.
sudo systemctl stop quptime
# Drop in the backup. Reset the version to 0 so the running cluster's
# higher version supersedes whatever you're holding — otherwise this
# node will broadcast a stale snapshot and confuse everyone.
sudo cp backup-cluster.yaml /etc/quptime/cluster.yaml
sudo sed -i 's/^version:.*/version: 0/' /etc/quptime/cluster.yaml
sudo systemctl start quptime
# Within seconds the version-observer pulls the live version from a peer.
```
If you're restoring **the entire cluster** (every node lost), the
"reset version to 0" trick doesn't apply — there's no peer with a
higher version. Pick the highest-version backup, restore that file
across every node verbatim, and start the daemons. The cluster will
elect a master and continue.
## Replacing a dead node
A node has died permanently. You want to add a fresh box with the
same role.
1. On a surviving node, evict the dead one:
```sh
sudo -u quptime qu node remove <dead-node-id>
```
   This drops it from `cluster.yaml` and removes its trust entry. The
   peer set shrinks by one, which also lowers the quorum threshold;
   verify quorum still holds among the survivors.
2. On the new host, install `qu` and `qu init` against the existing
cluster secret:
```sh
sudo -u quptime qu init \
--advertise delta.example.com:9901 \
--secret '<existing cluster secret>'
sudo systemctl start quptime
```
3. From a surviving node, invite the new one:
```sh
sudo -u quptime qu node add delta.example.com:9901
```
The dead node's checks and alerts are unaffected — they live in the
replicated `cluster.yaml`, not the dead node's identity.
## Recovering from lost quorum
You've lost more than half the cluster simultaneously. The remaining
nodes refuse to mutate (correct behaviour: they have no way to know
whether the missing nodes are dead or partitioned).
Options:
- **Bring the missing nodes back.** Always the right first move if it's
possible. The cluster recovers automatically once enough nodes are
live.
- **Shrink the cluster.** If you've genuinely lost the missing nodes
permanently and can't bring them back, you need to manually edit
`cluster.yaml` on every surviving node to remove the dead peers,
then restart. Be very deliberate:
```sh
# On each surviving node:
sudo systemctl stop quptime
sudoedit /etc/quptime/cluster.yaml # delete the dead peers[] entries
# bump version to something higher
sudo systemctl start quptime
```
Make sure every surviving node has identical `cluster.yaml` content
before restarting any of them. If they don't, you'll get conflicting
views of who's in the cluster and elections will flap.
- **Start over.** For small clusters this is often faster than the
manual surgery above: `rm -rf /etc/quptime` everywhere, then
bootstrap from scratch. You'll lose your checks and alerts unless
you saved a copy of `cluster.yaml` elsewhere.
## Monitoring `qu` itself
`qu` watches your services. Who watches `qu`?
### From within the cluster
`qu status` is the single source of truth. The fields to watch:
| Field | Healthy | Suspicious |
| -------------- | -------------- | --------------------------------------------------------- |
| `quorum` | `true` | `false` — no mutations, no alerts. |
| `master` | a NodeID | `(none — ...)` — quorum lost or election in flight. |
| `term` | slow growth | rapid growth → master flapping, network unstable. |
| `config ver` | identical across nodes | divergence → a node is stuck pulling. |
A simple cron sentinel on each node (a crontab entry must stay on one
line; backslash continuations are not supported there):
```sh
*/5 * * * * /usr/local/bin/qu status >/dev/null 2>&1 || curl -fsSL -X POST -d "qu down on $(hostname)" https://alert.example.com/oncall
```
### From outside the cluster
`qu` does not currently expose a Prometheus / OpenMetrics endpoint.
The recommended pattern is to run a *separate* tiny monitoring path
that doesn't depend on `qu` — even a single connection check against
each node's :9901 catches process death (the port is TLS-only, and the
listener keeps answering handshakes even when the scheduler is wedged,
so this proves the process is up, not that it's healthy).
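A sketch of such an external check; it only tests that the listener
accepts a TCP connection (an unauthenticated client can't complete a
full mTLS request anyway), and the hostname and alert endpoint are
placeholders:
```sh
#!/bin/sh
# Page if nothing is accepting connections on the inter-node port.
if ! nc -z -w 5 alpha.example.com 9901; then
  curl -fsS -X POST -d "quptime listener down on alpha.example.com" \
    https://alert.example.com/oncall
fi
```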
To produce structured metrics, write a sidecar that parses `qu status`
output and exports counters. The CLI emits stable, machine-grep-able
output specifically so this is straightforward.
## Operational checklist before you go to bed
After standing up a new cluster, work through:
- [ ] All nodes show `quorum true` in `qu status`.
- [ ] All nodes show identical `config ver`.
- [ ] All nodes show the same `master`.
- [ ] `journalctl -u quptime --since "10 min ago"` has no
`propose to master:` or `replicate: pull from:` errors.
- [ ] `qu alert test <name>` reaches your inbox / Discord channel for
every configured alert.
- [ ] At least one check has an intentional failure (a bogus target)
that you flip back and forth to verify the full state-transition
→ dispatch path end-to-end.
- [ ] Backups of `node.yaml` + `keys/` + `cluster.yaml` are landing in
your backup destination.
- [ ] Firewall allow-list (if any) lists every peer's IP.
- [ ] You've stored the cluster secret somewhere that survives the
first operator leaving.
+153
View File
@@ -0,0 +1,153 @@
# Security
The trust model in one page. Read this before deciding where to put
`qu` and who can talk to it.
## What `qu` is trying to defend against
- **Eavesdropping on cluster traffic.** Defended: TLS 1.3 only,
fingerprint-pinned per peer.
- **MITM on the cluster's inter-node link.** Defended: TLS 1.3 with
out-of-band fingerprint verification at `qu node add`.
- **A random internet host enrolling itself as a peer.** Defended:
pre-shared cluster secret on every `Join`.
- **A compromised peer issuing forged cluster-config mutations.** Not
defended. A peer trusted enough to be in `cluster.yaml.peers` can
propose mutations through the master. Treat membership as a
privilege.
- **A compromised peer becoming master.** Election is deterministic on
the smallest live `NodeID`, so a compromised peer can become master
if its `NodeID` sorts first. The master can rewrite `cluster.yaml`
arbitrarily. This is the worst-case blast radius from one compromised
node.
- **DoS by handshake flood.** Not directly defended at the application
layer. The TLS stack accepts anyone's handshake; rate-limiting belongs
at the firewall — see [public-internet.md](deployment/public-internet.md).
## The three secrets on disk
| Secret | What it is | Impact if leaked or lost |
| -------------------------- | ----------------------------------------- | -------------------------------------------- |
| `keys/private.pem` | RSA private key, this node's identity. | Anyone with it can impersonate this node. |
| `node.yaml.cluster_secret` | Pre-shared base64 string. | Anyone with it can `Join` the cluster. |
| `trust.yaml.entries[].cert_pem` | Other peers' public certs (not secrets, but they enable mTLS). | Loss only forces re-trust. |
The first two are real secrets and live under `0600` permissions in
the data directory. Back them up; never commit them; never paste them
in chat.
## TLS handshake step by step
For every inter-node call:
1. Caller dials peer on its `advertise` address.
2. TLS 1.3 handshake. Both sides present their self-signed leaf cert.
3. The caller's `VerifyPeerCertificate` (set in
`internal/transport/tls.go`) computes the SPKI fingerprint of the
server's cert and compares it against `trust.yaml`. If the caller
knows which `NodeID` it expected, a strict verifier ensures the
fingerprint matches *that specific* entry — not just any trusted
peer.
4. The server's TLS layer accepts any client cert (`RequireAnyClientCert`,
`InsecureSkipVerify: true`) because trust is enforced one layer up.
5. The RPC dispatcher reads the client's cert, computes its
fingerprint, and looks it up in the server's `trust.yaml`. If no
entry exists, only the `Join` method is permitted.
6. `Join` performs a constant-time comparison of the inbound
`ClusterSecret` against `node.yaml.cluster_secret`. Mismatch →
refusal.
So:
- An adversary who gets your **public** cert can't impersonate you.
- An adversary who gets your **fingerprint** can't impersonate you.
- An adversary who gets your **private key** *can* impersonate you to
any peer that trusts your fingerprint.
## The TOFU step
`qu node add <host:port>` runs a one-shot insecure dial against the
target (the only place `InsecureBootstrapConfig` is used in the
codebase, see `internal/transport/tls.go:91`). It fetches the
remote's cert, prints the fingerprint, and asks for confirmation.
This is **identical** to SSH's first-connection prompt. The operator
must verify the fingerprint out of band — by running `qu status` on
the remote side, or by reading `keys/cert.pem` directly, or via a
known-good distribution channel.
If you skip verification, you trust the network at that moment. If
the network was MITM'd at exactly that moment, you trust the
attacker. After the prompt, the cert is pinned and the window closes.
## Cluster secret rotation
There is no built-in command to rotate the cluster secret. The hard
part isn't generating a new one — it's distributing it consistently
across every node. The pragmatic recipe:
1. Generate a new secret on one node and copy it to every other node.
2. Update `node.yaml.cluster_secret` on every node (manual edit).
3. Restart each daemon one at a time, verifying quorum returns
between restarts.
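Spelled out as shell steps (the `openssl` call stands in for any
32-byte random source; the `sed` edit assumes a top-level
`cluster_secret:` key in `node.yaml` under the systemd layout, and any
editor works just as well):
```sh
# 1. Generate once; distribute out of band.
NEW_SECRET=$(openssl rand -base64 32)

# 2. On every node, swap the value in node.yaml.
sudo sed -i "s|^cluster_secret:.*|cluster_secret: ${NEW_SECRET}|" /etc/quptime/node.yaml

# 3. Restart one node at a time, confirming quorum between restarts.
sudo systemctl restart quptime
sudo -u quptime qu status   # wait for quorum true before touching the next node
```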
Rotation only protects future `Join` calls, not anything else. If you
suspect the old secret has been seen by an adversary, also assume any
peer that was added during the leaked window is compromised, and
re-init those peers from scratch.
## Identity rotation
To roll a node's RSA keypair (e.g., the private key was on a laptop
that got stolen):
```sh
# On the compromised node:
sudo systemctl stop quptime
sudo rm -rf /etc/quptime
sudo -u quptime qu init \
--advertise this-host.example.com:9901 \
--secret '<existing cluster secret>'
sudo systemctl start quptime
# On a surviving healthy node:
sudo -u quptime qu node remove <old-node-id> # evict the old identity
sudo -u quptime qu node add this-host.example.com:9901
```
The new `node_id` is a fresh UUID; the old one is gone for good. Any
historical references to it (e.g., the `updated_by` field on past
versions of `cluster.yaml`) are cosmetic.
## What the local control socket protects
`$XDG_RUNTIME_DIR/quptime/quptime.sock` (or `/var/run/quptime/...`) is
the channel the CLI uses to talk to the local daemon. It's `0600`
permissioned and authenticated solely by filesystem ACLs — no TLS, no
secrets in the protocol.
Anyone who can `read+write` the socket can:
- Propose cluster mutations (will be relayed to the master).
- Read full cluster state including `cluster.yaml`.
- Trigger test alerts.
So: don't put the daemon's user in a group that other unprivileged
users share. The default systemd setup with a dedicated `quptime`
user gets this right.
## Hardening checklist
- [ ] Dedicated `quptime` system user.
- [ ] Data directory owned by that user, mode 0750.
- [ ] `keys/private.pem` mode 0600.
- [ ] `node.yaml` mode 0600.
- [ ] systemd unit uses `ProtectSystem=strict`, `NoNewPrivileges=true`,
and the rest of the hardening directives in
[systemd.md](deployment/systemd.md).
- [ ] If `:9901` is internet-reachable, firewall allow-list to peer
IPs or use an overlay — see [public-internet.md](deployment/public-internet.md)
and [tailscale.md](deployment/tailscale.md).
- [ ] Cluster secret generated by `qu init` (not chosen by a human),
stored in your secret manager.
- [ ] Backups of `keys/` and `node.yaml` are encrypted at rest.
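A quick spot-check for the file-permission items, using the paths from
the systemd layout:
```sh
sudo stat -c '%a %U %n' /etc/quptime /etc/quptime/node.yaml /etc/quptime/keys/private.pem
# expect: 750 quptime /etc/quptime
#         600 quptime /etc/quptime/node.yaml
#         600 quptime /etc/quptime/keys/private.pem
```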
+214
View File
@@ -0,0 +1,214 @@
# Troubleshooting
The cluster is misbehaving. This page is organised by symptom. Each
entry pairs the user-visible signal with the log line(s) you'll see
in `journalctl -u quptime` and the fix.
## `qu status` shows `quorum false`
**What it means.** Fewer than a strict majority (⌊N/2⌋+1) of the peers
are live.
**Diagnose.** Look at the PEERS table. The `LIVE` column tells you
which peers this node has stopped hearing from.
- If only this node is "live" and everyone else is not → this node is
network-isolated. Test: `nc -zv <peer-advertise>`. Fix: network /
firewall.
- If multiple nodes show false → more than one peer is down. Look at
the other peers' status outputs to triangulate.
- If everyone is live but `quorum false` still → check
`cluster.yaml.peers` length vs. live count; you may have phantom
peer entries left over from a removed-but-not-evicted node. Fix:
`qu node remove <ghost-node-id>` from any live node.
## `qu status` shows `master (none — ...)`
**What it means.** Either no quorum (see above) or election is in
flight. The latter clears within ~1 heartbeat.
If `term` is incrementing rapidly (`watch qu status`), the master is
flapping. Causes:
- The currently-elected master is unreachable from some peers but
reachable from others, partial-partition style. Look for log lines
on the suspected master about peers it can't reach.
- Heartbeat timeouts (default 4s) are too tight for your inter-node
link. Rebuild with a higher `DefaultDeadAfter` if you need it.
## A check is stuck in `unknown`
**What it means.** The aggregator has no fresh reports for that check.
Possible causes:
- No node is actually running the probe yet. Probes start ~`interval/10`
after `qu serve` boots and reconcile every 5s. Wait 10s and
re-check.
- Nodes are submitting results but they're stale (older than 3×
interval). Probably means probes are timing out without reporting.
- This is a follower's view; the aggregator runs on the master only.
Check `qu status` on the master to see the canonical view.
## Alerts not firing
Walk this list in order; one of them will catch it:
1. **Is there quorum?** Aggregator runs on master only. No master →
no transitions → no alerts.
2. **Is the alert attached to the check?** `qu status` shows the
effective alert list per check. Empty → no alert. Confirm with
`qu alert list` that the alert exists and (if relying on default
attachment) has `default: true`.
3. **Is the alert suppressed on this check?** Check
`suppress_alert_ids` in `cluster.yaml`.
4. **Test the alert path directly:**
```sh
sudo -u quptime qu alert test <name>
```
This bypasses the aggregator and renders a synthetic transition.
If `alert test` doesn't deliver, the problem is the notifier
config or the template — see below. If `alert test` works but real
transitions don't, the aggregator isn't observing the transition.
5. **Has the check actually transitioned?** Aggregator commits a flip
only after **two consecutive** evaluations agree. A bouncing
target may never satisfy the hysteresis. Lower the check interval
or increase reliability of the target.
## Discord webhook returns 4xx
The dispatcher logs the HTTP body. Common causes:
- Webhook revoked / channel deleted → 404. Re-issue and update
`discord_webhook`.
- Body too large → 400. Long templates that pull `Snapshot.Detail`
with multi-line errors can blow past Discord's 2000-char limit.
Shorten the template or trim the variable.
- Rate-limited → 429. Reduce alert frequency, or lean on the built-in
  two-evaluation hysteresis rather than working around it.
## SMTP refuses the message
Check the daemon log for `smtp:` lines. Most common:
- `530 5.7.0 Must issue a STARTTLS command first` → set
`smtp_starttls: true` on the alert.
- `535 Authentication failed` → wrong `smtp_user` / `smtp_password`.
- Connection refused / timeout → firewall between `qu` and the SMTP
relay. Verify with `openssl s_client -starttls smtp -connect host:587`.
## Manual edit to `cluster.yaml` was ignored
Symptoms: you edited the file, saved, nothing happened.
Look for one of these log lines:
- `manual-edit: parse cluster.yaml: <err> — ignoring` → YAML is
  invalid. The daemon pins the bad hash and waits for the next valid
  save. Run the file through `yq` or a one-liner with Python's
  `yaml.safe_load` to diagnose (both spelled out after this list).
- `manual-edit: cluster.yaml changed externally — replicating via
master` followed by `manual-edit: forward to master: no quorum` →
cluster has no quorum, can't accept the edit. Restore quorum first.
- *No log line at all* → the on-disk content didn't change in a way
that matters. The watcher compares only `peers`, `checks`, and
`alerts`; whitespace and comment edits are accepted silently.
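The two checks above, spelled out against the systemd layout (`yq` v4
syntax assumed):
```sh
yq eval '.' /etc/quptime/cluster.yaml >/dev/null && echo OK
python3 -c "import yaml, sys; yaml.safe_load(open(sys.argv[1])); print('OK')" /etc/quptime/cluster.yaml
```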
## Two nodes disagree on `config ver`
The follower with the lower version should pull within one heartbeat.
If after ~5 seconds the gap persists:
- The follower might not have an `advertise` address for the higher-
versioned peer. The version observer needs one to pull. Check
`cluster.yaml.peers` for both sides' `advertise` fields.
- The follower's TLS handshake against the higher-versioned peer is
failing — look for `replicate: pull from <id>: <err>` lines.
- The peer with the higher version is announcing it correctly but the
follower is rejecting the `ApplyClusterCfg` broadcasts because of
its own decode error — look for transport-layer errors instead.
## "needs ≥2 live to mutate" rejection during bootstrap
You ran two `qu node add` commands back-to-back and the second one
failed. The first add doesn't take effect until the new peer sends
its first heartbeat (≤ 1 second); during that window the cluster has
size 2 and quorum size 2, so a *second* peer add from a 1-live
cluster looks like "mutate without quorum."
Fix: pause ~3 seconds between adds. The README and the systemd guide
both call this out.
## Daemon refuses to start
```
load node.yaml: open ...: no such file or directory
```
`qu serve` normally auto-bootstraps a missing `node.yaml` using the
`QUPTIME_*` env vars (see
[configuration.md](configuration.md#auto-init-on-qu-serve)). If you
still see this error, the most likely causes are:
- The data directory is read-only or owned by a different user — the
bootstrap can't write `node.yaml`. Fix permissions on
`$QUPTIME_DIR`.
- Something else removed `node.yaml` mid-run (a config-management
tool, a misconfigured volume). Re-run `qu serve` and it will
rebuild from env, or run `qu init` manually with the flags you
want.
```
node.yaml has empty node_id — run `qu init` first
```
`node.yaml` exists but lacks a `node_id`. Either delete the file and
let auto-init regenerate it, or run `qu init` against a wiped data
dir.
```
listen tcp :9901: bind: address already in use
```
Another process owns the port. `ss -tlnp | grep :9901` to find it.
```
load private key: ...
```
Permissions on `keys/private.pem` are wrong — should be 0600 and owned
by the daemon user. Fix and restart.
## Probes look much slower than expected
ICMP first:
- Default ICMP is **unprivileged UDP-mode pings**, not raw ICMP. UDP
ping is a bit slower and may hit different kernel paths. For
reference latency, grant `CAP_NET_RAW`.
HTTP / TCP:
- `interval` and `timeout` are the only knobs in `cluster.yaml`. The
check is run synchronously per worker; if your target takes 9 s to
respond and your timeout is 10 s, the next probe doesn't start
  until those ~9 s have elapsed. Increase concurrency by adding more
fast-interval checks against the same target, not by lowering
timeout (which will just produce false `down` results).
## I want to start over
```sh
sudo systemctl stop quptime
sudo rm -rf /etc/quptime
sudo -u quptime qu init --advertise <addr>
sudo systemctl start quptime
```
The data directory is the only state. Wipe it and you're back to a
fresh node.
Under Docker (or any env-driven deploy), the explicit `qu init` step
isn't needed — wiping the data volume and restarting the container is
enough; `qu serve` will re-bootstrap from the `QUPTIME_*` env vars.
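A sketch of the Docker equivalent, assuming the compose file from the
Tailscale guide (the data volume name carries your compose project as
a prefix, so check `docker volume ls` first):
```sh
docker compose down
docker volume rm <project>_quptime   # the qu data volume only; leave the tailscale volume alone
docker compose up -d                 # qu serve re-bootstraps from the QUPTIME_* env vars
```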
+220 -31
View File
@@ -1,23 +1,42 @@
#!/bin/bash #!/bin/bash
# QUptime installer.
#
# Downloads the latest released `qu` binary, verifies it against the
# published SHA256SUMS, installs it to /usr/local/bin, and (on systemd
# hosts) drops in a hardened quptime.service that matches the unit
# documented in docs/deployment/systemd.md.
#
# Release sources, tried in order:
# 1. Gitea: git.cer.sh/axodouble/quptime/releases (primary — canonical home)
# 2. GitHub: github.com/Axodouble/QUptime/releases (push-mirror fallback)
#
# Idempotent — re-running upgrades the binary and refreshes the unit
# without touching the data directory.
set -euo pipefail set -euo pipefail
INSTALL_BIN="/usr/local/bin/qu" INSTALL_BIN="/usr/local/bin/qu"
SERVICE_FILE="/etc/systemd/system/qu-serve.service" SERVICE_FILE="/etc/systemd/system/quptime.service"
SERVICE_USER="${SUDO_USER:-$(whoami)}" SERVICE_NAME="$(basename "$SERVICE_FILE")"
SERVICE_GROUP="$(id -gn "$SERVICE_USER" 2>/dev/null || echo root)" SERVICE_USER="quptime"
SERVICE_GROUP="quptime"
DATA_DIR="/etc/quptime"
# Release sources, in preference order. Each row is:
# <name>|<latest-release API endpoint>|<release-asset base URL>
# The asset URL is concatenated with `/<tag>/<filename>`. Adjust here
# if the project moves hosts.
SOURCES=(
"gitea|https://git.cer.sh/api/v1/repos/axodouble/quptime/releases/latest|https://git.cer.sh/axodouble/quptime/releases/download"
"github|https://api.github.com/repos/Axodouble/QUptime/releases/latest|https://github.com/Axodouble/QUptime/releases/download"
)
fail() { fail() {
echo "Error: $*" >&2 echo "Error: $*" >&2
exit 1 exit 1
} }
echo_cmd() {
echo -e "\033[90m> $1\033[0m"
eval "$1"
}
require_command() { require_command() {
command -v "$1" > /dev/null 2>&1 || fail "$1 is not installed. Please install $1 and try again." command -v "$1" >/dev/null 2>&1 || fail "$1 is not installed. Please install $1 and try again."
} }
write_completion() { write_completion() {
@@ -31,52 +50,222 @@ write_completion() {
return 1 return 1
} }
require_command jq # fetch_from_source tries one release source end-to-end: pulls the
# latest tag from its API, downloads the per-arch binary and the
# accompanying SHA256SUMS, and verifies the checksum. Returns 0 on
# success (with RELEASE and BINARY_NAME set as globals) or 1 if any
# step fails — callers can then try the next source. Stderr is kept
# quiet so a failed primary doesn't spam the operator before the
# fallback is attempted.
fetch_from_source() {
local api_url=$1
local release_base=$2
local tmpdir=$3
local release
release=$(curl -fsSL --proto '=https' --tlsv1.2 "$api_url" 2>/dev/null | jq -r '.tag_name' 2>/dev/null) \
|| return 1
[ -n "$release" ] && [ "$release" != "null" ] || return 1
local binary_name="qu-${release}-linux-${ARCH}"
local binary_url="${release_base}/${release}/${binary_name}"
local sums_url="${release_base}/${release}/SHA256SUMS"
curl -fsSL --proto '=https' --tlsv1.2 -o "$tmpdir/$binary_name" "$binary_url" 2>/dev/null \
|| return 1
curl -fsSL --proto '=https' --tlsv1.2 -o "$tmpdir/SHA256SUMS" "$sums_url" 2>/dev/null \
|| return 1
# Verify against the SHA256SUMS that came from the same source as
# the binary. Never mix sources here — verifying a GitHub-hosted
# binary against a Gitea-hosted SHA256SUMS would defeat the
# tamper check.
(
cd "$tmpdir"
if ! grep -E "[[:space:]]\\*?${binary_name}\$" SHA256SUMS > expected.sum; then
exit 1
fi
if ! sha256sum -c expected.sum >/dev/null 2>&1; then
exit 1
fi
) || return 1
RELEASE="$release"
BINARY_NAME="$binary_name"
return 0
}
require_command curl require_command curl
require_command jq
require_command sha256sum
require_command install
require_command mktemp
# --- target architecture ------------------------------------------------
case "$(uname -m)" in
x86_64) ARCH=amd64 ;;
aarch64|arm64) ARCH=arm64 ;;
*) fail "unsupported architecture: $(uname -m). Pre-built binaries are published for amd64 and arm64 only — build from source for other platforms." ;;
esac
if [ ! -w "$(dirname "$INSTALL_BIN")" ]; then if [ ! -w "$(dirname "$INSTALL_BIN")" ]; then
fail "You are not allowed to write to $(dirname "$INSTALL_BIN"). Run this script with sudo or install qu manually." fail "Cannot write to $(dirname "$INSTALL_BIN"). Run this script with sudo, or set INSTALL_BIN to a writable location."
fi fi
RELEASE=$(curl -s https://git.cer.sh/api/v1/repos/axodouble/quptime/releases/latest | jq -r '.tag_name') # --- download + verify (with fallback) ----------------------------------
TMPDIR=$(mktemp -d)
trap 'rm -rf "$TMPDIR"' EXIT
echo_cmd "curl -L -o '$INSTALL_BIN' 'https://git.cer.sh/axodouble/quptime/releases/download/${RELEASE}/qu-${RELEASE}-linux-amd64'" # Globals filled in by fetch_from_source on success.
echo_cmd "chmod +x '$INSTALL_BIN'" RELEASE=""
echo "> qu has been installed to $INSTALL_BIN" BINARY_NAME=""
INSTALLED_FROM=""
INSTALLED_TMP=""
for source_spec in "${SOURCES[@]}"; do
IFS='|' read -r src_name src_api src_base <<<"$source_spec"
src_tmp="$TMPDIR/$src_name"
mkdir -p "$src_tmp"
echo "> trying release source: $src_name"
# `set -e` would abort the whole script the moment fetch_from_source
# returns nonzero; we want the loop to fall through to the next
# source instead. Wrap the call so a failure is just data.
if fetch_from_source "$src_api" "$src_base" "$src_tmp"; then
INSTALLED_FROM="$src_name"
INSTALLED_TMP="$src_tmp"
echo "> $src_name: ${RELEASE} ✓ checksum OK"
break
fi
echo "> $src_name: unavailable"
done
if [ -z "$INSTALLED_FROM" ]; then
fail "no release source reachable — tried: $(printf '%s ' "${SOURCES[@]%%|*}"). Check network access to git.cer.sh and github.com."
fi
install -m 0755 "$INSTALLED_TMP/$BINARY_NAME" "$INSTALL_BIN"
echo "> qu ${RELEASE} installed to $INSTALL_BIN (source: $INSTALLED_FROM)"
# --- shell completions --------------------------------------------------
if "$INSTALL_BIN" --help 2>/dev/null | grep -q "completion"; then if "$INSTALL_BIN" --help 2>/dev/null | grep -q "completion"; then
write_completion bash /usr/share/bash-completion/completions/qu \ write_completion bash /usr/share/bash-completion/completions/qu \
|| write_completion bash /etc/bash_completion.d/qu || true || write_completion bash /etc/bash_completion.d/qu \
|| true
write_completion zsh /usr/share/zsh/site-functions/_qu || true write_completion zsh /usr/share/zsh/site-functions/_qu || true
write_completion fish /usr/share/fish/vendor_completions.d/qu.fish || true write_completion fish /usr/share/fish/vendor_completions.d/qu.fish || true
else else
echo "> qu does not expose completion support; skipping shell completion installation." echo "> qu does not expose completion support; skipping shell completion installation."
fi fi
if ! command -v systemctl > /dev/null 2>&1; then # --- systemd unit -------------------------------------------------------
echo "> Warning: systemd is not available on this system. qu serve will not be automatically started on boot." if ! command -v systemctl >/dev/null 2>&1; then
echo "Installation complete, before starting qu serve, make sure to run qu init and read the documentation." echo
echo "> systemd is not available on this system. Installation stops here."
echo "> Run \`qu serve\` manually (or wire it into the supervisor of your choice)."
exit 0 exit 0
fi fi
echo "> Creating systemd service file for qu serve..." # Dedicated service user. Hardened unit drops all capabilities and
cat > "$SERVICE_FILE" <<EOL # locks the daemon down with ProtectSystem=strict, so it must run as
# its own unprivileged account rather than the invoking sudo user.
if ! id "$SERVICE_USER" >/dev/null 2>&1; then
echo "> creating system user $SERVICE_USER"
useradd --system --no-create-home --shell /usr/sbin/nologin "$SERVICE_USER"
fi
install -d -o "$SERVICE_USER" -g "$SERVICE_GROUP" -m 0750 "$DATA_DIR"
echo "> writing $SERVICE_FILE"
cat > "$SERVICE_FILE" <<'EOF'
[Unit] [Unit]
Description=QUptime Serve Description=QUptime distributed uptime monitor
After=network.target Documentation=https://git.cer.sh/axodouble/quptime
Wants=network-online.target
After=network-online.target
[Service] [Service]
ExecStart=$INSTALL_BIN serve Type=simple
ExecStart=/usr/local/bin/qu serve
Restart=always Restart=always
User=$SERVICE_USER RestartSec=5s
Group=$SERVICE_GROUP
User=quptime
Group=quptime
# Where state lives. RuntimeDirectory creates /var/run/quptime/ each
# boot owned by User:Group with mode 0750.
Environment=QUPTIME_DIR=/etc/quptime
RuntimeDirectory=quptime
RuntimeDirectoryMode=0750
ReadWritePaths=/etc/quptime /var/run/quptime
# Hardening. Comment out individual directives if a probe needs
# something we've revoked.
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
PrivateTmp=true
PrivateDevices=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
ProtectClock=true
ProtectHostname=true
RestrictNamespaces=true
RestrictRealtime=true
RestrictSUIDSGID=true
LockPersonality=true
MemoryDenyWriteExecute=true
# Network access is required (we're a network monitor). Keep address
# families minimal — AF_NETLINK is needed for some libc lookups.
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK
# If you need raw ICMP, *also* uncomment:
# AmbientCapabilities=CAP_NET_RAW
# CapabilityBoundingSet=CAP_NET_RAW
# Otherwise drop all capabilities:
CapabilityBoundingSet=
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target
EOL EOF
echo_cmd "systemctl daemon-reload" systemctl daemon-reload
echo_cmd "systemctl enable $(basename "$SERVICE_FILE")" systemctl enable "$SERVICE_NAME" >/dev/null
echo "> qu serve service has been created and enabled. You can start it with 'systemctl start $(basename "$SERVICE_FILE")'" echo "> ${SERVICE_NAME} installed and enabled (not yet started)"
echo "Installation complete, before starting qu serve, make sure to run qu init and read the documentation." cat <<EOF
Installation complete.
Next steps:
1. Initialise the node identity. Either:
a) Let \`qu serve\` auto-init from environment variables.
Drop a systemd override like:
sudo systemctl edit ${SERVICE_NAME}
[Service]
Environment=QUPTIME_ADVERTISE=<this-host>:9901
# On follower nodes, also set the shared join secret:
# Environment=QUPTIME_CLUSTER_SECRET=<paste from first node>
b) Or run \`qu init\` once explicitly:
sudo -u ${SERVICE_USER} QUPTIME_DIR=${DATA_DIR} \\
qu init --advertise <this-host>:9901
2. Start the service:
sudo systemctl start ${SERVICE_NAME}
sudo -u ${SERVICE_USER} qu status
3. For ICMP checks, the daemon defaults to unprivileged UDP-mode
pings — those need the ping_group_range sysctl widened to include
the ${SERVICE_USER} GID, or grant CAP_NET_RAW in the unit. See
docs/deployment/systemd.md for the recipes.
Full documentation: https://git.cer.sh/axodouble/quptime/src/branch/master/docs
EOF
+121 -60
View File
@@ -5,6 +5,7 @@ import (
"encoding/base64" "encoding/base64"
"errors" "errors"
"fmt" "fmt"
"io"
"os" "os"
"github.com/google/uuid" "github.com/google/uuid"
@@ -30,78 +31,50 @@ Pass --secret on every subsequent node so they share the same
cluster join secret. If --secret is omitted on the very first node, a cluster join secret. If --secret is omitted on the very first node, a
random secret is generated and printed for the operator to copy. random secret is generated and printed for the operator to copy.
Every flag may also be supplied via its QUPTIME_* environment variable
(see docs/configuration.md). Explicit flags win over env values, which
in turn win over the compiled defaults.
Idempotent in one direction only: existing key material is never Idempotent in one direction only: existing key material is never
overwritten. Re-run only after wiping the data directory.`, overwritten. Re-run only after wiping the data directory.`,
RunE: func(cmd *cobra.Command, args []string) error { RunE: func(cmd *cobra.Command, args []string) error {
if err := config.EnsureDataDir(); err != nil {
return err
}
if _, err := os.Stat(config.NodeFilePath()); err == nil { if _, err := os.Stat(config.NodeFilePath()); err == nil {
return errors.New("node.yaml already exists in data dir — refusing to overwrite") return errors.New("node.yaml already exists in data dir — refusing to overwrite")
} }
secret := clusterSecret // Only let env fill fields the operator did NOT pass on the
generated := false // command line; explicit flags must win over env.
if secret == "" { n := &config.NodeConfig{}
s, err := generateSecret() if cmd.Flags().Changed("bind") {
if err != nil { n.BindAddr = bindAddr
return fmt.Errorf("generate cluster secret: %w", err)
} }
secret = s if cmd.Flags().Changed("port") {
generated = true n.BindPort = bindPort
}
if cmd.Flags().Changed("advertise") {
n.Advertise = advertise
}
if cmd.Flags().Changed("secret") {
n.ClusterSecret = clusterSecret
}
if err := n.ApplyEnvOverrides(); err != nil {
return err
}
// Cobra defaults (bind=0.0.0.0, port=9901) are still
// available as fallbacks for fields neither flag nor env
// touched.
if n.BindAddr == "" {
n.BindAddr = bindAddr
}
if n.BindPort == 0 {
n.BindPort = bindPort
} }
nodeID := uuid.NewString() _, generated, err := bootstrapNode(n)
n := &config.NodeConfig{
NodeID: nodeID,
BindAddr: bindAddr,
BindPort: bindPort,
Advertise: advertise,
ClusterSecret: secret,
}
if err := n.Save(); err != nil {
return fmt.Errorf("save node.yaml: %w", err)
}
if _, err := crypto.GenerateKeyPair(nodeID); err != nil {
return fmt.Errorf("generate keys: %w", err)
}
// Seed cluster.yaml with this node as its own first peer.
// Without this the math in `quorum` would treat a one-node
// cluster as "0 peers, fallback quorum=1, master=self" —
// which works in isolation but breaks the moment another
// node joins, because the replicated peers list would lack
// the inviter, leading to split-brain elections.
certPEM, err := crypto.LoadCertPEM()
if err != nil { if err != nil {
return fmt.Errorf("load cert: %w", err) return err
}
fp, err := crypto.FingerprintFromCertPEM(certPEM)
if err != nil {
return fmt.Errorf("fingerprint own cert: %w", err)
}
cluster := &config.ClusterConfig{}
if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error {
c.Peers = []config.PeerInfo{{
NodeID: nodeID,
Advertise: n.AdvertiseAddr(),
Fingerprint: fp,
CertPEM: string(certPEM),
}}
return nil
}); err != nil {
return fmt.Errorf("seed cluster.yaml: %w", err)
}
out := cmd.OutOrStdout()
fmt.Fprintf(out, "initialised node %s\n", nodeID)
fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())
if generated {
fmt.Fprintln(out)
fmt.Fprintln(out, "cluster secret (copy to every other node via --secret):")
fmt.Fprintln(out, " "+secret)
} }
printBootstrapResult(cmd.OutOrStdout(), n, generated)
return nil return nil
}, },
} }
@@ -112,6 +85,94 @@ overwritten. Re-run only after wiping the data directory.`,
root.AddCommand(cmd) root.AddCommand(cmd)
} }
// bootstrapNode creates the data dir, writes node.yaml, generates the
// keypair, and seeds cluster.yaml with this node as its own first
// peer. cfg may arrive with any subset of fields populated; missing
// NodeID and ClusterSecret are auto-generated, missing BindAddr /
// BindPort get the compiled defaults.
//
// Returns the populated config (the same pointer that was passed in)
// and a flag indicating whether ClusterSecret was generated here. The
// flag exists so the caller can print the secret for the operator —
// it must be copied to every follower node out-of-band.
//
// Caller is responsible for checking that node.yaml does not yet
// exist; bootstrapNode itself will refuse to overwrite an existing
// keypair (crypto.GenerateKeyPair errors out) but does not guard
// against clobbering node.yaml.
func bootstrapNode(cfg *config.NodeConfig) (*config.NodeConfig, bool, error) {
if err := config.EnsureDataDir(); err != nil {
return nil, false, err
}
if cfg.NodeID == "" {
cfg.NodeID = uuid.NewString()
}
if cfg.BindAddr == "" {
cfg.BindAddr = "0.0.0.0"
}
if cfg.BindPort == 0 {
cfg.BindPort = 9901
}
generated := false
if cfg.ClusterSecret == "" {
s, err := generateSecret()
if err != nil {
return nil, false, fmt.Errorf("generate cluster secret: %w", err)
}
cfg.ClusterSecret = s
generated = true
}
if err := cfg.Save(); err != nil {
return nil, false, fmt.Errorf("save node.yaml: %w", err)
}
if _, err := crypto.GenerateKeyPair(cfg.NodeID); err != nil {
return nil, false, fmt.Errorf("generate keys: %w", err)
}
// Seed cluster.yaml with this node as its own first peer.
// Without this the math in `quorum` would treat a one-node
// cluster as "0 peers, fallback quorum=1, master=self" — which
// works in isolation but breaks the moment another node joins,
// because the replicated peers list would lack the inviter,
// leading to split-brain elections.
certPEM, err := crypto.LoadCertPEM()
if err != nil {
return nil, false, fmt.Errorf("load cert: %w", err)
}
fp, err := crypto.FingerprintFromCertPEM(certPEM)
if err != nil {
return nil, false, fmt.Errorf("fingerprint own cert: %w", err)
}
cluster := &config.ClusterConfig{}
if err := cluster.Mutate(cfg.NodeID, func(c *config.ClusterConfig) error {
c.Peers = []config.PeerInfo{{
NodeID: cfg.NodeID,
Advertise: cfg.AdvertiseAddr(),
Fingerprint: fp,
CertPEM: string(certPEM),
}}
return nil
}); err != nil {
return nil, false, fmt.Errorf("seed cluster.yaml: %w", err)
}
return cfg, generated, nil
}
// printBootstrapResult emits the human-readable summary both `qu init`
// and the serve auto-init path print after bootstrapping. Kept in one
// place so the secret-disclosure format stays identical across the two
// entry points.
func printBootstrapResult(out io.Writer, n *config.NodeConfig, secretGenerated bool) {
fmt.Fprintf(out, "initialised node %s\n", n.NodeID)
fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())
if secretGenerated {
fmt.Fprintln(out)
fmt.Fprintln(out, "cluster secret (copy to every other node via --secret or QUPTIME_CLUSTER_SECRET):")
fmt.Fprintln(out, " "+n.ClusterSecret)
}
}
// generateSecret produces 32 bytes of crypto-random data and returns
// it base64-encoded. Long enough that brute force isn't a concern;
// short enough that operators can copy-paste it without pagination.
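The generateSecret body itself is cut off by the end of this hunk. A minimal, hypothetical sketch of what the comment describes (assuming crypto/rand and encoding/base64; not taken from the repository):

package main

import (
	"crypto/rand"
	"encoding/base64"
	"fmt"
)

// generateSecret-style helper: 32 bytes of crypto-random data, base64-encoded.
func generateSecret() (string, error) {
	buf := make([]byte, 32)
	if _, err := rand.Read(buf); err != nil {
		return "", err
	}
	return base64.StdEncoding.EncodeToString(buf), nil
}

func main() {
	s, err := generateSecret()
	if err != nil {
		panic(err)
	}
	fmt.Println(len(s), s) // 44 base64 characters for 32 raw bytes
}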
+50 -1
View File
@@ -2,6 +2,9 @@ package cli
import (
"context"
"errors"
"fmt"
"io/fs"
"log"
"os"
"os/signal"
@@ -9,6 +12,7 @@ import (
"github.com/spf13/cobra" "github.com/spf13/cobra"
"git.cer.sh/axodouble/quptime/internal/config"
"git.cer.sh/axodouble/quptime/internal/daemon" "git.cer.sh/axodouble/quptime/internal/daemon"
) )
@@ -18,9 +22,18 @@ func addServeCmd(root *cobra.Command) {
Short: "Run the qu daemon in the foreground", Short: "Run the qu daemon in the foreground",
Long: `Run the qu daemon: starts the inter-node listener, the local Long: `Run the qu daemon: starts the inter-node listener, the local
control socket for the CLI, the heartbeat loop and the check control socket for the CLI, the heartbeat loop and the check
scheduler. Stops cleanly on SIGINT or SIGTERM.`, scheduler. Stops cleanly on SIGINT or SIGTERM.
If node.yaml does not exist yet, serve will bootstrap it using values
from the QUPTIME_* environment variables (see docs/configuration.md).
This makes a single ` + "`docker compose up`" + ` enough to launch a new node —
no separate ` + "`qu init`" + ` step is required when the data volume is
fresh.`,
RunE: func(cmd *cobra.Command, args []string) error {
logger := log.New(os.Stderr, "quptime: ", log.LstdFlags|log.Lmsgprefix)
if err := autoInitIfNeeded(cmd, logger); err != nil {
return err
}
d, err := daemon.New(logger)
if err != nil {
return err
@@ -32,3 +45,39 @@ scheduler. Stops cleanly on SIGINT or SIGTERM.`,
}
root.AddCommand(cmd)
}
// autoInitIfNeeded bootstraps the node on first launch.
//
// Friction this removes for container deploys: before, the operator
// had to `docker compose run --rm quptime init …` once before the
// service could come up, which makes `restart: unless-stopped`
// awkward and forces an out-of-band step into every fresh volume.
// Now serve auto-runs the same bootstrap path using QUPTIME_* env
// vars when node.yaml is absent, so the compose file can come up on
// the first try.
//
// Pre-existing node.yaml is left untouched — we only bootstrap when
// the file is genuinely missing. Any other stat error (permission
// denied, broken symlink) is surfaced so the operator sees the real
// problem instead of a confused auto-init attempt clobbering state.
func autoInitIfNeeded(cmd *cobra.Command, logger *log.Logger) error {
_, err := os.Stat(config.NodeFilePath())
if err == nil {
return nil
}
if !errors.Is(err, fs.ErrNotExist) {
return fmt.Errorf("stat node.yaml: %w", err)
}
logger.Printf("node.yaml not found at %s — bootstrapping from environment", config.NodeFilePath())
n := &config.NodeConfig{}
if err := n.ApplyEnvOverrides(); err != nil {
return err
}
if _, generated, err := bootstrapNode(n); err != nil {
return fmt.Errorf("auto-init: %w", err)
} else {
printBootstrapResult(cmd.OutOrStderr(), n, generated)
}
return nil
}
+47
View File
@@ -3,10 +3,26 @@ package config
import (
"fmt"
"os"
"strconv"
"gopkg.in/yaml.v3"
)
// Environment variable names that override fields on NodeConfig at
// load time. Intended to let `docker compose` setups drive a node's
// identity and listener configuration without having to bake a
// node.yaml into the image or run `qu init` manually first.
//
// Empty values are ignored — they do not clear a field. The override
// order is therefore: env (non-empty) > file > compiled default.
const (
EnvNodeID = "QUPTIME_NODE_ID"
EnvBindAddr = "QUPTIME_BIND_ADDR"
EnvBindPort = "QUPTIME_BIND_PORT"
EnvAdvertise = "QUPTIME_ADVERTISE"
EnvClusterSecret = "QUPTIME_CLUSTER_SECRET"
)
// NodeConfig is the per-node, never-replicated identity file.
type NodeConfig struct {
// NodeID is a stable UUID generated at `qu init`. Used by all peers
@@ -45,6 +61,34 @@ func (n *NodeConfig) AdvertiseAddr() string {
return fmt.Sprintf("%s:%d", bind, n.BindPort)
}
// ApplyEnvOverrides folds QUPTIME_* environment variables onto n.
// Non-empty env values win over the existing field value. Called both
// by LoadNodeConfig and by the `qu init` / serve auto-init paths so
// the same precedence rules apply whether the daemon is reading a
// persisted node.yaml or constructing one from scratch.
func (n *NodeConfig) ApplyEnvOverrides() error {
if v := os.Getenv(EnvNodeID); v != "" {
n.NodeID = v
}
if v := os.Getenv(EnvBindAddr); v != "" {
n.BindAddr = v
}
if v := os.Getenv(EnvBindPort); v != "" {
p, err := strconv.Atoi(v)
if err != nil {
return fmt.Errorf("%s=%q: not an integer: %w", EnvBindPort, v, err)
}
n.BindPort = p
}
if v := os.Getenv(EnvAdvertise); v != "" {
n.Advertise = v
}
if v := os.Getenv(EnvClusterSecret); v != "" {
n.ClusterSecret = v
}
return nil
}
// LoadNodeConfig reads node.yaml from the data dir.
func LoadNodeConfig() (*NodeConfig, error) {
raw, err := os.ReadFile(NodeFilePath())
@@ -55,6 +99,9 @@ func LoadNodeConfig() (*NodeConfig, error) {
if err := yaml.Unmarshal(raw, cfg); err != nil {
return nil, fmt.Errorf("parse node.yaml: %w", err)
}
if err := cfg.ApplyEnvOverrides(); err != nil {
return nil, err
}
if cfg.BindPort == 0 {
cfg.BindPort = 9901
}
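The precedence spelled out above (non-empty QUPTIME_* env var, then node.yaml, then compiled default) in a small illustrative snippet. This is not repository code; it assumes only the exported names visible in this diff (EnvBindPort, LoadNodeConfig) and a node.yaml already present in the data dir, and it would have to live inside the quptime module since the package is internal:

package main

import (
	"fmt"
	"os"

	"git.cer.sh/axodouble/quptime/internal/config"
)

func main() {
	// A non-empty env value wins over whatever node.yaml persisted,
	// while unset or empty QUPTIME_* vars leave the file values alone.
	os.Setenv(config.EnvBindPort, "9001")
	cfg, err := config.LoadNodeConfig()
	if err != nil {
		panic(err)
	}
	fmt.Println(cfg.BindAddr, cfg.BindPort) // BindAddr from node.yaml, BindPort 9001
}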
+92
View File
@@ -56,3 +56,95 @@ func TestLoadNodeConfigAppliesDefaults(t *testing.T) {
t.Errorf("BindAddr=%q want 0.0.0.0", loaded.BindAddr) t.Errorf("BindAddr=%q want 0.0.0.0", loaded.BindAddr)
} }
} }
func TestApplyEnvOverrides(t *testing.T) {
t.Setenv(EnvNodeID, "node-from-env")
t.Setenv(EnvBindAddr, "1.2.3.4")
t.Setenv(EnvBindPort, "9999")
t.Setenv(EnvAdvertise, "public.example.com:9999")
t.Setenv(EnvClusterSecret, "shh-secret")
n := &NodeConfig{
NodeID: "original-id",
BindAddr: "0.0.0.0",
BindPort: 9901,
Advertise: "old.example.com:9901",
ClusterSecret: "old-secret",
}
if err := n.ApplyEnvOverrides(); err != nil {
t.Fatal(err)
}
want := NodeConfig{
NodeID: "node-from-env",
BindAddr: "1.2.3.4",
BindPort: 9999,
Advertise: "public.example.com:9999",
ClusterSecret: "shh-secret",
}
if *n != want {
t.Errorf("got %+v want %+v", *n, want)
}
}
func TestApplyEnvOverridesEmptyValuesIgnored(t *testing.T) {
// Explicitly empty env vars must NOT clobber existing fields —
// otherwise `docker run -e QUPTIME_ADVERTISE=` would silently
// erase a previously-persisted advertise address.
t.Setenv(EnvNodeID, "")
t.Setenv(EnvBindAddr, "")
t.Setenv(EnvBindPort, "")
t.Setenv(EnvAdvertise, "")
t.Setenv(EnvClusterSecret, "")
orig := NodeConfig{
NodeID: "keep-me",
BindAddr: "10.0.0.1",
BindPort: 9901,
Advertise: "keep.example.com:9901",
ClusterSecret: "keep-secret",
}
n := orig
if err := n.ApplyEnvOverrides(); err != nil {
t.Fatal(err)
}
if n != orig {
t.Errorf("empty env vars mutated config: got %+v want %+v", n, orig)
}
}
func TestApplyEnvOverridesBadPort(t *testing.T) {
t.Setenv(EnvBindPort, "not-an-int")
n := &NodeConfig{}
if err := n.ApplyEnvOverrides(); err == nil {
t.Fatal("expected error for non-integer port")
}
}
func TestLoadNodeConfigEnvOverridesFile(t *testing.T) {
t.Setenv("QUPTIME_DIR", t.TempDir())
// Persist a file with one bind addr; env should win on load.
n := &NodeConfig{NodeID: "abc", BindAddr: "127.0.0.1", BindPort: 9901, Advertise: "file.example.com:9901"}
if err := n.Save(); err != nil {
t.Fatal(err)
}
t.Setenv(EnvBindAddr, "0.0.0.0")
t.Setenv(EnvAdvertise, "env.example.com:9001")
t.Setenv(EnvBindPort, "9001")
loaded, err := LoadNodeConfig()
if err != nil {
t.Fatal(err)
}
if loaded.BindAddr != "0.0.0.0" {
t.Errorf("BindAddr=%q want 0.0.0.0 (env override)", loaded.BindAddr)
}
if loaded.BindPort != 9001 {
t.Errorf("BindPort=%d want 9001 (env override)", loaded.BindPort)
}
if loaded.Advertise != "env.example.com:9001" {
t.Errorf("Advertise=%q want env.example.com:9001 (env override)", loaded.Advertise)
}
if loaded.NodeID != "abc" {
t.Errorf("NodeID=%q want abc (unchanged)", loaded.NodeID)
}
}
+127 -29
View File
@@ -7,6 +7,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/charmbracelet/bubbles/textarea"
"github.com/charmbracelet/bubbles/textinput" "github.com/charmbracelet/bubbles/textinput"
tea "github.com/charmbracelet/bubbletea" tea "github.com/charmbracelet/bubbletea"
"github.com/charmbracelet/lipgloss" "github.com/charmbracelet/lipgloss"
@@ -53,10 +54,45 @@ func modalDoneCmd(flash string, level flashLevel) tea.Cmd {
type formField struct {
label string
input textinput.Model
textarea textarea.Model
multiline bool
required bool
hint string
}
// value returns the field's current text regardless of whether it's
// backed by a single-line input or a multiline textarea.
func (fld *formField) value() string {
if fld.multiline {
return fld.textarea.Value()
}
return fld.input.Value()
}
func (fld *formField) focus() {
if fld.multiline {
fld.textarea.Focus()
return
}
fld.input.Focus()
}
func (fld *formField) blur() {
if fld.multiline {
fld.textarea.Blur()
return
}
fld.input.Blur()
}
func (fld *formField) setWidth(w int) {
if fld.multiline {
fld.textarea.SetWidth(w)
return
}
fld.input.Width = w
}
type form struct {
title string
fields []formField
@@ -86,12 +122,14 @@ func fieldWidthFor(termWidth int) int {
func newForm(title string, fields []formField, submit func([]string) tea.Cmd) *form {
for i := range fields {
if !fields[i].multiline {
fields[i].input.Prompt = ""
fields[i].input.CharLimit = 256
}
if i == 0 {
- fields[i].input.Focus()
fields[i].focus()
} else {
- fields[i].input.Blur()
fields[i].blur()
}
}
return &form{title: title, fields: fields, submit: submit}
@@ -114,6 +152,31 @@ func textFieldWithValue(label, hint, value string, required bool) formField {
return formField{label: label, hint: hint, required: required, input: ti}
}
// textAreaField creates a multiline field. Enter inserts a newline;
// the form uses shift+enter / ctrl+s to submit when the cursor is on
// one of these. Useful for things like alert body templates where the
// rendered message naturally spans multiple lines.
func textAreaField(label, hint string, required bool) formField {
return textAreaFieldWithValue(label, hint, "", required)
}
func textAreaFieldWithValue(label, hint, value string, required bool) formField {
ta := textarea.New()
ta.Placeholder = hint
ta.ShowLineNumbers = false
ta.Prompt = " "
ta.SetHeight(5)
ta.SetWidth(defaultFieldWidth)
ta.CharLimit = 0
// Keep enter bound to "insert newline" (the textarea default) — the
// surrounding form intercepts enter on single-line fields and handles
// shift+enter/ctrl+s as the submit/advance trigger for multiline ones.
if value != "" {
ta.SetValue(value)
}
return formField{label: label, hint: hint, required: required, multiline: true, textarea: ta}
}
func passwordField(label, hint string) formField {
return passwordFieldWithValue(label, hint, "")
}
@@ -146,7 +209,11 @@ func (f *form) View() string {
labelStyle = lipgloss.NewStyle().Foreground(colorAccent).Bold(true)
}
fmt.Fprintf(&b, "%s%s\n", marker, labelStyle.Render(fld.label))
if fld.multiline {
fmt.Fprintf(&b, "%s\n", fld.textarea.View())
} else {
fmt.Fprintf(&b, " %s\n", fld.input.View())
}
if i == f.cursor && fld.hint != "" {
fmt.Fprintf(&b, " %s\n", helpStyle.Render(fld.hint))
}
@@ -158,7 +225,11 @@ func (f *form) View() string {
if f.busy {
fmt.Fprintf(&b, "%s\n", flashWarnStyle.Render("working…"))
} else {
- fmt.Fprintf(&b, "%s\n", helpStyle.Render("↑↓ field enter next/submit esc cancel"))
help := "↑↓ field enter next/submit esc cancel"
if f.cursor < len(f.fields) && f.fields[f.cursor].multiline {
help = "tab field enter newline shift+enter/ctrl+s submit esc cancel"
}
fmt.Fprintf(&b, "%s\n", helpStyle.Render(help))
}
return b.String()
}
@@ -169,7 +240,7 @@ func (f *form) Update(msg tea.Msg) (modal, tea.Cmd) {
f.width = msg.Width
w := fieldWidthFor(msg.Width)
for i := range f.fields {
- f.fields[i].input.Width = w
f.fields[i].setWidth(w)
}
return f, nil
@@ -179,43 +250,74 @@ func (f *form) Update(msg tea.Msg) (modal, tea.Cmd) {
return f, nil
case tea.KeyMsg:
- switch msg.String() {
- case "esc":
- return f, modalDoneCmd("", flashInfo)
- case "tab", "down":
- f.advance(1)
- return f, nil
- case "shift+tab", "up":
- f.advance(-1)
- return f, nil
- case "enter":
- if f.busy {
- return f, nil
- }
- if f.cursor < len(f.fields)-1 {
- f.advance(1)
- return f, nil
- }
- vals := make([]string, len(f.fields))
- for i, fld := range f.fields {
- vals[i] = fld.input.Value()
- }
- for i, fld := range f.fields {
- if fld.required && strings.TrimSpace(vals[i]) == "" {
- f.err = fld.label + " is required"
- f.cursor = i
- f.focusOnly(i)
- return f, nil
- }
- }
- f.busy = true
- f.err = ""
- return f, f.submit(vals)
- }
- }
- var cmd tea.Cmd
- f.fields[f.cursor].input, cmd = f.fields[f.cursor].input.Update(msg)
- return f, cmd
- }
key := msg.String()
// up/down on a multiline field belong to in-text navigation;
// leave field-switching to tab/shift+tab there. Same for enter:
// the textarea owns it as "insert newline", so submission moves
// to shift+enter / ctrl+s.
multiline := f.cursor < len(f.fields) && f.fields[f.cursor].multiline
switch key {
case "esc":
return f, modalDoneCmd("", flashInfo)
case "tab":
f.advance(1)
return f, nil
case "shift+tab":
f.advance(-1)
return f, nil
case "down":
if !multiline {
f.advance(1)
return f, nil
}
case "up":
if !multiline {
f.advance(-1)
return f, nil
}
case "enter":
if !multiline {
return f, f.submitOrAdvance()
}
case "shift+enter", "ctrl+s":
return f, f.submitOrAdvance()
}
}
var cmd tea.Cmd
if f.fields[f.cursor].multiline {
f.fields[f.cursor].textarea, cmd = f.fields[f.cursor].textarea.Update(msg)
} else {
f.fields[f.cursor].input, cmd = f.fields[f.cursor].input.Update(msg)
}
return f, cmd
}
// submitOrAdvance is the shared trigger for enter on single-line fields
// and shift+enter / ctrl+s on multiline fields: jump to the next field
// or, on the last one, validate and run submit.
func (f *form) submitOrAdvance() tea.Cmd {
if f.busy {
return nil
}
if f.cursor < len(f.fields)-1 {
f.advance(1)
return nil
}
vals := make([]string, len(f.fields))
for i := range f.fields {
vals[i] = f.fields[i].value()
}
for i, fld := range f.fields {
if fld.required && strings.TrimSpace(vals[i]) == "" {
f.err = fld.label + " is required"
f.cursor = i
f.focusOnly(i)
return nil
}
}
f.busy = true
f.err = ""
return f.submit(vals)
}
func (f *form) advance(delta int) {
@@ -230,9 +332,9 @@ func (f *form) advance(delta int) {
func (f *form) focusOnly(i int) {
for j := range f.fields {
if j == i {
- f.fields[j].input.Focus()
f.fields[j].focus()
} else {
- f.fields[j].input.Blur()
f.fields[j].blur()
}
}
}
@@ -241,10 +343,6 @@ func (f *form) focusOnly(i int) {
// error inline without closing the form.
type formSubmitErr string
- func submitErr(err error) tea.Cmd {
- return func() tea.Msg { return formSubmitErr(err.Error()) }
- }
// =============================================================
// Specific forms.
// =============================================================
@@ -298,7 +396,7 @@ func newAddDiscordForm() *form {
textField("Name", "human-friendly identifier", true), textField("Name", "human-friendly identifier", true),
textField("Webhook URL", "https://discord.com/api/webhooks/...", true), textField("Webhook URL", "https://discord.com/api/webhooks/...", true),
textField("Default", "yes/no — attach to every check automatically", false), textField("Default", "yes/no — attach to every check automatically", false),
textField("Body template", alerts.TemplateVarsHint(), false), textAreaField("Body template", alerts.TemplateVarsHint(), false),
} }
return newForm("Add Discord alert", fields, func(vals []string) tea.Cmd { return newForm("Add Discord alert", fields, func(vals []string) tea.Cmd {
return func() tea.Msg { return func() tea.Msg {
@@ -330,7 +428,7 @@ func newAddSMTPForm() *form {
textField("StartTLS", "yes/no — default yes", false), textField("StartTLS", "yes/no — default yes", false),
textField("Default", "yes/no — attach to every check", false), textField("Default", "yes/no — attach to every check", false),
textField("Subject template", alerts.TemplateVarsHint(), false), textField("Subject template", alerts.TemplateVarsHint(), false),
textField("Body template", alerts.TemplateVarsHint(), false), textAreaField("Body template", alerts.TemplateVarsHint(), false),
} }
return newForm("Add SMTP alert", fields, func(vals []string) tea.Cmd { return newForm("Add SMTP alert", fields, func(vals []string) tea.Cmd {
return func() tea.Msg { return func() tea.Msg {
@@ -471,7 +569,7 @@ func newEditDiscordForm(existing config.Alert) *form {
textFieldWithValue("Name", "human-friendly identifier", existing.Name, true), textFieldWithValue("Name", "human-friendly identifier", existing.Name, true),
textFieldWithValue("Webhook URL", "https://discord.com/api/webhooks/...", existing.DiscordWebhook, true), textFieldWithValue("Webhook URL", "https://discord.com/api/webhooks/...", existing.DiscordWebhook, true),
textFieldWithValue("Default", "yes/no — attach to every check automatically", boolStr(existing.Default), false), textFieldWithValue("Default", "yes/no — attach to every check automatically", boolStr(existing.Default), false),
textFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false), textAreaFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
} }
id := existing.ID id := existing.ID
subject := existing.SubjectTemplate subject := existing.SubjectTemplate
@@ -510,7 +608,7 @@ func newEditSMTPForm(existing config.Alert) *form {
textFieldWithValue("StartTLS", "yes/no — default yes", boolStr(existing.SMTPStartTLS), false), textFieldWithValue("StartTLS", "yes/no — default yes", boolStr(existing.SMTPStartTLS), false),
textFieldWithValue("Default", "yes/no — attach to every check", boolStr(existing.Default), false), textFieldWithValue("Default", "yes/no — attach to every check", boolStr(existing.Default), false),
textFieldWithValue("Subject template", alerts.TemplateVarsHint(), existing.SubjectTemplate, false), textFieldWithValue("Subject template", alerts.TemplateVarsHint(), existing.SubjectTemplate, false),
textFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false), textAreaFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
} }
id := existing.ID id := existing.ID
return newForm("Edit SMTP alert", fields, func(vals []string) tea.Cmd { return newForm("Edit SMTP alert", fields, func(vals []string) tea.Cmd {
-6
View File
@@ -73,9 +73,3 @@ func renderState(s string) string {
}
}
- func renderLive(live bool) string {
- if live {
- return stateUpStyle.Render("● live")
- }
- return stateDownStyle.Render("● dead")
- }