Compare commits
12 Commits
v0.0.1-rc4
...
v0.0.2
| Author | SHA1 | Date | |
|---|---|---|---|
| 005be12dd1 | |||
| e48da30240 | |||
| b46c258e4e | |||
| 7bc33b1837 | |||
| 7b6acb20eb | |||
| e11b3f4547 | |||
| 6953709574 | |||
| 364ba222e2 | |||
| b029c0a25d | |||
| 3453bf5ec7 | |||
| acd55d145c | |||
| ebbbd8c218 |
@@ -1,14 +1,18 @@
|
|||||||
name: Container image
|
name: Container image
|
||||||
|
|
||||||
# Builds the multi-arch container image. On tag push (v*) it logs in
|
# Three modes, all driven by the same job:
|
||||||
# to the Gitea registry on this host and publishes the image as
|
# - Tag push (v*) → full release: :v1.2.3, :1.2, :latest, :sha-<short>
|
||||||
# git.cer.sh/<owner>/<repo>:<version> plus :latest. On pull requests
|
# - Branch push → canary: :<branch>, :sha-<short>
|
||||||
# it builds without pushing — purely a smoke test that the Dockerfile
|
# - Pull request → smoke test: build only, nothing pushed
|
||||||
# still works.
|
#
|
||||||
|
# metadata-action emits the right subset of tags for each event based
|
||||||
|
# on the `tags:` rules below — no manual branching needed.
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
|
branches:
|
||||||
|
- "**"
|
||||||
tags:
|
tags:
|
||||||
- 'v*'
|
- "v*"
|
||||||
pull_request:
|
pull_request:
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
@@ -19,42 +23,74 @@ jobs:
|
|||||||
image:
|
image:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# The default `ubuntu-latest` label on aether-runner maps to
|
# The default `ubuntu-latest` label on aether-runner maps to
|
||||||
# `node:16-bullseye`, which has no docker CLI — so the docker/*
|
# `node:16-bullseye`, which has no docker CLI. Override to an
|
||||||
# actions fail. Override the job container to catthehacker's
|
# act-compatible image that ships docker + buildx. The runner
|
||||||
# act-compatible image (ships docker CLI + buildx) and mount the
|
# already bind-mounts /var/run/docker.sock into every job
|
||||||
# host's docker socket through. The runner already has the socket
|
# container, so we do NOT add a `volumes:` entry — doing so
|
||||||
# bind-mounted from the host (see docker.yml gitea-runner volume),
|
# produces a duplicate-mount error from the daemon.
|
||||||
# so this exposes that same daemon to the nested job container.
|
|
||||||
container:
|
container:
|
||||||
image: catthehacker/ubuntu:act-latest
|
image: catthehacker/ubuntu:act-latest
|
||||||
volumes:
|
# aether-runner defaults `run:` blocks to POSIX `sh`, which
|
||||||
- /var/run/docker.sock:/var/run/docker.sock
|
# chokes on bash-isms like ${var,,} (lowercase) and ${var:0:7}
|
||||||
|
# (substring). Pin bash for the whole job.
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
shell: bash
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Set up QEMU
|
- name: Set up QEMU
|
||||||
uses: docker/setup-qemu-action@v3
|
uses: docker/setup-qemu-action@v3
|
||||||
|
with:
|
||||||
|
# Skip the GHA-cache lookup for the binfmt image. The Gitea
|
||||||
|
# runner has no GHA cache server, so the action would
|
||||||
|
# otherwise sit in a ~5-minute TCP timeout before falling
|
||||||
|
# back to a direct docker pull. Going straight to pull
|
||||||
|
# cuts QEMU setup from ~5 min to ~15 s.
|
||||||
|
cache-image: false
|
||||||
|
|
||||||
- name: Set up Buildx
|
- name: Set up Buildx
|
||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
# github.repository is owner/name with the repo's original casing;
|
# Registries want lowercase namespaces, and Gitea's container
|
||||||
# registries require lowercase, so normalise once here and reuse
|
# registry is case-sensitive on the login username too. Lowercase
|
||||||
# the result in metadata-action below.
|
# both repo path and actor once here and reuse below.
|
||||||
- name: Resolve image name
|
- name: Resolve image name
|
||||||
id: img
|
id: img
|
||||||
run: |
|
run: |
|
||||||
repo='${{ github.repository }}'
|
repo='${{ github.repository }}'
|
||||||
|
actor='${{ github.actor }}'
|
||||||
echo "ref=git.cer.sh/${repo,,}" >> "$GITHUB_OUTPUT"
|
echo "ref=git.cer.sh/${repo,,}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "user=${actor,,}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
# Version stamp baked into the binary via -ldflags. Tag pushes
|
||||||
|
# use the tag name directly; everything else gets a short SHA
|
||||||
|
# suffix so `qu version` on a canary build is debuggable.
|
||||||
|
- name: Compute version
|
||||||
|
id: ver
|
||||||
|
run: |
|
||||||
|
if [[ "$GITHUB_REF" == refs/tags/* ]]; then
|
||||||
|
v="${GITHUB_REF_NAME}"
|
||||||
|
else
|
||||||
|
v="${GITHUB_REF_NAME}-${GITHUB_SHA:0:7}"
|
||||||
|
fi
|
||||||
|
echo "version=$v" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
# Prefers a user-provided PAT (repo secret REGISTRY_TOKEN with
|
||||||
|
# `write:package` scope) and falls back to the auto-injected
|
||||||
|
# runner token. The auto-token works on Gitea >= 1.21 when the
|
||||||
|
# workflow declares `packages: write` in permissions, but if
|
||||||
|
# the registry still rejects it (older instance, container
|
||||||
|
# registry gated by config, etc.), REGISTRY_TOKEN takes over
|
||||||
|
# without any workflow edits.
|
||||||
- name: Login to Gitea registry
|
- name: Login to Gitea registry
|
||||||
if: github.event_name == 'push'
|
if: github.event_name == 'push'
|
||||||
uses: docker/login-action@v3
|
uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
registry: git.cer.sh
|
registry: git.cer.sh
|
||||||
username: ${{ github.actor }}
|
username: ${{ steps.img.outputs.user }}
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
password: ${{ secrets.REGISTRY_TOKEN || secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
- name: Docker metadata
|
- name: Docker metadata
|
||||||
id: meta
|
id: meta
|
||||||
@@ -65,18 +101,20 @@ jobs:
|
|||||||
type=semver,pattern={{version}}
|
type=semver,pattern={{version}}
|
||||||
type=semver,pattern={{major}}.{{minor}}
|
type=semver,pattern={{major}}.{{minor}}
|
||||||
type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/v') }}
|
type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/v') }}
|
||||||
|
type=ref,event=branch
|
||||||
|
type=sha,prefix=sha-,format=short
|
||||||
|
|
||||||
- name: Build (and push on tag)
|
- name: Build (and push on push events)
|
||||||
uses: docker/build-push-action@v6
|
uses: docker/build-push-action@v6
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
file: ./Dockerfile
|
file: ./docker/Dockerfile
|
||||||
platforms: linux/amd64,linux/arm64
|
platforms: linux/amd64,linux/arm64
|
||||||
push: ${{ github.event_name == 'push' }}
|
push: ${{ github.event_name == 'push' }}
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
build-args: |
|
build-args: |
|
||||||
VERSION=${{ github.ref_name }}
|
VERSION=${{ steps.ver.outputs.version }}
|
||||||
# Inline cache embeds layer metadata into the pushed image
|
# Inline cache embeds layer metadata into the pushed image
|
||||||
# itself — no external cache server needed, which keeps the
|
# itself — no external cache server needed, which keeps the
|
||||||
# workflow self-contained on the Gitea runner.
|
# workflow self-contained on the Gitea runner.
|
||||||
|
|||||||
@@ -0,0 +1,72 @@
|
|||||||
|
name: Container image
|
||||||
|
|
||||||
|
# Mirrors .gitea/workflows/container.yaml — publishes a multi-arch
|
||||||
|
# (amd64 + arm64) image to the GitHub Container Registry whenever the
|
||||||
|
# Gitea→GitHub mirror pushes a `v*` tag. Image lands at
|
||||||
|
# ghcr.io/axodouble/quptime with tags :vX.Y.Z, :X.Y, and :latest.
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
packages: write
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
image:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up QEMU
|
||||||
|
uses: docker/setup-qemu-action@v3
|
||||||
|
|
||||||
|
- name: Set up Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
# GHCR namespaces must be lowercase. Lowercase the repository
|
||||||
|
# path once and reuse below so a mixed-case org/repo (e.g.
|
||||||
|
# Axodouble/QUptime) still resolves to a valid image reference.
|
||||||
|
- name: Resolve image name
|
||||||
|
id: img
|
||||||
|
run: |
|
||||||
|
repo='${{ github.repository }}'
|
||||||
|
echo "ref=ghcr.io/${repo,,}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Compute version
|
||||||
|
id: ver
|
||||||
|
run: |
|
||||||
|
echo "version=${GITHUB_REF_NAME}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Login to GHCR
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: ghcr.io
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Docker metadata
|
||||||
|
id: meta
|
||||||
|
uses: docker/metadata-action@v5
|
||||||
|
with:
|
||||||
|
images: ${{ steps.img.outputs.ref }}
|
||||||
|
tags: |
|
||||||
|
type=semver,pattern={{version}}
|
||||||
|
type=semver,pattern={{major}}.{{minor}}
|
||||||
|
type=raw,value=latest
|
||||||
|
|
||||||
|
- name: Build and push
|
||||||
|
uses: docker/build-push-action@v6
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: ./docker/Dockerfile
|
||||||
|
platforms: linux/amd64,linux/arm64
|
||||||
|
push: true
|
||||||
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
build-args: |
|
||||||
|
VERSION=${{ steps.ver.outputs.version }}
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
name: Release
|
||||||
|
|
||||||
|
# Mirrors .gitea/workflows/release.yaml — fires when the Gitea→GitHub
|
||||||
|
# mirror pushes a `v*` tag, builds static Linux binaries for amd64 +
|
||||||
|
# arm64, and publishes them to GitHub Releases alongside the Gitea
|
||||||
|
# release the same tag produces upstream.
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
release:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Go
|
||||||
|
uses: actions/setup-go@v5
|
||||||
|
with:
|
||||||
|
go-version: '1.24'
|
||||||
|
check-latest: false
|
||||||
|
cache: false
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
run: go test -race ./...
|
||||||
|
|
||||||
|
- name: Build binaries
|
||||||
|
env:
|
||||||
|
CGO_ENABLED: '0'
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
VERSION="${GITHUB_REF_NAME}"
|
||||||
|
mkdir -p dist
|
||||||
|
for arch in amd64 arm64; do
|
||||||
|
out="dist/qu-${VERSION}-linux-${arch}"
|
||||||
|
echo "building ${out}"
|
||||||
|
GOOS=linux GOARCH="${arch}" \
|
||||||
|
go build \
|
||||||
|
-trimpath \
|
||||||
|
-ldflags "-s -w -X main.version=${VERSION}" \
|
||||||
|
-o "${out}" \
|
||||||
|
./cmd/qu
|
||||||
|
done
|
||||||
|
(cd dist && sha256sum qu-* > SHA256SUMS)
|
||||||
|
ls -lh dist
|
||||||
|
|
||||||
|
- name: Publish release
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
files: |
|
||||||
|
dist/qu-*
|
||||||
|
dist/SHA256SUMS
|
||||||
|
fail_on_unmatched_files: true
|
||||||
|
generate_release_notes: true
|
||||||
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
+34
@@ -0,0 +1,34 @@
|
|||||||
|
# Build artifacts
|
||||||
|
/qu
|
||||||
|
/qu-*
|
||||||
|
/dist/
|
||||||
|
*.exe
|
||||||
|
*.test
|
||||||
|
*.out
|
||||||
|
|
||||||
|
# Go workspace / module cache (only relevant if vendored)
|
||||||
|
/vendor/
|
||||||
|
|
||||||
|
# Local node state — never commit anything that looks like a data dir
|
||||||
|
/quptime/
|
||||||
|
/etc/quptime/
|
||||||
|
node.yaml
|
||||||
|
cluster.yaml
|
||||||
|
trust.yaml
|
||||||
|
keys/
|
||||||
|
|
||||||
|
# Compose / secrets
|
||||||
|
.env
|
||||||
|
.env.local
|
||||||
|
*.local.yml
|
||||||
|
*.local.yaml
|
||||||
|
|
||||||
|
# Editor / OS scratch
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
# Test / coverage
|
||||||
|
coverage.out
|
||||||
|
coverage.html
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
# Changelog
|
||||||
|
|
||||||
|
All notable changes to this project are documented here. The format
|
||||||
|
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
|
||||||
|
this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [v0.0.1] — 2026-05-15
|
||||||
|
|
||||||
|
Initial public release.
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- **Quorum-based uptime monitoring.** Multiple cooperating nodes run
|
||||||
|
the same probes (HTTP, TCP, ICMP) and vote on the cluster-wide
|
||||||
|
truth. A check flips state only after two consecutive aggregate
|
||||||
|
evaluations agree (hysteresis), so single-node flake doesn't page
|
||||||
|
anyone.
|
||||||
|
- **Deterministic master election.** Among the live members of the
|
||||||
|
quorum the lexicographically smallest NodeID wins — no negotiation
|
||||||
|
step, no split-brain window.
|
||||||
|
- **mTLS inter-node transport** with TLS 1.3 minimum, SSH-style
|
||||||
|
fingerprint pinning, and a pre-shared `cluster_secret` gating the
|
||||||
|
Join RPC.
|
||||||
|
- **Replicated `cluster.yaml`** carrying peers, checks, and alerts.
|
||||||
|
Master is the only writer; followers receive monotonic-versioned
|
||||||
|
snapshots and converge on the latest. Hand-edits to the file on any
|
||||||
|
node are picked up by the manual-edit watcher and forwarded through
|
||||||
|
the master.
|
||||||
|
- **HTTP, TCP, and ICMP probes** with configurable interval,
|
||||||
|
timeout, expected status, and optional body-substring match. ICMP
|
||||||
|
defaults to unprivileged UDP-mode pings so the daemon can run as a
|
||||||
|
non-root user.
|
||||||
|
- **SMTP and Discord alerts** with optional Go `text/template`
|
||||||
|
subject/body overrides per alert, default-attach mode (`default:
|
||||||
|
true`), and per-check opt-outs via `suppress_alert_ids`.
|
||||||
|
- **Docker-friendly env-var configuration.** Every field in
|
||||||
|
`node.yaml` can also be supplied via a `QUPTIME_*` environment
|
||||||
|
variable; `qu serve` auto-initialises a fresh data volume from
|
||||||
|
these on first start, so `docker compose up` is enough to launch a
|
||||||
|
node.
|
||||||
|
- **Interactive TUI** (`qu tui`) for peers, checks, and alerts with
|
||||||
|
live refresh.
|
||||||
|
- **Hardened systemd unit** shipped via `install.sh`: dedicated
|
||||||
|
`quptime` user, `ProtectSystem=strict`, all capabilities dropped by
|
||||||
|
default.
|
||||||
|
- **Multi-arch Docker images** (`linux/amd64`, `linux/arm64`)
|
||||||
|
published to `git.cer.sh/axodouble/quptime` (primary) and
|
||||||
|
`ghcr.io/axodouble/quptime` (GitHub push-mirror) on every tag.
|
||||||
|
- **Static Linux binaries** (`amd64`, `arm64`) published per tag with
|
||||||
|
a `SHA256SUMS` file to both Gitea Releases (primary) and GitHub
|
||||||
|
Releases (mirror). The official installer prefers Gitea, falls back
|
||||||
|
to GitHub on failure, and verifies the checksum before placing the
|
||||||
|
binary on disk.
|
||||||
|
|
||||||
|
### Security
|
||||||
|
|
||||||
|
- Cluster secret is compared in constant time
|
||||||
|
(`crypto/subtle.ConstantTimeCompare`).
|
||||||
|
- Self-signed RSA certs minted at `qu init`; SPKI SHA-256
|
||||||
|
fingerprints are what's pinned, matching the canonical OpenSSL
|
||||||
|
representation.
|
||||||
|
- Private keys are written with mode `0600`; data and runtime
|
||||||
|
directories with `0700`/`0750`.
|
||||||
|
- All `cluster.yaml` writes go through an atomic `tmpfile + rename`.
|
||||||
|
- `install.sh` downloads the published `SHA256SUMS` and refuses to
|
||||||
|
install if the downloaded binary doesn't match.
|
||||||
|
|
||||||
|
### Known limitations
|
||||||
|
|
||||||
|
- **Cluster-wide secret distribution.** SMTP passwords and Discord
|
||||||
|
webhook URLs configured via `qu alert add …` are stored in
|
||||||
|
`cluster.yaml`, which is replicated to every node. Treat every node
|
||||||
|
as having read access to every alert credential. Restrict who can
|
||||||
|
reach the data directory accordingly. See
|
||||||
|
[docs/security.md](docs/security.md) for the threat model.
|
||||||
|
- **No automatic key rotation.** Rolling a node's identity means
|
||||||
|
wiping its data directory, running `qu init` again, and re-adding
|
||||||
|
it from another node.
|
||||||
|
- **No historical metrics.** Only the current aggregate state is kept
|
||||||
|
in memory. There is no built-in graph store, SLA calculator, or
|
||||||
|
audit log.
|
||||||
|
- **Master-flap state.** Aggregator hysteresis state lives in
|
||||||
|
memory on the current master. When leadership changes the new
|
||||||
|
master starts from `StateUnknown` and re-accumulates hysteresis —
|
||||||
|
expect a few seconds of delayed alerting after a master switch.
|
||||||
|
- **No release signing beyond SHA256SUMS** (no cosign / GPG).
|
||||||
|
Planned for a future release.
|
||||||
|
|
||||||
|
[v0.0.1]: https://git.cer.sh/axodouble/quptime/releases/tag/v0.0.1
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2026 Jasper V.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
@@ -14,12 +14,37 @@ trust — no central CA, no shared secret.
|
|||||||
|
|
||||||
### From pre-built binary
|
### From pre-built binary
|
||||||
|
|
||||||
This can be done in one step, either by downloading the latest release from
|
The canonical home is Gitea; the repo is push-mirrored to GitHub on
|
||||||
the [Gitea releases page](https://git.cer.sh/axodouble/quptime/releases) or by running the following script:
|
every tag. Releases and multi-arch container images are published to
|
||||||
|
both.
|
||||||
|
|
||||||
|
| Source | Releases | Container image |
|
||||||
|
| ---------------- | ------------------------------------------------------------ | -------------------------------- |
|
||||||
|
| Gitea (primary) | <https://git.cer.sh/axodouble/quptime/releases> | `git.cer.sh/axodouble/quptime` |
|
||||||
|
| GitHub (mirror) | <https://github.com/Axodouble/QUptime/releases> | `ghcr.io/axodouble/quptime` |
|
||||||
|
|
||||||
|
One-step install — tries Gitea first, falls back to GitHub automatically:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
curl -fsSL https://git.cer.sh/Axodouble/QUptime/raw/branch/master/install.sh | sudo bash
|
curl -fsSL https://git.cer.sh/Axodouble/QUptime/raw/branch/master/install.sh | sudo bash
|
||||||
|
# or, via the GitHub mirror:
|
||||||
|
# curl -fsSL https://raw.githubusercontent.com/Axodouble/QUptime/master/install.sh | sudo bash
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The script verifies the binary against the published `SHA256SUMS`
|
||||||
|
before installing and refuses to proceed on a mismatch.
|
||||||
|
|
||||||
|
### From Docker
|
||||||
|
|
||||||
|
```sh
|
||||||
|
docker pull git.cer.sh/axodouble/quptime:latest
|
||||||
|
# or, via the GitHub mirror:
|
||||||
|
# docker pull ghcr.io/axodouble/quptime:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
See [docs/deployment/docker.md](docs/deployment/docker.md) for compose
|
||||||
|
recipes.
|
||||||
|
|
||||||
## Why
|
## Why
|
||||||
|
|
||||||
Most uptime monitors are either a SaaS or a single box that, by
|
Most uptime monitors are either a SaaS or a single box that, by
|
||||||
@@ -27,6 +52,23 @@ definition, can't tell you when it's the one that's down. `qu` solves
|
|||||||
both: run it on a few cheap hosts in different networks and they vote
|
both: run it on a few cheap hosts in different networks and they vote
|
||||||
on truth. If one of them loses its uplink, the rest keep alerting.
|
on truth. If one of them loses its uplink, the rest keep alerting.
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
This README is the quick-start. For production use, the longer guides
|
||||||
|
live under [`docs/`](docs/README.md):
|
||||||
|
|
||||||
|
| If you want to… | Read |
|
||||||
|
| ----------------------------------------------------- | ------------------------------------------------------------------ |
|
||||||
|
| understand the consensus / replication model | [docs/architecture.md](docs/architecture.md) |
|
||||||
|
| reference every field in `node.yaml` / `cluster.yaml` | [docs/configuration.md](docs/configuration.md) |
|
||||||
|
| deploy on Linux with systemd hardening | [docs/deployment/systemd.md](docs/deployment/systemd.md) |
|
||||||
|
| deploy with Docker / docker-compose | [docs/deployment/docker.md](docs/deployment/docker.md) |
|
||||||
|
| deploy over Tailscale or WireGuard | [docs/deployment/tailscale.md](docs/deployment/tailscale.md) |
|
||||||
|
| expose `qu` on the open internet safely | [docs/deployment/public-internet.md](docs/deployment/public-internet.md) |
|
||||||
|
| upgrade, back up, or recover from failures | [docs/operations.md](docs/operations.md) |
|
||||||
|
| understand the trust model and rotate identities | [docs/security.md](docs/security.md) |
|
||||||
|
| diagnose a misbehaving cluster | [docs/troubleshooting.md](docs/troubleshooting.md) |
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -71,7 +113,7 @@ go build -o qu ./cmd/qu
|
|||||||
To stamp the version into the binary:
|
To stamp the version into the binary:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
go build -ldflags "-X main.version=v0.1.0" -o qu ./cmd/qu
|
go build -ldflags "-X main.version=v0.0.1" -o qu ./cmd/qu
|
||||||
qu --version
|
qu --version
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -83,7 +125,7 @@ amd64 and arm64, and publishes them as a Gitea release with a
|
|||||||
`SHA256SUMS` file alongside.
|
`SHA256SUMS` file alongside.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
git tag v0.1.0
|
git tag v0.0.1
|
||||||
git push --tags
|
git push --tags
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -149,6 +191,15 @@ c0d4... charlie.example.com:9901 true 2026-05-12T15:01:32Z
|
|||||||
|
|
||||||
## Adding checks and alerts
|
## Adding checks and alerts
|
||||||
|
|
||||||
|
> ⚠️ **Alert credentials are replicated cluster-wide.** SMTP passwords
|
||||||
|
> and Discord webhook URLs live in `cluster.yaml`, which is mirrored to
|
||||||
|
> every node. Any node that can read its own data directory can read
|
||||||
|
> every alert secret. Treat compromising one node as compromising every
|
||||||
|
> alert credential, and restrict who can reach `$QUPTIME_DIR` on each
|
||||||
|
> host (the hardened systemd unit and the Docker image both default to
|
||||||
|
> `0700`/`0750`). See [docs/security.md](docs/security.md) for the full
|
||||||
|
> threat model.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# alerts first so checks can reference them
|
# alerts first so checks can reference them
|
||||||
qu alert add discord oncall --webhook https://discord.com/api/webhooks/...
|
qu alert add discord oncall --webhook https://discord.com/api/webhooks/...
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# An example of a docker compose with Tailscale & QUptime.
|
||||||
|
# This setup is specifically intended for hosts that may not be able to
|
||||||
|
# reach each other directly or have a public IP address.
|
||||||
|
#
|
||||||
|
# Bring it up with `docker compose -f docker-compose-tailscale.yml up -d`.
|
||||||
|
# QUptime auto-initialises on first start using the QUPTIME_* env vars
|
||||||
|
# below — no separate `qu init` step is required.
|
||||||
|
#
|
||||||
|
# On the first node, omit QUPTIME_CLUSTER_SECRET to have one generated
|
||||||
|
# for you. Read it out of the logs (`docker logs quptime`) and copy it
|
||||||
|
# into the .env of every other node before bringing them up.
|
||||||
|
|
||||||
|
services:
|
||||||
|
tailscale:
|
||||||
|
image: tailscale/tailscale:latest
|
||||||
|
container_name: tailscale
|
||||||
|
cap_add:
|
||||||
|
- NET_ADMIN
|
||||||
|
environment:
|
||||||
|
- TS_AUTHKEY=${TAILSCALE_AUTHKEY} # Set this in your .env file with a Tailscale auth key
|
||||||
|
- TS_HOSTNAME=quptime-tailscale
|
||||||
|
volumes:
|
||||||
|
- /dev/net/tun:/dev/net/tun
|
||||||
|
- tailscale:/var/lib/tailscale
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
quptime:
|
||||||
|
image: git.cer.sh/axodouble/quptime:latest
|
||||||
|
container_name: quptime
|
||||||
|
environment:
|
||||||
|
# host:port other QUptime nodes use to reach this one. Use the
|
||||||
|
# Tailscale IP / MagicDNS name of this host. Required behind NAT.
|
||||||
|
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE}
|
||||||
|
# Shared cluster join secret. Set on every node. Leave unset on
|
||||||
|
# the very first node — one will be generated and logged for you
|
||||||
|
# to copy to the others. Followers MUST set this before starting.
|
||||||
|
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
|
||||||
|
# Optional: pin a port other than the default 9901.
|
||||||
|
# - QUPTIME_BIND_PORT=9901
|
||||||
|
volumes:
|
||||||
|
- quptime:/etc/quptime
|
||||||
|
ports:
|
||||||
|
- "9901:9901"
|
||||||
|
depends_on:
|
||||||
|
- tailscale
|
||||||
|
network_mode: "service:tailscale" # Use the Tailscale network stack
|
||||||
|
restart: unless-stopped
|
||||||
|
# After this node is up, add peers from the master with:
|
||||||
|
# docker compose -f docker-compose-tailscale.yml exec quptime \
|
||||||
|
# qu node add <OTHER_NODE_TAILSCALE_IP>:9901
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
tailscale:
|
||||||
|
quptime:
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
# QUptime documentation
|
||||||
|
|
||||||
|
Production-oriented documentation for `qu`, a small distributed uptime
|
||||||
|
monitor that votes on the health of HTTP/TCP/ICMP targets across a
|
||||||
|
cluster of cooperating nodes.
|
||||||
|
|
||||||
|
The top-level `README.md` is the marketing pitch and quick-start. The
|
||||||
|
pages here go deeper and are organised by what you're trying to do.
|
||||||
|
|
||||||
|
## Getting set up
|
||||||
|
|
||||||
|
- [Installation](installation.md) — pre-built binaries, building from
|
||||||
|
source, verifying release artifacts, what the install script does.
|
||||||
|
- [Configuration](configuration.md) — `node.yaml`, `cluster.yaml`,
|
||||||
|
`trust.yaml`, environment variables, file layout, defaults.
|
||||||
|
|
||||||
|
## Running it
|
||||||
|
|
||||||
|
- [Architecture](architecture.md) — how nodes form quorum, how a master
|
||||||
|
is elected, how cluster state replicates, what happens during a
|
||||||
|
partition, and exactly which guarantees the design gives you.
|
||||||
|
- [Operations](operations.md) — day-2 tasks: upgrades, backups,
|
||||||
|
recovery from a lost node, recovery from a lost quorum, monitoring
|
||||||
|
`qu` itself.
|
||||||
|
- [Security](security.md) — the mTLS / TOFU trust model, what the
|
||||||
|
cluster secret protects, how to rotate keys, what to put on a public
|
||||||
|
network and what not to.
|
||||||
|
- [Troubleshooting](troubleshooting.md) — common failure modes with
|
||||||
|
the log lines you'll see and the fix.
|
||||||
|
|
||||||
|
## Deployment recipes
|
||||||
|
|
||||||
|
Pick the one that matches your environment. They share most of the
|
||||||
|
operational guidance — what differs is how `qu` is packaged and how
|
||||||
|
the inter-node link is secured at the network layer.
|
||||||
|
|
||||||
|
- [systemd on bare metal / VM](deployment/systemd.md) — single static
|
||||||
|
binary, hardened unit file, `CAP_NET_RAW` for ICMP.
|
||||||
|
- [Docker / docker-compose](deployment/docker.md) — official image,
|
||||||
|
single-node and multi-node compose files, persistent volumes.
|
||||||
|
- [Tailscale / WireGuard overlay](deployment/tailscale.md) — nodes in
|
||||||
|
separate networks with no public ingress; cluster traffic stays on
|
||||||
|
the tailnet.
|
||||||
|
- [Public-internet exposure](deployment/public-internet.md) — when
|
||||||
|
you have no overlay and `:9901` is reachable from the open
|
||||||
|
internet: firewalling, rate-limiting, secret hygiene.
|
||||||
|
|
||||||
|
## A note on stability
|
||||||
|
|
||||||
|
The wire protocol (`internal/transport`) and the on-disk format
|
||||||
|
(`cluster.yaml`, `node.yaml`, `trust.yaml`) are considered stable
|
||||||
|
within a minor version. Breaking changes will bump the major version
|
||||||
|
and ship with a migration note.
|
||||||
@@ -0,0 +1,196 @@
|
|||||||
|
# Architecture
|
||||||
|
|
||||||
|
This page is the long-form companion to the diagram in the top-level
|
||||||
|
README. Read it if you need to reason about partitions, recovery,
|
||||||
|
upgrade ordering, or the consistency guarantees of `qu`.
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
A running `qu serve` is one process containing five long-lived
|
||||||
|
goroutines plus the listeners:
|
||||||
|
|
||||||
|
| Component | Package | Role |
|
||||||
|
| --------------- | ------------------------ | ------------------------------------------------------------------------ |
|
||||||
|
| Transport | `internal/transport` | mTLS listener + dialer, length-prefixed JSON-RPC framing. |
|
||||||
|
| Quorum manager | `internal/quorum` | 1 Hz heartbeats, liveness tracking, deterministic master election. |
|
||||||
|
| Replicator | `internal/replicate` | Master-routed mutations, version-gated broadcast and pull. |
|
||||||
|
| Scheduler | `internal/checks` | One goroutine per check; runs HTTP/TCP/ICMP probes on each node. |
|
||||||
|
| Aggregator | `internal/checks` | Master-only. Folds per-node probe results into a cluster-wide verdict. |
|
||||||
|
| Alert dispatch | `internal/alerts` | Master-only. Renders templates and ships SMTP / Discord notifications. |
|
||||||
|
| Control socket | `internal/daemon` | Local-only unix socket; the CLI and TUI talk to the daemon through it. |
|
||||||
|
|
||||||
|
Every node runs every component. Whether the master-only ones actually
|
||||||
|
*do* anything depends on the result of master election.
|
||||||
|
|
||||||
|
## Trust and transport
|
||||||
|
|
||||||
|
Inter-node traffic is TLS 1.3 with mutual authentication. There is **no
|
||||||
|
central CA**. Each node generates a self-signed RSA cert at `qu init`
|
||||||
|
and the SPKI fingerprint of that cert is what other nodes pin against.
|
||||||
|
|
||||||
|
Two layers gate access:
|
||||||
|
|
||||||
|
1. **TLS layer** accepts any client cert. This avoids a chicken-and-egg problem
|
||||||
|
during bootstrap — a brand-new node has no entry in anyone's trust
|
||||||
|
store yet, so a strict TLS check would refuse the very first
|
||||||
|
handshake.
|
||||||
|
2. **RPC dispatcher** rejects every method except `Join` for callers
|
||||||
|
whose presented fingerprint is not in `trust.yaml`. So an untrusted
|
||||||
|
peer can knock on the door but cannot ask questions.
|
||||||
|
|
||||||
|
`Join` itself is gated by the **cluster secret** — a pre-shared base64
|
||||||
|
string generated at `qu init` on the first node. Without it, an
|
||||||
|
attacker who can reach `:9901` cannot enrol themselves into the
|
||||||
|
cluster.
|
||||||
|
|
||||||
|
The local CLI talks to the daemon over a unix socket with `0600`
|
||||||
|
permissions; filesystem ACLs are the only authentication and no TLS is
|
||||||
|
used on that channel.
|
||||||
|
|
||||||
|
## The replicated state machine
|
||||||
|
|
||||||
|
`cluster.yaml` is the single replicated source of truth. It holds three
|
||||||
|
editable lists — `peers`, `checks`, `alerts` — plus three
|
||||||
|
server-controlled fields:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
version: 7 # monotonically increasing
|
||||||
|
updated_at: 2026-05-15T...
|
||||||
|
updated_by: <node-id> # master that committed this version
|
||||||
|
peers: [...]
|
||||||
|
checks: [...]
|
||||||
|
alerts: [...]
|
||||||
|
```
|
||||||
|
|
||||||
|
### How mutations flow
|
||||||
|
|
||||||
|
1. The CLI (or the manual-edit watcher; see below) issues a mutation
|
||||||
|
on the local daemon's control socket.
|
||||||
|
2. The daemon's replicator looks at the current quorum view:
|
||||||
|
- If there is no quorum, the mutation fails loudly with
|
||||||
|
`no quorum: refusing mutation`.
|
||||||
|
- If this node is the master, apply locally and broadcast.
|
||||||
|
- Otherwise, ship the mutation to the master via the
|
||||||
|
`ProposeMutation` RPC and wait for the result.
|
||||||
|
3. The master holds the cluster lock, applies the mutation, bumps
|
||||||
|
`version`, writes `cluster.yaml` atomically, and broadcasts the new
|
||||||
|
snapshot to every peer via `ApplyClusterCfg`.
|
||||||
|
4. Each follower's `Replace` accepts the snapshot **only if**
|
||||||
|
`incoming.Version > local.Version`. Older or equal versions are
|
||||||
|
dropped silently.
|
||||||
|
|
||||||
|
The mutation kinds are enumerated in `internal/transport/messages.go`:
|
||||||
|
`add_check`, `remove_check`, `add_alert`, `remove_alert`, `add_peer`,
|
||||||
|
`remove_peer`, `replace_config`.
|
||||||
|
|
||||||
|
### Manual edits to `cluster.yaml`
|
||||||
|
|
||||||
|
Operators can `sudoedit /etc/quptime/cluster.yaml` on any node. Every
|
||||||
|
2 seconds the daemon hashes the file. When the on-disk hash diverges
|
||||||
|
from the last hash the daemon wrote, the new content is parsed and
|
||||||
|
forwarded to the master as a `replace_config` mutation. So a hand-edit
|
||||||
|
on a follower still ends up on the master, version-bumped, and
|
||||||
|
broadcast everywhere.
|
||||||
|
|
||||||
|
If the parse fails (invalid YAML), the daemon logs and pins the bad
|
||||||
|
hash so it doesn't loop. The operator's next valid save unblocks it.
|
||||||
|
|
||||||
|
## Quorum and master election
|
||||||
|
|
||||||
|
Every node sends a heartbeat to every peer once per second. A peer is
|
||||||
|
**live** if a heartbeat (sent or received) was observed within the
|
||||||
|
last 4 seconds — comfortably more than three missed beats so a one-tick
|
||||||
|
blip does not unseat the master.
|
||||||
|
|
||||||
|
**Quorum** is met when `len(live_peers) >= floor(N/2) + 1` where `N`
|
||||||
|
is the total peer count in `cluster.yaml`. Below quorum, the cluster
|
||||||
|
refuses every mutation; existing checks continue probing locally but no
|
||||||
|
state transitions are committed (the master is the only one who
|
||||||
|
aggregates, and there is no master).
|
||||||
|
|
||||||
|
**Master election** is deterministic with no negotiation step: among
|
||||||
|
the live members, the master is the one with the lexicographically
|
||||||
|
smallest `NodeID`. Every node that observes the same live set picks the
|
||||||
|
same master — so there is no split-brain window even during a partial
|
||||||
|
partition.
|
||||||
|
|
||||||
|
The `term` integer in `qu status` is bumped every time the elected
|
||||||
|
master changes (including transitions to and from "no master"). Use it
|
||||||
|
to spot flappy clusters.
|
||||||
|
|
||||||
|
## Catch-up when a node reconnects
|
||||||
|
|
||||||
|
This is the scenario most people ask about: node C is offline, the
|
||||||
|
master commits config version 7, node C comes back online. What
|
||||||
|
happens?
|
||||||
|
|
||||||
|
1. Node C's tick loop fires heartbeats every second regardless of its
|
||||||
|
previous state. There is no backoff, no give-up.
|
||||||
|
2. Each heartbeat carries the sender's `Version`. Each response carries
|
||||||
|
the responder's `Version`.
|
||||||
|
3. The first time C sees a peer reporting a higher version than its
|
||||||
|
own, the version-observer fires and calls
|
||||||
|
`replicator.PullFrom(peerID, addr)`.
|
||||||
|
4. `PullFrom` does a `GetClusterCfg` RPC against that peer and feeds
|
||||||
|
the snapshot through `Replace`, which writes `cluster.yaml`
|
||||||
|
atomically and refreshes the on-disk hash so the manual-edit
|
||||||
|
watcher doesn't re-fire.
|
||||||
|
5. Within ~1 heartbeat C is byte-for-byte identical to the master.
|
||||||
|
|
||||||
|
The same path catches a stale node up when the partition heals on the
|
||||||
|
minority side: the minority side cannot mutate, so when it rejoins it
|
||||||
|
strictly has the older version, and the pull fires.
|
||||||
|
|
||||||
|
There is one corner case worth knowing about: the pull only fires when
|
||||||
|
`peer_version > local_version`. Two nodes at the same version with
|
||||||
|
different content would silently diverge — but the design forbids
|
||||||
|
that (only the master mutates, and the master is the only one bumping
|
||||||
|
the version) unless somebody hand-edits `cluster.yaml` and also
|
||||||
|
manually sets `version:`. Don't do that.
|
||||||
|
|
||||||
|
## Why a check flips state
|
||||||
|
|
||||||
|
The aggregator runs on the master only. Followers' probe results are
|
||||||
|
shipped to the master via the `ReportResult` RPC; the master's own
|
||||||
|
probe results are submitted directly.
|
||||||
|
|
||||||
|
For each check, the aggregator keeps the latest result per node within
|
||||||
|
a freshness window (3× the check interval, minimum 30s). On each
|
||||||
|
incoming submission it counts OK vs not-OK across the fresh results:
|
||||||
|
|
||||||
|
- 0 fresh reports → `unknown`
|
||||||
|
- more OK than not-OK → `up`
|
||||||
|
- more not-OK than OK → `down`
|
||||||
|
- tie → `up` (the simplest tie is one node saying yes and one saying
|
||||||
|
no; biasing toward `up` avoids false alerts when nodes disagree
|
||||||
|
transiently).
|
||||||
|
|
||||||
|
A state flip is **not** committed immediately. Hysteresis requires the
|
||||||
|
candidate state to hold for **two consecutive aggregate evaluations**
|
||||||
|
before the state transition fires and the alert dispatcher is called.
|
||||||
|
Set in `internal/checks/aggregator.go` as the `HysteresisCount`
|
||||||
|
constant — change it there if you want a hair-trigger or a slower
|
||||||
|
alert.
|
||||||
|
|
||||||
|
If the master changes, the new master starts the per-check state from
|
||||||
|
`unknown` and rebuilds it as fresh results arrive. The first few
|
||||||
|
seconds after a re-election can therefore show `unknown` even for
|
||||||
|
checks that were `up` a moment ago.
|
||||||
|
|
||||||
|
## What `qu` does *not* do
|
||||||
|
|
||||||
|
These omissions are intentional in v1 and useful to know up front:
|
||||||
|
|
||||||
|
- **No persistent history.** Only the current aggregate state lives in
|
||||||
|
memory. There are no graphs, no SLA reports. Add a sidecar (Prometheus
|
||||||
|
exporter, SQLite logger) if you need them.
|
||||||
|
- **No automatic key rotation.** Re-init a node and re-trust if you
|
||||||
|
need to roll its identity. See [security.md](security.md).
|
||||||
|
- **No multi-tenant isolation.** One cluster = one set of checks =
|
||||||
|
one alert tree.
|
||||||
|
- **No web UI.** Operator surface is `qu` (CLI), `qu tui`, and direct
|
||||||
|
edits to `cluster.yaml`.
|
||||||
|
- **No automatic peer eviction on prolonged downtime.** A dead peer
|
||||||
|
stays in `cluster.yaml` until an operator runs `qu node remove`,
|
||||||
|
because that decision affects the quorum size and shouldn't happen
|
||||||
|
silently.
|
||||||
@@ -0,0 +1,318 @@
|
|||||||
|
# Configuration
|
||||||
|
|
||||||
|
This page is the canonical reference for the on-disk files, the
|
||||||
|
environment variables, and every field that `qu` reads. It's
|
||||||
|
deliberately tedious — when something doesn't behave the way you
|
||||||
|
expect, this is where the answer lives.
|
||||||
|
|
||||||
|
## File layout
|
||||||
|
|
||||||
|
When running as **root** (the typical case under systemd):
|
||||||
|
|
||||||
|
```
|
||||||
|
/etc/quptime/
|
||||||
|
├── node.yaml identity, never replicated
|
||||||
|
├── cluster.yaml replicated state
|
||||||
|
├── trust.yaml local fingerprint trust store
|
||||||
|
└── keys/
|
||||||
|
├── private.pem RSA private key (0600)
|
||||||
|
├── public.pem RSA public key
|
||||||
|
└── cert.pem self-signed X.509 cert
|
||||||
|
|
||||||
|
/var/run/quptime/quptime.sock control socket (0600)
|
||||||
|
```
|
||||||
|
|
||||||
|
When running as a **non-root** user (the typical case for `go run` or a
|
||||||
|
desktop test):
|
||||||
|
|
||||||
|
```
|
||||||
|
~/.config/quptime/... same shape as /etc/quptime
|
||||||
|
$XDG_RUNTIME_DIR/quptime/quptime.sock control socket
|
||||||
|
```
|
||||||
|
|
||||||
|
Override the data directory with `QUPTIME_DIR=/some/path qu serve`.
|
||||||
|
Override the socket path with `QUPTIME_SOCKET=/run/foo.sock`.
|
||||||
|
|
||||||
|
## Environment variables
|
||||||
|
|
||||||
|
### Paths
|
||||||
|
|
||||||
|
| Variable | Purpose |
|
||||||
|
| ----------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `QUPTIME_DIR` | Data directory. Defaults to `/etc/quptime` (root) or `$XDG_CONFIG_HOME/quptime`. |
|
||||||
|
| `QUPTIME_SOCKET` | Path to the CLI ↔ daemon unix socket. Defaults to `/var/run/quptime/quptime.sock` (root) or `$XDG_RUNTIME_DIR/quptime/…`. |
|
||||||
|
| `XDG_CONFIG_HOME` | Honored when running as non-root and `QUPTIME_DIR` is unset. |
|
||||||
|
| `XDG_RUNTIME_DIR` | Honored when running as non-root and `QUPTIME_SOCKET` is unset. |
|
||||||
|
|
||||||
|
### `node.yaml` field overrides
|
||||||
|
|
||||||
|
Every field in `node.yaml` can also be supplied via an environment
|
||||||
|
variable. This is the recommended way to drive Docker / Compose
|
||||||
|
deployments: drop the env vars into the compose file and the daemon
|
||||||
|
will bootstrap on first start without a separate `qu init` step.
|
||||||
|
|
||||||
|
| Variable | `node.yaml` field | Notes |
|
||||||
|
| ------------------------ | ----------------- | -------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `QUPTIME_NODE_ID` | `node_id` | Pin a specific UUID. Leave unset to let `qu init` / auto-init generate one. |
|
||||||
|
| `QUPTIME_BIND_ADDR` | `bind_addr` | Defaults to `0.0.0.0`. |
|
||||||
|
| `QUPTIME_BIND_PORT` | `bind_port` | Integer. Defaults to `9901`. |
|
||||||
|
| `QUPTIME_ADVERTISE` | `advertise` | `host:port` other peers use to reach this node. Required when bound to a wildcard or behind NAT. |
|
||||||
|
| `QUPTIME_CLUSTER_SECRET` | `cluster_secret` | Pre-shared join secret. Set the same value on every node. If unset on the very first node, one is generated. |
|
||||||
|
|
||||||
|
Precedence is **env > file > compiled default**. Non-empty env values
|
||||||
|
win over whatever is stored in `node.yaml` at load time, so changing a
|
||||||
|
variable in `docker-compose.yml` and restarting the container is
|
||||||
|
enough to roll out new bind/advertise values — no on-disk edit
|
||||||
|
required. Empty env values are ignored (they will not clear a
|
||||||
|
previously persisted field).
|
||||||
|
|
||||||
|
For `qu init` specifically, explicit command-line flags take
|
||||||
|
precedence over env values; env values fill in only the fields the
|
||||||
|
operator did not pass on the command line.
|
||||||
|
|
||||||
|
The daemon does not read any other environment variables. SMTP, Discord,
|
||||||
|
and HTTP probe targets are configured exclusively in `cluster.yaml`.
|
||||||
|
|
||||||
|
## Auto-init on `qu serve`
|
||||||
|
|
||||||
|
If `node.yaml` does not exist when `qu serve` starts, the daemon
|
||||||
|
bootstraps it in-place using the `QUPTIME_*` env vars above: a fresh
|
||||||
|
UUID is generated (or `QUPTIME_NODE_ID` is honored if set), an RSA
|
||||||
|
keypair and self-signed cert are written under `keys/`, and
|
||||||
|
`cluster.yaml` is seeded with this node as its sole peer. If no
|
||||||
|
`QUPTIME_CLUSTER_SECRET` was provided, a random one is generated and
|
||||||
|
printed to stderr — copy it to every follower node's
|
||||||
|
`QUPTIME_CLUSTER_SECRET` (or `--secret` flag) before they start.
|
||||||
|
|
||||||
|
This is what makes the docker-compose flow `docker compose up`-only
|
||||||
|
on a fresh volume. To opt out (e.g. so a misconfigured deployment
|
||||||
|
crashes loudly instead of silently generating a new identity), run
|
||||||
|
`qu init` against the volume yourself before letting `qu serve` ever
|
||||||
|
see it.
|
||||||
|
|
||||||
|
## `node.yaml` — local identity
|
||||||
|
|
||||||
|
Never replicated. One file per host. Generated by `qu init`.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
node_id: 7f3a5b9e-... # UUIDv4, immutable after init
|
||||||
|
bind_addr: 0.0.0.0 # listen address for :9901
|
||||||
|
bind_port: 9901 # listen port
|
||||||
|
advertise: alpha.example.com:9901 # how peers reach us; may differ from bind
|
||||||
|
cluster_secret: 4hZqK8vT9... # base64; required to Join, never replicated
|
||||||
|
```
|
||||||
|
|
||||||
|
### Field reference
|
||||||
|
|
||||||
|
- `node_id` — UUIDv4 generated at `qu init`. Used by every peer to
|
||||||
|
refer to this node across IP changes and restarts. Do not edit.
|
||||||
|
- `bind_addr` — Address the daemon listens on. `0.0.0.0` is the
|
||||||
|
default. Set to `127.0.0.1` if you only want to expose the daemon
|
||||||
|
through an overlay (Tailscale, WireGuard) — see
|
||||||
|
[deployment/tailscale.md](deployment/tailscale.md).
|
||||||
|
- `bind_port` — Defaults to `9901`. Change here if 9901 is taken; the
|
||||||
|
cluster does not require port-uniformity, peers just need to know
|
||||||
|
what to dial via the `advertise` field.
|
||||||
|
- `advertise` — Host:port other nodes use to reach this one. Must be
|
||||||
|
routable from every peer. Falls back to `bind_addr:bind_port` if
|
||||||
|
unset, which is rarely what you want behind NAT.
|
||||||
|
- `cluster_secret` — Pre-shared base64 string. Required on every
|
||||||
|
`Join` RPC; constant-time comparison on the receiver. Generate on
|
||||||
|
the first node, distribute out-of-band, keep out of version
|
||||||
|
control.
|
||||||
|
|
||||||
|
### How `qu init` populates this file
|
||||||
|
|
||||||
|
```sh
|
||||||
|
qu init \
|
||||||
|
--advertise alpha.example.com:9901 \
|
||||||
|
--bind 0.0.0.0 \
|
||||||
|
--port 9901 \
|
||||||
|
--secret '<paste from first node, or omit on the first node>'
|
||||||
|
```
|
||||||
|
|
||||||
|
Idempotent in one direction only: if `node.yaml` exists, `qu init`
|
||||||
|
refuses to overwrite. To re-init, delete the data directory entirely.
|
||||||
|
|
||||||
|
## `cluster.yaml` — replicated state
|
||||||
|
|
||||||
|
This is the file that every node converges on. The master is the only
|
||||||
|
one allowed to bump `version`; followers `Replace` it whole each time
|
||||||
|
they receive a higher-versioned snapshot.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
version: 12
|
||||||
|
updated_at: 2026-05-15T14:01:00Z
|
||||||
|
updated_by: 7f3a5b9e-...
|
||||||
|
peers:
|
||||||
|
- node_id: 7f3a5b9e-...
|
||||||
|
advertise: alpha.example.com:9901
|
||||||
|
fingerprint: SHA256:abcd...
|
||||||
|
cert_pem: |
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
...
|
||||||
|
-----END CERTIFICATE-----
|
||||||
|
checks:
|
||||||
|
- id: 0006a1...
|
||||||
|
name: homepage
|
||||||
|
type: http
|
||||||
|
target: https://example.com
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
expect_status: 200
|
||||||
|
alert_ids: [oncall]
|
||||||
|
suppress_alert_ids: []
|
||||||
|
alerts:
|
||||||
|
- id: f001ab...
|
||||||
|
name: oncall
|
||||||
|
type: discord
|
||||||
|
default: true
|
||||||
|
discord_webhook: https://discord.com/api/webhooks/...
|
||||||
|
body_template: |
|
||||||
|
:rotating_light: {{.Check.Name}} is {{.Verb}}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Top-level fields
|
||||||
|
|
||||||
|
| Field | Owner | Notes |
|
||||||
|
| ------------ | -------- | ---------------------------------------------------------------------------------- |
|
||||||
|
| `version` | master | Monotonic. Followers reject snapshots whose version is ≤ their local. |
|
||||||
|
| `updated_at` | master | UTC RFC3339. Cosmetic — humans use it, no logic depends on it. |
|
||||||
|
| `updated_by` | master | NodeID of the committing master. |
|
||||||
|
| `peers` | editable | Cluster members. Edits go through `add_peer` / `remove_peer` mutations. |
|
||||||
|
| `checks` | editable | Monitored targets. |
|
||||||
|
| `alerts` | editable | Notifier destinations. |
|
||||||
|
|
||||||
|
### `peers[]`
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- node_id: 7f3a5b9e-... # immutable, the peer's own UUID
|
||||||
|
advertise: host:port # how anyone dials this peer
|
||||||
|
fingerprint: SHA256:... # SPKI fingerprint of the peer's cert
|
||||||
|
cert_pem: | # full PEM so other peers can mTLS without a separate invite
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
The `cert_pem` field is what enables N-node clusters without N×(N-1)
|
||||||
|
manual invites: when peer X is added via the master, every other node
|
||||||
|
that receives the new `cluster.yaml` learns X's cert at the same time
|
||||||
|
and adds it to the local trust store. See
|
||||||
|
`internal/daemon/daemon.go:syncTrustFromCluster`.
|
||||||
|
|
||||||
|
### `checks[]`
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- id: 0006a1... # UUIDv4, generated when the check is created
|
||||||
|
name: homepage # human-friendly, must be unique within cluster
|
||||||
|
type: http # http | tcp | icmp
|
||||||
|
target: https://example.com
|
||||||
|
interval: 30s # Go duration syntax: 5s, 1m30s, 2h
|
||||||
|
timeout: 10s # default 10s
|
||||||
|
expect_status: 200 # http only; 0 = accept anything < 400
|
||||||
|
body_match: "OK" # http only; substring match on response body
|
||||||
|
alert_ids: [oncall] # alerts attached explicitly
|
||||||
|
suppress_alert_ids: [] # opt out of specific default alerts
|
||||||
|
```
|
||||||
|
|
||||||
|
Defaults:
|
||||||
|
|
||||||
|
- `interval`: 30s
|
||||||
|
- `timeout`: 10s
|
||||||
|
- `expect_status`: 0 → any status below 400 is OK; otherwise the configured status
|
||||||
|
must match exactly.
|
||||||
|
|
||||||
|
ICMP checks default to **unprivileged UDP-mode pings** so the daemon
|
||||||
|
does not need root. For raw ICMP, grant the capability — see
|
||||||
|
[deployment/systemd.md](deployment/systemd.md).
|
||||||
|
|
||||||
|
### `alerts[]`
|
||||||
|
|
||||||
|
Two notifier kinds, distinguished by `type`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Discord
|
||||||
|
- id: f001ab...
|
||||||
|
name: oncall
|
||||||
|
type: discord
|
||||||
|
default: true # attach to every check automatically
|
||||||
|
discord_webhook: https://...
|
||||||
|
body_template: | # optional Go text/template override
|
||||||
|
{{.Check.Name}} is {{.Verb}}
|
||||||
|
|
||||||
|
# SMTP
|
||||||
|
- id: f002cd...
|
||||||
|
name: ops
|
||||||
|
type: smtp
|
||||||
|
smtp_host: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: mailbot
|
||||||
|
smtp_password: '...'
|
||||||
|
smtp_from: monitor@example.com
|
||||||
|
smtp_to: [ops@example.com]
|
||||||
|
smtp_starttls: true
|
||||||
|
subject_template: '[{{.Verb}}] {{.Check.Name}}'
|
||||||
|
body_template: |
|
||||||
|
Check {{.Check.Name}} ({{.Check.Target}}) is now {{.Verb}}.
|
||||||
|
```
|
||||||
|
|
||||||
|
If `default: true`, the alert fires for every check unless the check
|
||||||
|
lists the alert's ID or name in `suppress_alert_ids`. Otherwise the
|
||||||
|
alert only fires for checks that name it in `alert_ids`.
|
||||||
|
|
||||||
|
Templates are Go `text/template`. The full variable list is in the
|
||||||
|
top-level README under "Custom alert messages" — `qu alert add smtp
|
||||||
|
--help` and `qu alert add discord --help` print the same table.
|
||||||
|
|
||||||
|
### Suppression precedence
|
||||||
|
|
||||||
|
For each check, the dispatcher computes the effective alert list as:
|
||||||
|
|
||||||
|
```
|
||||||
|
( explicit alert_ids ∪ alerts with default=true ) \ suppress_alert_ids
|
||||||
|
```
|
||||||
|
|
||||||
|
de-duplicated by alert ID. So a check can both opt in to specific
|
||||||
|
alerts and opt out of specific defaults.
|
||||||
|
|
||||||
|
## `trust.yaml` — local trust store
|
||||||
|
|
||||||
|
A flat list of fingerprints this node accepts. One entry per peer,
|
||||||
|
populated by `qu node add` (or pulled in automatically when a peer's
|
||||||
|
cert arrives via the replicated `cluster.yaml`).
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
entries:
|
||||||
|
- node_id: 7f3a5b9e-...
|
||||||
|
address: alpha.example.com:9901
|
||||||
|
fingerprint: SHA256:...
|
||||||
|
cert_pem: |
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
Never edit this by hand. Use `qu trust list` and `qu trust remove`.
|
||||||
|
|
||||||
|
## Key material
|
||||||
|
|
||||||
|
`keys/private.pem` is the only secret on disk besides
|
||||||
|
`node.yaml.cluster_secret`. It's chmod 0600 by default; preserve that.
|
||||||
|
The public cert at `keys/cert.pem` is what gets fingerprinted and
|
||||||
|
shipped in `cluster.yaml.peers[].cert_pem`.
|
||||||
|
|
||||||
|
There is **no automatic key rotation**. Rolling a node's identity
|
||||||
|
means wiping its data directory, running `qu init` again, and
|
||||||
|
re-adding it from another node as a fresh peer.
|
||||||
|
|
||||||
|
## Tunables that don't live in YAML
|
||||||
|
|
||||||
|
A few values are compiled constants. Change them in source and rebuild
|
||||||
|
if you need different behaviour.
|
||||||
|
|
||||||
|
| Constant | Default | What it does |
|
||||||
|
| ----------------------------------------------------- | ------- | ------------------------------------------------------------- |
|
||||||
|
| `quorum.DefaultHeartbeatInterval` | `1s` | How often each node heartbeats every peer. |
|
||||||
|
| `quorum.DefaultDeadAfter` | `4s` | A peer is dead if no heartbeat is seen within this window. |
|
||||||
|
| `checks.HysteresisCount` | `2` | Consecutive aggregate evaluations needed before a state flip. |
|
||||||
|
| `checks.ReconcileInterval` | `5s` | How often the scheduler reconciles its workers vs `checks[]`. |
|
||||||
|
| `daemon.manualEditPollInterval` (`internal/daemon/watcher.go`) | `2s` | How often the daemon hashes `cluster.yaml` for hand edits. |
|
||||||
@@ -0,0 +1,243 @@
|
|||||||
|
# Deployment: Docker / docker-compose
|
||||||
|
|
||||||
|
The published image is a 14 MB distroless static container with the
|
||||||
|
`qu` binary as the entrypoint. It runs as root by default so the
|
||||||
|
daemon can bind privileged ports and open ICMP sockets; override with
|
||||||
|
`--user` if your host doesn't need that.
|
||||||
|
|
||||||
|
## Image references
|
||||||
|
|
||||||
|
The same multi-arch (amd64 + arm64) image is published to two
|
||||||
|
registries. **The Gitea registry is the canonical source** — it also
|
||||||
|
publishes canary `:<branch>` builds (e.g. `:master`) on every branch push. GHCR is a
|
||||||
|
tag-only push-mirror for users who can't reach `git.cer.sh`.
|
||||||
|
|
||||||
|
Primary — Gitea registry:
|
||||||
|
|
||||||
|
```
|
||||||
|
git.cer.sh/axodouble/quptime:master    # tip of the master branch, multi-arch
|
||||||
|
git.cer.sh/axodouble/quptime:latest # latest tagged release
|
||||||
|
git.cer.sh/axodouble/quptime:v0.0.1 # specific tagged release
|
||||||
|
git.cer.sh/axodouble/quptime:latest-amd64 # single-arch (if you must pin)
|
||||||
|
```
|
||||||
|
|
||||||
|
Fallback — GitHub Container Registry:
|
||||||
|
|
||||||
|
```
|
||||||
|
ghcr.io/axodouble/quptime:latest # latest tagged release
|
||||||
|
ghcr.io/axodouble/quptime:v0.0.1 # specific tagged release
|
||||||
|
ghcr.io/axodouble/quptime:0.0 # latest patch in the 0.0 minor line
|
||||||
|
```
|
||||||
|
|
||||||
|
The image embeds `QUPTIME_DIR=/etc/quptime` and declares it a volume —
|
||||||
|
treat it as the only piece of state worth persisting.
|
||||||
|
|
||||||
|
## Single-node, single-container compose
|
||||||
|
|
||||||
|
For a development cluster or a single-node smoke test:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# compose.yaml
|
||||||
|
services:
|
||||||
|
quptime:
|
||||||
|
image: git.cer.sh/axodouble/quptime:latest
|
||||||
|
container_name: quptime
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
# host:port other nodes use to reach this one. Must be reachable
|
||||||
|
# from every peer — the loopback inside the container is useless.
|
||||||
|
- QUPTIME_ADVERTISE=<host-ip>:9901
|
||||||
|
# Pre-shared join secret. Omit on the very first node and read
|
||||||
|
# the generated value out of `docker logs quptime`, then set
|
||||||
|
# this env var on every follower before bringing them up.
|
||||||
|
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
|
||||||
|
ports:
|
||||||
|
- "9901:9901"
|
||||||
|
volumes:
|
||||||
|
- quptime-data:/etc/quptime
|
||||||
|
# ICMP UDP-mode pings need a permissive sysctl on the host:
|
||||||
|
# sysctl net.ipv4.ping_group_range="0 2147483647"
|
||||||
|
# Or grant CAP_NET_RAW (more accurate, raw ICMP).
|
||||||
|
cap_add:
|
||||||
|
- NET_RAW
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
quptime-data:
|
||||||
|
```
|
||||||
|
|
||||||
|
`qu serve` auto-initialises the data volume on first start using the
|
||||||
|
`QUPTIME_*` env vars (see [configuration.md](../configuration.md) for
|
||||||
|
the full list). One command brings everything up:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
docker compose up -d
|
||||||
|
docker compose exec quptime qu status
|
||||||
|
```
|
||||||
|
|
||||||
|
On the very first node, capture the auto-generated cluster secret:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
docker compose logs quptime | grep -A1 'cluster secret'
|
||||||
|
```
|
||||||
|
|
||||||
|
Copy that value into the `QUPTIME_CLUSTER_SECRET` env var of every
|
||||||
|
follower before starting them, otherwise their join RPCs will be
|
||||||
|
rejected. The full list of accepted env vars lives in
|
||||||
|
[configuration.md](../configuration.md#nodeyaml-field-overrides).
|
||||||
|
|
||||||
|
## Three-node compose on a single host
|
||||||
|
|
||||||
|
For local testing of the full quorum machinery without three machines:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# compose.yaml
|
||||||
|
x-quptime: &quptime
|
||||||
|
image: git.cer.sh/axodouble/quptime:latest
|
||||||
|
restart: unless-stopped
|
||||||
|
cap_add:
|
||||||
|
- NET_RAW
|
||||||
|
|
||||||
|
services:
|
||||||
|
alpha:
|
||||||
|
<<: *quptime
|
||||||
|
container_name: alpha
|
||||||
|
environment:
|
||||||
|
- QUPTIME_ADVERTISE=alpha:9901
|
||||||
|
# First node: leave secret unset and read it from `docker logs`.
|
||||||
|
ports: ["9901:9901"]
|
||||||
|
volumes: ["alpha-data:/etc/quptime"]
|
||||||
|
|
||||||
|
bravo:
|
||||||
|
<<: *quptime
|
||||||
|
container_name: bravo
|
||||||
|
environment:
|
||||||
|
- QUPTIME_ADVERTISE=bravo:9901
|
||||||
|
- QUPTIME_CLUSTER_SECRET=${SECRET}
|
||||||
|
ports: ["9902:9901"]
|
||||||
|
volumes: ["bravo-data:/etc/quptime"]
|
||||||
|
|
||||||
|
charlie:
|
||||||
|
<<: *quptime
|
||||||
|
container_name: charlie
|
||||||
|
environment:
|
||||||
|
- QUPTIME_ADVERTISE=charlie:9901
|
||||||
|
- QUPTIME_CLUSTER_SECRET=${SECRET}
|
||||||
|
ports: ["9903:9901"]
|
||||||
|
volumes: ["charlie-data:/etc/quptime"]
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
alpha-data:
|
||||||
|
bravo-data:
|
||||||
|
charlie-data:
|
||||||
|
```
|
||||||
|
|
||||||
|
Bootstrap:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# 1. Start alpha first to mint the cluster secret.
|
||||||
|
docker compose up -d alpha
|
||||||
|
# 2. Read the secret off alpha's stdout.
|
||||||
|
export SECRET=$(docker compose logs alpha | awk '/cluster secret/{getline; print $NF}')
|
||||||
|
# 3. Bring up the followers — they pick up the secret from $SECRET.
|
||||||
|
docker compose up -d bravo charlie
|
||||||
|
|
||||||
|
# Invite from alpha. The hostnames resolve over the compose network.
|
||||||
|
docker compose exec alpha qu node add bravo:9901
|
||||||
|
sleep 3 # wait for heartbeats before the next add
|
||||||
|
docker compose exec alpha qu node add charlie:9901
|
||||||
|
|
||||||
|
docker compose exec alpha qu status
|
||||||
|
```
|
||||||
|
|
||||||
|
For a cluster on three separate hosts, replicate the compose file on
|
||||||
|
each box with different `advertise` addresses (the public hostname or
|
||||||
|
the overlay IP) and bootstrap the same way.
|
||||||
|
|
||||||
|
## Multi-host compose
|
||||||
|
|
||||||
|
The natural unit is one compose file per host, each running one
|
||||||
|
`qu` container. The minimum-viable file per host:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# /etc/qu-stack/compose.yaml
|
||||||
|
services:
|
||||||
|
quptime:
|
||||||
|
image: git.cer.sh/axodouble/quptime:latest
|
||||||
|
container_name: quptime
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE} # host:9901 reachable from peers
|
||||||
|
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET}
|
||||||
|
ports:
|
||||||
|
- "9901:9901"
|
||||||
|
volumes:
|
||||||
|
- /srv/quptime/data:/etc/quptime
|
||||||
|
cap_add:
|
||||||
|
- NET_RAW
|
||||||
|
```
|
||||||
|
|
||||||
|
Put the per-host values (`QUPTIME_ADVERTISE`, `QUPTIME_CLUSTER_SECRET`)
|
||||||
|
in a sibling `.env` file or a config-management secret so the compose
|
||||||
|
file itself is identical across hosts.
|
||||||
|
|
||||||
|
Persistence is a bind-mount under `/srv/quptime/data` so backups and
|
||||||
|
upgrades hit a known path. See [operations.md](../operations.md) for
|
||||||
|
the backup recipe.
|
||||||
|
|
||||||
|
Inter-host traffic on TCP/9901 must be reachable. If the boxes don't
|
||||||
|
share a private network, prefer the
|
||||||
|
[Tailscale recipe](tailscale.md) over exposing 9901 directly — see
|
||||||
|
[public-internet.md](public-internet.md) for the threat model if you
|
||||||
|
must expose it.
|
||||||
|
|
||||||
|
## Behind a reverse proxy
|
||||||
|
|
||||||
|
**Don't.** `qu` is mTLS-pinned at the application layer, so a TLS-
|
||||||
|
terminating proxy would force the daemon to trust whatever cert the
|
||||||
|
proxy presents — defeating fingerprint pinning. If you need a single
|
||||||
|
public address per node, use a Layer 4 TCP proxy (`nginx stream`,
|
||||||
|
HAProxy `mode tcp`, or a plain firewall NAT) that forwards bytes
|
||||||
|
without touching them.
|
||||||
|
|
||||||
|
## Image internals
|
||||||
|
|
||||||
|
Build locally if you want to inspect what you're running:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
docker buildx build \
|
||||||
|
--build-arg VERSION=$(git describe --tags --always) \
|
||||||
|
--platform linux/amd64,linux/arm64 \
|
||||||
|
--file docker/Dockerfile \
|
||||||
|
--tag quptime:dev \
|
||||||
|
--load \
|
||||||
|
.
|
||||||
|
```
|
||||||
|
|
||||||
|
The Dockerfile (see `docker/Dockerfile`) is two stages: a `golang:1.24-alpine`
|
||||||
|
builder that cross-compiles with `-trimpath -ldflags "-s -w"`, and a
|
||||||
|
`gcr.io/distroless/static-debian12` runtime. No shell, no package
|
||||||
|
manager, no SSH; you cannot `docker exec -it sh` into it. Use
|
||||||
|
`docker exec quptime qu ...` for everything.
|
||||||
|
|
||||||
|
## Healthcheck
|
||||||
|
|
||||||
|
The container exits non-zero if the daemon crashes, so the default
|
||||||
|
`restart: unless-stopped` policy is enough for liveness. A more
|
||||||
|
useful readiness check runs the `qu` binary itself inside the container:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "/usr/local/bin/qu", "status"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
start_period: 10s
|
||||||
|
```
|
||||||
|
|
||||||
|
`qu status` exits 0 when the daemon socket is reachable and the
|
||||||
|
control RPC succeeds — it does **not** fail on quorum loss. That's
|
||||||
|
intentional: restarting a quorum-less node won't bring quorum back,
|
||||||
|
and a healthcheck that flaps a follower in and out of `unhealthy`
|
||||||
|
state every time the master is briefly unreachable is worse than no
|
||||||
|
check. If you want a stricter readiness signal, pipe `qu status`
|
||||||
|
through `grep -q 'quorum true'`.
|
||||||
@@ -0,0 +1,180 @@
|
|||||||
|
# Deployment: public-internet exposure
|
||||||
|
|
||||||
|
If your nodes do not share a private network and you can't put an
|
||||||
|
overlay between them (see [tailscale.md](tailscale.md)), this is the
|
||||||
|
recipe for exposing TCP/9901 directly to the open internet without
|
||||||
|
losing sleep.
|
||||||
|
|
||||||
|
The short version: `qu` is designed for this — every inbound call is
|
||||||
|
mTLS-pinned at the application layer and gated by the cluster secret
|
||||||
|
— but defence in depth is cheap and you should take it.
|
||||||
|
|
||||||
|
## Threat model in one paragraph
|
||||||
|
|
||||||
|
Anyone on the internet can establish a TLS connection to `:9901`
|
||||||
|
because the daemon must accept handshakes from currently-untrusted
|
||||||
|
peers (otherwise no node could ever join). The RPC dispatcher then
|
||||||
|
rejects every method except `Join` for callers whose fingerprint
|
||||||
|
isn't in `trust.yaml`. `Join` itself is gated by the **cluster
|
||||||
|
secret**, compared in constant time. So the realistic attack surface
|
||||||
|
is:
|
||||||
|
|
||||||
|
1. The TLS 1.3 stack accepting handshakes from arbitrary peers.
|
||||||
|
2. The `Join` handler's secret check and downstream cert ingestion.
|
||||||
|
3. The blast radius of a leaked cluster secret (an attacker who has
|
||||||
|
it can enrol themselves as a peer and propose mutations, which is
|
||||||
|
game over).
|
||||||
|
|
||||||
|
What can't trivially happen:
|
||||||
|
|
||||||
|
- A random attacker observing or modifying cluster traffic — TLS 1.3
|
||||||
|
with fingerprint pinning sees to that.
|
||||||
|
- A random attacker calling any method other than `Join` — the RPC
|
||||||
|
dispatcher refuses.
|
||||||
|
|
||||||
|
What you should still do:
|
||||||
|
|
||||||
|
- Treat `node.yaml.cluster_secret` like an SSH host key. Out-of-band
|
||||||
|
distribution only. Never in git, never in CI logs, never in chat.
|
||||||
|
- Rate-limit and IP-allowlist where you can. The `Join` handler does
|
||||||
|
not currently rate-limit at the application layer, so a determined
|
||||||
|
attacker could try secrets at TLS-handshake rate.
|
||||||
|
- Run on a non-default port if your operations workflow allows it.
|
||||||
|
Doesn't add security, but reduces background internet noise in the
|
||||||
|
logs and makes IDS / WAF rules cleaner.
|
||||||
|
|
||||||
|
## Firewall
|
||||||
|
|
||||||
|
### nftables (recommended)
|
||||||
|
|
||||||
|
A drop-in `/etc/nftables.d/quptime.nft`:
|
||||||
|
|
||||||
|
```nft
|
||||||
|
table inet filter {
|
||||||
|
set quptime_peers {
|
||||||
|
type ipv4_addr
|
||||||
|
elements = { 198.51.100.10, 198.51.100.11, 198.51.100.12 }
|
||||||
|
}
|
||||||
|
|
||||||
|
chain quptime_input {
|
||||||
|
# Drop everything that didn't come from a known peer.
|
||||||
|
ip saddr @quptime_peers tcp dport 9901 accept
|
||||||
|
tcp dport 9901 log prefix "quptime-drop: " level info drop
|
||||||
|
}
|
||||||
|
|
||||||
|
chain input {
|
||||||
|
type filter hook input priority 0; policy drop;
|
||||||
|
ct state established,related accept
|
||||||
|
iif lo accept
|
||||||
|
jump quptime_input
|
||||||
|
# ... your other rules
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The allowlist is the highest-ROI mitigation by far — if you maintain
|
||||||
|
fixed IPs for your monitor nodes, use this and move on.
|
||||||
|
|
||||||
|
### ufw
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo ufw allow from 198.51.100.10 to any port 9901 proto tcp
|
||||||
|
sudo ufw allow from 198.51.100.11 to any port 9901 proto tcp
|
||||||
|
sudo ufw allow from 198.51.100.12 to any port 9901 proto tcp
|
||||||
|
```
|
||||||
|
|
||||||
|
### Dynamic peer IPs
|
||||||
|
|
||||||
|
If peer IPs aren't fixed (e.g., one node is on a home connection with
|
||||||
|
a rotating address), you have three options ranked by preference:
|
||||||
|
|
||||||
|
1. Use an overlay instead — see [tailscale.md](tailscale.md). This is
|
||||||
|
the right answer.
|
||||||
|
2. DNS-based allowlisting (`ipset`-from-DNS or a small reconciler that
|
||||||
|
re-resolves an allowlist hostname every minute). Beware: a
|
||||||
|
compromised DNS resolver becomes a compromise of the allowlist.
|
||||||
|
3. Drop the allowlist and rely solely on the cluster secret + mTLS.
|
||||||
|
This is what `qu` is designed to survive; just be sure the secret
|
||||||
|
actually has the entropy `qu init` generated for it (32 random
|
||||||
|
bytes, base64-encoded).
|
||||||
|
|
||||||
|
## Rate-limiting failed handshakes
|
||||||
|
|
||||||
|
`qu` does not currently rate-limit `Join` attempts at the application
|
||||||
|
layer. You can do it at the firewall, which catches both connect
|
||||||
|
floods and slow brute-force:
|
||||||
|
|
||||||
|
```nft
|
||||||
|
table inet filter {
|
||||||
|
chain quptime_input {
|
||||||
|
tcp dport 9901 ct state new \
|
||||||
|
meter quptime_ratemeter { ip saddr limit rate over 10/second } \
|
||||||
|
log prefix "quptime-rate: " drop
|
||||||
|
tcp dport 9901 accept
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Or `fail2ban` with a tiny custom filter that watches `journalctl -u
|
||||||
|
quptime` for repeated `peer rejected join` lines:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
# /etc/fail2ban/filter.d/quptime.conf
|
||||||
|
[Definition]
|
||||||
|
failregex = ^.*quptime:.*peer rejected join.*from <ADDR>.*$
|
||||||
|
```
|
||||||
|
|
||||||
|
```ini
|
||||||
|
# /etc/fail2ban/jail.d/quptime.local
|
||||||
|
[quptime]
|
||||||
|
enabled = true
|
||||||
|
filter = quptime
|
||||||
|
backend = systemd
|
||||||
|
journalmatch = _SYSTEMD_UNIT=quptime.service
|
||||||
|
maxretry = 3
|
||||||
|
findtime = 600
|
||||||
|
bantime = 86400
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: the daemon doesn't currently log the *peer address* on rejected
|
||||||
|
joins. The log filter above is illustrative; check what your version
|
||||||
|
actually emits before relying on it.
|
||||||
|
|
||||||
|
## Secret hygiene
|
||||||
|
|
||||||
|
The single most important thing on a public-internet deployment:
|
||||||
|
|
||||||
|
- **Generate the secret on the first node.** `qu init` with no
|
||||||
|
`--secret` produces 32 random bytes from `crypto/rand`, base64-
|
||||||
|
encoded. Don't replace that with something memorable.
|
||||||
|
- **Transport out of band.** Paste it into your secret manager
|
||||||
|
immediately; share via 1Password / Vault / encrypted email.
|
||||||
|
- **Rotate if anyone with access has left.** Rotation isn't a CLI
|
||||||
|
command; do it the brute-force way: `qu init` a fresh cluster on
|
||||||
|
new ports, re-add every check via `cluster.yaml` export, swap DNS.
|
||||||
|
- **One secret per cluster.** Do not reuse the secret across staging
|
||||||
|
and prod, or across customers if you run several clusters.
|
||||||
|
|
||||||
|
## Non-default ports
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Each node, in node.yaml — or pass --port on init.
|
||||||
|
qu init --advertise alpha.example.com:51234 --port 51234
|
||||||
|
```
|
||||||
|
|
||||||
|
Open the corresponding firewall rule, restart the daemon. The
|
||||||
|
cluster doesn't require uniform ports across nodes; each peer's
|
||||||
|
`advertise` field tells everyone else what to dial.
|
||||||
|
|
||||||
|
## What you should monitor on a public deployment
|
||||||
|
|
||||||
|
- `term` from `qu status` — if it's ticking up frequently the master
|
||||||
|
is flapping, which probably means at least one peer's network is
|
||||||
|
unstable. Could be benign, could be a probe attempt.
|
||||||
|
- The firewall drop counter on the `quptime-drop` rule above.
|
||||||
|
- The number of TLS handshakes on `:9901`. A spike in handshakes that
|
||||||
|
don't progress to a successful RPC is the signature of a brute-force
|
||||||
|
on the cluster secret.
|
||||||
|
|
||||||
|
For the operational side — backups, upgrades, recovery — see
|
||||||
|
[operations.md](../operations.md).
|
||||||
@@ -0,0 +1,250 @@
|
|||||||
|
# Deployment: systemd on bare metal / VM
|
||||||
|
|
||||||
|
The canonical way to run `qu` on a Linux host. Single static binary,
|
||||||
|
managed by systemd, with a hardened unit file. Most production users
|
||||||
|
should start here.
|
||||||
|
|
||||||
|
## Audience and assumptions
|
||||||
|
|
||||||
|
- You have root (or `sudo`) on the host.
|
||||||
|
- You have at least three hosts that can reach each other on TCP/9901.
|
||||||
|
(Three is the minimum for a useful quorum; fewer is fine for
|
||||||
|
development but a 2-node cluster offers no consensus protection.)
|
||||||
|
- The hosts have a way to authenticate each other — direct IP or a
|
||||||
|
resolvable hostname is fine. For overlay networks see
|
||||||
|
[tailscale.md](tailscale.md).
|
||||||
|
|
||||||
|
## Install the binary
|
||||||
|
|
||||||
|
See [installation.md](../installation.md). The official `install.sh`
|
||||||
|
script writes a *minimal* unit file that's fine for development. For
|
||||||
|
production replace it with the hardened version below.
|
||||||
|
|
||||||
|
## Create a dedicated user
|
||||||
|
|
||||||
|
Running as a dedicated unprivileged user is best practice, but ICMP
|
||||||
|
support adds a wrinkle — see the next section.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo useradd --system --no-create-home --shell /usr/sbin/nologin quptime
|
||||||
|
sudo install -d -o quptime -g quptime -m 0750 /etc/quptime
|
||||||
|
sudo install -d -o quptime -g quptime -m 0750 /var/run/quptime
|
||||||
|
```
|
||||||
|
|
||||||
|
## ICMP capabilities
|
||||||
|
|
||||||
|
ICMP probes have two implementations:
|
||||||
|
|
||||||
|
1. **Unprivileged UDP pings** — Linux's `dgram` ICMP socket. Works on
|
||||||
|
any modern kernel without elevated privileges, but only if
|
||||||
|
`net.ipv4.ping_group_range` includes the daemon's GID. This is the
|
||||||
|
default in `qu`.
|
||||||
|
2. **Raw ICMP** — requires `CAP_NET_RAW`, more accurate latency
|
||||||
|
numbers and works for IPv6 on any kernel.
|
||||||
|
|
||||||
|
The simplest path: stick with unprivileged pings and widen
|
||||||
|
`ping_group_range`. Sysctl, persistent across reboots:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# /etc/sysctl.d/10-quptime.conf
|
||||||
|
net.ipv4.ping_group_range = 0 2147483647
|
||||||
|
```
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo sysctl --system
|
||||||
|
```
|
||||||
|
|
||||||
|
If you need raw ICMP instead, grant the capability on the binary:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo setcap cap_net_raw=+ep /usr/local/bin/qu
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the file capability is wiped whenever the binary is replaced,
i.e. on every `qu` upgrade — bake the
|
||||||
|
`setcap` call into your deploy script, or re-run it after each
|
||||||
|
package update.
|
||||||
|
|
||||||
|
## Hardened unit file
|
||||||
|
|
||||||
|
Drop this in `/etc/systemd/system/quptime.service`:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=QUptime distributed uptime monitor
|
||||||
|
Documentation=https://git.cer.sh/axodouble/quptime
|
||||||
|
Wants=network-online.target
|
||||||
|
After=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
ExecStart=/usr/local/bin/qu serve
|
||||||
|
Restart=always
|
||||||
|
RestartSec=5s
|
||||||
|
|
||||||
|
User=quptime
|
||||||
|
Group=quptime
|
||||||
|
|
||||||
|
# Where state lives. RuntimeDirectory creates /var/run/quptime/ each
|
||||||
|
# boot owned by User:Group with mode 0750.
|
||||||
|
Environment=QUPTIME_DIR=/etc/quptime
|
||||||
|
RuntimeDirectory=quptime
|
||||||
|
RuntimeDirectoryMode=0750
|
||||||
|
ReadWritePaths=/etc/quptime /var/run/quptime
|
||||||
|
|
||||||
|
# Hardening. Comment out individual directives if a probe needs
|
||||||
|
# something we've revoked.
|
||||||
|
NoNewPrivileges=true
|
||||||
|
ProtectSystem=strict
|
||||||
|
ProtectHome=true
|
||||||
|
PrivateTmp=true
|
||||||
|
PrivateDevices=true
|
||||||
|
ProtectKernelTunables=true
|
||||||
|
ProtectKernelModules=true
|
||||||
|
ProtectControlGroups=true
|
||||||
|
ProtectClock=true
|
||||||
|
ProtectHostname=true
|
||||||
|
RestrictNamespaces=true
|
||||||
|
RestrictRealtime=true
|
||||||
|
RestrictSUIDSGID=true
|
||||||
|
LockPersonality=true
|
||||||
|
MemoryDenyWriteExecute=true
|
||||||
|
|
||||||
|
# Network access is required (we're a network monitor). Keep address
|
||||||
|
# families minimal — AF_NETLINK is needed for some libc lookups.
|
||||||
|
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK
|
||||||
|
|
||||||
|
# If you need raw ICMP, *also* uncomment:
|
||||||
|
# AmbientCapabilities=CAP_NET_RAW
|
||||||
|
# CapabilityBoundingSet=CAP_NET_RAW
|
||||||
|
# Otherwise drop all capabilities:
|
||||||
|
CapabilityBoundingSet=
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
Reload systemd and enable:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable quptime.service
|
||||||
|
```
|
||||||
|
|
||||||
|
## Initialise the node
|
||||||
|
|
||||||
|
**Don't start the service yet** — `qu init` must run first, and it
|
||||||
|
must run as the `quptime` user so it creates files with the right
|
||||||
|
ownership.
|
||||||
|
|
||||||
|
On the **first** host (it will print a secret; copy it):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime QUPTIME_DIR=/etc/quptime \
|
||||||
|
qu init --advertise alpha.example.com:9901
|
||||||
|
```
|
||||||
|
|
||||||
|
On every **other** host (paste the secret):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime QUPTIME_DIR=/etc/quptime \
|
||||||
|
qu init --advertise bravo.example.com:9901 --secret '<paste>'
|
||||||
|
|
||||||
|
sudo -u quptime QUPTIME_DIR=/etc/quptime \
|
||||||
|
qu init --advertise charlie.example.com:9901 --secret '<paste>'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Open the firewall
|
||||||
|
|
||||||
|
`qu` needs TCP/9901 reachable between cluster members. Adjust to your
|
||||||
|
firewall:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# ufw
|
||||||
|
sudo ufw allow from <peer-ip> to any port 9901 proto tcp
|
||||||
|
|
||||||
|
# firewalld
|
||||||
|
sudo firewall-cmd --permanent --zone=internal \
|
||||||
|
--add-rich-rule='rule family=ipv4 source address=<peer-ip> port port=9901 protocol=tcp accept'
|
||||||
|
sudo firewall-cmd --reload
|
||||||
|
|
||||||
|
# nftables (drop-in)
|
||||||
|
table inet filter {
|
||||||
|
chain input {
|
||||||
|
ip saddr { 10.0.0.10, 10.0.0.11, 10.0.0.12 } tcp dport 9901 accept
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
For exposing 9901 to the open internet see
|
||||||
|
[public-internet.md](public-internet.md).
|
||||||
|
|
||||||
|
## Start the daemon
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo systemctl start quptime
|
||||||
|
sudo systemctl status quptime
|
||||||
|
journalctl -u quptime -f
|
||||||
|
```
|
||||||
|
|
||||||
|
## Invite peers
|
||||||
|
|
||||||
|
From one node (typically `alpha`):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime qu node add bravo.example.com:9901
|
||||||
|
# Pause a few seconds so heartbeats reach the new peer before the next add —
|
||||||
|
# otherwise the "needs ≥2 live to mutate" check rejects the second invite.
|
||||||
|
sudo -u quptime qu node add charlie.example.com:9901
|
||||||
|
```
|
||||||
|
|
||||||
|
`qu node add` prints each remote's fingerprint and asks for SSH-style
|
||||||
|
confirmation. Verify it matches an out-of-band channel (the remote
|
||||||
|
operator can show their fingerprint with
|
||||||
|
`sudo -u quptime qu status` or by reading `trust.yaml`).
|
||||||
|
|
||||||
|
## Verify
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime qu status
|
||||||
|
```
|
||||||
|
|
||||||
|
Expect to see all three peers `live=true` and one of them as
|
||||||
|
`master`.
|
||||||
|
|
||||||
|
## Log scraping
|
||||||
|
|
||||||
|
`journalctl -u quptime` is the canonical log stream. Notable lines:
|
||||||
|
|
||||||
|
| Pattern | Meaning |
|
||||||
|
| ------------------------------------------------------------- | --------------------------------------------------------- |
|
||||||
|
| `listening on ... as node ...` | Daemon up. |
|
||||||
|
| `manual-edit: cluster.yaml changed externally — replicating…` | An operator edited `cluster.yaml` directly. |
|
||||||
|
| `manual-edit: parse cluster.yaml: ...` | Invalid YAML on disk; the operator must fix and re-save. |
|
||||||
|
| `report to master ...: <err>` | A follower couldn't ship a probe result to the master. |
|
||||||
|
| `replicate: pull from ...: <err>` | A follower couldn't pull a higher-version config snapshot. |
|
||||||
|
|
||||||
|
## Sample reload / restart drill
|
||||||
|
|
||||||
|
After editing the unit file:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl restart quptime
|
||||||
|
```
|
||||||
|
|
||||||
|
After editing `cluster.yaml` by hand:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudoedit /etc/quptime/cluster.yaml
|
||||||
|
# No restart needed — the watcher picks it up within 2s and pushes to master.
|
||||||
|
```
|
||||||
|
|
||||||
|
After upgrading the binary:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo install -m 0755 qu-new /usr/local/bin/qu
|
||||||
|
sudo setcap cap_net_raw=+ep /usr/local/bin/qu # if you use raw ICMP
|
||||||
|
sudo systemctl restart quptime
|
||||||
|
```
|
||||||
|
|
||||||
|
Doing rolling upgrades? See [operations.md](../operations.md).
|
||||||
@@ -0,0 +1,188 @@
|
|||||||
|
# Deployment: Tailscale / WireGuard overlay
|
||||||
|
|
||||||
|
When your nodes live in different networks — different VPS providers,
|
||||||
|
different physical sites, a mix of home and cloud — exposing TCP/9901
|
||||||
|
to the open internet is a poor idea. An overlay network gives every
|
||||||
|
node a stable private IP regardless of NAT, and `qu` only needs to
|
||||||
|
listen on that overlay address.
|
||||||
|
|
||||||
|
This page focuses on Tailscale because the repo ships an example
|
||||||
|
compose for it, but everything generalises to WireGuard, Nebula, or a
|
||||||
|
self-hosted Headscale.
|
||||||
|
|
||||||
|
## The big idea
|
||||||
|
|
||||||
|
```
|
||||||
|
+--- host A (VPS, no public ICMP) ----+
|
||||||
|
| tailscale ←→ overlay ip 100.64.1.1 |
|
||||||
|
| qu listening on 100.64.1.1:9901 |
|
||||||
|
+-------------------------------------+
|
||||||
|
│ mTLS over overlay
|
||||||
|
▼
|
||||||
|
+--- host B (homelab behind NAT) -----+
|
||||||
|
| tailscale ←→ overlay ip 100.64.1.2 |
|
||||||
|
| qu listening on 100.64.1.2:9901 |
|
||||||
|
+-------------------------------------+
|
||||||
|
```
|
||||||
|
|
||||||
|
`bind_addr` is set to the tailscale IP, the host's public interface
|
||||||
|
has no port 9901 open, and the cluster secret + mTLS handshake gate
|
||||||
|
the link inside the tunnel.
|
||||||
|
|
||||||
|
## Compose recipe
|
||||||
|
|
||||||
|
The repo ships [`docker/docker-compose-tailscale.yml`](../../docker/docker-compose-tailscale.yml).
|
||||||
|
The relevant trick is `network_mode: "service:tailscale"` — the
|
||||||
|
`quptime` container shares the network namespace of the `tailscale`
|
||||||
|
sidecar so it sees the tailnet as its own interface.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
tailscale:
|
||||||
|
image: tailscale/tailscale:latest
|
||||||
|
container_name: tailscale
|
||||||
|
cap_add: [NET_ADMIN]
|
||||||
|
environment:
|
||||||
|
- TS_AUTHKEY=${TAILSCALE_AUTHKEY} # provision via .env
|
||||||
|
- TS_HOSTNAME=quptime-${HOST} # name visible in admin
|
||||||
|
volumes:
|
||||||
|
- /dev/net/tun:/dev/net/tun
|
||||||
|
- tailscale:/var/lib/tailscale
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
quptime:
|
||||||
|
image: git.cer.sh/axodouble/quptime:latest
|
||||||
|
container_name: quptime
|
||||||
|
environment:
|
||||||
|
# host:port other QUptime nodes use to reach this one. Should be
|
||||||
|
# this node's tailnet IP / MagicDNS name. Auto-init reads this on
|
||||||
|
# first start.
|
||||||
|
- QUPTIME_ADVERTISE=${QUPTIME_ADVERTISE}
|
||||||
|
# Shared cluster join secret. Omit on the very first node to have
|
||||||
|
# it generated and logged for you, then copy it into every
|
||||||
|
# follower's .env.
|
||||||
|
- QUPTIME_CLUSTER_SECRET=${QUPTIME_CLUSTER_SECRET:-}
|
||||||
|
volumes:
|
||||||
|
- quptime:/etc/quptime
|
||||||
|
network_mode: "service:tailscale"
|
||||||
|
depends_on: [tailscale]
|
||||||
|
cap_add: [NET_RAW]
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
tailscale:
|
||||||
|
quptime:
|
||||||
|
```
|
||||||
|
|
||||||
|
### One-time bootstrap
|
||||||
|
|
||||||
|
Each host runs the same compose file with a per-host `.env`:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# .env (alpha — the first node)
|
||||||
|
HOST=alpha
|
||||||
|
TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx
|
||||||
|
QUPTIME_ADVERTISE=100.64.1.1:9901 # this node's tailnet IP
|
||||||
|
# QUPTIME_CLUSTER_SECRET left unset — will be generated on first boot.
|
||||||
|
```
|
||||||
|
|
||||||
|
Start the stack on the first host. `qu serve` auto-initialises the
|
||||||
|
volume using the env vars above, so a single `docker compose up`
|
||||||
|
brings everything up:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
docker compose up -d
|
||||||
|
docker compose logs quptime | grep -A1 'cluster secret'
|
||||||
|
# Pipe the secret through your password manager.
|
||||||
|
```
|
||||||
|
|
||||||
|
On every **other** host, write the same `.env` plus the captured
|
||||||
|
secret:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# .env (bravo, charlie, …)
|
||||||
|
HOST=bravo
|
||||||
|
TAILSCALE_AUTHKEY=tskey-auth-xxxxxxxx
|
||||||
|
QUPTIME_ADVERTISE=100.64.1.2:9901
|
||||||
|
QUPTIME_CLUSTER_SECRET=<paste from alpha>
|
||||||
|
```
|
||||||
|
|
||||||
|
Bring them up and invite them from the first node:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# From alpha
|
||||||
|
docker compose exec quptime qu node add 100.64.1.2:9901
|
||||||
|
sleep 3
|
||||||
|
docker compose exec quptime qu node add 100.64.1.3:9901
|
||||||
|
|
||||||
|
docker compose exec quptime qu status
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tailscale ACLs
|
||||||
|
|
||||||
|
Belt and braces — even though mTLS pins identities, lock down the
|
||||||
|
tailnet itself so only the `qu` nodes can reach each other's :9901.
|
||||||
|
In the Tailscale admin console:
|
||||||
|
|
||||||
|
```jsonc
|
||||||
|
{
|
||||||
|
"tagOwners": { "tag:qu-node": ["group:ops"] },
|
||||||
|
"acls": [
|
||||||
|
{
|
||||||
|
"action": "accept",
|
||||||
|
"src": ["tag:qu-node"],
|
||||||
|
"dst": ["tag:qu-node:9901"]
|
||||||
|
}
|
||||||
|
// ...your other rules
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then tag every `qu` node in its auth key:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
environment:
|
||||||
|
- TS_AUTHKEY=${TAILSCALE_AUTHKEY}?ephemeral=false&tags=tag:qu-node
|
||||||
|
```
|
||||||
|
|
||||||
|
## WireGuard / Nebula / Headscale equivalents
|
||||||
|
|
||||||
|
The recipe generalises:
|
||||||
|
|
||||||
|
1. Provision the overlay interface on each host with a stable
|
||||||
|
private IP (the tunnel's own address).
|
||||||
|
2. `qu init --advertise <overlay-ip>:9901`.
|
||||||
|
3. Set `bind_addr: <overlay-ip>` in `node.yaml` so the daemon does
|
||||||
|
**not** also listen on the public interface.
|
||||||
|
4. Open `:9901` only on the overlay interface in your firewall — for
|
||||||
|
nftables that's something like `iifname "wg0" tcp dport 9901
|
||||||
|
accept`.
|
||||||
|
|
||||||
|
The cluster secret and mTLS fingerprints still apply; the overlay just
|
||||||
|
removes the open-internet attack surface.
|
||||||
|
|
||||||
|
## Why prefer overlay over public exposure
|
||||||
|
|
||||||
|
- Single failure domain at the network layer: an attacker who finds an
|
||||||
|
exploit in your overlay client (rare; Tailscale and WireGuard are
|
||||||
|
small surfaces) still hits the application-layer pinning before any
|
||||||
|
cluster-level operation.
|
||||||
|
- The cluster secret can be lower-entropy when it's already
|
||||||
|
unreachable from outside. (You should still treat it as a real
|
||||||
|
secret; "defence in depth" only works if every layer is real.)
|
||||||
|
- ICMP probes from a homelab to a target on the public internet are
|
||||||
|
trivial through NAT, but ICMP *into* a homelab usually isn't.
|
||||||
|
Running `qu` on a tailnet means peers can heartbeat each other
|
||||||
|
regardless of NAT direction.
|
||||||
|
|
||||||
|
## Trade-offs
|
||||||
|
|
||||||
|
- One more thing to monitor. If your tailnet is down, your monitor is
|
||||||
|
down. Counter-measure: run *another* tiny `qu` cluster (or a single
|
||||||
|
node) on the public internet that watches the overlay's coordinator
|
||||||
|
health.
|
||||||
|
- Probe latency includes the overlay's hop. Tailscale's wireguard is
|
||||||
|
fast (<1 ms LAN, single-digit ms WAN) so this rarely matters, but
|
||||||
|
if you're alerting on tight latency thresholds, account for it.
|
||||||
@@ -0,0 +1,136 @@
|
|||||||
|
# Installation
|
||||||
|
|
||||||
|
`qu` ships as a single static Linux binary. Pick whichever method
|
||||||
|
matches how you manage software on the host.
|
||||||
|
|
||||||
|
> Choosing a deployment recipe instead? Jump to
|
||||||
|
> [systemd](deployment/systemd.md), [Docker](deployment/docker.md),
|
||||||
|
> [Tailscale](deployment/tailscale.md), or
|
||||||
|
> [public-internet](deployment/public-internet.md).
|
||||||
|
|
||||||
|
## Pre-built binary (recommended)
|
||||||
|
|
||||||
|
Every tag triggers identical builds on both sources, so either one
|
||||||
|
serves the same artefact set. Gitea is the canonical home; GitHub is a
|
||||||
|
push-mirror.
|
||||||
|
|
||||||
|
Primary — Gitea releases:
|
||||||
|
<https://git.cer.sh/axodouble/quptime/releases>
|
||||||
|
|
||||||
|
Fallback — GitHub releases (mirrored from the same tag):
|
||||||
|
<https://github.com/Axodouble/QUptime/releases>
|
||||||
|
|
||||||
|
Each release ships `qu-${TAG}-linux-amd64`, `qu-${TAG}-linux-arm64`,
|
||||||
|
and a `SHA256SUMS` file.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Always pin to a tag — `latest` resolves on the server side.
|
||||||
|
TAG=v0.0.1
|
||||||
|
ARCH=amd64 # or arm64
|
||||||
|
|
||||||
|
# Primary: Gitea
|
||||||
|
curl -fSL -o qu \
|
||||||
|
"https://git.cer.sh/axodouble/quptime/releases/download/${TAG}/qu-${TAG}-linux-${ARCH}"
|
||||||
|
curl -fSL -o SHA256SUMS \
|
||||||
|
"https://git.cer.sh/axodouble/quptime/releases/download/${TAG}/SHA256SUMS"
|
||||||
|
|
||||||
|
# (or the GitHub mirror — substitute the host below if Gitea is unreachable)
|
||||||
|
# https://github.com/Axodouble/QUptime/releases/download/${TAG}/qu-${TAG}-linux-${ARCH}
|
||||||
|
# https://github.com/Axodouble/QUptime/releases/download/${TAG}/SHA256SUMS
|
||||||
|
|
||||||
|
# Verify before installing. Use the SHA256SUMS from the SAME source
|
||||||
|
# as the binary — never mix.
|
||||||
|
sha256sum --check --ignore-missing SHA256SUMS
|
||||||
|
|
||||||
|
install -m 0755 qu /usr/local/bin/qu
|
||||||
|
```
|
||||||
|
|
||||||
|
## One-line install script
|
||||||
|
|
||||||
|
The repo ships an `install.sh` that handles the download, checksum,
|
||||||
|
shell-completion installation, and a hardened systemd unit. Run it
|
||||||
|
under `sudo` so it can write to `/usr/local/bin` and
|
||||||
|
`/etc/systemd/system`.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
curl -fsSL https://git.cer.sh/Axodouble/QUptime/raw/branch/master/install.sh | sudo bash
|
||||||
|
# or, via the GitHub mirror:
|
||||||
|
# curl -fsSL https://raw.githubusercontent.com/Axodouble/QUptime/master/install.sh | sudo bash
|
||||||
|
```
|
||||||
|
|
||||||
|
What it does:
|
||||||
|
|
||||||
|
1. Looks up the latest release via the Gitea API; falls back to the
|
||||||
|
GitHub API if Gitea is unreachable.
|
||||||
|
2. Downloads the per-arch binary and the matching `SHA256SUMS` from
|
||||||
|
the same source, then verifies the checksum. Refuses to install on
|
||||||
|
a mismatch.
|
||||||
|
3. Installs bash / zsh / fish completion if a target directory exists.
|
||||||
|
4. Creates a dedicated `quptime` system user and writes
|
||||||
|
`/etc/systemd/system/quptime.service` (hardened — matches the unit
|
||||||
|
in [systemd.md](deployment/systemd.md)). Enables but does not start
|
||||||
|
the service, so you can configure identity before first boot.
|
||||||
|
|
||||||
|
## Build from source
|
||||||
|
|
||||||
|
Requires Go 1.24.2 or newer.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Either remote — Gitea is canonical, GitHub is a push-mirror.
|
||||||
|
git clone https://git.cer.sh/axodouble/quptime.git
|
||||||
|
# git clone https://github.com/Axodouble/QUptime.git
|
||||||
|
cd quptime
|
||||||
|
go build -ldflags "-X main.version=$(git describe --tags --always)" -o qu ./cmd/qu
|
||||||
|
|
||||||
|
./qu --version
|
||||||
|
```
|
||||||
|
|
||||||
|
Static binary, no cgo. `CGO_ENABLED=0` is the default on a clean Go
|
||||||
|
install; if you've enabled cgo globally, set it explicitly:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
CGO_ENABLED=0 go build -trimpath -ldflags "-s -w" -o qu ./cmd/qu
|
||||||
|
```
|
||||||
|
|
||||||
|
## Docker image
|
||||||
|
|
||||||
|
The same multi-arch (`amd64` + `arm64`) image is published to two
|
||||||
|
registries on every tag. The Gitea registry is the canonical source
|
||||||
|
and also gets canary `:master` builds; GHCR is a tag-only mirror.
|
||||||
|
|
||||||
|
Primary — Gitea registry:
|
||||||
|
|
||||||
|
```
|
||||||
|
git.cer.sh/axodouble/quptime:master # tip of main (canary)
|
||||||
|
git.cer.sh/axodouble/quptime:latest # latest tagged release
|
||||||
|
git.cer.sh/axodouble/quptime:v0.0.1 # pinned release
|
||||||
|
```
|
||||||
|
|
||||||
|
Fallback — GitHub Container Registry:
|
||||||
|
|
||||||
|
```
|
||||||
|
ghcr.io/axodouble/quptime:latest # latest tagged release
|
||||||
|
ghcr.io/axodouble/quptime:v0.0.1 # pinned release
|
||||||
|
ghcr.io/axodouble/quptime:0.0 # latest 0.0.x
|
||||||
|
```
|
||||||
|
|
||||||
|
See the [Docker deployment guide](deployment/docker.md) for compose
|
||||||
|
files and volume layout.
|
||||||
|
|
||||||
|
## Verifying the install
|
||||||
|
|
||||||
|
```sh
|
||||||
|
qu --version
|
||||||
|
qu --help
|
||||||
|
```
|
||||||
|
|
||||||
|
If completions installed, `qu <tab>` will list subcommands. After
|
||||||
|
`qu init` you can run `qu status` to confirm the daemon is reachable
|
||||||
|
over its control socket.
|
||||||
|
|
||||||
|
## Next steps
|
||||||
|
|
||||||
|
- [Configure the node and the cluster](configuration.md).
|
||||||
|
- Pick a deployment recipe under [docs/deployment/](deployment/).
|
||||||
|
- Walk through the [architecture](architecture.md) so the operational
|
||||||
|
guarantees are clear before you commit to a topology.
|
||||||
@@ -0,0 +1,225 @@
|
|||||||
|
# Operations
|
||||||
|
|
||||||
|
Day-2 tasks: keeping `qu` healthy, upgrading without dropping checks,
|
||||||
|
backing up state, recovering from failures. Pair this with
|
||||||
|
[troubleshooting.md](troubleshooting.md) for "the cluster is on fire,
|
||||||
|
what now" specifics.
|
||||||
|
|
||||||
|
## Upgrades
|
||||||
|
|
||||||
|
### Rolling upgrade (zero alert loss)
|
||||||
|
|
||||||
|
`qu` is built to tolerate one node being absent at a time as long as
|
||||||
|
quorum still holds. The simple recipe for a 3-node cluster:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# On each node in turn:
|
||||||
|
sudo systemctl stop quptime
|
||||||
|
sudo install -m 0755 qu-new /usr/local/bin/qu
|
||||||
|
sudo setcap cap_net_raw=+ep /usr/local/bin/qu # if you use raw ICMP
|
||||||
|
sudo systemctl start quptime
|
||||||
|
|
||||||
|
# Wait for the node to rejoin before moving on:
|
||||||
|
sudo -u quptime qu status # should show quorum true, all peers live
|
||||||
|
```
|
||||||
|
|
||||||
|
The first node you upgrade may briefly be a follower with a *higher*
|
||||||
|
binary version than the master. That's fine as long as no on-disk
|
||||||
|
format changes; the wire protocol and `cluster.yaml` schema are
|
||||||
|
stable within a minor version, so minor / patch upgrades freely
|
||||||
|
interleave.
|
||||||
|
|
||||||
|
For major-version upgrades that change the on-disk format, the release
|
||||||
|
notes will spell out the migration. As of v0 there have been none.
|
||||||
|
|
||||||
|
### Downgrades
|
||||||
|
|
||||||
|
A node that downgrades to an older binary will refuse to start if
|
||||||
|
`cluster.yaml` contains fields the older version doesn't know. To
|
||||||
|
roll back across a schema change, either:
|
||||||
|
|
||||||
|
- Take the cluster offline and downgrade all nodes simultaneously.
|
||||||
|
- Restore a `cluster.yaml` from before the schema change on every node
|
||||||
|
before starting the downgraded binary.
|
||||||
|
|
||||||
|
Within a single minor version, downgrade is symmetrical with upgrade.
|
||||||
|
|
||||||
|
### What can go wrong
|
||||||
|
|
||||||
|
- **Restarting two nodes at once in a 3-node cluster** loses quorum.
|
||||||
|
No mutations succeed, no alerts fire. Quorum returns the moment
|
||||||
|
the second node is back.
|
||||||
|
- **A node that has been offline for a long time** comes back with a
|
||||||
|
stale `cluster.yaml`. It will pull the master's higher version
|
||||||
|
within ~1 heartbeat. Don't pre-emptively delete its `cluster.yaml`
|
||||||
|
— let the catch-up path handle it.
|
||||||
|
|
||||||
|
## Backups
|
||||||
|
|
||||||
|
Three files matter, in descending order of "pain if lost":
|
||||||
|
|
||||||
|
| File | Why back it up |
|
||||||
|
| ---------------------- | -------------------------------------------------------------------- |
|
||||||
|
| `node.yaml` | Holds the cluster secret. Lose it and the node can't rejoin. |
|
||||||
|
| `keys/private.pem` | Lose it and you must `qu init` a fresh identity and re-trust. |
|
||||||
|
| `cluster.yaml` | Resyncs from any other live peer, so per-node backup is optional. |
|
||||||
|
|
||||||
|
### Per-host backup
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# /etc/cron.daily/quptime-backup
|
||||||
|
#!/bin/sh
|
||||||
|
set -eu
|
||||||
|
dst=/var/backups/quptime/$(date +%Y%m%d)
|
||||||
|
mkdir -p "$dst"
|
||||||
|
cp -a /etc/quptime/node.yaml "$dst/"
|
||||||
|
cp -a /etc/quptime/keys "$dst/keys"
|
||||||
|
cp -a /etc/quptime/cluster.yaml "$dst/cluster.yaml"
|
||||||
|
chmod -R go-rwx "$dst"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cluster-wide backup
|
||||||
|
|
||||||
|
The cluster state (`peers`, `checks`, `alerts`) is identical across
|
||||||
|
every node. Back up one healthy node's `cluster.yaml` and you have
|
||||||
|
the canonical copy. To restore:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Stop the daemon.
|
||||||
|
sudo systemctl stop quptime
|
||||||
|
|
||||||
|
# Drop in the backup. Reset the version to 0 so the running cluster's
|
||||||
|
# higher version supersedes whatever you're holding — otherwise this
|
||||||
|
# node will broadcast a stale snapshot and confuse everyone.
|
||||||
|
sudo cp backup-cluster.yaml /etc/quptime/cluster.yaml
|
||||||
|
sudo sed -i 's/^version:.*/version: 0/' /etc/quptime/cluster.yaml
|
||||||
|
|
||||||
|
sudo systemctl start quptime
|
||||||
|
# Within seconds the version-observer pulls the live version from a peer.
|
||||||
|
```
|
||||||
|
|
||||||
|
If you're restoring **the entire cluster** (every node lost), the
|
||||||
|
"reset version to 0" trick doesn't apply — there's no peer with a
|
||||||
|
higher version. Pick the highest-version backup, restore that file
|
||||||
|
across every node verbatim, and start the daemons. The cluster will
|
||||||
|
elect a master and continue.
|
||||||
|
|
||||||
|
## Replacing a dead node
|
||||||
|
|
||||||
|
A node has died permanently. You want to add a fresh box with the
|
||||||
|
same role.
|
||||||
|
|
||||||
|
1. On a surviving node, evict the dead one:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime qu node remove <dead-node-id>
|
||||||
|
```
|
||||||
|
|
||||||
|
This drops it from `cluster.yaml` and removes its trust entry. The
|
||||||
|
live set's size shrinks by one — verify quorum still holds.
|
||||||
|
|
||||||
|
2. On the new host, install `qu` and `qu init` against the existing
|
||||||
|
cluster secret:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime qu init \
|
||||||
|
--advertise delta.example.com:9901 \
|
||||||
|
--secret '<existing cluster secret>'
|
||||||
|
sudo systemctl start quptime
|
||||||
|
```
|
||||||
|
|
||||||
|
3. From a surviving node, invite the new one:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime qu node add delta.example.com:9901
|
||||||
|
```
|
||||||
|
|
||||||
|
The dead node's checks and alerts are unaffected — they live in the
|
||||||
|
replicated `cluster.yaml`, not the dead node's identity.
|
||||||
|
|
||||||
|
## Recovering from lost quorum
|
||||||
|
|
||||||
|
You've lost more than half the cluster simultaneously. The remaining
|
||||||
|
nodes refuse to mutate (correct behaviour: they have no way to know
|
||||||
|
whether the missing nodes are dead or partitioned).
|
||||||
|
|
||||||
|
Options:
|
||||||
|
|
||||||
|
- **Bring the missing nodes back.** Always the right first move if it's
|
||||||
|
possible. The cluster recovers automatically once enough nodes are
|
||||||
|
live.
|
||||||
|
- **Shrink the cluster.** If you've genuinely lost the missing nodes
|
||||||
|
permanently and can't bring them back, you need to manually edit
|
||||||
|
`cluster.yaml` on every surviving node to remove the dead peers,
|
||||||
|
then restart. Be very deliberate:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# On each surviving node:
|
||||||
|
sudo systemctl stop quptime
|
||||||
|
sudoedit /etc/quptime/cluster.yaml # delete the dead peers[] entries
|
||||||
|
# bump version to something higher
|
||||||
|
sudo systemctl start quptime
|
||||||
|
```
|
||||||
|
|
||||||
|
Make sure every surviving node has identical `cluster.yaml` content
|
||||||
|
before restarting any of them. If they don't, you'll get conflicting
|
||||||
|
views of who's in the cluster and elections will flap.
|
||||||
|
|
||||||
|
- **Start over.** For small clusters this is often faster than the
|
||||||
|
manual surgery above: `rm -rf /etc/quptime` everywhere, then
|
||||||
|
bootstrap from scratch. You'll lose your checks and alerts unless
|
||||||
|
you saved a copy of `cluster.yaml` elsewhere.
|
||||||
|
|
||||||
|
## Monitoring `qu` itself
|
||||||
|
|
||||||
|
`qu` watches your services. Who watches `qu`?
|
||||||
|
|
||||||
|
### From within the cluster
|
||||||
|
|
||||||
|
`qu status` is the single source of truth. The fields to watch:
|
||||||
|
|
||||||
|
| Field | Healthy | Suspicious |
|
||||||
|
| -------------- | -------------- | --------------------------------------------------------- |
|
||||||
|
| `quorum` | `true` | `false` — no mutations, no alerts. |
|
||||||
|
| `master` | a NodeID | `(none — ...)` — quorum lost or election in flight. |
|
||||||
|
| `term` | slow growth | rapid growth → master flapping, network unstable. |
|
||||||
|
| `config ver` | identical across nodes | divergence → a node is stuck pulling. |
|
||||||
|
|
||||||
|
A simple cron sentinel on each node:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
*/5 * * * * /usr/local/bin/qu status >/dev/null 2>&1 \
|
||||||
|
|| curl -fsSL -X POST -d "qu down on $(hostname)" https://alert.example.com/oncall
|
||||||
|
```
|
||||||
|
|
||||||
|
### From outside the cluster
|
||||||
|
|
||||||
|
`qu` does not currently expose a Prometheus / OpenMetrics endpoint.
|
||||||
|
The recommended pattern is to run a *separate* tiny monitoring path
|
||||||
|
that doesn't depend on `qu` — even a single `curl` health check on
|
||||||
|
each node's :9901 (which is TLS-only; you'll see a handshake succeed
|
||||||
|
even if the daemon's stuck) catches process death.
|
||||||
|
|
||||||
|
To produce structured metrics, write a sidecar that parses `qu status`
|
||||||
|
output and exports counters. The CLI emits stable, machine-grep-able
|
||||||
|
output specifically so this is straightforward.
|
||||||
|
|
||||||
|
## Operational checklist before you go to bed
|
||||||
|
|
||||||
|
After standing up a new cluster, work through:
|
||||||
|
|
||||||
|
- [ ] All nodes show `quorum true` in `qu status`.
|
||||||
|
- [ ] All nodes show identical `config ver`.
|
||||||
|
- [ ] All nodes show the same `master`.
|
||||||
|
- [ ] `journalctl -u quptime --since "10 min ago"` has no
|
||||||
|
`propose to master:` or `replicate: pull from:` errors.
|
||||||
|
- [ ] `qu alert test <name>` reaches your inbox / Discord channel for
|
||||||
|
every configured alert.
|
||||||
|
- [ ] At least one check has an intentional failure (a bogus target)
|
||||||
|
that you flip back and forth to verify the full state-transition
|
||||||
|
→ dispatch path end-to-end.
|
||||||
|
- [ ] Backups of `node.yaml` + `keys/` + `cluster.yaml` are landing in
|
||||||
|
your backup destination.
|
||||||
|
- [ ] Firewall allow-list (if any) lists every peer's IP.
|
||||||
|
- [ ] You've stored the cluster secret somewhere that survives the
|
||||||
|
first operator leaving.
|
||||||
@@ -0,0 +1,153 @@
|
|||||||
|
# Security
|
||||||
|
|
||||||
|
The trust model in one page. Read this before deciding where to put
|
||||||
|
`qu` and who can talk to it.
|
||||||
|
|
||||||
|
## What `qu` is trying to defend against
|
||||||
|
|
||||||
|
- **Eavesdropping on cluster traffic.** Defended: TLS 1.3 only,
|
||||||
|
fingerprint-pinned per peer.
|
||||||
|
- **MITM on the cluster's inter-node link.** Defended: TLS 1.3 with
|
||||||
|
out-of-band fingerprint verification at `qu node add`.
|
||||||
|
- **A random internet host enrolling itself as a peer.** Defended:
|
||||||
|
pre-shared cluster secret on every `Join`.
|
||||||
|
- **A compromised peer issuing forged cluster-config mutations.** Not
|
||||||
|
defended. A peer trusted enough to be in `cluster.yaml.peers` can
|
||||||
|
propose mutations through the master. Treat membership as a
|
||||||
|
privilege.
|
||||||
|
- **A compromised peer becoming master.** Election is deterministic on
|
||||||
|
the smallest live `NodeID`, so a compromised peer can become master
|
||||||
|
if its `NodeID` sorts first. The master can rewrite `cluster.yaml`
|
||||||
|
arbitrarily. This is the worst-case blast radius from one compromised
|
||||||
|
node.
|
||||||
|
- **DoS by handshake flood.** Not directly defended at the application
|
||||||
|
layer. The TLS stack accepts anyone's handshake; rate-limiting belongs
|
||||||
|
at the firewall — see [public-internet.md](deployment/public-internet.md).
|
||||||
|
|
||||||
|
## The three secrets on disk
|
||||||
|
|
||||||
|
| Secret | What it is | Loss impact |
|
||||||
|
| -------------------------- | ----------------------------------------- | -------------------------------------------- |
|
||||||
|
| `keys/private.pem` | RSA private key, this node's identity. | Anyone with it can impersonate this node. |
|
||||||
|
| `node.yaml.cluster_secret` | Pre-shared base64 string. | Anyone with it can `Join` the cluster. |
|
||||||
|
| `trust.yaml.entries[].cert_pem` | Other peers' public certs (not secrets, but they enable mTLS). | Loss only forces re-trust. |
|
||||||
|
|
||||||
|
The first two are real secrets and live under `0600` permissions in
|
||||||
|
the data directory. Back them up; never commit them; never paste them
|
||||||
|
in chat.
|
||||||
|
|
||||||
|
## TLS handshake step by step
|
||||||
|
|
||||||
|
For every inter-node call:
|
||||||
|
|
||||||
|
1. Caller dials peer on its `advertise` address.
|
||||||
|
2. TLS 1.3 handshake. Both sides present their self-signed leaf cert.
|
||||||
|
3. The caller's `VerifyPeerCertificate` (set in
|
||||||
|
`internal/transport/tls.go`) computes the SPKI fingerprint of the
|
||||||
|
server's cert and compares it against `trust.yaml`. If the caller
|
||||||
|
knows which `NodeID` it expected, a strict verifier ensures the
|
||||||
|
fingerprint matches *that specific* entry — not just any trusted
|
||||||
|
peer.
|
||||||
|
4. The server's TLS layer accepts any client cert (`RequireAnyClientCert`,
|
||||||
|
`InsecureSkipVerify: true`) because trust is enforced one layer up.
|
||||||
|
5. The RPC dispatcher reads the client's cert, computes its
|
||||||
|
fingerprint, and looks it up in the server's `trust.yaml`. If no
|
||||||
|
entry exists, only the `Join` method is permitted.
|
||||||
|
6. `Join` performs a constant-time comparison of the inbound
|
||||||
|
`ClusterSecret` against `node.yaml.cluster_secret`. Mismatch →
|
||||||
|
refusal.
|
||||||
|
|
||||||
|
So:
|
||||||
|
|
||||||
|
- An adversary who gets your **public** cert can't impersonate you.
|
||||||
|
- An adversary who gets your **fingerprint** can't impersonate you.
|
||||||
|
- An adversary who gets your **private key** *can* impersonate you to
|
||||||
|
any peer that trusts your fingerprint.
|
||||||
|
|
||||||
|
## The TOFU step
|
||||||
|
|
||||||
|
`qu node add <host:port>` runs a one-shot insecure dial against the
|
||||||
|
target (the only place `InsecureBootstrapConfig` is used in the
|
||||||
|
codebase, see `internal/transport/tls.go:91`). It fetches the
|
||||||
|
remote's cert, prints the fingerprint, and asks for confirmation.
|
||||||
|
|
||||||
|
This is **identical** to SSH's first-connection prompt. The operator
|
||||||
|
must verify the fingerprint out of band — by running `qu status` on
|
||||||
|
the remote side, or by reading `keys/cert.pem` directly, or via a
|
||||||
|
known-good distribution channel.
|
||||||
|
|
||||||
|
If you skip verification, you trust the network at that moment. If
|
||||||
|
the network was MITM'd at exactly that moment, you trust the
|
||||||
|
attacker. After the prompt, the cert is pinned and the window closes.
|
||||||
|
|
||||||
|
## Cluster secret rotation
|
||||||
|
|
||||||
|
There is no built-in command to rotate the cluster secret. The hard
|
||||||
|
part isn't generating a new one — it's distributing it consistently
|
||||||
|
across every node. The pragmatic recipe:
|
||||||
|
|
||||||
|
1. Generate a new secret on one node and copy it to every other node.
|
||||||
|
2. Update `node.yaml.cluster_secret` on every node (manual edit).
|
||||||
|
3. Restart each daemon one at a time, verifying quorum returns
|
||||||
|
between restarts.
|
||||||
|
|
||||||
|
Rotation only protects future `Join` calls, not anything else. If you
|
||||||
|
suspect the old secret has been seen by an adversary, also assume any
|
||||||
|
peer that was added during the leaked window is compromised, and
|
||||||
|
re-init those peers from scratch.
|
||||||
|
|
||||||
|
## Identity rotation
|
||||||
|
|
||||||
|
To roll a node's RSA keypair (e.g., the private key was on a laptop
|
||||||
|
that got stolen):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# On the compromised node:
|
||||||
|
sudo systemctl stop quptime
|
||||||
|
sudo rm -rf /etc/quptime
|
||||||
|
sudo -u quptime qu init \
|
||||||
|
--advertise this-host.example.com:9901 \
|
||||||
|
--secret '<existing cluster secret>'
|
||||||
|
sudo systemctl start quptime
|
||||||
|
|
||||||
|
# On a surviving healthy node:
|
||||||
|
sudo -u quptime qu node remove <old-node-id> # evict the old identity
|
||||||
|
sudo -u quptime qu node add this-host.example.com:9901
|
||||||
|
```
|
||||||
|
|
||||||
|
The new `node_id` is a fresh UUID; the old one is gone for good. Any
|
||||||
|
historical references to it (e.g., the `updated_by` field on past
|
||||||
|
versions of `cluster.yaml`) are cosmetic.
|
||||||
|
|
||||||
|
## What the local control socket protects
|
||||||
|
|
||||||
|
`$XDG_RUNTIME_DIR/quptime/quptime.sock` (or `/var/run/quptime/...`) is
|
||||||
|
the channel the CLI uses to talk to the local daemon. It's `0600`
|
||||||
|
permissioned and authenticated solely by filesystem ACLs — no TLS, no
|
||||||
|
secrets in the protocol.
|
||||||
|
|
||||||
|
Anyone who can `read+write` the socket can:
|
||||||
|
|
||||||
|
- Propose cluster mutations (will be relayed to the master).
|
||||||
|
- Read full cluster state including `cluster.yaml`.
|
||||||
|
- Trigger test alerts.
|
||||||
|
|
||||||
|
So: don't put the daemon's user in a group that other unprivileged
|
||||||
|
users share. The default systemd setup with a dedicated `quptime`
|
||||||
|
user gets this right.
|
||||||
|
|
||||||
|
## Hardening checklist
|
||||||
|
|
||||||
|
- [ ] Dedicated `quptime` system user.
|
||||||
|
- [ ] Data directory owned by that user, mode 0750.
|
||||||
|
- [ ] `keys/private.pem` mode 0600.
|
||||||
|
- [ ] `node.yaml` mode 0600.
|
||||||
|
- [ ] systemd unit uses `ProtectSystem=strict`, `NoNewPrivileges=true`,
|
||||||
|
and the rest of the hardening directives in
|
||||||
|
[systemd.md](deployment/systemd.md).
|
||||||
|
- [ ] If `:9901` is internet-reachable, firewall allow-list to peer
|
||||||
|
IPs or use an overlay — see [public-internet.md](deployment/public-internet.md)
|
||||||
|
and [tailscale.md](deployment/tailscale.md).
|
||||||
|
- [ ] Cluster secret generated by `qu init` (not chosen by a human),
|
||||||
|
stored in your secret manager.
|
||||||
|
- [ ] Backups of `keys/` and `node.yaml` are encrypted at rest.
|
||||||
@@ -0,0 +1,214 @@
|
|||||||
|
# Troubleshooting
|
||||||
|
|
||||||
|
The cluster is misbehaving. This page is organised by symptom. Each
|
||||||
|
entry pairs the user-visible signal with the log line(s) you'll see
|
||||||
|
in `journalctl -u quptime` and the fix.
|
||||||
|
|
||||||
|
## `qu status` shows `quorum false`
|
||||||
|
|
||||||
|
**What it means.** Fewer than ⌊N/2⌋+1 peers are live.
|
||||||
|
|
||||||
|
**Diagnose.** Look at the PEERS table. The `LIVE` column tells you
|
||||||
|
which peers this node has stopped hearing from.
|
||||||
|
|
||||||
|
- If only this node is "live" and everyone else is not → this node is
|
||||||
|
network-isolated. Test: `nc -zv <peer-advertise>`. Fix: network /
|
||||||
|
firewall.
|
||||||
|
- If multiple nodes show false → more than one peer is down. Look at
|
||||||
|
the other peers' status outputs to triangulate.
|
||||||
|
- If everyone is live but `quorum false` still → check
|
||||||
|
`cluster.yaml.peers` length vs. live count; you may have phantom
|
||||||
|
peer entries left over from a removed-but-not-evicted node. Fix:
|
||||||
|
`qu node remove <ghost-node-id>` from any live node.
|
||||||
|
|
||||||
|
## `qu status` shows `master (none — ...)`
|
||||||
|
|
||||||
|
**What it means.** Either no quorum (see above) or election is in
|
||||||
|
flight. The latter clears within ~1 heartbeat.
|
||||||
|
|
||||||
|
If `term` is incrementing rapidly (`watch qu status`), the master is
|
||||||
|
flapping. Causes:
|
||||||
|
|
||||||
|
- The currently-elected master is unreachable from some peers but
|
||||||
|
reachable from others, partial-partition style. Look for log lines
|
||||||
|
on the suspected master about peers it can't reach.
|
||||||
|
- Heartbeat timeouts (default 4s) are too tight for your inter-node
|
||||||
|
link. Rebuild with a higher `DefaultDeadAfter` if you need it.
|
||||||
|
|
||||||
|
## A check is stuck in `unknown`
|
||||||
|
|
||||||
|
**What it means.** The aggregator has no fresh reports for that check.
|
||||||
|
|
||||||
|
Possible causes:
|
||||||
|
|
||||||
|
- No node is actually running the probe yet. Probes start ~`interval/10`
|
||||||
|
after `qu serve` boots and reconcile every 5s. Wait 10s and
|
||||||
|
re-check.
|
||||||
|
- Nodes are submitting results but they're stale (older than 3×
|
||||||
|
interval). Probably means probes are timing out without reporting.
|
||||||
|
- This is a follower's view; the aggregator runs on the master only.
|
||||||
|
Check `qu status` on the master to see the canonical view.
|
||||||
|
|
||||||
|
## Alerts not firing
|
||||||
|
|
||||||
|
Walk this list in order; one of them will catch it:
|
||||||
|
|
||||||
|
1. **Is there quorum?** Aggregator runs on master only. No master →
|
||||||
|
no transitions → no alerts.
|
||||||
|
2. **Is the alert attached to the check?** `qu status` shows the
|
||||||
|
effective alert list per check. Empty → no alert. Confirm with
|
||||||
|
`qu alert list` that the alert exists and (if relying on default
|
||||||
|
attachment) has `default: true`.
|
||||||
|
3. **Is the alert suppressed on this check?** Check
|
||||||
|
`suppress_alert_ids` in `cluster.yaml`.
|
||||||
|
4. **Test the alert path directly:**
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo -u quptime qu alert test <name>
|
||||||
|
```
|
||||||
|
|
||||||
|
This bypasses the aggregator and renders a synthetic transition.
|
||||||
|
If `alert test` doesn't deliver, the problem is the notifier
|
||||||
|
config or the template — see below. If `alert test` works but real
|
||||||
|
transitions don't, the aggregator isn't observing the transition.
|
||||||
|
5. **Has the check actually transitioned?** Aggregator commits a flip
|
||||||
|
only after **two consecutive** evaluations agree. A bouncing
|
||||||
|
target may never satisfy the hysteresis. Lower the check interval
|
||||||
|
or increase reliability of the target.
|
||||||
|
|
||||||
|
## Discord webhook returns 4xx
|
||||||
|
|
||||||
|
The dispatcher logs the HTTP body. Common causes:
|
||||||
|
|
||||||
|
- Webhook revoked / channel deleted → 404. Re-issue and update
|
||||||
|
`discord_webhook`.
|
||||||
|
- Body too large → 400. Long templates that pull `Snapshot.Detail`
|
||||||
|
with multi-line errors can blow past Discord's 2000-char limit.
|
||||||
|
Shorten the template or trim the variable.
|
||||||
|
- Rate-limited → 429. Reduce alert frequency, or rely on the
|
||||||
|
built-in hysteresis instead of working around it.
|
||||||
|
|
||||||
|
## SMTP refuses the message
|
||||||
|
|
||||||
|
Check the daemon log for `smtp:` lines. Most common:
|
||||||
|
|
||||||
|
- `530 5.7.0 Must issue a STARTTLS command first` → set
|
||||||
|
`smtp_starttls: true` on the alert.
|
||||||
|
- `535 Authentication failed` → wrong `smtp_user` / `smtp_password`.
|
||||||
|
- Connection refused / timeout → firewall between `qu` and the SMTP
|
||||||
|
relay. Verify with `openssl s_client -starttls smtp -connect host:587`.
|
||||||
|
|
||||||
|
## Manual edit to `cluster.yaml` was ignored
|
||||||
|
|
||||||
|
Symptoms: you edited the file, saved, nothing happened.
|
||||||
|
|
||||||
|
Look for one of these log lines:
|
||||||
|
|
||||||
|
- `manual-edit: parse cluster.yaml: <err> — ignoring` → YAML is
|
||||||
|
invalid. The daemon pins the bad hash and waits for the next valid
|
||||||
|
save. Run the file through `yq` or `python -c "import yaml,sys;
|
||||||
|
yaml.safe_load(open(sys.argv[1]))" cluster.yaml` to diagnose.
|
||||||
|
- `manual-edit: cluster.yaml changed externally — replicating via
|
||||||
|
master` followed by `manual-edit: forward to master: no quorum` →
|
||||||
|
cluster has no quorum, can't accept the edit. Restore quorum first.
|
||||||
|
- *No log line at all* → the on-disk content didn't change in a way
|
||||||
|
that matters. The watcher compares only `peers`, `checks`, and
|
||||||
|
`alerts`; whitespace and comment edits are accepted silently.
|
||||||
|
|
||||||
|
## Two nodes disagree on `config ver`
|
||||||
|
|
||||||
|
The follower with the lower version should pull within one heartbeat.
|
||||||
|
If after ~5 seconds the gap persists:
|
||||||
|
|
||||||
|
- The follower might not have an `advertise` address for the higher-
|
||||||
|
versioned peer. The version observer needs one to pull. Check
|
||||||
|
`cluster.yaml.peers` for both sides' `advertise` fields.
|
||||||
|
- The follower's TLS handshake against the higher-versioned peer is
|
||||||
|
failing — look for `replicate: pull from <id>: <err>` lines.
|
||||||
|
- The peer with the higher version is announcing it correctly but the
|
||||||
|
follower is rejecting the `ApplyClusterCfg` broadcasts because of
|
||||||
|
its own decode error — look for transport-layer errors instead.
|
||||||
|
|
||||||
|
## "needs ≥2 live to mutate" rejection during bootstrap
|
||||||
|
|
||||||
|
You ran two `qu node add` commands back-to-back and the second one
|
||||||
|
failed. The first add doesn't take effect until the new peer sends
|
||||||
|
its first heartbeat (≤ 1 second); during that window the cluster has
|
||||||
|
size 2 and quorum size 2, so a *second* peer add from a 1-live
|
||||||
|
cluster looks like "mutate without quorum."
|
||||||
|
|
||||||
|
Fix: pause ~3 seconds between adds. The README and the systemd guide
|
||||||
|
both call this out.
|
||||||
|
|
||||||
|
## Daemon refuses to start
|
||||||
|
|
||||||
|
```
|
||||||
|
load node.yaml: open ...: no such file or directory
|
||||||
|
```
|
||||||
|
|
||||||
|
`qu serve` normally auto-bootstraps a missing `node.yaml` using the
|
||||||
|
`QUPTIME_*` env vars (see
|
||||||
|
[configuration.md](configuration.md#auto-init-on-qu-serve)). If you
|
||||||
|
still see this error, the most likely causes are:
|
||||||
|
|
||||||
|
- The data directory is read-only or owned by a different user — the
|
||||||
|
bootstrap can't write `node.yaml`. Fix permissions on
|
||||||
|
`$QUPTIME_DIR`.
|
||||||
|
- Something else removed `node.yaml` mid-run (a config-management
|
||||||
|
tool, a misconfigured volume). Re-run `qu serve` and it will
|
||||||
|
rebuild from env, or run `qu init` manually with the flags you
|
||||||
|
want.
|
||||||
|
|
||||||
|
```
|
||||||
|
node.yaml has empty node_id — run `qu init` first
|
||||||
|
```
|
||||||
|
|
||||||
|
`node.yaml` exists but lacks a `node_id`. Either delete the file and
|
||||||
|
let auto-init regenerate it, or run `qu init` against a wiped data
|
||||||
|
dir.
|
||||||
|
|
||||||
|
```
|
||||||
|
listen tcp :9901: bind: address already in use
|
||||||
|
```
|
||||||
|
|
||||||
|
Another process owns the port. `ss -tlnp | grep :9901` to find it.
|
||||||
|
|
||||||
|
```
|
||||||
|
load private key: ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Permissions on `keys/private.pem` are wrong — should be 0600 and owned
|
||||||
|
by the daemon user. Fix and restart.
|
||||||
|
|
||||||
|
## Probes look much slower than expected
|
||||||
|
|
||||||
|
ICMP first:
|
||||||
|
|
||||||
|
- Default ICMP is **unprivileged UDP-mode pings**, not raw ICMP. UDP
|
||||||
|
ping is a bit slower and may hit different kernel paths. For
|
||||||
|
reference latency, grant `CAP_NET_RAW`.
|
||||||
|
|
||||||
|
HTTP / TCP:
|
||||||
|
|
||||||
|
- `interval` and `timeout` are the only knobs in `cluster.yaml`. The
|
||||||
|
check is run synchronously per worker; if your target takes 9 s to
|
||||||
|
respond and your timeout is 10 s, the next probe doesn't start
|
||||||
|
until ~9 s elapsed. Increase concurrency by adding more
|
||||||
|
fast-interval checks against the same target, not by lowering
|
||||||
|
timeout (which will just produce false `down` results).
|
||||||
|
|
||||||
|
## I want to start over
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sudo systemctl stop quptime
|
||||||
|
sudo rm -rf /etc/quptime
|
||||||
|
sudo -u quptime qu init --advertise <addr>
|
||||||
|
sudo systemctl start quptime
|
||||||
|
```
|
||||||
|
|
||||||
|
The data directory is the only state. Wipe it and you're back to a
|
||||||
|
fresh node.
|
||||||
|
|
||||||
|
Under Docker (or any env-driven deploy), the explicit `qu init` step
|
||||||
|
isn't needed — wiping the data volume and restarting the container is
|
||||||
|
enough; `qu serve` will re-bootstrap from the `QUPTIME_*` env vars.
|
||||||
+220
-31
@@ -1,23 +1,42 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
# QUptime installer.
|
||||||
|
#
|
||||||
|
# Downloads the latest released `qu` binary, verifies it against the
|
||||||
|
# published SHA256SUMS, installs it to /usr/local/bin, and (on systemd
|
||||||
|
# hosts) drops in a hardened quptime.service that matches the unit
|
||||||
|
# documented in docs/deployment/systemd.md.
|
||||||
|
#
|
||||||
|
# Release sources, tried in order:
|
||||||
|
# 1. Gitea: git.cer.sh/axodouble/quptime/releases (primary — canonical home)
|
||||||
|
# 2. GitHub: github.com/Axodouble/QUptime/releases (push-mirror fallback)
|
||||||
|
#
|
||||||
|
# Idempotent — re-running upgrades the binary and refreshes the unit
|
||||||
|
# without touching the data directory.
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
INSTALL_BIN="/usr/local/bin/qu"
|
INSTALL_BIN="/usr/local/bin/qu"
|
||||||
SERVICE_FILE="/etc/systemd/system/qu-serve.service"
|
SERVICE_FILE="/etc/systemd/system/quptime.service"
|
||||||
SERVICE_USER="${SUDO_USER:-$(whoami)}"
|
SERVICE_NAME="$(basename "$SERVICE_FILE")"
|
||||||
SERVICE_GROUP="$(id -gn "$SERVICE_USER" 2>/dev/null || echo root)"
|
SERVICE_USER="quptime"
|
||||||
|
SERVICE_GROUP="quptime"
|
||||||
|
DATA_DIR="/etc/quptime"
|
||||||
|
|
||||||
|
# Release sources, in preference order. Each row is:
|
||||||
|
# <name>|<latest-release API endpoint>|<release-asset base URL>
|
||||||
|
# The asset URL is concatenated with `/<tag>/<filename>`. Adjust here
|
||||||
|
# if the project moves hosts.
|
||||||
|
SOURCES=(
|
||||||
|
"gitea|https://git.cer.sh/api/v1/repos/axodouble/quptime/releases/latest|https://git.cer.sh/axodouble/quptime/releases/download"
|
||||||
|
"github|https://api.github.com/repos/Axodouble/QUptime/releases/latest|https://github.com/Axodouble/QUptime/releases/download"
|
||||||
|
)
|
||||||
|
|
||||||
fail() {
|
fail() {
|
||||||
echo "Error: $*" >&2
|
echo "Error: $*" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
echo_cmd() {
|
|
||||||
echo -e "\033[90m> $1\033[0m"
|
|
||||||
eval "$1"
|
|
||||||
}
|
|
||||||
|
|
||||||
require_command() {
|
require_command() {
|
||||||
command -v "$1" > /dev/null 2>&1 || fail "$1 is not installed. Please install $1 and try again."
|
command -v "$1" >/dev/null 2>&1 || fail "$1 is not installed. Please install $1 and try again."
|
||||||
}
|
}
|
||||||
|
|
||||||
write_completion() {
|
write_completion() {
|
||||||
@@ -31,52 +50,222 @@ write_completion() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
require_command jq
|
# fetch_from_source tries one release source end-to-end: pulls the
|
||||||
|
# latest tag from its API, downloads the per-arch binary and the
|
||||||
|
# accompanying SHA256SUMS, and verifies the checksum. Returns 0 on
|
||||||
|
# success (with RELEASE and BINARY_NAME set as globals) or 1 if any
|
||||||
|
# step fails — callers can then try the next source. Stderr is kept
|
||||||
|
# quiet so a failed primary doesn't spam the operator before the
|
||||||
|
# fallback is attempted.
|
||||||
|
fetch_from_source() {
|
||||||
|
local api_url=$1
|
||||||
|
local release_base=$2
|
||||||
|
local tmpdir=$3
|
||||||
|
|
||||||
|
local release
|
||||||
|
release=$(curl -fsSL --proto '=https' --tlsv1.2 "$api_url" 2>/dev/null | jq -r '.tag_name' 2>/dev/null) \
|
||||||
|
|| return 1
|
||||||
|
[ -n "$release" ] && [ "$release" != "null" ] || return 1
|
||||||
|
|
||||||
|
local binary_name="qu-${release}-linux-${ARCH}"
|
||||||
|
local binary_url="${release_base}/${release}/${binary_name}"
|
||||||
|
local sums_url="${release_base}/${release}/SHA256SUMS"
|
||||||
|
|
||||||
|
curl -fsSL --proto '=https' --tlsv1.2 -o "$tmpdir/$binary_name" "$binary_url" 2>/dev/null \
|
||||||
|
|| return 1
|
||||||
|
curl -fsSL --proto '=https' --tlsv1.2 -o "$tmpdir/SHA256SUMS" "$sums_url" 2>/dev/null \
|
||||||
|
|| return 1
|
||||||
|
|
||||||
|
# Verify against the SHA256SUMS that came from the same source as
|
||||||
|
# the binary. Never mix sources here — verifying a GitHub-hosted
|
||||||
|
# binary against a Gitea-hosted SHA256SUMS would defeat the
|
||||||
|
# tamper check.
|
||||||
|
(
|
||||||
|
cd "$tmpdir"
|
||||||
|
if ! grep -E "[[:space:]]\\*?${binary_name}\$" SHA256SUMS > expected.sum; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if ! sha256sum -c expected.sum >/dev/null 2>&1; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
) || return 1
|
||||||
|
|
||||||
|
RELEASE="$release"
|
||||||
|
BINARY_NAME="$binary_name"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
require_command curl
|
require_command curl
|
||||||
|
require_command jq
|
||||||
|
require_command sha256sum
|
||||||
|
require_command install
|
||||||
|
require_command mktemp
|
||||||
|
|
||||||
|
# --- target architecture ------------------------------------------------
|
||||||
|
case "$(uname -m)" in
|
||||||
|
x86_64) ARCH=amd64 ;;
|
||||||
|
aarch64|arm64) ARCH=arm64 ;;
|
||||||
|
*) fail "unsupported architecture: $(uname -m). Pre-built binaries are published for amd64 and arm64 only — build from source for other platforms." ;;
|
||||||
|
esac
|
||||||
|
|
||||||
if [ ! -w "$(dirname "$INSTALL_BIN")" ]; then
|
if [ ! -w "$(dirname "$INSTALL_BIN")" ]; then
|
||||||
fail "You are not allowed to write to $(dirname "$INSTALL_BIN"). Run this script with sudo or install qu manually."
|
fail "Cannot write to $(dirname "$INSTALL_BIN"). Run this script with sudo, or set INSTALL_BIN to a writable location."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
RELEASE=$(curl -s https://git.cer.sh/api/v1/repos/axodouble/quptime/releases/latest | jq -r '.tag_name')
|
# --- download + verify (with fallback) ----------------------------------
|
||||||
|
TMPDIR=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$TMPDIR"' EXIT
|
||||||
|
|
||||||
echo_cmd "curl -L -o '$INSTALL_BIN' 'https://git.cer.sh/axodouble/quptime/releases/download/${RELEASE}/qu-${RELEASE}-linux-amd64'"
|
# Globals filled in by fetch_from_source on success.
|
||||||
echo_cmd "chmod +x '$INSTALL_BIN'"
|
RELEASE=""
|
||||||
echo "> qu has been installed to $INSTALL_BIN"
|
BINARY_NAME=""
|
||||||
|
INSTALLED_FROM=""
|
||||||
|
INSTALLED_TMP=""
|
||||||
|
|
||||||
|
for source_spec in "${SOURCES[@]}"; do
|
||||||
|
IFS='|' read -r src_name src_api src_base <<<"$source_spec"
|
||||||
|
src_tmp="$TMPDIR/$src_name"
|
||||||
|
mkdir -p "$src_tmp"
|
||||||
|
echo "> trying release source: $src_name"
|
||||||
|
# `set -e` would abort the whole script the moment fetch_from_source
|
||||||
|
# returns nonzero; we want the loop to fall through to the next
|
||||||
|
# source instead. Wrap the call so a failure is just data.
|
||||||
|
if fetch_from_source "$src_api" "$src_base" "$src_tmp"; then
|
||||||
|
INSTALLED_FROM="$src_name"
|
||||||
|
INSTALLED_TMP="$src_tmp"
|
||||||
|
echo "> $src_name: ${RELEASE} ✓ checksum OK"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
echo "> $src_name: unavailable"
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ -z "$INSTALLED_FROM" ]; then
|
||||||
|
fail "no release source reachable — tried: $(printf '%s ' "${SOURCES[@]%%|*}"). Check network access to git.cer.sh and github.com."
|
||||||
|
fi
|
||||||
|
|
||||||
|
install -m 0755 "$INSTALLED_TMP/$BINARY_NAME" "$INSTALL_BIN"
|
||||||
|
echo "> qu ${RELEASE} installed to $INSTALL_BIN (source: $INSTALLED_FROM)"
|
||||||
|
|
||||||
|
# --- shell completions --------------------------------------------------
|
||||||
if "$INSTALL_BIN" --help 2>/dev/null | grep -q "completion"; then
|
if "$INSTALL_BIN" --help 2>/dev/null | grep -q "completion"; then
|
||||||
write_completion bash /usr/share/bash-completion/completions/qu \
|
write_completion bash /usr/share/bash-completion/completions/qu \
|
||||||
|| write_completion bash /etc/bash_completion.d/qu || true
|
|| write_completion bash /etc/bash_completion.d/qu \
|
||||||
|
|| true
|
||||||
write_completion zsh /usr/share/zsh/site-functions/_qu || true
|
write_completion zsh /usr/share/zsh/site-functions/_qu || true
|
||||||
write_completion fish /usr/share/fish/vendor_completions.d/qu.fish || true
|
write_completion fish /usr/share/fish/vendor_completions.d/qu.fish || true
|
||||||
else
|
else
|
||||||
echo "> qu does not expose completion support; skipping shell completion installation."
|
echo "> qu does not expose completion support; skipping shell completion installation."
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! command -v systemctl > /dev/null 2>&1; then
|
# --- systemd unit -------------------------------------------------------
|
||||||
echo "> Warning: systemd is not available on this system. qu serve will not be automatically started on boot."
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
echo "Installation complete, before starting qu serve, make sure to run qu init and read the documentation."
|
echo
|
||||||
|
echo "> systemd is not available on this system. Installation stops here."
|
||||||
|
echo "> Run \`qu serve\` manually (or wire it into the supervisor of your choice)."
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "> Creating systemd service file for qu serve..."
|
# Dedicated service user. Hardened unit drops all capabilities and
|
||||||
cat > "$SERVICE_FILE" <<EOL
|
# locks the daemon down with ProtectSystem=strict, so it must run as
|
||||||
|
# its own unprivileged account rather than the invoking sudo user.
|
||||||
|
if ! id "$SERVICE_USER" >/dev/null 2>&1; then
|
||||||
|
echo "> creating system user $SERVICE_USER"
|
||||||
|
useradd --system --no-create-home --shell /usr/sbin/nologin "$SERVICE_USER"
|
||||||
|
fi
|
||||||
|
|
||||||
|
install -d -o "$SERVICE_USER" -g "$SERVICE_GROUP" -m 0750 "$DATA_DIR"
|
||||||
|
|
||||||
|
echo "> writing $SERVICE_FILE"
|
||||||
|
cat > "$SERVICE_FILE" <<'EOF'
|
||||||
[Unit]
|
[Unit]
|
||||||
Description=QUptime Serve
|
Description=QUptime distributed uptime monitor
|
||||||
After=network.target
|
Documentation=https://git.cer.sh/axodouble/quptime
|
||||||
|
Wants=network-online.target
|
||||||
|
After=network-online.target
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
ExecStart=$INSTALL_BIN serve
|
Type=simple
|
||||||
|
ExecStart=/usr/local/bin/qu serve
|
||||||
Restart=always
|
Restart=always
|
||||||
User=$SERVICE_USER
|
RestartSec=5s
|
||||||
Group=$SERVICE_GROUP
|
|
||||||
|
User=quptime
|
||||||
|
Group=quptime
|
||||||
|
|
||||||
|
# Where state lives. RuntimeDirectory creates /var/run/quptime/ each
|
||||||
|
# boot owned by User:Group with mode 0750.
|
||||||
|
Environment=QUPTIME_DIR=/etc/quptime
|
||||||
|
RuntimeDirectory=quptime
|
||||||
|
RuntimeDirectoryMode=0750
|
||||||
|
ReadWritePaths=/etc/quptime /var/run/quptime
|
||||||
|
|
||||||
|
# Hardening. Comment out individual directives if a probe needs
|
||||||
|
# something we've revoked.
|
||||||
|
NoNewPrivileges=true
|
||||||
|
ProtectSystem=strict
|
||||||
|
ProtectHome=true
|
||||||
|
PrivateTmp=true
|
||||||
|
PrivateDevices=true
|
||||||
|
ProtectKernelTunables=true
|
||||||
|
ProtectKernelModules=true
|
||||||
|
ProtectControlGroups=true
|
||||||
|
ProtectClock=true
|
||||||
|
ProtectHostname=true
|
||||||
|
RestrictNamespaces=true
|
||||||
|
RestrictRealtime=true
|
||||||
|
RestrictSUIDSGID=true
|
||||||
|
LockPersonality=true
|
||||||
|
MemoryDenyWriteExecute=true
|
||||||
|
|
||||||
|
# Network access is required (we're a network monitor). Keep address
|
||||||
|
# families minimal — AF_NETLINK is needed for some libc lookups.
|
||||||
|
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK
|
||||||
|
|
||||||
|
# If you need raw ICMP, *also* uncomment:
|
||||||
|
# AmbientCapabilities=CAP_NET_RAW
|
||||||
|
# CapabilityBoundingSet=CAP_NET_RAW
|
||||||
|
# Otherwise drop all capabilities:
|
||||||
|
CapabilityBoundingSet=
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
EOL
|
EOF
|
||||||
|
|
||||||
echo_cmd "systemctl daemon-reload"
|
systemctl daemon-reload
|
||||||
echo_cmd "systemctl enable $(basename "$SERVICE_FILE")"
|
systemctl enable "$SERVICE_NAME" >/dev/null
|
||||||
echo "> qu serve service has been created and enabled. You can start it with 'systemctl start $(basename "$SERVICE_FILE")'"
|
echo "> ${SERVICE_NAME} installed and enabled (not yet started)"
|
||||||
|
|
||||||
echo "Installation complete, before starting qu serve, make sure to run qu init and read the documentation."
|
cat <<EOF
|
||||||
|
|
||||||
|
Installation complete.
|
||||||
|
|
||||||
|
Next steps:
|
||||||
|
|
||||||
|
1. Initialise the node identity. Either:
|
||||||
|
|
||||||
|
a) Let \`qu serve\` auto-init from environment variables.
|
||||||
|
Drop a systemd override like:
|
||||||
|
|
||||||
|
sudo systemctl edit ${SERVICE_NAME}
|
||||||
|
[Service]
|
||||||
|
Environment=QUPTIME_ADVERTISE=<this-host>:9901
|
||||||
|
# On follower nodes, also set the shared join secret:
|
||||||
|
# Environment=QUPTIME_CLUSTER_SECRET=<paste from first node>
|
||||||
|
|
||||||
|
b) Or run \`qu init\` once explicitly:
|
||||||
|
|
||||||
|
sudo -u ${SERVICE_USER} QUPTIME_DIR=${DATA_DIR} \\
|
||||||
|
qu init --advertise <this-host>:9901
|
||||||
|
|
||||||
|
2. Start the service:
|
||||||
|
|
||||||
|
sudo systemctl start ${SERVICE_NAME}
|
||||||
|
sudo -u ${SERVICE_USER} qu status
|
||||||
|
|
||||||
|
3. For ICMP checks, the daemon defaults to unprivileged UDP-mode
|
||||||
|
pings — those need the ping_group_range sysctl widened to include
|
||||||
|
the ${SERVICE_USER} GID, or grant CAP_NET_RAW in the unit. See
|
||||||
|
docs/deployment/systemd.md for the recipes.
|
||||||
|
|
||||||
|
Full documentation: https://git.cer.sh/axodouble/quptime/src/branch/master/docs
|
||||||
|
EOF
|
||||||
|
|||||||
+121
-60
@@ -5,6 +5,7 @@ import (
|
|||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
@@ -30,78 +31,50 @@ Pass --secret on every subsequent node so they share the same
|
|||||||
cluster join secret. If --secret is omitted on the very first node, a
|
cluster join secret. If --secret is omitted on the very first node, a
|
||||||
random secret is generated and printed for the operator to copy.
|
random secret is generated and printed for the operator to copy.
|
||||||
|
|
||||||
|
Every flag may also be supplied via its QUPTIME_* environment variable
|
||||||
|
(see docs/configuration.md). Explicit flags win over env values, which
|
||||||
|
in turn win over the compiled defaults.
|
||||||
|
|
||||||
Idempotent in one direction only: existing key material is never
|
Idempotent in one direction only: existing key material is never
|
||||||
overwritten. Re-run only after wiping the data directory.`,
|
overwritten. Re-run only after wiping the data directory.`,
|
||||||
RunE: func(cmd *cobra.Command, args []string) error {
|
RunE: func(cmd *cobra.Command, args []string) error {
|
||||||
if err := config.EnsureDataDir(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if _, err := os.Stat(config.NodeFilePath()); err == nil {
|
if _, err := os.Stat(config.NodeFilePath()); err == nil {
|
||||||
return errors.New("node.yaml already exists in data dir — refusing to overwrite")
|
return errors.New("node.yaml already exists in data dir — refusing to overwrite")
|
||||||
}
|
}
|
||||||
|
|
||||||
secret := clusterSecret
|
// Only let env fill fields the operator did NOT pass on the
|
||||||
generated := false
|
// command line; explicit flags must win over env.
|
||||||
if secret == "" {
|
n := &config.NodeConfig{}
|
||||||
s, err := generateSecret()
|
if cmd.Flags().Changed("bind") {
|
||||||
if err != nil {
|
n.BindAddr = bindAddr
|
||||||
return fmt.Errorf("generate cluster secret: %w", err)
|
|
||||||
}
|
}
|
||||||
secret = s
|
if cmd.Flags().Changed("port") {
|
||||||
generated = true
|
n.BindPort = bindPort
|
||||||
|
}
|
||||||
|
if cmd.Flags().Changed("advertise") {
|
||||||
|
n.Advertise = advertise
|
||||||
|
}
|
||||||
|
if cmd.Flags().Changed("secret") {
|
||||||
|
n.ClusterSecret = clusterSecret
|
||||||
|
}
|
||||||
|
if err := n.ApplyEnvOverrides(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Cobra defaults (bind=0.0.0.0, port=9901) are still
|
||||||
|
// available as fallbacks for fields neither flag nor env
|
||||||
|
// touched.
|
||||||
|
if n.BindAddr == "" {
|
||||||
|
n.BindAddr = bindAddr
|
||||||
|
}
|
||||||
|
if n.BindPort == 0 {
|
||||||
|
n.BindPort = bindPort
|
||||||
}
|
}
|
||||||
|
|
||||||
nodeID := uuid.NewString()
|
_, generated, err := bootstrapNode(n)
|
||||||
n := &config.NodeConfig{
|
|
||||||
NodeID: nodeID,
|
|
||||||
BindAddr: bindAddr,
|
|
||||||
BindPort: bindPort,
|
|
||||||
Advertise: advertise,
|
|
||||||
ClusterSecret: secret,
|
|
||||||
}
|
|
||||||
if err := n.Save(); err != nil {
|
|
||||||
return fmt.Errorf("save node.yaml: %w", err)
|
|
||||||
}
|
|
||||||
if _, err := crypto.GenerateKeyPair(nodeID); err != nil {
|
|
||||||
return fmt.Errorf("generate keys: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Seed cluster.yaml with this node as its own first peer.
|
|
||||||
// Without this the math in `quorum` would treat a one-node
|
|
||||||
// cluster as "0 peers, fallback quorum=1, master=self" —
|
|
||||||
// which works in isolation but breaks the moment another
|
|
||||||
// node joins, because the replicated peers list would lack
|
|
||||||
// the inviter, leading to split-brain elections.
|
|
||||||
certPEM, err := crypto.LoadCertPEM()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("load cert: %w", err)
|
return err
|
||||||
}
|
|
||||||
fp, err := crypto.FingerprintFromCertPEM(certPEM)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("fingerprint own cert: %w", err)
|
|
||||||
}
|
|
||||||
cluster := &config.ClusterConfig{}
|
|
||||||
if err := cluster.Mutate(nodeID, func(c *config.ClusterConfig) error {
|
|
||||||
c.Peers = []config.PeerInfo{{
|
|
||||||
NodeID: nodeID,
|
|
||||||
Advertise: n.AdvertiseAddr(),
|
|
||||||
Fingerprint: fp,
|
|
||||||
CertPEM: string(certPEM),
|
|
||||||
}}
|
|
||||||
return nil
|
|
||||||
}); err != nil {
|
|
||||||
return fmt.Errorf("seed cluster.yaml: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
out := cmd.OutOrStdout()
|
|
||||||
fmt.Fprintf(out, "initialised node %s\n", nodeID)
|
|
||||||
fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
|
|
||||||
fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())
|
|
||||||
if generated {
|
|
||||||
fmt.Fprintln(out)
|
|
||||||
fmt.Fprintln(out, "cluster secret (copy to every other node via --secret):")
|
|
||||||
fmt.Fprintln(out, " "+secret)
|
|
||||||
}
|
}
|
||||||
|
printBootstrapResult(cmd.OutOrStdout(), n, generated)
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -112,6 +85,94 @@ overwritten. Re-run only after wiping the data directory.`,
|
|||||||
root.AddCommand(cmd)
|
root.AddCommand(cmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bootstrapNode creates the data dir, writes node.yaml, generates the
|
||||||
|
// keypair, and seeds cluster.yaml with this node as its own first
|
||||||
|
// peer. cfg may arrive with any subset of fields populated; missing
|
||||||
|
// NodeID and ClusterSecret are auto-generated, missing BindAddr /
|
||||||
|
// BindPort get the compiled defaults.
|
||||||
|
//
|
||||||
|
// Returns the populated config (the same pointer that was passed in)
|
||||||
|
// and a flag indicating whether ClusterSecret was generated here. The
|
||||||
|
// flag exists so the caller can print the secret for the operator —
|
||||||
|
// it must be copied to every follower node out-of-band.
|
||||||
|
//
|
||||||
|
// Caller is responsible for checking that node.yaml does not yet
|
||||||
|
// exist; bootstrapNode itself will refuse to overwrite an existing
|
||||||
|
// keypair (crypto.GenerateKeyPair errors out) but does not guard
|
||||||
|
// against clobbering node.yaml.
|
||||||
|
func bootstrapNode(cfg *config.NodeConfig) (*config.NodeConfig, bool, error) {
|
||||||
|
if err := config.EnsureDataDir(); err != nil {
|
||||||
|
return nil, false, err
|
||||||
|
}
|
||||||
|
if cfg.NodeID == "" {
|
||||||
|
cfg.NodeID = uuid.NewString()
|
||||||
|
}
|
||||||
|
if cfg.BindAddr == "" {
|
||||||
|
cfg.BindAddr = "0.0.0.0"
|
||||||
|
}
|
||||||
|
if cfg.BindPort == 0 {
|
||||||
|
cfg.BindPort = 9901
|
||||||
|
}
|
||||||
|
generated := false
|
||||||
|
if cfg.ClusterSecret == "" {
|
||||||
|
s, err := generateSecret()
|
||||||
|
if err != nil {
|
||||||
|
return nil, false, fmt.Errorf("generate cluster secret: %w", err)
|
||||||
|
}
|
||||||
|
cfg.ClusterSecret = s
|
||||||
|
generated = true
|
||||||
|
}
|
||||||
|
if err := cfg.Save(); err != nil {
|
||||||
|
return nil, false, fmt.Errorf("save node.yaml: %w", err)
|
||||||
|
}
|
||||||
|
if _, err := crypto.GenerateKeyPair(cfg.NodeID); err != nil {
|
||||||
|
return nil, false, fmt.Errorf("generate keys: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seed cluster.yaml with this node as its own first peer.
|
||||||
|
// Without this the math in `quorum` would treat a one-node
|
||||||
|
// cluster as "0 peers, fallback quorum=1, master=self" — which
|
||||||
|
// works in isolation but breaks the moment another node joins,
|
||||||
|
// because the replicated peers list would lack the inviter,
|
||||||
|
// leading to split-brain elections.
|
||||||
|
certPEM, err := crypto.LoadCertPEM()
|
||||||
|
if err != nil {
|
||||||
|
return nil, false, fmt.Errorf("load cert: %w", err)
|
||||||
|
}
|
||||||
|
fp, err := crypto.FingerprintFromCertPEM(certPEM)
|
||||||
|
if err != nil {
|
||||||
|
return nil, false, fmt.Errorf("fingerprint own cert: %w", err)
|
||||||
|
}
|
||||||
|
cluster := &config.ClusterConfig{}
|
||||||
|
if err := cluster.Mutate(cfg.NodeID, func(c *config.ClusterConfig) error {
|
||||||
|
c.Peers = []config.PeerInfo{{
|
||||||
|
NodeID: cfg.NodeID,
|
||||||
|
Advertise: cfg.AdvertiseAddr(),
|
||||||
|
Fingerprint: fp,
|
||||||
|
CertPEM: string(certPEM),
|
||||||
|
}}
|
||||||
|
return nil
|
||||||
|
}); err != nil {
|
||||||
|
return nil, false, fmt.Errorf("seed cluster.yaml: %w", err)
|
||||||
|
}
|
||||||
|
return cfg, generated, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// printBootstrapResult emits the human-readable summary both `qu init`
|
||||||
|
// and the serve auto-init path print after bootstrapping. Kept in one
|
||||||
|
// place so the secret-disclosure format stays identical across the two
|
||||||
|
// entry points.
|
||||||
|
func printBootstrapResult(out io.Writer, n *config.NodeConfig, secretGenerated bool) {
|
||||||
|
fmt.Fprintf(out, "initialised node %s\n", n.NodeID)
|
||||||
|
fmt.Fprintf(out, "data dir: %s\n", config.DataDir())
|
||||||
|
fmt.Fprintf(out, "advertise: %s\n", n.AdvertiseAddr())
|
||||||
|
if secretGenerated {
|
||||||
|
fmt.Fprintln(out)
|
||||||
|
fmt.Fprintln(out, "cluster secret (copy to every other node via --secret or QUPTIME_CLUSTER_SECRET):")
|
||||||
|
fmt.Fprintln(out, " "+n.ClusterSecret)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// generateSecret produces 32 bytes of crypto-random data and returns
|
// generateSecret produces 32 bytes of crypto-random data and returns
|
||||||
// it base64-encoded. Long enough that brute force isn't a concern;
|
// it base64-encoded. Long enough that brute force isn't a concern;
|
||||||
// short enough that operators can copy-paste it without pagination.
|
// short enough that operators can copy-paste it without pagination.
|
||||||
|
|||||||
+50
-1
@@ -2,6 +2,9 @@ package cli
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io/fs"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
@@ -9,6 +12,7 @@ import (
|
|||||||
|
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
|
|
||||||
|
"git.cer.sh/axodouble/quptime/internal/config"
|
||||||
"git.cer.sh/axodouble/quptime/internal/daemon"
|
"git.cer.sh/axodouble/quptime/internal/daemon"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -18,9 +22,18 @@ func addServeCmd(root *cobra.Command) {
|
|||||||
Short: "Run the qu daemon in the foreground",
|
Short: "Run the qu daemon in the foreground",
|
||||||
Long: `Run the qu daemon: starts the inter-node listener, the local
|
Long: `Run the qu daemon: starts the inter-node listener, the local
|
||||||
control socket for the CLI, the heartbeat loop and the check
|
control socket for the CLI, the heartbeat loop and the check
|
||||||
scheduler. Stops cleanly on SIGINT or SIGTERM.`,
|
scheduler. Stops cleanly on SIGINT or SIGTERM.
|
||||||
|
|
||||||
|
If node.yaml does not exist yet, serve will bootstrap it using values
|
||||||
|
from the QUPTIME_* environment variables (see docs/configuration.md).
|
||||||
|
This makes a single ` + "`docker compose up`" + ` enough to launch a new node —
|
||||||
|
no separate ` + "`qu init`" + ` step is required when the data volume is
|
||||||
|
fresh.`,
|
||||||
RunE: func(cmd *cobra.Command, args []string) error {
|
RunE: func(cmd *cobra.Command, args []string) error {
|
||||||
logger := log.New(os.Stderr, "quptime: ", log.LstdFlags|log.Lmsgprefix)
|
logger := log.New(os.Stderr, "quptime: ", log.LstdFlags|log.Lmsgprefix)
|
||||||
|
if err := autoInitIfNeeded(cmd, logger); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
d, err := daemon.New(logger)
|
d, err := daemon.New(logger)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -32,3 +45,39 @@ scheduler. Stops cleanly on SIGINT or SIGTERM.`,
|
|||||||
}
|
}
|
||||||
root.AddCommand(cmd)
|
root.AddCommand(cmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// autoInitIfNeeded bootstraps the node on first launch.
|
||||||
|
//
|
||||||
|
// Friction this removes for container deploys: before, the operator
|
||||||
|
// had to `docker compose run --rm quptime init …` once before the
|
||||||
|
// service could come up, which makes `restart: unless-stopped`
|
||||||
|
// awkward and forces an out-of-band step into every fresh volume.
|
||||||
|
// Now serve auto-runs the same bootstrap path using QUPTIME_* env
|
||||||
|
// vars when node.yaml is absent, so the compose file can come up on
|
||||||
|
// the first try.
|
||||||
|
//
|
||||||
|
// Pre-existing node.yaml is left untouched — we only bootstrap when
|
||||||
|
// the file is genuinely missing. Any other stat error (permission
|
||||||
|
// denied, broken symlink) is surfaced so the operator sees the real
|
||||||
|
// problem instead of a confused auto-init attempt clobbering state.
|
||||||
|
func autoInitIfNeeded(cmd *cobra.Command, logger *log.Logger) error {
|
||||||
|
_, err := os.Stat(config.NodeFilePath())
|
||||||
|
if err == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if !errors.Is(err, fs.ErrNotExist) {
|
||||||
|
return fmt.Errorf("stat node.yaml: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.Printf("node.yaml not found at %s — bootstrapping from environment", config.NodeFilePath())
|
||||||
|
n := &config.NodeConfig{}
|
||||||
|
if err := n.ApplyEnvOverrides(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, generated, err := bootstrapNode(n); err != nil {
|
||||||
|
return fmt.Errorf("auto-init: %w", err)
|
||||||
|
} else {
|
||||||
|
printBootstrapResult(cmd.OutOrStderr(), n, generated)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,10 +3,26 @@ package config
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
"gopkg.in/yaml.v3"
|
"gopkg.in/yaml.v3"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Environment variable names that override fields on NodeConfig at
|
||||||
|
// load time. Intended to let `docker compose` setups drive a node's
|
||||||
|
// identity and listener configuration without having to bake a
|
||||||
|
// node.yaml into the image or run `qu init` manually first.
|
||||||
|
//
|
||||||
|
// Empty values are ignored — they do not clear a field. The override
|
||||||
|
// order is therefore: env (non-empty) > file > compiled default.
|
||||||
|
const (
|
||||||
|
EnvNodeID = "QUPTIME_NODE_ID"
|
||||||
|
EnvBindAddr = "QUPTIME_BIND_ADDR"
|
||||||
|
EnvBindPort = "QUPTIME_BIND_PORT"
|
||||||
|
EnvAdvertise = "QUPTIME_ADVERTISE"
|
||||||
|
EnvClusterSecret = "QUPTIME_CLUSTER_SECRET"
|
||||||
|
)
|
||||||
|
|
||||||
// NodeConfig is the per-node, never-replicated identity file.
|
// NodeConfig is the per-node, never-replicated identity file.
|
||||||
type NodeConfig struct {
|
type NodeConfig struct {
|
||||||
// NodeID is a stable UUID generated at `qu init`. Used by all peers
|
// NodeID is a stable UUID generated at `qu init`. Used by all peers
|
||||||
@@ -45,6 +61,34 @@ func (n *NodeConfig) AdvertiseAddr() string {
|
|||||||
return fmt.Sprintf("%s:%d", bind, n.BindPort)
|
return fmt.Sprintf("%s:%d", bind, n.BindPort)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ApplyEnvOverrides folds QUPTIME_* environment variables onto n.
|
||||||
|
// Non-empty env values win over the existing field value. Called both
|
||||||
|
// by LoadNodeConfig and by the `qu init` / serve auto-init paths so
|
||||||
|
// the same precedence rules apply whether the daemon is reading a
|
||||||
|
// persisted node.yaml or constructing one from scratch.
|
||||||
|
func (n *NodeConfig) ApplyEnvOverrides() error {
|
||||||
|
if v := os.Getenv(EnvNodeID); v != "" {
|
||||||
|
n.NodeID = v
|
||||||
|
}
|
||||||
|
if v := os.Getenv(EnvBindAddr); v != "" {
|
||||||
|
n.BindAddr = v
|
||||||
|
}
|
||||||
|
if v := os.Getenv(EnvBindPort); v != "" {
|
||||||
|
p, err := strconv.Atoi(v)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("%s=%q: not an integer: %w", EnvBindPort, v, err)
|
||||||
|
}
|
||||||
|
n.BindPort = p
|
||||||
|
}
|
||||||
|
if v := os.Getenv(EnvAdvertise); v != "" {
|
||||||
|
n.Advertise = v
|
||||||
|
}
|
||||||
|
if v := os.Getenv(EnvClusterSecret); v != "" {
|
||||||
|
n.ClusterSecret = v
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// LoadNodeConfig reads node.yaml from the data dir.
|
// LoadNodeConfig reads node.yaml from the data dir.
|
||||||
func LoadNodeConfig() (*NodeConfig, error) {
|
func LoadNodeConfig() (*NodeConfig, error) {
|
||||||
raw, err := os.ReadFile(NodeFilePath())
|
raw, err := os.ReadFile(NodeFilePath())
|
||||||
@@ -55,6 +99,9 @@ func LoadNodeConfig() (*NodeConfig, error) {
|
|||||||
if err := yaml.Unmarshal(raw, cfg); err != nil {
|
if err := yaml.Unmarshal(raw, cfg); err != nil {
|
||||||
return nil, fmt.Errorf("parse node.yaml: %w", err)
|
return nil, fmt.Errorf("parse node.yaml: %w", err)
|
||||||
}
|
}
|
||||||
|
if err := cfg.ApplyEnvOverrides(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
if cfg.BindPort == 0 {
|
if cfg.BindPort == 0 {
|
||||||
cfg.BindPort = 9901
|
cfg.BindPort = 9901
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -56,3 +56,95 @@ func TestLoadNodeConfigAppliesDefaults(t *testing.T) {
|
|||||||
t.Errorf("BindAddr=%q want 0.0.0.0", loaded.BindAddr)
|
t.Errorf("BindAddr=%q want 0.0.0.0", loaded.BindAddr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyEnvOverrides(t *testing.T) {
|
||||||
|
t.Setenv(EnvNodeID, "node-from-env")
|
||||||
|
t.Setenv(EnvBindAddr, "1.2.3.4")
|
||||||
|
t.Setenv(EnvBindPort, "9999")
|
||||||
|
t.Setenv(EnvAdvertise, "public.example.com:9999")
|
||||||
|
t.Setenv(EnvClusterSecret, "shh-secret")
|
||||||
|
|
||||||
|
n := &NodeConfig{
|
||||||
|
NodeID: "original-id",
|
||||||
|
BindAddr: "0.0.0.0",
|
||||||
|
BindPort: 9901,
|
||||||
|
Advertise: "old.example.com:9901",
|
||||||
|
ClusterSecret: "old-secret",
|
||||||
|
}
|
||||||
|
if err := n.ApplyEnvOverrides(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
want := NodeConfig{
|
||||||
|
NodeID: "node-from-env",
|
||||||
|
BindAddr: "1.2.3.4",
|
||||||
|
BindPort: 9999,
|
||||||
|
Advertise: "public.example.com:9999",
|
||||||
|
ClusterSecret: "shh-secret",
|
||||||
|
}
|
||||||
|
if *n != want {
|
||||||
|
t.Errorf("got %+v want %+v", *n, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyEnvOverridesEmptyValuesIgnored(t *testing.T) {
|
||||||
|
// Explicitly empty env vars must NOT clobber existing fields —
|
||||||
|
// otherwise `docker run -e QUPTIME_ADVERTISE=` would silently
|
||||||
|
// erase a previously-persisted advertise address.
|
||||||
|
t.Setenv(EnvNodeID, "")
|
||||||
|
t.Setenv(EnvBindAddr, "")
|
||||||
|
t.Setenv(EnvBindPort, "")
|
||||||
|
t.Setenv(EnvAdvertise, "")
|
||||||
|
t.Setenv(EnvClusterSecret, "")
|
||||||
|
|
||||||
|
orig := NodeConfig{
|
||||||
|
NodeID: "keep-me",
|
||||||
|
BindAddr: "10.0.0.1",
|
||||||
|
BindPort: 9901,
|
||||||
|
Advertise: "keep.example.com:9901",
|
||||||
|
ClusterSecret: "keep-secret",
|
||||||
|
}
|
||||||
|
n := orig
|
||||||
|
if err := n.ApplyEnvOverrides(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if n != orig {
|
||||||
|
t.Errorf("empty env vars mutated config: got %+v want %+v", n, orig)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyEnvOverridesBadPort(t *testing.T) {
|
||||||
|
t.Setenv(EnvBindPort, "not-an-int")
|
||||||
|
n := &NodeConfig{}
|
||||||
|
if err := n.ApplyEnvOverrides(); err == nil {
|
||||||
|
t.Fatal("expected error for non-integer port")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadNodeConfigEnvOverridesFile(t *testing.T) {
|
||||||
|
t.Setenv("QUPTIME_DIR", t.TempDir())
|
||||||
|
// Persist a file with one bind addr; env should win on load.
|
||||||
|
n := &NodeConfig{NodeID: "abc", BindAddr: "127.0.0.1", BindPort: 9901, Advertise: "file.example.com:9901"}
|
||||||
|
if err := n.Save(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Setenv(EnvBindAddr, "0.0.0.0")
|
||||||
|
t.Setenv(EnvAdvertise, "env.example.com:9001")
|
||||||
|
t.Setenv(EnvBindPort, "9001")
|
||||||
|
|
||||||
|
loaded, err := LoadNodeConfig()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if loaded.BindAddr != "0.0.0.0" {
|
||||||
|
t.Errorf("BindAddr=%q want 0.0.0.0 (env override)", loaded.BindAddr)
|
||||||
|
}
|
||||||
|
if loaded.BindPort != 9001 {
|
||||||
|
t.Errorf("BindPort=%d want 9001 (env override)", loaded.BindPort)
|
||||||
|
}
|
||||||
|
if loaded.Advertise != "env.example.com:9001" {
|
||||||
|
t.Errorf("Advertise=%q want env.example.com:9001 (env override)", loaded.Advertise)
|
||||||
|
}
|
||||||
|
if loaded.NodeID != "abc" {
|
||||||
|
t.Errorf("NodeID=%q want abc (unchanged)", loaded.NodeID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
+127
-29
@@ -7,6 +7,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/charmbracelet/bubbles/textarea"
|
||||||
"github.com/charmbracelet/bubbles/textinput"
|
"github.com/charmbracelet/bubbles/textinput"
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
tea "github.com/charmbracelet/bubbletea"
|
||||||
"github.com/charmbracelet/lipgloss"
|
"github.com/charmbracelet/lipgloss"
|
||||||
@@ -53,10 +54,45 @@ func modalDoneCmd(flash string, level flashLevel) tea.Cmd {
|
|||||||
type formField struct {
|
type formField struct {
|
||||||
label string
|
label string
|
||||||
input textinput.Model
|
input textinput.Model
|
||||||
|
textarea textarea.Model
|
||||||
|
multiline bool
|
||||||
required bool
|
required bool
|
||||||
hint string
|
hint string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// value returns the field's current text regardless of whether it's
|
||||||
|
// backed by a single-line input or a multiline textarea.
|
||||||
|
func (fld *formField) value() string {
|
||||||
|
if fld.multiline {
|
||||||
|
return fld.textarea.Value()
|
||||||
|
}
|
||||||
|
return fld.input.Value()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (fld *formField) focus() {
|
||||||
|
if fld.multiline {
|
||||||
|
fld.textarea.Focus()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fld.input.Focus()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (fld *formField) blur() {
|
||||||
|
if fld.multiline {
|
||||||
|
fld.textarea.Blur()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fld.input.Blur()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (fld *formField) setWidth(w int) {
|
||||||
|
if fld.multiline {
|
||||||
|
fld.textarea.SetWidth(w)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fld.input.Width = w
|
||||||
|
}
|
||||||
|
|
||||||
type form struct {
|
type form struct {
|
||||||
title string
|
title string
|
||||||
fields []formField
|
fields []formField
|
||||||
@@ -86,12 +122,14 @@ func fieldWidthFor(termWidth int) int {
|
|||||||
|
|
||||||
func newForm(title string, fields []formField, submit func([]string) tea.Cmd) *form {
|
func newForm(title string, fields []formField, submit func([]string) tea.Cmd) *form {
|
||||||
for i := range fields {
|
for i := range fields {
|
||||||
|
if !fields[i].multiline {
|
||||||
fields[i].input.Prompt = ""
|
fields[i].input.Prompt = ""
|
||||||
fields[i].input.CharLimit = 256
|
fields[i].input.CharLimit = 256
|
||||||
|
}
|
||||||
if i == 0 {
|
if i == 0 {
|
||||||
fields[i].input.Focus()
|
fields[i].focus()
|
||||||
} else {
|
} else {
|
||||||
fields[i].input.Blur()
|
fields[i].blur()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return &form{title: title, fields: fields, submit: submit}
|
return &form{title: title, fields: fields, submit: submit}
|
||||||
@@ -114,6 +152,31 @@ func textFieldWithValue(label, hint, value string, required bool) formField {
|
|||||||
return formField{label: label, hint: hint, required: required, input: ti}
|
return formField{label: label, hint: hint, required: required, input: ti}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// textAreaField creates a multiline field. Enter inserts a newline;
|
||||||
|
// the form uses shift+enter / ctrl+s to submit when the cursor is on
|
||||||
|
// one of these. Useful for things like alert body templates where the
|
||||||
|
// rendered message naturally spans multiple lines.
|
||||||
|
func textAreaField(label, hint string, required bool) formField {
|
||||||
|
return textAreaFieldWithValue(label, hint, "", required)
|
||||||
|
}
|
||||||
|
|
||||||
|
func textAreaFieldWithValue(label, hint, value string, required bool) formField {
|
||||||
|
ta := textarea.New()
|
||||||
|
ta.Placeholder = hint
|
||||||
|
ta.ShowLineNumbers = false
|
||||||
|
ta.Prompt = " "
|
||||||
|
ta.SetHeight(5)
|
||||||
|
ta.SetWidth(defaultFieldWidth)
|
||||||
|
ta.CharLimit = 0
|
||||||
|
// Keep enter bound to "insert newline" (the textarea default) — the
|
||||||
|
// surrounding form intercepts enter on single-line fields and handles
|
||||||
|
// shift+enter/ctrl+s as the submit/advance trigger for multiline ones.
|
||||||
|
if value != "" {
|
||||||
|
ta.SetValue(value)
|
||||||
|
}
|
||||||
|
return formField{label: label, hint: hint, required: required, multiline: true, textarea: ta}
|
||||||
|
}
|
||||||
|
|
||||||
func passwordField(label, hint string) formField {
|
func passwordField(label, hint string) formField {
|
||||||
return passwordFieldWithValue(label, hint, "")
|
return passwordFieldWithValue(label, hint, "")
|
||||||
}
|
}
|
||||||
@@ -146,7 +209,11 @@ func (f *form) View() string {
|
|||||||
labelStyle = lipgloss.NewStyle().Foreground(colorAccent).Bold(true)
|
labelStyle = lipgloss.NewStyle().Foreground(colorAccent).Bold(true)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "%s%s\n", marker, labelStyle.Render(fld.label))
|
fmt.Fprintf(&b, "%s%s\n", marker, labelStyle.Render(fld.label))
|
||||||
|
if fld.multiline {
|
||||||
|
fmt.Fprintf(&b, "%s\n", fld.textarea.View())
|
||||||
|
} else {
|
||||||
fmt.Fprintf(&b, " %s\n", fld.input.View())
|
fmt.Fprintf(&b, " %s\n", fld.input.View())
|
||||||
|
}
|
||||||
if i == f.cursor && fld.hint != "" {
|
if i == f.cursor && fld.hint != "" {
|
||||||
fmt.Fprintf(&b, " %s\n", helpStyle.Render(fld.hint))
|
fmt.Fprintf(&b, " %s\n", helpStyle.Render(fld.hint))
|
||||||
}
|
}
|
||||||
@@ -158,7 +225,11 @@ func (f *form) View() string {
|
|||||||
if f.busy {
|
if f.busy {
|
||||||
fmt.Fprintf(&b, "%s\n", flashWarnStyle.Render("working…"))
|
fmt.Fprintf(&b, "%s\n", flashWarnStyle.Render("working…"))
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, "%s\n", helpStyle.Render("↑↓ field enter next/submit esc cancel"))
|
help := "↑↓ field enter next/submit esc cancel"
|
||||||
|
if f.cursor < len(f.fields) && f.fields[f.cursor].multiline {
|
||||||
|
help = "tab field enter newline shift+enter/ctrl+s submit esc cancel"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s\n", helpStyle.Render(help))
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
@@ -169,7 +240,7 @@ func (f *form) Update(msg tea.Msg) (modal, tea.Cmd) {
|
|||||||
f.width = msg.Width
|
f.width = msg.Width
|
||||||
w := fieldWidthFor(msg.Width)
|
w := fieldWidthFor(msg.Width)
|
||||||
for i := range f.fields {
|
for i := range f.fields {
|
||||||
f.fields[i].input.Width = w
|
f.fields[i].setWidth(w)
|
||||||
}
|
}
|
||||||
return f, nil
|
return f, nil
|
||||||
|
|
||||||
@@ -179,43 +250,74 @@ func (f *form) Update(msg tea.Msg) (modal, tea.Cmd) {
|
|||||||
return f, nil
|
return f, nil
|
||||||
|
|
||||||
case tea.KeyMsg:
|
case tea.KeyMsg:
|
||||||
switch msg.String() {
|
key := msg.String()
|
||||||
|
// up/down on a multiline field belong to in-text navigation;
|
||||||
|
// leave field-switching to tab/shift+tab there. Same for enter:
|
||||||
|
// the textarea owns it as "insert newline", so submission moves
|
||||||
|
// to shift+enter / ctrl+s.
|
||||||
|
multiline := f.cursor < len(f.fields) && f.fields[f.cursor].multiline
|
||||||
|
switch key {
|
||||||
case "esc":
|
case "esc":
|
||||||
return f, modalDoneCmd("", flashInfo)
|
return f, modalDoneCmd("", flashInfo)
|
||||||
case "tab", "down":
|
case "tab":
|
||||||
f.advance(1)
|
f.advance(1)
|
||||||
return f, nil
|
return f, nil
|
||||||
case "shift+tab", "up":
|
case "shift+tab":
|
||||||
f.advance(-1)
|
f.advance(-1)
|
||||||
return f, nil
|
return f, nil
|
||||||
case "enter":
|
case "down":
|
||||||
if f.busy {
|
if !multiline {
|
||||||
|
f.advance(1)
|
||||||
return f, nil
|
return f, nil
|
||||||
}
|
}
|
||||||
|
case "up":
|
||||||
|
if !multiline {
|
||||||
|
f.advance(-1)
|
||||||
|
return f, nil
|
||||||
|
}
|
||||||
|
case "enter":
|
||||||
|
if !multiline {
|
||||||
|
return f, f.submitOrAdvance()
|
||||||
|
}
|
||||||
|
case "shift+enter", "ctrl+s":
|
||||||
|
return f, f.submitOrAdvance()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var cmd tea.Cmd
|
||||||
|
if f.fields[f.cursor].multiline {
|
||||||
|
f.fields[f.cursor].textarea, cmd = f.fields[f.cursor].textarea.Update(msg)
|
||||||
|
} else {
|
||||||
|
f.fields[f.cursor].input, cmd = f.fields[f.cursor].input.Update(msg)
|
||||||
|
}
|
||||||
|
return f, cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
// submitOrAdvance is the shared trigger for enter on single-line fields
|
||||||
|
// and shift+enter / ctrl+s on multiline fields: jump to the next field
|
||||||
|
// or, on the last one, validate and run submit.
|
||||||
|
func (f *form) submitOrAdvance() tea.Cmd {
|
||||||
|
if f.busy {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
if f.cursor < len(f.fields)-1 {
|
if f.cursor < len(f.fields)-1 {
|
||||||
f.advance(1)
|
f.advance(1)
|
||||||
return f, nil
|
return nil
|
||||||
}
|
}
|
||||||
vals := make([]string, len(f.fields))
|
vals := make([]string, len(f.fields))
|
||||||
for i, fld := range f.fields {
|
for i := range f.fields {
|
||||||
vals[i] = fld.input.Value()
|
vals[i] = f.fields[i].value()
|
||||||
}
|
}
|
||||||
for i, fld := range f.fields {
|
for i, fld := range f.fields {
|
||||||
if fld.required && strings.TrimSpace(vals[i]) == "" {
|
if fld.required && strings.TrimSpace(vals[i]) == "" {
|
||||||
f.err = fld.label + " is required"
|
f.err = fld.label + " is required"
|
||||||
f.cursor = i
|
f.cursor = i
|
||||||
f.focusOnly(i)
|
f.focusOnly(i)
|
||||||
return f, nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
f.busy = true
|
f.busy = true
|
||||||
f.err = ""
|
f.err = ""
|
||||||
return f, f.submit(vals)
|
return f.submit(vals)
|
||||||
}
|
|
||||||
}
|
|
||||||
var cmd tea.Cmd
|
|
||||||
f.fields[f.cursor].input, cmd = f.fields[f.cursor].input.Update(msg)
|
|
||||||
return f, cmd
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f *form) advance(delta int) {
|
func (f *form) advance(delta int) {
|
||||||
@@ -230,9 +332,9 @@ func (f *form) advance(delta int) {
|
|||||||
func (f *form) focusOnly(i int) {
|
func (f *form) focusOnly(i int) {
|
||||||
for j := range f.fields {
|
for j := range f.fields {
|
||||||
if j == i {
|
if j == i {
|
||||||
f.fields[j].input.Focus()
|
f.fields[j].focus()
|
||||||
} else {
|
} else {
|
||||||
f.fields[j].input.Blur()
|
f.fields[j].blur()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -241,10 +343,6 @@ func (f *form) focusOnly(i int) {
|
|||||||
// error inline without closing the form.
|
// error inline without closing the form.
|
||||||
type formSubmitErr string
|
type formSubmitErr string
|
||||||
|
|
||||||
func submitErr(err error) tea.Cmd {
|
|
||||||
return func() tea.Msg { return formSubmitErr(err.Error()) }
|
|
||||||
}
|
|
||||||
|
|
||||||
// =============================================================
|
// =============================================================
|
||||||
// Specific forms.
|
// Specific forms.
|
||||||
// =============================================================
|
// =============================================================
|
||||||
@@ -298,7 +396,7 @@ func newAddDiscordForm() *form {
|
|||||||
textField("Name", "human-friendly identifier", true),
|
textField("Name", "human-friendly identifier", true),
|
||||||
textField("Webhook URL", "https://discord.com/api/webhooks/...", true),
|
textField("Webhook URL", "https://discord.com/api/webhooks/...", true),
|
||||||
textField("Default", "yes/no — attach to every check automatically", false),
|
textField("Default", "yes/no — attach to every check automatically", false),
|
||||||
textField("Body template", alerts.TemplateVarsHint(), false),
|
textAreaField("Body template", alerts.TemplateVarsHint(), false),
|
||||||
}
|
}
|
||||||
return newForm("Add Discord alert", fields, func(vals []string) tea.Cmd {
|
return newForm("Add Discord alert", fields, func(vals []string) tea.Cmd {
|
||||||
return func() tea.Msg {
|
return func() tea.Msg {
|
||||||
@@ -330,7 +428,7 @@ func newAddSMTPForm() *form {
|
|||||||
textField("StartTLS", "yes/no — default yes", false),
|
textField("StartTLS", "yes/no — default yes", false),
|
||||||
textField("Default", "yes/no — attach to every check", false),
|
textField("Default", "yes/no — attach to every check", false),
|
||||||
textField("Subject template", alerts.TemplateVarsHint(), false),
|
textField("Subject template", alerts.TemplateVarsHint(), false),
|
||||||
textField("Body template", alerts.TemplateVarsHint(), false),
|
textAreaField("Body template", alerts.TemplateVarsHint(), false),
|
||||||
}
|
}
|
||||||
return newForm("Add SMTP alert", fields, func(vals []string) tea.Cmd {
|
return newForm("Add SMTP alert", fields, func(vals []string) tea.Cmd {
|
||||||
return func() tea.Msg {
|
return func() tea.Msg {
|
||||||
@@ -471,7 +569,7 @@ func newEditDiscordForm(existing config.Alert) *form {
|
|||||||
textFieldWithValue("Name", "human-friendly identifier", existing.Name, true),
|
textFieldWithValue("Name", "human-friendly identifier", existing.Name, true),
|
||||||
textFieldWithValue("Webhook URL", "https://discord.com/api/webhooks/...", existing.DiscordWebhook, true),
|
textFieldWithValue("Webhook URL", "https://discord.com/api/webhooks/...", existing.DiscordWebhook, true),
|
||||||
textFieldWithValue("Default", "yes/no — attach to every check automatically", boolStr(existing.Default), false),
|
textFieldWithValue("Default", "yes/no — attach to every check automatically", boolStr(existing.Default), false),
|
||||||
textFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
textAreaFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
||||||
}
|
}
|
||||||
id := existing.ID
|
id := existing.ID
|
||||||
subject := existing.SubjectTemplate
|
subject := existing.SubjectTemplate
|
||||||
@@ -510,7 +608,7 @@ func newEditSMTPForm(existing config.Alert) *form {
|
|||||||
textFieldWithValue("StartTLS", "yes/no — default yes", boolStr(existing.SMTPStartTLS), false),
|
textFieldWithValue("StartTLS", "yes/no — default yes", boolStr(existing.SMTPStartTLS), false),
|
||||||
textFieldWithValue("Default", "yes/no — attach to every check", boolStr(existing.Default), false),
|
textFieldWithValue("Default", "yes/no — attach to every check", boolStr(existing.Default), false),
|
||||||
textFieldWithValue("Subject template", alerts.TemplateVarsHint(), existing.SubjectTemplate, false),
|
textFieldWithValue("Subject template", alerts.TemplateVarsHint(), existing.SubjectTemplate, false),
|
||||||
textFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
textAreaFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
||||||
}
|
}
|
||||||
id := existing.ID
|
id := existing.ID
|
||||||
return newForm("Edit SMTP alert", fields, func(vals []string) tea.Cmd {
|
return newForm("Edit SMTP alert", fields, func(vals []string) tea.Cmd {
|
||||||
|
|||||||
@@ -73,9 +73,3 @@ func renderState(s string) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderLive(live bool) string {
|
|
||||||
if live {
|
|
||||||
return stateUpStyle.Render("● live")
|
|
||||||
}
|
|
||||||
return stateDownStyle.Render("● dead")
|
|
||||||
}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user