Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a1d74cf36d | |||
| f60b0a0609 | |||
| ea30dbb895 | |||
| 1e2e382867 | |||
| ed25e9ed68 | |||
| c55482664c | |||
| 3c85caabcf | |||
| 8638ab5432 | |||
| a11b31f160 | |||
| 005be12dd1 | |||
| e48da30240 | |||
| b46c258e4e |
@@ -0,0 +1,72 @@
|
||||
name: Container image
|
||||
|
||||
# Mirrors .gitea/workflows/container.yaml — publishes a multi-arch
|
||||
# (amd64 + arm64) image to the GitHub Container Registry whenever the
|
||||
# Gitea→GitHub mirror pushes a `v*` tag. Image lands at
|
||||
# ghcr.io/axodouble/quptime with tags :vX.Y.Z, :X.Y, and :latest.
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
jobs:
|
||||
image:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Set up Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
# GHCR namespaces must be lowercase. Lowercase the repository
|
||||
# path once and reuse below so a mixed-case org/repo (e.g.
|
||||
# Axodouble/QUptime) still resolves to a valid image reference.
|
||||
- name: Resolve image name
|
||||
id: img
|
||||
run: |
|
||||
repo='${{ github.repository }}'
|
||||
echo "ref=ghcr.io/${repo,,}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Compute version
|
||||
id: ver
|
||||
run: |
|
||||
echo "version=${GITHUB_REF_NAME}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Login to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
# Derive the image tags from the pushed git tag. `{{raw}}` keeps the
# tag exactly as pushed (v1.2.3), which is the `:vX.Y.Z` form the docs
# advertise and the Gitea registry uses; `{{version}}` would strip the
# leading "v" and publish `:1.2.3` instead.
- name: Docker metadata
  id: meta
  uses: docker/metadata-action@v5
  with:
    images: ${{ steps.img.outputs.ref }}
    tags: |
      type=semver,pattern={{raw}}
      type=semver,pattern={{major}}.{{minor}}
      type=raw,value=latest
|
||||
|
||||
- name: Build and push
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
file: ./docker/Dockerfile
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
build-args: |
|
||||
VERSION=${{ steps.ver.outputs.version }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
@@ -0,0 +1,60 @@
|
||||
name: Release
|
||||
|
||||
# Mirrors .gitea/workflows/release.yaml — fires when the Gitea→GitHub
|
||||
# mirror pushes a `v*` tag, builds static Linux binaries for amd64 +
|
||||
# arm64, and publishes them to GitHub Releases alongside the Gitea
|
||||
# release the same tag produces upstream.
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.24'
|
||||
check-latest: false
|
||||
cache: false
|
||||
|
||||
- name: Test
|
||||
run: go test -race ./...
|
||||
|
||||
- name: Build binaries
|
||||
env:
|
||||
CGO_ENABLED: '0'
|
||||
run: |
|
||||
set -euo pipefail
|
||||
VERSION="${GITHUB_REF_NAME}"
|
||||
mkdir -p dist
|
||||
for arch in amd64 arm64; do
|
||||
out="dist/qu-${VERSION}-linux-${arch}"
|
||||
echo "building ${out}"
|
||||
GOOS=linux GOARCH="${arch}" \
|
||||
go build \
|
||||
-trimpath \
|
||||
-ldflags "-s -w -X main.version=${VERSION}" \
|
||||
-o "${out}" \
|
||||
./cmd/qu
|
||||
done
|
||||
(cd dist && sha256sum qu-* > SHA256SUMS)
|
||||
ls -lh dist
|
||||
|
||||
- name: Publish release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
files: |
|
||||
dist/qu-*
|
||||
dist/SHA256SUMS
|
||||
fail_on_unmatched_files: true
|
||||
generate_release_notes: true
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
+59
-3
@@ -4,6 +4,57 @@ All notable changes to this project are documented here. The format
|
||||
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
|
||||
this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [v0.1.1] — 2026-05-15
|
||||
|
||||
### Changed
|
||||
|
||||
- **`install.sh` now repairs data-dir permissions on every run.**
|
||||
Re-running the installer reasserts the canonical ownership
|
||||
(`quptime:quptime`) and modes across `/etc/quptime/` — `0750` on
|
||||
the dir, `0700` on `keys/`, `0600` on `node.yaml`, `cluster.yaml`,
|
||||
`trust.yaml`, and `keys/private.pem`, `0644` on `keys/public.pem`
|
||||
and `keys/cert.pem`. Makes the installer the one-step recovery
|
||||
path when something has tampered with modes (e.g. a stray
|
||||
`chmod -R`, a backup restore, or an accidental `sudo qu init`
|
||||
that left files owned by root). Unknown files in the dir are left
|
||||
alone.
|
||||
|
||||
### Fixed
|
||||
|
||||
- **CLI socket lookup as the daemon user.** `sudo -u quptime qu …`
|
||||
no longer fails with `dial daemon socket /tmp/quptime-quptime/…:
|
||||
no such file or directory` while the system daemon is running.
|
||||
`config.SocketPath()` now probes the canonical systemd location
|
||||
(`/run/quptime/quptime.sock`, then `/var/run/quptime/quptime.sock`)
|
||||
regardless of euid before falling back to per-user paths, so the
|
||||
CLI reaches the daemon's socket even when `sudo` has stripped
|
||||
`RUNTIME_DIRECTORY` and `XDG_RUNTIME_DIR` from the environment.
|
||||
|
||||
## [v0.1.0] — 2026-05-15
|
||||
|
||||
### Changed
|
||||
|
||||
- **Master election cooldown (2 min).** A returning peer with a
|
||||
lower NodeID no longer reclaims master the instant it reappears.
|
||||
It must stay continuously live for `DefaultMasterCooldown`
|
||||
(2 minutes) before displacing the incumbent. Bootstrap and
|
||||
quorum-regained-from-empty still elect immediately; the cooldown
|
||||
only protects an active incumbent. Fixes #3: a self-monitoring
|
||||
master (TCP check on its own `:9901`) would otherwise flap the
|
||||
role in lock-step with its own restart.
|
||||
|
||||
### Fixed
|
||||
|
||||
- #1: Services that were already up were re-alerted as coming back up whenever the master went down.
|
||||
Ignore `unknown` -> `up` transitions during master election; still
|
||||
alert on `unknown` -> `down` by design.
|
||||
|
||||
## [v0.0.2] — 2026-05-15
|
||||
|
||||
### Fixed
|
||||
|
||||
- The text template field in the TUI did not support newlines, so multi-line templates were rendered as a single line and lost their formatting. The field is now a textarea, and the `enter` key is captured to insert newlines.
|
||||
|
||||
## [v0.0.1] — 2026-05-15
|
||||
|
||||
Initial public release.
|
||||
@@ -44,10 +95,13 @@ Initial public release.
|
||||
`quptime` user, `ProtectSystem=strict`, all capabilities dropped by
|
||||
default.
|
||||
- **Multi-arch Docker images** (`linux/amd64`, `linux/arm64`)
|
||||
published to `git.cer.sh/axodouble/quptime`.
|
||||
published to `git.cer.sh/axodouble/quptime` (primary) and
|
||||
`ghcr.io/axodouble/quptime` (GitHub push-mirror) on every tag.
|
||||
- **Static Linux binaries** (`amd64`, `arm64`) published per tag with
|
||||
a `SHA256SUMS` file; the official installer verifies the checksum
|
||||
before placing the binary on disk.
|
||||
a `SHA256SUMS` file to both Gitea Releases (primary) and GitHub
|
||||
Releases (mirror). The official installer prefers Gitea, falls back
|
||||
to GitHub on failure, and verifies the checksum before placing the
|
||||
binary on disk.
|
||||
|
||||
### Security
|
||||
|
||||
@@ -84,3 +138,5 @@ Initial public release.
|
||||
Planned for a future release.
|
||||
|
||||
[v0.0.1]: https://git.cer.sh/axodouble/quptime/releases/tag/v0.0.1
|
||||
[v0.1.0]: https://git.cer.sh/axodouble/quptime/releases/tag/v0.1.0
|
||||
[v0.1.1]: https://git.cer.sh/axodouble/quptime/releases/tag/v0.1.1
|
||||
@@ -14,12 +14,37 @@ trust — no central CA, no shared secret.
|
||||
|
||||
### From pre-built binary
|
||||
|
||||
This can be done in one step, either by downloading the latest release from
|
||||
the [Gitea releases page](https://git.cer.sh/axodouble/quptime/releases) or by running the following script:
|
||||
The canonical home is Gitea; the repo is push-mirrored to GitHub on
|
||||
every tag. Releases and multi-arch container images are published to
|
||||
both.
|
||||
|
||||
| Source | Releases | Container image |
|
||||
| ---------------- | ------------------------------------------------------------ | -------------------------------- |
|
||||
| Gitea (primary) | <https://git.cer.sh/axodouble/quptime/releases> | `git.cer.sh/axodouble/quptime` |
|
||||
| GitHub (mirror) | <https://github.com/Axodouble/QUptime/releases> | `ghcr.io/axodouble/quptime` |
|
||||
|
||||
One-step install — tries Gitea first, falls back to GitHub automatically:
|
||||
|
||||
```sh
|
||||
curl -fsSL https://git.cer.sh/Axodouble/QUptime/raw/branch/master/install.sh | sudo bash
|
||||
# or, via the GitHub mirror:
|
||||
# curl -fsSL https://raw.githubusercontent.com/Axodouble/QUptime/master/install.sh | sudo bash
|
||||
```
|
||||
|
||||
The script verifies the binary against the published `SHA256SUMS`
|
||||
before installing and refuses to proceed on a mismatch.
|
||||
|
||||
### From Docker
|
||||
|
||||
```sh
|
||||
docker pull git.cer.sh/axodouble/quptime:latest
|
||||
# or, via the GitHub mirror:
|
||||
# docker pull ghcr.io/axodouble/quptime:latest
|
||||
```
|
||||
|
||||
See [docs/deployment/docker.md](docs/deployment/docker.md) for compose
|
||||
recipes.
|
||||
|
||||
## Why
|
||||
|
||||
Most uptime monitors are either a SaaS or a single box that, by
|
||||
@@ -69,7 +94,11 @@ the hysteresis that absorbs network blips.
|
||||
|
||||
Master election is deterministic: among the live members of the quorum,
|
||||
the node with the lexicographically smallest NodeID wins. No
|
||||
negotiation, no split-brain window.
|
||||
negotiation, no split-brain window. A 2-minute **master cooldown**
|
||||
keeps the current master in place until a returning lower-NodeID peer
|
||||
has been continuously live for the full window, so a self-monitoring
|
||||
master that briefly drops doesn't flap the role back the instant it
|
||||
reappears.
|
||||
|
||||
`cluster.yaml` is the single replicated source of truth (peers, checks,
|
||||
alerts). Mutations from the CLI route through the master, which bumps a
|
||||
|
||||
@@ -25,7 +25,7 @@ services:
|
||||
restart: unless-stopped
|
||||
|
||||
quptime:
|
||||
image: git.cer.sh/axodouble/quptime:master
|
||||
image: git.cer.sh/axodouble/quptime:latest
|
||||
container_name: quptime
|
||||
environment:
|
||||
# host:port other QUptime nodes use to reach this one. Use the
|
||||
|
||||
@@ -118,6 +118,35 @@ The `term` integer in `qu status` is bumped every time the elected
|
||||
master changes (including transitions to and from "no master"). Use it
|
||||
to spot flappy clusters.
|
||||
|
||||
### Master cooldown
|
||||
|
||||
The bare "lowest-live-NodeID wins" rule has one unpleasant edge: if the
|
||||
primary master is also being monitored by `qu` itself (a TCP check on
|
||||
its own `:9901`, say), a brief restart causes a master flap *and* a
|
||||
state flap in lock-step. The new master sees the old master come back
|
||||
on the next tick and immediately hands the role back, taking the
|
||||
just-recovering node from `unknown` to `up` with no quiet period.
|
||||
|
||||
To absorb that, the quorum manager applies a **master cooldown**
|
||||
(`DefaultMasterCooldown`, 2 minutes) before a peer with a lower NodeID
|
||||
may displace the incumbent. The rules:
|
||||
|
||||
- The cooldown timer starts on the **first heartbeat after a
|
||||
dead-after gap** — i.e. when a peer re-enters the live set after
|
||||
having aged out. Continuous heartbeats never restart it.
|
||||
- A flap during the cooldown resets the timer; the returning peer
|
||||
must clear a full fresh window before taking over.
|
||||
- The cooldown applies **only when an incumbent master exists**.
|
||||
Bootstrap and quorum-regained-from-empty elect the lowest-NodeID
|
||||
live peer immediately, because there is no role to protect.
|
||||
- If the incumbent drops out of the live set, the cooldown is
|
||||
irrelevant — any live peer may take over without waiting.
|
||||
|
||||
The constant lives in `internal/quorum/manager.go`. Lower it for
|
||||
faster fail-back at the cost of monitoring-self flap risk; raise it
|
||||
to give a recovering master longer to settle before reclaiming the
|
||||
role.
|
||||
|
||||
## Catch-up when a node reconnects
|
||||
|
||||
This is the scenario most people ask about: node C is offline, the
|
||||
|
||||
@@ -7,6 +7,13 @@ daemon can bind privileged ports and open ICMP sockets; override with
|
||||
|
||||
## Image references
|
||||
|
||||
The same multi-arch (amd64 + arm64) image is published to two
|
||||
registries. **The Gitea registry is the canonical source** — it also
|
||||
publishes canary `:master` builds on every branch push. GHCR is a
|
||||
tag-only push-mirror for users who can't reach `git.cer.sh`.
|
||||
|
||||
Primary — Gitea registry:
|
||||
|
||||
```
|
||||
git.cer.sh/axodouble/quptime:master # tip of master, multi-arch
|
||||
git.cer.sh/axodouble/quptime:latest # latest tagged release
|
||||
@@ -14,6 +21,14 @@ git.cer.sh/axodouble/quptime:v0.0.1 # specific tagged release
|
||||
git.cer.sh/axodouble/quptime:latest-amd64 # single-arch (if you must pin)
|
||||
```
|
||||
|
||||
Fallback — GitHub Container Registry:
|
||||
|
||||
```
|
||||
ghcr.io/axodouble/quptime:latest # latest tagged release
|
||||
ghcr.io/axodouble/quptime:v0.0.1 # specific tagged release
|
||||
ghcr.io/axodouble/quptime:0.0 # latest patch in the 0.0 minor line
|
||||
```
|
||||
|
||||
The image embeds `QUPTIME_DIR=/etc/quptime` and declares it a volume —
|
||||
treat it as the only piece of state worth persisting.
|
||||
|
||||
|
||||
+57
-17
@@ -10,22 +10,36 @@ matches how you manage software on the host.
|
||||
|
||||
## Pre-built binary (recommended)
|
||||
|
||||
Releases are published to the [Gitea releases
|
||||
page](https://git.cer.sh/axodouble/quptime/releases) with a
|
||||
`SHA256SUMS` file. Two architectures are built: `linux-amd64` and
|
||||
`linux-arm64`.
|
||||
Every tag triggers identical builds on both sources, so either one
|
||||
serves the same artefact set. Gitea is the canonical home; GitHub is a
|
||||
push-mirror.
|
||||
|
||||
Primary — Gitea releases:
|
||||
<https://git.cer.sh/axodouble/quptime/releases>
|
||||
|
||||
Fallback — GitHub releases (mirrored from the same tag):
|
||||
<https://github.com/Axodouble/QUptime/releases>
|
||||
|
||||
Each release ships `qu-${TAG}-linux-amd64`, `qu-${TAG}-linux-arm64`,
|
||||
and a `SHA256SUMS` file.
|
||||
|
||||
```sh
|
||||
# Always pin to a tag — `latest` resolves on the server side.
|
||||
TAG=v0.1.0
|
||||
TAG=v0.0.1
|
||||
ARCH=amd64 # or arm64
|
||||
|
||||
# Primary: Gitea
|
||||
curl -fSL -o qu \
|
||||
"https://git.cer.sh/axodouble/quptime/releases/download/${TAG}/qu-${TAG}-linux-${ARCH}"
|
||||
curl -fSL -o SHA256SUMS \
|
||||
"https://git.cer.sh/axodouble/quptime/releases/download/${TAG}/SHA256SUMS"
|
||||
|
||||
# Verify before installing.
|
||||
# (or the GitHub mirror — substitute the host below if Gitea is unreachable)
|
||||
# https://github.com/Axodouble/QUptime/releases/download/${TAG}/qu-${TAG}-linux-${ARCH}
|
||||
# https://github.com/Axodouble/QUptime/releases/download/${TAG}/SHA256SUMS
|
||||
|
||||
# Verify before installing. Use the SHA256SUMS from the SAME source
|
||||
# as the binary — never mix.
|
||||
sha256sum --check --ignore-missing SHA256SUMS
|
||||
|
||||
install -m 0755 qu /usr/local/bin/qu
|
||||
@@ -34,31 +48,46 @@ install -m 0755 qu /usr/local/bin/qu
|
||||
## One-line install script
|
||||
|
||||
The repo ships an `install.sh` that handles the download, checksum,
|
||||
shell-completion installation, and a default systemd unit file. Run it
|
||||
shell-completion installation, and a hardened systemd unit. Run it
|
||||
under `sudo` so it can write to `/usr/local/bin` and
|
||||
`/etc/systemd/system`.
|
||||
|
||||
```sh
|
||||
curl -fsSL https://git.cer.sh/Axodouble/QUptime/raw/branch/master/install.sh | sudo bash
|
||||
# or, via the GitHub mirror:
|
||||
# curl -fsSL https://raw.githubusercontent.com/Axodouble/QUptime/master/install.sh | sudo bash
|
||||
```
|
||||
|
||||
What it does:
|
||||
|
||||
1. Looks up the latest release via the Gitea API.
|
||||
2. Downloads the binary to `/usr/local/bin/qu`.
|
||||
1. Looks up the latest release via the Gitea API; falls back to the
|
||||
GitHub API if Gitea is unreachable.
|
||||
2. Downloads the per-arch binary and the matching `SHA256SUMS` from
|
||||
the same source, then verifies the checksum. Refuses to install on
|
||||
a mismatch.
|
||||
3. Installs bash / zsh / fish completion if a target directory exists.
|
||||
4. Writes `/etc/systemd/system/qu-serve.service` and enables it (but
|
||||
does **not** start it — you need to run `qu init` first).
|
||||
|
||||
The unit it writes is minimal. For a production unit with hardening,
|
||||
see the [systemd deployment guide](deployment/systemd.md).
|
||||
4. Creates a dedicated `quptime` system user and writes
|
||||
`/etc/systemd/system/quptime.service` (hardened — matches the unit
|
||||
in [systemd.md](deployment/systemd.md)). Enables but does not start
|
||||
the service, so you can configure identity before first boot.
|
||||
5. Repairs ownership and modes under `/etc/quptime/` to the canonical
|
||||
layout (`0750` on the dir, `0700` on `keys/`, `0600` on
|
||||
`node.yaml` / `cluster.yaml` / `trust.yaml` / `keys/private.pem`,
|
||||
`0644` on `keys/public.pem` / `keys/cert.pem`). This makes the
|
||||
installer idempotent for permission damage — if something
|
||||
tightened or loosened modes (a stray `chmod -R`, a misguided
|
||||
backup restore, an accidental `sudo qu init`), re-running
|
||||
`install.sh` puts everything back without touching the contents
|
||||
of those files.
|
||||
|
||||
## Build from source
|
||||
|
||||
Requires Go 1.24.2 or newer.
|
||||
|
||||
```sh
|
||||
# Either remote — Gitea is canonical, GitHub is a push-mirror.
|
||||
git clone https://git.cer.sh/axodouble/quptime.git
|
||||
# git clone https://github.com/Axodouble/QUptime.git
|
||||
cd quptime
|
||||
go build -ldflags "-X main.version=$(git describe --tags --always)" -o qu ./cmd/qu
|
||||
|
||||
@@ -74,15 +103,26 @@ CGO_ENABLED=0 go build -trimpath -ldflags "-s -w" -o qu ./cmd/qu
|
||||
|
||||
## Docker image
|
||||
|
||||
A multi-arch (`amd64` + `arm64`) image is published to the Gitea
|
||||
registry on every tag and every push to `master`:
|
||||
The same multi-arch (`amd64` + `arm64`) image is published to two
|
||||
registries on every tag. The Gitea registry is the canonical source
|
||||
and also gets canary `:master` builds; GHCR is a tag-only mirror.
|
||||
|
||||
Primary — Gitea registry:
|
||||
|
||||
```
|
||||
git.cer.sh/axodouble/quptime:master # tip of main
|
||||
git.cer.sh/axodouble/quptime:master # tip of master (canary)
|
||||
git.cer.sh/axodouble/quptime:latest # latest tagged release
|
||||
git.cer.sh/axodouble/quptime:v0.0.1 # pinned release
|
||||
```
|
||||
|
||||
Fallback — GitHub Container Registry:
|
||||
|
||||
```
|
||||
ghcr.io/axodouble/quptime:latest # latest tagged release
|
||||
ghcr.io/axodouble/quptime:v0.0.1 # pinned release
|
||||
ghcr.io/axodouble/quptime:0.0 # latest 0.0.x
|
||||
```
|
||||
|
||||
See the [Docker deployment guide](deployment/docker.md) for compose
|
||||
files and volume layout.
|
||||
|
||||
|
||||
@@ -183,6 +183,7 @@ Options:
|
||||
| `quorum` | `true` | `false` — no mutations, no alerts. |
|
||||
| `master` | a NodeID | `(none — ...)` — quorum lost or election in flight. |
|
||||
| `term` | slow growth | rapid growth → master flapping, network unstable. |
|
||||
| `master` after a restart of the primary | unchanged for ~2 min, then bumps back | bumps back immediately → cooldown disabled or misconfigured. |
|
||||
| `config ver` | identical across nodes | divergence → a node is stuck pulling. |
|
||||
|
||||
A simple cron sentinel on each node:
|
||||
|
||||
+25
-2
@@ -35,6 +35,25 @@ flapping. Causes:
|
||||
- Heartbeat timeouts (default 4s) are too tight for your inter-node
|
||||
link. Rebuild with a higher `DefaultDeadAfter` if you need it.
|
||||
|
||||
## Primary master came back but the cluster hasn't switched to it
|
||||
|
||||
**What it means.** Working as designed. After a returning peer with a
|
||||
lower NodeID rejoins, the quorum manager waits
|
||||
`DefaultMasterCooldown` (2 minutes) before letting it displace the
|
||||
incumbent. The window prevents a self-monitoring master from flapping
|
||||
the role in lock-step with its own restart.
|
||||
|
||||
How to confirm:
|
||||
|
||||
- `qu status` on every node shows the same (current) master and a
|
||||
steady `term` — not flapping. The lower-NodeID peer is in the live
|
||||
set but not yet master.
|
||||
- After ~2 minutes of continuous liveness, `term` bumps once and the
|
||||
master switches to the lower-NodeID peer.
|
||||
|
||||
If you need a different window, change `DefaultMasterCooldown` in
|
||||
`internal/quorum/manager.go` and rebuild.
|
||||
|
||||
## A check is stuck in `unknown`
|
||||
|
||||
**What it means.** The aggregator has no fresh reports for that check.
|
||||
@@ -153,7 +172,9 @@ still see this error, the most likely causes are:
|
||||
|
||||
- The data directory is read-only or owned by a different user — the
|
||||
bootstrap can't write `node.yaml`. Fix permissions on
|
||||
`$QUPTIME_DIR`.
|
||||
`$QUPTIME_DIR`. The fastest fix on a standard install is just to
|
||||
re-run `install.sh` — it reasserts the canonical ownership and
|
||||
modes on the whole tree without touching your config.
|
||||
- Something else removed `node.yaml` mid-run (a config-management
|
||||
tool, a misconfigured volume). Re-run `qu serve` and it will
|
||||
rebuild from env, or run `qu init` manually with the flags you
|
||||
@@ -178,7 +199,9 @@ load private key: ...
|
||||
```
|
||||
|
||||
Permissions on `keys/private.pem` are wrong — should be 0600 and owned
|
||||
by the daemon user. Fix and restart.
|
||||
by the daemon user. Fix and restart. Re-running `install.sh` on a
|
||||
standard install is the easiest path: it repairs ownership and modes
|
||||
on the entire data dir.
|
||||
|
||||
## Probes look much slower than expected
|
||||
|
||||
|
||||
+157
-41
@@ -1,12 +1,17 @@
|
||||
#!/bin/bash
|
||||
# QUptime installer.
|
||||
#
|
||||
# Downloads the latest released `qu` binary from the Gitea release
|
||||
# page, verifies it against the published SHA256SUMS, installs it to
|
||||
# /usr/local/bin, and (on systemd hosts) drops in a hardened
|
||||
# quptime.service that matches the unit documented in
|
||||
# docs/deployment/systemd.md. Idempotent — re-running upgrades the
|
||||
# binary and refreshes the unit without touching the data directory.
|
||||
# Downloads the latest released `qu` binary, verifies it against the
|
||||
# published SHA256SUMS, installs it to /usr/local/bin, and (on systemd
|
||||
# hosts) drops in a hardened quptime.service that matches the unit
|
||||
# documented in docs/deployment/systemd.md.
|
||||
#
|
||||
# Release sources, tried in order:
|
||||
# 1. Gitea: git.cer.sh/axodouble/quptime/releases (primary — canonical home)
|
||||
# 2. GitHub: github.com/Axodouble/QUptime/releases (push-mirror fallback)
|
||||
#
|
||||
# Idempotent — re-running upgrades the binary and refreshes the unit
|
||||
# without touching the data directory.
|
||||
set -euo pipefail
|
||||
|
||||
INSTALL_BIN="/usr/local/bin/qu"
|
||||
@@ -15,8 +20,15 @@ SERVICE_NAME="$(basename "$SERVICE_FILE")"
|
||||
SERVICE_USER="quptime"
|
||||
SERVICE_GROUP="quptime"
|
||||
DATA_DIR="/etc/quptime"
|
||||
REPO_API="https://git.cer.sh/api/v1/repos/axodouble/quptime/releases/latest"
|
||||
RELEASE_BASE="https://git.cer.sh/axodouble/quptime/releases/download"
|
||||
|
||||
# Release sources, in preference order. Each row is:
|
||||
# <name>|<latest-release API endpoint>|<release-asset base URL>
|
||||
# The asset URL is concatenated with `/<tag>/<filename>`. Adjust here
|
||||
# if the project moves hosts.
|
||||
SOURCES=(
|
||||
"gitea|https://git.cer.sh/api/v1/repos/axodouble/quptime/releases/latest|https://git.cer.sh/axodouble/quptime/releases/download"
|
||||
"github|https://api.github.com/repos/Axodouble/QUptime/releases/latest|https://github.com/Axodouble/QUptime/releases/download"
|
||||
)
|
||||
|
||||
fail() {
|
||||
echo "Error: $*" >&2
|
||||
@@ -38,6 +50,51 @@ write_completion() {
|
||||
return 1
|
||||
}
|
||||
|
||||
# fetch_from_source tries one release source end-to-end: pulls the
|
||||
# latest tag from its API, downloads the per-arch binary and the
|
||||
# accompanying SHA256SUMS, and verifies the checksum. Returns 0 on
|
||||
# success (with RELEASE and BINARY_NAME set as globals) or 1 if any
|
||||
# step fails — callers can then try the next source. Stderr is kept
|
||||
# quiet so a failed primary doesn't spam the operator before the
|
||||
# fallback is attempted.
|
||||
fetch_from_source() {
    # Arguments:
    #   $1  latest-release API endpoint (JSON with a .tag_name field)
    #   $2  release-asset base URL; "/<tag>/<filename>" is appended
    #   $3  existing scratch directory to download into
    # Reads the global ARCH. On success sets the globals RELEASE and
    # BINARY_NAME and returns 0; returns 1 if any step fails.
    #
    # NOTE: callers invoke this from an `if`, which suppresses `set -e`
    # for the whole body (subshells included), so every step that can
    # fail is checked explicitly.
    local api_url=$1
    local release_base=$2
    local tmpdir=$3

    local release
    release=$(curl -fsSL --proto '=https' --tlsv1.2 "$api_url" 2>/dev/null | jq -r '.tag_name' 2>/dev/null) \
        || return 1
    # jq prints the literal string "null" when the field is absent.
    [ -n "$release" ] && [ "$release" != "null" ] || return 1

    local binary_name="qu-${release}-linux-${ARCH}"
    local binary_url="${release_base}/${release}/${binary_name}"
    local sums_url="${release_base}/${release}/SHA256SUMS"

    curl -fsSL --proto '=https' --tlsv1.2 -o "$tmpdir/$binary_name" "$binary_url" 2>/dev/null \
        || return 1
    curl -fsSL --proto '=https' --tlsv1.2 -o "$tmpdir/SHA256SUMS" "$sums_url" 2>/dev/null \
        || return 1

    # Verify against the SHA256SUMS that came from the same source as
    # the binary. Never mix sources here — verifying a GitHub-hosted
    # binary against a Gitea-hosted SHA256SUMS would defeat the
    # tamper check.
    (
        # Without this check a failed cd would run the verification in
        # whatever directory the installer was started from.
        cd "$tmpdir" || exit 1

        # Pull out just our binary's entry so the check doesn't fail on
        # the arches we didn't download. The filename is compared as a
        # plain string: the tag contains "." (and may contain "+"),
        # which would act as metacharacters in a regex.
        local sum name
        while read -r sum name || [ -n "$sum" ]; do
            name=${name#\*}   # binary-mode entries are written "*<file>"
            if [ "$name" = "$binary_name" ]; then
                printf '%s  %s\n' "$sum" "$binary_name"
            fi
        done < SHA256SUMS > expected.sum

        # No entry for this binary means nothing to verify against.
        [ -s expected.sum ] || exit 1

        if ! sha256sum -c expected.sum >/dev/null 2>&1; then
            exit 1
        fi
    ) || return 1

    RELEASE="$release"
    BINARY_NAME="$binary_name"
    return 0
}
|
||||
|
||||
require_command curl
|
||||
require_command jq
|
||||
require_command sha256sum
|
||||
@@ -55,44 +112,39 @@ if [ ! -w "$(dirname "$INSTALL_BIN")" ]; then
|
||||
fail "Cannot write to $(dirname "$INSTALL_BIN"). Run this script with sudo, or set INSTALL_BIN to a writable location."
|
||||
fi
|
||||
|
||||
# --- latest release tag -------------------------------------------------
|
||||
RELEASE=$(curl -fsSL "$REPO_API" | jq -r '.tag_name')
|
||||
[ -n "$RELEASE" ] && [ "$RELEASE" != "null" ] \
|
||||
|| fail "could not determine the latest release tag from $REPO_API"
|
||||
|
||||
BINARY_NAME="qu-${RELEASE}-linux-${ARCH}"
|
||||
BINARY_URL="${RELEASE_BASE}/${RELEASE}/${BINARY_NAME}"
|
||||
SUMS_URL="${RELEASE_BASE}/${RELEASE}/SHA256SUMS"
|
||||
|
||||
# --- download + verify --------------------------------------------------
|
||||
# Stage in a temp dir so a failed verification never leaves a partial
|
||||
# or unverified binary on disk.
|
||||
# --- download + verify (with fallback) ----------------------------------
|
||||
TMPDIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TMPDIR"' EXIT
|
||||
|
||||
echo "> downloading $BINARY_NAME"
|
||||
curl -fsSL --proto '=https' --tlsv1.2 -o "$TMPDIR/$BINARY_NAME" "$BINARY_URL"
|
||||
echo "> downloading SHA256SUMS"
|
||||
curl -fsSL --proto '=https' --tlsv1.2 -o "$TMPDIR/SHA256SUMS" "$SUMS_URL"
|
||||
# Globals filled in by fetch_from_source on success.
|
||||
RELEASE=""
|
||||
BINARY_NAME=""
|
||||
INSTALLED_FROM=""
|
||||
INSTALLED_TMP=""
|
||||
|
||||
echo "> verifying checksum"
|
||||
# Pull just our binary's entry so sha256sum -c doesn't fail on the
|
||||
# arches we didn't download.
|
||||
(
|
||||
cd "$TMPDIR"
|
||||
if ! grep -E "[[:space:]]\\*?${BINARY_NAME}\$" SHA256SUMS > expected.sum; then
|
||||
fail "no entry for $BINARY_NAME in published SHA256SUMS — refusing to install"
|
||||
for source_spec in "${SOURCES[@]}"; do
|
||||
IFS='|' read -r src_name src_api src_base <<<"$source_spec"
|
||||
src_tmp="$TMPDIR/$src_name"
|
||||
mkdir -p "$src_tmp"
|
||||
echo "> trying release source: $src_name"
|
||||
# `set -e` would abort the whole script the moment fetch_from_source
|
||||
# returns nonzero; we want the loop to fall through to the next
|
||||
# source instead. Wrap the call so a failure is just data.
|
||||
if fetch_from_source "$src_api" "$src_base" "$src_tmp"; then
|
||||
INSTALLED_FROM="$src_name"
|
||||
INSTALLED_TMP="$src_tmp"
|
||||
echo "> $src_name: ${RELEASE} ✓ checksum OK"
|
||||
break
|
||||
fi
|
||||
if ! sha256sum -c expected.sum >/dev/null 2>&1; then
|
||||
echo "expected: $(awk '{print $1}' expected.sum)"
|
||||
echo "actual: $(sha256sum "$BINARY_NAME" | awk '{print $1}')"
|
||||
fail "checksum mismatch for $BINARY_NAME — refusing to install"
|
||||
fi
|
||||
)
|
||||
echo "> checksum OK"
|
||||
echo "> $src_name: unavailable"
|
||||
done
|
||||
|
||||
install -m 0755 "$TMPDIR/$BINARY_NAME" "$INSTALL_BIN"
|
||||
echo "> qu ${RELEASE} installed to $INSTALL_BIN"
|
||||
if [ -z "$INSTALLED_FROM" ]; then
|
||||
fail "no release source reachable — tried: $(printf '%s ' "${SOURCES[@]%%|*}"). Check network access to git.cer.sh and github.com."
|
||||
fi
|
||||
|
||||
install -m 0755 "$INSTALLED_TMP/$BINARY_NAME" "$INSTALL_BIN"
|
||||
echo "> qu ${RELEASE} installed to $INSTALL_BIN (source: $INSTALLED_FROM)"
|
||||
|
||||
# --- shell completions --------------------------------------------------
|
||||
if "$INSTALL_BIN" --help 2>/dev/null | grep -q "completion"; then
|
||||
@@ -123,6 +175,63 @@ fi
|
||||
|
||||
install -d -o "$SERVICE_USER" -g "$SERVICE_GROUP" -m 0750 "$DATA_DIR"
|
||||
|
||||
# Repair ownership and permissions on the data dir's contents. Catches:
|
||||
# - re-running the installer over a previous install where the
|
||||
# service user/group changed.
|
||||
# - the operator ran `qu init` or `qu serve` as root once (easy
|
||||
# mistake: `sudo qu init` is shorter than the documented
|
||||
# `sudo -u quptime qu init`). When the daemon runs as root its
|
||||
# DataDir() resolves to /etc/quptime, so any files it writes land
|
||||
# owned by root:root — the systemd service then fails with
|
||||
# `open node.yaml: permission denied`.
|
||||
# - someone or something (a stray `chmod -R`, a misguided backup
|
||||
# restore) tightened or loosened modes. Re-running the installer
|
||||
# should be enough to get back to a working baseline.
|
||||
# The canonical layout (mirrors the modes the daemon writes itself
|
||||
# in internal/config and internal/crypto):
|
||||
# /etc/quptime/ quptime:quptime 0750
|
||||
# /etc/quptime/keys/ quptime:quptime 0700
|
||||
# /etc/quptime/node.yaml quptime:quptime 0600
|
||||
# /etc/quptime/cluster.yaml quptime:quptime 0600
|
||||
# /etc/quptime/trust.yaml quptime:quptime 0600
|
||||
# /etc/quptime/keys/private.pem quptime:quptime 0600
|
||||
# /etc/quptime/keys/public.pem quptime:quptime 0644
|
||||
# /etc/quptime/keys/cert.pem quptime:quptime 0644
|
||||
# The runtime dir /var/run/quptime is owned by systemd via
|
||||
# RuntimeDirectory= and rebuilt at each service start, so we leave it
|
||||
# alone.
|
||||
repair_perms() {
    # Reasserts ownership and the canonical modes under $DATA_DIR.
    # Reads the globals SERVICE_USER, SERVICE_GROUP and DATA_DIR.
    # Only existing, known files are touched; nothing is created and
    # unknown files keep their modes (ownership is still normalised by
    # the recursive chown below).
    #
    # Always returns 0 when every chown/chmod succeeds. The checks are
    # written as `if` blocks rather than `[ -f … ] && chmod …`: with the
    # short-circuit form a missing file makes the loop — and therefore
    # the function — return 1, which aborts the installer under
    # `set -e` on a fresh install where none of the files exist yet.

    # Reassert the top-level dir explicitly on every run.
    chown "$SERVICE_USER:$SERVICE_GROUP" "$DATA_DIR"
    chmod 0750 "$DATA_DIR"

    # Reassert ownership across the whole tree in one pass.
    if [ -n "$(ls -A "$DATA_DIR" 2>/dev/null)" ]; then
        chown -R "$SERVICE_USER:$SERVICE_GROUP" "$DATA_DIR"
    fi

    # keys/ is a directory with its own tighter mode.
    if [ -d "$DATA_DIR/keys" ]; then
        chmod 0700 "$DATA_DIR/keys"
    fi

    local f
    # Secrets and config: owner read/write only.
    for f in node.yaml cluster.yaml trust.yaml keys/private.pem; do
        if [ -f "$DATA_DIR/$f" ]; then
            chmod 0600 "$DATA_DIR/$f"
        fi
    done
    # Public material: world-readable.
    for f in keys/public.pem keys/cert.pem; do
        if [ -f "$DATA_DIR/$f" ]; then
            chmod 0644 "$DATA_DIR/$f"
        fi
    done

    return 0
}
|
||||
|
||||
repair_perms
|
||||
echo "> reasserted ownership ($SERVICE_USER:$SERVICE_GROUP) and modes under $DATA_DIR"
|
||||
|
||||
echo "> writing $SERVICE_FILE"
|
||||
cat > "$SERVICE_FILE" <<'EOF'
|
||||
[Unit]
|
||||
@@ -200,11 +309,18 @@ Next steps:
|
||||
# On follower nodes, also set the shared join secret:
|
||||
# Environment=QUPTIME_CLUSTER_SECRET=<paste from first node>
|
||||
|
||||
b) Or run \`qu init\` once explicitly:
|
||||
b) Or run \`qu init\` once explicitly. IMPORTANT: run as the
|
||||
${SERVICE_USER} user, not root — otherwise node.yaml lands
|
||||
owned by root and the service can't read it on start.
|
||||
|
||||
sudo -u ${SERVICE_USER} QUPTIME_DIR=${DATA_DIR} \\
|
||||
qu init --advertise <this-host>:9901
|
||||
|
||||
If you already ran it as root and the service is failing
|
||||
with "permission denied" on node.yaml, repair with:
|
||||
|
||||
sudo chown -R ${SERVICE_USER}:${SERVICE_GROUP} ${DATA_DIR}
|
||||
|
||||
2. Start the service:
|
||||
|
||||
sudo systemctl start ${SERVICE_NAME}
|
||||
|
||||
@@ -25,12 +25,20 @@ type discordPayload struct {
|
||||
}
|
||||
|
||||
// sendDiscord posts msg.Subject + body to the configured webhook URL.
|
||||
// When the alert has a custom BodyTemplate, the rendered body is shipped
|
||||
// verbatim — the operator has opted out of the default subject header
|
||||
// and code-block wrapping in favour of their own formatting.
|
||||
func sendDiscord(a *config.Alert, msg Message) error {
|
||||
if a.DiscordWebhook == "" {
|
||||
return errors.New("discord webhook url not set")
|
||||
}
|
||||
|
||||
content := msg.Subject + "\n```\n" + msg.Body + "\n```"
|
||||
var content string
|
||||
if a.BodyTemplate != "" {
|
||||
content = msg.Body
|
||||
} else {
|
||||
content = msg.Subject + "\n```\n" + msg.Body + "\n```"
|
||||
}
|
||||
raw, err := json.Marshal(discordPayload{Content: content})
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -27,7 +27,7 @@ func New(cluster *config.ClusterConfig, selfID string, logger *log.Logger) *Disp
|
||||
|
||||
// OnTransition is wired as checks.TransitionFn.
|
||||
func (d *Dispatcher) OnTransition(check *config.Check, from, to checks.State, snap checks.Snapshot) {
|
||||
if to == checks.StateUnknown {
|
||||
if !shouldAlert(from, to) {
|
||||
return
|
||||
}
|
||||
alerts := d.cluster.EffectiveAlertsFor(check)
|
||||
@@ -77,6 +77,25 @@ func (d *Dispatcher) Test(alertID string) error {
|
||||
return d.dispatchOne(alert, msg)
|
||||
}
|
||||
|
||||
// shouldAlert decides whether a committed state transition warrants
|
||||
// firing the configured alert channels.
|
||||
//
|
||||
// A fresh master's aggregator starts every check at StateUnknown, so
|
||||
// the first successful evaluation always commits Unknown→Up. Without
|
||||
// filtering, every master failover (or daemon restart) would spam an
|
||||
// "is now UP" alert for every healthy check. We treat Unknown→Up as a
|
||||
// silent cold start; real recoveries (Down→Up) and any transition to
|
||||
// Down still alert.
|
||||
func shouldAlert(from, to checks.State) bool {
|
||||
if to == checks.StateUnknown {
|
||||
return false
|
||||
}
|
||||
if from == checks.StateUnknown && to == checks.StateUp {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (d *Dispatcher) dispatchOne(a *config.Alert, msg Message) error {
|
||||
switch a.Type {
|
||||
case config.AlertSMTP:
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.cer.sh/axodouble/quptime/internal/checks"
|
||||
)
|
||||
|
||||
func TestShouldAlertFiltersColdStartUp(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
from checks.State
|
||||
to checks.State
|
||||
want bool
|
||||
}{
|
||||
{"cold start to up (master failover / daemon restart)", checks.StateUnknown, checks.StateUp, false},
|
||||
{"cold start to down still alerts", checks.StateUnknown, checks.StateDown, true},
|
||||
{"real recovery alerts", checks.StateDown, checks.StateUp, true},
|
||||
{"regression alerts", checks.StateUp, checks.StateDown, true},
|
||||
{"stale (up to unknown) suppressed", checks.StateUp, checks.StateUnknown, false},
|
||||
{"stale (down to unknown) suppressed", checks.StateDown, checks.StateUnknown, false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
if got := shouldAlert(c.from, c.to); got != c.want {
|
||||
t.Errorf("shouldAlert(%s→%s) = %v, want %v", c.from, c.to, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Default file names. Callers should always go through DataDir() so an
|
||||
@@ -55,10 +56,47 @@ func DataDir() string {
|
||||
}
|
||||
|
||||
// SocketPath returns the unix socket used for local CLI ↔ daemon control.
|
||||
//
|
||||
// Resolution order:
|
||||
// 1. $QUPTIME_SOCKET — explicit operator override.
|
||||
// 2. $RUNTIME_DIRECTORY — set by systemd when the unit declares
|
||||
// RuntimeDirectory=quptime. This is the path the daemon uses
|
||||
// when run under the packaged unit: /run/quptime/quptime.sock.
|
||||
//  3. A system socket that already exists — /run/quptime/quptime.sock
//     first, then /var/run/quptime/quptime.sock. This catches the CLI
//     side regardless of who is invoking it: `sudo -u quptime qu status`
//     strips RUNTIME_DIRECTORY and XDG_RUNTIME_DIR, so without this
//     probe the CLI falls all the way through to /tmp/quptime-<user>/…
//     and reports "no such file" even while the daemon is happily
//     listening.
|
||||
// 4. /var/run/quptime/… when euid is 0 (CLI side, packaged installs
|
||||
// on systems where /var/run isn't a symlink to /run).
|
||||
// 5. $XDG_RUNTIME_DIR/quptime/… for user-mode installs.
|
||||
// 6. /tmp/quptime-<user>/… as a last resort.
|
||||
func SocketPath() string {
|
||||
if v := os.Getenv("QUPTIME_SOCKET"); v != "" {
|
||||
return v
|
||||
}
|
||||
if v := os.Getenv("RUNTIME_DIRECTORY"); v != "" {
|
||||
// systemd may pass multiple colon-separated entries when more
|
||||
// than one RuntimeDirectory= is declared. Ours is single, but
|
||||
// be defensive in case a future unit adds more.
|
||||
if i := strings.IndexByte(v, ':'); i >= 0 {
|
||||
v = v[:i]
|
||||
}
|
||||
return filepath.Join(v, SocketName)
|
||||
}
|
||||
// If a system-managed daemon is already listening, route there
|
||||
// regardless of euid. Without this, `sudo -u quptime qu …` can't
|
||||
// find the socket the daemon (also running as quptime) created
|
||||
// via RuntimeDirectory=.
|
||||
for _, p := range []string{
|
||||
"/run/quptime/" + SocketName,
|
||||
"/var/run/quptime/" + SocketName,
|
||||
} {
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
return p
|
||||
}
|
||||
}
|
||||
if os.Geteuid() == 0 {
|
||||
return "/var/run/quptime/" + SocketName
|
||||
}
|
||||
|
||||
@@ -34,6 +34,12 @@ import (
|
||||
// Defaults that New copies into a freshly constructed Manager.
const (
	// DefaultHeartbeatInterval is the initial Manager.heartbeatInterval.
	DefaultHeartbeatInterval = time.Second

	// DefaultDeadAfter is the initial Manager.deadAfter.
	DefaultDeadAfter = 4 * time.Second

	// DefaultMasterCooldown is how long a returning peer has to remain
	// continuously live before it may displace the currently-elected
	// master. Without this grace period a self-monitoring master that
	// drops briefly would take the role back the moment it returns,
	// disrupting anything watching its TCP port.
	DefaultMasterCooldown = 2 * time.Minute
)
|
||||
|
||||
// VersionObserver is invoked whenever a heartbeat exchange reveals
|
||||
@@ -50,11 +56,13 @@ type Manager struct {
|
||||
|
||||
heartbeatInterval time.Duration
|
||||
deadAfter time.Duration
|
||||
masterCooldown time.Duration
|
||||
|
||||
mu sync.RWMutex
|
||||
term uint64
|
||||
masterID string
|
||||
lastSeen map[string]time.Time // peerID -> last contact (sent or recv)
|
||||
liveSince map[string]time.Time // peerID -> start of current liveness streak
|
||||
addrOf map[string]string // peerID -> advertise addr (last known)
|
||||
|
||||
observer VersionObserver
|
||||
@@ -70,7 +78,9 @@ func New(selfID string, cluster *config.ClusterConfig, client *transport.Client)
|
||||
client: client,
|
||||
heartbeatInterval: DefaultHeartbeatInterval,
|
||||
deadAfter: DefaultDeadAfter,
|
||||
masterCooldown: DefaultMasterCooldown,
|
||||
lastSeen: map[string]time.Time{},
|
||||
liveSince: map[string]time.Time{},
|
||||
addrOf: map[string]string{},
|
||||
}
|
||||
}
|
||||
@@ -242,7 +252,15 @@ func (m *Manager) tick(ctx context.Context) {
|
||||
|
||||
func (m *Manager) markLive(id string) {
|
||||
m.mu.Lock()
|
||||
m.lastSeen[id] = time.Now()
|
||||
now := time.Now()
|
||||
prev, ok := m.lastSeen[id]
|
||||
// A peer entering its first liveness streak — or returning after
|
||||
// the dead-after window expired — resets liveSince. Subsequent
|
||||
// heartbeats within the streak leave it untouched.
|
||||
if !ok || now.Sub(prev) > m.deadAfter {
|
||||
m.liveSince[id] = now
|
||||
}
|
||||
m.lastSeen[id] = now
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
@@ -276,7 +294,41 @@ func (m *Manager) recomputeMaster() {
|
||||
|
||||
var newMaster string
|
||||
if len(live) >= quorum && len(live) > 0 {
|
||||
newMaster = live[0] // lowest NodeID wins
|
||||
// Without an incumbent the cluster is bootstrapping or
|
||||
// has just regained quorum, so elect immediately — there's
|
||||
// nothing to protect from a handoff.
|
||||
if m.masterID == "" {
|
||||
newMaster = live[0]
|
||||
} else {
|
||||
newMaster = m.masterID
|
||||
now := time.Now()
|
||||
incumbentLive := false
|
||||
for _, id := range live {
|
||||
if id == m.masterID {
|
||||
incumbentLive = true
|
||||
break
|
||||
}
|
||||
}
|
||||
// If the incumbent is no longer live, any live peer
|
||||
// may take over without waiting.
|
||||
if !incumbentLive {
|
||||
newMaster = live[0]
|
||||
} else {
|
||||
// Incumbent is live. A peer with a lower NodeID
|
||||
// may only displace it after it has stayed
|
||||
// continuously live for masterCooldown.
|
||||
for _, id := range live {
|
||||
if id >= m.masterID {
|
||||
break // sorted ascending — nobody lower left
|
||||
}
|
||||
since, ok := m.liveSince[id]
|
||||
if ok && now.Sub(since) >= m.masterCooldown {
|
||||
newMaster = id
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if newMaster != m.masterID {
|
||||
m.term++
|
||||
|
||||
@@ -119,6 +119,127 @@ func TestDeadAfterEvictsStaleLiveness(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// heartbeatLoop simulates the production heartbeat cadence — calling
|
||||
// markLive for the given peers more frequently than deadAfter, so a
|
||||
// peer that's "live throughout" never has its liveSince reset by the
|
||||
// dead-after gap heuristic. It returns when the context's deadline
|
||||
// hits.
|
||||
func heartbeatLoop(t *testing.T, m *Manager, dur time.Duration, peers ...string) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(dur)
|
||||
interval := m.deadAfter / 4
|
||||
if interval < time.Millisecond {
|
||||
interval = time.Millisecond
|
||||
}
|
||||
for time.Now().Before(deadline) {
|
||||
for _, p := range peers {
|
||||
m.markLive(p)
|
||||
}
|
||||
m.recomputeMaster()
|
||||
time.Sleep(interval)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReturningLowerIDWaitsForCooldown(t *testing.T) {
|
||||
_, m := threeNode("b")
|
||||
m.deadAfter = 80 * time.Millisecond
|
||||
m.masterCooldown = 200 * time.Millisecond
|
||||
|
||||
// Bootstrap: all three live, "a" elected.
|
||||
m.markLive("a")
|
||||
m.markLive("b")
|
||||
m.markLive("c")
|
||||
m.recomputeMaster()
|
||||
if m.Master() != "a" {
|
||||
t.Fatalf("initial master=%q want a", m.Master())
|
||||
}
|
||||
|
||||
// "a" drops — only b/c heartbeat. Long enough to age a out and let
|
||||
// b take over.
|
||||
heartbeatLoop(t, m, 120*time.Millisecond, "b", "c")
|
||||
if m.Master() != "b" {
|
||||
t.Fatalf("after a-drop master=%q want b", m.Master())
|
||||
}
|
||||
|
||||
// "a" returns. Verify b stays master for less than the cooldown.
|
||||
heartbeatLoop(t, m, 120*time.Millisecond, "a", "b", "c")
|
||||
if m.Master() != "b" {
|
||||
t.Errorf("mid-cooldown master=%q want b", m.Master())
|
||||
}
|
||||
|
||||
// Past the cooldown, a reclaims master.
|
||||
heartbeatLoop(t, m, 120*time.Millisecond, "a", "b", "c")
|
||||
if m.Master() != "a" {
|
||||
t.Errorf("after cooldown master=%q want a", m.Master())
|
||||
}
|
||||
}
|
||||
|
||||
func TestCooldownResetsOnFlap(t *testing.T) {
|
||||
_, m := threeNode("b")
|
||||
m.deadAfter = 80 * time.Millisecond
|
||||
m.masterCooldown = 200 * time.Millisecond
|
||||
|
||||
m.markLive("a")
|
||||
m.markLive("b")
|
||||
m.markLive("c")
|
||||
m.recomputeMaster()
|
||||
|
||||
// a drops, b becomes master.
|
||||
heartbeatLoop(t, m, 120*time.Millisecond, "b", "c")
|
||||
if m.Master() != "b" {
|
||||
t.Fatalf("master=%q want b", m.Master())
|
||||
}
|
||||
|
||||
// a returns briefly, then drops again before cooldown elapses.
|
||||
heartbeatLoop(t, m, 100*time.Millisecond, "a", "b", "c")
|
||||
if m.Master() != "b" {
|
||||
t.Fatalf("during first cooldown master=%q want b", m.Master())
|
||||
}
|
||||
heartbeatLoop(t, m, 120*time.Millisecond, "b", "c") // a ages out again
|
||||
if m.Master() != "b" {
|
||||
t.Fatalf("after a-reflap master=%q want b", m.Master())
|
||||
}
|
||||
|
||||
// a returns for the second time — cooldown restarts here.
|
||||
// Wait less than a full cooldown — b should still be master.
|
||||
heartbeatLoop(t, m, 100*time.Millisecond, "a", "b", "c")
|
||||
if m.Master() != "b" {
|
||||
t.Errorf("partway through fresh cooldown master=%q want b", m.Master())
|
||||
}
|
||||
|
||||
// Past the full fresh cooldown, a takes over.
|
||||
heartbeatLoop(t, m, 150*time.Millisecond, "a", "b", "c")
|
||||
if m.Master() != "a" {
|
||||
t.Errorf("after fresh cooldown master=%q want a", m.Master())
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewMasterAfterQuorumLossIgnoresCooldown(t *testing.T) {
|
||||
_, m := threeNode("b")
|
||||
m.deadAfter = 50 * time.Millisecond
|
||||
m.masterCooldown = 1 * time.Hour // would block election if applied
|
||||
|
||||
// Bootstrap into no-master state by letting all peers age out.
|
||||
m.markLive("a")
|
||||
m.markLive("b")
|
||||
m.markLive("c")
|
||||
m.recomputeMaster()
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
m.markLive("b")
|
||||
m.recomputeMaster()
|
||||
if m.Master() != "" {
|
||||
t.Fatalf("master=%q want empty (quorum lost)", m.Master())
|
||||
}
|
||||
|
||||
// Quorum regained — incumbent is empty, election must be immediate.
|
||||
m.markLive("a")
|
||||
m.markLive("b")
|
||||
m.recomputeMaster()
|
||||
if m.Master() != "a" {
|
||||
t.Errorf("post-recovery master=%q want a (no cooldown when empty)", m.Master())
|
||||
}
|
||||
}
|
||||
|
||||
func TestVersionObserverFiresOnHigherVersion(t *testing.T) {
|
||||
cluster := &config.ClusterConfig{Version: 2}
|
||||
m := New("a", cluster, nil)
|
||||
|
||||
+127
-25
@@ -7,6 +7,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/charmbracelet/bubbles/textarea"
|
||||
"github.com/charmbracelet/bubbles/textinput"
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
"github.com/charmbracelet/lipgloss"
|
||||
@@ -53,10 +54,45 @@ func modalDoneCmd(flash string, level flashLevel) tea.Cmd {
|
||||
type formField struct {
|
||||
label string
|
||||
input textinput.Model
|
||||
textarea textarea.Model
|
||||
multiline bool
|
||||
required bool
|
||||
hint string
|
||||
}
|
||||
|
||||
// value returns the field's current text regardless of whether it's
|
||||
// backed by a single-line input or a multiline textarea.
|
||||
func (fld *formField) value() string {
|
||||
if fld.multiline {
|
||||
return fld.textarea.Value()
|
||||
}
|
||||
return fld.input.Value()
|
||||
}
|
||||
|
||||
func (fld *formField) focus() {
|
||||
if fld.multiline {
|
||||
fld.textarea.Focus()
|
||||
return
|
||||
}
|
||||
fld.input.Focus()
|
||||
}
|
||||
|
||||
func (fld *formField) blur() {
|
||||
if fld.multiline {
|
||||
fld.textarea.Blur()
|
||||
return
|
||||
}
|
||||
fld.input.Blur()
|
||||
}
|
||||
|
||||
func (fld *formField) setWidth(w int) {
|
||||
if fld.multiline {
|
||||
fld.textarea.SetWidth(w)
|
||||
return
|
||||
}
|
||||
fld.input.Width = w
|
||||
}
|
||||
|
||||
type form struct {
|
||||
title string
|
||||
fields []formField
|
||||
@@ -86,12 +122,14 @@ func fieldWidthFor(termWidth int) int {
|
||||
|
||||
func newForm(title string, fields []formField, submit func([]string) tea.Cmd) *form {
|
||||
for i := range fields {
|
||||
if !fields[i].multiline {
|
||||
fields[i].input.Prompt = ""
|
||||
fields[i].input.CharLimit = 256
|
||||
}
|
||||
if i == 0 {
|
||||
fields[i].input.Focus()
|
||||
fields[i].focus()
|
||||
} else {
|
||||
fields[i].input.Blur()
|
||||
fields[i].blur()
|
||||
}
|
||||
}
|
||||
return &form{title: title, fields: fields, submit: submit}
|
||||
@@ -114,6 +152,31 @@ func textFieldWithValue(label, hint, value string, required bool) formField {
|
||||
return formField{label: label, hint: hint, required: required, input: ti}
|
||||
}
|
||||
|
||||
// textAreaField creates a multiline field. Enter inserts a newline;
|
||||
// the form uses shift+enter / ctrl+s to submit when the cursor is on
|
||||
// one of these. Useful for things like alert body templates where the
|
||||
// rendered message naturally spans multiple lines.
|
||||
func textAreaField(label, hint string, required bool) formField {
|
||||
return textAreaFieldWithValue(label, hint, "", required)
|
||||
}
|
||||
|
||||
func textAreaFieldWithValue(label, hint, value string, required bool) formField {
|
||||
ta := textarea.New()
|
||||
ta.Placeholder = hint
|
||||
ta.ShowLineNumbers = false
|
||||
ta.Prompt = " "
|
||||
ta.SetHeight(5)
|
||||
ta.SetWidth(defaultFieldWidth)
|
||||
ta.CharLimit = 0
|
||||
// Keep enter bound to "insert newline" (the textarea default) — the
|
||||
// surrounding form intercepts enter on single-line fields and handles
|
||||
// shift+enter/ctrl+s as the submit/advance trigger for multiline ones.
|
||||
if value != "" {
|
||||
ta.SetValue(value)
|
||||
}
|
||||
return formField{label: label, hint: hint, required: required, multiline: true, textarea: ta}
|
||||
}
|
||||
|
||||
func passwordField(label, hint string) formField {
|
||||
return passwordFieldWithValue(label, hint, "")
|
||||
}
|
||||
@@ -146,7 +209,11 @@ func (f *form) View() string {
|
||||
labelStyle = lipgloss.NewStyle().Foreground(colorAccent).Bold(true)
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s\n", marker, labelStyle.Render(fld.label))
|
||||
if fld.multiline {
|
||||
fmt.Fprintf(&b, "%s\n", fld.textarea.View())
|
||||
} else {
|
||||
fmt.Fprintf(&b, " %s\n", fld.input.View())
|
||||
}
|
||||
if i == f.cursor && fld.hint != "" {
|
||||
fmt.Fprintf(&b, " %s\n", helpStyle.Render(fld.hint))
|
||||
}
|
||||
@@ -158,7 +225,11 @@ func (f *form) View() string {
|
||||
if f.busy {
|
||||
fmt.Fprintf(&b, "%s\n", flashWarnStyle.Render("working…"))
|
||||
} else {
|
||||
fmt.Fprintf(&b, "%s\n", helpStyle.Render("↑↓ field enter next/submit esc cancel"))
|
||||
help := "↑↓ field enter next/submit esc cancel"
|
||||
if f.cursor < len(f.fields) && f.fields[f.cursor].multiline {
|
||||
help = "tab field enter newline shift+enter/ctrl+s submit esc cancel"
|
||||
}
|
||||
fmt.Fprintf(&b, "%s\n", helpStyle.Render(help))
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
@@ -169,7 +240,7 @@ func (f *form) Update(msg tea.Msg) (modal, tea.Cmd) {
|
||||
f.width = msg.Width
|
||||
w := fieldWidthFor(msg.Width)
|
||||
for i := range f.fields {
|
||||
f.fields[i].input.Width = w
|
||||
f.fields[i].setWidth(w)
|
||||
}
|
||||
return f, nil
|
||||
|
||||
@@ -179,43 +250,74 @@ func (f *form) Update(msg tea.Msg) (modal, tea.Cmd) {
|
||||
return f, nil
|
||||
|
||||
case tea.KeyMsg:
|
||||
switch msg.String() {
|
||||
key := msg.String()
|
||||
// up/down on a multiline field belong to in-text navigation;
|
||||
// leave field-switching to tab/shift+tab there. Same for enter:
|
||||
// the textarea owns it as "insert newline", so submission moves
|
||||
// to shift+enter / ctrl+s.
|
||||
multiline := f.cursor < len(f.fields) && f.fields[f.cursor].multiline
|
||||
switch key {
|
||||
case "esc":
|
||||
return f, modalDoneCmd("", flashInfo)
|
||||
case "tab", "down":
|
||||
case "tab":
|
||||
f.advance(1)
|
||||
return f, nil
|
||||
case "shift+tab", "up":
|
||||
case "shift+tab":
|
||||
f.advance(-1)
|
||||
return f, nil
|
||||
case "enter":
|
||||
if f.busy {
|
||||
case "down":
|
||||
if !multiline {
|
||||
f.advance(1)
|
||||
return f, nil
|
||||
}
|
||||
case "up":
|
||||
if !multiline {
|
||||
f.advance(-1)
|
||||
return f, nil
|
||||
}
|
||||
case "enter":
|
||||
if !multiline {
|
||||
return f, f.submitOrAdvance()
|
||||
}
|
||||
case "shift+enter", "ctrl+s":
|
||||
return f, f.submitOrAdvance()
|
||||
}
|
||||
}
|
||||
var cmd tea.Cmd
|
||||
if f.fields[f.cursor].multiline {
|
||||
f.fields[f.cursor].textarea, cmd = f.fields[f.cursor].textarea.Update(msg)
|
||||
} else {
|
||||
f.fields[f.cursor].input, cmd = f.fields[f.cursor].input.Update(msg)
|
||||
}
|
||||
return f, cmd
|
||||
}
|
||||
|
||||
// submitOrAdvance is the shared trigger for enter on single-line fields
|
||||
// and shift+enter / ctrl+s on multiline fields: jump to the next field
|
||||
// or, on the last one, validate and run submit.
|
||||
func (f *form) submitOrAdvance() tea.Cmd {
|
||||
if f.busy {
|
||||
return nil
|
||||
}
|
||||
if f.cursor < len(f.fields)-1 {
|
||||
f.advance(1)
|
||||
return f, nil
|
||||
return nil
|
||||
}
|
||||
vals := make([]string, len(f.fields))
|
||||
for i, fld := range f.fields {
|
||||
vals[i] = fld.input.Value()
|
||||
for i := range f.fields {
|
||||
vals[i] = f.fields[i].value()
|
||||
}
|
||||
for i, fld := range f.fields {
|
||||
if fld.required && strings.TrimSpace(vals[i]) == "" {
|
||||
f.err = fld.label + " is required"
|
||||
f.cursor = i
|
||||
f.focusOnly(i)
|
||||
return f, nil
|
||||
return nil
|
||||
}
|
||||
}
|
||||
f.busy = true
|
||||
f.err = ""
|
||||
return f, f.submit(vals)
|
||||
}
|
||||
}
|
||||
var cmd tea.Cmd
|
||||
f.fields[f.cursor].input, cmd = f.fields[f.cursor].input.Update(msg)
|
||||
return f, cmd
|
||||
return f.submit(vals)
|
||||
}
|
||||
|
||||
func (f *form) advance(delta int) {
|
||||
@@ -230,9 +332,9 @@ func (f *form) advance(delta int) {
|
||||
func (f *form) focusOnly(i int) {
|
||||
for j := range f.fields {
|
||||
if j == i {
|
||||
f.fields[j].input.Focus()
|
||||
f.fields[j].focus()
|
||||
} else {
|
||||
f.fields[j].input.Blur()
|
||||
f.fields[j].blur()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -294,7 +396,7 @@ func newAddDiscordForm() *form {
|
||||
textField("Name", "human-friendly identifier", true),
|
||||
textField("Webhook URL", "https://discord.com/api/webhooks/...", true),
|
||||
textField("Default", "yes/no — attach to every check automatically", false),
|
||||
textField("Body template", alerts.TemplateVarsHint(), false),
|
||||
textAreaField("Body template", alerts.TemplateVarsHint(), false),
|
||||
}
|
||||
return newForm("Add Discord alert", fields, func(vals []string) tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
@@ -326,7 +428,7 @@ func newAddSMTPForm() *form {
|
||||
textField("StartTLS", "yes/no — default yes", false),
|
||||
textField("Default", "yes/no — attach to every check", false),
|
||||
textField("Subject template", alerts.TemplateVarsHint(), false),
|
||||
textField("Body template", alerts.TemplateVarsHint(), false),
|
||||
textAreaField("Body template", alerts.TemplateVarsHint(), false),
|
||||
}
|
||||
return newForm("Add SMTP alert", fields, func(vals []string) tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
@@ -467,7 +569,7 @@ func newEditDiscordForm(existing config.Alert) *form {
|
||||
textFieldWithValue("Name", "human-friendly identifier", existing.Name, true),
|
||||
textFieldWithValue("Webhook URL", "https://discord.com/api/webhooks/...", existing.DiscordWebhook, true),
|
||||
textFieldWithValue("Default", "yes/no — attach to every check automatically", boolStr(existing.Default), false),
|
||||
textFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
||||
textAreaFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
||||
}
|
||||
id := existing.ID
|
||||
subject := existing.SubjectTemplate
|
||||
@@ -506,7 +608,7 @@ func newEditSMTPForm(existing config.Alert) *form {
|
||||
textFieldWithValue("StartTLS", "yes/no — default yes", boolStr(existing.SMTPStartTLS), false),
|
||||
textFieldWithValue("Default", "yes/no — attach to every check", boolStr(existing.Default), false),
|
||||
textFieldWithValue("Subject template", alerts.TemplateVarsHint(), existing.SubjectTemplate, false),
|
||||
textFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
||||
textAreaFieldWithValue("Body template", alerts.TemplateVarsHint(), existing.BodyTemplate, false),
|
||||
}
|
||||
id := existing.ID
|
||||
return newForm("Edit SMTP alert", fields, func(vals []string) tea.Cmd {
|
||||
|
||||
Reference in New Issue
Block a user