ci: replace staging workflows with LXC-local testing

Remove the staging VPS-based test-and-deploy workflows and
replace them with LXC-local CI. Refactor image caching
to use per-container aliases instead of a single relay image.
Ensure postfix/dovecot/opendkim restarts after unbound restarts
This commit is contained in:
holger krekel
2026-03-10 17:50:33 +01:00
parent c3fd52e5dd
commit 8a1c1d9288
8 changed files with 189 additions and 345 deletions

View File

@@ -260,10 +260,10 @@ def test_cmd(args, out):
pytest_args = [
pytest_path,
"cmdeploy/src/",
"-n4",
"-rs",
"-x",
"-v",
"-vv" if args.verbose > 1 else "-v",
"-s",
"--durations=5",
]
if args.slow:

View File

@@ -4,7 +4,7 @@ import os
import time
from ..util import get_git_hash, get_version_string, shell
from .incus import RELAY_IMAGE_ALIAS, Incus, RelayContainer
from .incus import Incus, RelayContainer
RELAY_NAMES = ("test0", "test1")
@@ -47,6 +47,7 @@ def _lxc_start_cmd(args, out):
out.green("Ensuring DNS container (ns-localchat) ...")
dns_ct = ix.get_dns_container()
dns_ct.ensure()
dns_ct.ensure_cached_as_image()
sub.print(f"DNS container IP: {dns_ct.ipv4}")
names = args.names if args.names else RELAY_NAMES
@@ -116,17 +117,39 @@ def _lxc_start_cmd(args, out):
# Optionally run cmdeploy run + dns on each relay
if args.run:
local_hash = get_git_hash()
for ct in relays:
status = _deploy_status(ct, local_hash, ix)
with out.section(f"cmdeploy run: {ct.sname} ({ct.domain})"):
ret = _run_cmdeploy("run", ct, ix, out, extra=["--skip-dns-check"])
if ret:
out.red(f"Deploy to {ct.sname} failed (exit {ret})")
return ret
if "IN-SYNC" in status:
out.print(f"{ct.sname} is {status}, skipping")
else:
ret = _run_cmdeploy("run", ct, ix, out, extra=["--skip-dns-check"])
if ret:
out.red(f"Deploy to {ct.sname} failed (exit {ret})")
return ret
# Cache a per-relay image after each successful deploy
# so the next run can launch directly from the deployed state.
with out.section(f"lxc-test: caching {ct.sname} image"):
ct.ensure_cached_as_image()
with out.section("loading DNS zones"):
for ct in relays:
# Restart mail services to flush stale DNS state.
# Cached container images boot with a resolv.conf
# pointing to the previous run's DNS IP;
# configure_dns() already restarted unbound,
# but postfix/dovecot may hold stale results
# from the window between boot and DNS fix.
for ct in relays:
out.print(f"Restarting mail services on {ct.name} ...")
ct.bash("systemctl restart postfix dovecot opendkim")
for ct in relays:
with out.section(f"cmdeploy dns: {ct.sname} ({ct.domain})"):
ret = _run_cmdeploy(
"dns", ct, ix, out,
"dns",
ct,
ix,
out,
extra=["--zonefile", str(ct.zone)],
)
if ret:
@@ -134,7 +157,10 @@ def _lxc_start_cmd(args, out):
return ret
if ct.zone.exists():
dns_ct.set_dns_records(ct.zone.read_text())
out.print(f"Restarting filtermail-incoming on {ct.name}")
# Restart filtermail so its in-process DNS cache
# does not hold stale negative DKIM responses
# from before the zones were loaded.
out.print(f"Restarting filtermail-incoming on {ct.name} ...")
ct.bash("systemctl restart filtermail-incoming")
@@ -209,71 +235,26 @@ def lxc_test_cmd(args, out):
"""
ix = Incus(out)
t_total = time.time()
relay_names = list(RELAY_NAMES)
if args.one:
relay_names = relay_names[:1]
v_flag = " -" + "v" * out.verbosity if out.verbosity > 0 else ""
local_hash = get_git_hash()
ret = out.shell(f"cmdeploy lxc-start{v_flag} --run test0", cwd=str(ix.project_root))
if ret:
return ret
# Per-relay: start, deploy, then snapshot the first relay as a
# reusable image so the second relay launches pre-deployed.
ipv4_only_flags = {RELAY_NAMES[0]: False, RELAY_NAMES[1]: True}
for ct in map(ix.get_container, relay_names):
name = ct.sname
ipv4_only = ipv4_only_flags.get(name, False)
v_flag = " -" + "v" * out.verbosity if out.verbosity > 0 else ""
start_cmd = f"cmdeploy lxc-start{v_flag} {name}"
if ipv4_only:
start_cmd += " --ipv4-only"
with out.section(f"cmdeploy lxc-start: {name}"):
ret = out.shell(start_cmd, cwd=str(ix.project_root))
if ret:
return ret
status = _deploy_status(ct, local_hash, ix)
with out.section(f"cmdeploy run: {name}"):
if "IN-SYNC" in status:
out.print(f"{name} is {status}, skipping")
else:
ret = _run_cmdeploy("run", ct, ix, out, extra=["--skip-dns-check"])
if ret:
out.red(f"Deploy to {name} failed (exit {ret})")
return ret
# Snapshot the first relay so subsequent ones launch pre-deployed
if not ix.find_image([RELAY_IMAGE_ALIAS]):
with out.section("lxc-test: caching relay image"):
ct.publish_as_relay_image()
for ct in map(ix.get_container, relay_names):
with out.section(f"cmdeploy dns: {ct.sname} ({ct.domain})"):
ret = _run_cmdeploy("dns", ct, ix, out, extra=["--zonefile", str(ct.zone)])
if ret:
out.red(f"DNS for {ct.sname} failed (exit {ret})")
return ret
with out.section(f"lxc-test: loading DNS zones {' & '.join(relay_names)}"):
dns_ct = ix.get_dns_container()
for ct in map(ix.get_container, relay_names):
if ct.zone.exists():
zone_data = ct.zone.read_text()
out.print(f"Loading {ct.zone} into PowerDNS ...")
dns_ct.set_dns_records(zone_data)
# Restart filtermail so its in-process DNS cache
# does not hold stale negative DKIM responses
# from before the zones were loaded.
for ct in map(ix.get_container, relay_names):
out.print(f"Restarting filtermail-incoming on {ct.name} ...")
ct.bash("systemctl restart filtermail-incoming")
if not args.one:
ret = out.shell(
f"cmdeploy lxc-start{v_flag} --run test1 --ipv4-only",
cwd=str(ix.project_root),
)
if ret:
return ret
with out.section("cmdeploy test"):
first = ix.get_container(relay_names[0])
first = ix.get_container("test0")
env = None
if len(relay_names) > 1:
if not args.one:
env = os.environ.copy()
env["CHATMAIL_DOMAIN2"] = ix.get_container(relay_names[1]).domain
env["CHATMAIL_DOMAIN2"] = ix.get_container("test1").domain
ret = _run_cmdeploy("test", first, ix, out, **({"env": env} if env else {}))
if ret:
out.red(f"Tests failed (exit {ret})")

View File

@@ -14,9 +14,9 @@ DOMAIN_SUFFIX = ".localchat"
UPSTREAM_IMAGE = "images:debian/12"
BASE_IMAGE_ALIAS = "localchat-base"
BASE_SETUP_NAME = "localchat-base-setup"
RELAY_IMAGE_ALIAS = "localchat-relay"
DNS_CONTAINER_NAME = "ns-localchat"
DNS_IMAGE_ALIAS = "localchat-ns"
DNS_DOMAIN = "ns.localchat"
@@ -184,7 +184,16 @@ class Incus:
)
if result.returncode != 0:
return None
return json.loads(result.stdout)
try:
return json.loads(result.stdout)
except json.JSONDecodeError as e:
msg = f"Incus JSON processing failed for {args!r}: {e!s}"
self.out.red(msg)
self.out.red(f"Captured stdout: {result.stdout!r}")
self.out.red(f"Captured stderr: {result.stderr!r}")
if check:
raise
return None
def run_output(self, args, check=True):
"""Run an incus command and return its stripped stdout.
@@ -207,8 +216,13 @@ class Incus:
return None
def delete_images(self):
"""Delete the cached base and relay images."""
for alias in (RELAY_IMAGE_ALIAS, BASE_IMAGE_ALIAS):
"""Delete localchat-base and per-container images."""
for alias in [
BASE_IMAGE_ALIAS,
DNS_IMAGE_ALIAS,
"localchat-test0",
"localchat-test1",
]:
self.run(["image", "delete", alias], check=False) # ok if absent
def list_managed(self):
@@ -238,7 +252,7 @@ class Incus:
def ensure_base_image(self):
"""Build and cache a base image with openssh and the SSH key.
The image is published as a local incus image with alias
The image is cached as a local incus image with alias
'localchat-base'. Subsequent container launches use this
image instead of the upstream Debian 12, skipping the
slow apt-get install step.
@@ -338,7 +352,7 @@ class Container:
def launch(self):
"""Launch from the best available image, return the alias used."""
image = self.incus.find_image([RELAY_IMAGE_ALIAS, BASE_IMAGE_ALIAS])
image = self.incus.find_image([BASE_IMAGE_ALIAS])
if not image:
raise RuntimeError(
f"No base image '{BASE_IMAGE_ALIAS}' found. "
@@ -419,6 +433,18 @@ class Container:
parts = line.split()
return int(parts[2]), int(parts[1])
def ensure_cached_as_image(self):
"""Cache this container as a respective image."""
alias = self.image_alias
if self.incus.find_image([alias]):
return
self.out.print(" Cleaning apt cache before caching image ...")
self.bash("apt-get clean")
self.out.print(f" Caching {self.name!r} as '{alias}' ...")
self.incus.run(["publish", self.name, f"--alias={alias}", "--force"])
self.out.print(f" Image '{alias}' cached.")
self.wait_ready()
class RelayContainer(Container):
"""Container handle for a chatmail relay.
@@ -434,12 +460,22 @@ class RelayContainer(Container):
domain=f"_{name}{DOMAIN_SUFFIX}",
)
self.sname = name
self.image_alias = f"localchat-{name}"
self.ini = incus.lxconfigs_dir / f"chatmail-{name}.ini"
self.zone = incus.lxconfigs_dir / f"{name}.zone"
def launch(self):
"""Launch (from a potentially cached image) and clear inherited chatmail-version."""
image = super().launch()
"""Launch from localchat-{sname} if cached, else localchat-base."""
candidates = [self.image_alias]
candidates.append(BASE_IMAGE_ALIAS)
image = self.incus.find_image(candidates)
assert image, f"No deployment base, candidates: {','.join(candidates)}"
self.out.print(f" Launching from '{image}' image ...")
cfg = []
cfg += ("-c", f"{LABEL_KEY}=true")
cfg += ("-c", f"user.localchat-domain={self.domain}")
self.incus.run(["launch", image, self.name, *cfg])
self.bash("rm -f /etc/chatmail-version")
return image
@@ -474,22 +510,6 @@ class RelayContainer(Container):
echo '{ip} {self.name} {self.domain}' >> /etc/hosts
""")
def publish_as_relay_image(self):
"""Publish this container as a reusable relay image.
Stops the container, 'publishes' it as 'localchat-relay', then restarts it.
"""
if self.incus.find_image([RELAY_IMAGE_ALIAS]):
return
self.out.print(
f" Locally caching {self.name!r} as '{RELAY_IMAGE_ALIAS}' image ..."
)
self.incus.run(
["publish", self.name, f"--alias={RELAY_IMAGE_ALIAS}", "--force"]
)
self.wait_ready()
self.out.print(f" Relay image '{RELAY_IMAGE_ALIAS}' ready.")
def deployed_version(self):
"""Read /etc/chatmail-version, or None if absent."""
return self.bash("cat /etc/chatmail-version", check=False)
@@ -572,6 +592,21 @@ class DNSContainer(Container):
def __init__(self, incus):
super().__init__(incus, DNS_CONTAINER_NAME, domain=DNS_DOMAIN)
self.image_alias = DNS_IMAGE_ALIAS
def launch(self):
"""Launch from localchat-ns if cached, else localchat-base."""
image = self.incus.find_image([DNS_IMAGE_ALIAS, BASE_IMAGE_ALIAS])
if not image:
raise RuntimeError(
f"No base image '{BASE_IMAGE_ALIAS}' found. "
"Call ensure_base_image() before launching containers."
)
self.out.print(f" Launching from '{image}' image ...")
cfg = []
cfg += ("-c", f"{LABEL_KEY}=true")
cfg += ("-c", f"user.localchat-domain={self.domain}")
self.incus.run(["launch", image, self.name, *cfg])
def pdnsutil(self, *args, check=True):
"""Run ``pdnsutil <args>`` inside the DNS container."""