From 010e9de08eb6b1aa7f877f79ece31e79c5bad774 Mon Sep 17 00:00:00 2001 From: holger krekel Date: Sun, 8 Mar 2026 18:07:36 +0100 Subject: [PATCH] lxc: poll until DNS answers before continuing; reset bridge on --destroy-all After restarting pdns/pdns-recursor, wait up to 10 s for the recursor to actually answer a query before proceeding. Likewise, after configure_dns(), poll from inside the relay container until the configured DNS IP responds. On --destroy-all, unset the incusbr0 dns.mode and raw.dnsmasq network options so the next lxc-start starts from a clean bridge state. DNSConfigurationError (caught in main()) is raised on timeout so the CLI prints a clean error instead of failing later with a cryptic message. --- cmdeploy/src/cmdeploy/cmdeploy.py | 4 +++ cmdeploy/src/cmdeploy/lxc/incus.py | 56 +++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/cmdeploy/src/cmdeploy/cmdeploy.py b/cmdeploy/src/cmdeploy/cmdeploy.py index d1b90c01..d9b432cf 100644 --- a/cmdeploy/src/cmdeploy/cmdeploy.py +++ b/cmdeploy/src/cmdeploy/cmdeploy.py @@ -28,6 +28,7 @@ from .lxc.cli import ( # noqa: F401 lxc_test_cmd, lxc_test_cmd_options, ) +from .lxc.incus import DNSConfigurationError from .sshexec import ( LocalExec, SSHExec, @@ -464,6 +465,9 @@ def main(args=None): if res is None: res = 0 return res + except DNSConfigurationError as exc: + out.red(str(exc)) + return 1 except KeyboardInterrupt: out.red("KeyboardInterrupt") sys.exit(130) diff --git a/cmdeploy/src/cmdeploy/lxc/incus.py b/cmdeploy/src/cmdeploy/lxc/incus.py index e21d4447..391c8b51 100644 --- a/cmdeploy/src/cmdeploy/lxc/incus.py +++ b/cmdeploy/src/cmdeploy/lxc/incus.py @@ -20,6 +20,10 @@ DNS_CONTAINER_NAME = "ns-localchat" DNS_DOMAIN = "ns.localchat" +class DNSConfigurationError(Exception): + """Raised when the DNS container is not reachable or not answering.""" + + def _extract_ip(net_data, family="inet"): """Extract the first global-scope IP of *family* from network state data. @@ -196,6 +200,7 @@ class Incus: key_path = self.ssh_key_path pub_key = key_path.with_suffix(".pub").read_text().strip() ct.bash(f"""\ + echo 'nameserver 9.9.9.9' > /etc/resolv.conf apt-get -o DPkg::Lock::Timeout=60 update DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server python3 systemctl enable ssh @@ -431,11 +436,10 @@ class RelayContainer(Container): return shell(cmd, timeout=15).returncode == 0 def configure_dns(self, dns_ip): - """Point this container's resolver at *dns_ip*. + """Point this container's resolver at *dns_ip* and verify it works. - Disables systemd-resolved to free port 53 and writes - a static /etc/resolv.conf. Also configures unbound - (if present) to forward .localchat queries. + Disables systemd-resolved, writes a static /etc/resolv.conf, + and configures unbound to forward .localchat queries to *dns_ip*. """ self.bash(f"""\ systemctl disable --now systemd-resolved 2>/dev/null || true @@ -448,6 +452,27 @@ class RelayContainer(Container): > /etc/unbound/unbound.conf.d/localchat-forward.conf systemctl restart unbound 2>/dev/null || true """) + self._wait_dns_reachable(dns_ip) + + def _wait_dns_reachable(self, dns_ip, timeout=10): + """Poll until *dns_ip* answers a DNS query from this container.""" + if self.bash("which dig", check=False) is None: + self.bash( + "DEBIAN_FRONTEND=noninteractive " + "apt-get install -y dnsutils 2>/dev/null || true" + ) + deadline = time.time() + timeout + while time.time() < deadline: + result = self.bash( + f"dig @{dns_ip} . SOA +short +time=1 +tries=1", + check=False, + ) + if result and result.strip(): + return + time.sleep(0.5) + raise DNSConfigurationError( + f"DNS at {dns_ip} not reachable from {self.name} after {timeout}s" + ) def write_ini(self, disable_ipv6=False): """Generate a chatmail.ini config file in lxconfigs/.""" @@ -479,11 +504,25 @@ class DNSContainer(Container): self.pdnsutil("replace-rrset", zone, name, rtype, ttl, rdata) def restart_services(self): - """Restart pdns and pdns-recursor.""" + """Restart pdns and pdns-recursor, then wait until DNS is answering.""" self.bash("""\ systemctl restart pdns systemctl restart pdns-recursor || true """) + self._wait_dns_ready() + + def _wait_dns_ready(self, timeout=10): + """Poll until the recursor answers a query on port 53.""" + deadline = time.time() + timeout + while time.time() < deadline: + result = self.bash( + "dig @127.0.0.1 . SOA +short +time=1 +tries=1", + check=False, + ) + if result and result.strip(): + return + time.sleep(0.5) + raise DNSConfigurationError(f"DNS recursor not answering after {timeout}s") def ensure(self): """Create the DNS container with PowerDNS if needed. @@ -503,6 +542,12 @@ class DNSContainer(Container): check=False, ) + def destroy(self): + """Stop, delete, and reset bridge DNS config.""" + super().destroy() + self.incus.run(["network", "unset", "incusbr0", "dns.mode"], check=False) + self.incus.run(["network", "unset", "incusbr0", "raw.dnsmasq"], check=False) + def _install_powerdns(self): """Install and configure PowerDNS if not already present.""" if self.run_cmd("which", "pdns_server", check=False) is not None: @@ -550,6 +595,7 @@ class DNSContainer(Container): systemctl start pdns-recursor echo 'nameserver 127.0.0.1' > /etc/resolv.conf """) + self._wait_dns_ready() def reset_dns_records(self, dns_ip, domains): """Create DNS zones with initial A records via pdnsutil.