Coverage for app/backend/src/couchers/supervisor.py: 96%

1"""

2Parent-process supervision for the forked child processes (API workers, background workers, scheduler).

4We don't respawn individual children: if any child exits on its own the parent tears the rest down and

5returns the dead child, so the caller can exit non-zero and let the container be restarted from scratch.

6On SIGTERM/SIGINT the parent instead shuts down gracefully, forwarding SIGTERM so children drain.

8All teardown happens within a single GRACEFUL_SHUTDOWN_TIMEOUT window: the children and any parent-local

9gRPC servers (e.g. the media server) are all told to stop first, then waited on concurrently, so their

10drain budgets overlap instead of stacking — total shutdown stays under the container's stop_grace_period.

11"""

13import logging

14import signal

15import threading

16import time

17from collections.abc import Sequence

18from multiprocessing import Process

20import grpc

22from couchers.constants import GRACEFUL_SHUTDOWN_TIMEOUT

23from couchers.metrics import supervised_children_alive_gauge

25logger = logging.getLogger(__name__)

27_POLL_INTERVAL = 1.0

30def supervise(children: list[Process], parent_servers: Sequence[grpc.Server] = ()) -> Process | None:

31 """Block until a shutdown signal or until a child dies, then drain everything. Returns the dead child."""

32 shutting_down = threading.Event()

34 def handle_signal(signum: int, frame: object) -> None:

35 logger.info(f"Received signal {signum}, shutting down")

36 shutting_down.set()

38 signal.signal(signal.SIGTERM, handle_signal)

39 signal.signal(signal.SIGINT, handle_signal)

41 crashed: Process | None = None

42 while not shutting_down.is_set():

43 supervised_children_alive_gauge.set(sum(child.is_alive() for child in children))

44 crashed = next((child for child in children if not child.is_alive()), None)

45 if crashed is not None: 45 ↛ 48line 45 didn't jump to line 48 because the condition on line 45 was always true

46 logger.critical(f"Child {crashed.name} (pid {crashed.pid}) exited with code {crashed.exitcode}")

47 break

48 shutting_down.wait(_POLL_INTERVAL)

50 # initiate every drain (all non-blocking) before waiting on any, so child and parent-server draining

51 # overlap under one shared deadline rather than running back-to-back

52 for child in children:

53 if child.is_alive():

54 child.terminate()

55 server_stopped = [server.stop(GRACEFUL_SHUTDOWN_TIMEOUT) for server in parent_servers]

56 deadline = time.monotonic() + GRACEFUL_SHUTDOWN_TIMEOUT

57 for child in children:

58 child.join(timeout=max(0.0, deadline - time.monotonic()))

59 for stopped in server_stopped:

60 stopped.wait(timeout=max(0.0, deadline - time.monotonic()))

62 return crashed