Coverage for app/backend/src/couchers/profiling.py: 76%
39 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-31 14:08 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-31 14:08 +0000
1"""
Continuous profiling via Grafana Pyroscope.

Each API and background-worker process runs an in-process pyroscope-io agent (py-spy under the
hood). Profiling is controlled at runtime by GrowthBook flags, so it can be turned on, off, and
retuned in production without a redeploy:

    profiling_enabled       whether to profile at all
    profiling_sample_rate   samples per second (clamped to 1..250)
    profiling_mode          "wall" or "cpu" (anything other than "cpu" means wall-clock)

A per-process reconcile thread polls these and (re)starts or stops the agent to match. The agent is
per-process (it can't survive the forkserver boundary), needs CAP_SYS_PTRACE in the container, and
its logging may only be initialised once per process - a second attempt aborts the process, so we
never enable the agent's logging.
16"""
18import logging
19import threading
21import pyroscope
23from couchers import experimentation
24from couchers.config import config
logger = logging.getLogger(__name__)

# How long the reconcile thread waits between polls of the profiling flags.
_RECONCILE_INTERVAL_SECONDS = 30

# Set by setup_profiling() once it has started the reconcile thread, so later calls are no-ops.
_initialized = False
# The reconcile loop waits on this event between polls and exits once it is set.
_stop = threading.Event()

# _tags is written once by setup_profiling() before the reconcile thread is started; the
# remaining state below is read and written only by _reconcile(), so no locking.
_tags: dict[str, str] = {}
# Whether _reconcile() currently considers the agent configured and running.
_running = False
# Sample rate (Hz) and cpu-vs-wall mode the running agent was configured with; None when stopped.
_sample_rate: int | None = None
_oncpu: bool | None = None
def _reconcile() -> None:
    """Bring the in-process Pyroscope agent in line with the current profiling flags.

    Reads the ``profiling_enabled``, ``profiling_sample_rate`` and ``profiling_mode`` flags and
    stops, starts, or restarts the agent to match. Returns without touching the agent when it is
    already running with the requested sample rate and mode.

    The bookkeeping globals (``_running``, ``_sample_rate``, ``_oncpu``) are reset immediately
    after every ``pyroscope.shutdown()``. If the subsequent ``pyroscope.configure()`` raises, the
    recorded state is therefore "stopped" rather than a stale "running with the old settings",
    which would otherwise let a later pass conclude that nothing needs doing.
    """
    global _running, _sample_rate, _oncpu

    if not experimentation.get_global_boolean_value("profiling_enabled", False):
        if _running:
            logger.info("Stopping profiler")
            pyroscope.shutdown()
            _running, _sample_rate, _oncpu = False, None, None
        return

    # clamp to 1..250: 0 divides by zero in the agent, and a runaway flag value shouldn't pin CPU
    sample_rate = max(1, min(250, experimentation.get_global_integer_value("profiling_sample_rate", 20)))
    oncpu = experimentation.get_global_string_value("profiling_mode", "wall") == "cpu"

    if _running and (sample_rate, oncpu) == (_sample_rate, _oncpu):
        return

    # The sample rate and mode are fixed when the agent starts, so a change means a full restart.
    if _running:
        pyroscope.shutdown()
        # Record the stop right away so a failure in configure() below cannot leave us believing
        # the agent is still running with the previous settings.
        _running, _sample_rate, _oncpu = False, None, None

    logger.info("Starting profiler at %d Hz (%s)", sample_rate, "cpu" if oncpu else "wall")
    pyroscope.configure(
        application_name="couchers-backend",
        server_address=config["PYROSCOPE_SERVER"],
        sample_rate=sample_rate,
        oncpu=oncpu,
        tags=_tags,
        # The agent has no bearer-token option; this is what our nginx ingest gate authenticates against.
        http_headers={"Authorization": f"Bearer {config['PYROSCOPE_AUTH_TOKEN']}"},
        # Never turn this on: see the module docstring (agent logging can only be initialised once).
        enable_logging=False,
    )
    _running, _sample_rate, _oncpu = True, sample_rate, oncpu
def _reconcile_loop() -> None:
    """Body of the reconcile thread: reconcile immediately, then once per interval.

    Runs ``_reconcile()`` right away and then again every ``_RECONCILE_INTERVAL_SECONDS`` until
    ``_stop`` is set. An exception from a single pass is logged and the loop keeps polling:
    ``setup_profiling()`` only ever starts this thread once per process, so letting the thread
    die would leave profiling impossible to reconfigure until the process restarts.
    """
    while True:
        try:
            _reconcile()
        except Exception:
            # Thread top-level boundary: log with traceback and retry on the next tick.
            logger.exception("Profiling reconcile failed; will retry")
        # wait() returns True once _stop is set, False when the interval simply elapsed.
        if _stop.wait(_RECONCILE_INTERVAL_SECONDS):
            return
def setup_profiling(role: str, instance: str) -> None:
    """Launch this process's profiling reconcile thread.

    Intended to be called once per process, after setup_experimentation(). Returns without
    doing anything when config["PYROSCOPE_ENABLED"] is falsy or when a previous call has
    already started the thread.

    ``role`` and ``instance`` are attached to the agent as tags, together with the configured
    cookie domain (as "environment") and version.
    """
    global _initialized, _tags

    may_start = config["PYROSCOPE_ENABLED"] and not _initialized
    if not may_start:
        return

    # Mark as initialised before anything else so a repeated call cannot start a second thread.
    _initialized = True

    _tags = {
        "role": role,
        "instance": instance,
        "environment": config["COOKIE_DOMAIN"],
        "version": config["VERSION"],
    }

    # Daemon thread, so it never blocks interpreter shutdown.
    reconciler = threading.Thread(target=_reconcile_loop, name="profiling-reconcile", daemon=True)
    reconciler.start()