Coverage for app/backend/src/couchers/profiling.py: 76%

1"""

2Continuous profiling via Grafana Pyroscope.

4Each API and background-worker process runs an in-process pyroscope-io agent (py-spy under the

5hood). Profiling is controlled at runtime by GrowthBook flags, so it can be turned on, off, and

6retuned in production without a redeploy:

8 profiling_enabled whether to profile at all

9 profiling_sample_rate samples per second

10 profiling_mode "wall" or "cpu"

12A per-process reconcile thread polls these and (re)starts or stops the agent to match. The agent is

13per-process (it can't survive the forkserver boundary), needs CAP_SYS_PTRACE in the container, and

14its logging may only be initialised once per process - a second attempt aborts the process, so we

15never enable it.

16"""

18import logging

19import threading

21import pyroscope

23from couchers import experimentation

24from couchers.config import config

26logger = logging.getLogger(__name__)

28_RECONCILE_INTERVAL_SECONDS = 30

30_initialized = False

31_stop = threading.Event()

33# Owned exclusively by the single reconcile thread, so no locking.

34_tags: dict[str, str] = {}

35_running = False

36_sample_rate: int | None = None

37_oncpu: bool | None = None

def _reconcile() -> None:
    """Bring the in-process profiling agent in line with the current feature flags.

    Reads ``profiling_enabled``, ``profiling_sample_rate`` and ``profiling_mode`` and then:

    * stops the agent if profiling is disabled and it is running;
    * does nothing if it is already running with the requested rate and mode;
    * otherwise (re)starts it with the requested settings.

    Only meant to be called from the reconcile thread: it mutates the module-level
    ``_running`` / ``_sample_rate`` / ``_oncpu`` bookkeeping without locking.
    """
    global _running, _sample_rate, _oncpu

    if not experimentation.get_global_boolean_value("profiling_enabled", False):
        if _running:
            logger.info("Stopping profiler")
            pyroscope.shutdown()
            _running, _sample_rate, _oncpu = False, None, None
        return

    # clamp to 1..250: 0 divides by zero in the agent, and a runaway flag value shouldn't pin CPU
    sample_rate = max(1, min(250, experimentation.get_global_integer_value("profiling_sample_rate", 20)))
    oncpu = experimentation.get_global_string_value("profiling_mode", "wall") == "cpu"

    if _running and (sample_rate, oncpu) == (_sample_rate, _oncpu):
        return

    # The sample rate and mode are fixed when the agent starts, so a change means a full restart.
    if _running:
        pyroscope.shutdown()
        # Record the stop straight away. If configure() below raises, the bookkeeping must not
        # keep claiming the agent is running with the old settings, or a later pass that sees the
        # old settings again would wrongly conclude there is nothing to do.
        _running, _sample_rate, _oncpu = False, None, None

    logger.info("Starting profiler at %d Hz (%s)", sample_rate, "cpu" if oncpu else "wall")
    pyroscope.configure(
        application_name="couchers-backend",
        server_address=config.PYROSCOPE_SERVER,
        sample_rate=sample_rate,
        oncpu=oncpu,
        tags=_tags,
        # The agent has no bearer-token option; this is what our nginx ingest gate authenticates against.
        http_headers={"Authorization": f"Bearer {config.PYROSCOPE_AUTH_TOKEN}"},
        enable_logging=False,
    )
    _running, _sample_rate, _oncpu = True, sample_rate, oncpu

def _reconcile_loop() -> None:
    """Body of the reconcile thread.

    Runs ``_reconcile()`` once immediately and then once every
    ``_RECONCILE_INTERVAL_SECONDS`` until ``_stop`` is set. A pass that raises is logged and
    retried on the next tick instead of terminating the thread.
    """
    while True:
        try:
            _reconcile()
        except Exception:
            # This is the top of the thread: an exception escaping here would end the thread for
            # good, and the profiler could then no longer be switched on, off or retuned until the
            # process restarts. Log it and try again on the next tick.
            logger.exception("Profiler reconcile failed; retrying in %d s", _RECONCILE_INTERVAL_SECONDS)
        # Event.wait() returns True once the event is set, i.e. when we have been asked to stop.
        if _stop.wait(_RECONCILE_INTERVAL_SECONDS):
            return

def setup_profiling(role: str, instance: str) -> None:
    """Launch this process's profiling reconcile thread.

    Intended to be called once per process, after setup_experimentation(). Returns without
    doing anything when config.PYROSCOPE_ENABLED is falsy or when it has already run in this
    process.

    ``role`` and ``instance`` are attached to the profiles as tags, together with the
    environment (config.COOKIE_DOMAIN) and the version (config.VERSION).
    """
    global _initialized, _tags

    if not config.PYROSCOPE_ENABLED:
        return
    if _initialized:
        return
    _initialized = True

    _tags = dict(
        role=role,
        instance=instance,
        environment=config.COOKIE_DOMAIN,
        version=config.VERSION,
    )

    # Daemon thread, so it never keeps the process alive on its own.
    reconciler = threading.Thread(target=_reconcile_loop, name="profiling-reconcile", daemon=True)
    reconciler.start()