Coverage for app/backend/src/couchers/profiling.py: 76%

39 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-05-31 14:08 +0000

1""" 

2Continuous profiling via Grafana Pyroscope. 

3 

4Each API and background-worker process runs an in-process pyroscope-io agent (py-spy under the 

5hood). Profiling is controlled at runtime by GrowthBook flags, so it can be turned on, off, and 

6retuned in production without a redeploy: 

7 

8 profiling_enabled whether to profile at all 

9 profiling_sample_rate samples per second 

10 profiling_mode "wall" or "cpu" 

11 

12A per-process reconcile thread polls these and (re)starts or stops the agent to match. The agent is 

13per-process (it can't survive the forkserver boundary), needs CAP_SYS_PTRACE in the container, and 

14its logging may only be initialised once per process - a second attempt aborts the process, so we 

15never enable it. 

16""" 

17 

18import logging 

19import threading 

20 

21import pyroscope 

22 

23from couchers import experimentation 

24from couchers.config import config 

25 

logger = logging.getLogger(__name__)

# Seconds the reconcile loop waits between passes.
_RECONCILE_INTERVAL_SECONDS = 30

# Set to True by setup_profiling() just before it starts the reconcile thread; a later call sees it
# and returns without starting a second thread.
_initialized = False
# The reconcile loop waits on this event between passes and exits once it is set.
_stop = threading.Event()

# Owned exclusively by the single reconcile thread, so no locking.
# _tags is assigned by setup_profiling() before that thread is started and passed to the agent on start.
_tags: dict[str, str] = {}
# True after the agent has been configured and until it is shut down again.
_running = False
# Settings the agent was last started with; None while it is stopped.
_sample_rate: int | None = None
_oncpu: bool | None = None

38 

39 

def _reconcile() -> None:
    """Start, stop, or restart the Pyroscope agent so it matches the current profiling flags.

    Reads ``profiling_enabled``, ``profiling_sample_rate`` and ``profiling_mode`` through the
    experimentation module and compares them with the module-level ``_running``, ``_sample_rate``
    and ``_oncpu`` state:

    * flag disabled: shut the agent down if it is running, then return;
    * flag enabled and settings unchanged while running: do nothing;
    * otherwise: shut down a running agent (if any) and configure a fresh one.

    The tracking globals are updated right after each ``shutdown()``/``configure()`` call, so that
    an exception raised part-way through never leaves them describing an agent that is not running.
    """
    global _running, _sample_rate, _oncpu

    if not experimentation.get_global_boolean_value("profiling_enabled", False):
        if _running:
            logger.info("Stopping profiler")
            pyroscope.shutdown()
            _running, _sample_rate, _oncpu = False, None, None
        return

    # clamp to 1..250: 0 divides by zero in the agent, and a runaway flag value shouldn't pin CPU
    sample_rate = max(1, min(250, experimentation.get_global_integer_value("profiling_sample_rate", 20)))
    # Any mode other than exactly "cpu" is treated as wall-clock profiling.
    oncpu = experimentation.get_global_string_value("profiling_mode", "wall") == "cpu"

    if _running and (sample_rate, oncpu) == (_sample_rate, _oncpu):
        return

    # The sample rate and mode are fixed when the agent starts, so a change means a full restart.
    if _running:
        pyroscope.shutdown()
        # Record the stop now: if configure() below raises, the state must not keep claiming the
        # old agent is alive, or a later pass with the old settings would skip the restart.
        _running, _sample_rate, _oncpu = False, None, None

    logger.info("Starting profiler at %d Hz (%s)", sample_rate, "cpu" if oncpu else "wall")
    pyroscope.configure(
        application_name="couchers-backend",
        server_address=config["PYROSCOPE_SERVER"],
        sample_rate=sample_rate,
        oncpu=oncpu,
        tags=_tags,
        # The agent has no bearer-token option; this is what our nginx ingest gate authenticates against.
        http_headers={"Authorization": f"Bearer {config['PYROSCOPE_AUTH_TOKEN']}"},
        enable_logging=False,
    )
    _running, _sample_rate, _oncpu = True, sample_rate, oncpu

73 

74 

def _reconcile_loop() -> None:
    """Run ``_reconcile()`` immediately, then once per ``_RECONCILE_INTERVAL_SECONDS`` until ``_stop`` is set.

    This is the target of the reconcile thread. A failed pass is logged and retried on the next
    tick instead of propagating: an uncaught exception here would end the thread, after which the
    agent could no longer be started, stopped, or retuned for the rest of the process's life.
    """
    while True:
        try:
            _reconcile()
        except Exception:
            # Thread boundary: keep the loop alive and leave a traceback in the logs.
            logger.exception("Profiling reconcile failed; retrying in %d seconds", _RECONCILE_INTERVAL_SECONDS)
        # Event.wait() returns True as soon as _stop is set, False when the interval elapses.
        if _stop.wait(_RECONCILE_INTERVAL_SECONDS):
            return

79 

80 

def setup_profiling(role: str, instance: str) -> None:
    """Launch the background thread that keeps the profiler in sync with its flags.

    Intended to be called once per process, after setup_experimentation(). Returns without doing
    anything when ``config["PYROSCOPE_ENABLED"]`` is falsy or when a previous call in this process
    already started the thread.

    ``role`` and ``instance`` are stored, together with the configured cookie domain (as
    ``environment``) and version, as the tags handed to the agent whenever it is started.
    """
    global _initialized, _tags

    if not config["PYROSCOPE_ENABLED"]:
        return
    if _initialized:
        return

    _initialized = True
    _tags = {
        "role": role,
        "instance": instance,
        "environment": config["COOKIE_DOMAIN"],
        "version": config["VERSION"],
    }

    # Daemon thread, so it never keeps the process alive at shutdown.
    reconciler = threading.Thread(target=_reconcile_loop, name="profiling-reconcile", daemon=True)
    reconciler.start()