Coverage for src/couchers/metrics.py: 100%
104 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-12-14 00:52 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-12-14 00:52 +0000
1import threading
2from collections.abc import Callable
3from datetime import timedelta
4from typing import Any
6from opentelemetry import trace
7from prometheus_client import (
8 CONTENT_TYPE_LATEST,
9 CollectorRegistry,
10 Counter,
11 Gauge,
12 Histogram,
13 exposition,
14 generate_latest,
15 multiprocess,
16)
17from prometheus_client.registry import CollectorRegistry
18from sqlalchemy.sql import distinct, func
19from sqlalchemy.sql.selectable import Select
21from couchers.db import session_scope
22from couchers.models import BackgroundJob, EventOccurrenceAttendee, HostingStatus, HostRequest, Message, Reference, User
23from couchers.models.moderation import (
24 ModerationAction,
25 ModerationObjectType,
26 ModerationQueueItem,
27 ModerationState,
28 ModerationTrigger,
29 ModerationVisibility,
30)
31from couchers.sql import couchers_select as select
# Tracer used to wrap each query-backed gauge computation in its own span.
tracer = trace.get_tracer(__name__)

# Dedicated registry that create_prometheus_server serializes; a multiprocess collector is attached to it.
registry: CollectorRegistry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry=registry)  # type: ignore[no-untyped-call]

# Upper bound used as the last bucket of the explicitly-bucketed histograms below.
_INF: float = float("inf")
# Durations of background jobs, labelled by job, status, attempt and exception.
jobs_duration_histogram: Histogram = Histogram(
    name="couchers_background_jobs_seconds",
    documentation="Durations of background jobs",
    labelnames=("job", "status", "attempt", "exception"),
)
def observe_in_jobs_duration_histogram(
    job_type: str, job_state: str, try_count: int, exception_name: str, duration_s: float
) -> None:
    """
    Records the duration of one background job run in jobs_duration_histogram

    Label values are passed positionally as (job_type, job_state, try_count, exception_name); try_count is turned into
    a string first.
    """
    attempt = str(try_count)
    series = jobs_duration_histogram.labels(job_type, job_state, attempt, exception_name)
    series.observe(duration_s)
# Time a background job waited in the queue before a worker picked it up (bucket bounds in seconds).
jobs_queued_histogram: Histogram = Histogram(
    name="couchers_background_jobs_queued_seconds",
    documentation="Time background job spent queued before being picked up",
    buckets=(
        0.01,
        0.05,
        0.1,
        0.5,
        1.0,
        2.5,
        5.0,
        10,
        20,
        30,
        40,
        50,
        60,
        90,
        120,
        300,
        600,
        1800,
        3600,
        _INF,
    ),
)
# Durations of gRPC call processing, labelled by method, logged_in, code and exception.
servicer_duration_histogram: Histogram = Histogram(
    name="couchers_servicer_duration_seconds",
    documentation="Durations of processing gRPC calls",
    labelnames=("method", "logged_in", "code", "exception"),
)
def observe_in_servicer_duration_histogram(
    method: str, user_id: Any, status_code: str, exception_type: str, duration_s: float
) -> None:
    """
    Records the duration of one gRPC call in servicer_duration_histogram

    The second label value is a boolean that is True exactly when user_id is not None.
    """
    logged_in = user_id is not None
    series = servicer_duration_histogram.labels(method, logged_in, status_code, exception_type)
    series.observe(duration_s)
# Pairs of (gauge, function returning the value that gauge should be set to).
# The python prometheus client does not support Gauge.set_function for our setup (presumably because of multiprocess
# mode -- confirm), so as a workaround every gauge listed here is set explicitly just before collection.
_set_hacky_gauges_funcs: list[tuple[Gauge, Callable[[], Any]]] = list()
def _make_gauge_from_query(name: str, description: str, statement: Select[Any]) -> Gauge:
    """
    Builds a gauge whose value comes from running a sqlalchemy statement

    name and description are handed straight to the Gauge; statement should be a sqlalchemy SELECT statement that
    returns a single number.

    The gauge is not given a value here. It is paired with a function that runs the statement, and that pair is
    appended to _set_hacky_gauges_funcs.
    """
    gauge = Gauge(name, description, multiprocess_mode="mostrecent")

    def fetch_value() -> Any:
        # one span per metric, named after the gauge; the statement runs in its own session
        with tracer.start_as_current_span(f"metric.{name}"), session_scope() as session:
            return session.execute(statement).scalar_one()

    _set_hacky_gauges_funcs.append((gauge, fetch_value))
    return gauge
# One gauge per look-back window: counts visible users whose last_active is more recent than now minus the window.
active_users_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_active_users_{suffix}",
        description=f"Number of active users in the last {label}",
        statement=(
            select(func.count())
            .select_from(User)
            .where(User.is_visible)
            .where(User.last_active > func.now() - window)
        ),
    )
    for suffix, label, window in (
        ("5m", "5 min", timedelta(minutes=5)),
        ("24h", "24 hours", timedelta(hours=24)),
        ("1month", "1 month", timedelta(days=31)),
        ("3month", "3 months", timedelta(days=92)),
        ("6month", "6 months", timedelta(days=183)),
        ("12month", "12 months", timedelta(days=365)),
    )
]
# Gauges over the User table; every query below is restricted to visible users.
users_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users",
    description="Total number of users",
    statement=select(func.count()).select_from(User).where(User.is_visible),
)

# Breakdown by gender
man_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_man",
    description="Total number of users with gender 'Man'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.gender == "Man")
    ),
)

woman_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_woman",
    description="Total number of users with gender 'Woman'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.gender == "Woman")
    ),
)

nonbinary_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_nonbinary",
    description="Total number of users with gender 'Non-binary'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.gender == "Non-binary")
    ),
)

# Breakdown by hosting status
can_host_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_can_host",
    description="Total number of users with hosting status 'can_host'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.can_host)
    ),
)

cant_host_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_cant_host",
    description="Total number of users with hosting status 'cant_host'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.cant_host)
    ),
)

maybe_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_maybe",
    description="Total number of users with hosting status 'maybe'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.maybe)
    ),
)

# Profile completeness
completed_profile_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_completed_profile",
    description="Total number of users with a completed profile",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.has_completed_profile)
    ),
)

completed_my_home_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_completed_my_home",
    description="Total number of users with a completed my home section",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.has_completed_my_home)
    ),
)
# Activity gauges: each counts the distinct user ids appearing in an activity table, joined to User so that only
# visible users are counted.
sent_message_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_sent_message",
    description="Total number of users who have sent a message",
    statement=(
        select(func.count(distinct(Message.author_id)))
        .join(User, User.id == Message.author_id)
        .where(User.is_visible)
    ),
)

sent_request_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_sent_request",
    description="Total number of users who have sent a host request",
    statement=(
        select(func.count(distinct(HostRequest.surfer_user_id)))
        .join(User, User.id == HostRequest.surfer_user_id)
        .where(User.is_visible)
    ),
)

has_reference_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_has_reference",
    description="Total number of users who have a reference",
    statement=(
        select(func.count(distinct(Reference.to_user_id)))
        .join(User, User.id == Reference.to_user_id)
        .where(User.is_visible)
    ),
)

rsvpd_to_event_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_rsvpd_to_event",
    description="Total number of users who have RSVPd to an event",
    statement=(
        select(func.count(distinct(EventOccurrenceAttendee.user_id)))
        .join(User, User.id == EventOccurrenceAttendee.user_id)
        .where(User.is_visible)
    ),
)
# Number of BackgroundJob rows for which BackgroundJob.ready_for_retry holds.
background_jobs_ready_to_execute_gauge: Gauge = _make_gauge_from_query(
    name="couchers_background_jobs_ready_to_execute",
    description="Total number of background jobs ready to execute",
    statement=(
        select(func.count())
        .select_from(BackgroundJob)
        .where(BackgroundJob.ready_for_retry)
    ),
)
# Counters describing what happens when a background worker tries to pick up a job.
background_jobs_serialization_errors_counter: Counter = Counter(
    name="couchers_background_jobs_serialization_errors_total",
    documentation="Number of times a bg worker has a serialization error",
)

background_jobs_no_jobs_counter: Counter = Counter(
    name="couchers_background_jobs_no_jobs_total",
    documentation="Number of times a bg worker tries to grab a job but there is none",
)

background_jobs_got_job_counter: Counter = Counter(
    name="couchers_background_jobs_got_job_total",
    documentation="Number of times a bg worker grabbed a job",
)
# Signup funnel: initiations are unlabelled, completions and time-to-complete carry a gender label.
signup_initiations_counter: Counter = Counter(
    name="couchers_signup_initiations_total",
    documentation="Number of initiated signups",
)
signup_completions_counter: Counter = Counter(
    name="couchers_signup_completions_total",
    documentation="Number of completed signups",
    labelnames=("gender",),
)
signup_time_histogram: Histogram = Histogram(
    name="couchers_signup_time_seconds",
    documentation="Time taken for a user to sign up",
    labelnames=("gender",),
    # bucket bounds in seconds
    buckets=(30, 60, 90, 120, 180, 240, 300, 360, 420, 480, 540, 600, 900, 1200, 1800, 3600, 7200, _INF),
)

# Logins, labelled by gender
logins_counter: Counter = Counter(
    name="couchers_logins_total",
    documentation="Number of logins",
    labelnames=("gender",),
)
# Password resets (no labels)
password_reset_initiations_counter: Counter = Counter(
    name="couchers_password_reset_initiations_total",
    documentation="Number of password reset initiations",
)
password_reset_completions_counter: Counter = Counter(
    name="couchers_password_reset_completions_total",
    documentation="Number of password reset completions",
)

# Account deletion and recovery, each labelled by gender
account_deletion_initiations_counter: Counter = Counter(
    name="couchers_account_deletion_initiations_total",
    documentation="Number of account deletion initiations",
    labelnames=("gender",),
)
account_deletion_completions_counter: Counter = Counter(
    name="couchers_account_deletion_completions_total",
    documentation="Number of account deletion completions",
    labelnames=("gender",),
)
account_recoveries_counter: Counter = Counter(
    name="couchers_account_recoveries_total",
    documentation="Number of account recoveries",
    labelnames=("gender",),
)
# Strong verification: initiations and data deletions carry a gender label, completions are unlabelled.
strong_verification_initiations_counter: Counter = Counter(
    name="couchers_strong_verification_initiations_total",
    documentation="Number of strong verification initiations",
    labelnames=("gender",),
)
strong_verification_completions_counter: Counter = Counter(
    name="couchers_strong_verification_completions_total",
    documentation="Number of strong verification completions",
)
strong_verification_data_deletions_counter: Counter = Counter(
    name="couchers_strong_verification_data_deletions_total",
    documentation="Number of strong verification data deletions",
    labelnames=("gender",),
)
# Host requests sent and the responses to them, labelled by the genders of both parties.
host_requests_sent_counter: Counter = Counter(
    name="couchers_host_requests_total",
    documentation="Number of host requests sent",
    labelnames=("from_gender", "to_gender"),
)
host_request_responses_counter: Counter = Counter(
    name="couchers_host_requests_responses_total",
    documentation="Number of responses to host requests",
    labelnames=("responder_gender", "other_gender", "response_type"),
)

# Messages sent, labelled by sender gender and message type
sent_messages_counter: Counter = Counter(
    name="couchers_sent_messages_total",
    documentation="Number of messages sent",
    labelnames=("gender", "message_type"),
)
# Outbound notifications
push_notification_counter: Counter = Counter(
    name="couchers_push_notification_total",
    documentation="Number of push notification delivery attempts",
    labelnames=("platform", "outcome"),
)
emails_counter: Counter = Counter(
    name="couchers_emails_total",
    documentation="Number of emails sent",
)

# Recaptcha assessments, labelled by action
recaptchas_assessed_counter: Counter = Counter(
    name="couchers_recaptchas_assessed_total",
    documentation="Number of times a recaptcha assessment is created",
    labelnames=("action",),
)

recaptcha_score_histogram: Histogram = Histogram(
    name="couchers_recaptcha_score",
    documentation="Score of recaptcha assessments",
    labelnames=("action",),
    # 21 bucket bounds: 0.0, 0.05, ..., 1.0
    buckets=tuple(step / 20 for step in range(21)),
)
# Response time to host requests, labelled by host gender, surfer gender and response type.
# Bucket bounds are in seconds. The week-sized bounds are multiples of 604_800 (= 7 * 86_400); they used to be
# multiples of 602_000, which is roughly 47 minutes short of a real week per multiple.
host_request_first_response_histogram: Histogram = Histogram(
    "couchers_host_request_first_response_seconds",
    "Response time to host requests",
    labelnames=["host_gender", "surfer_gender", "response_type"],
    buckets=(
        1 * 60,  # 1m
        2 * 60,  # 2m
        5 * 60,  # 5m
        10 * 60,  # 10m
        15 * 60,  # 15m
        30 * 60,  # 30m
        45 * 60,  # 45m
        3_600,  # 1h
        2 * 3_600,  # 2h
        3 * 3_600,  # 3h
        6 * 3_600,  # 6h
        12 * 3_600,  # 12h
        86_400,  # 24h
        2 * 86_400,  # 2d
        5 * 86_400,  # 5d (value kept as-is; the comment previously said 4d)
        604_800,  # 1w
        2 * 604_800,  # 2w
        3 * 604_800,  # 3w
        4 * 604_800,  # 4w
        _INF,
    ),
)
# Age of the account sending a host request, labelled by surfer gender and host gender.
# Bucket bounds are in seconds. The week-sized bounds are multiples of 604_800 (= 7 * 86_400); they used to be
# multiples of 602_000, which is roughly 47 minutes short of a real week per multiple.
account_age_on_host_request_create_histogram: Histogram = Histogram(
    "couchers_account_age_on_host_request_create_histogram_seconds",
    "Age of account sending a host request",
    labelnames=["surfer_gender", "host_gender"],
    buckets=(
        5 * 60,  # 5m
        10 * 60,  # 10m
        15 * 60,  # 15m
        30 * 60,  # 30m
        45 * 60,  # 45m
        3_600,  # 1h
        2 * 3_600,  # 2h
        3 * 3_600,  # 3h
        6 * 3_600,  # 6h
        12 * 3_600,  # 12h
        86_400,  # 24h
        2 * 86_400,  # 2d
        3 * 86_400,  # 3d
        4 * 86_400,  # 4d
        5 * 86_400,  # 5d
        6 * 86_400,  # 6d
        604_800,  # 1w
        2 * 604_800,  # 2w
        3 * 604_800,  # 3w
        4 * 604_800,  # 4w
        5 * 604_800,  # 5w
        10 * 604_800,  # 10w
        25 * 604_800,  # 25w
        52 * 604_800,  # 52w
        104 * 604_800,  # 104w
        _INF,
    ),
)
# =============================================================================
# Moderation metrics
# =============================================================================

# Gauges: Queue lengths
# A queue item counts as unresolved while its resolved_by_log_id is NULL.
moderation_queue_length_gauge: Gauge = _make_gauge_from_query(
    name="couchers_moderation_queue_length",
    description="Total number of unresolved items in the moderation queue",
    statement=(
        select(func.count())
        .select_from(ModerationQueueItem)
        .where(ModerationQueueItem.resolved_by_log_id.is_(None))
    ),
)
# One unresolved-queue-length gauge per ModerationTrigger member; the gauge name ends in the lower-cased member name.
# NOTE(review): these names share the "couchers_moderation_queue_length_" prefix with the per-object-type gauges, so a
# trigger and an object type with the same name would presumably clash -- confirm the enums never overlap.
moderation_queue_length_by_trigger_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_queue_length_{trig.name.lower()}",
        description=f"Number of unresolved items in the moderation queue with trigger {trig.name}",
        statement=(
            select(func.count())
            .select_from(ModerationQueueItem)
            .where(ModerationQueueItem.resolved_by_log_id.is_(None))
            .where(ModerationQueueItem.trigger == trig)
        ),
    )
    for trig in ModerationTrigger
]
# One unresolved-queue-length gauge per ModerationObjectType member. The object type lives on ModerationState, so the
# queue items are joined to their moderation state before filtering.
moderation_queue_length_by_object_type_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_queue_length_{kind.name.lower()}",
        description=f"Number of unresolved items in the moderation queue for {kind.name}",
        statement=(
            select(func.count())
            .select_from(ModerationQueueItem)
            .join(ModerationState, ModerationQueueItem.moderation_state_id == ModerationState.id)
            .where(ModerationQueueItem.resolved_by_log_id.is_(None))
            .where(ModerationState.object_type == kind)
        ),
    )
    for kind in ModerationObjectType
]
# Gauges: Items in each visibility state by object type
# One gauge for every (object type, visibility) combination, object type varying slowest.
moderation_visibility_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_items_{kind.name.lower()}_{vis.name.lower()}",
        description=f"Number of {kind.name} items with visibility {vis.name}",
        statement=(
            select(func.count())
            .select_from(ModerationState)
            .where(ModerationState.object_type == kind)
            .where(ModerationState.visibility == vis)
        ),
    )
    for kind in ModerationObjectType
    for vis in ModerationVisibility
]
# Counters: Moderation actions taken
moderation_actions_counter: Counter = Counter(
    name="couchers_moderation_actions_total",
    documentation="Number of moderation actions taken",
    labelnames=("action", "object_type"),
)
def observe_moderation_action(action: ModerationAction, object_type: ModerationObjectType) -> None:
    """
    Increments moderation_actions_counter for the given action and object type

    The enum member names are used as label values.
    """
    series = moderation_actions_counter.labels(action.name, object_type.name)
    series.inc()
# Counters: Visibility state transitions
moderation_visibility_transitions_counter: Counter = Counter(
    name="couchers_moderation_visibility_transitions_total",
    documentation="Number of visibility state transitions",
    labelnames=("from_visibility", "to_visibility", "object_type"),
)
def observe_moderation_visibility_transition(
    from_visibility: ModerationVisibility, to_visibility: ModerationVisibility, object_type: ModerationObjectType
) -> None:
    """
    Increments moderation_visibility_transitions_counter for one visibility change

    The enum member names are used as label values, in the order from_visibility, to_visibility, object_type.
    """
    series = moderation_visibility_transitions_counter.labels(
        from_visibility.name,
        to_visibility.name,
        object_type.name,
    )
    series.inc()
# Counters: Auto-approved items
moderation_auto_approved_counter: Counter = Counter(
    name="couchers_moderation_auto_approved_total",
    documentation="Number of items that were auto-approved",
)


# Counters: Queue items created
moderation_queue_items_created_counter: Counter = Counter(
    name="couchers_moderation_queue_items_created_total",
    documentation="Number of moderation queue items created",
    labelnames=("trigger", "object_type"),
)
def observe_moderation_queue_item_created(trigger: ModerationTrigger, object_type: ModerationObjectType) -> None:
    """
    Increments moderation_queue_items_created_counter for the given trigger and object type

    The enum member names are used as label values.
    """
    series = moderation_queue_items_created_counter.labels(trigger.name, object_type.name)
    series.inc()
# Counters: Queue items resolved
moderation_queue_items_resolved_counter: Counter = Counter(
    name="couchers_moderation_queue_items_resolved_total",
    documentation="Number of moderation queue items resolved",
    labelnames=("trigger", "action", "object_type"),
)
def observe_moderation_queue_item_resolved(
    trigger: ModerationTrigger, action: ModerationAction, object_type: ModerationObjectType
) -> None:
    """
    Increments moderation_queue_items_resolved_counter for one resolved queue item

    The enum member names are used as label values, in the order trigger, action, object_type.
    """
    series = moderation_queue_items_resolved_counter.labels(trigger.name, action.name, object_type.name)
    series.inc()
# Histogram: Time to resolve queue items
# Bucket bounds are in seconds, ranging from 0.1s up to 30 days.
moderation_queue_resolution_time_histogram: Histogram = Histogram(
    name="couchers_moderation_queue_resolution_seconds",
    documentation="Time taken to resolve moderation queue items",
    labelnames=("trigger", "action", "object_type"),
    buckets=(
        0.1,
        0.25,
        0.5,
        1,
        2.5,
        5,
        10,
        30,
        60,  # 1m
        5 * 60,  # 5m
        15 * 60,  # 15m
        30 * 60,  # 30m
        3_600,  # 1h
        2 * 3_600,  # 2h
        6 * 3_600,  # 6h
        12 * 3_600,  # 12h
        86_400,  # 24h
        2 * 86_400,  # 2d
        3 * 86_400,  # 3d
        7 * 86_400,  # 1w
        14 * 86_400,  # 2w
        30 * 86_400,  # 30d
        _INF,
    ),
)
def observe_moderation_queue_resolution_time(
    trigger: ModerationTrigger, action: ModerationAction, object_type: ModerationObjectType, duration_s: float
) -> None:
    """
    Records how long one moderation queue item took to resolve in moderation_queue_resolution_time_histogram

    The enum member names are used as label values, in the order trigger, action, object_type.
    """
    series = moderation_queue_resolution_time_histogram.labels(trigger.name, action.name, object_type.name)
    series.observe(duration_s)
def create_prometheus_server(port: int) -> Any:
    """
    Custom start method to fix the problem described in https://github.com/prometheus/client_python/issues/155

    Builds a threading WSGI server on the given port that serves the contents of the module-level registry, starts it
    in a daemon thread, and returns the server object.
    """

    def app(environ: Any, start_response: Any) -> Any:
        # refresh every query-backed gauge right before the registry is serialized
        for gauge, compute in _set_hacky_gauges_funcs:
            gauge.set(compute())

        payload = generate_latest(registry)
        headers = [("Content-type", CONTENT_TYPE_LATEST), ("Content-Length", str(len(payload)))]
        start_response("200 OK", headers)
        return [payload]

    httpd = exposition.make_server(  # type: ignore[attr-defined]
        "", port, app, exposition.ThreadingWSGIServer, handler_class=exposition._SilentHandler
    )
    # daemon thread, so the server thread does not keep the process alive on its own
    threading.Thread(target=httpd.serve_forever, daemon=True).start()
    return httpd