Coverage for app / backend / src / couchers / metrics.py: 100%

105 statements  

« prev     ^ index     » next       coverage.py v7.13.2, created at 2026-02-03 06:18 +0000

1import threading 

2from collections.abc import Callable 

3from datetime import timedelta 

4from typing import Any 

5 

6from opentelemetry import trace 

7from prometheus_client import ( 

8 CONTENT_TYPE_LATEST, 

9 CollectorRegistry, 

10 Counter, 

11 Gauge, 

12 Histogram, 

13 exposition, 

14 generate_latest, 

15 multiprocess, 

16) 

17from prometheus_client.registry import CollectorRegistry 

18from sqlalchemy import select 

19from sqlalchemy.sql import distinct, func 

20from sqlalchemy.sql.selectable import Select 

21 

22from couchers.db import session_scope 

23from couchers.helpers.completed_profile import has_completed_profile_expression 

24from couchers.models import BackgroundJob, EventOccurrenceAttendee, HostingStatus, HostRequest, Message, Reference, User 

25from couchers.models.moderation import ( 

26 ModerationAction, 

27 ModerationObjectType, 

28 ModerationQueueItem, 

29 ModerationState, 

30 ModerationTrigger, 

31 ModerationVisibility, 

32) 

33 

# OpenTelemetry tracer for this module
tracer = trace.get_tracer(__name__)

# dedicated registry with the multiprocess collector attached to it; this is the registry that gets rendered for scrapes
registry: CollectorRegistry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)  # type: ignore[no-untyped-call]

# positive infinity, used as the explicit last bucket of the histograms below
_INF: float = float("inf")

# one series per (job, status, attempt, exception) combination
jobs_duration_histogram: Histogram = Histogram(
    name="couchers_background_jobs_seconds",
    documentation="Durations of background jobs",
    labelnames=["job", "status", "attempt", "exception"],
)

46 

47 

def observe_in_jobs_duration_histogram(
    job_type: str, job_state: str, try_count: int, exception_name: str, duration_s: float
) -> None:
    """
    Records the duration (in seconds) of one background job run in `jobs_duration_histogram`

    The label values are, in order: job type, job state, attempt number (as a string), exception name.
    """
    attempt = str(try_count)
    series = jobs_duration_histogram.labels(job_type, job_state, attempt, exception_name)
    series.observe(duration_s)

52 

53 

# unlabelled histogram with explicit buckets ranging from 10 ms up to one hour
jobs_queued_histogram: Histogram = Histogram(
    name="couchers_background_jobs_queued_seconds",
    documentation="Time background job spent queued before being picked up",
    buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 10, 20, 30, 40, 50, 60, 90, 120, 300, 600, 1800, 3600, _INF),
)


# one series per (method, logged_in, code, exception) combination
servicer_duration_histogram: Histogram = Histogram(
    name="couchers_servicer_duration_seconds",
    documentation="Durations of processing gRPC calls",
    labelnames=["method", "logged_in", "code", "exception"],
)

66 

67 

def observe_in_servicer_duration_histogram(
    method: str, user_id: Any, status_code: str, exception_type: str, duration_s: float
) -> None:
    """
    Records the duration (in seconds) of one gRPC call in `servicer_duration_histogram`

    Only whether a user id is present is recorded (as the `logged_in` label), never the id itself.
    """
    logged_in = user_id is not None
    series = servicer_duration_histogram.labels(method, logged_in, status_code, exception_type)
    series.observe(duration_s)

72 

73 

# list of (gauge, function) pairs, where calling the function returns the value the gauge should be set to
# Gauge.set_function is not usable here (NOTE(review): the python prometheus client documents it as unsupported in
# multiprocess mode, which this module uses -- confirm), so instead we hack around it and set each gauge just before
# collection using this list
_set_hacky_gauges_funcs: list[tuple[Gauge, Callable[[], Any]]] = []


def _make_gauge_from_query(name: str, description: str, statement: Select[Any]) -> Gauge:
    """
    Creates a gauge whose value is obtained by running a sqlalchemy statement

    statement should be a sqlalchemy SELECT statement that returns a single number

    The statement is not executed here: the gauge is appended to `_set_hacky_gauges_funcs` together with a function
    that executes the statement (inside a tracing span named `metric.<name>`) and returns its single scalar result.
    """
    gauge = Gauge(name, description, multiprocess_mode="mostrecent")

    def _run_query() -> Any:
        with tracer.start_as_current_span(f"metric.{name}"), session_scope() as session:
            result = session.execute(statement)
            return result.scalar_one()

    _set_hacky_gauges_funcs.append((gauge, _run_query))
    return gauge

95 

96 

# one gauge per time window: visible users whose last_active falls within the window ending now
# months are approximated with fixed day counts (31, 92, 183 and 365 days)
active_users_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_active_users_{suffix}",
        description=f"Number of active users in the last {window_label}",
        statement=(
            select(func.count())
            .select_from(User)
            .where(User.is_visible)
            .where(User.last_active > func.now() - window)
        ),
    )
    for suffix, window_label, window in [
        ("5m", "5 min", timedelta(minutes=5)),
        ("24h", "24 hours", timedelta(hours=24)),
        ("1month", "1 month", timedelta(days=31)),
        ("3month", "3 months", timedelta(days=92)),
        ("6month", "6 months", timedelta(days=183)),
        ("12month", "12 months", timedelta(days=365)),
    ]
]

112 

# all gauges in this group count rows of User restricted to User.is_visible, plus one extra condition each

users_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users",
    description="Total number of users",
    statement=select(func.count()).select_from(User).where(User.is_visible),
)

# by gender
man_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_man",
    description="Total number of users with gender 'Man'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.gender == "Man")
    ),
)

woman_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_woman",
    description="Total number of users with gender 'Woman'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.gender == "Woman")
    ),
)

nonbinary_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_nonbinary",
    description="Total number of users with gender 'Non-binary'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.gender == "Non-binary")
    ),
)

# by hosting status
can_host_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_can_host",
    description="Total number of users with hosting status 'can_host'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.can_host)
    ),
)

cant_host_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_cant_host",
    description="Total number of users with hosting status 'cant_host'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.cant_host)
    ),
)

maybe_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_maybe",
    description="Total number of users with hosting status 'maybe'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.maybe)
    ),
)

# by profile completeness
completed_profile_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_completed_profile",
    description="Total number of users with a completed profile",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(has_completed_profile_expression())
    ),
)

completed_my_home_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_completed_my_home",
    description="Total number of users with a completed my home section",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.has_completed_my_home)
    ),
)

164 

# gauges in this group count distinct user ids found in an activity table, joined to User so that only visible users
# are counted

sent_message_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_sent_message",
    description="Total number of users who have sent a message",
    statement=(
        select(func.count(distinct(Message.author_id)))
        .join(User, User.id == Message.author_id)
        .where(User.is_visible)
    ),
)

sent_request_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_sent_request",
    description="Total number of users who have sent a host request",
    statement=(
        select(func.count(distinct(HostRequest.surfer_user_id)))
        .join(User, User.id == HostRequest.surfer_user_id)
        .where(User.is_visible)
    ),
)

has_reference_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_has_reference",
    description="Total number of users who have a reference",
    statement=(
        select(func.count(distinct(Reference.to_user_id)))
        .join(User, User.id == Reference.to_user_id)
        .where(User.is_visible)
    ),
)

rsvpd_to_event_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_rsvpd_to_event",
    description="Total number of users who have RSVPd to an event",
    statement=(
        select(func.count(distinct(EventOccurrenceAttendee.user_id)))
        .join(User, User.id == EventOccurrenceAttendee.user_id)
        .where(User.is_visible)
    ),
)

200 

# number of BackgroundJob rows matching BackgroundJob.ready_for_retry
background_jobs_ready_to_execute_gauge: Gauge = _make_gauge_from_query(
    name="couchers_background_jobs_ready_to_execute",
    description="Total number of background jobs ready to execute",
    statement=select(func.count()).select_from(BackgroundJob).where(BackgroundJob.ready_for_retry),
)

# unlabelled counters describing what happens when a worker tries to pick up a job
background_jobs_serialization_errors_counter: Counter = Counter(
    name="couchers_background_jobs_serialization_errors_total",
    documentation="Number of times a bg worker has a serialization error",
)

background_jobs_no_jobs_counter: Counter = Counter(
    name="couchers_background_jobs_no_jobs_total",
    documentation="Number of times a bg worker tries to grab a job but there is none",
)

background_jobs_got_job_counter: Counter = Counter(
    name="couchers_background_jobs_got_job_total",
    documentation="Number of times a bg worker grabbed a job",
)

221 

222 

# signup funnel
signup_initiations_counter: Counter = Counter(
    name="couchers_signup_initiations_total",
    documentation="Number of initiated signups",
)
signup_completions_counter: Counter = Counter(
    name="couchers_signup_completions_total",
    documentation="Number of completed signups",
    labelnames=["gender"],
)
# explicit buckets from 30 seconds up to two hours
signup_time_histogram: Histogram = Histogram(
    name="couchers_signup_time_seconds",
    documentation="Time taken for a user to sign up",
    labelnames=["gender"],
    buckets=(30, 60, 90, 120, 180, 240, 300, 360, 420, 480, 540, 600, 900, 1200, 1800, 3600, 7200, _INF),
)

# logins
logins_counter: Counter = Counter(
    name="couchers_logins_total",
    documentation="Number of logins",
    labelnames=["gender"],
)

# password resets (unlabelled)
password_reset_initiations_counter: Counter = Counter(
    name="couchers_password_reset_initiations_total",
    documentation="Number of password reset initiations",
)
password_reset_completions_counter: Counter = Counter(
    name="couchers_password_reset_completions_total",
    documentation="Number of password reset completions",
)

# account deletion and recovery
account_deletion_initiations_counter: Counter = Counter(
    name="couchers_account_deletion_initiations_total",
    documentation="Number of account deletion initiations",
    labelnames=["gender"],
)
account_deletion_completions_counter: Counter = Counter(
    name="couchers_account_deletion_completions_total",
    documentation="Number of account deletion completions",
    labelnames=["gender"],
)
account_recoveries_counter: Counter = Counter(
    name="couchers_account_recoveries_total",
    documentation="Number of account recoveries",
    labelnames=["gender"],
)

# strong verification; note that the completions counter is the only one of the three without a gender label
strong_verification_initiations_counter: Counter = Counter(
    name="couchers_strong_verification_initiations_total",
    documentation="Number of strong verification initiations",
    labelnames=["gender"],
)
strong_verification_completions_counter: Counter = Counter(
    name="couchers_strong_verification_completions_total",
    documentation="Number of strong verification completions",
)
strong_verification_data_deletions_counter: Counter = Counter(
    name="couchers_strong_verification_data_deletions_total",
    documentation="Number of strong verification data deletions",
    labelnames=["gender"],
)

284 

# host requests
host_requests_sent_counter: Counter = Counter(
    name="couchers_host_requests_total",
    documentation="Number of host requests sent",
    labelnames=["from_gender", "to_gender"],
)
host_request_responses_counter: Counter = Counter(
    name="couchers_host_requests_responses_total",
    documentation="Number of responses to host requests",
    labelnames=["responder_gender", "other_gender", "response_type"],
)

# messages
sent_messages_counter: Counter = Counter(
    name="couchers_sent_messages_total",
    documentation="Number of messages sent",
    labelnames=["gender", "message_type"],
)


# outbound notifications
push_notification_counter: Counter = Counter(
    name="couchers_push_notification_total",
    documentation="Number of push notification delivery attempts",
    labelnames=["platform", "outcome"],
)
emails_counter: Counter = Counter(
    name="couchers_emails_total",
    documentation="Number of emails sent",
)


# recaptcha
recaptchas_assessed_counter: Counter = Counter(
    name="couchers_recaptchas_assessed_total",
    documentation="Number of times a recaptcha assessment is created",
    labelnames=["action"],
)

# 21 evenly spaced buckets: 0.0, 0.05, 0.10, ..., 1.0
recaptcha_score_histogram: Histogram = Histogram(
    name="couchers_recaptcha_score",
    documentation="Score of recaptcha assessments",
    labelnames=["action"],
    buckets=tuple(step / 20 for step in range(21)),
)

326 

# NOTE(review): both histograms below use 602_000 seconds as their "week" step, although an exact week is
# 604_800 seconds (7 * 86_400). The value is kept unchanged here because the bucket boundaries are part of the
# exported metric; confirm whether it should be corrected.

# buckets: 1m 2m 5m 10m 15m 30m 45m, 1h 2h 3h 6h 12h, 1d 2d 5d, then 1x..4x the ~week step
# (the 5 * 86_400 bucket is five days; it had been annotated as "4d")
host_request_first_response_histogram: Histogram = Histogram(
    name="couchers_host_request_first_response_seconds",
    documentation="Response time to host requests",
    labelnames=["host_gender", "surfer_gender", "response_type"],
    buckets=(
        *(minutes * 60 for minutes in (1, 2, 5, 10, 15, 30, 45)),
        *(hours * 3_600 for hours in (1, 2, 3, 6, 12)),
        *(days * 86_400 for days in (1, 2, 5)),
        *(approx_weeks * 602_000 for approx_weeks in (1, 2, 3, 4)),
        _INF,
    ),
)
# buckets: 5m 10m 15m 30m 45m, 1h 2h 3h 6h 12h, 1d..6d, then 1x 2x 3x 4x 5x 10x 25x 52x 104x the ~week step
account_age_on_host_request_create_histogram: Histogram = Histogram(
    name="couchers_account_age_on_host_request_create_histogram_seconds",
    documentation="Age of account sending a host request",
    labelnames=["surfer_gender", "host_gender"],
    buckets=(
        *(minutes * 60 for minutes in (5, 10, 15, 30, 45)),
        *(hours * 3_600 for hours in (1, 2, 3, 6, 12)),
        *(days * 86_400 for days in (1, 2, 3, 4, 5, 6)),
        *(approx_weeks * 602_000 for approx_weeks in (1, 2, 3, 4, 5, 10, 25, 52, 104)),
        _INF,
    ),
)

387 

388 

# =============================================================================
# Moderation metrics
# =============================================================================

# Gauges: Queue lengths
# an item counts as unresolved while its resolved_by_log_id is NULL
moderation_queue_length_gauge: Gauge = _make_gauge_from_query(
    name="couchers_moderation_queue_length",
    description="Total number of unresolved items in the moderation queue",
    statement=(
        select(func.count())
        .select_from(ModerationQueueItem)
        .where(ModerationQueueItem.resolved_by_log_id.is_(None))
    ),
)

# NOTE(review): the per-trigger and per-object-type gauges below share the same name prefix; presumably the two enums
# have no member names in common -- confirm, since a clash would give two gauges the same name

# one gauge per ModerationTrigger member
moderation_queue_length_by_trigger_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_queue_length_{trigger.name.lower()}",
        description=f"Number of unresolved items in the moderation queue with trigger {trigger.name}",
        statement=(
            select(func.count())
            .select_from(ModerationQueueItem)
            .where(ModerationQueueItem.resolved_by_log_id.is_(None))
            .where(ModerationQueueItem.trigger == trigger)
        ),
    )
    for trigger in ModerationTrigger
]

# one gauge per ModerationObjectType member; the object type lives on the joined ModerationState row
moderation_queue_length_by_object_type_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_queue_length_{object_type.name.lower()}",
        description=f"Number of unresolved items in the moderation queue for {object_type.name}",
        statement=(
            select(func.count())
            .select_from(ModerationQueueItem)
            .join(ModerationState, ModerationQueueItem.moderation_state_id == ModerationState.id)
            .where(ModerationQueueItem.resolved_by_log_id.is_(None))
            .where(ModerationState.object_type == object_type)
        ),
    )
    for object_type in ModerationObjectType
]

# Gauges: Items in each visibility state by object type
# one gauge per (object type, visibility) pair, object type being the outer loop
moderation_visibility_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_items_{object_type.name.lower()}_{visibility.name.lower()}",
        description=f"Number of {object_type.name} items with visibility {visibility.name}",
        statement=(
            select(func.count())
            .select_from(ModerationState)
            .where(ModerationState.object_type == object_type)
            .where(ModerationState.visibility == visibility)
        ),
    )
    for object_type in ModerationObjectType
    for visibility in ModerationVisibility
]

438 

# Counters: Moderation actions taken
# labelled by the action and by the type of object it was applied to
moderation_actions_counter: Counter = Counter(
    name="couchers_moderation_actions_total",
    documentation="Number of moderation actions taken",
    labelnames=["action", "object_type"],
)

445 

446 

def observe_moderation_action(action: ModerationAction, object_type: ModerationObjectType) -> None:
    """
    Increments `moderation_actions_counter` by one, using the enum member names as label values
    """
    series = moderation_actions_counter.labels(action.name, object_type.name)
    series.inc()

449 

450 

# Counters: Visibility state transitions
# labelled by the visibility before, the visibility after, and the type of object
moderation_visibility_transitions_counter: Counter = Counter(
    name="couchers_moderation_visibility_transitions_total",
    documentation="Number of visibility state transitions",
    labelnames=["from_visibility", "to_visibility", "object_type"],
)

457 

458 

def observe_moderation_visibility_transition(
    from_visibility: ModerationVisibility, to_visibility: ModerationVisibility, object_type: ModerationObjectType
) -> None:
    """
    Increments `moderation_visibility_transitions_counter` by one, using the enum member names as label values
    """
    series = moderation_visibility_transitions_counter.labels(
        from_visibility.name, to_visibility.name, object_type.name
    )
    series.inc()

463 

464 

# Counters: Auto-approved items
# unlabelled
moderation_auto_approved_counter: Counter = Counter(
    name="couchers_moderation_auto_approved_total",
    documentation="Number of items that were auto-approved",
)


# Counters: Queue items created
# labelled by what triggered the item and by the type of object it refers to
moderation_queue_items_created_counter: Counter = Counter(
    name="couchers_moderation_queue_items_created_total",
    documentation="Number of moderation queue items created",
    labelnames=["trigger", "object_type"],
)

478 

479 

def observe_moderation_queue_item_created(trigger: ModerationTrigger, object_type: ModerationObjectType) -> None:
    """
    Increments `moderation_queue_items_created_counter` by one, using the enum member names as label values
    """
    series = moderation_queue_items_created_counter.labels(trigger.name, object_type.name)
    series.inc()

482 

483 

# Counters: Queue items resolved
# labelled by the item's trigger, the resolving action, and the type of object
moderation_queue_items_resolved_counter: Counter = Counter(
    name="couchers_moderation_queue_items_resolved_total",
    documentation="Number of moderation queue items resolved",
    labelnames=["trigger", "action", "object_type"],
)

490 

491 

def observe_moderation_queue_item_resolved(
    trigger: ModerationTrigger, action: ModerationAction, object_type: ModerationObjectType
) -> None:
    """
    Increments `moderation_queue_items_resolved_counter` by one, using the enum member names as label values
    """
    series = moderation_queue_items_resolved_counter.labels(trigger.name, action.name, object_type.name)
    series.inc()

496 

497 

# Histogram: Time to resolve queue items
# buckets: 0.1s up to 60s, then 5m 15m 30m, 1h 2h 6h 12h, 1d 2d 3d 7d 14d 30d
moderation_queue_resolution_time_histogram: Histogram = Histogram(
    name="couchers_moderation_queue_resolution_seconds",
    documentation="Time taken to resolve moderation queue items",
    labelnames=["trigger", "action", "object_type"],
    buckets=(
        0.1,
        0.25,
        0.5,
        1,
        2.5,
        5,
        10,
        30,
        60,
        *(minutes * 60 for minutes in (5, 15, 30)),
        *(hours * 3_600 for hours in (1, 2, 6, 12)),
        *(days * 86_400 for days in (1, 2, 3, 7, 14, 30)),
        _INF,
    ),
)

529 

530 

def observe_moderation_queue_resolution_time(
    trigger: ModerationTrigger, action: ModerationAction, object_type: ModerationObjectType, duration_s: float
) -> None:
    """
    Records in `moderation_queue_resolution_time_histogram` how long (in seconds) a queue item took to resolve

    The enum member names are used as label values.
    """
    series = moderation_queue_resolution_time_histogram.labels(trigger.name, action.name, object_type.name)
    series.observe(duration_s)

535 

536 

def create_prometheus_server(port: int) -> Any:
    """
    Custom start method to fix the problem described in https://github.com/prometheus/client_python/issues/155

    Starts a threading WSGI server on the given port, serving from a daemon thread, and returns the server object.
    On every request all gauges listed in `_set_hacky_gauges_funcs` are recomputed before `registry` is rendered.
    """

    def app(environ: Any, start_response: Any) -> Any:
        # refresh the hacky gauges so the rendered output carries current values
        for gauge, compute_value in _set_hacky_gauges_funcs:
            gauge.set(compute_value())

        payload = generate_latest(registry)
        headers = [("Content-type", CONTENT_TYPE_LATEST), ("Content-Length", str(len(payload)))]
        start_response("200 OK", headers)
        return [payload]

    httpd = exposition.make_server(  # type: ignore[attr-defined]
        "", port, app, exposition.ThreadingWSGIServer, handler_class=exposition._SilentHandler
    )
    # daemon thread, so the server does not keep the process alive on its own
    server_thread = threading.Thread(target=httpd.serve_forever, daemon=True)
    server_thread.start()
    return httpd