Coverage for src/couchers/metrics.py: 100%

104 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-12-14 00:52 +0000

1import threading 

2from collections.abc import Callable 

3from datetime import timedelta 

4from typing import Any 

5 

6from opentelemetry import trace 

7from prometheus_client import ( 

8 CONTENT_TYPE_LATEST, 

9 CollectorRegistry, 

10 Counter, 

11 Gauge, 

12 Histogram, 

13 exposition, 

14 generate_latest, 

15 multiprocess, 

16) 

17from prometheus_client.registry import CollectorRegistry 

18from sqlalchemy.sql import distinct, func 

19from sqlalchemy.sql.selectable import Select 

20 

21from couchers.db import session_scope 

22from couchers.models import BackgroundJob, EventOccurrenceAttendee, HostingStatus, HostRequest, Message, Reference, User 

23from couchers.models.moderation import ( 

24 ModerationAction, 

25 ModerationObjectType, 

26 ModerationQueueItem, 

27 ModerationState, 

28 ModerationTrigger, 

29 ModerationVisibility, 

30) 

31from couchers.sql import couchers_select as select 

32 

# Tracer used to wrap each gauge query in its own span (see _make_gauge_from_query)
tracer = trace.get_tracer(__name__)

# Dedicated registry with a multiprocess collector attached; this is the registry that the metrics endpoint in
# create_prometheus_server serializes
registry: CollectorRegistry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)  # type: ignore[no-untyped-call]

# Upper bound of the last (catch-all) bucket in the explicit histogram bucket lists in this module
_INF: float = float("inf")

39 

# Durations of background jobs, labelled by job, status, attempt and exception
jobs_duration_histogram: Histogram = Histogram(
    name="couchers_background_jobs_seconds",
    documentation="Durations of background jobs",
    labelnames=["job", "status", "attempt", "exception"],
)

45 

46 

def observe_in_jobs_duration_histogram(
    job_type: str, job_state: str, try_count: int, exception_name: str, duration_s: float
) -> None:
    """
    Records one observation of `duration_s` in `jobs_duration_histogram`

    Label values are given in the order job type, job state, try count (converted to a string) and exception name.
    """
    attempt = str(try_count)
    series = jobs_duration_histogram.labels(job_type, job_state, attempt, exception_name)
    series.observe(duration_s)

51 

52 

# Time a background job spent queued before being picked up; unlabelled, with explicit buckets ending in _INF
jobs_queued_histogram: Histogram = Histogram(
    name="couchers_background_jobs_queued_seconds",
    documentation="Time background job spent queued before being picked up",
    buckets=(
        0.01,
        0.05,
        0.1,
        0.5,
        1.0,
        2.5,
        5.0,
        10,
        20,
        30,
        40,
        50,
        60,
        90,
        120,
        300,
        600,
        1800,
        3600,
        _INF,
    ),
)

58 

59 

# Durations of processing gRPC calls, labelled by method, logged_in, code and exception
servicer_duration_histogram: Histogram = Histogram(
    name="couchers_servicer_duration_seconds",
    documentation="Durations of processing gRPC calls",
    labelnames=["method", "logged_in", "code", "exception"],
)

65 

66 

def observe_in_servicer_duration_histogram(
    method: str, user_id: Any, status_code: str, exception_type: str, duration_s: float
) -> None:
    """
    Records one observation of `duration_s` in `servicer_duration_histogram`

    The `logged_in` label is derived from `user_id`: it is true whenever `user_id` is not None.
    """
    logged_in = user_id is not None
    series = servicer_duration_histogram.labels(method, logged_in, status_code, exception_type)
    series.observe(duration_s)

71 

72 

# List of (gauge, function) pairs: each function is executed to compute the value its gauge gets set to.
# The python prometheus client does not support Gauge.set_function, so instead we hack around it and set each gauge just
# before collection using this list (see create_prometheus_server)
_set_hacky_gauges_funcs: list[tuple[Gauge, Callable[[], Any]]] = []

77 

78 

def _make_gauge_from_query(name: str, description: str, statement: Select[Any]) -> Gauge:
    """
    Builds a gauge whose value comes from running a sqlalchemy SELECT statement

    The statement should be a sqlalchemy SELECT statement that returns a single number.

    The query is not executed here. Instead, the gauge is appended to `_set_hacky_gauges_funcs` together with a function
    that runs the statement in a fresh session, inside a tracing span named `metric.<name>`.
    """
    span_name = f"metric.{name}"

    def run_query() -> Any:
        with tracer.start_as_current_span(span_name), session_scope() as session:
            return session.execute(statement).scalar_one()

    query_gauge = Gauge(name, description, multiprocess_mode="mostrecent")
    _set_hacky_gauges_funcs.append((query_gauge, run_query))
    return query_gauge

94 

95 

# One gauge per time window: counts visible users whose last_active is more recent than now minus the window
active_users_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_active_users_{suffix}",
        description=f"Number of active users in the last {window}",
        statement=(
            select(func.count())
            .select_from(User)
            .where(User.is_visible)
            .where(User.last_active > func.now() - cutoff)
        ),
    )
    # (metric name suffix, human readable window, length of the window)
    for suffix, window, cutoff in (
        ("5m", "5 min", timedelta(minutes=5)),
        ("24h", "24 hours", timedelta(hours=24)),
        ("1month", "1 month", timedelta(days=31)),
        ("3month", "3 months", timedelta(days=92)),
        ("6month", "6 months", timedelta(days=183)),
        ("12month", "12 months", timedelta(days=365)),
    )
]

111 

# Gauges counting visible users, in total and broken down by a single user attribute

users_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users",
    description="Total number of users",
    statement=select(func.count()).select_from(User).where(User.is_visible),
)

man_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_man",
    description="Total number of users with gender 'Man'",
    statement=select(func.count()).select_from(User).where(User.is_visible).where(User.gender == "Man"),
)

woman_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_woman",
    description="Total number of users with gender 'Woman'",
    statement=select(func.count()).select_from(User).where(User.is_visible).where(User.gender == "Woman"),
)

nonbinary_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_nonbinary",
    description="Total number of users with gender 'Non-binary'",
    statement=select(func.count()).select_from(User).where(User.is_visible).where(User.gender == "Non-binary"),
)

can_host_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_can_host",
    description="Total number of users with hosting status 'can_host'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.can_host)
    ),
)

cant_host_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_cant_host",
    description="Total number of users with hosting status 'cant_host'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.cant_host)
    ),
)

maybe_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_maybe",
    description="Total number of users with hosting status 'maybe'",
    statement=(
        select(func.count())
        .select_from(User)
        .where(User.is_visible)
        .where(User.hosting_status == HostingStatus.maybe)
    ),
)

completed_profile_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_completed_profile",
    description="Total number of users with a completed profile",
    statement=select(func.count()).select_from(User).where(User.is_visible).where(User.has_completed_profile),
)

completed_my_home_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_completed_my_home",
    description="Total number of users with a completed my home section",
    statement=select(func.count()).select_from(User).where(User.is_visible).where(User.has_completed_my_home),
)

163 

# Gauges counting distinct visible users that appear in another table (as author, surfer, reference target or attendee)

sent_message_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_sent_message",
    description="Total number of users who have sent a message",
    statement=(
        select(func.count(distinct(Message.author_id)))
        .join(User, User.id == Message.author_id)
        .where(User.is_visible)
    ),
)

sent_request_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_sent_request",
    description="Total number of users who have sent a host request",
    statement=(
        select(func.count(distinct(HostRequest.surfer_user_id)))
        .join(User, User.id == HostRequest.surfer_user_id)
        .where(User.is_visible)
    ),
)

has_reference_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_has_reference",
    description="Total number of users who have a reference",
    statement=(
        select(func.count(distinct(Reference.to_user_id)))
        .join(User, User.id == Reference.to_user_id)
        .where(User.is_visible)
    ),
)

rsvpd_to_event_gauge: Gauge = _make_gauge_from_query(
    name="couchers_users_rsvpd_to_event",
    description="Total number of users who have RSVPd to an event",
    statement=(
        select(func.count(distinct(EventOccurrenceAttendee.user_id)))
        .join(User, User.id == EventOccurrenceAttendee.user_id)
        .where(User.is_visible)
    ),
)

199 

# Background job metrics: one query-backed gauge and three unlabelled counters

background_jobs_ready_to_execute_gauge: Gauge = _make_gauge_from_query(
    name="couchers_background_jobs_ready_to_execute",
    description="Total number of background jobs ready to execute",
    statement=select(func.count()).select_from(BackgroundJob).where(BackgroundJob.ready_for_retry),
)

background_jobs_serialization_errors_counter: Counter = Counter(
    name="couchers_background_jobs_serialization_errors_total",
    documentation="Number of times a bg worker has a serialization error",
)

background_jobs_no_jobs_counter: Counter = Counter(
    name="couchers_background_jobs_no_jobs_total",
    documentation="Number of times a bg worker tries to grab a job but there is none",
)

background_jobs_got_job_counter: Counter = Counter(
    name="couchers_background_jobs_got_job_total",
    documentation="Number of times a bg worker grabbed a job",
)

220 

221 

# Signup metrics: initiations are unlabelled, completions and signup time are labelled by gender
signup_initiations_counter: Counter = Counter(
    name="couchers_signup_initiations_total",
    documentation="Number of initiated signups",
)
signup_completions_counter: Counter = Counter(
    name="couchers_signup_completions_total",
    documentation="Number of completed signups",
    labelnames=["gender"],
)
signup_time_histogram: Histogram = Histogram(
    name="couchers_signup_time_seconds",
    documentation="Time taken for a user to sign up",
    labelnames=["gender"],
    buckets=(
        30,
        60,
        90,
        120,
        180,
        240,
        300,
        360,
        420,
        480,
        540,
        600,
        900,
        1200,
        1800,
        3600,
        7200,
        _INF,
    ),
)

237 

# Account lifecycle counters: logins, password resets, account deletion/recovery and strong verification

logins_counter: Counter = Counter(
    name="couchers_logins_total",
    documentation="Number of logins",
    labelnames=["gender"],
)

password_reset_initiations_counter: Counter = Counter(
    name="couchers_password_reset_initiations_total",
    documentation="Number of password reset initiations",
)
password_reset_completions_counter: Counter = Counter(
    name="couchers_password_reset_completions_total",
    documentation="Number of password reset completions",
)

account_deletion_initiations_counter: Counter = Counter(
    name="couchers_account_deletion_initiations_total",
    documentation="Number of account deletion initiations",
    labelnames=["gender"],
)
account_deletion_completions_counter: Counter = Counter(
    name="couchers_account_deletion_completions_total",
    documentation="Number of account deletion completions",
    labelnames=["gender"],
)
account_recoveries_counter: Counter = Counter(
    name="couchers_account_recoveries_total",
    documentation="Number of account recoveries",
    labelnames=["gender"],
)

strong_verification_initiations_counter: Counter = Counter(
    name="couchers_strong_verification_initiations_total",
    documentation="Number of strong verification initiations",
    labelnames=["gender"],
)
# unlike the other strong verification counters, this one has no gender label
strong_verification_completions_counter: Counter = Counter(
    name="couchers_strong_verification_completions_total",
    documentation="Number of strong verification completions",
)
strong_verification_data_deletions_counter: Counter = Counter(
    name="couchers_strong_verification_data_deletions_total",
    documentation="Number of strong verification data deletions",
    labelnames=["gender"],
)

283 

# Counters for host requests, messages, push notifications and emails

host_requests_sent_counter: Counter = Counter(
    name="couchers_host_requests_total",
    documentation="Number of host requests sent",
    labelnames=["from_gender", "to_gender"],
)
host_request_responses_counter: Counter = Counter(
    name="couchers_host_requests_responses_total",
    documentation="Number of responses to host requests",
    labelnames=["responder_gender", "other_gender", "response_type"],
)

sent_messages_counter: Counter = Counter(
    name="couchers_sent_messages_total",
    documentation="Number of messages sent",
    labelnames=["gender", "message_type"],
)

push_notification_counter: Counter = Counter(
    name="couchers_push_notification_total",
    documentation="Number of push notification delivery attempts",
    labelnames=["platform", "outcome"],
)
emails_counter: Counter = Counter(
    name="couchers_emails_total",
    documentation="Number of emails sent",
)

311 

312 

# Recaptcha metrics, both labelled by action

recaptchas_assessed_counter: Counter = Counter(
    name="couchers_recaptchas_assessed_total",
    documentation="Number of times a recaptcha assessment is created",
    labelnames=["action"],
)

recaptcha_score_histogram: Histogram = Histogram(
    name="couchers_recaptcha_score",
    documentation="Score of recaptcha assessments",
    labelnames=["action"],
    # 21 bucket bounds from 0.0 to 1.0 in steps of 0.05
    buckets=tuple(step / 20 for step in range(21)),
)

325 

# Histogram of response times to host requests, labelled by host gender, surfer gender and response type.
# Bucket upper bounds are in seconds (per the metric name).
# NOTE(review): the week-based bounds use 602_000 s per week, while an exact week is 604_800 s (7 * 86_400), so those
# bounds are only approximately whole weeks -- confirm whether this is intentional before changing any bucket boundary
host_request_first_response_histogram: Histogram = Histogram(
    "couchers_host_request_first_response_seconds",
    "Response time to host requests",
    labelnames=["host_gender", "surfer_gender", "response_type"],
    buckets=(
        1 * 60,  # 1m
        2 * 60,  # 2m
        5 * 60,  # 5m
        10 * 60,  # 10m
        15 * 60,  # 15m
        30 * 60,  # 30m
        45 * 60,  # 45m
        3_600,  # 1h
        2 * 3_600,  # 2h
        3 * 3_600,  # 3h
        6 * 3_600,  # 6h
        12 * 3_600,  # 12h
        86_400,  # 24h
        2 * 86_400,  # 2d
        5 * 86_400,  # 5d
        602_000,  # ~1w
        2 * 602_000,  # ~2w
        3 * 602_000,  # ~3w
        4 * 602_000,  # ~4w
        _INF,
    ),
)

# Histogram of the age of the account sending a host request, labelled by surfer gender and host gender.
# Bucket upper bounds are in seconds (per the metric name).
# NOTE(review): the week-based bounds use 602_000 s per week, while an exact week is 604_800 s (7 * 86_400), so those
# bounds are only approximately whole weeks -- confirm whether this is intentional before changing any bucket boundary
account_age_on_host_request_create_histogram: Histogram = Histogram(
    "couchers_account_age_on_host_request_create_histogram_seconds",
    "Age of account sending a host request",
    labelnames=["surfer_gender", "host_gender"],
    buckets=(
        5 * 60,  # 5m
        10 * 60,  # 10m
        15 * 60,  # 15m
        30 * 60,  # 30m
        45 * 60,  # 45m
        3_600,  # 1h
        2 * 3_600,  # 2h
        3 * 3_600,  # 3h
        6 * 3_600,  # 6h
        12 * 3_600,  # 12h
        86_400,  # 24h
        2 * 86_400,  # 2d
        3 * 86_400,  # 3d
        4 * 86_400,  # 4d
        5 * 86_400,  # 5d
        6 * 86_400,  # 6d
        602_000,  # ~1w
        2 * 602_000,  # ~2w
        3 * 602_000,  # ~3w
        4 * 602_000,  # ~4w
        5 * 602_000,  # ~5w
        10 * 602_000,  # ~10w
        25 * 602_000,  # ~25w
        52 * 602_000,  # ~52w
        104 * 602_000,  # ~104w
        _INF,
    ),
)

386 

387 

# =============================================================================
# Moderation metrics
# =============================================================================

# Gauges: Queue lengths (a queue item counts as unresolved while its resolved_by_log_id is NULL)
moderation_queue_length_gauge: Gauge = _make_gauge_from_query(
    name="couchers_moderation_queue_length",
    description="Total number of unresolved items in the moderation queue",
    statement=(
        select(func.count()).select_from(ModerationQueueItem).where(ModerationQueueItem.resolved_by_log_id.is_(None))
    ),
)

# one gauge per member of ModerationTrigger
moderation_queue_length_by_trigger_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_queue_length_{trig.name.lower()}",
        description=f"Number of unresolved items in the moderation queue with trigger {trig.name}",
        statement=(
            select(func.count())
            .select_from(ModerationQueueItem)
            .where(ModerationQueueItem.resolved_by_log_id.is_(None))
            .where(ModerationQueueItem.trigger == trig)
        ),
    )
    for trig in ModerationTrigger
]

# one gauge per member of ModerationObjectType; the object type lives on the joined ModerationState row
moderation_queue_length_by_object_type_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_queue_length_{obj_type.name.lower()}",
        description=f"Number of unresolved items in the moderation queue for {obj_type.name}",
        statement=(
            select(func.count())
            .select_from(ModerationQueueItem)
            .join(ModerationState, ModerationQueueItem.moderation_state_id == ModerationState.id)
            .where(ModerationQueueItem.resolved_by_log_id.is_(None))
            .where(ModerationState.object_type == obj_type)
        ),
    )
    for obj_type in ModerationObjectType
]

# Gauges: Items in each visibility state by object type (one gauge per object type / visibility combination)
moderation_visibility_gauges: list[Gauge] = [
    _make_gauge_from_query(
        name=f"couchers_moderation_items_{obj_type.name.lower()}_{vis.name.lower()}",
        description=f"Number of {obj_type.name} items with visibility {vis.name}",
        statement=(
            select(func.count())
            .select_from(ModerationState)
            .where(ModerationState.object_type == obj_type)
            .where(ModerationState.visibility == vis)
        ),
    )
    for obj_type in ModerationObjectType
    for vis in ModerationVisibility
]

# Counters: Moderation actions taken
moderation_actions_counter: Counter = Counter(
    name="couchers_moderation_actions_total",
    documentation="Number of moderation actions taken",
    labelnames=["action", "object_type"],
)

444 

445 

def observe_moderation_action(action: ModerationAction, object_type: ModerationObjectType) -> None:
    """
    Increments `moderation_actions_counter` by one, labelled with the names of the action and of the object type
    """
    label_values = (action.name, object_type.name)
    moderation_actions_counter.labels(*label_values).inc()

448 

449 

# Counters: Visibility state transitions, labelled by the old visibility, the new visibility and the object type
moderation_visibility_transitions_counter: Counter = Counter(
    name="couchers_moderation_visibility_transitions_total",
    documentation="Number of visibility state transitions",
    labelnames=["from_visibility", "to_visibility", "object_type"],
)

456 

457 

def observe_moderation_visibility_transition(
    from_visibility: ModerationVisibility, to_visibility: ModerationVisibility, object_type: ModerationObjectType
) -> None:
    """
    Increments `moderation_visibility_transitions_counter` by one

    The labels are the names of the previous visibility, the new visibility and the object type, in that order.
    """
    label_values = (from_visibility.name, to_visibility.name, object_type.name)
    moderation_visibility_transitions_counter.labels(*label_values).inc()

462 

463 

# Counters: Auto-approved items (unlabelled)
moderation_auto_approved_counter: Counter = Counter(
    name="couchers_moderation_auto_approved_total",
    documentation="Number of items that were auto-approved",
)


# Counters: Queue items created, labelled by trigger and object type
moderation_queue_items_created_counter: Counter = Counter(
    name="couchers_moderation_queue_items_created_total",
    documentation="Number of moderation queue items created",
    labelnames=["trigger", "object_type"],
)

477 

478 

def observe_moderation_queue_item_created(trigger: ModerationTrigger, object_type: ModerationObjectType) -> None:
    """
    Increments `moderation_queue_items_created_counter` by one, labelled with the names of the trigger and object type
    """
    label_values = (trigger.name, object_type.name)
    moderation_queue_items_created_counter.labels(*label_values).inc()

481 

482 

# Counters: Queue items resolved, labelled by trigger, action and object type
moderation_queue_items_resolved_counter: Counter = Counter(
    name="couchers_moderation_queue_items_resolved_total",
    documentation="Number of moderation queue items resolved",
    labelnames=["trigger", "action", "object_type"],
)

489 

490 

def observe_moderation_queue_item_resolved(
    trigger: ModerationTrigger, action: ModerationAction, object_type: ModerationObjectType
) -> None:
    """
    Increments `moderation_queue_items_resolved_counter` by one

    The labels are the names of the trigger, the action and the object type, in that order.
    """
    label_values = (trigger.name, action.name, object_type.name)
    moderation_queue_items_resolved_counter.labels(*label_values).inc()

495 

496 

# Histogram: Time to resolve queue items, labelled by trigger, action and object type
moderation_queue_resolution_time_histogram: Histogram = Histogram(
    name="couchers_moderation_queue_resolution_seconds",
    documentation="Time taken to resolve moderation queue items",
    labelnames=["trigger", "action", "object_type"],
    buckets=(
        # sub-minute bounds
        0.1,
        0.25,
        0.5,
        1,
        2.5,
        5,
        10,
        30,
        60,
        # multiples of a minute (60 s)
        5 * 60,
        15 * 60,
        30 * 60,
        # multiples of an hour (3_600 s)
        3_600,
        2 * 3_600,
        6 * 3_600,
        12 * 3_600,
        # multiples of a day (86_400 s)
        86_400,
        2 * 86_400,
        3 * 86_400,
        7 * 86_400,
        14 * 86_400,
        30 * 86_400,
        _INF,
    ),
)

528 

529 

def observe_moderation_queue_resolution_time(
    trigger: ModerationTrigger, action: ModerationAction, object_type: ModerationObjectType, duration_s: float
) -> None:
    """
    Records one observation of `duration_s` in `moderation_queue_resolution_time_histogram`

    The labels are the names of the trigger, the action and the object type, in that order.
    """
    label_values = (trigger.name, action.name, object_type.name)
    moderation_queue_resolution_time_histogram.labels(*label_values).observe(duration_s)

534 

535 

def create_prometheus_server(port: int) -> Any:
    """
    Starts an HTTP server on `port` that serves the metrics of `registry`, and returns the server object

    This is a custom start method to fix the problem described in
    https://github.com/prometheus/client_python/issues/155

    The server runs `serve_forever` in a daemon thread. On every request, each gauge registered in
    `_set_hacky_gauges_funcs` is first set to the current result of its function, then the registry is serialized.
    """

    def app(environ: Any, start_response: Any) -> Any:
        # refresh the query-backed gauges right before serializing
        for gauge, compute_value in _set_hacky_gauges_funcs:
            gauge.set(compute_value())

        payload = generate_latest(registry)
        headers = [("Content-type", CONTENT_TYPE_LATEST), ("Content-Length", str(len(payload)))]
        start_response("200 OK", headers)
        return [payload]

    httpd = exposition.make_server(  # type: ignore[attr-defined]
        "", port, app, exposition.ThreadingWSGIServer, handler_class=exposition._SilentHandler
    )
    threading.Thread(target=httpd.serve_forever, daemon=True).start()
    return httpd

552 t.daemon = True 

553 t.start() 

554 return httpd