Coverage for app/backend/src/couchers/markup.py: 100%

1from html.parser import HTMLParser

2from typing import Any

4from markdown_it import MarkdownIt

5from markupsafe import Markup

# Markdown config should match frontend's MarkdownNoSSR component.
# NOTE(review): the list delimiters around the rule names and the closing parenthesis of
# enable() were restored here; confirm against the repository copy of this module.
_markdown = MarkdownIt(
    "zero",  # Base configuration disables all features
    options_update={
        "typographer": True,  # Enable some language-neutral replacement + quotes beautification
        "breaks": True,  # Convert '\n' in paragraphs into <br>
    },
).enable(
    # enable() takes the rule names as one iterable; passing them as separate positional
    # arguments would bind the second name to its `ignoreInvalid` flag instead.
    [
        "emphasis",  # Process *this* and _that_
        "heading",  # Headings (#, ##, ...)
        "hr",  # Horizontal rule
        "link",  # Process [link](<to> "stuff")
        "list",  # Lists
        "newline",  # Process '\n'
        "smartquotes",  # Convert straight quotation marks to typographic ones
    ]
)

def markdown_to_html(text: str) -> Markup:
    """
    Renders Markdown source to HTML with the module-level `_markdown` parser.

    The rendered string is returned wrapped in `Markup`.
    """
    rendered = _markdown.render(text)
    return Markup(rendered)

def markdown_to_plaintext(text: str) -> str:
    """
    Renders Markdown source to plaintext.

    The text is first rendered to HTML via `markdown_to_html`, then that HTML is
    reduced to plaintext via `html_to_plaintext`.
    """
    as_html = markdown_to_html(text)
    return html_to_plaintext(as_html)

def html_to_plaintext(html: str | Markup) -> str:
    """
    Renders a plaintext version of HTML by extracting inner HTML and converting entities+newlines.
    Do not use for sanitization. The resulting string may not be markup-safe.

    Tags are dropped, `<br>` becomes a newline, character references are unescaped and
    literal newlines in the HTML source are removed.
    """
    if isinstance(html, Markup):
        # Hand the parser a plain str rather than a Markup instance
        # (presumably to keep Markup's escaping string overloads out of the parser).
        html = str(html)

    converter = _HTMLToPlaintext()
    converter.feed(html)
    # feed() holds back trailing text that could be the start of a character reference or
    # tag (e.g. "AT&T" or a dangling "<") in case more input follows. close() signals
    # end-of-input so that buffered text is flushed into the result instead of being lost.
    converter.close()
    return converter.plaintext

class _HTMLToPlaintext(HTMLParser):
    """
    HTML parser that accumulates the text content of the fed HTML in `plaintext`.

    Only `<br>` start tags produce output of their own (a newline); all other tags are
    ignored and only the text between them is kept.
    """

    # Text collected so far; read by callers once parsing is done.
    plaintext: str

    def __init__(self) -> None:
        self.plaintext = ""
        super().__init__()

    def handle_starttag(self, tag: str, attrs: Any) -> None:
        # Every tag other than a line break is dropped without a trace.
        if tag != "br":
            return
        self.plaintext = self.plaintext + "\n"

    def handle_data(self, data: str) -> None:
        # Escapes have already been unescaped
        # Newlines in html are meaningless, so strip them from the text run.
        without_newlines = "".join(data.split("\n"))
        self.plaintext = self.plaintext + without_newlines