Merge pull request #52 from e10v/dev

Add an option to strip unnecessary whitespace
e10v · May 21, 2023 · 6d2f775 · 6d2f775
2 parents 4bfdf2a + d48940b
commit 6d2f775
Show file tree

Hide file tree

Showing 6 changed files with 206 additions and 119 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -86,48 +86,13 @@ reportMissingTypeStubs = false
 
 [tool.ruff]
 select = [
-    "A",
-    "ANN",
-    "ARG",
-    "B",
-    "C4",
-    "C90",
-    "COM",
-    "D",
-    "DTZ",
-    "E",
-    "ERA",
-    "F",
-    "I",
-    "ICN",
-    "INP",
-    "N",
-    "PGH",
-    "PL",
-    "PT",
-    "RET",
-    "RSE",
-    "RUF",
-    "Q",
-    "SIM",
-    "SLF",
-    "TCH",
-    "TID",
-    "TRY",
-    "UP",
-    "W",
+    "A", "ANN", "ARG", "B", "C4", "C90", "COM", "D", "DTZ", "E", "ERA", "F",
+    "I", "ICN", "INP", "N", "PGH", "PL", "PT", "RET", "RSE", "RUF", "Q", "SIM",
+    "SLF", "TCH", "TID", "TRY", "UP", "W",
 ]
 ignore = [
-    "ANN101",
-    "ANN102",
-    "ANN204",
-    "ANN401",
-    "B006",
-    "N817",
-    "PGH003",
-    "PT001",
-    "SLF001",
-    "TRY003",
+    "ANN101", "ANN102", "ANN204", "ANN401", "B006", "N817", "PGH003", "PT001",
+    "SLF001", "TRY003",
 ]
 
 [tool.ruff.per-file-ignores]

diff --git a/src/rico/__init__.py b/src/rico/__init__.py
@@ -3,4 +3,21 @@
 """Rich content to HTML as easy as doc.print(x)."""
 
 from rico._version import __version__
-from rico.html import HTMLParser, indent_html, parse_html, serialize_html
+from rico.content import (
+    HTML,
+    Chart,
+    Code,
+    Content,
+    ContentBase,
+    Image,
+    Markdown,
+    Tag,
+    Text,
+)
+from rico.html import (
+    HTMLParser,
+    indent_html,
+    parse_html,
+    serialize_html,
+    strip_html,
+)
diff --git a/src/rico/content.py b/src/rico/content.py
@@ -57,18 +57,20 @@ def __init__(self, class_: str | None = None):
         attrib = {"class": class_} if class_ is not None else {}
         self.container = ET.Element("div", attrib=attrib)
 
-    def serialize(self, indent_space: str | None = None) -> str:
+    def serialize(self, indent_space: str | None = None, strip: bool = False) -> str:
         """Serialize the object to string in HTML format.
 
         Indent the object if `indent_space` is not None.
 
         Args:
             indent_space: The whitespace for indentation.
+            strip: If True, strip unnecessary whitespace.
 
         Returns:
             The serialized object.
         """
-        return rico.html.serialize_html(self.container, indent_space=indent_space)
+        return rico.html.serialize_html(
+            self.container, indent_space=indent_space, strip=strip)
 
     def __str__(self) -> str:
         """Serialize the object to string in HTML format."""

diff --git a/src/rico/html.py b/src/rico/html.py
@@ -6,15 +6,19 @@
 import xml.etree.ElementTree as ET
 
 
-UNINDENTED_TAGS = {"pre"}
-
-# Copy of xml.etree.ElementTree.HTML_EMPTY.
-EMPTY_TAGS = {
+TAGS_EMPTY = {
     "area", "base", "basefont", "br", "col", "embed", "frame", "hr", "img",
     "input", "isindex", "link", "meta", "param", "path", "source", "track", "wbr",
 }
 
-UNESCAPED_TAGS = {"script", "style"}
+TAGS_INLINE = {
+    "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em",
+    "i", "kbd", "mark", "q", "rp", "rt", "ruby", "s", "samp", "small", "span",
+    "strong", "sub", "sup", "time", "u", "var", "wbr",
+}
+
+TAGS_NOT_ESCAPED = {"script", "style"}
+TAGS_PRE_FORMATTED = {"pre"}
 
 
 class HTMLParser(html.parser.HTMLParser):
@@ -75,10 +79,10 @@ def indent_html(
     space: str = "  ",
     level: int = 0,
 ) -> ET.Element:
-    """Indent an HTML document.
+    """Indent an HTML element.
 
     Insert newlines and indentation space after elements.
-    Create a new document instead of updating inplace.
+    Create a new element instead of updating inplace.
     Do not indent elements inside <pre> tag.
 
     Args:
@@ -87,9 +91,9 @@ def indent_html(
         level: The initial indentation level. Should always be 0.
 
     Returns:
-        The indented HTML document.
+        The indented HTML element.
     """
-    if element.tag in UNINDENTED_TAGS or not len(element):
+    if element.tag.lower() in TAGS_PRE_FORMATTED or not len(element):
         return element
 
     indented_element = ET.Element(element.tag, attrib=element.attrib)
@@ -113,6 +117,48 @@ def indent_html(
     return indented_element
 
 
+def strip_html(element: ET.Element) -> ET.Element:
+    """Strip an HTML element.
+
+    Remove unnecessary whitespace from the element by stripping elements'
+    text and tail.
+    Do not strip elements inside <pre> tag or inside inline tags.
+
+    Args:
+        element: The element to strip.
+
+    Returns:
+        The stripped HTML element.
+    """
+    stripped_element = ET.Element(element.tag, attrib=element.attrib)
+    stripped_element.text = element.text
+    stripped_element.tail = element.tail
+
+    if element.tag.lower() in TAGS_INLINE | TAGS_PRE_FORMATTED:
+        for child in element:
+            stripped_element.append(child)
+    else:
+        for child in element:
+            stripped_element.append(strip_html(child))
+
+        if stripped_element.text:
+            if len(stripped_element) == 0 and (
+                not stripped_element.tail or
+                not stripped_element.tail.strip()
+            ):
+                stripped_element.text = stripped_element.text.strip()
+            else:
+                stripped_element.text = stripped_element.text.lstrip()
+
+    if stripped_element.tail:
+        if len(stripped_element) == 0 and not stripped_element.text:
+            stripped_element.tail = stripped_element.tail.strip()
+        else:
+            stripped_element.tail = stripped_element.tail.rstrip()
+
+    return stripped_element
+
+
 def _escape_cdata(text: str) -> str:
     """Copy of xml.etree.ElementTree._escape_cdata."""
     if "&" in text:
@@ -133,7 +179,11 @@ def _escape_attrib_html(text: str) -> str:
         text = text.replace('"', "&quot;")
     return text
 
-def serialize_html(element: ET.Element, indent_space: str | None = None) -> str:
+def serialize_html(
+    element: ET.Element,
+    indent_space: str | None = None,
+    strip: bool = False,
+) -> str:
     """Serialize an HTML document to a string.
 
     Indent the document if `indent_space` is not None.
@@ -143,10 +193,14 @@ def serialize_html(element: ET.Element, indent_space: str | None = None) -> str:
     Args:
         element: The HTML document.
         indent_space: The whitespace for indentation.
+        strip: If True, strip unnecessary whitespace.
 
     Returns:
         The serialized HTML document.
     """
+    if strip:
+        element = strip_html(element)
+
     if indent_space is not None:
         element = indent_html(element, space=indent_space)
 
@@ -162,12 +216,12 @@ def serialize_html(element: ET.Element, indent_space: str | None = None) -> str:
     ltag = element.tag.lower()
 
     if element.text is not None:
-        text = element.text if ltag in UNESCAPED_TAGS else _escape_cdata(element.text)
+        text = element.text if ltag in TAGS_NOT_ESCAPED else _escape_cdata(element.text)
     else:
         text = ""
 
     children = "".join(serialize_html(e) for e in element)
-    closing_tag = f"</{element.tag}>" if ltag not in EMPTY_TAGS else ""
+    closing_tag = f"</{element.tag}>" if ltag not in TAGS_EMPTY else ""
     tail = _escape_cdata(element.tail) if element.tail is not None else ""
 
     return opening_tag + text + children + closing_tag + tail
diff --git a/tests/test_content.py b/tests/test_content.py
@@ -66,6 +66,11 @@ def test_content_base_indent(content_base_subclass_sample: rico.content.ContentB
     assert content_base_subclass_sample.serialize("    ") == expectation
 
 
+def test_content_base_strip(content_base_subclass_sample: rico.content.ContentBase):
+    expectation = '<div class="row"><p>Hello world</p></div>'
+    assert content_base_subclass_sample.serialize(strip=True) == expectation
+
+
 def test_tag():
     content = rico.content.Tag(
         "p",