Split the parser module apart into multiple files.

speced · Dec 1, 2023 · 4b055e7 · 4b055e7
1 parent 0689f35
commit 4b055e7
Show file tree

Hide file tree

Showing 7 changed files with 3,137 additions and 3,241 deletions.
diff --git a/bikeshed/h/__init__.py b/bikeshed/h/__init__.py
@@ -77,44 +77,11 @@
     wrapContents,
 )
 from .parser import (
-    Comment,
-    EndTag,
-    Failure,
     ParseConfig,
-    ParseFailure,
-    Result,
-    StartTag,
-    Stream,
     initialDocumentParse,
-    isASCII,
-    isASCIIAlpha,
-    isASCIIAlphanum,
-    isASCIILowerAlpha,
-    isASCIIUpperAlpha,
-    isAttrNameChar,
-    isControl,
-    isDigit,
-    isHexDigit,
-    isNoncharacter,
-    isTagnameChar,
-    isWhitespace,
-    parseAttribute,
-    parseCharRef,
-    parseComment,
-    parseDoctype,
-    parseEndTag,
     parseLines,
-    parseQuotedAttrValue,
-    parseScriptToEnd,
-    parseStartTag,
-    parseStyleToEnd,
-    parseTagName,
     parseText,
     parseTitle,
-    parseUnquotedAttrValue,
-    parseWhitespace,
-    parseXmpToEnd,
     strFromNodes,
-    test,
 )
 from .serializer import Serializer
diff --git a/bikeshed/h/parser/__init__.py b/bikeshed/h/parser/__init__.py
@@ -1,25 +1,29 @@
-from .parser import (
+from .main import (
+    initialDocumentParse,
+    linesFromNodes,
+    nodesFromHtml,
+    nodesFromStream,
+    parseLines,
+    parseText,
+    parseTitle,
+    strFromNodes,
+)
+from .nodes import (
     Comment,
     Doctype,
     EndTag,
-    Failure,
-    ParseConfig,
-    ParseFailure,
     ParserNode,
     RawElement,
     RawText,
-    Result,
     SafeText,
     SelfClosedTag,
     StartTag,
-    Stream,
     Text,
-    initialDocumentParse,
-    linesFromNodes,
-    nodesFromHtml,
-    nodesFromStream,
-    parseLines,
-    parseText,
-    parseTitle,
-    strFromNodes,
+)
+from .stream import (
+    Failure,
+    ParseConfig,
+    ParseFailure,
+    Result,
+    Stream,
 )
diff --git a/bikeshed/h/parser/main.py b/bikeshed/h/parser/main.py
@@ -0,0 +1,112 @@
+# pylint: skip-file
+from __future__ import annotations
+
+import enum
+import io
+import os
+import re
+
+from ... import constants, t
+from ... import messages as m
+from .nodes import (
+    Comment,
+    Doctype,
+    EndTag,
+    ParserNode,
+    RawElement,
+    RawText,
+    SafeText,
+    SelfClosedTag,
+    StartTag,
+    Text,
+)
+from .parser import POSSIBLE_NODE_START_CHARS, nodesFromStream
+from .stream import Failure, ParseConfig, ParseFailure, Result, Stream
+
+
+def test() -> None:
+    import json
+
+    with io.open(os.path.abspath("test.txt"), "r") as fh:
+        vals = "\n".join(x for x in json.load(fh).values())
+        list(nodesFromHtml(vals, ParseConfig()))
+
+
+def nodesFromHtml(data: str, config: ParseConfig, startLine: int = 1) -> t.Generator[ParserNode, None, None]:
+    s = Stream(data, startLine=startLine, config=config)
+    yield from nodesFromStream(s, 0)
+
+
+def initialDocumentParse(text: str, config: ParseConfig, startLine: int = 1) -> list[ParserNode]:
+    # Just do a document parse.
+    # This will add `bs-line-number` attributes,
+    # normalize any difficult shorthands
+    # (ones that look like tags, or that contain raw text),
+    # and blank out comments.
+
+    return list(nodesFromHtml(text, config, startLine=startLine))
+
+
+def strFromNodes(nodes: t.Iterable[ParserNode], withIlcc: bool = False) -> str:
+    strs = []
+    ilcc = constants.incrementLineCountChar
+    dlcc = constants.decrementLineCountChar
+    for node in nodes:
+        if isinstance(node, Comment):
+            # Serialize comments as a standardized, recognizable sequence
+            # so Markdown processing can ignore them better.
+            strs.append(constants.bsComment)
+            if withIlcc:
+                strs.append(ilcc * node.data.count("\n"))
+            continue
+        s = str(node)
+        if withIlcc:
+            outputExtraLines = s.count("\n")
+            sourceExtraLines = node.endLine - node.line
+            diff = sourceExtraLines - outputExtraLines
+            if diff > 0:
+                s += ilcc * diff
+            elif diff < 0:
+                s += dlcc * -diff
+        strs.append(s)
+    return "".join(strs)
+
+
+def linesFromNodes(nodes: t.Iterable[ParserNode]) -> list[str]:
+    return strFromNodes(nodes).split("\n")
+
+
+def debugNodes(nodes: t.Iterable[ParserNode]) -> list[ParserNode]:
+    nodes = list(nodes)
+    print("\n".join(repr(x) for x in nodes))  # noqa: T201
+    return nodes
+
+
+def parseLines(textLines: list[str], config: ParseConfig, startLine: int = 1) -> list[str]:
+    # Runs a list of lines through the parser,
+    # returning another list of lines.
+
+    if len(textLines) == 0:
+        return textLines
+    endingWithNewline = textLines[0].endswith("\n")
+    if endingWithNewline:
+        text = "".join(textLines)
+    else:
+        text = "\n".join(textLines)
+    parsedLines = strFromNodes(nodesFromHtml(text, config, startLine=startLine)).split("\n")
+    if endingWithNewline:
+        parsedLines = [x + "\n" for x in parsedLines]
+
+    return parsedLines
+
+
+def parseText(text: str, config: ParseConfig, startLine: int = 1) -> str:
+    # Just runs the text through the parser and serializes the resulting nodes back to a string.
+    return strFromNodes(nodesFromHtml(text, config, startLine=startLine))
+
+
+def parseTitle(text: str, config: ParseConfig, startLine: int = 1) -> str:
+    # Parses the text, but keeps only the Text nodes (tags and every other kind of node are dropped),
+    # as they'd just show up as literal text in <title>.
+    nodes = nodesFromHtml(text, config, startLine=startLine)
+    return strFromNodes(n for n in nodes if isinstance(n, Text))
diff --git a/bikeshed/h/parser/nodes.py b/bikeshed/h/parser/nodes.py
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+import dataclasses
+import re
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass, field
+
+from ... import t
+
+
+@dataclass
+class ParserNode(metaclass=ABCMeta):
+    line: int
+    endLine: int
+
+
+@dataclass
+class Text(ParserNode, metaclass=ABCMeta):
+    text: str
+
+    @abstractmethod
+    def __str__(self) -> str:
+        pass
+
+
+@dataclass
+class RawText(Text):
+    # Raw source text; may contain HTML-significant characters, and is stringified as-is (no escaping).
+
+    def __str__(self) -> str:
+        return self.text
+
+    def curlifyApostrophes(self, lastNode: ParserNode | None) -> RawText:
+        if (
+            self.text[0] == "'"
+            and isinstance(lastNode, (EndTag, RawElement, SelfClosedTag))
+            and re.match(r"'\w", self.text)
+        ):
+            self.text = "’" + self.text[1:]
+        if "'" in self.text:
+            self.text = re.sub(r"(\w)'(\w)", r"\1’\2", self.text)
+        return self
+
+
+@dataclass
+class SafeText(Text):
+    # "Safe" text, automatically escapes special HTML chars
+    # when stringified.
+    def __str__(self) -> str:
+        return escapeHTML(self.text)
+
+
+@dataclass
+class Doctype(ParserNode):
+    data: str
+
+    def __str__(self) -> str:
+        return self.data
+
+
+@dataclass
+class StartTag(ParserNode):
+    tag: str
+    attrs: dict[str, str] = field(default_factory=dict)
+    classes: set[str] = field(default_factory=set)
+
+    def __str__(self) -> str:
+        s = f"<{self.tag} bs-line-number={self.line}"
+        for k, v in sorted(self.attrs.items()):
+            if k == "bs-line-number":
+                continue
+            v = v.replace('"', "&#34;")
+            s += f' {k}="{v}"'
+        if self.classes:
+            s += f' class="{" ".join(sorted(self.classes))}"'
+        s += ">"
+        return s
+
+    def printEndTag(self) -> str:
+        return f"</{self.tag}>"
+
+    def finalize(self) -> StartTag:
+        if "class" in self.attrs:
+            self.classes = set(self.attrs["class"].split())
+            del self.attrs["class"]
+        return self
+
+    def clone(self, **kwargs: t.Any) -> StartTag:
+        return dataclasses.replace(self, **kwargs)
+
+
+@dataclass
+class SelfClosedTag(ParserNode):
+    tag: str
+    attrs: dict[str, str] = field(default_factory=dict)
+    classes: set[str] = field(default_factory=set)
+
+    def __str__(self) -> str:
+        s = f"<{self.tag} bs-line-number={self.line}"
+        for k, v in sorted(self.attrs.items()):
+            if k == "bs-line-number":
+                continue
+            s += f' {k}="{escapeAttr(v)}"'
+        if self.classes:
+            s += f' class="{" ".join(sorted(self.classes))}"'
+        s += f"></{self.tag}>"
+        return s
+
+    def finalize(self) -> SelfClosedTag:
+        if "class" in self.attrs:
+            self.classes = set(self.attrs["class"].split())
+            del self.attrs["class"]
+        return self
+
+    def clone(self, **kwargs: t.Any) -> SelfClosedTag:
+        return dataclasses.replace(self, **kwargs)
+
+    @classmethod
+    def fromStartTag(cls: t.Type[SelfClosedTag], tag: StartTag) -> SelfClosedTag:
+        return cls(
+            line=tag.line,
+            endLine=tag.endLine,
+            tag=tag.tag,
+            attrs=tag.attrs,
+            classes=tag.classes,
+        )
+
+
+@dataclass
+class EndTag(ParserNode):
+    tag: str
+
+    def __str__(self) -> str:
+        return f"</{self.tag}>"
+
+
+@dataclass
+class Comment(ParserNode):
+    data: str
+
+    def __str__(self) -> str:
+        return f"<!--{escapeHTML(self.data)}-->"
+
+
+# RawElement is for things like <script> or <xmp>
+# which have special parsing rules that just look
+# for the ending tag and treat the entire rest of
+# the contents as raw text, without escaping.
+@dataclass
+class RawElement(ParserNode):
+    tag: str
+    startTag: StartTag
+    data: str
+
+    def __str__(self) -> str:
+        return f"{self.startTag}{self.data}</{self.tag}>"
+
+
+def escapeHTML(text: str) -> str:
+    # Escape the HTML-significant characters `&` and `<` (`&` first, so the inserted entities aren't re-escaped).
+    return text.replace("&", "&amp;").replace("<", "&lt;")
+
+
+def escapeAttr(text: str) -> str:
+    return text.replace("&", "&amp;").replace('"', "&quot;")