Skip to content

Commit

Permalink
Split the parser module apart into multiple files.
Browse files Browse the repository at this point in the history
  • Loading branch information
tabatkins committed Dec 1, 2023
1 parent 0689f35 commit 4b055e7
Show file tree
Hide file tree
Showing 7 changed files with 3,137 additions and 3,241 deletions.
33 changes: 0 additions & 33 deletions bikeshed/h/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,44 +77,11 @@
wrapContents,
)
from .parser import (
Comment,
EndTag,
Failure,
ParseConfig,
ParseFailure,
Result,
StartTag,
Stream,
initialDocumentParse,
isASCII,
isASCIIAlpha,
isASCIIAlphanum,
isASCIILowerAlpha,
isASCIIUpperAlpha,
isAttrNameChar,
isControl,
isDigit,
isHexDigit,
isNoncharacter,
isTagnameChar,
isWhitespace,
parseAttribute,
parseCharRef,
parseComment,
parseDoctype,
parseEndTag,
parseLines,
parseQuotedAttrValue,
parseScriptToEnd,
parseStartTag,
parseStyleToEnd,
parseTagName,
parseText,
parseTitle,
parseUnquotedAttrValue,
parseWhitespace,
parseXmpToEnd,
strFromNodes,
test,
)
from .serializer import Serializer
32 changes: 18 additions & 14 deletions bikeshed/h/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,29 @@
from .parser import (
from .main import (
initialDocumentParse,
linesFromNodes,
nodesFromHtml,
nodesFromStream,
parseLines,
parseText,
parseTitle,
strFromNodes,
)
from .nodes import (
Comment,
Doctype,
EndTag,
Failure,
ParseConfig,
ParseFailure,
ParserNode,
RawElement,
RawText,
Result,
SafeText,
SelfClosedTag,
StartTag,
Stream,
Text,
initialDocumentParse,
linesFromNodes,
nodesFromHtml,
nodesFromStream,
parseLines,
parseText,
parseTitle,
strFromNodes,
)
from .stream import (
Failure,
ParseConfig,
ParseFailure,
Result,
Stream,
)
112 changes: 112 additions & 0 deletions bikeshed/h/parser/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# pylint: skip-file
from __future__ import annotations

import enum
import io
import os
import re

from ... import constants, t
from ... import messages as m
from .nodes import (
Comment,
Doctype,
EndTag,
ParserNode,
RawElement,
RawText,
SafeText,
SelfClosedTag,
StartTag,
Text,
)
from .parser import POSSIBLE_NODE_START_CHARS, nodesFromStream
from .stream import Failure, ParseConfig, ParseFailure, Result, Stream


def test() -> None:
    """Smoke-test the parser against the strings stored in ``test.txt``.

    ``test.txt`` (resolved against the current working directory) is loaded
    as a JSON object whose values are strings. The values are joined with
    newlines and the whole text is run through the parser. Nothing is
    returned or asserted; the run only surfaces exceptions raised while
    reading or parsing.
    """
    import json

    # JSON text is UTF-8 by specification, so decode it explicitly rather
    # than relying on the platform's locale encoding.
    with io.open(os.path.abspath("test.txt"), "r", encoding="utf-8") as fh:
        vals = "\n".join(json.load(fh).values())
    # nodesFromHtml is a generator; list() forces the parse to run to the end.
    list(nodesFromHtml(vals, ParseConfig()))


def nodesFromHtml(data: str, config: ParseConfig, startLine: int = 1) -> t.Generator[ParserNode, None, None]:
    """Lazily parse ``data`` and yield the resulting parser nodes.

    A ``Stream`` is built over ``data`` using ``config`` and ``startLine``,
    then handed to ``nodesFromStream`` with 0 as its second argument
    (presumably the starting offset into the stream -- confirm against
    ``nodesFromStream``).
    """
    yield from nodesFromStream(Stream(data, config=config, startLine=startLine), 0)


def initialDocumentParse(text: str, config: ParseConfig, startLine: int = 1) -> list[ParserNode]:
    """Parse ``text`` as a document and return all resulting nodes as a list."""
    # Just a document parse. Its effects:
    #   * `bs-line-number` attributes get added,
    #   * difficult shorthands (ones that look like tags,
    #     or that contain raw text) get normalized,
    #   * comments get blanked out.
    nodeStream = nodesFromHtml(text, config, startLine=startLine)
    return [*nodeStream]


def strFromNodes(nodes: t.Iterable[ParserNode], withIlcc: bool = False) -> str:
    """Serialize ``nodes`` into a single string.

    ``Comment`` nodes are emitted as ``constants.bsComment``; every other
    node is emitted as ``str(node)``.

    When ``withIlcc`` is true, line-count marker characters are appended so
    the output can be reconciled with the source line numbers: one increment
    marker for each source line a node spanned that its output does not
    reproduce as a newline, and one decrement marker for each surplus
    newline in the output.
    """
    incChar = constants.incrementLineCountChar
    decChar = constants.decrementLineCountChar
    pieces = []
    for node in nodes:
        if isinstance(node, Comment):
            # Comments become a standardized, recognizable sequence
            # so Markdown processing can ignore them better.
            pieces.append(constants.bsComment)
            if withIlcc:
                # One increment marker per newline in the comment's data.
                pieces.append(incChar * node.data.count("\n"))
            continue
        rendered = str(node)
        if withIlcc:
            # > 0: the source spanned more lines than the output contains;
            # < 0: the output contains more newlines than the source did.
            delta = (node.endLine - node.line) - rendered.count("\n")
            marker = incChar if delta > 0 else decChar
            # abs(delta) is 0 when the counts agree, so nothing is appended.
            rendered += marker * abs(delta)
        pieces.append(rendered)
    return "".join(pieces)


def linesFromNodes(nodes: t.Iterable[ParserNode]) -> list[str]:
    """Serialize ``nodes`` with ``strFromNodes`` and split the result on newlines."""
    serialized = strFromNodes(nodes)
    return serialized.split("\n")


def debugNodes(nodes: t.Iterable[ParserNode]) -> list[ParserNode]:
    """Print the ``repr`` of each node, one per line, and return the nodes as a list.

    The iterable is materialized first, so a generator passed in is fully
    consumed; use the returned list in its place.
    """
    materialized = list(nodes)
    dump = "\n".join(map(repr, materialized))
    print(dump)  # noqa: T201
    return materialized


def parseLines(textLines: list[str], config: ParseConfig, startLine: int = 1) -> list[str]:
    """Run a list of lines through the parser, returning another list of lines.

    The first line decides the convention for the whole list: if it ends
    with a newline, every line is taken to carry its own newline and the
    returned lines do too; otherwise the lines are joined with newlines for
    parsing and returned without them.

    An empty ``textLines`` is returned as-is (the same list object).
    """
    if not textLines:
        return textLines
    endingWithNewline = textLines[0].endswith("\n")
    separator = "" if endingWithNewline else "\n"
    text = separator.join(textLines)
    parsedLines = strFromNodes(nodesFromHtml(text, config, startLine=startLine)).split("\n")
    if endingWithNewline:
        # When the serialized text ends with a newline, split() leaves one
        # empty string after it. That is an artifact of splitting, not a
        # line; keeping it would append a spurious extra "\n" line to the
        # result.
        if parsedLines[-1] == "":
            parsedLines.pop()
        parsedLines = [line + "\n" for line in parsedLines]

    return parsedLines


def parseText(text: str, config: ParseConfig, startLine: int = 1) -> str:
    """Run ``text`` through the parser and return the re-serialized string."""
    parsedNodes = nodesFromHtml(text, config, startLine=startLine)
    return strFromNodes(parsedNodes)


def parseTitle(text: str, config: ParseConfig, startLine: int = 1) -> str:
    """Parse ``text`` and serialize only its ``Text`` nodes.

    Tags are dropped from the content, since they would just show up as
    literal text inside <title>.
    """
    parsedNodes = nodesFromHtml(text, config, startLine=startLine)
    textOnly = filter(lambda node: isinstance(node, Text), parsedNodes)
    return strFromNodes(textOnly)
165 changes: 165 additions & 0 deletions bikeshed/h/parser/nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
from __future__ import annotations

import dataclasses
import re
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass, field

from ... import t


@dataclass
class ParserNode(metaclass=ABCMeta):
    """Base class of the parser's node types; records the node's source line span."""

    # Source line on which the node starts.
    line: int
    # Source line on which the node ends; `endLine - line` is the number of
    # extra source lines the node spans.
    endLine: int


@dataclass
class Text(ParserNode, metaclass=ABCMeta):
    """Abstract base for nodes that carry text; subclasses define how it is stringified."""

    text: str

    @abstractmethod
    def __str__(self) -> str:
        """Return the serialized form of ``self.text``."""


@dataclass
class RawText(Text):
    """Raw source text, which might contain HTML characters/etc.

    Stringifies to ``self.text`` unchanged, with no escaping.
    """

    def __str__(self) -> str:
        return self.text

    def curlifyApostrophes(self, lastNode: ParserNode | None) -> RawText:
        """Replace straight apostrophes with curly ones (’), mutating ``self.text``.

        Two cases are handled:

        * a leading apostrophe immediately followed by a word character,
          but only when ``lastNode`` (the node preceding this text) is an
          ``EndTag``, ``RawElement`` or ``SelfClosedTag``;
        * every apostrophe that sits directly between two word characters.

        Returns ``self`` so calls can be chained.
        """
        if (
            # startswith() rather than text[0], so empty text is a no-op
            # instead of an IndexError.
            self.text.startswith("'")
            and isinstance(lastNode, (EndTag, RawElement, SelfClosedTag))
            and re.match(r"'\w", self.text)
        ):
            self.text = "’" + self.text[1:]
        if "'" in self.text:
            # Lookarounds leave the neighbouring word characters unconsumed,
            # so apostrophes separated by a single character (rock'n'roll)
            # are all converted, not just the first of each pair.
            self.text = re.sub(r"(?<=\w)'(?=\w)", "’", self.text)
        return self


@dataclass
class SafeText(Text):
    """ "Safe" text: special HTML characters are escaped automatically on stringification."""

    def __str__(self) -> str:
        escaped = escapeHTML(self.text)
        return escaped


@dataclass
class Doctype(ParserNode):
    """A doctype node; stringifies to its stored ``data`` unchanged."""

    data: str

    def __str__(self) -> str:
        return str(self.data)


@dataclass
class StartTag(ParserNode):
    """An opening tag: tag name, attributes, and a separately tracked set of classes."""

    tag: str
    attrs: dict[str, str] = field(default_factory=dict)
    # Filled from the `class` attribute by finalize().
    classes: set[str] = field(default_factory=set)

    def __str__(self) -> str:
        """Serialize the tag as ``<tag bs-line-number=N attr="value" ... class="...">``.

        ``bs-line-number`` is always emitted first, taken from ``self.line``;
        an attribute of that name in ``attrs`` is skipped. The remaining
        attributes follow in sorted order with double quotes in their values
        replaced by ``&#34;``, and the sorted classes come last.
        """
        pieces = [f"<{self.tag} bs-line-number={self.line}"]
        for name in sorted(self.attrs):
            if name == "bs-line-number":
                continue
            value = self.attrs[name].replace('"', "&#34;")
            pieces.append(f' {name}="{value}"')
        if self.classes:
            classList = " ".join(sorted(self.classes))
            pieces.append(f' class="{classList}"')
        pieces.append(">")
        return "".join(pieces)

    def printEndTag(self) -> str:
        """Return the closing tag text that matches this start tag."""
        return "</{}>".format(self.tag)

    def finalize(self) -> StartTag:
        """Move a ``class`` attribute, if present, out of ``attrs`` and into ``classes``.

        The attribute value is split on whitespace and replaces any existing
        ``classes``. Returns ``self``.
        """
        if "class" in self.attrs:
            self.classes = set(self.attrs.pop("class").split())
        return self

    def clone(self, **kwargs: t.Any) -> StartTag:
        """Return a new ``StartTag`` with the fields named in ``kwargs`` replaced.

        Fields that are not overridden are carried over by reference, so the
        clone shares its ``attrs`` dict and ``classes`` set with the original.
        """
        duplicate = dataclasses.replace(self, **kwargs)
        return duplicate


@dataclass
class SelfClosedTag(ParserNode):
    """A self-closed tag: tag name, attributes, and a separately tracked set of classes."""

    tag: str
    attrs: dict[str, str] = field(default_factory=dict)
    # Filled from the `class` attribute by finalize().
    classes: set[str] = field(default_factory=set)

    def __str__(self) -> str:
        """Serialize as an explicit start/end pair: ``<tag bs-line-number=N ...></tag>``.

        ``bs-line-number`` is always emitted first, taken from ``self.line``;
        an attribute of that name in ``attrs`` is skipped. The remaining
        attributes follow in sorted order with their values passed through
        ``escapeAttr``, and the sorted classes come last.
        """
        pieces = [f"<{self.tag} bs-line-number={self.line}"]
        for name in sorted(self.attrs):
            if name != "bs-line-number":
                pieces.append(f' {name}="{escapeAttr(self.attrs[name])}"')
        if self.classes:
            classList = " ".join(sorted(self.classes))
            pieces.append(f' class="{classList}"')
        pieces.append(f"></{self.tag}>")
        return "".join(pieces)

    def finalize(self) -> SelfClosedTag:
        """Move a ``class`` attribute, if present, out of ``attrs`` and into ``classes``.

        The attribute value is split on whitespace and replaces any existing
        ``classes``. Returns ``self``.
        """
        if "class" in self.attrs:
            self.classes = set(self.attrs.pop("class").split())
        return self

    def clone(self, **kwargs: t.Any) -> SelfClosedTag:
        """Return a new ``SelfClosedTag`` with the fields named in ``kwargs`` replaced.

        Fields that are not overridden are carried over by reference, so the
        clone shares its ``attrs`` dict and ``classes`` set with the original.
        """
        duplicate = dataclasses.replace(self, **kwargs)
        return duplicate

    @classmethod
    def fromStartTag(cls: t.Type[SelfClosedTag], tag: StartTag) -> SelfClosedTag:
        """Build a ``SelfClosedTag`` carrying the same fields as the ``StartTag`` ``tag``.

        ``attrs`` and ``classes`` are passed through without copying, so the
        new node shares those containers with ``tag``.
        """
        carriedOver = {
            "line": tag.line,
            "endLine": tag.endLine,
            "tag": tag.tag,
            "attrs": tag.attrs,
            "classes": tag.classes,
        }
        return cls(**carriedOver)


@dataclass
class EndTag(ParserNode):
    """A closing tag; stringifies to ``</tag>``."""

    tag: str

    def __str__(self) -> str:
        return "</{}>".format(self.tag)


@dataclass
class Comment(ParserNode):
    """An HTML comment; ``data`` holds the text between the comment delimiters."""

    data: str

    def __str__(self) -> str:
        # The comment body is run through escapeHTML before being re-wrapped.
        body = escapeHTML(self.data)
        return "<!--{}-->".format(body)


# RawElement is for things like <script> or <xmp>
# which have special parsing rules that just look
# for the ending tag and treat the entire rest of
# the contents as raw text, without escaping.
@dataclass
class RawElement(ParserNode):
    """A whole raw-text element: its start tag, unescaped contents, and tag name."""

    tag: str
    startTag: StartTag
    data: str

    def __str__(self) -> str:
        # Start tag, then the contents verbatim (no escaping), then the end tag.
        return "{}{}</{}>".format(self.startTag, self.data, self.tag)


def escapeHTML(text: str) -> str:
    """Escape ``&`` and ``<`` in ``text`` as ``&amp;`` and ``&lt;``."""
    # Ampersands go first, so the ones introduced by `&lt;` aren't re-escaped.
    ampersandSafe = text.replace("&", "&amp;")
    return ampersandSafe.replace("<", "&lt;")


def escapeAttr(text: str) -> str:
    """Escape ``&`` and ``"`` in ``text`` as ``&amp;`` and ``&quot;``."""
    # Ampersands go first, so the ones introduced by `&quot;` aren't re-escaped.
    ampersandSafe = text.replace("&", "&amp;")
    return ampersandSafe.replace('"', "&quot;")
Loading

0 comments on commit 4b055e7

Please sign in to comment.