-
Notifications
You must be signed in to change notification settings - Fork 201
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split the parser module apart into multiple files.
- Loading branch information
Showing
7 changed files
with
3,137 additions
and
3,241 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,29 @@ | ||
from .parser import ( | ||
from .main import ( | ||
initialDocumentParse, | ||
linesFromNodes, | ||
nodesFromHtml, | ||
nodesFromStream, | ||
parseLines, | ||
parseText, | ||
parseTitle, | ||
strFromNodes, | ||
) | ||
from .nodes import ( | ||
Comment, | ||
Doctype, | ||
EndTag, | ||
Failure, | ||
ParseConfig, | ||
ParseFailure, | ||
ParserNode, | ||
RawElement, | ||
RawText, | ||
Result, | ||
SafeText, | ||
SelfClosedTag, | ||
StartTag, | ||
Stream, | ||
Text, | ||
initialDocumentParse, | ||
linesFromNodes, | ||
nodesFromHtml, | ||
nodesFromStream, | ||
parseLines, | ||
parseText, | ||
parseTitle, | ||
strFromNodes, | ||
) | ||
from .stream import ( | ||
Failure, | ||
ParseConfig, | ||
ParseFailure, | ||
Result, | ||
Stream, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# pylint: skip-file | ||
from __future__ import annotations | ||
|
||
import enum | ||
import io | ||
import os | ||
import re | ||
|
||
from ... import constants, t | ||
from ... import messages as m | ||
from .nodes import ( | ||
Comment, | ||
Doctype, | ||
EndTag, | ||
ParserNode, | ||
RawElement, | ||
RawText, | ||
SafeText, | ||
SelfClosedTag, | ||
StartTag, | ||
Text, | ||
) | ||
from .parser import POSSIBLE_NODE_START_CHARS, nodesFromStream | ||
from .stream import Failure, ParseConfig, ParseFailure, Result, Stream | ||
|
||
|
||
def test() -> None:
    """Ad-hoc smoke test: parse every value of a local test.txt JSON file."""
    import json

    # Use the builtin open() (io.open is a legacy alias) and pin the
    # encoding so the test behaves identically on every platform.
    with open(os.path.abspath("test.txt"), "r", encoding="utf-8") as fh:
        vals = "\n".join(x for x in json.load(fh).values())
    list(nodesFromHtml(vals, ParseConfig()))
|
||
|
||
def nodesFromHtml(data: str, config: ParseConfig, startLine: int = 1) -> t.Generator[ParserNode, None, None]:
    """Tokenize an HTML string, yielding ParserNodes as they're parsed."""
    stream = Stream(data, startLine=startLine, config=config)
    yield from nodesFromStream(stream, 0)
|
||
|
||
def initialDocumentParse(text: str, config: ParseConfig, startLine: int = 1) -> list[ParserNode]:
    """Run a plain document parse over `text`.

    This adds `bs-line-number` attributes, normalizes any difficult
    shorthands (ones that look like tags, or that contain raw text),
    and blanks out comments.
    """
    nodes = nodesFromHtml(text, config, startLine=startLine)
    return list(nodes)
|
||
|
||
def strFromNodes(nodes: t.Iterable[ParserNode], withIlcc: bool = False) -> str:
    """Serialize a sequence of nodes back into a single string.

    When `withIlcc` is True, line-count-adjustment characters are appended
    after each node so the output accounts for the same number of source
    lines that the node originally spanned.
    """
    ilcc = constants.incrementLineCountChar
    dlcc = constants.decrementLineCountChar
    pieces: list[str] = []
    for node in nodes:
        if isinstance(node, Comment):
            # Comments serialize to a standardized, recognizable sequence
            # so Markdown processing can ignore them better.
            pieces.append(constants.bsComment)
            if withIlcc:
                pieces.append(ilcc * node.data.count("\n"))
            continue
        text = str(node)
        if withIlcc:
            # Reconcile the newlines actually emitted with the number of
            # source lines the node covered.
            diff = (node.endLine - node.line) - text.count("\n")
            if diff > 0:
                text += ilcc * diff
            elif diff < 0:
                text += dlcc * -diff
        pieces.append(text)
    return "".join(pieces)
|
||
|
||
def linesFromNodes(nodes: t.Iterable[ParserNode]) -> list[str]:
    """Serialize the nodes and split the result into individual lines."""
    serialized = strFromNodes(nodes)
    return serialized.split("\n")
|
||
|
||
def debugNodes(nodes: t.Iterable[ParserNode]) -> list[ParserNode]: | ||
nodes = list(nodes) | ||
print("\n".join(repr(x) for x in nodes)) # noqa: T201 | ||
return nodes | ||
|
||
|
||
def parseLines(textLines: list[str], config: ParseConfig, startLine: int = 1) -> list[str]:
    """Run a list of lines through the parser, returning another list of lines.

    Whether the input lines carry their own trailing newlines is sniffed
    from the first line; the output lines match that convention.
    """
    if not textLines:
        return textLines

    keepsNewlines = textLines[0].endswith("\n")
    # Lines that already end in "\n" can be concatenated directly;
    # otherwise the newlines have to be reintroduced between them.
    joiner = "" if keepsNewlines else "\n"
    parsed = strFromNodes(nodesFromHtml(joiner.join(textLines), config, startLine=startLine))
    parsedLines = parsed.split("\n")
    if keepsNewlines:
        parsedLines = [line + "\n" for line in parsedLines]
    return parsedLines
|
||
|
||
def parseText(text: str, config: ParseConfig, startLine: int = 1) -> str:
    """Run `text` through the parser and serialize it straight back to a string."""
    nodes = nodesFromHtml(text, config, startLine=startLine)
    return strFromNodes(nodes)
|
||
|
||
def parseTitle(text: str, config: ParseConfig, startLine: int = 1) -> str:
    """Parse `text`, keeping only the textual content.

    Tags are dropped, as they'd just show up as literal text in <title>.
    """
    textNodes = (node for node in nodesFromHtml(text, config, startLine=startLine) if isinstance(node, Text))
    return strFromNodes(textNodes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
from __future__ import annotations | ||
|
||
import dataclasses | ||
import re | ||
from abc import ABCMeta, abstractmethod | ||
from dataclasses import dataclass, field | ||
|
||
from ... import t | ||
|
||
|
||
@dataclass
class ParserNode(metaclass=ABCMeta):
    """Abstract base for every node the parser produces.

    Every node records the source-line span it was parsed from.
    """

    # First source line of the node (used e.g. for bs-line-number attrs).
    line: int
    # Last source line the node spans.
    endLine: int
|
||
|
||
@dataclass
class Text(ParserNode, metaclass=ABCMeta):
    """Abstract base for textual nodes; subclasses decide how `text` is escaped."""

    text: str

    @abstractmethod
    def __str__(self) -> str:
        pass
|
||
|
||
@dataclass
class RawText(Text):
    """Raw source text; might contain HTML characters/etc, emitted as-is."""

    def __str__(self) -> str:
        return self.text

    def curlifyApostrophes(self, lastNode: ParserNode | None) -> RawText:
        """Replace straight apostrophes between word characters with "’" (U+2019).

        A leading apostrophe is also curled when the previous node was a
        tag-like node, since the apostrophe then follows rendered content
        rather than opening a quotation. Mutates and returns self.
        """
        # re.match() is anchored at the start, so it fully covers the old
        # `self.text[0] == "'"` check and also guards against empty text
        # (indexing [0] raised IndexError on an empty node).
        if isinstance(lastNode, (EndTag, RawElement, SelfClosedTag)) and re.match(r"'\w", self.text):
            self.text = "’" + self.text[1:]
        if "'" in self.text:
            # NOTE: restored the literal U+2019 here; the replacement had
            # been mangled into an HTML entity.
            self.text = re.sub(r"(\w)'(\w)", r"\1’\2", self.text)
        return self
|
||
|
||
@dataclass
class SafeText(Text):
    """Safe text: special HTML characters are escaped when stringified."""

    def __str__(self) -> str:
        # Delegates escaping to the module-level escapeHTML() helper.
        return escapeHTML(self.text)
|
||
|
||
@dataclass
class Doctype(ParserNode):
    """A doctype declaration, re-serialized verbatim."""

    # Raw doctype text, emitted unchanged.
    data: str

    def __str__(self) -> str:
        return self.data
|
||
|
||
@dataclass
class StartTag(ParserNode):
    """An open tag, e.g. <div foo="bar">.

    The `class` attribute is held separately from `attrs` (see finalize())
    so classes can be manipulated as a set.
    """

    tag: str
    attrs: dict[str, str] = field(default_factory=dict)
    classes: set[str] = field(default_factory=set)

    def __str__(self) -> str:
        s = f"<{self.tag} bs-line-number={self.line}"
        for k, v in sorted(self.attrs.items()):
            # bs-line-number is emitted explicitly above; skip any copy
            # that made it into attrs.
            if k == "bs-line-number":
                continue
            # Use escapeAttr() to match SelfClosedTag.__str__; previously
            # only double quotes were escaped here, so a raw "&" could
            # leak into serialized attribute values.
            s += f' {k}="{escapeAttr(v)}"'
        if self.classes:
            s += f' class="{" ".join(sorted(self.classes))}"'
        s += ">"
        return s

    def printEndTag(self) -> str:
        """Return the matching close tag (str(self) prints only the open tag)."""
        return f"</{self.tag}>"

    def finalize(self) -> StartTag:
        """Move any `class` attribute out of attrs and into the classes set."""
        if "class" in self.attrs:
            self.classes = set(self.attrs["class"].split())
            del self.attrs["class"]
        return self

    def clone(self, **kwargs: t.Any) -> StartTag:
        """Return a copy of this tag with the given fields replaced."""
        return dataclasses.replace(self, **kwargs)
|
||
|
||
@dataclass
class SelfClosedTag(ParserNode):
    """A self-closed element, serialized as an immediately-closed tag pair."""

    tag: str
    attrs: dict[str, str] = field(default_factory=dict)
    classes: set[str] = field(default_factory=set)

    def __str__(self) -> str:
        pieces = [f"<{self.tag} bs-line-number={self.line}"]
        for name, value in sorted(self.attrs.items()):
            # bs-line-number is already emitted first; skip duplicates.
            if name != "bs-line-number":
                pieces.append(f' {name}="{escapeAttr(value)}"')
        if self.classes:
            pieces.append(f' class="{" ".join(sorted(self.classes))}"')
        pieces.append(f"></{self.tag}>")
        return "".join(pieces)

    def finalize(self) -> SelfClosedTag:
        """Move any `class` attribute out of attrs and into the classes set."""
        if "class" in self.attrs:
            self.classes = set(self.attrs.pop("class").split())
        return self

    def clone(self, **kwargs: t.Any) -> SelfClosedTag:
        """Return a copy of this tag with the given fields replaced."""
        return dataclasses.replace(self, **kwargs)

    @classmethod
    def fromStartTag(cls: t.Type[SelfClosedTag], tag: StartTag) -> SelfClosedTag:
        """Build a SelfClosedTag carrying over a StartTag's span and attributes."""
        return cls(
            line=tag.line,
            endLine=tag.endLine,
            tag=tag.tag,
            attrs=tag.attrs,
            classes=tag.classes,
        )
|
||
|
||
@dataclass
class EndTag(ParserNode):
    """A close tag, e.g. </div>."""

    tag: str

    def __str__(self) -> str:
        return f"</{self.tag}>"
|
||
|
||
@dataclass
class Comment(ParserNode):
    """An HTML comment; its contents are HTML-escaped on serialization."""

    # The comment's inner text (without the <!-- --> delimiters).
    data: str

    def __str__(self) -> str:
        return f"<!--{escapeHTML(self.data)}-->"
|
||
|
||
# RawElement is for things like <script> or <xmp> | ||
# which have special parsing rules that just look | ||
# for the ending tag and treat the entire rest of | ||
# the contents as raw text, without escaping. | ||
@dataclass
class RawElement(ParserNode):
    """An element like <script> or <xmp> with raw-text contents.

    These have special parsing rules that just look for the ending tag and
    treat the entire rest of the contents as raw text, without escaping.
    """

    tag: str
    # The original start tag, re-serialized as-is.
    startTag: StartTag
    # Raw contents, emitted without escaping.
    data: str

    def __str__(self) -> str:
        return f"{self.startTag}{self.data}</{self.tag}>"
|
||
|
||
def escapeHTML(text: str) -> str:
    """Escape text for use as HTML content: "&" and "<" become entities.

    Order matters: "&" is escaped first so the ampersand introduced by
    "&lt;" isn't itself double-escaped. (The replacement strings had been
    mangled into self-replacements by HTML-entity stripping; restored.)
    """
    return text.replace("&", "&amp;").replace("<", "&lt;")
|
||
|
||
def escapeAttr(text: str) -> str:
    """Escape text for a double-quoted HTML attribute value.

    "&" must be escaped before '"' so the ampersand from "&quot;" isn't
    double-escaped. (Replacement strings restored after entity mangling.)
    """
    return text.replace("&", "&amp;").replace('"', "&quot;")
Oops, something went wrong.