From d2286d24b3ad2971e954d782d0e63e417ddb0a2c Mon Sep 17 00:00:00 2001
From: Siyuan Ma
Date: Sun, 1 Sep 2024 19:49:59 -0700
Subject: [PATCH] Add parser

---
Reviewer note: the line structure of this patch was restored from a
whitespace-collapsed copy; indentation and blank lines are inferred from the
hunk line counts. llmake/cli/main.py, llmake/markdown.py and tests/test.md
differ from commit d2286d2, so the post-image hashes on their "index" lines
are stale.

 llmake/cli/main.py |  14 ++++-
 llmake/markdown.py | 176 +++++++++++++++++++++++++++++++++++++++++++++
 poetry.lock        |  13 ++++-
 pyproject.toml     |   1 +
 tests/test.md      |  17 ++++++
 5 files changed, 219 insertions(+), 2 deletions(-)
 create mode 100644 llmake/markdown.py
 create mode 100644 tests/test.md

diff --git a/llmake/cli/main.py b/llmake/cli/main.py
index a2f4c25..b5bf71e 100644
--- a/llmake/cli/main.py
+++ b/llmake/cli/main.py
@@ -1,9 +1,21 @@
+import json
+from dataclasses import asdict
+
 from cyclopts import App
+from mistletoe.ast_renderer import AstRenderer
+from mistletoe.html_renderer import HTMLRenderer
 
-import llmake
+from llmake.markdown import parse_markdown
 
 app = App()
 
 
+@app.default
+def default(file: str):
+    """Parse the Markdown file at ``file`` and print the project as JSON."""
+    with open(file, encoding="utf-8") as f:
+        print(json.dumps(asdict(parse_markdown(f.read())), indent=2))
+
+
 def run_app():
     app()
diff --git a/llmake/markdown.py b/llmake/markdown.py
new file mode 100644
index 0000000..fdef5c8
--- /dev/null
+++ b/llmake/markdown.py
@@ -0,0 +1,176 @@
+import re
+
+from dataclasses import dataclass
+from re import Pattern
+from mistletoe import Document
+from enum import StrEnum
+from itertools import takewhile
+
+from mistletoe.block_token import Heading, token
+from mistletoe.html_renderer import HtmlRenderer
+from mistletoe.span_token import Link, RawText, SpanToken, add_token, remove_token
+from mistletoe.token import Token
+from collections import deque
+
+
+class LinkType(StrEnum):
+    """Kinds of link recognised in a task body."""
+
+    WEB_LINK = "web_link"
+    WIKI_LINK = "wiki_link"
+    HEAD_LINK = "head_link"
+
+
+@dataclass
+class Context:
+    """A link found in a task body: its kind, display name and target."""
+
+    context_type: LinkType
+    name: str
+    target: str
+
+
+@dataclass
+class Task:
+    """One level-2 section of the tasks block.
+
+    The section text is ``Project.prompt[start:end]``.  ``dependency`` holds
+    the ``[[#heading]]`` targets, or all earlier task names if there are none.
+    """
+
+    name: str
+    start: int
+    end: int
+    context: list[Context]
+    dependency: list[str]
+
+
+@dataclass
+class Project:
+    """A parsed prompt: all of its lines plus the tasks found in them."""
+
+    prompt: list[str]
+    tasks: list[Task]
+
+
+def _match_header(level: int, matcher: Pattern):
+    """Build a predicate for level-``level`` headings whose title matches.
+
+    The title is the heading's first child, which must be plain text, and
+    ``matcher`` is applied with ``Pattern.match`` (anchored at the start).
+    """
+
+    def fn(token: Token) -> bool:
+        if not isinstance(token, Heading):
+            return False
+        if token.level != level:
+            return False
+        children = list(token.children or [])
+        if len(children) == 0:
+            return False
+        title = children[0]
+        if not isinstance(title, RawText):
+            return False
+        # Match objects are truthy, but the annotation promises a bool.
+        return matcher.match(title.content) is not None
+
+    return fn
+
+
+MATCH_TASK_HEADER = _match_header(1, re.compile("tasks", re.IGNORECASE))
+MATCH_LEVEL1_HEADER = _match_header(1, re.compile(".*"))
+MATCH_LEVEL2_HEADER = _match_header(2, re.compile(".*"))
+
+
+def parse_markdown(markdown: str):
+    """Split a Markdown prompt into a ``Project``.
+
+    Each level-2 heading between the level-1 "tasks" heading and the next
+    level-1 heading (or the end of the text) starts a task.  A task without
+    ``[[#heading]]`` links depends on every task that precedes it.
+    """
+    lines = markdown.splitlines()
+    doc = Document(lines)
+    children = list(doc.children or [])
+    # Task boundaries: task i spans lines [task_lines[i], task_lines[i + 1]).
+    task_lines = []
+    in_task = False
+    for child in children:
+        if not in_task and MATCH_TASK_HEADER(child):
+            in_task = True
+            continue
+        if in_task and MATCH_LEVEL2_HEADER(child):
+            task_lines.append(getattr(child, "line_number", -1) - 1)
+        if in_task and MATCH_LEVEL1_HEADER(child):
+            in_task = False
+            task_lines.append(getattr(child, "line_number", -1) - 1)
+    if in_task:
+        task_lines.append(len(lines))
+
+    tasks = []
+    for start, end in zip(task_lines, task_lines[1:]):
+        links = get_context_links(lines[start:end])
+        dependency = [c.target for c in links if c.context_type == LinkType.HEAD_LINK]
+        context = [c for c in links if c.context_type != LinkType.HEAD_LINK]
+        if not dependency:
+            dependency = [t.name for t in tasks]
+        # Strip the "#" markers and surrounding blanks so the name equals the
+        # target of a ``[[#name]]`` link even with stray whitespace.
+        name = lines[start].strip().lstrip("#").strip()
+        tasks.append(Task(name, start, end, context, dependency))
+
+    return Project(lines, tasks)
+
+
+class WikiLinkToken(SpanToken):
+    """Span token for ``[[target]]`` and ``[[target|name]]`` links.
+
+    A target that starts with ``#`` is stored without it and classified as
+    ``LinkType.HEAD_LINK``; anything else is ``LinkType.WIKI_LINK``.
+    """
+
+    pattern = re.compile(r"\[\[ *(.+?) *\]\]")
+
+    def __init__(self, match):
+        target = match.group(1)
+        if "|" in target:
+            # Split on the first "|" only: a bare split() cuts on whitespace
+            # and cannot be unpacked for input such as "page|shown text".
+            target, self.name = (part.strip() for part in target.split("|", 1))
+        else:
+            self.name = target
+        if target.startswith("#"):
+            self.target = target[1:]
+            self.link_type = LinkType.HEAD_LINK
+        else:
+            self.target = target
+            self.link_type = LinkType.WIKI_LINK
+
+
+def get_context_links(prompt: str | list[str]) -> list[Context]:
+    """Return every wiki, heading and web link found in ``prompt``.
+
+    ``WikiLinkToken`` is registered only while ``prompt`` is being parsed.
+    Links are collected depth-first, parents before children.
+    """
+    add_token(WikiLinkToken)
+    try:
+        doc = Document(prompt)
+    finally:
+        # Unregister even if parsing raises, so the custom token never
+        # leaks into later Document() calls.
+        remove_token(WikiLinkToken)
+
+    context = []
+
+    def dfs(token: Token):
+        if isinstance(token, WikiLinkToken):
+            context.append(Context(token.link_type, token.name, token.target))
+        if isinstance(token, Link):
+            context.append(Context(LinkType.WEB_LINK, token.label or "", token.target))
+        if token.children:
+            for child in token.children:
+                dfs(child)
+
+    dfs(doc)
+    return context
diff --git a/poetry.lock b/poetry.lock
index 14c3b83..906c440 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -835,6 +835,17 @@ files = [
     {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
 ]
 
+[[package]]
+name = "mistletoe"
+version = "1.4.0"
+description = "A fast, extensible Markdown parser in pure Python."
+optional = false
+python-versions = "~=3.5"
+files = [
+    {file = "mistletoe-1.4.0-py3-none-any.whl", hash = "sha256:44a477803861de1237ba22e375c6b617690a31d2902b47279d1f8f7ed498a794"},
+    {file = "mistletoe-1.4.0.tar.gz", hash = "sha256:1630f906e5e4bbe66fdeb4d29d277e2ea515d642bb18a9b49b136361a9818c9d"},
+]
+
 [[package]]
 name = "myst-parser"
 version = "3.0.1"
@@ -1559,4 +1570,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8"
-content-hash = "a2518c3f2175d475e8363f08f72ba31e007083f03d3c53b88c1a9a14b21648d5"
+content-hash = "b63786f162990aa0e0f8797c88e866a413e6ca62a7b3c0822e0f4964a1b4e3e6"
diff --git a/pyproject.toml b/pyproject.toml
index 22087c2..a388264 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ python = "^3.8"
 typing-extensions = "^4.12.2"
 rich = ">=10.11.0"
 cyclopts = "^2.9.7"
+mistletoe = "^1.4.0"
 
 [tool.poetry.group.docs.dependencies]
 myst-parser = {extras = ["linkify"], version = "^3.0.1"}
diff --git a/tests/test.md b/tests/test.md
new file mode 100644
index 0000000..f016856
--- /dev/null
+++ b/tests/test.md
@@ -0,0 +1,17 @@
+# Context
+
+We are creating a new tool called [[llmake]], we need help writing the readme and devlogs
+
+# Tasks
+
+## README
+
+Write a readme.md that describes the llmake cli
+
+## What's new
+
+From devlog sections of llmake, create a small paragraph that describes the recent development process
+
+## Final Output
+
+Combine results from [[#README]] and [[#What's new]], create a single markdown page.