Skip to content

Commit

Permalink
Add parser
Browse files Browse the repository at this point in the history
  • Loading branch information
smy20011 committed Sep 2, 2024
1 parent 0b4a108 commit d2286d2
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 2 deletions.
14 changes: 13 additions & 1 deletion llmake/cli/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
import json
from dataclasses import asdict

from cyclopts import App
from mistletoe.ast_renderer import AstRenderer
from mistletoe.html_renderer import HTMLRenderer

import llmake
from llmake.markdown import parse_markdown

# Module-level cyclopts application. The command decorated with @app.default in
# this file is registered on it, and run_app() invokes it.
app = App()


@app.default
def default(file):
    """Parse a markdown project file and print the result as indented JSON.

    Args:
        file: Path of the markdown file to read.
    """
    # Decode explicitly as UTF-8 so the output does not depend on the
    # platform's locale encoding.
    with open(file, encoding="utf-8") as f:
        project = parse_markdown(f.read())
    # asdict() recursively converts the Project dataclass (and its nested
    # dataclasses) into plain dicts/lists that json can serialise.
    print(json.dumps(asdict(project), indent=2))


def run_app() -> None:
    """Invoke the module-level ``app``; any value it returns is discarded."""
    app()
133 changes: 133 additions & 0 deletions llmake/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import re

from dataclasses import dataclass
from re import Pattern
from mistletoe import Document
from enum import StrEnum
from itertools import takewhile

from mistletoe.block_token import Heading, token
from mistletoe.html_renderer import HtmlRenderer
from mistletoe.span_token import Link, RawText, SpanToken, add_token, remove_token
from mistletoe.token import Token
from collections import deque


class LinkType(StrEnum):
WEB_LINK = "web_link"
WIKI_LINK = "wiki_link"
HEAD_LINK = "head_link"


@dataclass
class Context:
    """A single link found in a prompt.

    Attributes:
        context_type: Which kind of link this is.
        name: Display name of the link (may be an empty string).
        target: What the link points at.
    """

    context_type: LinkType
    name: str
    target: str


@dataclass
class Task:
    """One task section of a project document.

    Attributes:
        name: Task title taken from its heading line.
        start: 0-based index of the task's first line (the heading line).
        end: 0-based index one past the task's last line.
        context: Links found in the task's lines that are not heading links.
        dependency: Targets of the heading links found in the task's lines.
    """

    name: str
    start: int
    end: int
    context: list[Context]
    dependency: list[str]


@dataclass
class Project:
    """A parsed project document.

    Attributes:
        prompt: Every line of the source document, without line endings.
        tasks: The tasks found in the document, in document order.
    """

    prompt: list[str]
    tasks: list[Task]


def _match_header(level: int, matcher: Pattern):
    """Build a predicate that recognises headings of one level by their title.

    Args:
        level: Heading level the token must have.
        matcher: Compiled pattern applied with ``Pattern.match`` (anchored at
            the start only) to the heading's leading raw text.

    Returns:
        A function taking a token and returning True only when the token is a
        ``Heading`` of ``level`` whose first child is a ``RawText`` matched by
        ``matcher``.
    """

    def fn(token: Token) -> bool:
        if not isinstance(token, Heading) or token.level != level:
            return False
        children = list(token.children or [])
        if not children:
            return False
        # Only the first child is inspected; a heading that starts with
        # anything other than plain text never matches.
        title = children[0]
        if not isinstance(title, RawText):
            return False
        # Return a real bool, as annotated, rather than leaking the
        # re.Match / None result to callers.
        return matcher.match(title.content) is not None

    return fn


# Heading predicates built by _match_header. Patterns are applied with
# Pattern.match, so the task-section matcher accepts any level-1 title that
# begins with "tasks" in any letter case; the other two accept any title.
MATCH_TASK_HEADER = _match_header(
    level=1,
    matcher=re.compile("tasks", flags=re.IGNORECASE),
)
MATCH_LEVEL1_HEADER = _match_header(level=1, matcher=re.compile(".*"))
MATCH_LEVEL2_HEADER = _match_header(level=2, matcher=re.compile(".*"))


def parse_markdown(markdown: str):
    """Parse a markdown document into a ``Project``.

    Tasks are the level-2 headings that appear inside a level-1 section
    accepted by ``MATCH_TASK_HEADER``. Each task runs from its heading line up
    to the next level-2 heading, the level-1 heading that closes the section,
    or the end of the document.

    A task's dependencies are the targets of the heading links found in its
    lines; when it has none, it depends on every task parsed before it.

    Args:
        markdown: Full text of the markdown document.

    Returns:
        A ``Project`` holding all lines of the document and its tasks in
        document order.
    """
    lines = markdown.splitlines()
    doc = Document(lines)

    # Half-open (start, end) ranges of 0-based line indices, one per task.
    # Ranges are recorded explicitly (instead of pairing up a flat list of
    # boundaries) so that content between two separate task sections is never
    # mistaken for a task.
    spans = []
    current_start = None  # first line of the task being collected, if any
    in_task = False
    for child in doc.children or []:
        if not in_task:
            if MATCH_TASK_HEADER(child):
                in_task = True
            continue
        starts_task = MATCH_LEVEL2_HEADER(child)
        ends_section = MATCH_LEVEL1_HEADER(child)
        if not (starts_task or ends_section):
            continue
        # Convert the token's line number to an index into `lines`.
        boundary = getattr(child, "line_number", -1) - 1
        if current_start is not None:
            spans.append((current_start, boundary))
        current_start = boundary if starts_task else None
        if ends_section:
            in_task = False
    if current_start is not None:
        # The last task runs to the end of the document.
        spans.append((current_start, len(lines)))

    tasks = []
    for start, end in spans:
        links = get_context_links(lines[start:end])
        dependency = [
            link.target for link in links if link.context_type == LinkType.HEAD_LINK
        ]
        context = [link for link in links if link.context_type != LinkType.HEAD_LINK]
        if not dependency:
            dependency = [task.name for task in tasks]
        # Drop indentation, the leading "#" markers and surrounding whitespace
        # so the name matches what a [[#Name]] link would reference.
        name = lines[start].strip().lstrip("#").strip()
        tasks.append(Task(name, start, end, context, dependency))

    return Project(lines, tasks)


class WikiLinkToken(SpanToken):
    """Span token for wiki-style links: ``[[target]]`` or ``[[target|name]]``.

    Attributes:
        name: Text after the first "|" when present, otherwise the full text
            between the brackets (including any leading "#").
        target: Link target; for heading links the leading "#" is removed.
        link_type: ``LinkType.HEAD_LINK`` when the target starts with "#",
            otherwise ``LinkType.WIKI_LINK``.
    """

    pattern = re.compile(r"\[\[ *(.+?) *\]\]")

    def __init__(self, match):
        raw = match.group(1)
        if "|" in raw:
            # Split on the first "|" only. The previous whitespace split()
            # raised ValueError for any aliased link such as [[a|b]].
            target, _, name = raw.partition("|")
            target = target.strip()
            name = name.strip()
        else:
            target = name = raw
        self.name = name

        if target.startswith("#"):
            self.target = target[1:]
            self.link_type = LinkType.HEAD_LINK
        else:
            self.target = target
            self.link_type = LinkType.WIKI_LINK


def get_context_links(prompt: str | list[str]) -> list[Context]:
    """Collect every wiki link and markdown link found in ``prompt``.

    Args:
        prompt: Markdown text, either as one string or as a list of lines.

    Returns:
        One ``Context`` per link, in depth-first document order. Wiki links
        carry their own link type, name and target; markdown links are
        reported as ``LinkType.WEB_LINK`` with the link's label (or an empty
        string when it has none) as the name.
    """
    # WikiLinkToken is only registered while this document is parsed. The
    # finally clause guarantees it is unregistered even if parsing raises, so
    # a failure here cannot leave the token active for later parses.
    add_token(WikiLinkToken)
    try:
        doc = Document(prompt)
    finally:
        remove_token(WikiLinkToken)

    context: list[Context] = []

    def collect(token: Token) -> None:
        if isinstance(token, WikiLinkToken):
            context.append(Context(token.link_type, token.name, token.target))
        if isinstance(token, Link):
            context.append(Context(LinkType.WEB_LINK, token.label or "", token.target))
        # Links may be nested inside other tokens, so always descend.
        for child in token.children or ():
            collect(child)

    collect(doc)
    return context
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ python = "^3.8"
typing-extensions = "^4.12.2"
rich = ">=10.11.0"
cyclopts = "^2.9.7"
mistletoe = "^1.4.0"

[tool.poetry.group.docs.dependencies]
myst-parser = {extras = ["linkify"], version = "^3.0.1"}
Expand Down
17 changes: 17 additions & 0 deletions tests/test.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Context

We are creating a new tool called [[llmake]], we need help writing the readme and devlogs

# Tasks

## README

Write a readme.md that describes the llmake CLI.

## What's new

From the devlog sections of llmake, create a short paragraph that describes the recent development process.

## Final Output

Combine the results from [[#README]] and [[#What's new]] to create a single markdown page.

0 comments on commit d2286d2

Please sign in to comment.