Skip to content

Commit

Permalink
Support code block in clipper
Browse files Browse the repository at this point in the history
  • Loading branch information
selfboot committed Apr 28, 2023
1 parent be1f35f commit dac2a73
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 4 deletions.
104 changes: 104 additions & 0 deletions examples/parse_code.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup, NavigableString\n",
"\n",
"html_doc = \"\"\"\n",
" <pre\n",
" style=\"\n",
" box-sizing: border-box;\n",
" font-family: 'Roboto Mono', sfmono-regular,\n",
" consolas, 'liberation mono', menlo, courier,\n",
" monospace;\n",
" background: rgb(32, 33, 35) none repeat scroll\n",
" 0% 0% / auto padding-box border-box;\n",
" color: rgb(255, 255, 255);\n",
" overflow-x: auto;\n",
" border-bottom-left-radius: 4px;\n",
" border-bottom-right-radius: 4px;\n",
" margin: 0px;\n",
" min-height: 44px;\n",
" padding: 12px 16px;\n",
" font-size: 15px;\n",
" line-height: 24px;\n",
" border-top-left-radius: 4px;\n",
" border-top-right-radius: 4px;\n",
" \"\n",
" ><code style=\"box-sizing:border-box;font-family:&quot;Roboto Mono&quot;, sfmono-regular, consolas, &quot;liberation mono&quot;, menlo, courier, monospace;white-space:pre;\">\n",
" <code style=\"box-sizing:border-box;float:left;font-family:&quot;Roboto Mono&quot;, sfmono-regular, consolas, &quot;liberation mono&quot;, menlo, courier, monospace;padding-right:16px;\"><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">1\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">2\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">3\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">4\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">5\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">6\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">7\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">8\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">9\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">10\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">11\n",
"</span><span style=\"box-sizing:border-box;color:rgb(110, 110, 128);\">12\n",
"</span></code>\n",
"<span style=\"box-sizing:border-box;color:rgba(255, 255, 255, 0.5);\"># Note: you need to be using OpenAI Python v0.27.0 for the code below to work</span><span style=\"box-sizing:border-box;\"\n",
"/><span style=\"box-sizing:border-box;\"/><span style=\"box-sizing:border-box;color:rgb(46, 149, 211);\">import</span><span style=\"box-sizing:border-box;\"> openai\n",
"</span>\n",
"openai.ChatCompletion.create(\n",
"<span style=\"box-sizing:border-box;\"> model=</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"gpt-3.5-turbo\"</span><span style=\"box-sizing:border-box;\">,\n",
"</span> messages=[\n",
"<span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"system\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"You are a helpful assistant.\"</span><span style=\"box-sizing:border-box;\">},\n",
"</span><span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"user\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"Who won the world series in 2020?\"</span><span style=\"box-sizing:border-box;\">},\n",
"</span><span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"assistant\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"The Los Angeles Dodgers won the World Series in 2020.\"</span><span style=\"box-sizing:border-box;\">},\n",
"</span><span style=\"box-sizing:border-box;\"> {</span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"role\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"user\"</span><span style=\"box-sizing:border-box;\">, </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"content\"</span><span style=\"box-sizing:border-box;\">: </span><span style=\"box-sizing:border-box;color:rgb(0, 166, 125);\">\"Where was it played?\"</span><span style=\"box-sizing:border-box;\">}\n",
"</span> ]\n",
")</code></pre>\n",
"\"\"\"\n",
"soup = BeautifulSoup(html_doc, 'html.parser')\n",
"\n",
"# 找到所有的<pre>标签\n",
"pre_tags = soup.find_all('pre')\n",
"\n",
"for pre in pre_tags:\n",
" # 在每个<pre>标签中找到<code>标签\n",
" code_tags = pre.find_all('code')\n",
" \n",
" for code in code_tags:\n",
" # 检查<code>标签是否包含行号,这里假设行号是在<span>标签中的数字\n",
" span_tags = code.find_all('span')\n",
" \n",
" for span in span_tags:\n",
" if span.string and span.string.strip().isdigit():\n",
" # 如果是行号,则删除这个<span>标签\n",
" span.decompose()\n",
"\n",
"# 这时,soup中的HTML已经没有行号了\n",
"print(soup.prettify())\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "notion",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
12 changes: 8 additions & 4 deletions html2notion/translate/html2json_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,15 +294,19 @@ def get_color(styles: dict, attrs):
color = attrs['color']
if not color:
return "default"
# The color may be RGB or RGBA, e.g. rgba(174, 174, 188, 0.2).
# Only the first three values (R, G, B) are used; any alpha component is ignored.
if color.startswith("rgb"):
r, g, b = [int(x.strip()) for x in color[4:-1].split(",")]
color_values = [int(x.strip()) for x in re.findall(r'\d+', color)]
if len(color_values) >= 3:
r, g, b = color_values[:3]
return Html2JsonBase._closest_color(r, g, b)
# Check if color is in hexadecimal format
elif re.match(r'^#(?:[0-9a-fA-F]{3}){1,2}$', color):
r, g, b = Html2JsonBase._hex_to_rgb(color)
else:
return "default"
return Html2JsonBase._closest_color(r, g, b)

return Html2JsonBase._closest_color(r, g, b)
return "default"

def convert_paragraph(self, soup):
json_obj = {
Expand Down
30 changes: 30 additions & 0 deletions html2notion/translate/html2json_clipper.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def get_block_type(self, element):
return Block.BULLETED_LIST.value
elif tag_name == 'p':
return Block.PARAGRAPH.value
elif element.name == 'pre' and element.code:
return Block.CODE.value

return Block.FAIL.value

def convert_children(self, soup):
Expand All @@ -80,4 +83,31 @@ def convert_children(self, soup):
logger.warning(f"Unknown cnvert {element}, {block_type}")
return

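# Expected markup: <pre><code><code>line numbers</code>... code content ...</code></pre>
# The nested <code> holds the line-number column and is skipped during conversion.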
def convert_code(self, soup):
    """Build a Notion ``code`` block dict from an element containing ``<code>``.

    Reads ``soup.code`` and walks its direct children. A child that is itself
    a ``<code>`` tag is skipped (logged as "Skip line number"); every other
    child is passed to ``self.generate_inline_obj`` and any truthy result is
    accumulated. The accumulated list goes through ``self.merge_rich_text``
    before being stored under ``code.rich_text``. The language is always
    ``"plain text"``.

    Returns the block dict, or ``None`` (after logging an error) when
    ``soup.code`` is falsy.
    """
    outer_code = soup.code
    if not outer_code:
        logger.error(f'No code tag found in {soup}')
        return

    # Only a Tag exposes ``.children``; anything else is treated as a single node.
    if isinstance(outer_code, Tag):
        nodes = list(outer_code.children)
    else:
        nodes = [outer_code]

    fragments = []
    for node in nodes:
        is_nested_code = isinstance(node, Tag) and node.name == "code"
        if is_nested_code:
            logger.debug(f'Skip line number')
            continue
        inline_objs = self.generate_inline_obj(node)
        if inline_objs:
            fragments.extend(inline_objs)

    return {
        "object": "block",
        "type": "code",
        "code": {
            "rich_text": self.merge_rich_text(fragments),
            "language": "plain text",
        },
    }


Html2JsonBase.register(YinXiangClipper_Type, Html2JsonYinXiang)

0 comments on commit dac2a73

Please sign in to comment.