Support WeChat clipper

selfboot · Apr 28, 2023 · fb00490 · fb00490
1 parent dac2a73
commit fb00490
Show file tree

Hide file tree

Showing 5 changed files with 19 additions and 6 deletions.
diff --git a/demos/Test Case E.html b/demos/Test Case E.html
diff --git a/html2notion/translate/html2json.py b/html2notion/translate/html2json.py
@@ -32,6 +32,7 @@ def _is_yinxiang_export_html(html_soup):
 
 """
 <meta name="source-application" content="webclipper.evernote" />
+<meta name="source-application" content="微信" />
 """
 def _is_yinxiang_clipper_html(html_soup):
     exporter_version_meta = html_soup.select_one('html > head > meta[name="exporter-version"]')
@@ -45,6 +46,8 @@ def _is_yinxiang_clipper_html(html_soup):
     clipper_source_content = clipper_source_meta.get('content', "") if isinstance(clipper_source_meta, Tag) else ""
     if isinstance(clipper_source_content, str) and clipper_source_content.endswith("evernote"):
         return True
+    if isinstance(clipper_source_content, str) and clipper_source_content in ("微信"):
+        return True
     return False
 
 

diff --git a/html2notion/translate/html2json_base.py b/html2notion/translate/html2json_base.py
@@ -71,7 +71,8 @@ def get_notion_data(self):
     @staticmethod
     def extract_text_and_parents(tag: PageElement, parents=[]):
         results = []
-        if isinstance(tag, NavigableString):
+        # Filter empty content
+        if isinstance(tag, NavigableString) and tag.text:
             results.append((tag.text, parents))
             return results
         elif isinstance(tag, Tag):
@@ -328,7 +329,7 @@ def convert_divider(self, soup):
             "type": "divider",
             "divider": {}
         }
-    
+
     def convert_heading(self, soup):
         heading_map = {"h1": "heading_1", "h2": "heading_2", "h3": "heading_3",
                        "h4": "heading_3", "h5": "heading_3", "h6": "heading_3"}
@@ -345,7 +346,8 @@ def convert_heading(self, soup):
         text_obj = self.generate_inline_obj(soup)
         if text_obj:
             rich_text.extend(text_obj)
-        return json_obj
+            return json_obj
+        return None
 
     # <ol><li><div>first</div></li><li><div>second</div></li><li><div>third</div></li></ol>
     def convert_numbered_list_item(self, soup):
@@ -356,6 +358,10 @@ def convert_bulleted_list_item(self, soup):
         return self.convert_list_items(soup, 'bulleted_list_item')
 
     def convert_list_items(self, soup, list_type):
+        # Remove heading tags in li
+        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+            heading.unwrap()
+
         items = soup.find_all('li', recursive=True)
         if not items:
             logger.warning("No list items found in {soup}")

diff --git a/html2notion/translate/html2json_clipper.py b/html2notion/translate/html2json_clipper.py
@@ -109,5 +109,5 @@ def convert_code(self, soup):
         json_obj["code"]["rich_text"] = self.merge_rich_text(rich_text)
         return json_obj
 
-
+        
 Html2JsonBase.register(YinXiangClipper_Type, Html2JsonYinXiang)
diff --git a/html2notion/translate/notion_import.py b/html2notion/translate/notion_import.py
@@ -44,7 +44,8 @@ async def create_new_page(self, notion_data):
         blocks = notion_data.get("children", [])
         limit_size = 100
         chunks = [blocks[i: i + limit_size] for i in range(0, len(blocks), limit_size)]
-        notion_data.pop("children")
+        if blocks:
+            notion_data.pop("children")
         first_chunk = chunks[0] if chunks else []
         created_page = await self.notion_client.pages.create(**notion_data, children=first_chunk)
         page_id = created_page["id"]
@@ -63,7 +64,7 @@ async def main(file_path, notion_api_key):
 
 if __name__ == "__main__":
     test_prepare_conf()
-    file = Path("./demos/Test Case D.html")
+    file = Path("./demos/Test Case E.html")
     notion_api_key = ""
     if 'GITHUB_ACTIONS' in os.environ:
         notion_api_key = os.environ['notion_api_key']