Add search and download support for the 香书小说 (Xiangshu Novels) site
MidCheck committed Nov 8, 2023
1 parent 2a33289 commit a487b48
Showing 3 changed files with 154 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.vscode
venv/
30 changes: 30 additions & 0 deletions README.md
@@ -1,2 +1,32 @@
# novelsCraw
novelsCraw is a novel-scraping tool that downloads the novels it finds to local files for offline reading, useful when you want to avoid site ads or have no network connection for reading online.

## Usage
```
$ python3 novel_scraw.py -h
usage: novel_scraw.py [-h] [--href href_path] [-n craw_worker_nums] novel_name

novelScraw: a novel-scraping tool

positional arguments:
  novel_name            name of the novel

options:
  -h, --help            show this help message and exit
  --href href_path      path portion of the novel's index-page URL; when
                        omitted, the novel name is used to search
  -n craw_worker_nums, --nums craw_worker_nums
                        number of concurrent fetches, 20 coroutines by default
```
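
For example, to search for a novel by name and fetch it with 30 concurrent workers (the quoted title below is only a placeholder for the actual name you want to search):

```
# "novel title" is a placeholder, not a real book name
$ python3 novel_scraw.py -n 30 "novel title"
```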

## Supported novel sites
*Thanks to the following sites for providing free novels*

| Date added | Site name | Site URL |
|:----------:|:---------:|:---------|
| 2023-11-09 | 香书小说 | https://www.ibiquges.org/ |


## Features
- [x] Search for novels
- [x] Download a novel as TXT
- [ ] Download a novel as HTML that can be opened locally
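
## Programmatic use
The download pipeline can also be driven from other Python code instead of the CLI; a minimal sketch, assuming `novel_scraw.py` sits in the current directory so it can be imported as a module, and using a placeholder novel name:

```
import asyncio

import novel_scraw  # the script from this repo, importable from the current directory

# "some novel" is a placeholder search keyword; pass an href path as the second
# argument instead of None to skip the interactive search step.
asyncio.run(novel_scraw.main("some novel", None, 10))
```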
121 changes: 121 additions & 0 deletions novel_scraw.py
@@ -0,0 +1,121 @@
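"""Search for novels on www.ibiquges.org and download them as plain-text files.

Flow: resolve the novel's index page (via interactive search or the --href path),
collect all chapter links into a queue, fetch the chapters concurrently with a
pool of worker coroutines, then write them in order to "<novel name>.txt".
"""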
from requests_html import AsyncHTMLSession
from prettytable import PrettyTable
from pathlib import Path
import argparse
import asyncio
import re

root_url = "https://www.ibiquges.org"
rootdir = Path(__file__).parent

async def get_href_links(asession: AsyncHTMLSession, queue: asyncio.Queue, href: str):
    """Collect all chapter links from the novel's index page and queue them."""
    try:
        resp = await asession.get(root_url + href)
    except Exception as e:
        print("[-] Failed to fetch the chapter index:", str(e))
        return 0
    i = 0
    # Each <dd><a href='...'>title</a></dd> entry is one chapter link.
    if matched := re.findall(r"""<dd><a href='(.*?)' >(.*?)</a></dd>""", resp.html.html):
        for href, title in matched:
            queue.put_nowait((i, href, title))
            i += 1
    print(f"[+] {i} chapter links queued for download...")
    return i

async def get_content(asession: AsyncHTMLSession, queue: asyncio.Queue, txt: dict):
    """Worker coroutine: pull chapter links off the queue and store their text in txt."""
    while True:
        try:
            i, href, title = await queue.get()
            resp = await asession.get(root_url + href)
            if resp.status_code != 200:
                queue.task_done()
                raise Exception(f"http status code: {resp.status_code}")
            div = resp.html.find('div#content', first=True)
            if div is None:
                raise Exception("content div not found")
            # Strip the leading "最新网址" notice from the chapter body; fall back to
            # the raw text if the page layout changes and the pattern no longer matches.
            if matched := re.findall(r'最新网址.*?\s(.*?)\n\n\n', div.text, re.DOTALL):
                content = matched[0]
            else:
                content = div.text
            # # Indent the start of each paragraph with two spaces
            # content = re.sub(r"^(.*)$", r"  \1", content, flags=re.DOTALL)
            # txt[i] = f"  {title}\n{content}\n\n\n"
            txt[i] = f"----\n{title}\n----\n{content}\n\n\n"
            queue.task_done()
        except asyncio.CancelledError:
            break
        except Exception as e:
            print(f"[-] {title} '{href}': {str(e)}")
            await queue.put((i, href, title))


async def download_noval_txt(asession: AsyncHTMLSession, noval_name: str, noval_href: str, works: int=20):
    """Download every chapter of the novel and write them, in order, to <noval_name>.txt."""
    queue = asyncio.Queue()
    txt = {}
    # Start a pool of worker coroutines that consume chapter links from the queue.
    tasks = [asyncio.create_task(get_content(asession, queue, txt)) for _ in range(works)]
    try:
        total = await get_href_links(asession, queue, noval_href)
        while total:
            txt_size = len(txt)
            print(f'\b\r[.] Downloading "{noval_name}": {txt_size}/{total}', end='')
            if txt_size >= total:
                print()
                print(f"[+] Finished downloading {noval_name}!")
                break
            await asyncio.sleep(1)
    except Exception as e:
        print('[-] Error while waiting for the download tasks:', str(e))
    finally:
        # Stop the workers whether the download finished or failed.
        for task in tasks:
            task.cancel()

    if not txt:
        return
    with open(rootdir.joinpath(noval_name + ".txt"), 'w', encoding='utf-8') as f:
        for i in sorted(txt):
            f.write(txt[i])
    print(f"[+] Saved to: {noval_name}.txt")


async def search_noval_href(asession: AsyncHTMLSession, noval_name: str):
    """Search the site for noval_name; return (name, href path) of the chosen result, or None."""
    pattern = re.compile(r'''<tr>\s*<td class="even"><a href="(.*?)" target="_blank">(.*?)</a></td>\s*<td class="odd"><a href="(.*?)" target="_blank">(.*?)</a>\s*</td>\s*<td class="even">(.*?)</td>\s*<td class="odd" align="center">(\S*)\s*</td>\s*</tr>''', re.DOTALL)
    table = PrettyTable(['No.', 'Novel', 'Author', 'Novel URL', 'Updated'])  # 'Latest chapter', 'Latest chapter URL'
    table.align['Novel URL'] = 'l'
    # table.align['Latest chapter URL'] = 'l'
    try:
        resp = await asession.post(root_url + "/modules/article/waps.php", data={'searchkey': noval_name})
        if matched := pattern.findall(resp.html.html):
            for i in range(len(matched)):
                noval_url, noval, last_href, last_title, author, update_date = matched[i]
                table.add_row([i+1, noval, author, noval_url, update_date])  # , last_title, last_href
            print(table)
            while True:
                choice = input(f"Select the novel to download (1-{len(matched)}): ")
                if not choice:
                    print("[!] No novel selected, exiting")
                    return None
                if not choice.isdigit() or int(choice) < 1 or int(choice) > len(matched):
                    print(f"[-] There is no novel with index '{choice}', please choose again...")
                    continue
                idx = int(choice) - 1
                name, url = matched[idx][1], matched[idx][0]
                # Keep only the path portion so the caller can prepend root_url.
                if url.startswith(root_url):
                    return name, url.replace(root_url, '')
                return name, url
        print(f"[-] No search results for '{noval_name}'")
    except Exception as e:
        print("[-] Error while searching for the novel:", str(e))
    return None

async def main(noval_name: str, noval_href: str, works: int):
    asession = AsyncHTMLSession()
    if not noval_href:
        # No href given: search for the novel by name and let the user pick one.
        result = await search_noval_href(asession, noval_name)
        if result is None:
            return
        noval_name, noval_href = result

    await download_noval_txt(asession, noval_name, noval_href, works)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="novelScraw: a novel-scraping tool")
    parser.add_argument("name", metavar="novel_name", help="name of the novel")
    parser.add_argument("--href", metavar="href_path", help="path portion of the novel's index-page URL; when omitted, the novel name is used to search")
    parser.add_argument("-n", "--nums", default=20, metavar="craw_worker_nums", type=int, help="number of concurrent fetches, 20 coroutines by default")
    args = parser.parse_args()

    asyncio.run(main(args.name, args.href, args.nums))
