1.4.0 支持音频下载

jianboy · Oct 18, 2023 · 02016ee · 02016ee
1 parent e82ed27
commit 02016ee
Show file tree

Hide file tree

Showing 10 changed files with 147 additions and 106 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,5 @@
 /__pycache__
 /data
 /dist
-/build
+/build
+*.pyc
diff --git a/README.md b/README.md
@@ -4,14 +4,19 @@
 
 本项目初衷是下载学习强国APP(www.xuexi.cn)上面的机器学习课程。后续应用户需求，现已**支持所有“学习慕课”栏目下的课程下载**。
 
+- [x] 课程批量下载
+- [x] 电台音频下载
+- [x] 多线程
+- [x] 支持断点续传
+
 《机器学习课程》
 链接：https://pan.baidu.com/s/1U8Nu4ZStfpQfuCnaoXmWIg
 提取码：3b3s
 
 由于"学习强国APP"上视频采用cdn分发,而百度云下载限速,所以建议直接用本项目程序下载视频,方便,迅速.
 
 ![](screenshot/BaiduHi_2019-4-8_16-26-42.png)
-
+![](screenshot/BaiduHi_2023-10-19_3-41-33.jpg)
 
 ### 使用
 1. 到 https://github.com/jianboy/crawl_xuexi/releases 下载 crwal_xuexi.zip 可执行文件.

diff --git a/DownloadProgress.py → crawl_xuexi/DownloadProgress.py b/DownloadProgress.py → crawl_xuexi/DownloadProgress.py
diff --git a/crawl_xuexi/__init__.py b/crawl_xuexi/__init__.py
@@ -0,0 +1 @@
+from .xuexi import Xuexi
diff --git a/crawl_xuexi/api.py b/crawl_xuexi/api.py
@@ -0,0 +1,2 @@
+
+_host= r"https://www.xuexi.cn"
diff --git a/threads.py → crawl_xuexi/threads.py b/threads.py → crawl_xuexi/threads.py
diff --git a/user_agent.py → crawl_xuexi/user_agent.py b/user_agent.py → crawl_xuexi/user_agent.py
diff --git a/crawl_xuexi/xuexi.py b/crawl_xuexi/xuexi.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   [email protected]
+@Time    :   2019/4/8
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   main xuexi
+'''
+import re,json,requests,os
+from concurrent.futures import ThreadPoolExecutor
+from . import DownloadProgress
+from contextlib import closing
+
+class Xuexi(object):
+    '''
+    Downloader for course videos and audio published on www.xuexi.cn.
+
+    Every HTTP request issued by an instance goes through the single
+    ``requests.Session`` stored in ``self.sess``.
+    '''
+
+    def __init__(self):
+        # One session shared by all requests of this instance.
+        self.sess= requests.Session()
+
+    def get_video_links(self,url:str) -> list[str]:
+        '''
+        Fetch ``url`` and return every mp4 link found in the response text.
+
+        :param url: address of the data file that embeds the video links
+        :return: matched links, in reverse order of appearance
+        '''
+        video = self.sess.get(url=url).content.decode("utf8")
+        pattern = r'https://video.xuexi.cn/[^,"]*mp4'
+        link = re.findall(pattern, video, re.I)
+        link.reverse()
+        return link
+
+    def download(self,url:str, file_name:str, type:str = "mp4"):
+        '''
+        Stream ``url`` into ``./Video/<file_name>.mp4`` (or ``.mp3``).
+
+        The download is skipped when the target file already exists with the size
+        announced by the ``content-length`` response header; otherwise the file is
+        (re)written from the beginning.
+
+        :param url: download url
+        :param file_name: target file name, without extension
+        :param type: "mp4" selects the ``.mp4`` extension, any other value ``.mp3``
+        :raises requests.HTTPError: when the server answers with an error status
+        :raises KeyError: when the response carries no ``content-length`` header
+        '''
+        headers = {
+            "Sec-Fetch-Dest": "video",
+            "Sec-Fetch-Mode": "no-cors",
+            "Sec-Fetch-Site": "same-site",
+            "Referer": "https://www.xuexi.cn/"
+        }
+        with closing(self.sess.get(url=url, stream=True, headers=headers)) as response:
+            # Fail loudly on 4xx/5xx instead of saving the error body as a media file.
+            response.raise_for_status()
+            chunk_size = 1024
+            content_size = int(response.headers['content-length'])
+            extension = '.mp4' if type == "mp4" else '.mp3'
+            file_D = './Video/' + file_name + extension
+            if (os.path.exists(file_D) and os.path.getsize(file_D) == content_size):
+                print('跳过' + file_name)
+                return
+            # main.py creates this folder, but other callers of the class may not.
+            os.makedirs('./Video', exist_ok=True)
+            progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
+                                                         chunk_size=chunk_size,
+                                                         run_status="正在下载", fin_status="下载完成")
+            with open(file_D, "wb") as file:
+                for data in response.iter_content(chunk_size=chunk_size):
+                    file.write(data)
+                    progress.refresh(count=len(data))
+
+    @staticmethod
+    def _pick_media_url(item, media_key, storage_key, url_key, extension):
+        '''
+        Choose one media url from ``item[media_key][0][storage_key]``.
+
+        Returns the first url containing ``extension``; when none matches, the last
+        url of the list; ``None`` when the lesson has no such media at all.
+        '''
+        chosen = None
+        try:
+            for info in item[media_key][0][storage_key]:
+                chosen = info[url_key]
+                if extension in chosen:
+                    break
+        except (KeyError, IndexError, TypeError):
+            # Missing section, empty list or malformed entry: nothing to download.
+            return None
+        return chosen
+
+    def crawl(self, url:str):
+        '''
+        Resolve every lesson reachable from ``url`` and download it with 10 threads.
+
+        Urls starting with ``https://www.xuexi.cn/lgpage/detail/index.html`` are
+        handled through ``getLessonListByLgPage`` (video and audio of each item);
+        any other url goes through ``getLessonList`` (video only).
+        The call returns once all submitted downloads have finished.
+
+        :param url: course page url
+        '''
+        # The with-block waits for the queued downloads and releases the threads.
+        with ThreadPoolExecutor(max_workers=10) as pool:
+            if (url.startswith("https://www.xuexi.cn/lgpage/detail/index.html")):
+                mlData = json.loads(self.getLessonListByLgPage(url))
+                for item in mlData["sub_items"]:
+                    frst_name = item["title"].replace(" ", "")
+                    # The url is resolved per lesson, so a lesson without media can
+                    # never reuse the url picked for a previous lesson.
+                    video_url = self._pick_media_url(
+                        item, "videos", "video_storage_info", "normal", ".mp4")
+                    if video_url is not None:
+                        pool.submit(self.download, video_url, frst_name)
+
+                    audio_url = self._pick_media_url(
+                        item, "audios", "audio_storage_info", "url", ".mp3")
+                    if audio_url is not None:
+                        pool.submit(self.download, audio_url, frst_name, "mp3")
+            else:
+                mlData = json.loads(self.getLessonList(url))
+                print("已配置10个线程下载")
+                # NOTE(review): "fpe1ki18v228w00" is presumably a fixed key of the
+                # site's data file - confirm it is the same for every course.
+                for i, lesson in enumerate(mlData["fpe1ki18v228w00"]):
+                    frst_name = lesson["frst_name"].replace('\t', ' ')
+                    static_page_url = lesson["static_page_url"]
+                    # Open the lesson page that references the video data file.
+                    resData = self.sess.get(static_page_url).content.decode("utf8")
+                    preUrl = static_page_url.split("/")[3]
+                    pattern = r'src="./data(.*?)"></script>'
+                    data_url = "https://www.xuexi.cn/" + preUrl + \
+                        "/data" + re.findall(pattern, resData, re.I)[0]
+                    res = self.get_video_links(data_url)[0]
+                    print("已解析第 %s 个视频的下载地址：%s" % (i, res))
+                    pool.submit(self.download, res, frst_name)  # queue one download task
+
+    def getLessonListByLgPage(self, url):
+        '''
+        Fetch the lesson-list data for a new-style (lgpage) url.
+
+        :param url: page url carrying the course id as ``index.html?id=<id>``
+        :return: body of ``https://boot-source.xuexi.cn/data/app/<id>.js`` without its
+                 first 9 and last 1 characters (presumably a JS wrapper around the
+                 JSON - confirm against a live response)
+        :raises ValueError: when ``url`` carries no ``id`` value
+        '''
+        # Stop at "&" or "#" instead of requiring a trailing "&", so urls whose only
+        # query parameter is "id" are accepted as well.
+        match = re.search(r'index\.html\?id=([^&#]+)', url, re.I)
+        if match is None:
+            raise ValueError("cannot find the 'id' query parameter in url: %s" % url)
+        newUrl = r"https://boot-source.xuexi.cn/data/app/" + match.group(1) + ".js"
+        resData = self.sess.get(url=newUrl).content.decode("utf8")
+        print("已解析视频列表数据...")
+        return resData[9:-1]
+
+    def getLessonList(self, url):
+        '''
+        Fetch the lesson-list data for an old-style course page.
+
+        The page is downloaded, the ``./data...`` script it references is located,
+        and that script is requested from the same site section.
+
+        :param url: course page url
+        :return: body of the data script without its first 14 and last 1 characters
+        :raises IndexError: when the page references no ``./data`` script
+        '''
+        resData = self.sess.get(url=url).content.decode("utf8")
+        print("已解析视频列表数据...")
+        pattern = r'src="./data(.*?)"></script>'
+        preUrl = url.split("/")[3]
+        jsonUrl = "https://www.xuexi.cn/" + preUrl + \
+            "/data" + re.findall(pattern, resData, re.I)[0]
+        resData2 = self.sess.get(url=jsonUrl).content.decode("utf8")
+        print("已请求视频列表数据...")
+        return resData2[14:-1]
diff --git a/main.py b/main.py
@@ -5,111 +5,20 @@
 '''
 __author__ = "liuyuqi"
 
-import json
-import os
-import sys
-import re
-import time
-from concurrent.futures import ThreadPoolExecutor
-from contextlib import closing
-
-import requests
-
-import DownloadProgress
-import user_agent
-
-s = requests.Session()
-
-
-def get_video_links(url):
-    video = s.get(url=url).content.decode("utf8")
-    pattern = r'https://video.xuexi.cn/[^,"]*mp4'
-    link = re.findall(pattern, video, re.I)
-    link.reverse()
-    return link
-
-
-def downloadVideo(url, file_name):
-    '''
-    下载视频
-    :param url: 下载url路径
-    :return: 文件
-     '''
-    headers = {
-        "Sec-Fetch-Dest": "video",
-        "Sec-Fetch-Mode": "no-cors",
-        "Sec-Fetch-Site": "same-site",
-        "Referer": "https://www.xuexi.cn/"
-    }
-    with closing(s.get(url=url, stream=True, headers=headers)) as response:
-        chunk_size = 1024
-        content_size = int(response.headers['content-length'])
-        file_D = './Video/' + file_name + '.mp4'
-        if (os.path.exists(file_D) and os.path.getsize(file_D) == content_size):
-            print('跳过' + file_name)
-        else:
-            progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
-                                                         chunk_size=chunk_size,
-                                                         run_status="正在下载", fin_status="下载完成")
-            with open(file_D, "wb") as file:
-                for data in response.iter_content(chunk_size=chunk_size):
-                    file.write(data)
-                    progress.refresh(count=len(data))
-
-
-def crawl(url):
-    pool = ThreadPoolExecutor(max_workers=10)  # 创建一个最大可容纳10个task的线程池
-    if (url.startswith("https://www.xuexi.cn/lgpage/detail/index.html")):
-        lessonList = getLessonListByLgPage(url)
-        mlData = json.loads(lessonList)
-        for i in range(len(mlData["sub_items"])):
-            frst_name = mlData["sub_items"][i]["title"].replace(" ", "")
-            for j in range(len(mlData["sub_items"][i]["videos"][0]["video_storage_info"])):
-                res = mlData["sub_items"][i]["videos"][0]["video_storage_info"][j]["normal"]
-                if ".mp4" in res:
-                    break
-            pool.submit(downloadVideo, res, frst_name)
-    else:
-        lessonList = getLessonList(url)
-        mlData = json.loads(lessonList)
-        print("已配置10个线程下载")
-        for i in range((len(mlData["fpe1ki18v228w00"]))):
-            frst_name = mlData["fpe1ki18v228w00"][i]["frst_name"].replace(
-                '\t', ' ')
-            static_page_url = mlData["fpe1ki18v228w00"][i]["static_page_url"]
-            # 打开 mp4 视频网页链接
-            resData = s.get(static_page_url).content.decode("utf8")
-            preUrl = static_page_url.split("/")[3]
-            pattern = r'src="./data(.*?)"></script>'
-            url = "https://www.xuexi.cn/" + preUrl + \
-                "/data" + re.findall(pattern, resData, re.I)[0]
-            res = get_video_links(url)[0]
-            print("已解析第 %s 个视频的下载地址：%s" % (i, res))
-            pool.submit(downloadVideo, res, frst_name)  # 往线程池里面加入一个task
-
-
-def getLessonListByLgPage(url):
-    '''
-    针对新格式 url 解析视频
-    '''
-    newUrl = r"https://boot-source.xuexi.cn/data/app/" + url[49:] + ".js"
-    resData = s.get(url=newUrl).content.decode("utf8")
-    print("已解析视频列表数据...")
-    return resData[9:-1]
-
-def getLessonList(url):
-    resData = s.get(url=url).content.decode("utf8")
-    print("已解析视频列表数据...")
-    pattern = r'src="./data(.*?)"></script>'
-    preUrl = url.split("/")[3]
-    jsonUrl = "https://www.xuexi.cn/" + preUrl + \
-        "/data" + re.findall(pattern, resData, re.I)[0]
-    resData2 = s.get(url=jsonUrl).content.decode("utf8")
-    print("已请求视频列表数据...")
-    return resData2[14:-1]
-
+import time,os ,sys 
+from crawl_xuexi import Xuexi
+
+def banner():
+    ''' Print the ASCII-art program banner followed by the author line. '''
+    # Raw string: the art contains "\ " and "\_", which are invalid escape
+    # sequences in a normal literal and make recent Python versions emit a warning.
+    print(r"""
+    _   _   _   _   _   _   _   _   _   _   _   _   _   _   _  
+   / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ 
+  ( X | u | e | x | i | . | c | n |   | v | i | d | e | o | s )
+   \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ 
+    """)
+    print("Author: liuyuqi")
 
 if __name__ == '__main__':
+    banner()
     start_time = time.time()
     if not os.path.exists("Video"):
         os.mkdir("Video")
@@ -118,5 +27,6 @@ def getLessonList(url):
     else:
         url = input(
             "请输入“学习慕课”下面的免费课程链接：（eg：https://www.xuexi.cn/9f584b49d8a7386a4cf248ce16f5e667/9b0f04ec6509904be734f5f609a3604a.html）")
-    crawl(url)
+    xuexi=Xuexi()
+    xuexi.crawl(url)
     print("last time: {} s".format(time.time() - start_time))
diff --git a/screenshot/BaiduHi_2023-10-19_3-41-33.jpg b/screenshot/BaiduHi_2023-10-19_3-41-33.jpg
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,4 +2,5 @@ @@
     /__pycache__
     /data
     /dist
-    /build
+    /build
+    *.pyc