Skip to content

Commit

Permalink
1.4.0 支持音频下载
Browse files Browse the repository at this point in the history
  • Loading branch information
jianboy committed Oct 18, 2023
1 parent e82ed27 commit 02016ee
Show file tree
Hide file tree
Showing 10 changed files with 147 additions and 106 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
/__pycache__
/data
/dist
/build
/build
*.pyc
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@

本项目最初用于下载“学习强国”APP(www.xuexi.cn)上的机器学习课程。后来应用户需求,**支持下载“学习慕课”栏目下的所有课程**。

- [x] 课程批量下载
- [x] 电台音频下载
- [x] 多线程
- [x] 支持断点续传

《机器学习课程》
链接:https://pan.baidu.com/s/1U8Nu4ZStfpQfuCnaoXmWIg
提取码:3b3s

由于“学习强国”APP 上的视频采用 CDN 分发,而百度云下载限速,建议直接使用本项目程序下载视频,方便、迅速。

![](screenshot/BaiduHi_2019-4-8_16-26-42.png)

![](screenshot/BaiduHi_2023-10-19_3-41-33.jpg)

### 使用
1. 在 https://github.com/jianboy/crawl_xuexi/releases 下载 crawl_xuexi.zip 可执行文件.
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions crawl_xuexi/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .xuexi import Xuexi
2 changes: 2 additions & 0 deletions crawl_xuexi/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

# Base URL of the xuexi.cn website.
_host= r"https://www.xuexi.cn"
File renamed without changes.
File renamed without changes.
122 changes: 122 additions & 0 deletions crawl_xuexi/xuexi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : [email protected]
@Time : 2019/4/8
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : main xuexi
'''
import re,json,requests,os
from concurrent.futures import ThreadPoolExecutor
from . import DownloadProgress
from contextlib import closing

class Xuexi(object):
    ''' Crawler that downloads course videos and audio from www.xuexi.cn.

    Every HTTP request of an instance goes through one ``requests.Session``
    stored in ``self.sess``.
    '''

    def __init__(self):
        ''' Create the HTTP session used by all other methods. '''
        self.sess = requests.Session()

    @staticmethod
    def _extract_lg_page_id(url: str) -> str:
        ''' Return the value of the ``id`` query parameter of *url*.

        :param url: page url such as ``.../index.html?id=123&item_id=456``
        :return: the id value (``"123"`` in the example above)
        :raises ValueError: when *url* carries no ``id`` parameter
        '''
        # "[?&]" keeps "item_id=" from matching, and the value ends at the
        # next "&"/"#" or at the end of the url (no trailing "&" required).
        found = re.search(r'[?&]id=([^&#]+)', url, re.I)
        if found is None:
            raise ValueError("no id parameter in url: " + url)
        return found.group(1)

    @staticmethod
    def _data_script_url(page_url: str, html: str) -> str:
        ''' Build the url of the ``data*.js`` script referenced by *html*.

        :param page_url: url the html was fetched from; its first path
            segment becomes the directory of the data script
        :param html: page source containing ``src="./data..."></script>``
        :return: absolute url of the data script on www.xuexi.cn
        :raises ValueError: when the script tag or the path segment is missing
        '''
        suffixes = re.findall(r'src="./data(.*?)"></script>', html, re.I)
        parts = page_url.split("/")
        if not suffixes or len(parts) < 4:
            raise ValueError("no data script found for page: " + page_url)
        return "https://www.xuexi.cn/" + parts[3] + "/data" + suffixes[0]

    @staticmethod
    def _storage_entries(item, group_key: str, info_key: str) -> list:
        ''' Return ``item[group_key][0][info_key]``, or ``[]`` when absent.

        Items may carry only a video or only an audio track, so a missing
        branch is an expected situation rather than an error.
        '''
        try:
            return item[group_key][0][info_key] or []
        except (KeyError, IndexError, TypeError):
            return []

    @staticmethod
    def _pick_media_url(entries, url_key: str, marker: str):
        ''' Choose the download url among the storage *entries*.

        :param entries: list of dicts describing the stored variants
        :param url_key: dict key holding the url of a variant
        :param marker: substring identifying the wanted format, e.g. ".mp4"
        :return: the first url containing *marker*; otherwise the last url
            seen; ``None`` when *entries* holds no url at all
        '''
        chosen = None
        for entry in entries:
            candidate = entry.get(url_key) if isinstance(entry, dict) else None
            if not isinstance(candidate, str):
                continue
            chosen = candidate
            if marker in candidate:
                break
        return chosen

    def get_video_links(self, url: str) -> list[str]:
        ''' Fetch *url* and return the mp4 links found in the response.

        :param url: url of the resource to scan
        :return: all ``https://video.xuexi.cn/...mp4`` matches, in reverse
            order of appearance
        '''
        video = self.sess.get(url=url).content.decode("utf8")
        pattern = r'https://video.xuexi.cn/[^,"]*mp4'
        link = re.findall(pattern, video, re.I)
        link.reverse()
        return link

    def download(self, url: str, file_name: str, type: str = "mp4"):
        ''' Download one media file into ``./Video``.

        The file is skipped when a file of the same name already exists with
        exactly the size announced by the server.

        :param url: download url
        :param file_name: target file name without extension
        :param type: "mp4" stores a ``.mp4`` file, anything else a ``.mp3``
        :raises requests.HTTPError: when the server answers with an error status
        :raises KeyError: when the response has no content-length header
        '''
        headers = {
            "Sec-Fetch-Dest": "video",
            "Sec-Fetch-Mode": "no-cors",
            "Sec-Fetch-Site": "same-site",
            "Referer": "https://www.xuexi.cn/"
        }
        extension = '.mp4' if type == "mp4" else '.mp3'
        # Names come from remote titles: neutralise path separators so a title
        # can neither point outside ./Video nor into a missing sub-directory.
        safe_name = file_name
        for separator in (os.sep, os.altsep):
            if separator:
                safe_name = safe_name.replace(separator, "_")
        # Do not rely on the caller having created the target directory.
        os.makedirs('./Video', exist_ok=True)
        file_D = './Video/' + safe_name + extension
        with closing(self.sess.get(url=url, stream=True, headers=headers)) as response:
            # Without this check an error page would be stored as media and
            # later be mistaken for a finished download by the size test below.
            response.raise_for_status()
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
                print('跳过' + file_name)
                return
            progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
                                                         chunk_size=chunk_size,
                                                         run_status="正在下载", fin_status="下载完成")
            with open(file_D, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    progress.refresh(count=len(data))

    def crawl(self, url: str):
        ''' Download every lesson reachable from *url* with 10 worker threads.

        Urls starting with ``https://www.xuexi.cn/lgpage/detail/index.html``
        use the new page format (video and audio); every other url is treated
        as a legacy course page (video only). The method returns only after
        all submitted downloads have finished, and reports failed downloads.

        :param url: course page url
        '''
        jobs = []  # (lesson name, future) pairs, inspected once the pool is done
        # The context manager waits for all submitted downloads on exit.
        with ThreadPoolExecutor(max_workers=10) as pool:
            if url.startswith("https://www.xuexi.cn/lgpage/detail/index.html"):
                mlData = json.loads(self.getLessonListByLgPage(url))
                for item in mlData["sub_items"]:
                    frst_name = item["title"].replace(" ", "")
                    # The url is looked up per item, so an item without media
                    # can never reuse the url of the previous item.
                    video_url = self._pick_media_url(
                        self._storage_entries(item, "videos", "video_storage_info"),
                        "normal", ".mp4")
                    if video_url is not None:
                        jobs.append((frst_name,
                                     pool.submit(self.download, video_url, frst_name)))
                    audio_url = self._pick_media_url(
                        self._storage_entries(item, "audios", "audio_storage_info"),
                        "url", ".mp3")
                    if audio_url is not None:
                        jobs.append((frst_name,
                                     pool.submit(self.download, audio_url, frst_name, "mp3")))
            else:
                mlData = json.loads(self.getLessonList(url))
                print("已配置10个线程下载")
                for i, lesson in enumerate(mlData["fpe1ki18v228w00"]):
                    frst_name = lesson["frst_name"].replace('\t', ' ')
                    static_page_url = lesson["static_page_url"]
                    # Open the lesson page to locate its data script.
                    resData = self.sess.get(static_page_url).content.decode("utf8")
                    try:
                        data_url = self._data_script_url(static_page_url, resData)
                    except ValueError:
                        links = []
                    else:
                        links = self.get_video_links(data_url)
                    if not links:
                        # One broken lesson must not abort the remaining ones.
                        print("跳过第 %s 个视频:未找到下载地址" % i)
                        continue
                    res = links[0]
                    print("已解析第 %s 个视频的下载地址:%s" % (i, res))
                    jobs.append((frst_name, pool.submit(self.download, res, frst_name)))
        # All workers are finished here; surface errors instead of losing them.
        for name, job in jobs:
            error = job.exception()
            if error is not None:
                print("下载失败 %s: %s" % (name, error))

    def getLessonListByLgPage(self, url):
        ''' Fetch the lesson list of a new-format (lgpage) url.

        :param url: page url carrying an ``id`` query parameter
        :return: text of ``https://boot-source.xuexi.cn/data/app/<id>.js``
            without its first 9 characters and its last character
        :raises ValueError: when *url* carries no ``id`` parameter
        '''
        page_id = self._extract_lg_page_id(url)
        newUrl = r"https://boot-source.xuexi.cn/data/app/" + page_id + ".js"
        resData = self.sess.get(url=newUrl).content.decode("utf8")
        print("已解析视频列表数据...")
        # NOTE(review): the slice presumably strips a JS callback wrapper
        # around the JSON payload - confirm against a live response.
        return resData[9:-1]

    def getLessonList(self, url):
        ''' Fetch the lesson list of a legacy course page.

        :param url: course page url
        :return: text of the page's data script without its first 14
            characters and its last character
        :raises ValueError: when the page references no data script
        '''
        resData = self.sess.get(url=url).content.decode("utf8")
        print("已解析视频列表数据...")
        jsonUrl = self._data_script_url(url, resData)
        resData2 = self.sess.get(url=jsonUrl).content.decode("utf8")
        print("已请求视频列表数据...")
        # NOTE(review): the slice presumably strips a JS assignment wrapper
        # around the JSON payload - confirm against a live response.
        return resData2[14:-1]
118 changes: 14 additions & 104 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,111 +5,20 @@
'''
__author__ = "liuyuqi"

import json
import os
import sys
import re
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing

import requests

import DownloadProgress
import user_agent

# Module-wide HTTP session; the functions below send their requests through it.
s = requests.Session()


def get_video_links(url):
    '''
    Fetch *url* and collect the mp4 links contained in the response.
    :param url: url of the resource to scan
    :return: list of https://video.xuexi.cn/...mp4 links, in reverse order of appearance
    '''
    page_text = s.get(url=url).content.decode("utf8")
    found = re.findall(r'https://video.xuexi.cn/[^,"]*mp4', page_text, re.I)
    return found[::-1]


def downloadVideo(url, file_name):
    '''
    Download one video into ./Video/<file_name>.mp4.

    The download is skipped when the target file already exists with exactly
    the size announced by the server.
    :param url: download url
    :param file_name: target file name without extension
    :return: None
    :raises requests.HTTPError: when the server answers with an error status
    '''
    headers = {
        "Sec-Fetch-Dest": "video",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "same-site",
        "Referer": "https://www.xuexi.cn/"
    }
    file_D = './Video/' + file_name + '.mp4'
    with closing(s.get(url=url, stream=True, headers=headers)) as response:
        # Without this check an error page would be written as .mp4 and then
        # be mistaken for a finished download by the size comparison below.
        response.raise_for_status()
        chunk_size = 1024
        content_size = int(response.headers['content-length'])
        if os.path.exists(file_D) and os.path.getsize(file_D) == content_size:
            print('跳过' + file_name)
            return
        progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
                                                     chunk_size=chunk_size,
                                                     run_status="正在下载", fin_status="下载完成")
        with open(file_D, "wb") as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                progress.refresh(count=len(data))


def crawl(url):
    '''
    Download every lesson video reachable from *url* with 10 worker threads.

    Urls starting with https://www.xuexi.cn/lgpage/detail/index.html use the
    new page format; every other url is treated as a legacy course page.
    The function returns only after all submitted downloads have finished.
    :param url: course page url
    '''
    # The context manager waits for all submitted downloads on exit, so the
    # caller's elapsed-time report covers the downloads themselves.
    with ThreadPoolExecutor(max_workers=10) as pool:
        if url.startswith("https://www.xuexi.cn/lgpage/detail/index.html"):
            mlData = json.loads(getLessonListByLgPage(url))
            for item in mlData["sub_items"]:
                frst_name = item["title"].replace(" ", "")
                # Reset per item: an empty storage list must not reuse the
                # url picked for the previous item.
                res = None
                for info in item["videos"][0]["video_storage_info"]:
                    res = info["normal"]
                    if ".mp4" in res:
                        break
                if res is None:
                    continue
                pool.submit(downloadVideo, res, frst_name)
        else:
            mlData = json.loads(getLessonList(url))
            print("已配置10个线程下载")
            for i, lesson in enumerate(mlData["fpe1ki18v228w00"]):
                frst_name = lesson["frst_name"].replace('\t', ' ')
                static_page_url = lesson["static_page_url"]
                # Open the lesson page to locate its data script.
                resData = s.get(static_page_url).content.decode("utf8")
                preUrl = static_page_url.split("/")[3]
                pattern = r'src="./data(.*?)"></script>'
                # Separate name so the *url* argument is not overwritten.
                data_url = "https://www.xuexi.cn/" + preUrl + \
                    "/data" + re.findall(pattern, resData, re.I)[0]
                res = get_video_links(data_url)[0]
                print("已解析第 %s 个视频的下载地址:%s" % (i, res))
                pool.submit(downloadVideo, res, frst_name)


def getLessonListByLgPage(url):
    '''
    Fetch the lesson list of a new-format (lgpage) url.

    :param url: page url carrying an id query parameter
    :return: text of https://boot-source.xuexi.cn/data/app/<id>.js without
        its first 9 characters and its last character
    :raises ValueError: when *url* carries no id parameter
    '''
    # Parse the id instead of slicing at a fixed offset: a fixed slice also
    # swallows any further query parameters (e.g. "&item_id=...").
    found = re.search(r'[?&]id=([^&#]+)', url, re.I)
    if found is None:
        raise ValueError("no id parameter in url: " + url)
    newUrl = r"https://boot-source.xuexi.cn/data/app/" + found.group(1) + ".js"
    resData = s.get(url=newUrl).content.decode("utf8")
    print("已解析视频列表数据...")
    # NOTE(review): the slice presumably strips a JS callback wrapper around
    # the JSON payload - confirm against a live response.
    return resData[9:-1]

def getLessonList(url):
    '''
    Fetch the lesson list of a legacy course page.

    :param url: course page url
    :return: text of the page's data script without its first 14 characters
        and its last character
    '''
    page_html = s.get(url=url).content.decode("utf8")
    print("已解析视频列表数据...")
    # The first path segment of the page url is the directory of the script.
    section = url.split("/")[3]
    suffix = re.findall(r'src="./data(.*?)"></script>', page_html, re.I)[0]
    script_url = "https://www.xuexi.cn/{}/data{}".format(section, suffix)
    script_text = s.get(url=script_url).content.decode("utf8")
    print("已请求视频列表数据...")
    return script_text[14:-1]

import time,os ,sys
from crawl_xuexi import Xuexi

def banner():
    '''Print the ASCII-art program banner followed by the author line.'''
    # Raw string: the art is made of backslashes. In a normal string "\ " and
    # "\_" are invalid escape sequences, and a backslash at the end of a line
    # acts as a line continuation that would glue two rows of the art together.
    print(r"""
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/ \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \ / \
( X | u | e | x | i | . | c | n | | v | i | d | e | o | s )
\_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/
""")
    print("Author: liuyuqi")

if __name__ == '__main__':
banner()
start_time = time.time()
if not os.path.exists("Video"):
os.mkdir("Video")
Expand All @@ -118,5 +27,6 @@ def getLessonList(url):
else:
url = input(
"请输入“学习慕课”下面的免费课程链接:(eg:https://www.xuexi.cn/9f584b49d8a7386a4cf248ce16f5e667/9b0f04ec6509904be734f5f609a3604a.html)")
crawl(url)
xuexi=Xuexi()
xuexi.crawl(url)
print("last time: {} s".format(time.time() - start_time))
Binary file added screenshot/BaiduHi_2023-10-19_3-41-33.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 02016ee

Please sign in to comment.