-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
147 additions
and
106 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,5 @@ | |
/__pycache__ | ||
/data | ||
/dist | ||
/build | ||
/build | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .xuexi import Xuexi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
|
||
# Root URL of the xuexi.cn website (not referenced by the code visible here).
_host= r"https://www.xuexi.cn" |
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#!/usr/bin/env python | ||
# -*- encoding: utf-8 -*- | ||
''' | ||
@Contact : [email protected] | ||
@Time : 2019/4/8 | ||
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved. | ||
@Desc : main xuexi | ||
''' | ||
import re,json,requests,os | ||
from concurrent.futures import ThreadPoolExecutor | ||
from . import DownloadProgress | ||
from contextlib import closing | ||
|
||
class Xuexi(object):
    ''' Crawler that downloads lesson videos and audio tracks from xuexi.cn.

    Every HTTP request goes through one shared ``requests.Session``.
    '''

    # Directory that downloaded media files are written into.
    _OUTPUT_DIR = './Video/'

    def __init__(self):
        self.sess = requests.Session()

    def get_video_links(self, url: str) -> list[str]:
        ''' Return the video.xuexi.cn mp4 URLs found in the body of *url*.

        :param url: address of the resource to scan
        :return: matched URLs, in reverse order of appearance
        '''
        body = self.sess.get(url=url).content.decode("utf8")
        pattern = r'https://video.xuexi.cn/[^,"]*mp4'
        links = re.findall(pattern, body, re.I)
        links.reverse()
        return links

    def download(self, url: str, file_name: str, type: str = "mp4"):
        ''' Stream *url* into ``./Video/<file_name>.<ext>``.

        The download is skipped when the target file already exists with the
        size announced by the server.

        :param url: download url path
        :param file_name: target file name without extension
        :param type: "mp4" stores a ``.mp4`` file, any other value ``.mp3``
        :raises KeyError: if the response carries no ``content-length`` header
        '''
        headers = {
            "Sec-Fetch-Dest": "video",
            "Sec-Fetch-Mode": "no-cors",
            "Sec-Fetch-Site": "same-site",
            "Referer": "https://www.xuexi.cn/"
        }
        # The output directory is not guaranteed to exist; without this the
        # open() below fails on a fresh checkout.
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        with closing(self.sess.get(url=url, stream=True, headers=headers)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            extension = '.mp4' if type == "mp4" else '.mp3'
            target = self._OUTPUT_DIR + file_name + extension
            if os.path.exists(target) and os.path.getsize(target) == content_size:
                print('跳过' + file_name)
                return
            progress = DownloadProgress.DownloadProgress(file_name, total=content_size, unit="KB",
                                                         chunk_size=chunk_size,
                                                         run_status="正在下载", fin_status="下载完成")
            with open(target, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    progress.refresh(count=len(data))

    def crawl(self, url: str):
        ''' Download every lesson reachable from *url* on a 10-thread pool.

        URLs starting with ``https://www.xuexi.cn/lgpage/detail/index.html``
        are handled as the "lgpage" format, everything else as a list page.
        Returns once all submitted downloads have finished.

        :param url: lesson page address
        '''
        # The context manager waits for the queued downloads and releases the
        # worker threads, also when parsing fails half-way through.
        with ThreadPoolExecutor(max_workers=10) as pool:
            if url.startswith("https://www.xuexi.cn/lgpage/detail/index.html"):
                self._crawl_lg_page(url, pool)
            else:
                self._crawl_list_page(url, pool)

    def _crawl_lg_page(self, url, pool):
        ''' Queue the video and audio download of every "lgpage" sub item. '''
        lessons = json.loads(self.getLessonListByLgPage(url))
        for item in lessons["sub_items"]:
            name = item["title"].replace(" ", "")
            video_url = self._first_media_url(
                item, "videos", "video_storage_info", "normal", ".mp4")
            if video_url is not None:
                pool.submit(self.download, video_url, name)
            audio_url = self._first_media_url(
                item, "audios", "audio_storage_info", "url", ".mp3")
            if audio_url is not None:
                pool.submit(self.download, audio_url, name, "mp3")

    def _crawl_list_page(self, url, pool):
        ''' Resolve and queue the video of every entry of a list page. '''
        lessons = json.loads(self.getLessonList(url))
        print("已配置10个线程下载")
        for i, lesson in enumerate(lessons["fpe1ki18v228w00"]):
            name = lesson["frst_name"].replace('\t', ' ')
            page_url = lesson["static_page_url"]
            # Open the lesson page to locate its data script.
            page_html = self.sess.get(page_url).content.decode("utf8")
            video_url = self.get_video_links(self._data_url(page_url, page_html))[0]
            print("已解析第 %s 个视频的下载地址:%s" % (i, video_url))
            pool.submit(self.download, video_url, name)

    @staticmethod
    def _first_media_url(item, group_key, storage_key, url_key, marker):
        ''' Return the first storage URL of *item* containing *marker*, else None. '''
        try:
            candidates = item[group_key][0][storage_key]
            return next(
                (entry[url_key] for entry in candidates if marker in entry[url_key]),
                None)
        except (KeyError, IndexError, TypeError):
            # Best effort: an item without (well-formed) media is skipped.
            return None

    @staticmethod
    def _data_url(page_url, page_html):
        ''' Build the absolute URL of the ``./data...`` script referenced by a page. '''
        pattern = r'src="./data(.*?)"></script>'
        section = page_url.split("/")[3]  # first path segment of the page URL
        return "https://www.xuexi.cn/" + section + \
            "/data" + re.findall(pattern, page_html, re.I)[0]

    def getLessonListByLgPage(self, url):
        ''' Fetch the lesson list of a new-format ("lgpage") URL.

        :param url: page address containing ``index.html?id=<id>``
        :return: response text without its first 9 and last characters
        :raises IndexError: if *url* carries no ``id`` parameter
        '''
        # The id may be the last query parameter, so a trailing "&" must not
        # be required.
        pattern = r'index\.html\?id=([^&#]*)'
        lesson_id = re.findall(pattern, url, re.I)[0]
        newUrl = r"https://boot-source.xuexi.cn/data/app/" + lesson_id + ".js"
        resData = self.sess.get(url=newUrl).content.decode("utf8")
        print("已解析视频列表数据...")
        # NOTE(review): presumably strips a 9-character JSONP-style wrapper
        # prefix and the closing character - confirm against a live response.
        return resData[9:-1]

    def getLessonList(self, url):
        ''' Fetch the lesson list behind an old-format list page.

        :param url: list page address
        :return: text of the page's data script without its first 14 and
                 last characters
        :raises IndexError: if the page references no ``./data`` script
        '''
        resData = self.sess.get(url=url).content.decode("utf8")
        print("已解析视频列表数据...")
        jsonUrl = self._data_url(url, resData)
        resData2 = self.sess.get(url=jsonUrl).content.decode("utf8")
        print("已请求视频列表数据...")
        # NOTE(review): presumably strips a 14-character assignment prefix and
        # a trailing terminator around the JSON - confirm against a live response.
        return resData2[14:-1]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.