Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Version 0.6.0 #27

Merged
merged 14 commits into from
Mar 12, 2020
Next commit
Fix mangafox parsing by using requests_html
This leads to heavy load times as a headless browser is used.
This is just for testing of viability.
schemen committed Jan 23, 2019
commit 1d0ab5f88d07049ca4c897a090d3159c278ca0f5
29 changes: 19 additions & 10 deletions bin/sourceparser/Mangafox.py
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@
import logging
import re
from urllib.parse import urlparse
from requests_html import HTMLSession
import requests
from bs4 import BeautifulSoup

@@ -19,14 +20,15 @@
Returns: title
'''
def getTitle(page):
title = None
soup = BeautifulSoup(page.content, 'html.parser')

#Get Manga Titel
var = soup.findAll("h2")
step1 = ''.join(var[0].findAll(text=True))
step2 = step1.split()
step3 = step2[:-3]
title = ' '.join(step3)
search = re.search('content="Read\s(.*?)\smanga online,', str(soup))
try:
title = search.group(1)
except AttributeError:
logging.error("No Title Fount!")

return title

@@ -56,7 +58,7 @@ def getPages(page):
soup = BeautifulSoup(page.content, 'html.parser')

#Get Manga Titel
search =re.search('var total_pages=(.*?);', str(soup))
search =re.search('var imagecount=(.*?);', str(soup))
pages = search.group(1)
return pages

@@ -108,11 +110,18 @@ def getPagesUrl(starturl,pages):
'''
def getImageUrl(pageurl):
# Download Page
page = requests.get(pageurl)
#page = requests.get(pageurl)
logging.getLogger("pyppeteer").setLevel(logging.WARNING)


session = HTMLSession()

page = session.get(pageurl)
page.html.render()

#Pass page to parser
soup = BeautifulSoup(page.content, 'html.parser')
var1 = soup.find(id='image')
var1 = page.html.find('img.reader-main-img')
var2 =re.search("style=\'cursor:pointer\' src=\'//(.*?)\'", str(var1))

imageurl = var1['src']
imageurl = "http://" + var2.group(1)
return imageurl
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -7,3 +7,4 @@ feedparser==5.2.1
KindleComicConverter==5.4.3
peewee==3.7.0
python-dateutil==2.7.5
requests-html==0.9.0