From 1d0ab5f88d07049ca4c897a090d3159c278ca0f5 Mon Sep 17 00:00:00 2001 From: Schemen Date: Wed, 23 Jan 2019 09:58:18 +0100 Subject: [PATCH 1/2] Fix mangafox parsing by using requests_html This leads to heavy load times as a headless browser is used. This is just for testing of viability. --- bin/sourceparser/Mangafox.py | 29 +++++++++++++++++++---------- requirements.txt | 1 + 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/bin/sourceparser/Mangafox.py b/bin/sourceparser/Mangafox.py index 74bb704..f76b875 100644 --- a/bin/sourceparser/Mangafox.py +++ b/bin/sourceparser/Mangafox.py @@ -3,6 +3,7 @@ import logging import re from urllib.parse import urlparse +from requests_html import HTMLSession import requests from bs4 import BeautifulSoup @@ -19,14 +20,15 @@ Returns: title ''' def getTitle(page): + title = None soup = BeautifulSoup(page.content, 'html.parser') #Get Manga Titel - var = soup.findAll("h2") - step1 = ''.join(var[0].findAll(text=True)) - step2 = step1.split() - step3 = step2[:-3] - title = ' '.join(step3) + search = re.search('content="Read\s(.*?)\smanga online,', str(soup)) + try: + title = search.group(1) + except AttributeError: + logging.error("No Title Fount!") return title @@ -56,7 +58,7 @@ def getPages(page): soup = BeautifulSoup(page.content, 'html.parser') #Get Manga Titel - search =re.search('var total_pages=(.*?);', str(soup)) + search =re.search('var imagecount=(.*?);', str(soup)) pages = search.group(1) return pages @@ -108,11 +110,18 @@ def getPagesUrl(starturl,pages): ''' def getImageUrl(pageurl): # Download Page - page = requests.get(pageurl) + #page = requests.get(pageurl) + logging.getLogger("pyppeteer").setLevel(logging.WARNING) + + + session = HTMLSession() + + page = session.get(pageurl) + page.html.render() #Pass page to parser - soup = BeautifulSoup(page.content, 'html.parser') - var1 = soup.find(id='image') + var1 = page.html.find('img.reader-main-img') + var2 =re.search("style=\'cursor:pointer\' src=\'//(.*?)\'", 
str(var1)) - imageurl = var1['src'] + imageurl = "http://" + var2.group(1) return imageurl diff --git a/requirements.txt b/requirements.txt index 7c075ab..76e4965 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ feedparser==5.2.1 KindleComicConverter==5.4.3 peewee==3.7.0 python-dateutil==2.7.5 +requests-html==0.9.0 From 1c909e29fa6ba81e961f2cc43b6dcecf77804984 Mon Sep 17 00:00:00 2001 From: elia Date: Sun, 7 Apr 2019 11:41:01 +0200 Subject: [PATCH 2/2] Use splash container to render JS sites Relates: #18 --- .dockerignore | 2 ++ README.md | 34 ++++++++++++++------------------ bin/sourceparser/Mangafox.py | 38 ++++++++++++++++++++++++------------ config.ini | 1 + docker-compose.yml | 17 ++++++++++++++++ requirements.txt | 1 - 6 files changed, 61 insertions(+), 32 deletions(-) create mode 100644 docker-compose.yml diff --git a/.dockerignore b/.dockerignore index 27e9f85..11f1fae 100644 --- a/.dockerignore +++ b/.dockerignore @@ -11,3 +11,5 @@ kindlegen.exe venv main.db log/* +docker-compose.yml +Dockerfile diff --git a/README.md b/README.md index 24f4786..bd7c719 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,23 @@ M2em let's you automatically download Mangas via RSS Feed that updates at a conf ## Supported Websites * Mangastream -* MangaFox +* MangaFox (With Splash Rendering container) * Cdmnet # Setup -M2em requires Python3 and I highly recommend working in a virtualenv. Some OS require the python-dev package! +M2em requires Python3 and I highly recommend working in a virtualenv and if you want to use Mangasources which are JavaScript heavy, I actually recommend to use docker to deploy the m2em binary and the rendering service together. Some OS require the python-dev package! + +## Docker Setup +You can use the Dockerfile or the image schemen/m2em. All options in the config.ini are available as environment variable. Make sure you write the exactly the same! + +Have a look at the example Compose file in the repository. 
This will deploy two containers, m2em and splash. Splash is used to render websites which use JavaScript. The alias (which you can add to your bashrc if you want) allows you to directly call the containerized application. + +``` +docker-compose up -d +alias m2em='sudo docker exec -it m2em_m2em_1 ./m2em.py' +m2em -h +``` ## Create and install virtual environment ```x-sh @@ -45,23 +56,6 @@ deactivate ``` Get Kindlegen here: https://www.amazon.com/gp/feature.html?docId=1000765211 -## Docker Setup -You can use the Dockerfile or the image schemen/m2em. All options in the config.ini are available as environment variable. Make sure you write the exactly the same! - -Example Compose file: -``` -version: '2' -services: - m2em: - image: schemen/m2em:latest - environment: - - SMTPServer=mail.example.com - - EmailAddress=comic@example.com - - EmailAddressPw=verysecurepassword - volumes: - - :/usr/src/app/data - -``` ## Concept As a concept, M2em has different workers that run in a loop. All Chapter/user data is saved in a SQLite3 Database. 
@@ -185,6 +179,8 @@ EbookFormat = MOBI # Ebook Profile setting, check # https://github.com/ciromattia/kcc for more information EbookProfile = KV +# If you want to run splash independently change this setting +SplashServer = http://splash:8050 # Sender Email Server Settings SMTPServer = mail.example.com ServerPort = 587 diff --git a/bin/sourceparser/Mangafox.py b/bin/sourceparser/Mangafox.py index f76b875..9d170f3 100644 --- a/bin/sourceparser/Mangafox.py +++ b/bin/sourceparser/Mangafox.py @@ -3,9 +3,9 @@ import logging import re from urllib.parse import urlparse -from requests_html import HTMLSession import requests from bs4 import BeautifulSoup +import bin.Config as Config ''' @@ -13,7 +13,9 @@ ''' - +# Splash Rendering Service address +config = Config.load_config() +splash_server = config["SplashServer"] ''' get Manga Title @@ -110,18 +112,30 @@ def getPagesUrl(starturl,pages): ''' def getImageUrl(pageurl): # Download Page - #page = requests.get(pageurl) - logging.getLogger("pyppeteer").setLevel(logging.WARNING) - - - session = HTMLSession() - page = session.get(pageurl) - page.html.render() + # Splash LUA script + script = """ + splash.resource_timeout = 5 + splash:add_cookie{"IsAdult", "1", "/", domain="fanfox.net"} + splash:on_request(function(request) + if string.find(request.url, "tenmanga.com") ~= nil then + request.abort() + end + end) + splash:go(args.url) + return splash:html() + """ + + logging.debug("Sending rendering request to Splash") + resp = requests.post(str(splash_server + "/run"), json={ + 'lua_source': script, + 'url': pageurl + }) + page = resp.content #Pass page to parser - var1 = page.html.find('img.reader-main-img') - var2 =re.search("style=\'cursor:pointer\' src=\'//(.*?)\'", str(var1)) + var =re.search('style=\"cursor:pointer\" src=\"//(.*?)\"', str(page)) - imageurl = "http://" + var2.group(1) + logging.debug(var.group(1)) + imageurl = "http://" + var.group(1) return imageurl diff --git a/config.ini b/config.ini index 698a0ad..928d885 
100644 --- a/config.ini +++ b/config.ini @@ -4,6 +4,7 @@ Database = data/main.db Sleep = 900 EbookFormat = MOBI EbookProfile = KV +SplashServer = http://splash:8050 # Sender Email Server Settings SMTPServer = mail.example.com ServerPort = 587 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1e3f939 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +version: '2' +services: + m2em: + image: schemen/m2em:latest + environment: + - SMTPServer=mail.example.com + - EmailAddress=comic@example.com + - EmailAddressPw=verysecurepassword + volumes: + - m2em:/usr/src/app/data + + splash: + image: scrapinghub/splash + command: --max-timeout 3600 + +volumes: + m2em: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 76e4965..7c075ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,3 @@ feedparser==5.2.1 KindleComicConverter==5.4.3 peewee==3.7.0 python-dateutil==2.7.5 -requests-html==0.9.0