From 1d0ab5f88d07049ca4c897a090d3159c278ca0f5 Mon Sep 17 00:00:00 2001 From: Schemen Date: Wed, 23 Jan 2019 09:58:18 +0100 Subject: [PATCH 1/2] Fix mangafox parsing by using requests_html This leads to heavy load times as a headless browser is used. This is just for testing of viability. --- bin/sourceparser/Mangafox.py | 29 +++++++++++++++++++---------- requirements.txt | 1 + 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/bin/sourceparser/Mangafox.py b/bin/sourceparser/Mangafox.py index 74bb704..f76b875 100644 --- a/bin/sourceparser/Mangafox.py +++ b/bin/sourceparser/Mangafox.py @@ -3,6 +3,7 @@ import logging import re from urllib.parse import urlparse +from requests_html import HTMLSession import requests from bs4 import BeautifulSoup @@ -19,14 +20,15 @@ Returns: title ''' def getTitle(page): + title = None soup = BeautifulSoup(page.content, 'html.parser') #Get Manga Titel - var = soup.findAll("h2") - step1 = ''.join(var[0].findAll(text=True)) - step2 = step1.split() - step3 = step2[:-3] - title = ' '.join(step3) + search = re.search('content="Read\s(.*?)\smanga online,', str(soup)) + try: + title = search.group(1) + except AttributeError: + logging.error("No Title Fount!") return title @@ -56,7 +58,7 @@ def getPages(page): soup = BeautifulSoup(page.content, 'html.parser') #Get Manga Titel - search =re.search('var total_pages=(.*?);', str(soup)) + search =re.search('var imagecount=(.*?);', str(soup)) pages = search.group(1) return pages @@ -108,11 +110,18 @@ def getPagesUrl(starturl,pages): ''' def getImageUrl(pageurl): # Download Page - page = requests.get(pageurl) + #page = requests.get(pageurl) + logging.getLogger("pyppeteer").setLevel(logging.WARNING) + + + session = HTMLSession() + + page = session.get(pageurl) + page.html.render() #Pass page to parser - soup = BeautifulSoup(page.content, 'html.parser') - var1 = soup.find(id='image') + var1 = page.html.find('img.reader-main-img') + var2 =re.search("style=\'cursor:pointer\' src=\'//(.*?)\'", 
str(var1)) - imageurl = var1['src'] + imageurl = "http://" + var2.group(1) return imageurl diff --git a/requirements.txt b/requirements.txt index 7c075ab..76e4965 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ feedparser==5.2.1 KindleComicConverter==5.4.3 peewee==3.7.0 python-dateutil==2.7.5 +requests-html==0.9.0 From 1c909e29fa6ba81e961f2cc43b6dcecf77804984 Mon Sep 17 00:00:00 2001 From: elia Date: Sun, 7 Apr 2019 11:41:01 +0200 Subject: [PATCH 2/2] Use splash container to render JS sites Relates: #18 --- .dockerignore | 2 ++ README.md | 34 ++++++++++++++------------------ bin/sourceparser/Mangafox.py | 38 ++++++++++++++++++++++++------------ config.ini | 1 + docker-compose.yml | 17 ++++++++++++++++ requirements.txt | 1 - 6 files changed, 61 insertions(+), 32 deletions(-) create mode 100644 docker-compose.yml diff --git a/.dockerignore b/.dockerignore index 27e9f85..11f1fae 100644 --- a/.dockerignore +++ b/.dockerignore @@ -11,3 +11,5 @@ kindlegen.exe venv main.db log/* +docker-compose.yml +Dockerfile diff --git a/README.md b/README.md index 24f4786..bd7c719 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,23 @@ M2em let's you automatically download Mangas via RSS Feed that updates at a conf ## Supported Websites * Mangastream -* MangaFox +* MangaFox (With Splash Rendering container) * Cdmnet # Setup -M2em requires Python3 and I highly recommend working in a virtualenv. Some OS require the python-dev package! +M2em requires Python3 and I highly recommend working in a virtualenv and if you want to use Mangasources which are JavaScript heavy, I actually recommend to use docker to deploy the m2em binary and the rendering service together. Some OS require the python-dev package! + +## Docker Setup +You can use the Dockerfile or the image schemen/m2em. All options in the config.ini are available as environment variable. Make sure you write the exactly the same! + +Have a look at the example Compose file in the repository. 
This will deploy two containers, m2em and splash. Splash is used to render websites which use JavaScript. The alias (which you can add to your bashrc if you want) allows you to directly call the containerized application. + +``` +docker-compose up -d +alias m2em='sudo docker exec -it m2em_m2em_1 ./m2em.py' +m2em -h +``` ## Create and install virtual environment ```x-sh @@ -45,23 +56,6 @@ deactivate ``` Get Kindlegen here: https://www.amazon.com/gp/feature.html?docId=1000765211 -## Docker Setup -You can use the Dockerfile or the image schemen/m2em. All options in the config.ini are available as environment variable. Make sure you write the exactly the same! - -Example Compose file: -``` -version: '2' -services: - m2em: - image: schemen/m2em:latest - environment: - - SMTPServer=mail.example.com - - EmailAddress=comic@example.com - - EmailAddressPw=verysecurepassword - volumes: - - :/usr/src/app/data - -``` ## Concept As a concept, M2em has different workers that run in a loop. All Chapter/user data is saved in a SQLite3 Database. 
@@ -185,6 +179,8 @@ EbookFormat = MOBI # Ebook Profile setting, check # https://github.com/ciromattia/kcc for more information EbookProfile = KV +# If you want to run splash independently change this setting +SplashServer = http://splash:8050 # Sender Email Server Settings SMTPServer = mail.example.com ServerPort = 587 diff --git a/bin/sourceparser/Mangafox.py b/bin/sourceparser/Mangafox.py index f76b875..9d170f3 100644 --- a/bin/sourceparser/Mangafox.py +++ b/bin/sourceparser/Mangafox.py @@ -3,9 +3,9 @@ import logging import re from urllib.parse import urlparse -from requests_html import HTMLSession import requests from bs4 import BeautifulSoup +import bin.Config as Config ''' @@ -13,7 +13,9 @@ ''' - +# Splash Rendering Service address +config = Config.load_config() +splash_server = config["SplashServer"] ''' get Manga Title @@ -110,18 +112,30 @@ def getPagesUrl(starturl,pages): ''' def getImageUrl(pageurl): # Download Page - #page = requests.get(pageurl) - logging.getLogger("pyppeteer").setLevel(logging.WARNING) - - - session = HTMLSession() - page = session.get(pageurl) - page.html.render() + # Splash LUA script + script = """ + splash.resource_timeout = 5 + splash:add_cookie{"IsAdult", "1", "/", domain="fanfox.net"} + splash:on_request(function(request) + if string.find(request.url, "tenmanga.com") ~= nil then + request.abort() + end + end) + splash:go(args.url) + return splash:html() + """ + + logging.debug("Sending rendering request to Splash") + resp = requests.post(str(splash_server + "/run"), json={ + 'lua_source': script, + 'url': pageurl + }) + page = resp.content #Pass page to parser - var1 = page.html.find('img.reader-main-img') - var2 =re.search("style=\'cursor:pointer\' src=\'//(.*?)\'", str(var1)) + var =re.search('style=\"cursor:pointer\" src=\"//(.*?)\"', str(page)) - imageurl = "http://" + var2.group(1) + logging.debug(var.group(1)) + imageurl = "http://" + var.group(1) return imageurl diff --git a/config.ini b/config.ini index 698a0ad..928d885 
100644 --- a/config.ini +++ b/config.ini @@ -4,6 +4,7 @@ Database = data/main.db Sleep = 900 EbookFormat = MOBI EbookProfile = KV +SplashServer = http://splash:8050 # Sender Email Server Settings SMTPServer = mail.example.com ServerPort = 587 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1e3f939 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +version: '2' +services: + m2em: + image: schemen/m2em:latest + environment: + - SMTPServer=mail.example.com + - EmailAddress=comic@example.com + - EmailAddressPw=verysecurepassword + volumes: + - m2em:/usr/src/app/data + + splash: + image: scrapinghub/splash + command: --max-timeout 3600 + +volumes: + m2em: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 76e4965..7c075ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,3 @@ feedparser==5.2.1 KindleComicConverter==5.4.3 peewee==3.7.0 python-dateutil==2.7.5 -requests-html==0.9.0