diff --git a/.dockerignore b/.dockerignore index 27e9f85..11f1fae 100644 --- a/.dockerignore +++ b/.dockerignore @@ -11,3 +11,5 @@ kindlegen.exe venv main.db log/* +docker-compose.yml +Dockerfile diff --git a/README.md b/README.md index 24f4786..bd7c719 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,23 @@ M2em let's you automatically download Mangas via RSS Feed that updates at a conf ## Supported Websites * Mangastream -* MangaFox +* MangaFox (With Splash Rendering container) * Cdmnet # Setup -M2em requires Python3 and I highly recommend working in a virtualenv. Some OS require the python-dev package! +M2em requires Python3 and I highly recommend working in a virtualenv. If you want to use manga sources which are JavaScript heavy, I recommend using Docker to deploy the m2em binary and the rendering service together. Some OS require the python-dev package! + +## Docker Setup +You can use the Dockerfile or the image schemen/m2em. All options in the config.ini are available as environment variables. Make sure you write them exactly the same! + +Have a look at the example Compose file in the repository. This will deploy two containers, m2em and splash. Splash is used to render websites which use JavaScript. The alias (which you can add to your bashrc if you want) allows you to directly call the containerized application. + +``` +docker-compose up -d +alias m2em='sudo docker exec -it m2em_m2em_1 ./m2em.py' +m2em -h +``` ## Create and install virtual environment ```x-sh @@ -45,23 +56,6 @@ deactivate Get Kindlegen here: https://www.amazon.com/gp/feature.html?docId=1000765211 -## Docker Setup -You can use the Dockerfile or the image schemen/m2em. All options in the config.ini are available as environment variable. Make sure you write the exactly the same! 
- -Example Compose file: -``` -version: '2' -services: - m2em: - image: schemen/m2em:latest - environment: - - SMTPServer=mail.example.com - - EmailAddress=comic@example.com - - EmailAddressPw=verysecurepassword - volumes: - - :/usr/src/app/data - -``` ## Concept As a concept, M2em has different workers that run in a loop. All Chapter/user data is saved in a SQLite3 Database. @@ -185,6 +179,8 @@ EbookFormat = MOBI # Ebook Profile setting, check # https://github.com/ciromattia/kcc for more information EbookProfile = KV +# If you want to run splash independently, change this setting +SplashServer = http://splash:8050 # Sender Email Server Settings SMTPServer = mail.example.com ServerPort = 587 diff --git a/bin/sourceparser/Mangafox.py b/bin/sourceparser/Mangafox.py index 74bb704..9d170f3 100644 --- a/bin/sourceparser/Mangafox.py +++ b/bin/sourceparser/Mangafox.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse import requests from bs4 import BeautifulSoup +import bin.Config as Config ''' @@ -12,21 +13,24 @@ ''' - +# Splash Rendering Service address +config = Config.load_config() +splash_server = config["SplashServer"] ''' get Manga Title Returns: title ''' def getTitle(page): + title = None soup = BeautifulSoup(page.content, 'html.parser') #Get Manga Titel - var = soup.findAll("h2") - step1 = ''.join(var[0].findAll(text=True)) - step2 = step1.split() - step3 = step2[:-3] - title = ' '.join(step3) + search = re.search('content="Read\s(.*?)\smanga online,', str(soup)) + try: + title = search.group(1) + except AttributeError: + logging.error("No Title Fount!") return title @@ -56,7 +60,7 @@ def getPages(page): soup = BeautifulSoup(page.content, 'html.parser') #Get Manga Titel - search =re.search('var total_pages=(.*?);', str(soup)) + search =re.search('var imagecount=(.*?);', str(soup)) pages = search.group(1) return pages @@ -108,11 +112,30 @@ def getPagesUrl(starturl,pages): ''' def getImageUrl(pageurl): # Download Page - page = requests.get(pageurl) + + # Splash LUA 
script + script = """ + splash.resource_timeout = 5 + splash:add_cookie{"IsAdult", "1", "/", domain="fanfox.net"} + splash:on_request(function(request) + if string.find(request.url, "tenmanga.com") ~= nil then + request.abort() + end + end) + splash:go(args.url) + return splash:html() + """ + + logging.debug("Sending rendering request to Splash") + resp = requests.post(str(splash_server + "/run"), json={ + 'lua_source': script, + 'url': pageurl + }) + page = resp.content #Pass page to parser - soup = BeautifulSoup(page.content, 'html.parser') - var1 = soup.find(id='image') + var =re.search('style=\"cursor:pointer\" src=\"//(.*?)\"', str(page)) - imageurl = var1['src'] + logging.debug(var.group(1)) + imageurl = "http://" + var.group(1) return imageurl diff --git a/config.ini b/config.ini index 698a0ad..928d885 100644 --- a/config.ini +++ b/config.ini @@ -4,6 +4,7 @@ Database = data/main.db Sleep = 900 EbookFormat = MOBI EbookProfile = KV +SplashServer = http://splash:8050 # Sender Email Server Settings SMTPServer = mail.example.com ServerPort = 587 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..1e3f939 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +version: '2' +services: + m2em: + image: schemen/m2em:latest + environment: + - SMTPServer=mail.example.com + - EmailAddress=comic@example.com + - EmailAddressPw=verysecurepassword + volumes: + - m2em:/usr/src/app/data + + splash: + image: scrapinghub/splash + command: --max-timeout 3600 + +volumes: + m2em: \ No newline at end of file