Commit
New website added
- Added an option to change the parser used for web pages, since some websites are parsed better with different parsers.
- Added Novelle Leggere novels (Italian translations)
EternalTrail authored May 5, 2020
1 parent 82e051f commit 8308740
Showing 3 changed files with 183 additions and 4 deletions.
178 changes: 178 additions & 0 deletions NovelParsers.py
@@ -736,5 +736,183 @@ def cleanChapter(self, soup):
        return BeautifulSoup(chapter, "html.parser")


class NovelleLeggereParser:

    def __init__(self):

        self.url = "https://www.novelleleggere.com/"

        self.name = "Novelle Leggere"

        # Create containers
        self.novels = {}
        self.novelNames = None
        # self.novelSypnoses = None
        self.isLoaded = False

        # Container for all novels that are requested
        self.novelLibrary = {}


    def load(self):
        if not self.isLoaded:
            self.parseNovelList()
            self.isLoaded = True


    def clearNovelCache(self):
        self.novelLibrary = {}


    def insertSpecialCases(self):
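        # Drop The Legend of Randidly Ghosthound from the catalogue,
        # presumably because its listing does not follow the layout this parser expects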
        del self.novels["The Legend of Randidly Ghosthound"]


    def parseNovelList(self):
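        # The novel listings live in the first two tables on the homepage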

        tables = PageTools().getElementsFromUrl(self.url, ["table"])
        tables = tables[0:2]

        for table in tables:
            novels = PageTools().getElementsFromSoup(table, ["strong", "a"])

            for novel in novels:
                title = novel.text.strip()
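                # Map title -> [novel URL, cover image URL, synopsis];
                # the cover slot is filled in by loadNovelInfo, the third slot appears unused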
                self.novels[title] = [novel['href'], "N/A", "N/A"]

        self.insertSpecialCases()

        self.novelNames = list(self.novels.keys())
        self.novelNames.sort()


    def loadNovelInfo(self, novelName):
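        # Serve from the in-memory cache if this novel has already been loaded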

        if novelName in self.novelLibrary:
            return

        # Load the webpage for the novel
        soup = PageTools().getElementsFromUrl(self.novels[novelName][0], [{"id": "content"}], parser = 'lxml')[0]
        titles = PageTools().getElementsFromSoup(soup, [{"class_": "su-spoiler-title"}], onlyText=True)

        chapterBlocks = PageTools().getElementsFromSoup(soup, [{"class_": "display-posts-listing"}])
        self.novels[novelName][1] = PageTools().getElementsFromSoup(soup, ["img"])[0]['src']

        # Special parsing conditions for pages that deviate from the standard layout
        if novelName == "The Wandering Inn":
            titles = titles[4:]
        elif novelName == "Il Demone Contro il Cielo":
            titles = titles[1:]
            chapterBlocks = PageTools().getElementsFromSoup(soup, [{"class_": "su-spoiler-content su-u-clearfix su-u-trim"}])
        else:
            titles = titles[2:]

        if novelName == "Legendary Moonlight Sculptor":
            chapterBlocks = PageTools().getElementsFromSoup(soup, [{"class_": "su-spoiler-content su-u-clearfix su-u-trim"}])
            chapterBlocks = chapterBlocks[2:]

        # Download the cover image, falling back to the placeholder cover on failure
        try:
            coverImage = PageTools().downloadPage(self.novels[novelName][1])
        except:
            coverImage = PageTools().downloadPage(noCoverLink)

        # Containers for the full chapter list and the per-book table of contents
        chapterLibrary = []
        bookToC = {}
        for title, chapterBlock in zip(titles, chapterBlocks):
            chapters = PageTools().getElementsFromSoup(chapterBlock, ["a"])

            chapterInfo = [[chapter['href'], chapter.string.replace("<", '').replace(">", '')] for chapter in chapters]

            # Store the [link, title] pairs for each book; angle brackets are
            # stripped from titles so they cannot corrupt the ebook file
            bookToC[title] = chapterInfo
            chapterLibrary.extend(bookToC[title])

        # Add the books, chapters, and the cover to the novel library
        self.novelLibrary[novelName] = [titles, chapterLibrary, bookToC, coverImage]


    def getNovelNames(self):

        self.load()
        return self.novelNames


    def getImageBinary(self, novelName):

        self.loadNovelInfo(novelName)
        return self.novelLibrary[novelName][3]


    def getImagePillow(self, novelName):

        return Image.open(BytesIO(self.getImageBinary(novelName)))


    def getNovelBookNames(self, novelName):

        self.loadNovelInfo(novelName)
        return self.novelLibrary[novelName][0]


    def getNovelChapterLinks(self, novelName):

        self.loadNovelInfo(novelName)
        return [chapter[0] for chapter in self.novelLibrary[novelName][1]]


    def getNovelChapterNames(self, novelName):

        self.loadNovelInfo(novelName)
        return [chapter[1] for chapter in self.novelLibrary[novelName][1]]


    def getNovelBookChapterLinks(self, novelName, bookName):

        self.loadNovelInfo(novelName)
        return [chapter[0] for chapter in self.novelLibrary[novelName][2][bookName]]


    def getNovelBookChapterNames(self, novelName, bookName):

        self.loadNovelInfo(novelName)
        return [chapter[1] for chapter in self.novelLibrary[novelName][2][bookName]]


    def cleanChapter(self, soup):
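        # hasSpoiler would hold a spoiler-hidden chapter title; it is never set by
        # this parser, so the note appended at the end below is effectively dead
        # code (seemingly carried over from the other parsers in this file)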

        hasSpoiler = None

        # Extract the chapter title and the chapter content
        chapterTitle = soup.find(class_="entry-title fusion-post-title").string
        content = soup.find(class_="post-content")

        # Escape characters that might corrupt the ebook file
        chapterTitle = chapterTitle.replace("<", '&lt;').replace(">", '&gt;')

        # Remove unnecessary objects and ads from the chapter
        for obj in content.find_all(**{"data-type": "post"}):
            obj.decompose()

        for n in content.find_all(class_="su-note"):
            n.decompose()

        for i in range(1, 10):
            for ad in content.find_all(class_="quads-location quads-ad{}".format(i)):
                ad.decompose()

        # Add an html header to the chapter
        chapter = '<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<title>{0}</title>\n</head>\n<body>\n<h1>{0}</h1>\n'.format(chapterTitle)
        chapter += str(content)
        if hasSpoiler is not None:
            chapter += "<strong>The chapter name is: {}</strong>".format(hasSpoiler)

        # Return the chapter as a BeautifulSoup html object
        return BeautifulSoup(chapter, "html.parser")


if __name__ == "__main__":
    pass
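As a rough usage sketch (not part of the commit): the new class follows the same interface as the other parsers in NovelParsers.py, so assuming the module's dependencies (PageTools, Pillow, BeautifulSoup) are available, something like the following should work.

    from NovelParsers import NovelleLeggereParser

    parser = NovelleLeggereParser()
    parser.load()                          # fetch and cache the novel catalogue
    print(parser.getNovelNames())          # sorted list of available titles

    # Chapter links for one of the special-cased novels above
    links = parser.getNovelChapterLinks("Il Demone Contro il Cielo")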
8 changes: 4 additions & 4 deletions PageTools.py
@@ -40,10 +40,10 @@ def getJsonFromPost(self, url, jsonPayload):
        jsonFile = json.loads(response.text)
        return jsonFile

-    def getSoupFromUrl(self, url):
+    def getSoupFromUrl(self, url, parser = 'html.parser'):

        # Parse the webpage response with the chosen parser
-        soup = BeautifulSoup(self.downloadPage(url), 'html.parser')
+        soup = BeautifulSoup(self.downloadPage(url), parser)
        return soup


@@ -120,7 +120,7 @@ def getElementsFromSoup(self, soup, elementRecursiveList, findAllEnableList = True
        return elementList


-    def getElementsFromUrl(self, url, elementRecursiveList, findAllEnableList = True, onlyText = False, recursionIndex = 0):
+    def getElementsFromUrl(self, url, elementRecursiveList, findAllEnableList = True, onlyText = False, recursionIndex = 0, parser = 'html.parser'):

        # Get the requested elements directly from the url
-        return self.getElementsFromSoup(self.getSoupFromUrl(url), elementRecursiveList, findAllEnableList, onlyText, recursionIndex)
+        return self.getElementsFromSoup(self.getSoupFromUrl(url, parser), elementRecursiveList, findAllEnableList, onlyText, recursionIndex)
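For context, a quick sketch of the new parser option in use, mirroring the lxml call in NovelleLeggereParser.loadNovelInfo above (this assumes PageTools.py exposes the class under the same name, as the rest of the project does, and that the third-party lxml package is installed):

    from PageTools import PageTools

    # Default behaviour is unchanged: html.parser is used when no parser is given
    soup = PageTools().getSoupFromUrl("https://www.novelleleggere.com/")

    # Pages that html.parser handles poorly can opt into lxml instead
    content = PageTools().getElementsFromUrl(
        "https://www.novelleleggere.com/", [{"id": "content"}], parser = 'lxml')[0]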
1 change: 1 addition & 0 deletions README.md
@@ -15,6 +15,7 @@ To run this script you'll need to have Python 3.x.x installed which you can find
- [Gravity Tales](https://gravitytales.com)
- [Volare Novels](https://www.volarenovels.com)
- [Totally Translations](https://totallytranslations.com/)
+ - [Novelle Leggere](https://www.novelleleggere.com/)
- Automatically adds some metadata like title and cover
- Concurrent download of chapters - *significantly* faster download of books than just downloading them one by one (I'm talking an order of magnitude or two faster)

