From 93635981e826b7e4f9cce7f134e9436c3755b9e0 Mon Sep 17 00:00:00 2001 From: Alex Duchesne Date: Fri, 13 Sep 2024 13:53:02 -0400 Subject: [PATCH] Add pub_date support to limetorrents I had to overhauled the parser a little because the old way of using td classes didn't work for our purpose. --- nova3/engines/limetorrents.py | 85 ++++++++++++++++++++--------------- nova3/engines/versions.txt | 2 +- 2 files changed, 51 insertions(+), 36 deletions(-) diff --git a/nova3/engines/limetorrents.py b/nova3/engines/limetorrents.py index 37d8c5a..4b83622 100644 --- a/nova3/engines/limetorrents.py +++ b/nova3/engines/limetorrents.py @@ -1,8 +1,9 @@ -#VERSION: 4.7 +#VERSION: 4.8 # AUTHORS: Lima66 # CONTRIBUTORS: Diego de las Heras (ngosang@hotmail.es) import re +from datetime import datetime, timedelta from html.parser import HTMLParser from urllib.parse import quote @@ -37,38 +38,49 @@ def __init__(self, url): HTMLParser.__init__(self) self.url = url self.current_item = {} # dict for found item - self.item_name = None # key's name in current_item dict self.page_empty = 22000 + self.inside_table = False self.inside_tr = False - self.findTable = False - self.parser_class = {"tdnormal": "size", # class - "tdseed": "seeds", - "tdleech": "leech"} + self.column_index = -1 + self.column_name = None # key's name in current_item dict + self.columns = ["name", "pub_date", "size", "seeds", "leech"] + + now = datetime.now() + self.date_parsers = { + r"yesterday": lambda m: now - timedelta(days=1), + r"last\s+month": lambda m: now - timedelta(days=30), + r"(\d+)\s+years?": lambda m: now - timedelta(days=int(m[1]) * 365), + r"(\d+)\s+months?": lambda m: now - timedelta(days=int(m[1]) * 30), + r"(\d+)\s+days?": lambda m: now - timedelta(days=int(m[1])), + r"(\d+)\s+hours?": lambda m: now - timedelta(hours=int(m[1])), + r"(\d+)\s+minutes?": lambda m: now - timedelta(minutes=int(m[1])), + } def handle_starttag(self, tag, attrs): - params = dict(attrs) + if params.get('class') == 'table2': - self.findTable = True + self.inside_table = True + elif not self.inside_table: + return - if tag == self.TR and self.findTable and (params.get('bgcolor') == '#F4F4F4' or params.get('bgcolor') == '#FFFFFF'): # noqa + if tag == self.TR and (params.get('bgcolor') == '#F4F4F4' or params.get('bgcolor') == '#FFFFFF'): # noqa self.inside_tr = True - self.current_item = {} - if not self.inside_tr: + self.column_index = -1 + self.current_item = {"engine_url": self.url} + elif not self.inside_tr: return - if self.inside_tr and tag == self.TD: - if "class" in params: - self.item_name = self.parser_class.get(params["class"], None) - if self.item_name: - self.current_item[self.item_name] = -1 + if tag == self.TD: + self.column_index += 1 + if self.column_index < len(self.columns): + self.column_name = self.columns[self.column_index] + else: + self.column_name = None - if self.inside_tr and tag == self.A and self.HREF in params: + if self.column_name == "name" and tag == self.A and self.HREF in params: link = params["href"] - if link.startswith("http://itorrents.org/torrent/"): - self.current_item["engine_url"] = self.url - self.item_name = "name" - elif link.endswith(".html"): + if link.endswith(".html"): try: safe_link = quote(self.url + link, safe='/:') except KeyError: @@ -77,26 +89,29 @@ def handle_starttag(self, tag, attrs): self.current_item["desc_link"] = safe_link def handle_data(self, data): - if self.inside_tr and self.item_name: - if self.item_name == 'size' and (data.endswith('MB') or data.endswith('GB')): - self.current_item[self.item_name] = data.strip().replace(',', '') - elif not self.item_name == 'size': - self.current_item[self.item_name] = data.strip().replace(',', '') - - self.item_name = None + if self.column_name: + if self.column_name in ["size", "seeds", "leech"]: + data = data.replace(',', '') + elif self.column_name == "pub_date": + timestamp = -1 + for pattern, calc in self.date_parsers.items(): + m = re.match(pattern, data, re.IGNORECASE) + if m: + timestamp = int(calc(m).timestamp()) + break + data = str(timestamp) + self.current_item[self.column_name] = data.strip() + self.column_name = None def handle_endtag(self, tag): if tag == 'table': - self.findTable = False + self.inside_table = False if self.inside_tr and tag == self.TR: self.inside_tr = False - self.item_name = None - array_length = len(self.current_item) - if array_length < 1: - return - prettyPrinter(self.current_item) - self.current_item = {} + self.column_name = None + if "link" in self.current_item: + prettyPrinter(self.current_item) def download_torrent(self, info): # since limetorrents provides torrent links in itorrent (cloudflare protected), diff --git a/nova3/engines/versions.txt b/nova3/engines/versions.txt index 672def0..1de0ed4 100644 --- a/nova3/engines/versions.txt +++ b/nova3/engines/versions.txt @@ -1,6 +1,6 @@ eztv: 1.16 jackett: 4.0 -limetorrents: 4.7 +limetorrents: 4.8 piratebay: 3.3 solidtorrents: 2.3 torlock: 2.23