Skip to content

Commit

Permalink
Merge pull request #11 from ThreeGiantNoobs/setup-beta
Browse files Browse the repository at this point in the history
added setup for distribution
  • Loading branch information
Ravi-Akagra authored Jun 13, 2021
2 parents 4015de0 + 155e832 commit 0f3f5ad
Show file tree
Hide file tree
Showing 11 changed files with 187 additions and 159 deletions.
13 changes: 7 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
*
!conf.json
!setup.py
!*/
!cheggscraper/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -135,9 +141,4 @@ cookie.json
*.pdf
*.html




*
!template.html
!conf.json
!cheggscraper/template.html
28 changes: 2 additions & 26 deletions Downloader.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,3 @@
"""Thin CLI entry point for the Chegg scraper.

All argument parsing, cookie handling and scraping logic moved into the
``cheggscraper`` package in this commit; this script only delegates to
``cheggscraper.Downloader.main``.
"""
from cheggscraper import Downloader

# Guard the call so importing this module does not trigger the CLI
# (the delegated main() prompts for input / performs network I/O).
if __name__ == '__main__':
    Downloader.main()
91 changes: 0 additions & 91 deletions cheggPcheck.py

This file was deleted.

125 changes: 98 additions & 27 deletions CheggScraper.py → cheggscraper/CheggScraper.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,45 @@
import os
import re
import json
from contextlib import redirect_stderr

import requests
import logging
import unicodedata
from importlib.resources import read_text

from bs4 import BeautifulSoup
from bs4.element import Tag

logging.basicConfig(filename='scraper.log', filemode='w', level=logging.DEBUG)


class CheggScraper(object):
def __init__(self, cookie: str = None, cookie_path: str = None):
class CheggScraper:
"""
Scrape html from chegg.com and store them in a way so you don't need cookie to view the file
"""
def __init__(self, cookie: str = None, cookie_path: str = None, user_agent: str = None, base_path: str = None,
save_file_path: str = None, config: dict = None, template_path: str = None):
if cookie:
self.cookie = cookie
else:
self.cookie = self.parse_cookie(cookie_path)

self.cookie_dict = self.cookie_str_to_dict(self.cookie)

with open('conf.json', 'r') as f:
conf = json.load(f)
user_agent = conf.get('user_agent')
self.base_path = conf.get('base_path')
self.template_path = template_path

if not config:
config = json.loads(read_text('cheggscraper', 'conf.json'))

if not user_agent:
user_agent = config.get('user_agent')
if not user_agent:
raise Exception('user_agent not defined')

if not base_path:
self.base_path = config.get('base_path')

if not self.base_path:
self.base_path = ''

Expand Down Expand Up @@ -54,7 +70,17 @@ def __init__(self, cookie: str = None, cookie_path: str = None):
self.deviceFingerPrintId = self.cookie_dict.get('DFID')

@staticmethod
def slugify(value, allow_unicode=False):
def slugify(value: str, allow_unicode: bool = False) -> str:
"""
slugify the names of files
:param value: string to be slugify
:type value: str
:param allow_unicode: allow unicode
:type allow_unicode: bool
:return: string after slugify
:rtype: str
"""
value = str(value)
if allow_unicode:
value = unicodedata.normalize('NFKC', value)
Expand All @@ -63,27 +89,55 @@ def slugify(value, allow_unicode=False):
value = re.sub(r'[^\w\s-]', '', value.lower())
return re.sub(r'[-\s]+', '-', value).strip('-_')

def render_html(self, **kwargs) -> str:
    """
    Render HTML from a template file containing ``{{var}}`` placeholders.

    Template resolution order:
      1. explicit ``template_path`` keyword argument,
      2. ``self.template_path`` set on the instance,
      3. the packaged ``cheggscraper/template.html`` resource.

    :param kwargs: placeholder name/value pairs to substitute into the
        template; may also carry ``template_path`` (see above)
    :return: rendered html code
    :rtype: str
    """
    template_path = kwargs.get('template_path')
    html_template = None
    if not template_path:
        template_path = self.template_path

    # No path at all: fall back to the template bundled with the package.
    if not template_path:
        html_template = read_text('cheggscraper', 'template.html')

    if not html_template:
        with open(template_path, 'r') as f:
            html_template = f.read()

    # Replace every {{name}} placeholder; missing kwargs render as 'None'
    # (str() of kwargs.get's default), matching the original behavior.
    variables = re.findall(r'\{\{([a-zA-Z_]+)}}', html_template)
    for variable in variables:
        html_template = html_template.replace('{{' + variable + '}}', str(kwargs.get(variable)))
    return html_template

@staticmethod
def replace_src_links(html_text: str):
def replace_src_links(html_text: str) -> str:
"""
Replace relative links from page, so even you are opening file without any host, still can see all contents,
still some css and js won't load
:param html_text: html code of page
:type html_text: str
:return: html code after modify all relative links
:rtype: str
"""
return re.sub(r'src=\s*?"//(.*)?"', r'src="https://\1"', html_text)

@staticmethod
def cookie_str_to_dict(cookie_str: str):
"""
Convert cookie str to dict of key, value pairs
:param cookie_str: cookie in format of string [key=value; key=value]
:type cookie_str: str
:return: dictionary of key value pairs of key value pairs
:rtype: dict
"""
ret = {}
cookie_pairs = cookie_str.split(';')
for pair in cookie_pairs:
Expand All @@ -95,6 +149,14 @@ def cookie_str_to_dict(cookie_str: str):

@staticmethod
def parse_json(json_string: str) -> (bool, dict):
"""
just parse json
:param json_string: json data in format of string
:type json_string: str
:return: tuple of isJson, dictionary form of json
:rtype:
"""
try:
data = json.loads(json_string)
return True, data
Expand All @@ -103,7 +165,15 @@ def parse_json(json_string: str) -> (bool, dict):
return False, None

@staticmethod
def json_to_cookie_str(cookie_dict: dict):
def dict_to_cookie_str(cookie_dict: dict) -> str:
"""
Convert dict to cookie string
:param cookie_dict: dictionary of cookie, key value pairs
:type cookie_dict: dict
:return: cookie in string format
:rtype: str
"""
cookie_str = ''
first_flag = True
for cookie in cookie_dict:
Expand All @@ -114,16 +184,23 @@ def json_to_cookie_str(cookie_dict: dict):
return cookie_str

@staticmethod
def parse_cookie(cookie_path: str):
def parse_cookie(cookie_path: str) -> str:
"""
Parse cookie from cookie_path
:param cookie_path: path of cookie file
:type cookie_path: str
:return: string cookie
:rtype: str
"""
if os.path.exists(cookie_path):
if os.path.isfile(cookie_path):
with open(cookie_path, 'r') as f:
cookie_text = f.read()
json_result = CheggScraper.parse_json(cookie_text)
if json_result[0]:
return CheggScraper.json_to_cookie_str(json_result[1]).strip()
else:
return cookie_text.strip()
return CheggScraper.dict_to_cookie_str(json_result[1]).strip()
return cookie_text.strip()
else:
logging.error(msg=f"{cookie_path} is not a file")
raise Exception
Expand Down Expand Up @@ -157,7 +234,6 @@ def final_touch(html_text: str) -> str:
@return: modified FINAL html Text
@rtype: str
"""

soup = BeautifulSoup(html_text, 'lxml')
if soup.find('div', {'id': 'show-more'}):
soup.find('div', {'id': 'show-more'}).decompose()
Expand All @@ -177,10 +253,9 @@ def _web_response(self, url: str, headers: dict = None, expected_status: tuple =
if response.status_code not in expected_status:
logging.error(msg=f'Expected status code {expected_status} but got {response.status_code}\n{error_note}')
return response
else:
if note:
logging.info(msg=note)
return response
if note:
logging.info(msg=note)
return response

def _get_response_text(self, url: str, headers: dict = None, expected_status: tuple = (200,),
note: str = None, error_note: str = "Error in request"):
Expand Down Expand Up @@ -350,7 +425,3 @@ def url_to_html(self, url: str, file_path: str = None) -> str:
f.write(final_html)

return file_path


if __name__ == '__main__':
pass
Loading

0 comments on commit 0f3f5ad

Please sign in to comment.