Skip to content

Commit

Permalink
Merge pull request #11 from ThreeGiantNoobs/setup-beta
Browse files Browse the repository at this point in the history
added setup for distribution
  • Loading branch information
Ravi-Akagra authored Jun 13, 2021
2 parents 4015de0 + 155e832 commit 0f3f5ad
Show file tree
Hide file tree
Showing 11 changed files with 187 additions and 159 deletions.
13 changes: 7 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
*
!conf.json
!setup.py
!*/
!cheggscraper/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -135,9 +141,4 @@ cookie.json
*.pdf
*.html




*
!template.html
!conf.json
!cheggscraper/template.html
28 changes: 2 additions & 26 deletions Downloader.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,3 @@
"""Thin CLI entry point for the Chegg scraper.

All argument parsing, cookie handling and scraping logic moved into the
``cheggscraper`` package in this commit; this script only delegates to
``cheggscraper.Downloader.main``.
"""
from cheggscraper import Downloader

# Guard the call so importing this module does not trigger the CLI
# (the delegated main() prompts for input / performs network I/O).
if __name__ == '__main__':
    Downloader.main()
91 changes: 0 additions & 91 deletions cheggPcheck.py

This file was deleted.

125 changes: 98 additions & 27 deletions CheggScraper.py → cheggscraper/CheggScraper.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,45 @@
import os
import re
import json
from contextlib import redirect_stderr

import requests
import logging
import unicodedata
from importlib.resources import read_text

from bs4 import BeautifulSoup
from bs4.element import Tag

logging.basicConfig(filename='scraper.log', filemode='w', level=logging.DEBUG)


class CheggScraper(object):
def __init__(self, cookie: str = None, cookie_path: str = None):
class CheggScraper:
"""
Scrape html from chegg.com and store them in a way so you don't need cookie to view the file
"""
def __init__(self, cookie: str = None, cookie_path: str = None, user_agent: str = None, base_path: str = None,
save_file_path: str = None, config: dict = None, template_path: str = None):
if cookie:
self.cookie = cookie
else:
self.cookie = self.parse_cookie(cookie_path)

self.cookie_dict = self.cookie_str_to_dict(self.cookie)

with open('conf.json', 'r') as f:
conf = json.load(f)
user_agent = conf.get('user_agent')
self.base_path = conf.get('base_path')
self.template_path = template_path

if not config:
config = json.loads(read_text('cheggscraper', 'conf.json'))

if not user_agent:
user_agent = config.get('user_agent')
if not user_agent:
raise Exception('user_agent not defined')

if not base_path:
self.base_path = config.get('base_path')

if not self.base_path:
self.base_path = ''

Expand Down Expand Up @@ -54,7 +70,17 @@ def __init__(self, cookie: str = None, cookie_path: str = None):
self.deviceFingerPrintId = self.cookie_dict.get('DFID')

@staticmethod
def slugify(value, allow_unicode=False):
def slugify(value: str, allow_unicode: bool = False) -> str:
"""
slugify the names of files
:param value: string to be slugify
:type value: str
:param allow_unicode: allow unicode
:type allow_unicode: bool
:return: string after slugify
:rtype: str
"""
value = str(value)
if allow_unicode:
value = unicodedata.normalize('NFKC', value)
Expand All @@ -63,27 +89,55 @@ def slugify(value, allow_unicode=False):
value = re.sub(r'[^\w\s-]', '', value.lower())
return re.sub(r'[-\s]+', '-', value).strip('-_')

def render_html(self, **kwargs) -> str:
    """
    Render HTML from a template file containing ``{{var}}`` placeholders.

    Template resolution order:
      1. explicit ``template_path`` keyword argument,
      2. ``self.template_path`` set on the instance,
      3. the packaged ``cheggscraper/template.html`` resource.

    :param kwargs: placeholder name/value pairs to substitute into the
        template; may also carry ``template_path`` (see above)
    :return: rendered html code
    :rtype: str
    """
    template_path = kwargs.get('template_path')
    html_template = None
    if not template_path:
        template_path = self.template_path

    # No path at all: fall back to the template bundled with the package.
    if not template_path:
        html_template = read_text('cheggscraper', 'template.html')

    if not html_template:
        with open(template_path, 'r') as f:
            html_template = f.read()

    # Replace every {{name}} placeholder; missing kwargs render as 'None'
    # (str() of kwargs.get's default), matching the original behavior.
    variables = re.findall(r'\{\{([a-zA-Z_]+)}}', html_template)
    for variable in variables:
        html_template = html_template.replace('{{' + variable + '}}', str(kwargs.get(variable)))
    return html_template

@staticmethod
def replace_src_links(html_text: str):
def replace_src_links(html_text: str) -> str:
"""
Replace relative links from page, so even you are opening file without any host, still can see all contents,
still some css and js won't load
:param html_text: html code of page
:type html_text: str
:return: html code after modify all relative links
:rtype: str
"""
return re.sub(r'src=\s*?"//(.*)?"', r'src="https://\1"', html_text)

@staticmethod
def cookie_str_to_dict(cookie_str: str):
"""
Convert cookie str to dict of key, value pairs
:param cookie_str: cookie in format of string [key=value; key=value]
:type cookie_str: str
:return: dictionary of key value pairs of key value pairs
:rtype: dict
"""
ret = {}
cookie_pairs = cookie_str.split(';')
for pair in cookie_pairs:
Expand All @@ -95,6 +149,14 @@ def cookie_str_to_dict(cookie_str: str):

@staticmethod
def parse_json(json_string: str) -> (bool, dict):
"""
just parse json
:param json_string: json data in format of string
:type json_string: str
:return: tuple of isJson, dictionary form of json
:rtype:
"""
try:
data = json.loads(json_string)
return True, data
Expand All @@ -103,7 +165,15 @@ def parse_json(json_string: str) -> (bool, dict):
return False, None

@staticmethod
def json_to_cookie_str(cookie_dict: dict):
def dict_to_cookie_str(cookie_dict: dict) -> str:
"""
Convert dict to cookie string
:param cookie_dict: dictionary of cookie, key value pairs
:type cookie_dict: dict
:return: cookie in string format
:rtype: str
"""
cookie_str = ''
first_flag = True
for cookie in cookie_dict:
Expand All @@ -114,16 +184,23 @@ def json_to_cookie_str(cookie_dict: dict):
return cookie_str

@staticmethod
def parse_cookie(cookie_path: str):
def parse_cookie(cookie_path: str) -> str:
"""
Parse cookie from cookie_path
:param cookie_path: path of cookie file
:type cookie_path: str
:return: string cookie
:rtype: str
"""
if os.path.exists(cookie_path):
if os.path.isfile(cookie_path):
with open(cookie_path, 'r') as f:
cookie_text = f.read()
json_result = CheggScraper.parse_json(cookie_text)
if json_result[0]:
return CheggScraper.json_to_cookie_str(json_result[1]).strip()
else:
return cookie_text.strip()
return CheggScraper.dict_to_cookie_str(json_result[1]).strip()
return cookie_text.strip()
else:
logging.error(msg=f"{cookie_path} is not a file")
raise Exception
Expand Down Expand Up @@ -157,7 +234,6 @@ def final_touch(html_text: str) -> str:
@return: modified FINAL html Text
@rtype: str
"""

soup = BeautifulSoup(html_text, 'lxml')
if soup.find('div', {'id': 'show-more'}):
soup.find('div', {'id': 'show-more'}).decompose()
Expand All @@ -177,10 +253,9 @@ def _web_response(self, url: str, headers: dict = None, expected_status: tuple =
if response.status_code not in expected_status:
logging.error(msg=f'Expected status code {expected_status} but got {response.status_code}\n{error_note}')
return response
else:
if note:
logging.info(msg=note)
return response
if note:
logging.info(msg=note)
return response

def _get_response_text(self, url: str, headers: dict = None, expected_status: tuple = (200,),
note: str = None, error_note: str = "Error in request"):
Expand Down Expand Up @@ -350,7 +425,3 @@ def url_to_html(self, url: str, file_path: str = None) -> str:
f.write(final_html)

return file_path


if __name__ == '__main__':
pass
Loading

0 comments on commit 0f3f5ad

Please sign in to comment.