Skip to content

Commit

Permalink
tumblr_backup: Use the internal SVC API to access dashboard-only blogs
Browse files Browse the repository at this point in the history
Also add tumblr_login.py to generate cookies without a browser.
Credit to @johanneszab for the C# implementation in TumblThree.
Credit to @MrEldritch for the initial Python port.

Fixes bbolli#161
  • Loading branch information
cebtenzzre committed Sep 24, 2020
1 parent 863a9ae commit debefb3
Show file tree
Hide file tree
Showing 5 changed files with 411 additions and 181 deletions.
62 changes: 21 additions & 41 deletions note_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,26 @@

from bs4 import BeautifulSoup

from util import ConnectionFile, is_dns_working, to_bytes, to_native_str
from util import (ConnectionFile, URLLIB3_FROM_PIP,
is_dns_working, make_requests_session, setup_urllib3_ssl, to_bytes, to_native_str)

if TYPE_CHECKING:
from typing import List, Text

try:
from http.cookiejar import MozillaCookieJar
from urllib.parse import quote, urlparse, urlsplit, urlunsplit
except ImportError:
from cookielib import MozillaCookieJar # type: ignore[no-redef]
from urllib import quote # type: ignore[attr-defined,no-redef]
from urlparse import urlparse, urlsplit, urlunsplit # type: ignore[no-redef]

if URLLIB3_FROM_PIP:
from pip._vendor.urllib3 import Retry, Timeout
from pip._vendor.urllib3.exceptions import HTTPError, InsecureRequestWarning
else:
from urllib3 import Retry, Timeout
from urllib3.exceptions import HTTPError, InsecureRequestWarning

setup_urllib3_ssl()

try:
import requests
Expand All @@ -31,27 +42,11 @@
raise RuntimeError('The requests module is required for note scraping. '
'Please install it with pip or your package manager.')

try:
from urllib.parse import quote, urlparse, urlsplit, urlunsplit
except ImportError:
from urllib import quote # type: ignore[attr-defined,no-redef]
from urlparse import urlparse, urlsplit, urlunsplit # type: ignore[no-redef]

try:
from urllib3 import Retry
from urllib3.exceptions import HTTPError, InsecureRequestWarning
except ImportError:
try:
# pip includes urllib3
from pip._vendor.urllib3 import Retry
from pip._vendor.urllib3.exceptions import HTTPError, InsecureRequestWarning
except ImportError:
raise RuntimeError('The urllib3 module is required. Please install it with pip or your package manager.')

EXIT_SUCCESS = 0
EXIT_SAFE_MODE = 2
EXIT_NO_INTERNET = 3

HTTP_TIMEOUT = Timeout(90)
HTTP_RETRY = Retry(3, connect=False)

# Globals
Expand All @@ -78,27 +73,12 @@ class WebCrawler(object):

TRY_LIMIT = 2 # For code 429, only give it one extra try

def __init__(self, noverify, cookiefile, notes_limit):
def __init__(self, noverify, user_agent, cookiefile, notes_limit):
self.notes_limit = notes_limit
self.lasturl = None

self.session = requests.Session()
self.session.verify = not noverify
for adapter in self.session.adapters.values():
adapter.max_retries = HTTP_RETRY

if cookiefile is not None:
cookies = MozillaCookieJar(cookiefile)
cookies.load()

# Session cookies are denoted by either `expires` field set to an empty string or 0. MozillaCookieJar only
# recognizes the former (see https://bugs.python.org/issue17164).
for cookie in cookies:
if cookie.expires == 0:
cookie.expires = None
cookie.discard = True

self.session.cookies = cookies # type: ignore[assignment]
self.session = make_requests_session(
requests.Session, HTTP_RETRY, HTTP_TIMEOUT, not noverify, user_agent, cookiefile,
)

@classmethod
def quote_unsafe(cls, string):
Expand Down Expand Up @@ -228,7 +208,7 @@ def get_notes(self, post_url):
return u''.join(notes_list)


def main(stdout_conn, msg_conn, post_url_, ident_, noverify, notes_limit, cookiefile):
def main(stdout_conn, msg_conn, post_url_, ident_, noverify, user_agent, cookiefile, notes_limit):
global post_url, ident, msg_pipe
post_url, ident = post_url_, ident_

Expand All @@ -237,7 +217,7 @@ def main(stdout_conn, msg_conn, post_url_, ident_, noverify, notes_limit, cookie
warnings.filterwarnings('ignore', category=InsecureRequestWarning)

with ConnectionFile(msg_conn, 'w') as msg_pipe:
crawler = WebCrawler(noverify, cookiefile, notes_limit)
crawler = WebCrawler(noverify, user_agent, cookiefile, notes_limit)

try:
notes = crawler.get_notes(post_url)
Expand Down
Loading

0 comments on commit debefb3

Please sign in to comment.