Skip to content
This repository has been archived by the owner on Dec 11, 2021. It is now read-only.

Unicode enhancement #90

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 76 additions & 20 deletions email2pdf
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@ from requests.exceptions import RequestException
from subprocess import Popen, PIPE
from sys import platform as _platform
import argparse
import base64
import binascii
import email
import functools
import html
import io
import locale
import logging
Expand All @@ -21,13 +24,15 @@ import mimetypes
import os
import os.path
import pprint
import quopri
import re
import requests
import shutil
import sys
import uu
import tempfile
import traceback

_unicodeerror="replace"
mimetypes.init()

HEADER_MAPPING = {'Author': 'From',
Expand Down Expand Up @@ -112,7 +117,7 @@ def main(argv, syslog_handler, syserr_handler):
payload = header_info + payload

logger.debug("Final payload before output_body_pdf: " + payload)
output_body_pdf(input_email, bytes(payload, 'UTF-8'), output_file_name)
output_body_pdf(input_email, payload.encode('UTF-8',_unicodeerror), output_file_name)

if args.attachments:
number_of_attachments = handle_attachments(input_email,
Expand Down Expand Up @@ -167,6 +172,8 @@ def handle_args(argv):
"include the complete path, otherwise it defaults to the current directory. If "
"this option is not specified, email2pdf picks a date & time-based filename and puts "
"the file in the directory specified by --output-directory.")
parser.add_argument("--overwrite",action="store_true",
help="Overwrites the output file, if it already exists")

parser.add_argument("-d", "--output-directory", default=os.getcwd(),
help="If --output-file is not specified, the value of this parameter is used as "
Expand Down Expand Up @@ -230,6 +237,44 @@ def handle_args(argv):
return (False, None)
else:
return (True, args)
def _decodetxt(text, encoding, charset):
    """Undo the Content-Transfer-Encoding of a text payload.

    Function taken from gpgmailencrypt.py
    (https://github.com/gpgmailencrypt/gpgmailencrypt); necessary due to a
    bug in the Python 3 email module.

    Args:
        text: The still transfer-encoded payload as a ``str``.
        encoding: Value of the Content-Transfer-Encoding header. A false
            value is treated as "8bit". Matching is case-insensitive and
            ignores surrounding whitespace. Unrecognised values leave the
            payload untouched.
        charset: Name of the payload charset. A false value, or a name for
            which Python has no codec, falls back to "UTF-8".

    Returns:
        The decoded payload as ``str``. If base64 or uuencode decoding
        fails, the payload is returned undecoded (best effort) instead of
        aborting the conversion.
    """
    if not charset:
        charset = "UTF-8"
    if not encoding:
        encoding = "8bit"

    try:
        bytetext = text.encode(charset, _unicodeerror)
    except LookupError:
        # The mail announced a charset Python has no codec for; without this
        # fallback the final decode() below would fail the same way.
        charset = "UTF-8"
        bytetext = text.encode(charset, _unicodeerror)

    result = bytetext
    # Header values may carry stray whitespace ("base64 ") or mixed case.
    cte = str(encoding).strip().upper()

    if cte == "BASE64":
        pad_err = len(bytetext) % 4
        padding = b'==='[:4 - pad_err] if pad_err else b''
        try:
            # Strict attempt first: only succeeds when the data contains no
            # characters outside the base64 alphabet (e.g. no line breaks).
            result = base64.b64decode(bytetext + padding, validate=True)
        except binascii.Error:
            # Lenient attempts: non-alphabet characters are discarded, so the
            # amount of padding actually required is unknown - try them all.
            for extra in range(4):
                try:
                    result = base64.b64decode(bytetext + b'=' * extra,
                                              validate=False)
                    break
                except binascii.Error:
                    continue
            else:
                # Nothing worked; keep the undecoded bytes rather than
                # crashing, consistent with the uuencode branch below.
                logging.getLogger("email2pdf").warning(
                    "Could not decode base64 payload, using it undecoded.")
    elif cte == "QUOTED-PRINTABLE":
        result = quopri.decodestring(bytetext)
    elif cte in ('X-UUENCODE', 'UUENCODE', 'UUE', 'X-UUE'):
        # NOTE: the `uu` module is deprecated since Python 3.11 and removed
        # in Python 3.13 (PEP 594).
        in_file = io.BytesIO(bytetext)
        out_file = io.BytesIO()
        try:
            uu.decode(in_file, out_file, quiet=True)
        except uu.Error:
            # Deliberate best effort: fall through with the undecoded bytes.
            logging.getLogger("email2pdf").warning(
                "Could not decode uuencoded payload, using it undecoded.")
        else:
            result = out_file.getvalue()

    return result.decode(charset, _unicodeerror)


def get_input_data(args):
Expand Down Expand Up @@ -265,9 +310,9 @@ def get_input_email(input_data):


def get_output_file_name(args, output_directory):
if args.output_file:
if args.output_file :
output_file_name = args.output_file
if os.path.isfile(output_file_name):
if os.path.isfile(output_file_name)and not args.overwrite:
raise FatalException("Output file " + output_file_name + " already exists.")
else:
output_file_name = get_unique_version(os.path.join(output_directory,
Expand Down Expand Up @@ -299,29 +344,39 @@ def handle_message_body(input_email):
part = find_part_by_content_type(input_email, "text/html")
if part is None:
part = find_part_by_content_type(input_email, "text/plain")
assert part is not None
if not part:
return ("",cid_parts_used)
is_text=part.get_content_maintype()=="text"
payload = html.escape(part.get_payload(decode=not is_text))
charset = part.get_content_charset()
cte=part["Content-Transfer-Encoding"]

if part['Content-Transfer-Encoding'] == '8bit':
payload = part.get_payload(decode=False)
assert isinstance(payload, str)
logger.info("Email is pre-decoded because Content-Transfer-Encoding is 8bit")
assert isinstance(payload, str)
logger.info("Email is pre-decoded because Content-Transfer-Encoding is 8bit")
else:
payload = part.get_payload(decode=True)
assert isinstance(payload, bytes)
is_text=part.get_content_maintype()=="text"
payload = part.get_payload(decode=not is_text)
if is_text:
payload=_decodetxt(payload,cte,charset)
charset = part.get_content_charset()
if not charset:
charset = 'utf-8'
logger.info("Determined email is plain text, defaulting to charset utf-8")
else:
logger.info("Determined email is plain text with charset " + str(charset))

payload = "<html><body><pre>\n" + (str(payload, charset)
payload = "<html><body><pre>\n" + (payload.decode(charset,_unicodeerror)
if isinstance(payload, bytes) else payload) + "\n</pre></body></html>"
else:
payload = part.get_payload(decode=True)
is_text=part.get_content_maintype()=="text"
payload = part.get_payload(decode=not is_text)
charset = part.get_content_charset()
cte=part["Content-Transfer-Encoding"]
if not charset:
charset = 'utf-8'
charset = 'utf-8'
if is_text:
payload=_decodetxt(payload,cte,charset)
logger.info("Determined email is HTML with charset " + str(charset))

def cid_replace(cid_parts_used, matchobj):
Expand All @@ -339,21 +394,22 @@ def handle_message_body(input_email):
logger.warning("Could not find image cid " + matchobj.group(1) + " in email content.")
return "broken"

if is_text:
pl=payload
else:
pl= payload.decode(charset,_unicodeerror)
payload = re.sub(r'cid:([\[email protected]]+)', functools.partial(cid_replace, cid_parts_used),
str(payload, charset))
pl)

return (payload, cid_parts_used)


def output_body_pdf(input_email, payload, output_file_name):
logger = logging.getLogger("email2pdf")

wkh2p_process = Popen([WKHTMLTOPDF_EXTERNAL_COMMAND, '-q', '--load-error-handling', 'ignore', '--load-media-error-handling', 'ignore',
wkh2p_process = Popen([WKHTMLTOPDF_EXTERNAL_COMMAND, '-q', '--load-error-handling', 'ignore', '--load-media-error-handling', 'ignore',
'--encoding', 'utf-8', '-', output_file_name], stdin=PIPE, stdout=PIPE, stderr=PIPE)
output, error = wkh2p_process.communicate(input=payload)
assert output == b''

stripped_error = str(error, 'utf-8')
stripped_error = error.decode('utf-8',_unicodeerror)

for error_pattern in WKHTMLTOPDF_ERRORS_IGNORE:
(stripped_error, number_of_subs_made) = re.subn(error_pattern, '', stripped_error)
Expand All @@ -367,7 +423,7 @@ def output_body_pdf(input_email, payload, output_file_name):
raise FatalException("wkhtmltopdf failed with exit code " + str(wkh2p_process.returncode) + ", no error output.")
elif wkh2p_process.returncode > 0 and stripped_error != '':
raise FatalException("wkhtmltopdf failed with exit code " + str(wkh2p_process.returncode) + ", stripped error: " +
str(stripped_error, 'utf-8'))
stripped_error)
elif stripped_error != '':
raise FatalException("wkhtmltopdf exited with rc = 0 but produced unknown stripped error output " + stripped_error)

Expand Down