-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDF_Fix.py
207 lines (190 loc) · 8.55 KB
/
PDF_Fix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python3
""" Enlarges very thin lines of PDF files for visibility and printing """
import os
import re # regular expressions
import sys
import shutil # for locating the GhostScript executable (shutil.which)
import subprocess # for running ghostscript
_author_ = 'RDTSC'
_version_ = '0.1'
_date_ = '2021'
# Locate a GhostScript executable on PATH when the module is loaded.  The
# command name that was found is kept in the module-level ``gs``, which main()
# passes to subprocess.  Candidates are tried in order, so the original
# preference ("gs", then "gswin64.exe") is unchanged; the remaining Windows
# executable names are only fallbacks.
try:
    gs = None
    for _candidate in ("gs", "gswin64.exe", "gswin64c.exe",
                       "gswin32.exe", "gswin32c.exe"):
        if shutil.which(_candidate):
            gs = _candidate
            break
    del _candidate                      # do not leak the loop variable
    if gs is None:
        print("GhostScript is required! See http://www.ghostscript.com")
        sys.exit(1)                     # fatal: report failure to the caller
except shutil.Error as e:
    # Format the exception instead of concatenating it: str + Exception
    # raises TypeError and would hide the real error.
    print(f"Error checking for GhostScript: {e}")
    sys.exit(1)
# A line-width operand below 1pt ("0", "0.", "0.123" or ".5") followed by one
# space and the "w" (set line width) operator.  The lookarounds require the
# whole token to be whitespace-delimited, so the tail of "10 w" or "2.50 w" is
# never mistaken for a zero width.  Upper-case "W" is the PDF clipping
# operator and takes no operand, so it is deliberately not matched.
_THIN_WIDTH = re.compile(rb'(?<!\S)(0+(?:\.\d*)?|\.\d+) w(?!\S)')


def _format_width(value):
    """Return *value* as ASCII bytes in plain decimal notation.

    ``str(float)`` switches to exponent notation for small values, which is
    not a valid PDF number, so a fixed-point format is used instead.  Trailing
    zeros are trimmed but one decimal digit is always kept (``2.0``).
    """
    text = f"{value:.6f}".rstrip("0")
    if text.endswith("."):
        text += "0"
    return text.encode("ascii")


def _prompt_width(prompt, default, ask):
    """Ask for a width until the reply parses.

    Returns the chosen width (rounded to 6 decimals), *default* when the reply
    is empty, or ``None`` when the reply is zero or negative (skip this match).
    """
    while True:
        reply = ask(prompt)
        if reply == "":
            return default
        try:
            value = round(float(reply), 6)
        except (ValueError, TypeError):
            value = None
        # NaN (value != value) and infinity cannot be written as a width.
        if value is None or value != value or value == float("inf"):
            print("Value not understood... try again or 0 to skip.")
            continue
        return value if value > 0 else None


def search(file, ask=None):
    """Interactively widen line widths below 1pt in an uncompressed PDF.

    Every ``<number> w`` operator whose number is below 1 is shown to the
    user with a suggested replacement of twenty times the original (0.1 for
    0pt hairlines).  An empty reply accepts the suggestion, a positive number
    overrides it, and zero or a negative number leaves the match untouched.
    The file is then rewritten in place.

    Args:
        file: path of the decompressed PDF to modify.
        ask: callable taking the prompt text and returning the reply string;
            defaults to the built-in ``input``.  Lets callers drive the
            function non-interactively.

    Returns:
        The number of widths that were changed.

    NOTE(review): a replacement may be longer than the original number, so
    byte offsets inside the file can shift; presumably the GhostScript pass
    that follows rebuilds them -- confirm.
    """
    if ask is None:
        ask = input

    with open(file, mode='rb') as f:    # read-only: nothing is written here
        data = f.read()

    # The data is scanned as immutable bytes and the output is assembled from
    # pieces.  Editing a bytearray in place while finditer() still holds it
    # raises BufferError as soon as a replacement changes the length.
    pieces = []
    copied = 0                          # end of the data already copied
    changes = 0
    for match in _THIN_WIDTH.finditer(data):
        found = match.group()           # e.g. b"0.123 w"
        number = match.group(1)         # e.g. b"0.123"
        # Rounding removes binary float noise such as 1.4000000000000001.
        suggestion = round(float(number) * 20, 6)
        if suggestion == 0.0:           # 0pt "hairlines": suggest 0.1pt
            suggestion = 0.1
        prompt = f'*** Found: "{found.decode("ascii")}"; set to [{suggestion}] or custom: '
        width = _prompt_width(prompt, suggestion, ask)
        if width is None:               # user chose to skip this match
            continue
        # Pad with trailing zeros up to the original length where possible so
        # that short replacements do not move the bytes that follow.
        val = _format_width(width).ljust(len(number), b"0") + b" w"
        print(f'*** Changing {found} to {val}')
        pieces.append(data[copied:match.start()])
        pieces.append(val)
        copied = match.end()
        changes += 1
    pieces.append(data[copied:])

    print(f"*** Updating bytes in {file}...")
    with open(file, mode='wb') as f:    # truncate and write the new content
        f.write(b"".join(pieces))
    return changes
def _confirm_overwrite(path):
    """Ask whether the existing *path* may be overwritten; Enter means yes."""
    while True:
        choice = input(f"*** WARNING: {path} exists, overwrite? [Y/N] (Y): ")
        choice = choice.strip().lower()
        if choice in ("y", ""):
            return True
        if choice == "n":
            return False


def _run_ghostscript(source, target, compress):
    """Rewrite *source* to *target* with the pdfwrite device.

    Returns True on success, False when GhostScript exits with an error or
    cannot be started at all.
    """
    flag = "true" if compress else "false"
    try:
        subprocess.run([gs, '-sDEVICE=pdfwrite', f'-o{target}', '-q',
                        f'-dCompressPages={flag}', f'{source}'], check=True)
    except (subprocess.CalledProcessError, OSError):
        return False
    return True


def main(argv):
    """Fix hair/thin lines in every PDF named in *argv*.

    For each input file: validate it, decompress it with GhostScript to
    ``<name>-decompressed.pdf``, let search() widen thin lines interactively,
    recompress to ``<name>-fixed.pdf`` and delete the decompressed copy.

    Args:
        argv: argument list in ``sys.argv`` form; ``argv[0]`` is ignored.

    Returns:
        0 when every file was handled without error, 1 otherwise.
    """
    retval = 0                          # 1 for error
    nfiles = 0                          # number of processed files
    nchanges = 0                        # number of total changes made
    print(f"*** GhostScript found as: \"{gs}\"")
    for infile in argv[1:]:
        # qualify input file(s)...
        print(f"*** Input file: {infile}")
        if not infile.lower().endswith(".pdf"):     # any letter case
            print("*** File is not a .PDF file, ignored!\n")
            retval = 1
            continue
        if not os.path.isfile(infile):
            print("*** File does not exist, ignored!\n")
            retval = 1
            continue
        # file data must start with "%PDF"; only those bytes are read
        try:
            with open(infile, mode='rb') as file:
                signature = file.read(4)
        except OSError as e:
            print(f"*** Could not read {infile}: {e}\n")
            retval = 1
            continue
        if signature != b"%PDF":
            print(f"*** {infile} is not a valid .PDF file!\n")
            retval = 1
            continue
        print(f"*** {infile} signature is correct, processing...")
        nfiles += 1

        # decompress input file
        midfile = os.path.splitext(infile)[0] + "-decompressed.pdf"
        if os.path.isfile(midfile) and not _confirm_overwrite(midfile):
            continue                    # skip this file
        print(f"*** Decompressing {infile} to {midfile} via GS...")
        if not _run_ghostscript(infile, midfile, compress=False):
            print(f"*** Error decompressing {infile}!")
            retval = 1
            continue

        # search() edits the decompressed file in place and returns a count
        print(f"*** Searching {midfile} for lines smaller than 1pt...")
        nchanges += search(midfile)

        # recompress to outfile; if the user declines, the decompressed file
        # (with the edits just made) is left on disk
        outfile = os.path.splitext(infile)[0] + "-fixed.pdf"
        if os.path.isfile(outfile) and not _confirm_overwrite(outfile):
            continue
        print(f"*** Compressing {midfile} to {outfile} via GS...")
        if not _run_ghostscript(midfile, outfile, compress=True):
            print(f"*** Error compressing {outfile}!")
            retval = 1
            continue

        # remove decompressed file
        print(f"*** Deleting {midfile}...")
        try:
            os.remove(midfile)
        except OSError as e:            # includes FileNotFoundError
            print(f"*** Could not delete {midfile}: {e}")
            retval = 1
        # this file is complete!
        print("*** Success!\n")

    # all files are complete!
    print(f"{nfiles} file(s) processed. {nchanges} change(s) made.")
    if retval == 1:
        # keep a drag-and-drop console window open so the errors can be read
        input("\nErrors occurred! Press <Enter> to exit: ")
    return retval
if __name__ == "__main__":
    # For debugging, assign sys.argv here before the checks below, e.g.:
    # sys.argv = ["PDF_Fix.py", "Test.pdf", "b.PDF", "c.XDF"]
    if len(sys.argv) == 1:
        # No files given: show the usage text and wait, so a console window
        # opened by double-clicking stays visible.
        print(f"""PDF Fix Hairlines by {_author_} v{_version_} {_date_}\n
Usage: [python3 PDF_Fix.py file.pdf] or drag .pdf file(s) onto .cmd or .sh
Purpose: uncompresses and searches for line widths less than 1.000pt. If any
thin lines are found, these may be changed. The default increase is twenty
times the original, so a width of 0.123pt defaults to 2.46pt, but you may enter
a size if desired. A special case is 0pt, so-called HairLines. These default to
0.1pt but you may need to experiment to get a usable size.\n
You may want to try printing the Test.pdf and Test-fixed.pdf examples.\n""")
        input("Press <Enter> to exit: ")
        sys.exit()
    # Pass main()'s status on as the process exit code so that calling
    # scripts can detect failures.
    sys.exit(main(sys.argv))
# EOF