forked from primejyothi/pdfSplit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplitPdf.sh
executable file
·380 lines (327 loc) · 8.58 KB
/
splitPdf.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
#! /usr/bin/env bash
# Script to build smaller pdf files from pdf files in a given folder.
#
# Extract the images from the original pdf
# Reduce the quality of the images to 10%
# Generate pdf files with 10 or 15 images.
# Combine all the pdf files generated in the previous step
# Requirements / Dependencies
# 1. Requires convert utility from ImageMagick suite
# 2. Requires pdfunite from poppler package
# 3. Need sufficient space in the directory pointed by imgF
# which points to /tmp/imgs
# Assumptions:
# 1. Pdf files are generated out of images and do not contain text.
# 2. The required data is in the extracted jpg files and not in ppm files
# The control csv has the school code and the number of students.
# The fields are comma separated and records are separated by new lines.
# Prime Jyothi (primejyothi at gmail dot com), 20140112
# License GPLv3
# Print an informational message on stdout, prefixed with "Log : ".
# Arguments: $@ - message words, written on one line separated by spaces.
function log ()
{
    # Join with a single space no matter what IFS the caller has in effect.
    local IFS=' '
    printf '%s\n' "Log : $*"
}
# Print an error message on stdout, prefixed with "Error : ".
# Arguments: $@ - message words, written on one line separated by spaces.
function err ()
{
    # Join with a single space no matter what IFS the caller has in effect.
    local IFS=' '
    printf '%s\n' "Error : $*"
}
# Print a debug message on stdout, prefixed with "Debug : ".
# Nothing is printed unless the global dbgFlag is non-empty (the -d option
# sets it to Y).
# Arguments: $@ - message words, written on one line separated by spaces.
function dbg ()
{
    if [[ -n "$dbgFlag" ]]
    then
        # Join with a single space no matter what IFS the caller has in effect.
        local IFS=' '
        printf '%s\n' "Debug : $*"
    fi
}
# Print the command line usage summary on stdout.
function help ()
{
    # ${0##*/} is the script name without its directory part; unlike an
    # unquoted `basename $0` it is not affected by spaces in the path.
    echo "Usage ${0##*/} [-d] [-h] -c ControlFile -f PDF Folder -p Pages per Student [-q image quality] -o OutputFolder -t TempFolder"
    echo -e "\t -c : Control file (csv with school code and number of students)"
    echo -e "\t -f : Folder containing the PDF files to split"
    echo -e "\t -p : Number of pages per student"
    echo -e "\t -q : Image quality in percent passed to convert (optional)"
    echo -e "\t -o : Output folder for the generated PDF files"
    echo -e "\t -t : Temp folder in which the imgs and pdf work folders are created"
    echo -e "\t -d : Enable debug messages"
    echo -e "\t -h : Display this help message"
}
# Convert a batch of page images into one intermediate PDF and delete the
# images that went into it.
# Globals read: currentPDF, currentPage, schoolCode, imgPDF
# Arguments:    $@ - paths of the images to put into the PDF, in page order
# Output file:  ${imgPDF}/<schoolCode>_<source pdf base name>_<page>.pdf
# Exits the script with status 2 when convert fails. The images are kept in
# that case; deleting them without a PDF having been written loses the pages.
function createPdfs ()
{
    local -a pages=("$@")
    local pdfPartName pgNo newName

    log $LINENO "Converting images to PDFs"
    pdfPartName=$(basename "$currentPDF" .pdf)
    # Set the page numbers in 4 digits, otherwise the ordering will be
    # lost when ls uses dictionary sorting.
    printf -v pgNo '%04d' "${currentPage:-0}"
    newName=${schoolCode}_${pdfPartName}_${pgNo}
    dbg "$LINENO New name [$newName]"

    if ! convert "${pages[@]}" "${imgPDF}/${newName}.pdf"
    then
        err "$LINENO convert failed for [${newName}.pdf], images were not deleted"
        exit 2
    fi

    dbg $LINENO "Deleting [${pages[*]}]"
    rm -- "${pages[@]}"
}
# Extract the page images of one PDF listed in pdf.lst into the image folder.
# Globals read:    imgF, imgQlty
# Globals written: currentPDF (path of the PDF that was split)
# Arguments:       $1 - 1-based line number in pdf.lst of the PDF to split
# Exits the script with status 0 when pdf.lst has no such line (every PDF has
# been consumed) and with status 2 when pdfimages reports a failure.
function splitPdf ()
{
    local num=$1
    local pdf2Split

    pdf2Split=$(tail -n +"${num}" pdf.lst | head -1)
    if [[ -z "$pdf2Split" ]]
    then
        # No more PDF files to split, exit
        log "$LINENO No more PDF files to split, exiting"
        exit 0
    fi
    log "$LINENO Splitting pdf file [$pdf2Split] into images."
    dbg $LINENO splitPdf $num

    # A PDF that cannot be split would otherwise be skipped silently and
    # every following school would receive the wrong pages.
    if ! pdfimages -p -j "$pdf2Split" "${imgF}/jj"
    then
        err "$LINENO pdfimages failed for [$pdf2Split]"
        exit 2
    fi

    # Only the jpg files are used. -f keeps rm quiet when the PDF produced
    # no ppm files at all; :? refuses to run against an empty folder name.
    rm -f -- "${imgF:?}"/*.ppm

    if [[ -n "${imgQlty}" ]]
    then
        log $LINENO Image Quality $imgQlty
        convertImages
    else
        log $LINENO Image Quality Empty
    fi
    currentPDF=$pdf2Split
}
# Re-encode every jpg in the image folder at the requested quality, replacing
# each file in place.
# Globals read: imgF, imgQlty
# An image that convert cannot process keeps its original content.
function convertImages ()
{
    local img

    log "$LINENO Changing the image quality to [${imgQlty}%]"
    for img in "${imgF}"/*.jpg
    do
        # Without a match the pattern itself is returned, skip it.
        [[ -e "$img" ]] || continue
        dbg $LINENO "[$img]"
        if convert -quality "${imgQlty}" "$img" "${imgF}/new.jpg"
        then
            # Rename the converted file.
            mv -- "${imgF}/new.jpg" "$img"
        else
            err "$LINENO Unable to convert [$img], keeping the original image"
            # A partial new.jpg left behind would be picked up as a page.
            rm -f -- "${imgF}/new.jpg"
        fi
    done
}
# Merge the intermediate PDFs into one output PDF and clear the intermediate
# folder.
# Globals read: imgPDF, outPDF, schoolCode, currentPDF, currentPage, startPage
# Output file:  ${outPDF}/<schoolCode>_<source pdf base name>_<startPage>_<page>.pdf
# Returns 0 without doing anything when there is no intermediate PDF.
# Exits the script with status 2 when the output cannot be produced; the
# intermediate PDFs are kept in that case instead of being deleted.
function combinePdfs ()
{
    local -a parts=()
    local part ucount pgNo pdfPartName newPDFName

    for part in "${imgPDF}"/*.pdf
    do
        # Without a match the pattern itself is returned, skip it.
        if [[ -e "$part" ]]
        then
            parts+=("$part")
        fi
    done
    ucount=${#parts[@]}
    if (( ucount < 1 ))
    then
        return 0
    fi

    log "$LINENO $ucount Combining the PDFs"
    for part in "${parts[@]}"
    do
        dbg "$LINENO $part"
    done

    printf -v pgNo '%04d' "${currentPage:-0}"
    pdfPartName=$(basename "$currentPDF" .pdf)
    newPDFName=${schoolCode}_${pdfPartName}_${startPage}_${pgNo}
    log $LINENO "new name [$newPDFName]"

    if (( ucount == 1 ))
    then
        # Only one PDF file, just move it
        if ! mv -- "${parts[0]}" "${outPDF}/${newPDFName}.pdf"
        then
            err "$LINENO Unable to move [${parts[0]}] to [${outPDF}/${newPDFName}.pdf]"
            exit 2
        fi
    elif pdfunite "${parts[@]}" "${outPDF}/${newPDFName}.pdf"
    then
        rm -- "${parts[@]}"
    else
        # Leftover parts would end up in the next output file, so stop here.
        err "$LINENO pdfunite failed for [${newPDFName}.pdf], intermediate PDFs were not deleted"
        exit 2
    fi
}
# List up to a given number of files from the image folder.
# Globals read: imgF
# Arguments:    $1 - maximum number of files to return
# Output:       one line "<count>| <path> <path> ..." on stdout, where count
#               is the number of paths listed ("0|" when the folder is empty).
function getImages ()
{
    local limit=$1
    local count=0
    local list=""
    local entry result

    # The listing is word split on purpose: callers expect a blank separated
    # list of paths.
    for entry in $(ls -1 $imgF)
    do
        list="${list} ${imgF}/${entry}"
        count=$((count + 1))
        if [[ $count -eq $limit ]]
        then
            break
        fi
    done

    result="${count}|${list}"
    # Unquoted so that the words are re-joined with single blanks.
    echo $result
}
# Parse the command line options into the global settings.
while getopts c:f:p:q:o:t:hd args
do
    case "$args" in
        c) ctrlFile="$OPTARG"
            ;;
        f) pdfDir="$OPTARG"
            ;;
        p) pagePerStudent="$OPTARG"
            ;;
        q) imgQlty="$OPTARG"
            ;;
        o) outPDF="$OPTARG"
            ;;
        t) tDir="$OPTARG"
            ;;
        h) help
            exit
            ;;
        d)
            dbgFlag=Y
            ;;
        *)
            # Unknown option or missing option argument: show the usage and
            # stop instead of running with a partly understood command line.
            help
            exit 2
            ;;
    esac
done
# -c, -f, -p, -o and -t are all mandatory. Without -t the work folders
# would be /imgs and /pdf, and without -o there is nowhere to write to.
if [[ -z "$ctrlFile" || -z "$pdfDir" || -z "$pagePerStudent" || -z "$outPDF" || -z "$tDir" ]]
then
    help
    exit 2
fi
if [[ ! -r "${ctrlFile}" ]]
then
    echo "Unable to read input file ${ctrlFile}"
    exit 2
fi
# Control Parameters
# Temp image folder: images extracted from the PDF currently being split.
imgF="${tDir}/imgs"
# Temp pdf folder: intermediate PDFs waiting to be combined.
imgPDF="${tDir}/pdf"
# Make sure that the temporary directory is cleaned out when interrupted.
# trap "rm -rf ${imgF}; exit 2" 1 2 3
# Generate list of PDF files.
dbg $LINENO "Generating pdf list"
# The folder is quoted so that a path containing blanks is not split; the
# list is written one path per line and read back by line number later.
ls -- "${pdfDir}"/*.pdf > pdf.lst
## Program flow:
## Read the control file, get the school code and the student count.
## Calculate the number of pages required by the school.
## Build the PDF file with required number of pages from the images
## available in the $imgF folder.
## If sufficient number of pages(images) are not available, complete current
## pdf file being built and generate new images from the new PDF file.
# Validate the Control file. If errors are detected while processing the pdf
# files, the error recovery would be too complicated. If there are any
# errors, fail it in the beginning.
errors="no"
log $LINENO "Validating control file."
totalSchools=0
totalStudents=0
totalPages=0
# The pages per student value multiplies every record, so a bad value is
# rejected once here instead of being reported against each school.
if [[ ! "$pagePerStudent" =~ ^[0-9]+$ ]] || (( 10#$pagePerStudent == 0 ))
then
    err $LINENO "Pages per student must be a positive integer, got [${pagePerStudent}]"
    exit 2
fi
# The second test keeps a last record that has no trailing newline.
while read -r ctrlInfo || [[ -n "$ctrlInfo" ]]
do
    # Record layout: school code, number of students; further fields are
    # ignored.
    IFS=, read -r schoolCode studentCount _ <<< "$ctrlInfo"
    # Drop blanks around the count, "S1, 30" is accepted.
    read -r studentCount <<< "$studentCount"
    # Only a positive whole number is a valid student count. A count of
    # zero is rejected as well, as it always was by the exit status of expr.
    if [[ ! "$studentCount" =~ ^[0-9]+$ ]] || (( 10#$studentCount == 0 ))
    then
        err "Error while processing school : ${schoolCode}"
        errors="Yes"
        continue
    fi
    # 10# forces decimal, a count such as 010 must not be read as octal.
    pageReq=$(( 10#$studentCount * 10#$pagePerStudent ))
    totalSchools=$(( totalSchools + 1 ))
    totalStudents=$(( totalStudents + 10#$studentCount ))
    totalPages=$(( totalPages + pageReq ))
done < "${ctrlFile}"
if [[ "${errors}" = "Yes" ]]
then
    err $LINENO "Errors detected in the control file, exiting"
    exit 2
fi
# Check for the output directory. This is done before looking for old files
# so that a missing folder is reported as such.
if [[ ! -d "$outPDF" ]]
then
    err "$LINENO : Unable to access output directory $outPDF"
    exit 2
fi
# Check for files from previous runs. If present exit. These files can lead
# to incorrect results. The user has to decide to keep the files or delete.
for outDirs in "$imgPDF" "$imgF" "$outPDF"
do
    # -d counts the entries themselves, an empty sub-folder is a leftover too.
    outFileCount=$(ls -d -- "${outDirs}"/* 2> /dev/null | wc -l)
    dbg $LINENO outFileCount $outFileCount
    if [[ "$outFileCount" -gt 0 ]]
    then
        err $LINENO "Files present in ${outDirs} folder."
        errors="Yes"
    fi
done
if [[ "$errors" = "Yes" ]]
then
    err $LINENO "Files present in temp / output folder. Remove them to proceed."
    exit 2
fi
log $LINENO "Totals schools [$totalSchools], students [${totalStudents}] pages [${totalPages}]"
log $LINENO "Control file validation completed."
# Check for the temporary directories
if [[ -d "$tDir" ]]
then
    # -p accepts folders that already exist, so any message printed here is
    # a real problem (for example missing write permission).
    mkdir -p -- "${imgF}" "${imgPDF}"
fi
for outDirs in "${imgF}" "${imgPDF}"
do
    if [[ ! -d "${outDirs}" ]]
    then
        err "$LINENO Unable to access temp directory ${outDirs}"
        errors="Yes"
    fi
done
if [[ "$errors" = "Yes" ]]
then
    err "$LINENO Unable to access temp directory"
    exit 2
fi
# Number of images placed in one intermediate PDF.
batchSize=10
# Line number in pdf.lst of the next PDF to split.
pdfSeq=1
currentPDF=""
# Number of pages already taken from the PDF that is being consumed.
currentPage=0
# Read the control file and process the schools one by one.
# The second test keeps a last record that has no trailing newline.
while read -r ctrlInfo || [[ -n "$ctrlInfo" ]]
do
    # Record layout: school code, number of students; further fields are
    # ignored.
    IFS=, read -r schoolCode studentCount _ <<< "$ctrlInfo"
    # expr doubles as the validity check for the student count.
    pageReq=$(expr $studentCount \* $pagePerStudent)
    ret=$?
    if [[ "${ret}" -ne 0 ]]
    then
        # Expr failed, mostly due to invalid student count
        err $LINENO "Error while processing school : ${schoolCode}"
        continue
    fi
    # Start page for naming the output pdf files.
    startPage=$(( currentPage + 1 ))
    log $LINENO "Processing School code [$schoolCode], # of students [${studentCount}] total pages [$pageReq]"
    # Get required number of pages from the current school.
    pagesProcessed=0
    while :
    do
        # Calculate the fetch size: a full batch, or whatever is still
        # missing for this school when that is less than a batch.
        rem=$(( pageReq - pagesProcessed ))
        # dbg $LINENO rem $rem
        if [[ $rem -lt $batchSize && $rem -gt 0 ]]
        then
            dbg "$LINENO Changing fetch size to $rem"
            fetchSize=$rem
        else
            fetchSize=$batchSize
        fi
        dbg $LINENO : fetchSize $fetchSize
        # getImages answers "<count>| <path> <path> ...".
        imgRes=$(getImages $fetchSize)
        imgCount=${imgRes%%|*}
        images=${imgRes#*|}
        dbg $LINENO imageCount [$imgCount]
        # dbg $LINENO images $images
        if [[ $imgCount -eq 0 ]]
        then
            # About to start processing new PDF file. Combine the smaller
            # PDFs generated from the previous file.
            log $LINENO Calling combinePdfs
            combinePdfs
            log $LINENO "Splitting new PDF into images"
            splitPdf $pdfSeq
            pdfSeq=$(( pdfSeq + 1 ))
            currentPage=0
            startPage=1 # New pdf, reset page number.
            continue
        fi
        pagesProcessed=$(( pagesProcessed + imgCount ))
        currentPage=$(( currentPage + imgCount ))
        log $LINENO Pages processed : $pagesProcessed
        # convert to mini pdfs
        log $LINENO "Current file [$currentPDF] start page [$startPage] end Page :[$currentPage]"
        log "$LINENO Converting images into PDF."
        # images is a blank separated list and has to be split into words.
        createPdfs $images
        if [[ $pagesProcessed -ge $pageReq ]]
        then
            log "$LINENO pages processed = $pagesProcessed, breaking"
            combinePdfs
            break
        fi
    done
    log "$LINENO Pages Processed $pagesProcessed"
done < "${ctrlFile}"
exit