-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearchpdf.php
77 lines (66 loc) · 3.49 KB
/
searchpdf.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
<?php
/*
 * Searches the text of one library PDF and reports where the matches are.
 *
 * GET parameters:
 *   file        - PDF file name inside the "library" directory that sits next
 *                 to this script.
 *   search_term - text to find (case-insensitive). A question mark or an
 *                 asterisk wrapped in angle brackets acts as a one-character
 *                 or any-length wildcard respectively.
 *
 * Output: a JSON object keyed by 1-based page number. Each page holds a list
 * of matching text rows with 'top', 'left', 'height' and 'width' expressed as
 * percentages of the page dimensions, plus the row 'text'.
 * On failure a JSON object of the form {"Error":"..."} is printed and the
 * script stops.
 */
include_once 'data.php';
include_once 'functions.php';

// Nothing below reads or writes the session, so close it before running the
// external converters.
session_write_close();

// --- Validate the requested PDF -------------------------------------------
if (empty($_GET['file']) || !is_string($_GET['file'])) {
    die('{"Error":"No PDF provided!"}');
}
// Keep only digits, dots and the letters p, d, f; this also removes any
// directory separators from the name.
$file_name = preg_replace('/[^\d\.pdf]/', '', $_GET['file']);
$file = dirname(__FILE__).DIRECTORY_SEPARATOR.'library'.DIRECTORY_SEPARATOR.$file_name;
// is_file() rather than file_exists(): an empty or dots-only name resolves to
// a directory, which must not be handed to the converters.
if (!is_file($file)) {
    die('{"Error":"PDF does not exist!"}');
}

// --- Build the search pattern ---------------------------------------------
if (empty($_GET['search_term']) || !is_string($_GET['search_term'])) {
    die('{"Error":"No search term provided!"}');
}
// Backslash-escape regex metacharacters (including the "/" delimiter) ...
$search_term = addcslashes($_GET['search_term'], "\044\050..\053\056\057\074\076\077\133\134\136\173\174");
// ... then turn the two (now escaped) wildcard tokens into regex wildcards.
$search_term = str_replace('\<\?\>', '.', $search_term);
$search_term = str_replace('\<\*\>', '.*', $search_term);
$pattern = "/$search_term/ui";

// --- Extract (or reuse cached) plain text ---------------------------------
// NOTE(review): $temp_dir is not defined in this file; presumably it is set
// by data.php - confirm.
$temp_base = $temp_dir.DIRECTORY_SEPARATOR.'i-librarian'.DIRECTORY_SEPARATOR.$file_name;
$temp_file = $temp_base.'.txt';
$pdf_mtime = filemtime($file);

if (!file_exists($temp_file) || filemtime($temp_file) < $pdf_mtime) {
    system(select_pdftotext().'-layout '.escapeshellarg($file).' '.escapeshellarg($temp_file));
}

// Guard the read so a failed conversion cannot leak a PHP warning into the
// JSON response.
$string = is_file($temp_file) ? file_get_contents($temp_file) : '';
if (empty($string)) {
    die('{"Error":"PDF to text conversion failed!"}');
}

// The extracted text is split on form feed characters, one chunk per page.
$pages = explode("\f", $string);

// Collect XML parse problems internally instead of emitting warnings.
libxml_use_internal_errors(true);

$final_pages = array();

foreach ($pages as $page_index => $page_str) {

    // Only pages whose plain text matches are converted to XML.
    if (!(preg_match($pattern, $page_str) > 0)) {
        continue;
    }

    $page_no  = $page_index + 1;            // converters and output are 1-based
    $xml_stem = $temp_base.$page_index;     // pdftohtml is given the name without ".xml"
    $xml_file = $xml_stem.'.xml';

    if (!file_exists($xml_file) || filemtime($xml_file) < $pdf_mtime) {
        system(select_pdftohtml().' -q -dev -noframes -enc UTF-8 -nomerge -c -xml -f '.$page_no.' -l '.$page_no.' '.escapeshellarg($file).' '.escapeshellarg($xml_stem));
    }

    if (!file_exists($xml_file)) {
        die('{"Error":"PDF to XML conversion failed!"}');
    }

    // Clean the XML text before parsing: byte-level replacement, whitespace
    // collapsing, lower-casing, then restoring the DOCTYPE that lower-casing
    // altered.
    $string = file_get_contents($xml_file);
    $string = preg_replace('/[^[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2}]/', ' ', $string);
    $string = preg_replace('/\s{2,}/ui', ' ', $string);
    $string = strtolower((string) $string);
    $string = str_replace('<!doctype pdf2xml system "pdf2xml.dtd">', '<!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd">', $string);

    $xml = simplexml_load_string($string);
    if (!$xml) {
        die('{"Error":"Invalid XML!"}');
    }

    // Page dimensions; a missing attribute reads as 0.
    $page_height = (float) (string) $xml->page['height'];
    $page_width  = (float) (string) $xml->page['width'];
    if ($page_height <= 0 || $page_width <= 0) {
        // Without usable dimensions the percentages below would divide by zero.
        die('{"Error":"Invalid XML!"}');
    }

    foreach ($xml->page->text as $text_node) {
        $row = strip_tags($text_node->asXML());
        if (!(preg_match($pattern, $row) > 0)) {
            continue;
        }
        // Read each row's own attributes so values never carry over from a
        // previously matched row.
        $final_pages[$page_no][] = array(
            'top'    => 100 * round(((float) (string) $text_node['top']) / $page_height, 3),
            'left'   => 100 * round(((float) (string) $text_node['left']) / $page_width, 3) - 0.5,
            'height' => 100 * round(((float) (string) $text_node['height']) / $page_height, 3),
            'width'  => 100 * round(((float) (string) $text_node['width']) / $page_width, 3),
            'text'   => $row
        );
    }
}

print json_encode($final_pages, JSON_FORCE_OBJECT);