-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDF.py
137 lines (118 loc) · 3.83 KB
/
PDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import PyPDF2
import re
import tabula
import langid
from collections import Counter
# Get title
def title_pdf(document):
    """Return the '/Title' metadata entry of a PDF, or '' when unavailable.

    ``document`` is the path of the PDF; it is opened in binary mode and
    handed to ``PyPDF2.PdfReader``. An empty string is returned both when
    the reader reports no metadata (``None``) and when the metadata has no
    '/Title' key.
    """
    with open(document, 'rb') as pdf_file:
        info = PyPDF2.PdfReader(pdf_file).metadata
        # Guard clause: no document-information dictionary at all.
        if info is None:
            return ''
        return info.get('/Title', '')
# Count pages
def page_count_pdf(document):
    """Return the number of pages in the PDF located at path ``document``."""
    with open(document, 'rb') as pdf_file:
        page_list = PyPDF2.PdfReader(pdf_file).pages
        return len(page_list)
# Count words
def word_count_pdf(document):
    """Return the total number of whitespace-separated words in a PDF.

    The text of every page is extracted with ``extract_text()`` and split
    on whitespace; the per-page word counts are summed.
    """
    with open(document, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        return sum(len(page.extract_text().split()) for page in pdf.pages)
# Extract keywords
import re
import PyPDF2
from collections import Counter
def _rank_keywords(text, top_n):
    """Return the ``top_n`` most frequent multi-character, non-numeric words in ``text``."""
    if top_n <= 0:
        return []
    # Lower-case, then collapse every run of non-word characters (regex \W+)
    # into a single space so that split() yields bare word tokens.
    tokens = re.sub(r'\W+', ' ', text.lower()).split()
    # Filter once, before counting: single-character tokens and purely
    # numeric tokens are never eligible as keywords.
    frequencies = Counter(
        token for token in tokens
        if len(token) > 1 and not token.isnumeric()
    )
    # most_common() orders by descending count; equal counts keep
    # first-seen order.
    return [token for token, _count in frequencies.most_common(top_n)]


def keywords_pdf(document_path, top_n=3):
    """Return the most frequent words of a PDF as a list of keywords.

    Args:
        document_path: Path of the PDF file; opened in binary mode.
        top_n: Maximum number of keywords to return. Defaults to 3; a
            non-positive value yields an empty list.

    Returns:
        A list of at most ``top_n`` lower-cased words, most frequent first.
        Words of length 1 and words made only of numeric characters are
        excluded.
    """
    with open(document_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        text = "".join(page.extract_text() for page in reader.pages)
    return _rank_keywords(text, top_n)
# Count characters
def character_count_pdf(document):
    """Return the total length of the text extracted from every page of a PDF.

    The count is the sum of ``len(page.extract_text())`` over all pages, so
    it includes whatever whitespace the extraction produces.
    """
    with open(document, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        return sum(len(page.extract_text()) for page in pdf.pages)
# Get typography
def typography_pdf(document):
    """Return the distinct '/BaseFont' names declared in a PDF's page resources.

    Args:
        document: Path of the PDF file; opened in binary mode.

    Returns:
        A list of the unique '/BaseFont' values found in the '/Font'
        dictionary of each page's '/Resources', sorted by their string
        form so the result is deterministic. Pages without '/Resources'
        or without '/Font', and font entries without '/BaseFont', are
        skipped.
    """
    base_fonts = set()
    with open(document, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            # A page may carry no resource dictionary at all; indexing it
            # unconditionally would raise KeyError.
            if '/Resources' not in page:
                continue
            resources = page['/Resources']
            if '/Font' not in resources:
                continue
            page_fonts = resources['/Font']
            for font_key in page_fonts:
                font = page_fonts[font_key]
                if '/BaseFont' in font:
                    base_fonts.add(font['/BaseFont'])
    return sorted(base_fonts, key=str)
# Count images
def image_count_pdf(document):
    """Count the image XObjects listed in the page resources of a PDF.

    Args:
        document: Path of the PDF file; opened in binary mode.

    Returns:
        The number of entries in each page's '/Resources' -> '/XObject'
        dictionary whose '/Subtype' equals '/Image', summed over all pages.
        Only the page-level XObject dictionary is inspected, and an entry
        is counted once for every page that lists it.
    """
    image_total = 0
    with open(document, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            # A page may carry no resource dictionary at all; indexing it
            # unconditionally would raise KeyError.
            if '/Resources' not in page:
                continue
            resources = page['/Resources']
            if '/XObject' not in resources:
                continue
            x_objects = resources['/XObject']
            for name in x_objects:
                x_object = x_objects[name]
                # Skip entries with no '/Subtype' instead of failing on them.
                if '/Subtype' in x_object and x_object['/Subtype'] == '/Image':
                    image_total += 1
    return image_total
# Count tables
def table_count_pdf(document):
    """Return how many tables ``tabula.read_pdf`` reports for a PDF.

    All pages are scanned with ``multiple_tables=True`` and header
    inference disabled (``pandas_options={'header': None}``); the result
    is the length of the collection tabula returns.
    """
    read_options = {
        'pages': 'all',
        'multiple_tables': True,
        'pandas_options': {'header': None},
    }
    detected = tabula.read_pdf(document, **read_options)
    return len(detected)
# Detect language
def language_pdf(document):
    """Return the language label ``langid`` assigns to a PDF's text.

    The text of every page is concatenated in page order and passed to
    ``langid.classify``; the first element of its result is returned.
    """
    with open(document, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        full_text = "".join(page.extract_text() for page in pdf.pages)
        return langid.classify(full_text)[0]
# Detect encryption
def encryption_pdf(document):
    """Return True when ``PdfReader.is_encrypted`` is truthy for the PDF, else False."""
    with open(document, 'rb') as pdf_file:
        return bool(PyPDF2.PdfReader(pdf_file).is_encrypted)
# Get author
def author_pdf(document):
    """Return the '/Author' metadata entry of a PDF, or '' when unavailable.

    ``document`` is the path of the PDF; it is opened in binary mode. An
    empty string is returned when the reader's metadata is missing or
    empty, or when it has no '/Author' key.
    """
    with open(document, 'rb') as pdf_file:
        info = PyPDF2.PdfReader(pdf_file).metadata
        return info.get('/Author', '') if info else ""