-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
116 lines (86 loc) · 2.69 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
package main
import (
"fmt"
. "github.com/pdftron/pdftron-go/v2"
)
// Input documents opened by the demos in this file.
const (
	// filename is the PDF opened by main and by readTextFromFile.
	filename = "PDFTRON-test-pdf.pdf"
	// beeFilename is the PDF opened by readTextOnASentenceBasis.
	beeFilename = "bee-movie-script.pdf"
)
// Sentences groups a sentence string with a metadata string.
//
// NOTE(review): the meaning of each field is inferred from its name only; no
// code visible in this file reads or writes the type — confirm intended use.
type Sentences struct {
	stringSentence, metadata string
}
// main initializes the PDFTron SDK, prints the file name reported for the
// sample PDF, and then runs the three demo routines in order.
func main() {
	// NOTE(review): the license key is embedded in source; consider loading
	// it from configuration instead.
	const licenseKey = "demo:1681479144517:7df72d550300000000cd5af736e910cc63e454cadb25e0d06c2b7147ee"
	PDFNetInitialize(licenseKey)

	// Open the sample document from the filesystem just long enough to print
	// the file name it reports, then close it again.
	sample := NewPDFDoc(filename)
	fmt.Println(sample.GetFileName())
	sample.Close()

	demos := []func(){
		// Word-by-word text extraction, based on the example at
		// https://docs.apryse.com/documentation/linux/guides/features/extraction/text-extract/
		readTextFromFile,
		// Text extraction grouped into sentences.
		readTextOnASentenceBasis,
		// Conversion of a .txt file to PDF.
		openTXTFile,
	}
	for _, run := range demos {
		run()
	}
}
// readTextFromFile opens the sample PDF (filename), runs a TextExtractor over
// page 1, and prints every extracted word on its own line, followed by an
// empty line after each extracted line of text.
//
// The document is closed with defer so it is released on every exit path,
// not only when the loops run to completion.
func readTextFromFile() {
	doc := NewPDFDoc(filename)
	defer doc.Close()

	page := doc.GetPage(1)
	txt := NewTextExtractor()
	txt.Begin(page) // read the page

	// Walk the extractor's lines, and the words within each line. The loop
	// variables are initialised straight from the extractor, so no placeholder
	// Word needs to be allocated up front.
	for line := txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine() {
		for word := line.GetFirstWord(); word.IsValid(); word = word.GetNextWord() {
			// GetString returns the word's text.
			fmt.Println("This is the current word: ", word.GetString())
		}
		// Blank separator between extracted lines; cosmetic only.
		fmt.Println()
	}
}
// readTextOnASentenceBasis opens the PDF named by beeFilename, extracts the
// words of page 1, and prints them grouped into sentences. Words are joined
// with a trailing space; a sentence is emitted whenever a word ends in '.',
// '?' or '!'.
//
// Any words left over after the last terminator are printed as a final,
// unterminated sentence instead of being discarded. The document is closed
// with defer so it is released on every exit path.
func readTextOnASentenceBasis() {
	doc := NewPDFDoc(beeFilename)
	defer doc.Close()

	page := doc.GetPage(1)
	txt := NewTextExtractor()
	txt.Begin(page) // read the page

	// sentence accumulates the current sentence; the buffer is reused between
	// sentences instead of re-concatenating strings for every word.
	var sentence []byte

	for line := txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine() {
		for word := line.GetFirstWord(); word.IsValid(); word = word.GetNextWord() {
			// GetString returns the word's text.
			text := word.GetString()
			sentence = append(sentence, text...)
			sentence = append(sentence, ' ')

			if endsSentence(text) {
				fmt.Println("Final sentence: ", string(sentence))
				sentence = sentence[:0]
			}
		}
	}

	// Flush text that followed the last sentence terminator on the page.
	if len(sentence) > 0 {
		fmt.Println("Final sentence: ", string(sentence))
	}
}

// endsSentence reports whether word ends in '.', '?' or '!'. An empty word
// never ends a sentence, so it is safe to call on any extracted string.
func endsSentence(word string) bool {
	if word == "" {
		return false
	}
	switch word[len(word)-1] {
	case '.', '?', '!':
		return true
	}
	return false
}
// openTXTFile converts "pdftron-txt-file.txt" into a PDF and saves the result
// as "pdftron-txt-to-pdf-file.pdf".
//
// The document is closed with defer once saving is done, matching how the
// other routines in this file release the documents they open.
func openTXTFile() {
	doc := NewPDFDoc()
	defer doc.Close()

	txtFile := "pdftron-txt-file.txt"

	// Converting txt to pdf. Maybe a bit overkill to convert .txt to pdf.
	ConvertFromText(doc, txtFile)

	// The second argument is a bitwise disjunction of flags used as options
	// during serialization.
	doc.Save("pdftron-txt-to-pdf-file.pdf", uint(SDFDocE_remove_unused))
}