-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKanji_splitter.go
134 lines (116 loc) · 3.92 KB
/
Kanji_splitter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
package kanatrans
import (
"strings"
"unicode"
)
// KanjiSplitter holds the callbacks used to process a string segment by
// segment. There is one callback per segment category: kanji (Han), kana
// (hiragana and katakana), Roman/other text, and punctuation.
type KanjiSplitter struct {
	// Each callback receives one segment and returns its replacement text.
	kanjiCallback, kanaCallback, romanCallback, punctCallback func(string) string
}
// NewKanjiSplitter returns a KanjiSplitter that uses the given callbacks for
// kanji, kana, Roman/other and punctuation segments respectively.
// The callbacks are stored as given; they are not checked for nil here.
func NewKanjiSplitter(kanjiCallback, kanaCallback, romanCallback, punctCallback func(string) string) *KanjiSplitter {
	splitter := new(KanjiSplitter)
	splitter.kanjiCallback = kanjiCallback
	splitter.kanaCallback = kanaCallback
	splitter.romanCallback = romanCallback
	splitter.punctCallback = punctCallback
	return splitter
}
// SeparateAndProcess splits input into segments and returns the
// concatenation, in input order, of each segment's callback output.
//
// A segment is routed by the first check it satisfies: kana (hiragana or
// katakana) goes to kanaCallback, Han goes to kanjiCallback, punctuation goes
// to punctCallback, and anything else goes to romanCallback.
func (ks *KanjiSplitter) SeparateAndProcess(input string) string {
	var out strings.Builder
	for _, seg := range ks.separateRomanJapaneseAndHan(input) {
		// Case order matters: a segment matching several checks is handled
		// by the earliest one.
		var process func(string) string
		switch {
		case ks.isJapanese(seg):
			process = ks.kanaCallback
		case ks.isHan(seg):
			process = ks.kanjiCallback
		case ks.isPunctuation(seg):
			process = ks.punctCallback
		default:
			process = ks.romanCallback
		}
		out.WriteString(process(seg))
	}
	return out.String()
}
// separateRomanJapaneseAndHan splits input into maximal runs of consecutive
// runes that share the same segmentType, preserving input order.
// It returns nil for an empty input.
func (ks *KanjiSplitter) separateRomanJapaneseAndHan(input string) []string {
	var (
		segments []string
		current  strings.Builder
	)
	// lastType needs no seeding from the first rune: the builder is empty on
	// the first iteration, so no segment can be flushed before one exists.
	// Indexing input[0] to seed it would panic on an empty string and would
	// read a single byte rather than the first rune.
	lastType := -1
	for _, char := range input {
		currentType := ks.segmentType(char)
		if currentType != lastType && current.Len() > 0 {
			// Category changed: close the run collected so far.
			segments = append(segments, current.String())
			current.Reset()
		}
		current.WriteRune(char)
		lastType = currentType
	}
	// Flush the trailing run, if any.
	if current.Len() > 0 {
		segments = append(segments, current.String())
	}
	return segments
}
// segmentType classifies a single rune for segmentation and returns:
//
//	1 for kana (hiragana or katakana),
//	2 for Han (kanji),
//	3 for punctuation as defined by isPunctuation,
//	0 for Roman or any other character.
//
// Kana and Han are detected with the same unicode range tables that
// isJapanese and isHan use, so the segmenter and the per-segment checks
// agree. Hard-coded ranges such as 'あ'..'ん' miss all katakana and some
// hiragana (for example 'ぁ'), which lets those runes merge into neighbouring
// Roman text.
func (ks *KanjiSplitter) segmentType(char rune) int {
	switch {
	case unicode.In(char, unicode.Hiragana, unicode.Katakana):
		return 1 // Japanese kana
	case unicode.Is(unicode.Han, char):
		return 2 // Han (kanji)
	case ks.isPunctuation(string(char)):
		return 3 // Punctuation
	default:
		return 0 // Roman or other
	}
}
// isJapanese reports whether s contains at least one hiragana or katakana
// rune.
func (ks *KanjiSplitter) isJapanese(s string) bool {
	for _, r := range s {
		if unicode.In(r, unicode.Hiragana, unicode.Katakana) {
			return true
		}
	}
	return false
}
// isHan reports whether s contains at least one rune in the Han script.
func (ks *KanjiSplitter) isHan(s string) bool {
	found := false
	for _, r := range s {
		if unicode.In(r, unicode.Han) {
			found = true
			break
		}
	}
	return found
}
// isPunctuation reports whether s contains at least one character from the
// fixed punctuation set below, which mixes ASCII marks with Japanese ones.
func (ks *KanjiSplitter) isPunctuation(s string) bool {
	const punctuationChars = " ?!;:-~,.'?!;:〜ー、。 "
	return strings.ContainsAny(s, punctuationChars)
}