-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanonymizer.go
129 lines (119 loc) · 2.55 KB
/
anonymizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
package anonymizer
import (
_ "embed"
"iter"
"unicode"
)
// Anonymizer masks the words of a text that are not found in a dictionary
// and replaces digits, substituting configurable placeholder runes.
type Anonymizer struct {
	// Dict is the dictionary consulted to decide whether a word is known.
	Dict *Dict

	// Uppercase is the placeholder written in place of each uppercase
	// letter of a masked word.
	Uppercase rune

	// Lowercase is the placeholder written in place of every other rune
	// of a masked word.
	Lowercase rune

	// Digit is the placeholder written in place of each digit.
	Digit rune
}
// New returns an Anonymizer that uses dict and the default placeholders:
// '█' for uppercase letters, '▄' for lowercase letters and '0' for digits.
//
// If dict is nil, the dictionary returned by MustLoadDict("") is used
// instead (presumably the package's default dictionary; confirm against
// MustLoadDict).
func New(dict *Dict) Anonymizer {
	anon := Anonymizer{
		Dict:      dict,
		Uppercase: '█',
		Lowercase: '▄',
		Digit:     '0',
	}
	if anon.Dict == nil {
		anon.Dict = MustLoadDict("")
	}
	return anon
}
// Anonymize returns a copy of text in which every digit is replaced with
// a.Digit and every word rejected by shouldAnonymize is masked rune by rune
// with the a.Uppercase and a.Lowercase placeholders. All other runes are
// returned unchanged.
func (a Anonymizer) Anonymize(text string) string {
	runes := []rune(text)

	// Digits are replaced first, before the text is split into words.
	for i := range runes {
		if unicode.IsDigit(runes[i]) {
			runes[i] = a.Digit
		}
	}

	// Masking rewrites runes in place, one word at a time.
	for w := range iterWords(runes) {
		if !shouldAnonymize(a.Dict, w) {
			continue
		}
		a.mask(runes, w)
	}
	return string(runes)
}
// mask overwrites, in place, every rune of runes in the half-open range
// [span.start, span.end): uppercase letters become a.Uppercase and every
// other rune becomes a.Lowercase.
func (a Anonymizer) mask(runes []rune, span span) {
	for pos := span.start; pos < span.end; pos++ {
		placeholder := a.Lowercase
		if unicode.IsUpper(runes[pos]) {
			placeholder = a.Uppercase
		}
		runes[pos] = placeholder
	}
}
// shouldAnonymize reports whether the word in span must be masked.
//
// A word starting with an uppercase letter is always masked unless it is the
// first word of a sentence (span.initial); in that case it is looked up with
// its first letter lowercased. Every other word is looked up as is. A word
// is masked when dict.Find does not report it as known.
//
// span.word must not be empty: its first rune is inspected unconditionally.
func shouldAnonymize(dict *Dict, span span) bool {
	candidate := span.word
	if unicode.IsUpper(candidate[0]) {
		if !span.initial {
			// Capitalised in the middle of a sentence: masked without
			// consulting the dictionary.
			return true
		}
		candidate = toLower(candidate)
	}
	_, found := dict.Find(string(candidate))
	return !found
}
// toLower returns a copy of word whose first rune is lowercased. The
// remaining runes are copied unchanged, so "USA" becomes "uSA". The input
// slice is never modified.
//
// An empty or nil word yields nil instead of panicking.
func toLower(word []rune) []rune {
	if len(word) == 0 {
		return nil
	}
	// Allocate once at the exact size instead of growing through append.
	lowered := make([]rune, len(word))
	copy(lowered, word)
	lowered[0] = unicode.ToLower(lowered[0])
	return lowered
}
// span describes one word located inside a slice of runes.
type span struct {
	// start is the index of the word's first rune and end is the index of
	// the first rune after the word, i.e. the word occupies the half-open
	// range [start, end).
	start, end int

	// word is the complete word in its original case.
	word []rune

	// initial is true if the word is the first word of a sentence.
	initial bool
}
// iterWords returns an iterator over the words of runes, in order of
// appearance. A word is a maximal run of letters (unicode.IsLetter); every
// other rune acts as a separator.
//
// Each yielded span carries the word's half-open index range, the word itself
// and whether it starts a sentence. span.word is a sub-slice of runes, not a
// copy, so it shares storage with the input.
//
// A word counts as sentence-initial when it is the first thing in the text
// (preceded only by whitespace) or when only whitespace separates it from the
// preceding sentence terminal (unicode.Sentence_Terminal).
//
// The iterator stops as soon as yield returns false and never calls yield
// again afterwards.
func iterWords(runes []rune) iter.Seq[span] {
	return func(yield func(span) bool) {
		start, end := 0, 0

		// terminal tracks the sentence position of the next word:
		//   -2  nothing but whitespace seen since the start of the text,
		//   -1  in the middle of a sentence,
		//  >=0  index of the latest sentence terminal, followed only by
		//       whitespace so far.
		// The next word is sentence-initial unless terminal is -1.
		terminal := -2

		for i, r := range runes {
			if unicode.IsLetter(r) {
				end = i + 1
				continue
			}

			// r is a separator: emit the word that just ended, if any.
			if start < end {
				keepGoing := yield(span{
					start:   start,
					end:     end,
					word:    runes[start:end],
					initial: terminal != -1,
				})
				if !keepGoing {
					// Return rather than break: start has not advanced
					// yet, so falling through to the trailing-word check
					// below would yield this same span a second time
					// after the consumer asked to stop.
					return
				}
				terminal = -1
			}

			if unicode.In(r, unicode.Sentence_Terminal) {
				terminal = i
			} else if terminal != -1 && !unicode.IsSpace(r) {
				// Anything but whitespace between a sentence terminal (or
				// the start of the text) and the next word cancels the
				// sentence-initial status.
				terminal = -1
			}
			start = i + 1
		}

		// Emit the trailing word when the text ends with a letter.
		if start < end {
			yield(span{
				start:   start,
				end:     end,
				word:    runes[start:end],
				initial: terminal != -1,
			})
		}
	}
}