-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdoc.go
336 lines (260 loc) · 8.46 KB
/
doc.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
//go:generate build/make_js.bash
// Package espeak is a wrapper around espeak-ng that works both natively and in GopherJS with the same API.
// espeak-ng is an open-source text-to-speech library that has over one hundred voices and languages and
// supports Speech Synthesis Markup Language (SSML).
package espeak // import "gopkg.in/BenLubar/espeak.v2"
import (
"errors"
"sync"
"time"
)
// Error is the error type from espeak-ng. It pairs a numeric code with a
// human-readable message; *Error implements the error interface, and its
// Error method reports only Message (prefixed with "espeak: ").
type Error struct {
Code uint32 // Code associated with this error type in the espeak-ng C API.
Message string // Message intended to be read by humans.
}
// Error implements the error interface. The result is the human-readable
// Message prefixed with "espeak: "; Code is not part of the text.
func (err *Error) Error() string {
	const prefix = "espeak: "
	return prefix + err.Message
}
// SampleRate reports how many samples make up one second of the audio
// generated by this package. The value comes from the backend's
// getSampleRate.
func SampleRate() int {
	samplesPerSecond := getSampleRate()
	return samplesPerSecond
}
// lock is acquired by ListVoices, validVoice, and Context.synthesize around
// their calls to listVoices, setVoice, setRate, setVolume, setPitch, setTone,
// and synthesize, so those backend calls never overlap.
// NOTE(review): presumably needed because the underlying espeak-ng state is
// process-global rather than per-Context — confirm against the backend.
var lock sync.Mutex
// Context contains the current state of text to speech data. Multiple Contexts may exist simultaneously,
// but each Context should only be accessed from one goroutine at a time. The zero value of a Context
// is empty with default values for rate, volume, pitch, and tone.
//
// The defaults are applied lazily: the accessors, setters, and SynthesizeText call init, which fills in
// rate, volume, pitch, and tone the first time it runs on a Context.
type Context struct {
// Samples is a slice of audio samples in PCM format. Use the WriteTo method on the context to
// encode Samples as a wav file.
Samples []int16
// Events are generated along with Samples and contain information about placement of words and
// sentences, which may be useful, for example, when generating real time subtitles.
Events []*SynthEvent
rate int // words per minute, 80 to 450; default 175
volume int // percentage of normal volume, min 0; default 100
pitch int // base pitch, 0 to 100; default 50
tone int // pitch range, 0 to 100; 0 is monotone; default 50
// TODO: punctuation?
// TODO: capitals?
// TODO: word gap?
// voice holds the selection criteria stored by SetVoiceProperties and passed to setVoice
// when synthesizing. Fields left at their zero values are ignored during voice selection.
voice struct {
name string
language string
gender Gender
age uint8
variant uint8
}
// isInit records whether init has already applied the default rate, volume, pitch, and tone.
isInit bool
}
// init applies the default rate (175), volume (100), pitch (50), and tone (50)
// the first time it is called on a Context. Later calls are no-ops, so values
// changed through the setters are never overwritten.
func (ctx *Context) init() {
	if ctx.isInit {
		return
	}

	ctx.rate, ctx.volume, ctx.pitch, ctx.tone = 175, 100, 50, 50
	ctx.isInit = true
}
// Rate reports the speaking speed, in words per minute, that this Context
// will use.
//
// A Context that has never been configured reports the default of 175 words
// per minute.
func (ctx *Context) Rate() int {
	// Make sure the defaults are in place before reading the field.
	ctx.init()
	return ctx.rate
}
// Volume reports the loudness of speech as a percentage of the normal volume.
//
// A Context that has never been configured reports the default of 100.
func (ctx *Context) Volume() int {
	// Make sure the defaults are in place before reading the field.
	ctx.init()
	return ctx.volume
}
// Pitch reports how high or low the voice is, on a scale where larger numbers
// mean a higher pitch.
//
// The voice's own pitch corresponds to 50, which is also what a Context that
// has never been configured reports.
func (ctx *Context) Pitch() int {
	// Make sure the defaults are in place before reading the field.
	ctx.init()
	return ctx.pitch
}
// Range reports the pitch range of speech, stored internally as the tone.
//
// The default is 50. A value of 0 produces a monotone voice.
func (ctx *Context) Range() int {
	// Make sure the defaults are in place before reading the field.
	ctx.init()
	return ctx.tone
}
// SetRate sets the speaking speed, in words per minute, used by later
// Synthesize calls on this Context.
//
// wpm must lie between 80 and 450, inclusive. A value outside that range is
// treated as a programming error: SetRate panics before touching the Context.
func (ctx *Context) SetRate(wpm int) {
	const (
		slowest = 80
		fastest = 450
	)
	if wpm < slowest || fastest < wpm {
		panic("espeak: Context.SetRate: wpm must be between 80 and 450")
	}

	ctx.init()
	ctx.rate = wpm
}
// SetVolume sets the loudness of the voice, as a percentage of the default
// volume, used by later Synthesize calls on this Context.
//
// Negative percentages are a programming error and cause a panic before the
// Context is touched. Values above 100 are accepted but may cause distortion
// or clipping.
func (ctx *Context) SetVolume(percentage int) {
	if negative := percentage < 0; negative {
		panic("espeak: Context.SetVolume: percentage must not be negative")
	}

	ctx.init()
	ctx.volume = percentage
}
// SetPitch sets how high or low the voice is for later Synthesize calls on
// this Context.
//
// pitch runs from 0 (very low) to 100 (very high); 50 is the voice's original
// pitch. A value outside that range is a programming error: SetPitch panics
// before touching the Context.
func (ctx *Context) SetPitch(pitch int) {
	const (
		lowest  = 0
		highest = 100
	)
	if pitch < lowest || highest < pitch {
		panic("espeak: Context.SetPitch: pitch must be between 0 and 100")
	}

	ctx.init()
	ctx.pitch = pitch
}
// SetRange sets the pitch range of the voice for later Synthesize calls on
// this Context.
//
// tone runs from 0 (monotone) to 100 (sing-songy); 50 is the voice's original
// range. A value outside that range is a programming error: SetRange panics
// before touching the Context.
func (ctx *Context) SetRange(tone int) {
	if inRange := 0 <= tone && tone <= 100; !inRange {
		panic("espeak: Context.SetRange: tone must be between 0 and 100")
	}

	ctx.init()
	ctx.tone = tone
}
// Voice is a voice supported by espeak. ListVoices returns one *Voice per
// available voice.
type Voice struct {
// Name is the unique name of this voice.
Name string
// Languages lists the languages this voice can speak, each with a priority.
// Lower numbers mean this voice is more likely to be used for the language.
Languages []Language
// Identifier is the filename for this voice within espeak-ng-data/voices.
Identifier string
// Gender of the voice.
Gender Gender
// Age in years, or 0 if not specified.
Age uint8
}
// Language is one language supported by a voice, together with how strongly
// that voice is preferred for it.
type Language struct {
// Priority of the voice for this language. A low number indicates a more preferred voice, and
// a higher number indicates a less preferred voice.
Priority uint8
// Name is the name of the language, which may be in BCP47 format, but is not required to be.
Name string
}
// ListVoices returns every voice supported by espeak. The returned slice is
// not shared, so callers may modify it without any side effects.
//
// The package lock is held while the backend is queried.
func ListVoices() []*Voice {
	lock.Lock()
	defer lock.Unlock()

	voices := listVoices()
	return voices
}
// Gender is the gender of a voice. The zero value is Unknown.
type Gender uint8

// Voice genders. The numeric values 0 through 3 are part of the package's
// API, so keep the order fixed when adding entries.
// NOTE(review): presumably these mirror espeak-ng's own gender codes —
// confirm before renumbering.
const (
	Unknown Gender = iota // 0
	Male                  // 1
	Female                // 2
	Neutral               // 3
)
// SetVoice selects a voice by name for future synthesis on this Context.
//
// An empty name is rejected with an error. Otherwise the call is forwarded to
// SetVoiceProperties with every other criterion left at its zero value, and
// that call's result is returned.
func (ctx *Context) SetVoice(name string) error {
	if len(name) == 0 {
		return errors.New("espeak: missing name in SetVoice")
	}

	const (
		noLanguage = ""
		noAge      = 0
		noVariant  = 0
	)
	return ctx.SetVoiceProperties(name, noLanguage, Unknown, noAge, noVariant)
}
// validVoice checks a voice description by handing it to the backend's
// setVoice while holding the package lock, and returns whatever setVoice
// returns.
// NOTE(review): because the check goes through setVoice, it presumably also
// changes the backend's currently selected voice — confirm in the backend.
func validVoice(name, language string, gender Gender, age, variant uint8) error {
	lock.Lock()
	defer lock.Unlock()

	return setVoice(
		name,
		language,
		gender,
		age,
		variant,
	)
}
// SetVoiceProperties chooses the voice used by future calls to Synthesize.
// Any argument left at its zero value is ignored when matching. Variant
// differentiates between multiple voices when more than one matches the other
// arguments.
//
// The description is validated with validVoice first. If that fails, the error
// is returned and the Context is left unchanged.
func (ctx *Context) SetVoiceProperties(name, language string, gender Gender, age, variant uint8) error {
	err := validVoice(name, language, gender, age, variant)
	if err != nil {
		return err
	}

	ctx.init()

	v := &ctx.voice
	v.name, v.language = name, language
	v.gender, v.age, v.variant = gender, age, variant
	return nil
}
// SynthEventType identifies what kind of occurrence a SynthEvent describes.
type SynthEventType uint8

// Event types. Numbering starts at 1, so the zero value of SynthEventType is
// not a valid event type. Keep the order fixed: the numeric values are part of
// the package's API.
const (
	// EventWord is the start of a word.
	EventWord SynthEventType = iota + 1
	// EventSentence is the start of a sentence.
	EventSentence
	// EventMark is a <mark/> element in SSML.
	EventMark
	// EventPlay is an <audio/> element in SSML.
	EventPlay
	// EventEnd is the end of a sentence or clause.
	EventEnd
	// EventMsgTerminated is the end of the synthesized message.
	EventMsgTerminated
	// EventPhoneme is emitted for each phoneme if enabled.
	EventPhoneme
)
// SynthEvent gives additional information about the generated speech. Which of
// Length, Number, Name, and Phoneme are meaningful depends on Type.
type SynthEvent struct {
// Type of the event.
Type SynthEventType
// TextPosition in characters from the start of the string. Unlike Go indexes, this starts at 1.
TextPosition int
// Length of the word, in characters. (for EventWord)
Length int
// AudioPosition is the time within the generated speech output data.
AudioPosition time.Duration
Number int // Number is used for EventWord and EventSentence.
Name string // Name is used for EventMark and EventPlay.
Phoneme string // Phoneme is used for EventPhoneme.
}
// TODO:
/*
func (ctx *Context) Synthesize(speak *ssml.Speak) error {
ctx.init()
text, err := xml.Marshal(speak)
if err != nil {
return err
}
return ctx.synthesize(string(text))
}
*/
// SynthesizeText converts the given text to speech using this Context's
// current rate, volume, pitch, range, and voice.
//
// Some SSML tags are accepted. All other XML tags are ignored.
func (ctx *Context) SynthesizeText(text string) error {
	// Apply the defaults first so a zero Context is synthesized with the
	// documented settings rather than zeros.
	ctx.init()

	return ctx.synthesize(text)
}
// synthesize pushes this Context's settings to the backend and then renders text.
//
// The package-level lock is held for the whole call. Rate, volume, pitch, tone, and
// voice are applied in that order through setRate, setVolume, setPitch, setTone, and
// setVoice; the first error any of them returns aborts the call before the backend's
// synthesize is invoked. The fields are read as-is, so callers are expected to have run
// ctx.init first (SynthesizeText does).
// NOTE(review): ctx is passed to the backend's synthesize, presumably so the output can
// be stored in ctx.Samples and ctx.Events — confirm in the backend implementation.
func (ctx *Context) synthesize(text string) error {
lock.Lock()
defer lock.Unlock()
// Apply every per-Context setting on each call rather than assuming the backend still
// holds the values from a previous call.
if err := setRate(ctx.rate); err != nil {
return err
}
if err := setVolume(ctx.volume); err != nil {
return err
}
if err := setPitch(ctx.pitch); err != nil {
return err
}
if err := setTone(ctx.tone); err != nil {
return err
}
if err := setVoice(ctx.voice.name, ctx.voice.language, ctx.voice.gender, ctx.voice.age, ctx.voice.variant); err != nil {
return err
}
return synthesize(text, ctx)
}