diff --git a/Cargo.toml b/Cargo.toml index ca49120..346d622 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "jbonsai" version = "0.1.0" edition = "2021" -rust-version = "1.65.0" +rust-version = "1.75.0" [features] default = ["htsvoice"] diff --git a/benches/bonsais.rs b/benches/bonsais.rs index 270ee20..ed2d3b6 100644 --- a/benches/bonsais.rs +++ b/benches/bonsais.rs @@ -11,7 +11,7 @@ const MODEL_NITECH_ATR503: &str = #[bench] fn bonsai(bencher: &mut Bencher) { // 盆栽,名詞,一般,*,*,*,*,盆栽,ボンサイ,ボンサイ,0/4,C2 - let lines: Vec = [ + let lines = [ "xx^xx-sil+b=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:4_4%0_xx_xx/H:xx_xx/I:xx-xx@xx+xx&xx-xx|xx+xx/J:1_4/K:1+1-4", "xx^sil-b+o=N/A:-3+1+4/B:xx-xx_xx/C:02_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:4_4#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:xx_xx/I:1-4@1+1&1-1|1+4/J:xx_xx/K:1+1-4", "sil^b-o+N=s/A:-3+1+4/B:xx-xx_xx/C:02_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:4_4#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:xx_xx/I:1-4@1+1&1-1|1+4/J:xx_xx/K:1+1-4", @@ -20,9 +20,9 @@ fn bonsai(bencher: &mut Bencher) { "N^s-a+i=sil/A:-1+3+2/B:xx-xx_xx/C:02_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:4_4#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:xx_xx/I:1-4@1+1&1-1|1+4/J:xx_xx/K:1+1-4", "s^a-i+sil=xx/A:0+4+1/B:xx-xx_xx/C:02_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:4_4#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:xx_xx/I:1-4@1+1&1-1|1+4/J:xx_xx/K:1+1-4", "a^i-sil+xx=xx/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:4_4!0_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:xx_xx%xx_xx_xx/H:1_4/I:xx-xx@xx+xx&xx-xx|xx+xx/J:xx_xx/K:1+1-4", - ].iter().map(|l| l.to_string()).collect(); + ]; - let engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); + let engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); bencher.iter(|| { engine.synthesize_from_strings(&lines).unwrap(); @@ -37,7 +37,7 @@ fn is_bonsai(bencher: &mut Bencher) { // です,助動詞,*,*,*,特殊・デス,基本形,です,デス,デス’,1/2,動詞%F1/形容詞%F2/名詞%F2@1,1 // か,助詞,副助詞/並立助詞/終助詞,*,*,*,*,か,カ,カ,0/1,動詞%F2/形容詞%F2/名詞%F1,1 // ?,記号,一般,*,*,*,*,?,?,?,0/0,*,0 - let lines: Vec = [ + let lines = [ "xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:3_3%0_xx_xx/H:xx_xx/I:xx-xx@xx+xx&xx-xx|xx+xx/J:2_10/K:1+2-10", "xx^sil-k+o=r/A:-2+1+3/B:xx-xx_xx/C:04_xx+xx/D:24+xx_xx/E:xx_xx!xx_xx-xx/F:3_3#0_xx@1_2|1_10/G:7_5%1_xx_1/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", "sil^k-o+r=e/A:-2+1+3/B:xx-xx_xx/C:04_xx+xx/D:24+xx_xx/E:xx_xx!xx_xx-xx/F:3_3#0_xx@1_2|1_10/G:7_5%1_xx_1/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", @@ -58,9 +58,9 @@ fn is_bonsai(bencher: &mut Bencher) { "s^U-k+a=sil/A:2+7+1/B:10-7_2/C:23_xx+xx/D:xx+xx_xx/E:3_3!0_xx-1/F:7_5#1_xx@2_1|4_7/G:xx_xx%xx_xx_xx/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", "U^k-a+sil=xx/A:2+7+1/B:10-7_2/C:23_xx+xx/D:xx+xx_xx/E:3_3!0_xx-1/F:7_5#1_xx@2_1|4_7/G:xx_xx%xx_xx_xx/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", "k^a-sil+xx=xx/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:7_5!1_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:xx_xx%xx_xx_xx/H:2_10/I:xx-xx@xx+xx&xx-xx|xx+xx/J:xx_xx/K:1+2-10", - ].iter().map(|l| l.to_string()).collect(); + ]; - let engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); + let engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); bencher.iter(|| { engine.synthesize_from_strings(&lines).unwrap(); @@ -86,7 +86,7 @@ fn bonsai_letter(bencher: &mut Bencher) { // あっ,動詞,非自立,*,*,五段・ラ行,連用タ接続,あっ,アッ,アッ,1/2,*,0 // た,助動詞,*,*,*,特殊・タ,基本形,た,タ,タ,0/1,動詞%F2@1/形容詞%F4@-2,1 // 。,記号,句点,*,*,*,*,。,、,、,0/0,*,0 - let lines: Vec = [ + let lines = [ "xx^xx-sil+t=e/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:4_4%0_xx_xx/H:xx_xx/I:xx-xx@xx+xx&xx-xx|xx+xx/J:6_24/K:1+6-24", "xx^sil-t+e=g/A:-3+1+4/B:xx-xx_xx/C:02_xx+xx/D:23+xx_xx/E:xx_xx!xx_xx-xx/F:4_4#0_xx@1_6|1_24/G:6_2%0_xx_1/H:xx_xx/I:6-24@1+1&1-6|1+24/J:xx_xx/K:1+6-24", "sil^t-e+g=a/A:-3+1+4/B:xx-xx_xx/C:02_xx+xx/D:23+xx_xx/E:xx_xx!xx_xx-xx/F:4_4#0_xx@1_6|1_24/G:6_2%0_xx_1/H:xx_xx/I:6-24@1+1&1-6|1+24/J:xx_xx/K:1+6-24", @@ -130,9 +130,9 @@ fn bonsai_letter(bencher: &mut Bencher) { "a^cl-t+a=sil/A:2+3+1/B:17-1_1/C:10_7+2/D:xx+xx_xx/E:3_1!0_xx-1/F:3_1#0_xx@6_1|22_3/G:xx_xx%xx_xx_xx/H:xx_xx/I:6-24@1+1&1-6|1+24/J:xx_xx/K:1+6-24", "cl^t-a+sil=xx/A:2+3+1/B:17-1_1/C:10_7+2/D:xx+xx_xx/E:3_1!0_xx-1/F:3_1#0_xx@6_1|22_3/G:xx_xx%xx_xx_xx/H:xx_xx/I:6-24@1+1&1-6|1+24/J:xx_xx/K:1+6-24", "t^a-sil+xx=xx/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:3_1!0_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:xx_xx%xx_xx_xx/H:6_24/I:xx-xx@xx+xx&xx-xx|xx+xx/J:xx_xx/K:1+6-24", - ].iter().map(|l| l.to_string()).collect(); + ]; - let engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); + let engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); bencher.iter(|| { engine.synthesize_from_strings(&lines).unwrap(); diff --git a/examples/genji/main.rs b/examples/genji/main.rs index 1079647..f66c337 100644 --- a/examples/genji/main.rs +++ b/examples/genji/main.rs @@ -1,18 +1,18 @@ -use jbonsai::{engine::Engine, model::interporation_weight::Weights}; +use jbonsai::engine::Engine; fn main() -> Result<(), Box> { let label_str = std::fs::read_to_string("examples/genji/genji.lab")?; - let lines: Vec = label_str.lines().map(|s| s.to_string()).collect(); - let mut engine = Engine::load(&vec![ + let lines: Vec<_> = label_str.lines().collect(); + let mut engine = Engine::load(&[ "models/tohoku-f01/tohoku-f01-sad.htsvoice", "models/tohoku-f01/tohoku-f01-happy.htsvoice", ])?; let iw = engine.condition.get_interporation_weight_mut(); - iw.set_duration(Weights::new(&[0.5, 0.5])?)?; - iw.set_parameter(0, Weights::new(&[0.5, 0.5])?)?; - iw.set_parameter(1, Weights::new(&[0.5, 0.5])?)?; - iw.set_parameter(2, Weights::new(&[1.0, 0.0])?)?; + iw.set_duration(&[0.5, 0.5])?; + iw.set_parameter(0, &[0.5, 0.5])?; + iw.set_parameter(1, &[0.5, 0.5])?; + iw.set_parameter(2, &[1.0, 0.0])?; let speech = engine.synthesize_from_strings(&lines)?; diff --git a/examples/is-bonsai/main.rs b/examples/is-bonsai/main.rs index eadeaf7..cb4747b 100644 --- a/examples/is-bonsai/main.rs +++ b/examples/is-bonsai/main.rs @@ -1,7 +1,7 @@ use jbonsai::engine::Engine; fn main() -> Result<(), Box> { - let lines: Vec = [ + let lines = [ "xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:3_3%0_xx_xx/H:xx_xx/I:xx-xx@xx+xx&xx-xx|xx+xx/J:2_10/K:1+2-10", "xx^sil-k+o=r/A:-2+1+3/B:xx-xx_xx/C:04_xx+xx/D:24+xx_xx/E:xx_xx!xx_xx-xx/F:3_3#0_xx@1_2|1_10/G:7_5%1_xx_1/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", "sil^k-o+r=e/A:-2+1+3/B:xx-xx_xx/C:04_xx+xx/D:24+xx_xx/E:xx_xx!xx_xx-xx/F:3_3#0_xx@1_2|1_10/G:7_5%1_xx_1/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", @@ -22,10 +22,10 @@ fn main() -> Result<(), Box> { "s^U-k+a=sil/A:2+7+1/B:10-7_2/C:23_xx+xx/D:xx+xx_xx/E:3_3!0_xx-1/F:7_5#1_xx@2_1|4_7/G:xx_xx%xx_xx_xx/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", "U^k-a+sil=xx/A:2+7+1/B:10-7_2/C:23_xx+xx/D:xx+xx_xx/E:3_3!0_xx-1/F:7_5#1_xx@2_1|4_7/G:xx_xx%xx_xx_xx/H:xx_xx/I:2-10@1+1&1-2|1+10/J:xx_xx/K:1+2-10", "k^a-sil+xx=xx/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:7_5!1_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:xx_xx%xx_xx_xx/H:2_10/I:xx-xx@xx+xx&xx-xx|xx+xx/J:xx_xx/K:1+2-10", -].iter().map(|l| l.to_string()).collect(); +]; - let engine = Engine::load(&vec![ - "models/hts_voice_nitech_jp_atr503_m001-1.05/nitech_jp_atr503_m001.htsvoice".to_string(), + let engine = Engine::load(&[ + "models/hts_voice_nitech_jp_atr503_m001-1.05/nitech_jp_atr503_m001.htsvoice", ])?; let speech = engine.synthesize_from_strings(&lines)?; diff --git a/src/constants.rs b/src/constants.rs index 459ac96..dcce5ec 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -1,6 +1,3 @@ -pub const MAX_F0: f64 = 20000.0; -pub const MIN_F0: f64 = 20.0; - /// log(MAX_F0) = log(20000.0) pub const MAX_LF0: f64 = 9.903_487_552_536_127; /// log(MIN_F0) = log(20.0) diff --git a/src/duration.rs b/src/duration.rs index 8496d25..4eea3b5 100644 --- a/src/duration.rs +++ b/src/duration.rs @@ -1,25 +1,31 @@ -use crate::model::Models; +use crate::model::MeanVari; -pub struct DurationEstimator; +pub struct DurationEstimator { + parameters: Vec, + nstate: usize, +} impl DurationEstimator { - pub fn create(&self, models: &Models, speed: f64) -> Vec { - let duration_params = models.duration(); + pub fn new(duration: Vec, nstate: usize) -> Self { + Self { + parameters: duration, + nstate, + } + } + pub fn create(&self, speed: f64) -> Vec { // determine frame length - let mut duration = Self::estimate_duration(&duration_params, 0.0); + let mut duration = Self::estimate_duration(&self.parameters, 0.0); if speed != 1.0 { let length: usize = duration.iter().sum(); duration = - Self::estimate_duration_with_frame_length(&duration_params, length as f64 / speed); + Self::estimate_duration_with_frame_length(&self.parameters, length as f64 / speed); } duration } - pub fn create_with_alignment(&self, models: &Models, times: &[(f64, f64)]) -> Vec { - let duration_params = models.duration(); - + pub fn create_with_alignment(&self, times: &[(f64, f64)]) -> Vec { // determine state duration let mut duration = vec![]; // use duration set by user @@ -29,32 +35,32 @@ impl DurationEstimator { for (i, (_start_frame, end_frame)) in times.iter().enumerate() { if *end_frame >= 0.0 { let curr_duration = Self::estimate_duration_with_frame_length( - &duration_params[next_state..state + models.nstate()], + &self.parameters[next_state..state + self.nstate], end_frame - frame_count as f64, ); frame_count += curr_duration.iter().sum::(); - next_state = state + models.nstate(); + next_state = state + self.nstate; duration.extend_from_slice(&curr_duration); } else if i + 1 == times.len() { eprintln!("HTS_SStreamSet_create: The time of final label is not specified."); - Self::estimate_duration(&duration_params[next_state..state + models.nstate()], 0.0); + Self::estimate_duration(&self.parameters[next_state..state + self.nstate], 0.0); } - state += models.nstate(); + state += self.nstate; } duration } /// Estimate state duration - fn estimate_duration(duration_params: &[(f64, f64)], rho: f64) -> Vec { + fn estimate_duration(duration_params: &[MeanVari], rho: f64) -> Vec { duration_params .iter() - .map(|(mean, vari)| (mean + rho * vari).round().max(1.0) as usize) + .map(|MeanVari(mean, vari)| (mean + rho * vari).round().max(1.0) as usize) .collect() } /// Estimate duration from state duration probability distribution and specified frame length fn estimate_duration_with_frame_length( - duration_params: &[(f64, f64)], + duration_params: &[MeanVari], frame_length: f64, ) -> Vec { let size = duration_params.len(); @@ -68,11 +74,7 @@ impl DurationEstimator { } // RHO calculation - let (mean, vari) = duration_params - .iter() - .fold((0.0, 0.0), |(mean, vari), curr| { - (mean + curr.0, vari + curr.1) - }); + let MeanVari(mean, vari) = duration_params.iter().sum(); let rho = (target_length as f64 - mean) / vari; let mut duration = Self::estimate_duration(duration_params, rho); @@ -80,7 +82,7 @@ impl DurationEstimator { // loop estimation let mut sum: usize = duration.iter().sum(); let calculate_cost = - |d: usize, (mean, vari): (f64, f64)| (rho - (d as f64 - mean) / vari).abs(); + |d: usize, MeanVari(mean, vari): MeanVari| (rho - (d as f64 - mean) / vari).abs(); while target_length != sum { // search flexible state and modify its duration if target_length > sum { @@ -120,15 +122,16 @@ mod tests { #[test] fn without_alignment() { let models = load_models(); + let estimator = DurationEstimator::new(models.duration(), models.nstate()); assert_eq!( - DurationEstimator.create(&models, 1.0), + estimator.create(1.0), [ 8, 17, 14, 25, 15, 3, 4, 2, 2, 2, 2, 3, 3, 3, 3, 4, 3, 2, 2, 2, 3, 3, 6, 3, 2, 3, 3, 3, 3, 2, 2, 1, 3, 2, 14, 22, 14, 26, 38, 5 ] ); assert_eq!( - DurationEstimator.create(&models, 1.2), + estimator.create(1.2), [ 6, 12, 11, 19, 14, 3, 4, 2, 2, 2, 2, 3, 3, 3, 3, 4, 3, 2, 2, 2, 3, 3, 6, 3, 2, 3, 3, 3, 3, 2, 2, 1, 3, 2, 14, 18, 11, 16, 27, 4 @@ -139,20 +142,18 @@ mod tests { #[test] fn with_alignment() { let models = load_models(); + let estimator = DurationEstimator::new(models.duration(), models.nstate()); assert_eq!( - DurationEstimator.create_with_alignment( - &models, - &[ - (0.0, 298.5), - (298.5, 334.5), - (334.5, 350.5), - (350.5, 362.5), - (362.5, 394.5), - (394.5, 416.5), - (416.5, 454.5), - (454.5, 606.5) - ] - ), + estimator.create_with_alignment(&[ + (0.0, 298.5), + (298.5, 334.5), + (334.5, 350.5), + (350.5, 362.5), + (362.5, 394.5), + (394.5, 416.5), + (416.5, 454.5), + (454.5, 606.5) + ]), [ 36, 86, 48, 102, 27, 7, 11, 6, 6, 6, 2, 4, 3, 4, 3, 3, 3, 2, 2, 2, 3, 6, 14, 6, 3, 4, 5, 6, 4, 3, 3, 1, 4, 4, 26, 28, 19, 42, 55, 8 diff --git a/src/engine.rs b/src/engine.rs index 45d3c76..785a9d4 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -6,7 +6,7 @@ use crate::duration::DurationEstimator; use crate::label::{LabelError, Labels}; use crate::mlpg_adjust::MlpgAdjust; use crate::model::interporation_weight::InterporationWeight; -use crate::model::{apply_additional_half_tone, ModelError, Models, VoiceSet}; +use crate::model::{ModelError, Models, VoiceSet}; use crate::speech::SpeechGenerator; use crate::vocoder::Vocoder; @@ -74,8 +74,7 @@ impl Default for Condition { impl Condition { pub fn load_model(&mut self, voices: &VoiceSet) -> Result<(), EngineError> { - let first = voices.first(); - let metadata = &first.metadata; + let metadata = voices.global_metadata(); let nstream = metadata.num_streams; @@ -86,7 +85,7 @@ impl Condition { self.gv_weight = [1.0].repeat(nstream); /* spectrum */ - for option in &first.stream_models[0].metadata.option { + for option in &voices.stream_metadata(0).option { let Some((key, value)) = option.split_once('=') else { eprintln!("Skipped unrecognized option {}.", option); continue; @@ -97,7 +96,11 @@ impl Condition { .parse() .map_err(|_| EngineError::ParseOptionError(key.to_string()))? } - "LN_GAIN" => self.use_log_gain = value == "1", + "LN_GAIN" => match value { + "1" => self.use_log_gain = true, + "0" => self.use_log_gain = false, + _ => return Err(EngineError::ParseOptionError(key.to_string())), + }, "ALPHA" => { self.alpha = value .parse() @@ -218,6 +221,7 @@ impl Condition { } } +#[derive(Debug, Clone)] pub struct Engine { pub condition: Condition, pub voices: VoiceSet, @@ -264,47 +268,63 @@ impl Engine { } pub fn generate_speech(&self, labels: &Labels) -> Vec { + let vocoder = Vocoder::new( + self.voices.stream_metadata(0).vector_length, + self.voices.stream_metadata(2).vector_length, + self.condition.stage, + self.condition.use_log_gain, + self.condition.sampling_frequency, + self.condition.alpha, + self.condition.beta, + self.condition.volume, + self.condition.fperiod, + ); + let models = Models::new( - labels.labels().to_vec(), + labels.labels(), &self.voices, &self.condition.interporation_weight, ); + let estimator = DurationEstimator::new(models.duration(), models.nstate()); let durations = if self.condition.phoneme_alignment_flag { - DurationEstimator.create_with_alignment(&models, labels.times()) + estimator.create_with_alignment(labels.times()) } else { - DurationEstimator.create(&models, self.condition.speed) + estimator.create(self.condition.speed) }; - let initialize = |stream_index: usize| { + fn mutated(mut value: T, f: F) -> T { + f(&mut value); + value + } + + let spectrum = MlpgAdjust::new( + self.condition.gv_weight[0], + self.condition.msd_threshold[0], + models.model_stream(0), + ) + .create(&durations); + let lf0 = MlpgAdjust::new( + self.condition.gv_weight[1], + self.condition.msd_threshold[1], + mutated(models.model_stream(1), |m| { + m.stream + .apply_additional_half_tone(self.condition.additional_half_tone); + }), + ) + .create(&durations); + let lpf = if self.voices.global_metadata().num_streams > 2 { MlpgAdjust::new( - stream_index, - self.condition.gv_weight[stream_index], - self.condition.msd_threshold[stream_index], + self.condition.gv_weight[2], + self.condition.msd_threshold[2], + models.model_stream(2), ) + .create(&durations) + } else { + vec![vec![0.0; 0]; lf0.len()] }; - let spectrum = initialize(0).create(models.stream(0), &models, &durations); - let lf0 = { - let mut lf0_params = models.stream(1); - apply_additional_half_tone(&mut lf0_params, self.condition.additional_half_tone); - initialize(1).create(lf0_params, &models, &durations) - }; - let lpf = initialize(2).create(models.stream(2), &models, &durations); - - let vocoder = Vocoder::new( - models.vector_length(0) - 1, - self.condition.stage, - self.condition.use_log_gain, - self.condition.sampling_frequency, - self.condition.fperiod, - ); - let generator = SpeechGenerator::new( - self.condition.fperiod, - self.condition.alpha, - self.condition.beta, - self.condition.volume, - ); - generator.synthesize(vocoder, spectrum, lf0, Some(lpf)) + let generator = SpeechGenerator::new(self.condition.fperiod); + generator.synthesize(vocoder, spectrum, lf0, lpf) } } diff --git a/src/lib.rs b/src/lib.rs index e967d5d..bb8cda2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,7 +10,7 @@ pub mod vocoder; #[cfg(test)] mod tests { - use crate::{engine::Engine, model::interporation_weight::Weights}; + use crate::engine::Engine; pub const MODEL_NITECH_ATR503: &str = "models/hts_voice_nitech_jp_atr503_m001-1.05/nitech_jp_atr503_m001.htsvoice"; @@ -31,11 +31,9 @@ mod tests { #[test] fn bonsai() { - let lines: Vec = SAMPLE_SENTENCE_1.iter().map(|l| l.to_string()).collect(); + let engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); - let engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); - - let speech = engine.synthesize_from_strings(&lines).unwrap(); + let speech = engine.synthesize_from_strings(&SAMPLE_SENTENCE_1).unwrap(); assert_eq!(speech.len(), 66480); approx::assert_abs_diff_eq!(speech[2000], 19.35141137623778, epsilon = 1.0e-10); @@ -49,7 +47,7 @@ mod tests { .map(|l| l.parse().unwrap()) .collect(); - let engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); + let engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); let speech = engine.synthesize_from_labels(labels).unwrap(); @@ -60,19 +58,14 @@ mod tests { #[test] fn bonsai_multi() { - let lines: Vec = SAMPLE_SENTENCE_1.iter().map(|l| l.to_string()).collect(); - let mut engine = Engine::load(&[MODEL_TOHOKU_F01_NORMAL, MODEL_TOHOKU_F01_HAPPY]).unwrap(); let iw = engine.condition.get_interporation_weight_mut(); - iw.set_duration(Weights::new(&[0.7, 0.3]).unwrap()).unwrap(); - iw.set_parameter(0, Weights::new(&[0.7, 0.3]).unwrap()) - .unwrap(); - iw.set_parameter(1, Weights::new(&[0.7, 0.3]).unwrap()) - .unwrap(); - iw.set_parameter(2, Weights::new(&[1.0, 0.0]).unwrap()) - .unwrap(); + iw.set_duration(&[0.7, 0.3]).unwrap(); + iw.set_parameter(0, &[0.7, 0.3]).unwrap(); + iw.set_parameter(1, &[0.7, 0.3]).unwrap(); + iw.set_parameter(2, &[1.0, 0.0]).unwrap(); - let speech = engine.synthesize_from_strings(&lines).unwrap(); + let speech = engine.synthesize_from_strings(&SAMPLE_SENTENCE_1).unwrap(); assert_eq!(speech.len(), 74880); approx::assert_abs_diff_eq!(speech[2000], 2.3158134981607754e-5, epsilon = 1.0e-10); @@ -110,11 +103,9 @@ mod tests { #[test] fn is_this_bonsai() { - let lines: Vec = SAMPLE_SENTENCE_2.iter().map(|l| l.to_string()).collect(); + let engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); - let engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); - - let speech = engine.synthesize_from_strings(&lines).unwrap(); + let speech = engine.synthesize_from_strings(&SAMPLE_SENTENCE_2).unwrap(); assert_eq!(speech.len(), 100800); approx::assert_abs_diff_eq!(speech[2000], 17.15977345625943, epsilon = 1.0e-10); @@ -125,12 +116,10 @@ mod tests { #[test] fn is_this_bonsai_fast() { - let lines: Vec = SAMPLE_SENTENCE_2.iter().map(|l| l.to_string()).collect(); - - let mut engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); + let mut engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); engine.condition.set_speed(1.4); - let speech = engine.synthesize_from_strings(&lines).unwrap(); + let speech = engine.synthesize_from_strings(&SAMPLE_SENTENCE_2).unwrap(); assert_eq!(speech.len(), 72000); approx::assert_abs_diff_eq!(speech[2000], 15.0481014871396, epsilon = 1.0e-10); @@ -141,7 +130,7 @@ mod tests { #[test] fn empty() { - let engine = Engine::load(&[MODEL_NITECH_ATR503.to_string()]).unwrap(); + let engine = Engine::load(&[MODEL_NITECH_ATR503]).unwrap(); let speech = engine.synthesize_from_strings::(&[]).unwrap(); assert_eq!(speech.len(), 0); } diff --git a/src/mlpg_adjust/mask.rs b/src/mlpg_adjust/mask.rs index adb858b..49f0743 100644 --- a/src/mlpg_adjust/mask.rs +++ b/src/mlpg_adjust/mask.rs @@ -1,14 +1,24 @@ +use crate::model::StreamParameter; + +use super::IterExt; + pub struct Mask(Vec); impl FromIterator for Mask { fn from_iter>(iter: I) -> Self { - Self::new(iter.into_iter().collect()) + Self(iter.into_iter().collect()) } } impl Mask { - pub fn new(mask: Vec) -> Self { - Self(mask) + pub fn create(stream: &StreamParameter, threshold: f64, durations: &[usize]) -> Self { + Self( + stream + .iter() + .map(|(_, msd)| *msd > threshold) + .duration(durations) + .collect(), + ) } pub fn mask(&self) -> &[bool] { &self.0 @@ -70,22 +80,20 @@ mod tests { #[test] fn fill() { assert_eq!( - Mask::new(vec![false, false, true, true, false, true]) + Mask(vec![false, false, true, true, false, true]) .fill([0, 1, 2], 5) .collect::>(), vec![5, 5, 0, 1, 5, 2] ); assert_eq!( - Mask::new(vec![false, false]) - .fill([0, 1], 5) - .collect::>(), + Mask(vec![false, false]).fill([0, 1], 5).collect::>(), vec![5, 5] ); } #[test] fn boundary_distances() { assert_eq!( - Mask::new(vec![ + Mask(vec![ true, true, true, true, true, true, true, true, true, true ]) .boundary_distances(), @@ -103,7 +111,7 @@ mod tests { ], ); assert_eq!( - Mask::new(vec![ + Mask(vec![ true, true, true, false, false, true, true, true, true, true ]) .boundary_distances(), @@ -121,7 +129,7 @@ mod tests { ] ); assert_eq!( - Mask::new(vec![ + Mask(vec![ true, true, true, false, true, false, false, false, false, false ]) .boundary_distances(), @@ -138,6 +146,6 @@ mod tests { (0, 0) ] ); - assert_eq!(Mask::new(vec![]).boundary_distances(), vec![]); + assert_eq!(Mask(vec![]).boundary_distances(), vec![]); } } diff --git a/src/mlpg_adjust/mlpg.rs b/src/mlpg_adjust/mlpg.rs index 25b3d56..545ffbe 100644 --- a/src/mlpg_adjust/mlpg.rs +++ b/src/mlpg_adjust/mlpg.rs @@ -1,4 +1,6 @@ -use crate::model::Windows; +use crate::model::{GvParameter, MeanVari, Windows}; + +use super::{mask::Mask, IterExt}; const W1: f64 = 1.0; const W2: f64 = 1.0; @@ -13,29 +15,17 @@ pub struct MlpgMatrix { } impl MlpgMatrix { - pub fn new() -> Self { - Self { - win_size: 0, - length: 0, - width: 0, - wuw: Vec::new(), - wum: Vec::new(), - } - } - /// Calculate W^T U^{-1} W and W^T U^{-1} \mu /// (preparation for calculation of dynamic feature) - pub fn calc_wuw_and_wum(&mut self, windows: &Windows, parameters: Vec>) { - self.win_size = windows.size(); - self.length = parameters[0].len(); - self.width = windows.max_width() * 2 + 1; - - self.wuw = Vec::new(); - self.wum = Vec::new(); + pub fn calc_wuw_and_wum(windows: &Windows, parameters: Vec>) -> Self { + let length = parameters[0].len(); + let width = windows.max_width() * 2 + 1; + let mut wum = Vec::with_capacity(length); + let mut wuw = Vec::with_capacity(length); - for t in 0..self.length { - self.wuw.push(vec![0.0; self.width]); - self.wum.push(0.0); + for t in 0..length { + wuw.push(vec![0.0; width]); + wum.push(0.0); for (i, window) in windows.iter().enumerate() { for (index, coef) in window.iter_rev(0) { @@ -44,26 +34,34 @@ impl MlpgMatrix { } let idx = (t as isize) - index.position(); - if idx < 0 || idx >= self.length as isize { + if idx < 0 || idx >= length as isize { continue; } let wu = coef * parameters[i][idx as usize].1; - self.wum[t] += wu * parameters[i][idx as usize].0; + wum[t] += wu * parameters[i][idx as usize].0; for (inner_index, coef) in window.iter_rev(index.index()) { if coef == 0.0 { continue; } let j = inner_index.index() - index.index(); - if t + j >= self.length { + if t + j >= length { break; } - self.wuw[t][j] += wu * coef; + wuw[t][j] += wu * coef; } } } } + + Self { + win_size: windows.size(), + length, + width, + wuw, + wum, + } } /// Solve equation $W^T U^{-1} W c = W^T U^{-1} \mu$ and return the vector $c$ @@ -110,6 +108,32 @@ impl MlpgMatrix { par } + + pub fn par( + &mut self, + gv: &Option, + vector_index: usize, + gv_weight: f64, + durations: &[usize], + msd_flag: &Mask, + ) -> Vec { + if let Some((gv_param, gv_switch)) = gv { + let mtx_before = self.clone(); + let par = self.solve(); + let gv_switch: Vec<_> = gv_switch + .iter() + .copied() + .duration(durations) + .filter_by(msd_flag.mask()) + .collect(); + let mgv = MlpgGlobalVariance::new(mtx_before, par, &gv_switch); + + let MeanVari(gv_mean, gv_vari) = gv_param[vector_index]; + mgv.apply_gv(gv_mean * gv_weight, gv_vari) + } else { + self.solve() + } + } } #[derive(Debug, Clone)] diff --git a/src/mlpg_adjust/mod.rs b/src/mlpg_adjust/mod.rs index 77dbd01..faa4f9c 100644 --- a/src/mlpg_adjust/mod.rs +++ b/src/mlpg_adjust/mod.rs @@ -1,125 +1,118 @@ +use std::iter; + use crate::{ constants::NODATA, - model::{Models, StreamParameter}, + model::{GvParameter, MeanVari, ModelStream, StreamParameter, Windows}, }; mod mask; mod mlpg; -use self::{ - mask::Mask, - mlpg::{MlpgGlobalVariance, MlpgMatrix}, -}; +use self::{mask::Mask, mlpg::MlpgMatrix}; -pub struct MlpgAdjust { - stream_index: usize, +pub struct MlpgAdjust<'a> { gv_weight: f64, msd_threshold: f64, + vector_length: usize, + stream: StreamParameter, + gv: Option, + windows: &'a Windows, } -impl MlpgAdjust { - pub fn new(stream_index: usize, gv_weight: f64, msd_threshold: f64) -> Self { +impl<'a> MlpgAdjust<'a> { + pub fn new( + gv_weight: f64, + msd_threshold: f64, + ModelStream { + vector_length, + stream, + gv, + windows, + }: ModelStream<'a>, + ) -> Self { Self { - stream_index, gv_weight, msd_threshold, + vector_length, + stream, + gv, + windows, } } /// Parameter generation using GV weight - pub fn create( - &self, - stream: StreamParameter, - models: &Models, - durations: &[usize], - ) -> Vec> { - let vector_length = models.vector_length(self.stream_index); - - let msd_flag: Mask = stream - .iter() - .zip(durations) - .flat_map(|((_, msd), duration)| { - let flag = *msd > self.msd_threshold; - [flag].repeat(*duration) - }) - .collect(); - + pub fn create(&self, durations: &[usize]) -> Vec> { + let msd_flag = Mask::create(&self.stream, self.msd_threshold, durations); let msd_boundaries = msd_flag.boundary_distances(); + let mut pars = vec![vec![0.0; self.vector_length]; msd_flag.mask().len()]; - let mut pars = vec![vec![0.0; vector_length]; msd_flag.mask().len()]; - for vector_index in 0..vector_length { - let parameters: Vec> = models - .windows(self.stream_index) + for vector_index in 0..self.vector_length { + let parameters: Vec> = self + .windows .iter() .enumerate() .map(|(window_index, window)| { - let m = vector_length * window_index + vector_index; + let m = self.vector_length * window_index + vector_index; - let mut iter = msd_flag.mask().iter(); - stream + self.stream .iter() - .zip(durations) - // get mean and ivar, and spread it to its duration - .flat_map(|((curr_stream, _), duration)| { - let (mean, vari) = curr_stream[m]; - let ivar = { - if vari.abs() > 1e19 { - 0.0 - } else if vari.abs() < 1e-19 { - 1e38 - } else { - 1.0 / vari - } - }; - [(mean, ivar)].repeat(*duration) - }) + .map(|(curr_stream, _)| curr_stream[m].with_ivar()) + .duration(durations) .zip(&msd_boundaries) - .map(|((mean, ivar), (left, right))| { + .map(|(mean_ivar, (left, right))| { let is_left_msd_boundary = *left < window.left_width(); let is_right_msd_boundary = *right < window.right_width(); // If the window includes non-msd frames, set the ivar to 0.0 if (is_left_msd_boundary || is_right_msd_boundary) && window_index != 0 { - (mean, 0.0) + mean_ivar.with_0() } else { - (mean, ivar) + mean_ivar } }) - .filter(|_| iter.next() == Some(&true)) + .filter_by(msd_flag.mask()) .collect() }) .collect(); - let mut mtx = MlpgMatrix::new(); - mtx.calc_wuw_and_wum(models.windows(self.stream_index), parameters); + let mut mtx = MlpgMatrix::calc_wuw_and_wum(self.windows, parameters); + let par = mtx.par(&self.gv, vector_index, self.gv_weight, durations, &msd_flag); - let par = if let Some((gv_param, gv_switch)) = models.gv(self.stream_index) { - let mtx_before = mtx.clone(); - let par = mtx.solve(); + for (par, value) in pars.iter_mut().zip(msd_flag.fill(par, NODATA)) { + par[vector_index] = value; + } + } - let gv_mean = gv_param[vector_index].0 * self.gv_weight; - let gv_vari = gv_param[vector_index].1; + pars + } +} - let mut iter = msd_flag.mask().iter(); - let gv_switch: Vec = gv_switch - .iter() - .zip(durations) - .flat_map(|(switch, duration)| [*switch].repeat(*duration)) - .filter(|_| iter.next() == Some(&true)) - .collect(); +trait IterExt: Iterator { + fn duration<'a>( + self, + durations: impl IntoIterator + 'a, + ) -> impl Iterator; - MlpgGlobalVariance::new(mtx_before, par, &gv_switch).apply_gv(gv_mean, gv_vari) - } else { - mtx.solve() - }; + fn filter_by<'a>( + self, + mask: impl IntoIterator + 'a, + ) -> impl Iterator; +} - pars.iter_mut() - .zip(msd_flag.fill(par, NODATA)) - .for_each(|(par, value)| { - par[vector_index] = value; - }); - } +impl> IterExt for I { + fn duration<'a>( + self, + durations: impl IntoIterator + 'a, + ) -> impl Iterator { + self.zip(durations) + .flat_map(move |(item, duration)| iter::repeat(item).take(*duration)) + } - pars + fn filter_by<'a>( + self, + mask: impl IntoIterator + 'a, + ) -> impl Iterator { + self.zip(mask) + .filter_map(|(item, mask)| if *mask { Some(item) } else { None }) } } diff --git a/src/model/interporation_weight.rs b/src/model/interporation_weight.rs index e09ad05..f09a669 100644 --- a/src/model/interporation_weight.rs +++ b/src/model/interporation_weight.rs @@ -1,3 +1,5 @@ +use std::ops::Deref; + #[derive(Debug, Clone, thiserror::Error)] pub enum WeightError { #[error("Weights do not sum to 1.0")] @@ -23,19 +25,20 @@ impl Default for InterporationWeight { impl InterporationWeight { pub fn new(nvoices: usize, nstream: usize) -> Self { - let default_weight = Weights::average(nvoices); + let average = Weights::average(nvoices); Self { nvoices, - duration: default_weight.clone(), - parameter: vec![default_weight.clone(); nstream], - gv: vec![default_weight.clone(); nstream], + parameter: vec![average.clone(); nstream], + gv: vec![average.clone(); nstream], + duration: average, } } /// Set duration weight /// weights.len() == nvoices /// weights.iter().sum() == 1.0 - pub fn set_duration(&mut self, weights: Weights) -> Result<(), WeightError> { + pub fn set_duration(&mut self, weight: &[f64]) -> Result<(), WeightError> { + let weights = Weights::new(weight)?; weights.check_length(self.nvoices)?; self.duration = weights; Ok(()) @@ -46,8 +49,9 @@ impl InterporationWeight { pub fn set_parameter( &mut self, stream_index: usize, - weights: Weights, + weight: &[f64], ) -> Result<(), WeightError> { + let weights = Weights::new(weight)?; weights.check_length(self.nvoices)?; self.parameter[stream_index] = weights; Ok(()) @@ -55,7 +59,8 @@ impl InterporationWeight { /// Set GV weight /// weights.len() == nvoices /// weights.iter().sum() == 1.0 - pub fn set_gv(&mut self, stream_index: usize, weights: Weights) -> Result<(), WeightError> { + pub fn set_gv(&mut self, stream_index: usize, weight: &[f64]) -> Result<(), WeightError> { + let weights = Weights::new(weight)?; weights.check_length(self.nvoices)?; self.gv[stream_index] = weights; Ok(()) @@ -91,10 +96,6 @@ impl Weights { }) } - pub fn get_weights(&self) -> &[f64] { - &self.weights - } - fn average(nvoices: usize) -> Self { let average_weight = 1.0f64 / nvoices as f64; Self { @@ -109,3 +110,11 @@ impl Weights { Ok(()) } } + +impl Deref for Weights { + type Target = [f64]; + + fn deref(&self) -> &Self::Target { + &self.weights + } +} diff --git a/src/model/mean_vari.rs b/src/model/mean_vari.rs new file mode 100644 index 0000000..af0df89 --- /dev/null +++ b/src/model/mean_vari.rs @@ -0,0 +1,67 @@ +use std::{ + iter::Sum, + ops::{Add, Mul}, +}; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct MeanVari(pub f64, pub f64); + +impl MeanVari { + pub fn with_ivar(&self) -> Self { + let Self(mean, vari) = self; + let ivar = if vari.abs() > 1e19 { + 0.0 + } else if vari.abs() < 1e-19 { + 1e38 + } else { + 1.0 / vari + }; + Self(*mean, ivar) + } + + pub fn with_0(&self) -> Self { + let Self(mean, _) = self; + Self(*mean, 0.0) + } + + pub fn weighted(&self, weight: f64) -> Self { + let Self(mean, vari) = self; + Self(mean * weight, vari * weight) + } +} + +impl Add for &MeanVari { + type Output = MeanVari; + fn add(self, rhs: Self) -> Self::Output { + MeanVari(self.0 + rhs.0, self.1 + rhs.1) + } +} + +impl Add for MeanVari { + type Output = MeanVari; + #[allow(clippy::op_ref)] + fn add(self, rhs: Self) -> Self::Output { + &self + &rhs + } +} + +impl<'a> Sum<&'a Self> for MeanVari { + fn sum>(iter: I) -> Self { + iter.fold(MeanVari(0.0, 0.0), |a, b| a + *b) + } +} + +impl Mul for &MeanVari { + type Output = MeanVari; + fn mul(self, rhs: f64) -> Self::Output { + MeanVari(self.0 * rhs, self.1 * rhs) + } +} + +impl Mul for MeanVari { + type Output = MeanVari; + #[allow(clippy::op_ref)] + fn mul(self, rhs: f64) -> Self::Output { + &self * rhs + } +} diff --git a/src/model/mod.rs b/src/model/mod.rs index 5f9063d..52944a0 100644 --- a/src/model/mod.rs +++ b/src/model/mod.rs @@ -1,4 +1,4 @@ -use std::{borrow::Cow, sync::Arc}; +use std::borrow::Cow; use self::voice::model::ModelParameter; @@ -10,7 +10,16 @@ pub use self::{ use jlabel::Label; pub mod interporation_weight; +pub mod mean_vari; +pub mod model_stream; +pub mod stream_parameter; pub mod voice; +pub mod voice_set; + +pub use mean_vari::MeanVari; +pub use model_stream::ModelStream; +pub use stream_parameter::StreamParameter; +pub use voice_set::VoiceSet; #[cfg(feature = "htsvoice")] mod parser; @@ -29,92 +38,85 @@ pub enum ModelError { ParserError(#[from] parser::ModelParseError), } -pub type StreamParameter = Vec<(Vec<(f64, f64)>, f64)>; -pub type GvParameter = (Vec<(f64, f64)>, Vec); +pub type GvParameter = (Vec, Vec); pub struct Models<'a> { - labels: Vec