diff --git a/crates/audio/src/audio.rs b/crates/audio/src/audio.rs index 53b7baaa34..e6e58038a2 100644 --- a/crates/audio/src/audio.rs +++ b/crates/audio/src/audio.rs @@ -184,26 +184,26 @@ impl Audio { let stream = raw_mic_input .possibly_disconnected_channels_to_mono() .constant_samplerate(SAMPLE_RATE) - .limit(LimitSettings::live_performance()) - .process_buffer::(move |buffer| { - let mut int_buffer: [i16; _] = buffer.map(|s| s.to_sample()); - if voip_parts - .echo_canceller - .lock() - .process_stream( - &mut int_buffer, - SAMPLE_RATE.get() as i32, - CHANNEL_COUNT.get() as i32, - ) - .context("livekit audio processor error") - .log_err() - .is_some() - { - for (sample, processed) in buffer.iter_mut().zip(&int_buffer) { - *sample = (*processed).to_sample(); - } - } - }); + .limit(LimitSettings::live_performance()); + // .process_buffer::(move |buffer| { + // let mut int_buffer: [i16; _] = buffer.map(|s| s.to_sample()); + // if voip_parts + // .echo_canceller + // .lock() + // .process_stream( + // &mut int_buffer, + // SAMPLE_RATE.get() as i32, + // CHANNEL_COUNT.get() as i32, + // ) + // .context("livekit audio processor error") + // .log_err() + // .is_some() + // { + // for (sample, processed) in buffer.iter_mut().zip(&int_buffer) { + // *sample = (*processed).to_sample(); + // } + // } + // }); // .denoise() // .context("Could not set up denoiser")? 
// .automatic_gain_control(automatic_gain_control_settings()) diff --git a/crates/audio/src/rodio_ext.rs b/crates/audio/src/rodio_ext.rs index 6f5f4b5024..a869e0446e 100644 --- a/crates/audio/src/rodio_ext.rs +++ b/crates/audio/src/rodio_ext.rs @@ -114,7 +114,7 @@ pub struct ConstantChannelCount { } impl ConstantChannelCount { - fn new(source: S, target_channels: ChannelCount) -> Self { + pub fn new(source: S, target_channels: ChannelCount) -> Self { let input_channels = source.channels(); let sample_rate = source.sample_rate(); let inner = ChannelCountConverter::new(source, input_channels, target_channels); diff --git a/crates/audio/src/rodio_ext/resample.rs b/crates/audio/src/rodio_ext/resample.rs index d0245708bb..d9be836b79 100644 --- a/crates/audio/src/rodio_ext/resample.rs +++ b/crates/audio/src/rodio_ext/resample.rs @@ -142,7 +142,7 @@ mod tests { use std::time::Duration; use crate::{ - test::{recording_of_davids_voice, sine}, + test::{recording_of_voice, sine}, RodioExt, }; use itertools::Itertools; @@ -214,7 +214,7 @@ mod tests { #[test] fn constant_samplerate_preserves_length() { - let test_signal = recording_of_davids_voice(nz!(3), nz!(48_000)); + let test_signal = recording_of_voice(nz!(3), nz!(48_000)); let resampled = test_signal.clone().constant_samplerate(nz!(16_000)); let diff_in_length = test_signal diff --git a/crates/audio/src/test.rs b/crates/audio/src/test.rs index 6ab26f6df9..4ec02607ec 100644 --- a/crates/audio/src/test.rs +++ b/crates/audio/src/test.rs @@ -4,6 +4,7 @@ use std::env::current_dir; use std::io::Cursor; use std::iter; +use std::ops::Range; use std::sync::atomic::Ordering; use std::time::Duration; @@ -21,10 +22,14 @@ use crate::audio_settings::LIVE_SETTINGS; use crate::test::detector::BasicVoiceDetector; use crate::{Audio, LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE, RodioExt, VoipParts}; +mod detector; +// in hz +const HUMAN_SPEECH_RANGE: Range = 90.0..260.0; + #[gpui::test] fn test_input_pipeline(cx: &mut gpui::TestAppContext) { 
// strange params to invite bugs to show themselves - let test_signal = recording_of_davids_voice(nz!(3), nz!(48_000)); + let test_signal = recording_of_voice(nz!(3), nz!(48_000)); let test_signal_duration = test_signal .total_duration() .expect("recordings have a length"); @@ -38,15 +43,20 @@ fn test_input_pipeline(cx: &mut gpui::TestAppContext) { .into_samples_buffer(); let expected_output = - recording_of_davids_voice(input_pipeline.channels(), input_pipeline.sample_rate()); - rodio::wav_to_file(input_pipeline.clone(), "input_pipeline_output.wav").unwrap(); + recording_of_voice(input_pipeline.channels(), input_pipeline.sample_rate()); + + rodio::wav_to_file( + BasicVoiceDetector::add_voice_activity_as_channel(input_pipeline.clone()), + "input_pipeline_output.wav", + ) + .unwrap(); rodio::wav_to_file(expected_output.clone(), "input_pipeline_expect.wav").unwrap(); assert_similar_voice_spectra(expected_output, input_pipeline); } #[gpui::test] fn test_output_pipeline(cx: &mut gpui::TestAppContext) { - let test_signal = recording_of_davids_voice(LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE); + let test_signal = recording_of_voice(LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE); let test_signal_duration = test_signal .total_duration() .expect("recordings have a length"); @@ -65,7 +75,7 @@ fn test_output_pipeline(cx: &mut gpui::TestAppContext) { // dont care about the channel count and sample rate, as long as the voice // signal matches let expected_output = - recording_of_davids_voice(output_pipeline.channels(), output_pipeline.sample_rate()); + recording_of_voice(output_pipeline.channels(), output_pipeline.sample_rate()); rodio::wav_to_file(output_pipeline.clone(), "output_pipeline_output.wav").unwrap(); rodio::wav_to_file(expected_output.clone(), "output_pipeline_expect.wav").unwrap(); assert_similar_voice_spectra(expected_output, output_pipeline); @@ -74,7 +84,7 @@ fn test_output_pipeline(cx: &mut gpui::TestAppContext) { // TODO make a perf variant #[gpui::test] fn 
test_full_audio_pipeline(cx: &mut gpui::TestAppContext) { - let test_signal = recording_of_davids_voice(nz!(3), nz!(44_100)); + let test_signal = recording_of_voice(nz!(3), nz!(44_100)); let test_signal_duration = test_signal .total_duration() .expect("recordings have a length"); @@ -92,15 +102,21 @@ fn test_full_audio_pipeline(cx: &mut gpui::TestAppContext) { // dont care about the channel count and sample rate, as long as the voice // signal matches - let expected_output = - recording_of_davids_voice(full_pipeline.channels(), full_pipeline.sample_rate()); + let expected_output = recording_of_voice(full_pipeline.channels(), full_pipeline.sample_rate()); rodio::wav_to_file(full_pipeline.clone(), "full_pipeline_output.wav").unwrap(); - rodio::wav_to_file(expected_output.clone(), "full_pipeline_expected.wav").unwrap(); + rodio::wav_to_file(expected_output.clone(), "full_pipeline_expect.wav").unwrap(); assert_similar_voice_spectra(expected_output, full_pipeline); } -fn energy_of_spectrum(spectrum: &FrequencySpectrum) -> f32 { - spectrum.max().1.val() +fn human_perceivable_energy(spectrum: &FrequencySpectrum) -> f32 { + spectrum + .data() + .iter() + .filter(|(freq, _)| HUMAN_SPEECH_RANGE.contains(&freq.val())) + .max_by_key(|(_, energy)| energy) + .unwrap() + .1 + .val() } fn energy_of_chunk(chunk: &[rodio::Sample], sample_rate: SampleRate) -> f32 { @@ -112,7 +128,7 @@ fn energy_of_chunk(chunk: &[rodio::Sample], sample_rate: SampleRate) -> f32 { ) .unwrap(); - energy_of_spectrum(&spectrum) + human_perceivable_energy(&spectrum) } fn maximum_energy(mut a: impl rodio::Source) -> f32 { @@ -127,10 +143,6 @@ fn maximum_energy(mut a: impl rodio::Source) -> f32 { .fold(0f32, |max, energy| max.max(energy)) } -const CHUNK_DURATION: Duration = Duration::from_millis(100); - -mod detector; - // Test signals should be at least 50% voice fn assert_similar_voice_spectra( expected: impl rodio::Source + Clone, @@ -211,11 +223,17 @@ fn assert_similar_voice_spectra( ); } -fn 
spectra_chunk_size(source: &impl Source) -> usize { - ((CHUNK_DURATION.as_secs_f64() * source.sample_rate().get() as f64).ceil() as usize) +fn spectra_chunk_size(source: &impl Source, minimum_duration: Duration) -> usize { + ((minimum_duration.as_secs_f64() * source.sample_rate().get() as f64) as usize) .next_power_of_two() } +fn spectrum_duration(source: &impl Source, minimum_duration: Duration) -> Duration { + Duration::from_secs_f64( + spectra_chunk_size(source, minimum_duration) as f64 / source.sample_rate().get() as f64, + ) +} + fn assert_same_voice_signal( (chunk_start, (expected, pipeline)): (Duration, (FrequencySpectrum, FrequencySpectrum)), ) -> Option { @@ -232,7 +250,7 @@ fn assert_same_voice_signal( panic!( "Could not find fundamental voice freq in output while there is one in the input at {voice_freq_expected}Hz.\nLoudest 5 frequencies in output:\n{}\n\n{}", display_loudest_5_frequencies(&pipeline), - plot_spectra(&expected, &pipeline), + plot_spectra(&[(&expected, "expected"), (&pipeline, "pipeline")]), ); } (Some(voice_freq_expected), Some(voice_freq_pipeline)) => { @@ -243,7 +261,7 @@ fn assert_same_voice_signal( assert!( less_than_10percent_diff((voice_freq_expected, voice_freq_pipeline)), "expected: {voice_freq_expected}, pipeline: {voice_freq_pipeline}, at: {chunk_start:?}\n\n{}", - plot_spectra(&expected, &pipeline) + plot_spectra(&[(&expected, "expected"), (&pipeline, "pipeline")]) ); // Guards against voice distortion @@ -256,14 +274,12 @@ fn assert_same_voice_signal( } fn fundamental_voice_freq(spectrum: &FrequencySpectrum) -> Option { - let human_speech_range = 90.0..260.0; - let spectrum: Vec<_> = spectrum.data().iter().collect(); spectrum + .data() .iter() - .filter(|(freq, _)| human_speech_range.contains(&freq.val())) - // .inspect(|(freq, ampl)| println!("{freq},{ampl}")) - .max_by(|(_, a_ampl), (_, b_ampl)| a_ampl.val().total_cmp(&b_ampl.val())) - .map(|(freq, _ampl)| freq.val()) + .filter(|(freq, _)| 
HUMAN_SPEECH_RANGE.contains(&freq.val())) + .max_by_key(|(_, energy)| energy) + .map(|(freq, _)| freq.val()) } fn same_ratio_between_harmonics( @@ -313,7 +329,7 @@ fn display_loudest_5_frequencies(spectrum: &FrequencySpectrum) -> String { } // Returns ascii encoding a link to open the plot -fn plot_spectra(expected: &FrequencySpectrum, pipeline: &FrequencySpectrum) -> String { +pub fn plot_spectra(spectra: &[(&FrequencySpectrum, &str)]) -> String { use plotly::{Bar, Plot}; let mut plot = Plot::new(); @@ -322,29 +338,16 @@ fn plot_spectra(expected: &FrequencySpectrum, pipeline: &FrequencySpectrum) -> S // .y_axis(Axis::new().type_(plotly::layout::AxisType::Log)); plot.set_layout(layout); - let (x, y): (Vec<_>, Vec<_>) = expected - .data() - .iter() - .map(|(freq, amplitude)| (freq.val(), amplitude.val())) - .filter(|(freq, _)| *freq > 85.0) - .unzip(); - let trace = Bar::new(x, y) - .name("expected") - .show_legend(true) - .opacity(0.5); - plot.add_trace(trace); - - let (x, y): (Vec<_>, Vec<_>) = pipeline - .data() - .iter() - .map(|(freq, amplitude)| (freq.val(), amplitude.val())) - .filter(|(freq, _)| *freq > 85.0) - .unzip(); - let trace = Bar::new(x, y) - .name("pipeline") - .show_legend(true) - .opacity(0.5); - plot.add_trace(trace); + for (spectrum, label) in spectra { + let (x, y): (Vec<_>, Vec<_>) = spectrum + .data() + .iter() + .map(|(freq, amplitude)| (freq.val(), amplitude.val())) + .filter(|(freq, _)| *freq > 85.0) + .unzip(); + let trace = Bar::new(x, y).name(label).show_legend(true).opacity(0.5); + plot.add_trace(trace); + } let path = current_dir().unwrap().join("plot.html"); plot.write_html(&path); @@ -378,7 +381,7 @@ pub(crate) fn sine(channels: ChannelCount, sample_rate: SampleRate) -> impl Sour ) } -pub(crate) fn recording_of_davids_voice( +pub(crate) fn recording_of_voice( channels: ChannelCount, sample_rate: SampleRate, ) -> impl Source + Clone { @@ -407,7 +410,7 @@ pub(crate) fn recording_of_davids_voice( #[should_panic] fn 
test_rejects_pitch_shift() { // also known as 'robot/chipmunk voice' - let original = recording_of_davids_voice(nz!(1), nz!(44100)); + let original = recording_of_voice(nz!(1), nz!(44100)); let pitch_shifted = original .clone() .speed(1.2) // effectively increases the pitch by 20% @@ -422,7 +425,7 @@ fn test_rejects_pitch_shift() { #[test] #[should_panic] fn test_rejects_large_amounts_of_noise() { - let original = recording_of_davids_voice(nz!(1), nz!(44100)); + let original = recording_of_voice(nz!(1), nz!(44100)); let with_noise = add_noise(&original, 0.5); assert_similar_voice_spectra(original, with_noise); @@ -430,7 +433,7 @@ fn test_rejects_large_amounts_of_noise() { #[test] fn test_ignores_volume() { - let original = recording_of_davids_voice(nz!(1), nz!(44100)); + let original = recording_of_voice(nz!(1), nz!(44100)); let amplified = original.clone().amplify(1.42); assert_similar_voice_spectra(original, amplified); @@ -438,7 +441,7 @@ fn test_ignores_volume() { #[test] fn test_ignore_low_volume_noise() { - let original = recording_of_davids_voice(nz!(1), nz!(44100)); + let original = recording_of_voice(nz!(1), nz!(44100)); // 5% noise is quite hearable as the noise is across all frequencies so is // perceived far more intense then a voice let with_noise = add_noise(&original, 0.05); @@ -469,7 +472,7 @@ fn add_noise(original: &(impl Source + Clone + Send + 'static), amount: f32) -> #[test] fn test_ignores_small_shifts() { - let original = recording_of_davids_voice(nz!(1), nz!(44100)); + let original = recording_of_voice(nz!(1), nz!(44100)); let shifted = iter::repeat(0f32).take(10).chain(original.clone()); let shifted = SamplesBuffer::new( original.channels(), diff --git a/crates/audio/src/test/detector.rs b/crates/audio/src/test/detector.rs new file mode 100644 index 0000000000..928f5516ad --- /dev/null +++ b/crates/audio/src/test/detector.rs @@ -0,0 +1,196 @@ +use crate::RodioExt; +use crate::rodio_ext::ConstantChannelCount; +use crate::test::sine; +use 
crate::test::spectrum_duration; + +use super::human_perceivable_energy; + +use rodio::buffer::SamplesBuffer; +use rodio::nz; +use spectrum_analyzer::FrequencyLimit; +use spectrum_analyzer::FrequencySpectrum; +use spectrum_analyzer::scaling::divide_by_N_sqrt; +use spectrum_analyzer::windows::hann_window; + +use super::maximum_energy; + +use rodio::Source; + +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct VoiceSegment { + pub start: Duration, + pub end: Duration, +} + +impl VoiceSegment { + const ZERO: Self = Self { + start: Duration::ZERO, + end: Duration::ZERO, + }; + + fn length(&self) -> Duration { + self.end - self.start + } + + fn until(&self, other: &Self) -> Duration { + debug_assert!(self.end < other.start); + other.start - self.end + } +} + +pub(crate) struct BasicVoiceDetector { + pub(crate) segments_with_voice: Vec<VoiceSegment>, +} + +impl BasicVoiceDetector { + pub(crate) fn new(source: impl Source + Clone) -> Self { + // only works on mono + let source = ConstantChannelCount::new(source, nz!(1)).into_samples_buffer(); + + // this gives a good resolution + let minimum_chunk_duration = Duration::from_millis(20); + let actual_chunk_duration = spectrum_duration(&source, minimum_chunk_duration); + + let mut spectrum_start_pos = Duration::ZERO; + let mut partial_segment = None; + + // empirically determined (by looking in audacity) + // see the 'soup' test for how + // + // while this might seem low remember humans perceive sound + // logarithmically. So 40% of energy sounds like 80% volume. + let threshold = 0.4 * maximum_energy(source.clone()); + let segments_with_voice: Vec<_> = iter_spectra(source.clone(), actual_chunk_duration) + .filter_map(|spectrum| { + let voice_detected = human_perceivable_energy(&spectrum) > threshold; + spectrum_start_pos += actual_chunk_duration; + match (&mut partial_segment, voice_detected) { + (Some(VoiceSegment { end, .. }), true) => *end = spectrum_start_pos, + (Some(VoiceSegment { start, .. 
}), false) => { + let res = Some(VoiceSegment { + start: *start, + end: spectrum_start_pos, + }); + partial_segment = None; + return res; + } + (None, true) => { + partial_segment = Some(VoiceSegment { + start: spectrum_start_pos, + end: spectrum_start_pos, + }) + } + (None, false) => partial_segment = None, + }; + None + }) + .collect(); + + Self { + segments_with_voice, + } + } + + pub fn voice_less_duration(&self) -> Duration { + self.segments_with_voice + .iter() + .map(|range| range.end - range.start) + .sum() + } + + fn beep_where_voice_detected(&self, source: &impl Source) -> SamplesBuffer { + let sine = sine(source.channels(), source.sample_rate()); + + let mut with_voice = [VoiceSegment::ZERO] + .iter() + .chain(self.segments_with_voice.iter()) + .peekable(); + let mut samples = Vec::new(); + + loop { + let Some(current_voice_segment) = with_voice.next() else { + break; + }; + + let voice_range_duration = current_voice_segment.length(); + samples.extend( + sine.clone() + .amplify(1.0) + .take_duration(voice_range_duration), + ); + + let Some(next_voice_segment) = with_voice.peek() else { + break; + }; + let until_next = current_voice_segment.until(next_voice_segment); + samples.extend(sine.clone().amplify(0.0).take_duration(until_next)); + } + + SamplesBuffer::new(nz!(1), source.sample_rate(), samples) + } + + pub fn add_voice_activity_as_channel(mut source: impl Source + Clone) -> impl Source { + let detector = Self::new(source.clone()); + let mut voice_activity = detector.beep_where_voice_detected(&source).into_iter(); + + let mut samples = Vec::new(); + loop { + let Some(s1) = source.next() else { + break; + }; + let Some(s2) = source.next() else { + break; + }; + let Some(s3) = voice_activity.next() else { + break; + }; + + samples.extend_from_slice(&[s1, s2, s3]); + } + SamplesBuffer::new( + source.channels().checked_add(1).unwrap(), + source.sample_rate(), + samples, + ) + } +} + +fn iter_spectra( + expected: impl Source + Clone, + chunk_duration: 
Duration, +) -> impl Iterator<Item = FrequencySpectrum> { + assert!(expected.total_duration().is_some()); + + let chunk_size = super::spectra_chunk_size(&expected, chunk_duration); + let expected_samples: Vec<_> = expected.clone().collect(); + expected_samples + .chunks_exact(chunk_size) + .map(|input| { + super::samples_fft_to_spectrum( + &hann_window(input), + expected.sample_rate().get(), + FrequencyLimit::Min(4.0), + Some(&divide_by_N_sqrt), + ) + .unwrap() + }) + .collect::<Vec<_>>() + .into_iter() +} + +#[cfg(test)] +mod test { + + use crate::test::{detector::BasicVoiceDetector, recording_of_voice}; + use rodio::{nz, wav_to_file}; + + #[test] + fn soup() { + let original = recording_of_voice(nz!(1), nz!(48000)); + let detector = BasicVoiceDetector::new(original.clone()); + let siny = detector.beep_where_voice_detected(&original); + wav_to_file(siny, "voice_activity.wav").unwrap(); + } +}