improves voice detector

This commit is contained in:
David
2025-11-15 01:06:24 +01:00
parent f602047b3c
commit c78baaf71f
5 changed files with 278 additions and 79 deletions

View File

@@ -184,26 +184,26 @@ impl Audio {
let stream = raw_mic_input
.possibly_disconnected_channels_to_mono()
.constant_samplerate(SAMPLE_RATE)
.limit(LimitSettings::live_performance())
.process_buffer::<BUFFER_SIZE, _>(move |buffer| {
let mut int_buffer: [i16; _] = buffer.map(|s| s.to_sample());
if voip_parts
.echo_canceller
.lock()
.process_stream(
&mut int_buffer,
SAMPLE_RATE.get() as i32,
CHANNEL_COUNT.get() as i32,
)
.context("livekit audio processor error")
.log_err()
.is_some()
{
for (sample, processed) in buffer.iter_mut().zip(&int_buffer) {
*sample = (*processed).to_sample();
}
}
});
.limit(LimitSettings::live_performance());
// .process_buffer::<BUFFER_SIZE, _>(move |buffer| {
// let mut int_buffer: [i16; _] = buffer.map(|s| s.to_sample());
// if voip_parts
// .echo_canceller
// .lock()
// .process_stream(
// &mut int_buffer,
// SAMPLE_RATE.get() as i32,
// CHANNEL_COUNT.get() as i32,
// )
// .context("livekit audio processor error")
// .log_err()
// .is_some()
// {
// for (sample, processed) in buffer.iter_mut().zip(&int_buffer) {
// *sample = (*processed).to_sample();
// }
// }
// });
// .denoise()
// .context("Could not set up denoiser")?
// .automatic_gain_control(automatic_gain_control_settings())

View File

@@ -114,7 +114,7 @@ pub struct ConstantChannelCount<S: Source> {
}
impl<S: Source> ConstantChannelCount<S> {
fn new(source: S, target_channels: ChannelCount) -> Self {
pub fn new(source: S, target_channels: ChannelCount) -> Self {
let input_channels = source.channels();
let sample_rate = source.sample_rate();
let inner = ChannelCountConverter::new(source, input_channels, target_channels);

View File

@@ -142,7 +142,7 @@ mod tests {
use std::time::Duration;
use crate::{
test::{recording_of_davids_voice, sine},
test::{recording_of_voice, sine},
RodioExt,
};
use itertools::Itertools;
@@ -214,7 +214,7 @@ mod tests {
#[test]
fn constant_samplerate_preserves_length() {
let test_signal = recording_of_davids_voice(nz!(3), nz!(48_000));
let test_signal = recording_of_voice(nz!(3), nz!(48_000));
let resampled = test_signal.clone().constant_samplerate(nz!(16_000));
let diff_in_length = test_signal

View File

@@ -4,6 +4,7 @@
use std::env::current_dir;
use std::io::Cursor;
use std::iter;
use std::ops::Range;
use std::sync::atomic::Ordering;
use std::time::Duration;
@@ -21,10 +22,14 @@ use crate::audio_settings::LIVE_SETTINGS;
use crate::test::detector::BasicVoiceDetector;
use crate::{Audio, LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE, RodioExt, VoipParts};
mod detector;
// Fundamental frequency range of human speech, in Hz
const HUMAN_SPEECH_RANGE: Range<f32> = 90.0..260.0;
#[gpui::test]
fn test_input_pipeline(cx: &mut gpui::TestAppContext) {
// strange params to invite bugs to show themselves
let test_signal = recording_of_davids_voice(nz!(3), nz!(48_000));
let test_signal = recording_of_voice(nz!(3), nz!(48_000));
let test_signal_duration = test_signal
.total_duration()
.expect("recordings have a length");
@@ -38,15 +43,20 @@ fn test_input_pipeline(cx: &mut gpui::TestAppContext) {
.into_samples_buffer();
let expected_output =
recording_of_davids_voice(input_pipeline.channels(), input_pipeline.sample_rate());
rodio::wav_to_file(input_pipeline.clone(), "input_pipeline_output.wav").unwrap();
recording_of_voice(input_pipeline.channels(), input_pipeline.sample_rate());
rodio::wav_to_file(
BasicVoiceDetector::add_voice_activity_as_channel(input_pipeline.clone()),
"input_pipeline_output.wav",
)
.unwrap();
rodio::wav_to_file(expected_output.clone(), "input_pipeline_expect.wav").unwrap();
assert_similar_voice_spectra(expected_output, input_pipeline);
}
#[gpui::test]
fn test_output_pipeline(cx: &mut gpui::TestAppContext) {
let test_signal = recording_of_davids_voice(LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE);
let test_signal = recording_of_voice(LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE);
let test_signal_duration = test_signal
.total_duration()
.expect("recordings have a length");
@@ -65,7 +75,7 @@ fn test_output_pipeline(cx: &mut gpui::TestAppContext) {
// don't care about the channel count and sample rate, as long as the voice
// signal matches
let expected_output =
recording_of_davids_voice(output_pipeline.channels(), output_pipeline.sample_rate());
recording_of_voice(output_pipeline.channels(), output_pipeline.sample_rate());
rodio::wav_to_file(output_pipeline.clone(), "output_pipeline_output.wav").unwrap();
rodio::wav_to_file(expected_output.clone(), "output_pipeline_expect.wav").unwrap();
assert_similar_voice_spectra(expected_output, output_pipeline);
@@ -74,7 +84,7 @@ fn test_output_pipeline(cx: &mut gpui::TestAppContext) {
// TODO make a perf variant
#[gpui::test]
fn test_full_audio_pipeline(cx: &mut gpui::TestAppContext) {
let test_signal = recording_of_davids_voice(nz!(3), nz!(44_100));
let test_signal = recording_of_voice(nz!(3), nz!(44_100));
let test_signal_duration = test_signal
.total_duration()
.expect("recordings have a length");
@@ -92,15 +102,21 @@ fn test_full_audio_pipeline(cx: &mut gpui::TestAppContext) {
// don't care about the channel count and sample rate, as long as the voice
// signal matches
let expected_output =
recording_of_davids_voice(full_pipeline.channels(), full_pipeline.sample_rate());
let expected_output = recording_of_voice(full_pipeline.channels(), full_pipeline.sample_rate());
rodio::wav_to_file(full_pipeline.clone(), "full_pipeline_output.wav").unwrap();
rodio::wav_to_file(expected_output.clone(), "full_pipeline_expected.wav").unwrap();
rodio::wav_to_file(expected_output.clone(), "full_pipeline_expect.wav").unwrap();
assert_similar_voice_spectra(expected_output, full_pipeline);
}
fn energy_of_spectrum(spectrum: &FrequencySpectrum) -> f32 {
spectrum.max().1.val()
fn human_perceivable_energy(spectrum: &FrequencySpectrum) -> f32 {
spectrum
.data()
.iter()
.filter(|(freq, _)| HUMAN_SPEECH_RANGE.contains(&freq.val()))
.max_by_key(|(_, energy)| energy)
.unwrap()
.1
.val()
}
fn energy_of_chunk(chunk: &[rodio::Sample], sample_rate: SampleRate) -> f32 {
@@ -112,7 +128,7 @@ fn energy_of_chunk(chunk: &[rodio::Sample], sample_rate: SampleRate) -> f32 {
)
.unwrap();
energy_of_spectrum(&spectrum)
human_perceivable_energy(&spectrum)
}
fn maximum_energy(mut a: impl rodio::Source) -> f32 {
@@ -127,10 +143,6 @@ fn maximum_energy(mut a: impl rodio::Source) -> f32 {
.fold(0f32, |max, energy| max.max(energy))
}
const CHUNK_DURATION: Duration = Duration::from_millis(100);
mod detector;
// Test signals should be at least 50% voice
fn assert_similar_voice_spectra(
expected: impl rodio::Source + Clone,
@@ -211,11 +223,17 @@ fn assert_similar_voice_spectra(
);
}
fn spectra_chunk_size(source: &impl Source) -> usize {
((CHUNK_DURATION.as_secs_f64() * source.sample_rate().get() as f64).ceil() as usize)
fn spectra_chunk_size(source: &impl Source, minimum_duration: Duration) -> usize {
((minimum_duration.as_secs_f64() * source.sample_rate().get() as f64) as usize)
.next_power_of_two()
}
fn spectrum_duration(source: &impl Source, minimum_duration: Duration) -> Duration {
Duration::from_secs_f64(
spectra_chunk_size(source, minimum_duration) as f64 / source.sample_rate().get() as f64,
)
}
fn assert_same_voice_signal(
(chunk_start, (expected, pipeline)): (Duration, (FrequencySpectrum, FrequencySpectrum)),
) -> Option<bool> {
@@ -232,7 +250,7 @@ fn assert_same_voice_signal(
panic!(
"Could not find fundamental voice freq in output while there is one in the input at {voice_freq_expected}Hz.\nLoudest 5 frequencies in output:\n{}\n\n{}",
display_loudest_5_frequencies(&pipeline),
plot_spectra(&expected, &pipeline),
plot_spectra(&[(&expected, "expected"), (&pipeline, "pipeline")]),
);
}
(Some(voice_freq_expected), Some(voice_freq_pipeline)) => {
@@ -243,7 +261,7 @@ fn assert_same_voice_signal(
assert!(
less_than_10percent_diff((voice_freq_expected, voice_freq_pipeline)),
"expected: {voice_freq_expected}, pipeline: {voice_freq_pipeline}, at: {chunk_start:?}\n\n{}",
plot_spectra(&expected, &pipeline)
plot_spectra(&[(&expected, "expected"), (&pipeline, "pipeline")])
);
// Guards against voice distortion
@@ -256,14 +274,12 @@ fn assert_same_voice_signal(
}
fn fundamental_voice_freq(spectrum: &FrequencySpectrum) -> Option<f32> {
let human_speech_range = 90.0..260.0;
let spectrum: Vec<_> = spectrum.data().iter().collect();
spectrum
.data()
.iter()
.filter(|(freq, _)| human_speech_range.contains(&freq.val()))
// .inspect(|(freq, ampl)| println!("{freq},{ampl}"))
.max_by(|(_, a_ampl), (_, b_ampl)| a_ampl.val().total_cmp(&b_ampl.val()))
.map(|(freq, _ampl)| freq.val())
.filter(|(freq, _)| HUMAN_SPEECH_RANGE.contains(&freq.val()))
.max_by_key(|(_, energy)| energy)
.map(|(freq, _)| freq.val())
}
fn same_ratio_between_harmonics(
@@ -313,7 +329,7 @@ fn display_loudest_5_frequencies(spectrum: &FrequencySpectrum) -> String {
}
// Returns ascii encoding a link to open the plot
fn plot_spectra(expected: &FrequencySpectrum, pipeline: &FrequencySpectrum) -> String {
pub fn plot_spectra(spectra: &[(&FrequencySpectrum, &str)]) -> String {
use plotly::{Bar, Plot};
let mut plot = Plot::new();
@@ -322,29 +338,16 @@ fn plot_spectra(expected: &FrequencySpectrum, pipeline: &FrequencySpectrum) -> S
// .y_axis(Axis::new().type_(plotly::layout::AxisType::Log));
plot.set_layout(layout);
let (x, y): (Vec<_>, Vec<_>) = expected
.data()
.iter()
.map(|(freq, amplitude)| (freq.val(), amplitude.val()))
.filter(|(freq, _)| *freq > 85.0)
.unzip();
let trace = Bar::new(x, y)
.name("expected")
.show_legend(true)
.opacity(0.5);
plot.add_trace(trace);
let (x, y): (Vec<_>, Vec<_>) = pipeline
.data()
.iter()
.map(|(freq, amplitude)| (freq.val(), amplitude.val()))
.filter(|(freq, _)| *freq > 85.0)
.unzip();
let trace = Bar::new(x, y)
.name("pipeline")
.show_legend(true)
.opacity(0.5);
plot.add_trace(trace);
for (spectrum, label) in spectra {
let (x, y): (Vec<_>, Vec<_>) = spectrum
.data()
.iter()
.map(|(freq, amplitude)| (freq.val(), amplitude.val()))
.filter(|(freq, _)| *freq > 85.0)
.unzip();
let trace = Bar::new(x, y).name(label).show_legend(true).opacity(0.5);
plot.add_trace(trace);
}
let path = current_dir().unwrap().join("plot.html");
plot.write_html(&path);
@@ -378,7 +381,7 @@ pub(crate) fn sine(channels: ChannelCount, sample_rate: SampleRate) -> impl Sour
)
}
pub(crate) fn recording_of_davids_voice(
pub(crate) fn recording_of_voice(
channels: ChannelCount,
sample_rate: SampleRate,
) -> impl Source + Clone {
@@ -407,7 +410,7 @@ pub(crate) fn recording_of_davids_voice(
#[should_panic]
fn test_rejects_pitch_shift() {
// also known as 'robot/chipmunk voice'
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let pitch_shifted = original
.clone()
.speed(1.2) // effectively increases the pitch by 20%
@@ -422,7 +425,7 @@ fn test_rejects_pitch_shift() {
#[test]
#[should_panic]
fn test_rejects_large_amounts_of_noise() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let with_noise = add_noise(&original, 0.5);
assert_similar_voice_spectra(original, with_noise);
@@ -430,7 +433,7 @@ fn test_rejects_large_amounts_of_noise() {
#[test]
fn test_ignores_volume() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let amplified = original.clone().amplify(1.42);
assert_similar_voice_spectra(original, amplified);
@@ -438,7 +441,7 @@ fn test_ignores_volume() {
#[test]
fn test_ignore_low_volume_noise() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
// 5% noise is quite audible: the noise spans all frequencies, so it is
// perceived as far more intense than a voice
let with_noise = add_noise(&original, 0.05);
@@ -469,7 +472,7 @@ fn add_noise(original: &(impl Source + Clone + Send + 'static), amount: f32) ->
#[test]
fn test_ignores_small_shifts() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let shifted = iter::repeat(0f32).take(10).chain(original.clone());
let shifted = SamplesBuffer::new(
original.channels(),

View File

@@ -0,0 +1,196 @@
use crate::RodioExt;
use crate::rodio_ext::ConstantChannelCount;
use crate::test::sine;
use crate::test::spectrum_duration;
use super::human_perceivable_energy;
use rodio::buffer::SamplesBuffer;
use rodio::nz;
use spectrum_analyzer::FrequencyLimit;
use spectrum_analyzer::FrequencySpectrum;
use spectrum_analyzer::scaling::divide_by_N_sqrt;
use spectrum_analyzer::windows::hann_window;
use super::maximum_energy;
use rodio::Source;
use std::time::Duration;
/// A span of time within a recording in which voice was detected.
///
/// `start` and `end` are offsets from the beginning of the recording,
/// with `end >= start`.
#[derive(Debug, Clone)]
pub struct VoiceSegment {
    pub start: Duration,
    pub end: Duration,
}

impl VoiceSegment {
    /// Zero-length sentinel at t = 0. Prepended to a segment list so the
    /// silence before the first real segment falls out of the normal
    /// gap computation.
    const ZERO: Self = Self {
        start: Duration::ZERO,
        end: Duration::ZERO,
    };

    /// Duration of the segment itself.
    fn length(&self) -> Duration {
        self.end - self.start
    }

    /// Gap between the end of `self` and the start of `other`.
    ///
    /// `other` must not start before `self` ends. Adjacent segments
    /// (`self.end == other.start`, e.g. the `ZERO` sentinel followed by a
    /// segment starting at t = 0) are legal and yield a zero gap; the
    /// subtraction below only requires `<=`, so the previous strict `<`
    /// debug-assert was needlessly rejecting that case.
    fn until(&self, other: &Self) -> Duration {
        debug_assert!(self.end <= other.start);
        other.start - self.end
    }
}
/// Crude energy-threshold voice-activity detector used by the audio
/// pipeline tests.
pub(crate) struct BasicVoiceDetector {
    /// Time ranges in which voice was detected, in order, non-overlapping.
    pub(crate) segments_with_voice: Vec<VoiceSegment>,
}

impl BasicVoiceDetector {
    /// Analyzes `source` and records every segment in which the energy in
    /// the human speech range exceeds 40% of the recording's peak energy.
    pub(crate) fn new(source: impl Source + Clone) -> Self {
        // The spectrum analysis only works on mono.
        let source = ConstantChannelCount::new(source, nz!(1)).into_samples_buffer();

        // 20ms chunks give a good time resolution; the effective duration is
        // whatever a power-of-two number of samples rounds that up to.
        let minimum_chunk_duration = Duration::from_millis(20);
        let actual_chunk_duration = spectrum_duration(&source, minimum_chunk_duration);

        // Empirically determined threshold (by looking in audacity),
        // see the 'soup' test for how.
        //
        // While this might seem low, remember humans perceive sound
        // logarithmically. So 40% of energy sounds like 80% volume.
        let threshold = 0.4 * maximum_energy(source.clone());

        let mut segments_with_voice = Vec::new();
        let mut partial_segment: Option<VoiceSegment> = None;
        // Position of the END of the chunk currently being classified.
        // NOTE(review): this is advanced before the chunk is classified, so
        // segment starts land at the end of their first voiced chunk — late
        // by one chunk. Confirm whether that is intended.
        let mut chunk_end = Duration::ZERO;
        for spectrum in iter_spectra(source, actual_chunk_duration) {
            let voice_detected = human_perceivable_energy(&spectrum) > threshold;
            chunk_end += actual_chunk_duration;
            match (&mut partial_segment, voice_detected) {
                // Voice continues: extend the open segment.
                (Some(VoiceSegment { end, .. }), true) => *end = chunk_end,
                // Voice stopped: close the open segment, extending it
                // through the first silent chunk (as before).
                (Some(VoiceSegment { start, .. }), false) => {
                    segments_with_voice.push(VoiceSegment {
                        start: *start,
                        end: chunk_end,
                    });
                    partial_segment = None;
                }
                // Voice just started: open a new segment.
                (None, true) => {
                    partial_segment = Some(VoiceSegment {
                        start: chunk_end,
                        end: chunk_end,
                    })
                }
                (None, false) => {}
            }
        }
        // A segment running to the end of the recording was previously
        // dropped on the floor (the closure-based collect never flushed the
        // open segment); flush it instead.
        if let Some(open_segment) = partial_segment {
            segments_with_voice.push(open_segment);
        }

        Self {
            segments_with_voice,
        }
    }

    /// Total duration of all detected voice segments.
    ///
    /// NOTE(review): despite the name this sums the segments WITH voice,
    /// not the voice-less remainder. Name kept unchanged since callers may
    /// rely on it — confirm and rename in a follow-up.
    pub fn voice_less_duration(&self) -> Duration {
        self.segments_with_voice
            .iter()
            .map(VoiceSegment::length)
            .sum()
    }

    /// Renders a mono signal, spanning `source`, that beeps (sine tone)
    /// wherever voice was detected and is silent everywhere else. Useful
    /// for inspecting the detector's output in an audio editor.
    fn beep_where_voice_detected(&self, source: &impl Source) -> SamplesBuffer {
        // Mono: one sample per point in time. The previous version built the
        // sine with `source.channels()` channels while declaring the result
        // buffer mono, which skewed all timings for non-mono sources.
        let sine = sine(nz!(1), source.sample_rate());

        // Leading zero-length sentinel so the silence before the first real
        // segment is produced by the gap handling below.
        let mut with_voice = [VoiceSegment::ZERO]
            .iter()
            .chain(self.segments_with_voice.iter())
            .peekable();

        let mut samples = Vec::new();
        while let Some(current) = with_voice.next() {
            // Beep for the duration of the segment.
            samples.extend(sine.clone().take_duration(current.length()));
            // Silence until the next segment (if any).
            let Some(next) = with_voice.peek() else {
                break;
            };
            samples.extend(sine.clone().amplify(0.0).take_duration(current.until(next)));
        }
        SamplesBuffer::new(nz!(1), source.sample_rate(), samples)
    }

    /// Returns `source` with one extra channel appended that beeps where
    /// voice was detected, for side-by-side inspection in an audio editor.
    pub fn add_voice_activity_as_channel(mut source: impl Source + Clone) -> impl Source {
        let detector = Self::new(source.clone());
        let mut voice_activity = detector.beep_where_voice_detected(&source).into_iter();

        // The previous version hard-coded two source channels (while still
        // declaring `channels + 1` outputs); read one sample per actual
        // channel instead so mono and multi-channel sources both work.
        let channels = source.channels().get() as usize;
        let mut samples = Vec::new();
        'frames: loop {
            // One frame = one sample per source channel ...
            let mut frame = Vec::with_capacity(channels + 1);
            for _ in 0..channels {
                let Some(sample) = source.next() else {
                    break 'frames;
                };
                frame.push(sample);
            }
            // ... plus the voice-activity sample (mono: one per frame).
            let Some(activity) = voice_activity.next() else {
                break;
            };
            frame.push(activity);
            samples.extend_from_slice(&frame);
        }
        SamplesBuffer::new(
            source.channels().checked_add(1).expect("channel count fits"),
            source.sample_rate(),
            samples,
        )
    }
}
/// Splits `source` into power-of-two-sized chunks of roughly
/// `chunk_duration` and yields one frequency spectrum per chunk.
///
/// The trailing partial chunk (shorter than the chunk size) is discarded
/// by `chunks_exact`.
fn iter_spectra(
    source: impl Source + Clone,
    chunk_duration: Duration,
) -> impl Iterator<Item = FrequencySpectrum> {
    // An infinite source would make the collect below never return.
    assert!(source.total_duration().is_some());

    let chunk_size = super::spectra_chunk_size(&source, chunk_duration);
    // Hoisted: the rate is constant, no need to re-query it per chunk.
    let sample_rate = source.sample_rate().get();
    // Collect the source directly — the previous `source.clone().collect()`
    // copied the entire signal for nothing.
    let samples: Vec<_> = source.collect();

    // Spectra are computed eagerly because a lazy iterator would borrow the
    // local `samples` buffer.
    let spectra: Vec<_> = samples
        .chunks_exact(chunk_size)
        .map(|chunk| {
            super::samples_fft_to_spectrum(
                &hann_window(chunk),
                sample_rate,
                FrequencyLimit::Min(4.0),
                Some(&divide_by_N_sqrt),
            )
            .unwrap()
        })
        .collect();
    spectra.into_iter()
}
#[cfg(test)]
mod test {
    use crate::test::{detector::BasicVoiceDetector, recording_of_voice};
    use rodio::{nz, wav_to_file};

    /// Not an assertion-based test: writes the detector's beep track to
    /// `voice_activity.wav` so detection quality can be checked by ear /
    /// in an audio editor. This is how the energy threshold in
    /// `BasicVoiceDetector::new` was tuned (see the comment there).
    #[test]
    fn soup() {
        // Mono 48kHz — the detector downmixes to mono internally anyway.
        let original = recording_of_voice(nz!(1), nz!(48000));
        let detector = BasicVoiceDetector::new(original.clone());
        let siny = detector.beep_where_voice_detected(&original);
        wav_to_file(siny, "voice_activity.wav").unwrap();
    }
}