improves voice detector

This commit is contained in:
David
2025-11-15 01:06:24 +01:00
parent f602047b3c
commit c78baaf71f
5 changed files with 278 additions and 79 deletions

View File

@@ -184,26 +184,26 @@ impl Audio {
let stream = raw_mic_input
.possibly_disconnected_channels_to_mono()
.constant_samplerate(SAMPLE_RATE)
.limit(LimitSettings::live_performance())
.process_buffer::<BUFFER_SIZE, _>(move |buffer| {
let mut int_buffer: [i16; _] = buffer.map(|s| s.to_sample());
if voip_parts
.echo_canceller
.lock()
.process_stream(
&mut int_buffer,
SAMPLE_RATE.get() as i32,
CHANNEL_COUNT.get() as i32,
)
.context("livekit audio processor error")
.log_err()
.is_some()
{
for (sample, processed) in buffer.iter_mut().zip(&int_buffer) {
*sample = (*processed).to_sample();
}
}
});
.limit(LimitSettings::live_performance());
// .process_buffer::<BUFFER_SIZE, _>(move |buffer| {
// let mut int_buffer: [i16; _] = buffer.map(|s| s.to_sample());
// if voip_parts
// .echo_canceller
// .lock()
// .process_stream(
// &mut int_buffer,
// SAMPLE_RATE.get() as i32,
// CHANNEL_COUNT.get() as i32,
// )
// .context("livekit audio processor error")
// .log_err()
// .is_some()
// {
// for (sample, processed) in buffer.iter_mut().zip(&int_buffer) {
// *sample = (*processed).to_sample();
// }
// }
// });
// .denoise()
// .context("Could not set up denoiser")?
// .automatic_gain_control(automatic_gain_control_settings())

View File

@@ -114,7 +114,7 @@ pub struct ConstantChannelCount<S: Source> {
}
impl<S: Source> ConstantChannelCount<S> {
fn new(source: S, target_channels: ChannelCount) -> Self {
pub fn new(source: S, target_channels: ChannelCount) -> Self {
let input_channels = source.channels();
let sample_rate = source.sample_rate();
let inner = ChannelCountConverter::new(source, input_channels, target_channels);

View File

@@ -142,7 +142,7 @@ mod tests {
use std::time::Duration;
use crate::{
test::{recording_of_davids_voice, sine},
test::{recording_of_voice, sine},
RodioExt,
};
use itertools::Itertools;
@@ -214,7 +214,7 @@ mod tests {
#[test]
fn constant_samplerate_preserves_length() {
let test_signal = recording_of_davids_voice(nz!(3), nz!(48_000));
let test_signal = recording_of_voice(nz!(3), nz!(48_000));
let resampled = test_signal.clone().constant_samplerate(nz!(16_000));
let diff_in_length = test_signal

View File

@@ -4,6 +4,7 @@
use std::env::current_dir;
use std::io::Cursor;
use std::iter;
use std::ops::Range;
use std::sync::atomic::Ordering;
use std::time::Duration;
@@ -21,10 +22,14 @@ use crate::audio_settings::LIVE_SETTINGS;
use crate::test::detector::BasicVoiceDetector;
use crate::{Audio, LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE, RodioExt, VoipParts};
mod detector;
// Fundamental frequency range of human speech, in Hz
const HUMAN_SPEECH_RANGE: Range<f32> = 90.0..260.0;
#[gpui::test]
fn test_input_pipeline(cx: &mut gpui::TestAppContext) {
// strange params to invite bugs to show themselves
let test_signal = recording_of_davids_voice(nz!(3), nz!(48_000));
let test_signal = recording_of_voice(nz!(3), nz!(48_000));
let test_signal_duration = test_signal
.total_duration()
.expect("recordings have a length");
@@ -38,15 +43,20 @@ fn test_input_pipeline(cx: &mut gpui::TestAppContext) {
.into_samples_buffer();
let expected_output =
recording_of_davids_voice(input_pipeline.channels(), input_pipeline.sample_rate());
rodio::wav_to_file(input_pipeline.clone(), "input_pipeline_output.wav").unwrap();
recording_of_voice(input_pipeline.channels(), input_pipeline.sample_rate());
rodio::wav_to_file(
BasicVoiceDetector::add_voice_activity_as_channel(input_pipeline.clone()),
"input_pipeline_output.wav",
)
.unwrap();
rodio::wav_to_file(expected_output.clone(), "input_pipeline_expect.wav").unwrap();
assert_similar_voice_spectra(expected_output, input_pipeline);
}
#[gpui::test]
fn test_output_pipeline(cx: &mut gpui::TestAppContext) {
let test_signal = recording_of_davids_voice(LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE);
let test_signal = recording_of_voice(LEGACY_CHANNEL_COUNT, LEGACY_SAMPLE_RATE);
let test_signal_duration = test_signal
.total_duration()
.expect("recordings have a length");
@@ -65,7 +75,7 @@ fn test_output_pipeline(cx: &mut gpui::TestAppContext) {
// don't care about the channel count and sample rate, as long as the voice
// signal matches
let expected_output =
recording_of_davids_voice(output_pipeline.channels(), output_pipeline.sample_rate());
recording_of_voice(output_pipeline.channels(), output_pipeline.sample_rate());
rodio::wav_to_file(output_pipeline.clone(), "output_pipeline_output.wav").unwrap();
rodio::wav_to_file(expected_output.clone(), "output_pipeline_expect.wav").unwrap();
assert_similar_voice_spectra(expected_output, output_pipeline);
@@ -74,7 +84,7 @@ fn test_output_pipeline(cx: &mut gpui::TestAppContext) {
// TODO make a perf variant
#[gpui::test]
fn test_full_audio_pipeline(cx: &mut gpui::TestAppContext) {
let test_signal = recording_of_davids_voice(nz!(3), nz!(44_100));
let test_signal = recording_of_voice(nz!(3), nz!(44_100));
let test_signal_duration = test_signal
.total_duration()
.expect("recordings have a length");
@@ -92,15 +102,21 @@ fn test_full_audio_pipeline(cx: &mut gpui::TestAppContext) {
// don't care about the channel count and sample rate, as long as the voice
// signal matches
let expected_output =
recording_of_davids_voice(full_pipeline.channels(), full_pipeline.sample_rate());
let expected_output = recording_of_voice(full_pipeline.channels(), full_pipeline.sample_rate());
rodio::wav_to_file(full_pipeline.clone(), "full_pipeline_output.wav").unwrap();
rodio::wav_to_file(expected_output.clone(), "full_pipeline_expected.wav").unwrap();
rodio::wav_to_file(expected_output.clone(), "full_pipeline_expect.wav").unwrap();
assert_similar_voice_spectra(expected_output, full_pipeline);
}
fn energy_of_spectrum(spectrum: &FrequencySpectrum) -> f32 {
spectrum.max().1.val()
fn human_perceivable_energy(spectrum: &FrequencySpectrum) -> f32 {
spectrum
.data()
.iter()
.filter(|(freq, _)| HUMAN_SPEECH_RANGE.contains(&freq.val()))
.max_by_key(|(_, energy)| energy)
.unwrap()
.1
.val()
}
fn energy_of_chunk(chunk: &[rodio::Sample], sample_rate: SampleRate) -> f32 {
@@ -112,7 +128,7 @@ fn energy_of_chunk(chunk: &[rodio::Sample], sample_rate: SampleRate) -> f32 {
)
.unwrap();
energy_of_spectrum(&spectrum)
human_perceivable_energy(&spectrum)
}
fn maximum_energy(mut a: impl rodio::Source) -> f32 {
@@ -127,10 +143,6 @@ fn maximum_energy(mut a: impl rodio::Source) -> f32 {
.fold(0f32, |max, energy| max.max(energy))
}
const CHUNK_DURATION: Duration = Duration::from_millis(100);
mod detector;
// Test signals should be at least 50% voice
fn assert_similar_voice_spectra(
expected: impl rodio::Source + Clone,
@@ -211,11 +223,17 @@ fn assert_similar_voice_spectra(
);
}
fn spectra_chunk_size(source: &impl Source) -> usize {
((CHUNK_DURATION.as_secs_f64() * source.sample_rate().get() as f64).ceil() as usize)
fn spectra_chunk_size(source: &impl Source, minimum_duration: Duration) -> usize {
((minimum_duration.as_secs_f64() * source.sample_rate().get() as f64) as usize)
.next_power_of_two()
}
fn spectrum_duration(source: &impl Source, minimum_duration: Duration) -> Duration {
Duration::from_secs_f64(
spectra_chunk_size(source, minimum_duration) as f64 / source.sample_rate().get() as f64,
)
}
fn assert_same_voice_signal(
(chunk_start, (expected, pipeline)): (Duration, (FrequencySpectrum, FrequencySpectrum)),
) -> Option<bool> {
@@ -232,7 +250,7 @@ fn assert_same_voice_signal(
panic!(
"Could not find fundamental voice freq in output while there is one in the input at {voice_freq_expected}Hz.\nLoudest 5 frequencies in output:\n{}\n\n{}",
display_loudest_5_frequencies(&pipeline),
plot_spectra(&expected, &pipeline),
plot_spectra(&[(&expected, "expected"), (&pipeline, "pipeline")]),
);
}
(Some(voice_freq_expected), Some(voice_freq_pipeline)) => {
@@ -243,7 +261,7 @@ fn assert_same_voice_signal(
assert!(
less_than_10percent_diff((voice_freq_expected, voice_freq_pipeline)),
"expected: {voice_freq_expected}, pipeline: {voice_freq_pipeline}, at: {chunk_start:?}\n\n{}",
plot_spectra(&expected, &pipeline)
plot_spectra(&[(&expected, "expected"), (&pipeline, "pipeline")])
);
// Guards against voice distortion
@@ -256,14 +274,12 @@ fn assert_same_voice_signal(
}
fn fundamental_voice_freq(spectrum: &FrequencySpectrum) -> Option<f32> {
let human_speech_range = 90.0..260.0;
let spectrum: Vec<_> = spectrum.data().iter().collect();
spectrum
.data()
.iter()
.filter(|(freq, _)| human_speech_range.contains(&freq.val()))
// .inspect(|(freq, ampl)| println!("{freq},{ampl}"))
.max_by(|(_, a_ampl), (_, b_ampl)| a_ampl.val().total_cmp(&b_ampl.val()))
.map(|(freq, _ampl)| freq.val())
.filter(|(freq, _)| HUMAN_SPEECH_RANGE.contains(&freq.val()))
.max_by_key(|(_, energy)| energy)
.map(|(freq, _)| freq.val())
}
fn same_ratio_between_harmonics(
@@ -313,7 +329,7 @@ fn display_loudest_5_frequencies(spectrum: &FrequencySpectrum) -> String {
}
// Returns ascii encoding a link to open the plot
fn plot_spectra(expected: &FrequencySpectrum, pipeline: &FrequencySpectrum) -> String {
pub fn plot_spectra(spectra: &[(&FrequencySpectrum, &str)]) -> String {
use plotly::{Bar, Plot};
let mut plot = Plot::new();
@@ -322,29 +338,16 @@ fn plot_spectra(expected: &FrequencySpectrum, pipeline: &FrequencySpectrum) -> S
// .y_axis(Axis::new().type_(plotly::layout::AxisType::Log));
plot.set_layout(layout);
let (x, y): (Vec<_>, Vec<_>) = expected
.data()
.iter()
.map(|(freq, amplitude)| (freq.val(), amplitude.val()))
.filter(|(freq, _)| *freq > 85.0)
.unzip();
let trace = Bar::new(x, y)
.name("expected")
.show_legend(true)
.opacity(0.5);
plot.add_trace(trace);
let (x, y): (Vec<_>, Vec<_>) = pipeline
.data()
.iter()
.map(|(freq, amplitude)| (freq.val(), amplitude.val()))
.filter(|(freq, _)| *freq > 85.0)
.unzip();
let trace = Bar::new(x, y)
.name("pipeline")
.show_legend(true)
.opacity(0.5);
plot.add_trace(trace);
for (spectrum, label) in spectra {
let (x, y): (Vec<_>, Vec<_>) = spectrum
.data()
.iter()
.map(|(freq, amplitude)| (freq.val(), amplitude.val()))
.filter(|(freq, _)| *freq > 85.0)
.unzip();
let trace = Bar::new(x, y).name(label).show_legend(true).opacity(0.5);
plot.add_trace(trace);
}
let path = current_dir().unwrap().join("plot.html");
plot.write_html(&path);
@@ -378,7 +381,7 @@ pub(crate) fn sine(channels: ChannelCount, sample_rate: SampleRate) -> impl Sour
)
}
pub(crate) fn recording_of_davids_voice(
pub(crate) fn recording_of_voice(
channels: ChannelCount,
sample_rate: SampleRate,
) -> impl Source + Clone {
@@ -407,7 +410,7 @@ pub(crate) fn recording_of_davids_voice(
#[should_panic]
fn test_rejects_pitch_shift() {
// also known as 'robot/chipmunk voice'
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let pitch_shifted = original
.clone()
.speed(1.2) // effectively increases the pitch by 20%
@@ -422,7 +425,7 @@ fn test_rejects_pitch_shift() {
#[test]
#[should_panic]
fn test_rejects_large_amounts_of_noise() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let with_noise = add_noise(&original, 0.5);
assert_similar_voice_spectra(original, with_noise);
@@ -430,7 +433,7 @@ fn test_rejects_large_amounts_of_noise() {
#[test]
fn test_ignores_volume() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let amplified = original.clone().amplify(1.42);
assert_similar_voice_spectra(original, amplified);
@@ -438,7 +441,7 @@ fn test_ignores_volume() {
#[test]
fn test_ignore_low_volume_noise() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
// 5% noise is quite audible: the noise spans all frequencies, so it is
// perceived as far more intense than a voice
let with_noise = add_noise(&original, 0.05);
@@ -469,7 +472,7 @@ fn add_noise(original: &(impl Source + Clone + Send + 'static), amount: f32) ->
#[test]
fn test_ignores_small_shifts() {
let original = recording_of_davids_voice(nz!(1), nz!(44100));
let original = recording_of_voice(nz!(1), nz!(44100));
let shifted = iter::repeat(0f32).take(10).chain(original.clone());
let shifted = SamplesBuffer::new(
original.channels(),

View File

@@ -0,0 +1,196 @@
use crate::RodioExt;
use crate::rodio_ext::ConstantChannelCount;
use crate::test::sine;
use crate::test::spectrum_duration;
use super::human_perceivable_energy;
use rodio::buffer::SamplesBuffer;
use rodio::nz;
use spectrum_analyzer::FrequencyLimit;
use spectrum_analyzer::FrequencySpectrum;
use spectrum_analyzer::scaling::divide_by_N_sqrt;
use spectrum_analyzer::windows::hann_window;
use super::maximum_energy;
use rodio::Source;
use std::time::Duration;
/// A span of time within a recording in which voice was detected.
///
/// `start` and `end` are offsets from the beginning of the recording,
/// with `end >= start`.
#[derive(Debug, Clone)]
pub struct VoiceSegment {
    pub start: Duration,
    pub end: Duration,
}

impl VoiceSegment {
    /// Zero-length sentinel at t = 0. Prepended to a segment list so the
    /// silence before the first real segment falls out of the normal
    /// gap computation.
    const ZERO: Self = Self {
        start: Duration::ZERO,
        end: Duration::ZERO,
    };

    /// Duration of the segment itself.
    fn length(&self) -> Duration {
        self.end - self.start
    }

    /// Gap between the end of `self` and the start of `other`.
    ///
    /// `other` must not start before `self` ends. Adjacent segments
    /// (`self.end == other.start`, e.g. the `ZERO` sentinel followed by a
    /// segment starting at t = 0) are legal and yield a zero gap; the
    /// subtraction below only requires `<=`, so the previous strict `<`
    /// debug-assert was needlessly rejecting that case.
    fn until(&self, other: &Self) -> Duration {
        debug_assert!(self.end <= other.start);
        other.start - self.end
    }
}
/// Crude energy-threshold voice-activity detector used by the audio
/// pipeline tests.
pub(crate) struct BasicVoiceDetector {
    /// Time ranges in which voice was detected, in order, non-overlapping.
    pub(crate) segments_with_voice: Vec<VoiceSegment>,
}

impl BasicVoiceDetector {
    /// Analyzes `source` and records every segment in which the energy in
    /// the human speech range exceeds 40% of the recording's peak energy.
    pub(crate) fn new(source: impl Source + Clone) -> Self {
        // The spectrum analysis only works on mono.
        let source = ConstantChannelCount::new(source, nz!(1)).into_samples_buffer();

        // 20ms chunks give a good time resolution; the effective duration is
        // whatever a power-of-two number of samples rounds that up to.
        let minimum_chunk_duration = Duration::from_millis(20);
        let actual_chunk_duration = spectrum_duration(&source, minimum_chunk_duration);

        // Empirically determined threshold (by looking in audacity),
        // see the 'soup' test for how.
        //
        // While this might seem low, remember humans perceive sound
        // logarithmically. So 40% of energy sounds like 80% volume.
        let threshold = 0.4 * maximum_energy(source.clone());

        let mut segments_with_voice = Vec::new();
        let mut partial_segment: Option<VoiceSegment> = None;
        // Position of the END of the chunk currently being classified.
        // NOTE(review): this is advanced before the chunk is classified, so
        // segment starts land at the end of their first voiced chunk — late
        // by one chunk. Confirm whether that is intended.
        let mut chunk_end = Duration::ZERO;
        for spectrum in iter_spectra(source, actual_chunk_duration) {
            let voice_detected = human_perceivable_energy(&spectrum) > threshold;
            chunk_end += actual_chunk_duration;
            match (&mut partial_segment, voice_detected) {
                // Voice continues: extend the open segment.
                (Some(VoiceSegment { end, .. }), true) => *end = chunk_end,
                // Voice stopped: close the open segment, extending it
                // through the first silent chunk (as before).
                (Some(VoiceSegment { start, .. }), false) => {
                    segments_with_voice.push(VoiceSegment {
                        start: *start,
                        end: chunk_end,
                    });
                    partial_segment = None;
                }
                // Voice just started: open a new segment.
                (None, true) => {
                    partial_segment = Some(VoiceSegment {
                        start: chunk_end,
                        end: chunk_end,
                    })
                }
                (None, false) => {}
            }
        }
        // A segment running to the end of the recording was previously
        // dropped on the floor (the closure-based collect never flushed the
        // open segment); flush it instead.
        if let Some(open_segment) = partial_segment {
            segments_with_voice.push(open_segment);
        }

        Self {
            segments_with_voice,
        }
    }

    /// Total duration of all detected voice segments.
    ///
    /// NOTE(review): despite the name this sums the segments WITH voice,
    /// not the voice-less remainder. Name kept unchanged since callers may
    /// rely on it — confirm and rename in a follow-up.
    pub fn voice_less_duration(&self) -> Duration {
        self.segments_with_voice
            .iter()
            .map(VoiceSegment::length)
            .sum()
    }

    /// Renders a mono signal, spanning `source`, that beeps (sine tone)
    /// wherever voice was detected and is silent everywhere else. Useful
    /// for inspecting the detector's output in an audio editor.
    fn beep_where_voice_detected(&self, source: &impl Source) -> SamplesBuffer {
        // Mono: one sample per point in time. The previous version built the
        // sine with `source.channels()` channels while declaring the result
        // buffer mono, which skewed all timings for non-mono sources.
        let sine = sine(nz!(1), source.sample_rate());

        // Leading zero-length sentinel so the silence before the first real
        // segment is produced by the gap handling below.
        let mut with_voice = [VoiceSegment::ZERO]
            .iter()
            .chain(self.segments_with_voice.iter())
            .peekable();

        let mut samples = Vec::new();
        while let Some(current) = with_voice.next() {
            // Beep for the duration of the segment.
            samples.extend(sine.clone().take_duration(current.length()));
            // Silence until the next segment (if any).
            let Some(next) = with_voice.peek() else {
                break;
            };
            samples.extend(sine.clone().amplify(0.0).take_duration(current.until(next)));
        }
        SamplesBuffer::new(nz!(1), source.sample_rate(), samples)
    }

    /// Returns `source` with one extra channel appended that beeps where
    /// voice was detected, for side-by-side inspection in an audio editor.
    pub fn add_voice_activity_as_channel(mut source: impl Source + Clone) -> impl Source {
        let detector = Self::new(source.clone());
        let mut voice_activity = detector.beep_where_voice_detected(&source).into_iter();

        // The previous version hard-coded two source channels (while still
        // declaring `channels + 1` outputs); read one sample per actual
        // channel instead so mono and multi-channel sources both work.
        let channels = source.channels().get() as usize;
        let mut samples = Vec::new();
        'frames: loop {
            // One frame = one sample per source channel ...
            let mut frame = Vec::with_capacity(channels + 1);
            for _ in 0..channels {
                let Some(sample) = source.next() else {
                    break 'frames;
                };
                frame.push(sample);
            }
            // ... plus the voice-activity sample (mono: one per frame).
            let Some(activity) = voice_activity.next() else {
                break;
            };
            frame.push(activity);
            samples.extend_from_slice(&frame);
        }
        SamplesBuffer::new(
            source.channels().checked_add(1).expect("channel count fits"),
            source.sample_rate(),
            samples,
        )
    }
}
/// Splits `source` into power-of-two-sized chunks of roughly
/// `chunk_duration` and yields one frequency spectrum per chunk.
///
/// The trailing partial chunk (shorter than the chunk size) is discarded
/// by `chunks_exact`.
fn iter_spectra(
    source: impl Source + Clone,
    chunk_duration: Duration,
) -> impl Iterator<Item = FrequencySpectrum> {
    // An infinite source would make the collect below never return.
    assert!(source.total_duration().is_some());

    let chunk_size = super::spectra_chunk_size(&source, chunk_duration);
    // Hoisted: the rate is constant, no need to re-query it per chunk.
    let sample_rate = source.sample_rate().get();
    // Collect the source directly — the previous `source.clone().collect()`
    // copied the entire signal for nothing.
    let samples: Vec<_> = source.collect();

    // Spectra are computed eagerly because a lazy iterator would borrow the
    // local `samples` buffer.
    let spectra: Vec<_> = samples
        .chunks_exact(chunk_size)
        .map(|chunk| {
            super::samples_fft_to_spectrum(
                &hann_window(chunk),
                sample_rate,
                FrequencyLimit::Min(4.0),
                Some(&divide_by_N_sqrt),
            )
            .unwrap()
        })
        .collect();
    spectra.into_iter()
}
#[cfg(test)]
mod test {
    use crate::test::{detector::BasicVoiceDetector, recording_of_voice};
    use rodio::{nz, wav_to_file};

    /// Not an assertion-based test: writes the detector's beep track to
    /// `voice_activity.wav` so detection quality can be checked by ear /
    /// in an audio editor. This is how the energy threshold in
    /// `BasicVoiceDetector::new` was tuned (see the comment there).
    #[test]
    fn soup() {
        // Mono 48kHz — the detector downmixes to mono internally anyway.
        let original = recording_of_voice(nz!(1), nz!(48000));
        let detector = BasicVoiceDetector::new(original.clone());
        let siny = detector.beep_where_voice_detected(&original);
        wav_to_file(siny, "voice_activity.wav").unwrap();
    }
}