Skip to content

Commit

Permalink
Add support for VAD (Voice Activity Detection) powered by libfvad
Browse files Browse the repository at this point in the history
  • Loading branch information
gtreshchev committed Jun 1, 2024
1 parent 583c931 commit a52e376
Show file tree
Hide file tree
Showing 33 changed files with 3,091 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
## Additional information

MP3, WAV, and FLAC audio transcoding operations are powered by [dr_libs](https://github.com/mackron/dr_libs) and [minimp3](https://github.com/lieff/minimp3).
VAD (Voice Activity Detection) is powered by [libfvad](https://github.com/dpirch/libfvad).

## Legal info

Expand Down
Binary file added Resources/Documentation/resetvad.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Resources/Documentation/setvadmode.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Resources/Documentation/togglevad.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
47 changes: 47 additions & 0 deletions Source/RuntimeAudioImporter/Private/Sound/StreamingSoundWave.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include "Async/Async.h"
#include "SampleBuffer.h"
#include "VAD/RuntimeVoiceActivityDetector.h"
#include "UObject/WeakObjectPtrTemplates.h"

UStreamingSoundWave::UStreamingSoundWave(const FObjectInitializer& ObjectInitializer)
Expand All @@ -29,6 +30,32 @@ UStreamingSoundWave::UStreamingSoundWave(const FObjectInitializer& ObjectInitial
}
}

/**
 * Enables or disables voice activity detection for this sound wave.
 * Enabling always creates a new detector object (replacing any existing one);
 * disabling clears the reference.
 *
 * @param bVAD Whether VAD should be enabled
 * @return Always true
 */
bool UStreamingSoundWave::ToggleVAD(bool bVAD)
{
	if (bVAD)
	{
		VADInstance = NewObject<URuntimeVoiceActivityDetector>();
	}
	else
	{
		VADInstance = nullptr;
	}

	return true;
}

/**
 * Forwards a reset request to the detector owned by this sound wave.
 *
 * @return The detector's own result, or false (with an error logged) when no detector exists
 */
bool UStreamingSoundWave::ResetVAD()
{
	// Nothing to reset unless VAD has been toggled on
	if (!VADInstance)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to reset VAD as the VAD instance is not valid"));
		return false;
	}

	return VADInstance->ResetVAD();
}

/**
 * Forwards a mode change to the detector owned by this sound wave.
 *
 * @param Mode The VAD mode to apply
 * @return The detector's own result, or false (with an error logged) when no detector exists
 */
bool UStreamingSoundWave::SetVADMode(ERuntimeVADMode Mode)
{
	// The mode can only be applied once VAD has been toggled on
	if (!VADInstance)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to set VAD mode as the VAD instance is not valid"));
		return false;
	}

	return VADInstance->SetVADMode(Mode);
}

void UStreamingSoundWave::PopulateAudioDataFromDecodedInfo(FDecodedAudioStruct&& DecodedAudioInfo)
{
{
Expand Down Expand Up @@ -307,6 +334,26 @@ void UStreamingSoundWave::AppendAudioDataFromRAW(TArray<uint8> RAWData, ERuntime
return;
}

#if WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT
// Process VAD if necessary
if (VADInstance)
{
bool bDetected = VADInstance->ProcessVAD(TArray<float>(reinterpret_cast<const float*>(Float32DataPtr), static_cast<int32>(NumOfSamples)),
#if UE_VERSION_NEWER_THAN(4, 25, 0)
InSampleRate
#else
WeakThis->AudioCapture.GetSampleRate()
#endif
, NumOfChannels);
if (!bDetected)
{
UE_LOG(LogRuntimeAudioImporter, Warning, TEXT("VAD detected silence, skipping audio data append"));
return;
}
UE_LOG(LogRuntimeAudioImporter, Log, TEXT("VAD detected voice, appending audio data"));
}
#endif

FDecodedAudioStruct DecodedAudioInfo;
{
FPCMStruct PCMInfo;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
// Georgy Treshchev 2024.

#include "VAD/RuntimeVoiceActivityDetector.h"

#include "RuntimeAudioImporterDefines.h"
#include "RuntimeAudioImporterTypes.h"
#include "VADIncludes.h"
#include "HAL/UnrealMemory.h"
#include "Codecs/RAW_RuntimeCodec.h"
#if !UE_VERSION_OLDER_THAN(5, 1, 0)
#include "DSP/FloatArrayMath.h"
#endif

/**
 * Creates the native VAD handle (when VAD support is compiled in) and applies the
 * VeryAggressive mode to it. Failures are logged; the object is still constructed.
 */
URuntimeVoiceActivityDetector::URuntimeVoiceActivityDetector()
	: AppliedSampleRate(0)
#if WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT
	// The separating comma lives inside the guarded section so that the initializer list
	// stays well-formed when VAD support is compiled out
	, VADInstance(nullptr)
#endif
{
#if WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT
	VADInstance = fvad_new();
	if (VADInstance)
	{
		UE_LOG(LogRuntimeAudioImporter, VeryVerbose, TEXT("Successfully created VAD instance for %s"), *GetName());
		SetVADMode(ERuntimeVADMode::VeryAggressive);
	}
	else
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to create VAD instance for %s"), *GetName());
	}
#else
	UE_LOG(LogRuntimeAudioImporter, Error, TEXT("VAD support is disabled, unable to create VAD instance for %s"), *GetName());
#endif
}

/**
 * Frees the native VAD handle (if one exists) before handing destruction over to UObject.
 */
void URuntimeVoiceActivityDetector::BeginDestroy()
{
#if WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT
	// Clear the member first so the handle can never be observed after it has been freed
	if (auto* NativeHandle = VADInstance)
	{
		VADInstance = nullptr;
		fvad_free(NativeHandle);
	}
#endif

	UObject::BeginDestroy();
}

/**
 * Resets the detector to its initial state: clears the native VAD state, discards any
 * audio that was buffered but not yet analysed, and re-applies the VeryAggressive mode.
 *
 * @return True if the reset succeeded; false if the native handle is missing, the default
 *         mode could not be re-applied, or VAD support is compiled out
 */
bool URuntimeVoiceActivityDetector::ResetVAD()
{
#if WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT
	if (!VADInstance)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to reset VAD for %s as the VAD instance is not valid"), *GetName());
		return false;
	}

	fvad_reset(VADInstance);

	// Force ProcessVAD to re-apply the sample rate to the freshly reset native instance
	AppliedSampleRate = 0;

	// Samples buffered before the reset belong to the previous stream and must not be
	// fed into the freshly reset detector
	AccumulatedPCMData.Reset();

	// Restore the mode this class applies at construction; SetVADMode logs the specific reason on failure
	if (!SetVADMode(ERuntimeVADMode::VeryAggressive))
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to reset VAD for %s as the default VAD mode could not be applied"), *GetName());
		return false;
	}

	UE_LOG(LogRuntimeAudioImporter, Log, TEXT("Successfully reset VAD for %s"), *GetName());
	return true;
#else
	UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to reset VAD for %s as VAD support is disabled"), *GetName());
	return false;
#endif
}

/**
 * Applies the given mode to the native VAD handle.
 *
 * @param Mode The VAD mode to apply
 * @return True if the mode was applied; false if the native handle is missing, the mode
 *         was rejected, or VAD support is compiled out
 */
bool URuntimeVoiceActivityDetector::SetVADMode(ERuntimeVADMode Mode)
{
#if !WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT
	UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to set VAD mode for %s as VAD support is disabled"), *GetName());
	return false;
#else
	if (VADInstance == nullptr)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to set VAD mode for %s as the VAD instance is not valid"), *GetName());
		return false;
	}

	// Translate the enum into the integer mode expected by the native call
	const auto NativeMode = VoiceActivityDetector::GetVADModeInt(Mode);
	const bool bModeAccepted = fvad_set_mode(VADInstance, NativeMode) == 0;
	if (!bModeAccepted)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to set VAD mode for %s as the mode is invalid"), *GetName());
		return false;
	}

	UE_LOG(LogRuntimeAudioImporter, Log, TEXT("Successfully set VAD mode for %s to %s"), *GetName(), *UEnum::GetValueAsName(Mode).ToString());
	return true;
#endif
}

/**
 * Feeds a chunk of interleaved 32-bit float PCM audio into the voice activity detector.
 *
 * The chunk is mixed down to mono, resampled to 8 kHz, converted to 16-bit PCM and appended
 * to an internal buffer. Every complete 10/20/30 ms frame available in that buffer is then
 * analysed; a remainder shorter than 10 ms is kept for the next call.
 *
 * @param PCMData Interleaved 32-bit float PCM samples
 * @param InSampleRate Sample rate of PCMData
 * @param NumOfChannels Number of interleaved channels in PCMData
 * @return True if voice was detected in at least one analysed frame; false if no voice was
 *         detected, not enough audio has been accumulated yet, or an error occurred
 */
bool URuntimeVoiceActivityDetector::ProcessVAD(TArray<float> PCMData, int32 InSampleRate, int32 NumOfChannels)
{
#if WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT
	if (!VADInstance)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to process VAD for %s as the VAD instance is not valid"), *GetName());
		return false;
	}
	if (PCMData.Num() == 0)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to process VAD for %s as the PCM data is empty"), *GetName());
		return false;
	}
	if (InSampleRate <= 0)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to process VAD for %s as the sample rate is invalid"), *GetName());
		return false;
	}
	if (NumOfChannels <= 0)
	{
		UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to process VAD for %s as the number of channels is invalid"), *GetName());
		return false;
	}

	Audio::FAlignedFloatBuffer AlignedPCMData;
	AlignedPCMData = MoveTemp(PCMData);

	// Mix channels if necessary (VAD only supports mono audio data)
	if (NumOfChannels > 1)
	{
		Audio::FAlignedFloatBuffer AlignedPCMData_Mixed;
		if (!FRAW_RuntimeCodec::MixChannelsRAWData(AlignedPCMData, InSampleRate, NumOfChannels, 1, AlignedPCMData_Mixed))
		{
			UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to mix audio data for %s"), *GetName());
			return false;
		}
		AlignedPCMData = MoveTemp(AlignedPCMData_Mixed);
	}

	// Resample the audio data if necessary (the detector is always fed 8 kHz audio data)
	static constexpr int32 VADTargetSampleRate = 8000;
	if (InSampleRate != VADTargetSampleRate)
	{
		Audio::FAlignedFloatBuffer AlignedPCMData_Resampled;
		if (!FRAW_RuntimeCodec::ResampleRAWData(AlignedPCMData, 1, InSampleRate, VADTargetSampleRate, AlignedPCMData_Resampled))
		{
			UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to resample audio data for %s"), *GetName());
			return false;
		}

		AlignedPCMData = MoveTemp(AlignedPCMData_Resampled);
		InSampleRate = VADTargetSampleRate;
		UE_LOG(LogRuntimeAudioImporter, Verbose, TEXT("Successfully resampled audio data for %s to %d sample rate"), *GetName(), VADTargetSampleRate);
	}

	// Apply the sample rate to the VAD instance if it is different from the current sample rate
	if (AppliedSampleRate != InSampleRate)
	{
		if (fvad_set_sample_rate(VADInstance, InSampleRate) != 0)
		{
			UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to set VAD sample rate for %s"), *GetName());
			return false;
		}
		AppliedSampleRate = InSampleRate;
		UE_LOG(LogRuntimeAudioImporter, Verbose, TEXT("Successfully set VAD sample rate for %s to %d"), *GetName(), AppliedSampleRate);
	}

	// Convert float PCM data to int16 PCM data
	TArray<int16> Int16PCMData;
	{
#if UE_VERSION_OLDER_THAN(5, 1, 0)
		int16* Int16PCMDataPtr;
		FRAW_RuntimeCodec::TranscodeRAWData<float, int16>(AlignedPCMData.GetData(), AlignedPCMData.Num(), Int16PCMDataPtr);
		Int16PCMData.Append(Int16PCMDataPtr, AlignedPCMData.Num());
		FMemory::Free(Int16PCMDataPtr);
#else
		Int16PCMData.AddUninitialized(AlignedPCMData.Num());
		Audio::ArrayFloatToPcm16(MakeArrayView(AlignedPCMData), MakeArrayView(Int16PCMData));
#endif
	}

	// Append the new PCM data to the accumulated data
	AccumulatedPCMData.Append(MoveTemp(Int16PCMData));

	// VAD only supports frames of 10, 20 or 30 ms, i.e. 1-3 multiples of this sample count
	const int32 NumSamplesPer10Ms = AppliedSampleRate / 100;
	const int32 NumAccumulated = AccumulatedPCMData.Num();

	if (NumAccumulated < NumSamplesPer10Ms)
	{
		const float AudioDataLengthMs = static_cast<float>(NumAccumulated) / static_cast<float>(AppliedSampleRate) * 1000;
		UE_LOG(LogRuntimeAudioImporter, Verbose, TEXT("Accumulating audio data until it reaches 10, 20 or 30 ms for %s. Current length: %f ms"), *GetName(), AudioDataLengthMs);
		return false;
	}

	// Drain every complete frame that is available. Processing only a single frame per call
	// would let the backlog grow without bound whenever a caller delivers more than 30 ms of
	// audio at a time, and the decisions would refer to increasingly stale audio.
	bool bVoiceDetected = false;
	bool bProcessingFailed = false;
	int32 NumConsumed = 0;
	while (NumAccumulated - NumConsumed >= NumSamplesPer10Ms)
	{
		const int32 NumRemaining = NumAccumulated - NumConsumed;

		// Prefer the longest supported frame that fits into the remaining data
		int32 FrameLengthMs = 10;
		if (NumRemaining >= NumSamplesPer10Ms * 3)
		{
			FrameLengthMs = 30;
		}
		else if (NumRemaining >= NumSamplesPer10Ms * 2)
		{
			FrameLengthMs = 20;
		}

		// Calculate the number of samples to process
		const int32 NumToProcess = FrameLengthMs * AppliedSampleRate / 1000;

		// Process the VAD
		const int32 VADResult = fvad_process(VADInstance, AccumulatedPCMData.GetData() + NumConsumed, NumToProcess);
		NumConsumed += NumToProcess;

		if (VADResult == 1)
		{
			bVoiceDetected = true;
		}
		else if (VADResult != 0)
		{
			UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to process VAD for %s due to %d error code"), *GetName(), VADResult);
			bProcessingFailed = true;
			break;
		}
	}

	// Remove processed data from the accumulated buffer (once, rather than once per frame)
	AccumulatedPCMData.RemoveAt(0, NumConsumed);

	if (bProcessingFailed)
	{
		return false;
	}

	if (bVoiceDetected)
	{
		UE_LOG(LogRuntimeAudioImporter, Verbose, TEXT("VAD detected voice activity for %s"), *GetName());
		return true;
	}

	UE_LOG(LogRuntimeAudioImporter, Verbose, TEXT("VAD detected no voice activity for %s"), *GetName());
	return false;
#else
	UE_LOG(LogRuntimeAudioImporter, Error, TEXT("Unable to process VAD for %s as VAD support is disabled"), *GetName());
	return false;
#endif
}
53 changes: 53 additions & 0 deletions Source/RuntimeAudioImporter/Private/VAD/VADIncludes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Georgy Treshchev 2024.

#pragma once

#if WITH_RUNTIMEAUDIOIMPORTER_VAD_SUPPORT

/**
* Replacing C dynamic memory management functions
* (calloc, malloc, free, realloc, memset, memcpy) with FMemory ones
*/
#undef calloc
#undef malloc
#undef free
#undef realloc
#undef memset
#undef memcpy

// calloc computes the byte size once from cast (and thereby parenthesized) arguments, so that
// expression arguments such as `a + b` expand correctly and each argument is evaluated only once
#define calloc(Count, Size) [&]() { const size_t TotalSize = static_cast<size_t>(Count) * static_cast<size_t>(Size); void* MemPtr = FMemory::Malloc(TotalSize); if (MemPtr) { FMemory::Memset(MemPtr, 0, TotalSize); } return MemPtr; }()
#define malloc(Count) FMemory::Malloc(Count)
#define free(Original) FMemory::Free(Original)
#define realloc(Original, Count) FMemory::Realloc(Original, Count)
#define memset(Dest, Char, Count) FMemory::Memset(Dest, Char, Count)
#define memcpy(Dest, Src, Count) FMemory::Memcpy(Dest, Src, Count)

THIRD_PARTY_INCLUDES_START

extern "C"
{
#include "fvad.h"
#include "fvad.c"
#include "vad/vad_core.c"
#include "vad/vad_sp.c"
#include "vad/vad_gmm.c"
#include "vad/vad_filterbank.c"
#include "signal_processing/division_operations.c"
#include "signal_processing/energy.c"
#include "signal_processing/resample_48khz.c"
#include "signal_processing/spl_inl.c"
#include "signal_processing/get_scaling_square.c"
#include "signal_processing/resample_fractional.c"
#include "signal_processing/resample_by_2_internal.c"
}

THIRD_PARTY_INCLUDES_END

#undef calloc
#undef malloc
#undef free
#undef realloc
#undef memset
#undef memcpy

#endif
Loading

0 comments on commit a52e376

Please sign in to comment.