Getting Started
Supported Technologies
- Voice Isolation
- Turn-Taking
- VAD (Voice Activity Detection)
Supported Programming Languages
- C++
- C
- Python
- Node.js
- Go
- Rust
Voice Isolation
Sample Code
// Voice Isolation sample: initialize the SDK, create an Nc session, process the
// stream frame by frame, then release the session and the SDK.
// weightFilePath, inRate, outRate, frameDurationMillis, withStats, the frame
// buffers and noiseSuppressionLevel are expected to be provided by the caller.
try
{
// The SDK must be initialized before any session is created.
globalInit(L"");
ModelInfo ncModelInfo;
ncModelInfo.path = weightFilePath; // .kef file
// Positional initialization: the values below must follow the field order
// declared by NcSessionConfig.
NcSessionConfig ncCfg =
{
inRate,
frameDurationMillis,
outRate,
&ncModelInfo,
withStats,
nullptr // Ringtone model cfg for inbound stream case
};
std::shared_ptr<Nc<SamplingFormat>> ncSession = Nc<SamplingFormat>::create(ncCfg);
// Process the stream one frame at a time.
while(frameAvailable)
{
// Per-frame stats are requested only when withStats is set; otherwise nullptr is passed.
ncSession->process(inFrame, inFrameSize, outFrame, outFrameSize,
noiseSuppressionLevel, withStats ? &perFrameStats : nullptr);
}
// ncSession is a shared_ptr. Free pointer before globalDestroy()
ncSession.reset();
// Unload the SDK only after the session has been released.
globalDestroy();
}
catch(std::exception& ex)
{
// Handle exception
}
import krisp_audio

# Initialize the Krisp SDK global instance before creating any session
krisp_audio.globalInit("")

# Create Noise Cleaner with the specified configuration
# NOTE(review): the sample rates, frame duration and model info are presumably
# set on nc_cfg the same way the other Python samples configure their session
# config -- confirm the field names against the API reference.
nc_cfg = krisp_audio.NcSessionConfig()
ncFloat = krisp_audio.NcFloat.create(nc_cfg)

# Noise Cleaner frame by frame processing of the given audio stream
for i in range(0, 1000):  # frame count
    processed_frame = ncFloat.process(frame, self.suppression_level)

# Release the session first, then free the Krisp SDK global instance
ncFloat = None
krisp_audio.globalDestroy()
const krispAudioSdk = require('krisp-audio-node-sdk');

// Initialize the SDK before creating any session.
krispAudioSdk.globalInit("");

// Session configuration: sampling rates, frame duration and the model file.
const config = {
    inputSampleRate: krispAudioSdk.enums.SamplingRate.Sr16000Hz,
    inputFrameDuration: krispAudioSdk.enums.FrameDuration.Fd10ms,
    outputSampleRate: krispAudioSdk.enums.SamplingRate.Sr16000Hz,
    modelInfo: {
        path: "/path_to_the/model.kef"
    }
};

let ncFloat = krispAudioSdk.NcFloat.create(config);
let noiseSuppressionLevel = 100.0; // 0-100.0

// Process the stream frame by frame using the session created above.
for (let i = 0; i < numberOfFrames; i++) {
    let frame = getFrame(i);
    const processedFrame = ncFloat.process(frame, noiseSuppressionLevel);
}

// Release the session before unloading the SDK.
ncFloat.destroy();
krispAudioSdk.globalDestroy();
package main
import (
"fmt"
"log"
"krisp"
)
func main() {
if err := krisp.GlobalInit(""); err != nil {
log.Fatal("Failed to initialize Krisp SDK:", err)
}
defer krisp.GlobalDestroy()
ncConfig := krisp.NCConfig{
InputSampleRate: krisp.SamplingRate(SR24000Hz),
OutputSampleRate: krisp.SamplingRate(SR24000Hz),
InputFrameDuration: krisp.FrameDuration(FD10ms),
ModelInfo: &krisp.ModelInfo{
Path: "/path_to_the/model.kef",
},
}
nc, err := krisp.CreateNcFloat(&ncConfig)
if err != nil {
log.Fatalf("Failed to create noise cancellation instance: %v", err)
}
defer nc.Close()
noiseSuppressionLevel := 100 // 0-100
for i := 0; i < len(floatAudioBuf); i += samplesPerFrame {
err := nc.ProcessFloat(frameFloat, outputFrameFloat, float32(noiseSuppressionLevel))
if err != nil {
log.Fatalf("Failed to process frame: %v", err)
}
}
}
use krisp_audio_sdk_rust::{
krisp_global_init, krisp_global_destroy, version, NcSession, NcSessionConfig,
SamplingRate, FrameDuration, AudioSampleType,
};
/// Voice Isolation sample: initializes the SDK, creates a float noise-cancellation
/// session, processes every frame in place, then releases the session and the SDK.
///
/// `working_dir`, `model_canonical_path`, `input_frames` and
/// `noise_suppression_level` are expected to be prepared by the caller.
///
/// # Errors
///
/// Propagates any error returned by SDK initialization, session creation,
/// frame processing or SDK teardown.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    krisp_global_init(Some(working_dir))?;

    let config = NcSessionConfig {
        input_sample_rate: SamplingRate::Rate24000Hz,
        output_sample_rate: SamplingRate::Rate24000Hz,
        input_frame_duration: FrameDuration::Ms10,
        model_path: model_canonical_path.into_boxed_path(),
    };
    let mut session = NcSession::new(config, AudioSampleType::Float)?;

    // `iter_mut` is required: each frame's output buffer is borrowed mutably,
    // which is not possible through the shared references yielded by `iter`.
    for frame in input_frames.iter_mut() {
        session.process_float(&frame.input, &mut frame.output, noise_suppression_level)?;
    }

    // The SDK should be unloaded only after all sessions are released, so the
    // session is dropped explicitly instead of at the end of `main`.
    drop(session);
    krisp_global_destroy()?;
    Ok(())
}
TT (Turn-Taking)
Sample Code
void logCallback(const std::string& message, Krisp::AudioSdk::LogLevel level)
try
{
globalInit(L"", logCallback, Krisp::AudioSdk::LogLevel::Off);
ModelInfo modelInfo;
modelInfo.path = weightFilePath; // .kef file
TtSessionConfig ttCfg =
{
inRate,
frameDurationMillis,
&modelInfo
};
std::shared_ptr<Tt<SamplingFormat>> ttSession = Tt<SamplingFormat>::create(ttCfg);
float ttProbability = 0.0;
float recomenndedTtThreshold = 0.5;
while(frameAvailable)
{
// the ttProbability is in the [0, 1.0] range
ttSession->process(inFrame, inFrameSize, &ttProbability);
if (ttProbability >= recomenndedTtThreshold) {
// Consider turn taking took place
}
}
// ttSession is a shared_ptr. Free pointer before globalDestroy()
ttSession.reset();
globalDestroy();
}
catch(std::exception& ex)
{
// Handle exception
}
import krisp_audio


def log_callback(log_message, log_level):
    """Print each SDK log message prefixed with its level."""
    print(f"[{log_level}] {log_message}", flush=True)


# Initialize the Krisp SDK global instance before creating any session
krisp_audio.globalInit("", log_callback, krisp_audio.LogLevel.Off)

model_info = krisp_audio.ModelInfo()
model_info.path = self.model_path  # path to .kef file

# Create Turn Taking session with the specified configuration
tt_cfg = krisp_audio.TtSessionConfig()
tt_cfg.inputSampleRate = krisp_audio.SamplingRate.Sr16000Hz
tt_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
tt_cfg.modelInfo = model_info
ttFloat = krisp_audio.TtFloat.create(tt_cfg)

ttRecommendedThreshold = 0.5

# Processing fixed sized audio frames
for frame in audioFrames:
    ttProbability = ttFloat.process(frame)  # the value is in the [0, 1] range
    if ttProbability >= ttRecommendedThreshold:
        # Consider turn taking took place
        pass

# Release the session first, then free the Krisp SDK global instance
ttFloat = None
krisp_audio.globalDestroy()
VAD (Voice Activity Detection)
Sample Code
void logCallback(const std::string& message, Krisp::AudioSdk::LogLevel level)
try
{
globalInit(L"", logCallback, Krisp::AudioSdk::LogLevel::Off);
ModelInfo modelInfo;
modelInfo.path = weightFilePath; // .kef file
VadSessionConfig vadCfg =
{
inRate,
frameDurationMillis,
&modelInfo
};
float voiceProbability = 0.0;
std::shared_ptr<Vad<SamplingFormat>> vadSession = Vad<SamplingFormat>::create(vadCfg);
while(frameAvailable)
{
vadSession->process(inFrame, inFrameSize, &voiceProbability);
if (voiceProbability > 0.5)
// Consider voice is detected in the frame
}
// vadSession is a shared_ptr. Free pointer before globalDestroy()
vadSession.reset();
globalDestroy();
}
catch(std::exception& ex)
{
// Handle exception
}
import krisp_audio


def log_callback(log_message, log_level):
    """Print each SDK log message prefixed with its level."""
    print(f"[{log_level}] {log_message}", flush=True)


# Initialize Krisp SDK
krisp_audio.globalInit("", log_callback, krisp_audio.LogLevel.Off)

model_info = krisp_audio.ModelInfo()
model_info.path = self.model_path  # path to .kef file

# Create VAD session with the specified configuration
vad_cfg = krisp_audio.VadSessionConfig()
vad_cfg.inputSampleRate = krisp_audio.SamplingRate.Sr16000Hz
vad_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
vad_cfg.modelInfo = model_info
vadFloat = krisp_audio.VadFloat.create(vad_cfg)

# Processing fixed sized audio frames
for frame in audioFrames:
    voiceProbability = vadFloat.process(frame)
    if voiceProbability > 0.5:
        # Consider voice is detected in the frame
        pass

# Free the resource allocated for the audio stream processing
vadFloat = None

# Free the resources allocated by Krisp SDK
krisp_audio.globalDestroy()
Krisp SDK Action Sequence
Action | Description |
---|---|
SDK initialization | The SDK library should be loaded and initialized before being used. The SDK occupies resources that should be released once the SDK is no longer needed. |
Model initialization | VIVA SDK ships with voice isolation and Turn-Taking models. Each model has specific requirements for the device and the sampling rate. Each model should be initialized and loaded into the memory before it can be used. |
Audio stream handling preparation | To process an audio stream with the Krisp Audio SDK, a session object must be created. Session creation is coupled to the AI model, which must be specified when the session is created. The session object also requires the sampling rate and frame duration to be specified. |
Frame processing | Frame processing is done using the session object, which works only for the given sampling rate and the given frame duration, using the loaded AI model. |
Releasing audio stream handling resources | The resources occupied by the session should be released once the session is no longer used. |
Unloading the SDK | The resources loaded by the Krisp Audio SDK should be released once the SDK is no longer needed. |
ℹ️ Multiple sessions can be created and used at the same time
ℹ️ Each session can be processed in a different thread on another audio stream
ℹ️ A dummy Session can preload the model into memory, while a separate Session handles the actual stream processing. This approach may help reduce latency before processing begins, because the second Session uses the cached model data preloaded by the first Session and is created much faster than the first one.
❗The SDK should be unloaded only after the release of all sessions
Updated 12 days ago