Quick Start
This guide shows how to start a single-direction Voice Translation session using the Krisp Voice Translation SDK. In this example, spoken English audio is translated into Spanish in real time.
Each translation session processes audio in one direction (source → target). For bi-directional translation, create two independent sessions, one per direction; a sketch follows the JavaScript example below.
High-Level Flow
All platform integrations follow the same basic flow:
- Initialize the Voice Translation SDK
- Create a translation session with source and target languages
- Provide audio input to the session
- Receive translated audio (and optional transcripts)
- Stop and clean up the session
Code Snippet (English → Spanish Translation)
JavaScript
// GET A SESSION KEY
import axios from 'axios';

const tokenResponse = await axios.get(
  'https://sdkapi.krisp.ai/v2/sdk/voice-translation/session/token?expiration_ttl=100',
  { headers: { 'Authorization': 'api-key API_KEY' } }
);
const SESSION_KEY = tokenResponse.data.data.session_key;
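Note that the request above authenticates with your long-lived API key, so in production you would normally mint the session key on your own backend and hand only the short-lived key to the browser. A minimal sketch of such a proxy, assuming Node with Express (the /vt-session route, port, and KRISP_API_KEY environment variable are illustrative, not part of the SDK):

import express from 'express';
import axios from 'axios';

const app = express();

// Exchange the server-held API key for a short-lived session key
// that is safe to expose to the browser.
app.get('/vt-session', async (req, res) => {
  try {
    const response = await axios.get(
      'https://sdkapi.krisp.ai/v2/sdk/voice-translation/session/token?expiration_ttl=100',
      { headers: { 'Authorization': `api-key ${process.env.KRISP_API_KEY}` } }
    );
    res.json({ sessionKey: response.data.data.session_key });
  } catch (err) {
    res.status(502).json({ error: 'failed to obtain session key' });
  }
});

app.listen(3000);

The browser-side code below then continues with the session key it received: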
import { KrispVTSDK, LogLevel } from 'krisp-vt-sdk';

// 1. Initialize SDK
const sdk = new KrispVTSDK({
  apiKey: SESSION_KEY,
  logLevel: LogLevel.WARN // NONE, ERROR, WARN, INFO, or DEBUG
});

// 2. Set up event hooks
sdk.setHooks({
  onProcessedAudio: (stream) => {
    // Play or send the translated audio
    const audio = new Audio();
    audio.srcObject = stream;
    audio.play();
  },
  onMessage: (event) => {
    // Handle transcripts
    console.log('Transcript:', event.data.text);
  }
});
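// Note: instead of playing the translated audio locally, onProcessedAudio
// could forward it to a remote participant with the standard WebRTC API,
// e.g. (sketch only; peerConnection is an RTCPeerConnection your app
// creates and negotiates elsewhere):
//
//   onProcessedAudio: (stream) => {
//     for (const track of stream.getAudioTracks()) {
//       peerConnection.addTrack(track, stream);
//     }
//   },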
// 3. Start translation service
await sdk.start({
  from: 'en-US', // Source language
  to: 'es-ES',   // Target language
  gender: 'female'
});

// 4. Get microphone and process audio
const mic = await navigator.mediaDevices.getUserMedia({ audio: true });
await sdk.process(mic);

// 5. Stop when done
await sdk.stop();
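For the bi-directional case mentioned at the start, run two independent sessions, one per direction. A minimal sketch, assuming each session uses its own session key obtained as above, and that englishSpeakerStream and spanishSpeakerStream are MediaStreams your app already has (microphone or remote tracks):

// Two independent sessions, one per direction
const enToEs = new KrispVTSDK({ apiKey: SESSION_KEY_A, logLevel: LogLevel.WARN });
const esToEn = new KrispVTSDK({ apiKey: SESSION_KEY_B, logLevel: LogLevel.WARN });

await enToEs.start({ from: 'en-US', to: 'es-ES', gender: 'female' });
await esToEn.start({ from: 'es-ES', to: 'en-US', gender: 'female' });

// Route each speaker's audio into the session for their direction
await enToEs.process(englishSpeakerStream);
await esToEn.process(spanishSpeakerStream);

// Stop both when the conversation ends
await enToEs.stop();
await esToEn.stop();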
C++
// (Plus the Krisp VT SDK header that declares globalInit, Vt, VtSessionConfig, etc.)
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>

std::ofstream logFile("krisp-vt.log"); // log destination used by logCallback

auto logCallback = [](const std::string& message, LogLevel level)
{
    // serialize writes, since the SDK may log from multiple threads
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);
    logFile << "[" << static_cast<int>(level) << "] " << message << std::endl;
};

try
{
    // one-time global SDK initialization
    globalInit(L"", logCallback, LogLevel::Trace);

    // initialize the VT session; authToken, the sample rates, inputFrameDuration,
    // and customVocabulary are values supplied by your application
    VtSessionConfig config = {
        .authToken = authToken,
        .inputSampleRate = inputSampleRate,
        .inputFrameDuration = inputFrameDuration,
        .outputSampleRate = outputSampleRate,
        .inputLanguageCode = "en-US",
        .outputLanguageCode = "es-ES",
        .gender = VtGender::Female,
        .customVocabulary = customVocabulary
    };

    auto audioResultCallback = [](const VtAudioResult<int16_t>& audioResult)
    {
        // callback to receive the translated audio samples
    };

    auto originalTranscriptCallback = [](const VtOriginalTranscriptionResult& originalTranscriptResult)
    {
        // callback to receive the original transcript result
    };

    auto translatedTranscriptCallback = [](const VtTranslatedTranscriptionResult& translatedTranscriptResult)
    {
        // callback to receive the translated transcript result
    };

    // callback to receive events
    auto eventCallback = [](const VtEventType& event)
    {};

    // callback to receive errors
    auto errorCallback = [](const VtErrorType& error)
    {
        std::cerr << "Error: " << static_cast<int>(error) << std::endl;
    };

    // vtSession holds the pointer type returned by create() (declared elsewhere)
    vtSession = Vt<int16_t>::create(
        config,
        originalTranscriptCallback,
        translatedTranscriptCallback,
        audioResultCallback,
        eventCallback,
        errorCallback);

    // feed captured input frames; inFrame and inFrameSize come from your
    // audio capture pipeline
    while (frameAvailable)
    {
        vtSession->process(inFrame, inFrameSize);
    }

    // destroy the session before tearing down the SDK
    vtSession.reset();
    globalDestroy();
}
catch (std::exception& ex)
{
    // Handle exception
}
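In the C++ flow, audio is pushed one frame at a time: each process() call hands the session a single input frame (inFrame and inFrameSize come from your capture pipeline), translated audio and transcripts are delivered back through the registered callbacks, and teardown mirrors setup, destroying the session (vtSession.reset()) before globalDestroy() is called.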
