JavaScript contributed examples

NodeJS WAV

This example demonstrates very basic usage of the Node.js API: it reads a WAV file, converts it to 16 kHz, 16-bit, mono raw PCM with SoX, and prints the transcription.

const DeepSpeech = require('deepspeech');
const Fs = require('fs');
const Sox = require('sox-stream');
const MemoryStream = require('memory-stream');
const Duplex = require('stream').Duplex;
const Wav = require('node-wav');

// Decoder settings: beam width for the model, plus the language model
// weight (alpha) and word insertion bonus (beta) for the LM decoder.
const BEAM_WIDTH = 1024;
const LM_ALPHA = 0.75;
const LM_BETA = 1.85;

// Locations of the acoustic model, its alphabet and the language model files.
const modelPath = './models/output_graph.pbmm';
const alphabetPath = './models/alphabet.txt';
const lmPath = './models/lm.binary';
const triePath = './models/trie';

// Load the acoustic model first, then attach the language model decoder to it.
const model = new DeepSpeech.Model(modelPath, alphabetPath, BEAM_WIDTH);
model.enableDecoderWithLM(lmPath, triePath, LM_ALPHA, LM_BETA);

// Audio file to transcribe: first command-line argument, or the sample file.
let audioFile = process.argv[2] || './audio/2830-3980-0043.wav';

if (!Fs.existsSync(audioFile)) {
	// A missing input is a failure: report it on stderr and exit with a
	// non-zero status so callers and shell scripts can detect it.
	console.error('file missing:', audioFile);
	process.exit(1);
}

// Raw file contents; fed to Sox further down for conversion.
const buffer = Fs.readFileSync(audioFile);
// Decoded WAV data; only its sample rate is inspected here.
const result = Wav.decode(buffer);

if (result.sampleRate < 16000) {
	console.error('Warning: original sample rate (' + result.sampleRate + ') is lower than 16kHz. Up-sampling might produce erratic speech recognition.');
}

/**
 * Wraps an in-memory buffer in a stream so it can be piped.
 *
 * @param {Buffer} buffer - Data the returned stream will emit.
 * @returns {Duplex} A stream whose readable side yields `buffer` and then ends.
 */
function bufferToStream(buffer) {
	const source = new Duplex();
	// Queue the payload, then `null` to mark the end of the readable side.
	for (const chunk of [buffer, null]) {
		source.push(chunk);
	}
	return source;
}

// Sox conversion settings: raw 16-bit signed little-endian PCM, mono, 16 kHz,
// with dithering disabled.
const soxOptions = {
	global: {
		'no-dither': true,
	},
	output: {
		bits: 16,
		rate: 16000,
		channels: 1,
		encoding: 'signed-integer',
		endian: 'little',
		compression: 0.0,
		type: 'raw'
	}
};

// Collects the converted audio in memory.
let audioStream = new MemoryStream();

// file contents -> Sox conversion -> in-memory sink
const wavSource = bufferToStream(buffer);
const soxConverter = Sox(soxOptions);
wavSource.pipe(soxConverter).pipe(audioStream);

// Once all converted audio has been collected, report its length and
// run speech-to-text on it.
audioStream.on('finish', () => {
	const pcm = audioStream.toBuffer();

	// Two bytes per 16-bit sample, so half the byte count is the sample count.
	const sampleCount = pcm.length / 2;
	console.log('audio length', sampleCount * (1 / 16000));

	// NOTE(review): only the first `sampleCount` bytes (half of the buffer) are
	// passed to stt(); presumably this matches how the binding sizes its
	// input - confirm against the DeepSpeech version in use.
	const transcript = model.stt(pcm.slice(0, sampleCount), 16000);
	console.log('result:', transcript);
});

Full source code available under ../examples/nodejs_wav/.

FFMPEG VAD Streaming

This example demonstrates using the Streaming API together with ffmpeg and voice activity detection (VAD): ffmpeg decodes the input to raw 16 kHz PCM, and each detected speech segment is transcribed separately.

#!/usr/bin/env node

const VAD = require("node-vad");
const Ds = require('deepspeech');
const argparse = require('argparse');
const util = require('util');
const { spawn } = require('child_process');

// These constants control the beam search decoder.

// Beam width used in the CTC decoder when building candidate transcriptions.
// Passed to the Ds.Model constructor below.
const BEAM_WIDTH = 500;

// The alpha hyperparameter of the CTC decoder: language model weight.
// Passed to model.enableDecoderWithLM() when both --lm and --trie are given.
const LM_ALPHA = 0.75;

// The beta hyperparameter of the CTC decoder: word insertion bonus.
// Passed to model.enableDecoderWithLM() together with LM_ALPHA.
const LM_BETA = 1.85;

// Custom argparse action for `--version`: prints the DeepSpeech versions
// and terminates the process.
let VersionAction = function VersionAction(options) {
	const actionOptions = options || {};
	// The flag takes no value.
	actionOptions.nargs = 0;
	argparse.Action.call(this, actionOptions);
};

util.inherits(VersionAction, argparse.Action);

// Runs when the flag is encountered; the parser argument is not used.
VersionAction.prototype.call = function(parser) {
	Ds.printVersions();
	process.exit(0);
};

// Command-line interface: each entry is [flag, argparse options], registered in order.
const ARGUMENT_SPECS = [
	['--model', {required: true, help: 'Path to the model (protocol buffer binary file)'}],
	['--alphabet', {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'}],
	['--lm', {help: 'Path to the language model binary file', nargs: '?'}],
	['--trie', {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'}],
	['--audio', {required: true, help: 'Path to the audio source to run (ffmpeg supported formats)'}],
	['--version', {action: VersionAction, help: 'Print version and exits'}]
];

let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'});
for (const [flag, options] of ARGUMENT_SPECS) {
	parser.addArgument([flag], options);
}
let args = parser.parseArgs();

/**
 * Converts a `process.hrtime()` result to seconds.
 *
 * @param {number[]} hrtimeValue - `[seconds, nanoseconds]` pair.
 * @returns {string} Elapsed seconds formatted to 4 significant digits.
 */
function totalTime(hrtimeValue) {
	const [seconds, nanoseconds] = hrtimeValue;
	const elapsedSeconds = seconds + nanoseconds / 1e9;
	return elapsedSeconds.toPrecision(4);
}

// Load the acoustic model and report how long it took.
console.error('Loading model from file %s', args.model);
const modelLoadStartedAt = process.hrtime();
const model = new Ds.Model(args.model, args.alphabet, BEAM_WIDTH);
const modelLoadElapsed = process.hrtime(modelLoadStartedAt);
console.error('Loaded model in %ds.', totalTime(modelLoadElapsed));

// The language model is optional and only enabled when both of its files are given.
if (args.lm && args.trie) {
	console.error('Loading language model from files %s %s', args.lm, args.trie);
	const lmLoadStartedAt = process.hrtime();
	model.enableDecoderWithLM(args.lm, args.trie, LM_ALPHA, LM_BETA);
	const lmLoadElapsed = process.hrtime(lmLoadStartedAt);
	console.error('Loaded language model in %ds.', totalTime(lmLoadElapsed));
}

// Audio sample rate in Hz (default is 16kHz); shared by ffmpeg, the VAD and the model stream.
const AUDIO_SAMPLE_RATE = 16000;

/*
 * Voice detection threshold. Available modes:
 *   VAD.Mode.NORMAL          - Suitable for high bitrate, low-noise data.
 *                              May classify noise as voice, too.
 *   VAD.Mode.LOW_BITRATE     - Detection mode optimised for low-bitrate audio.
 *   VAD.Mode.AGGRESSIVE      - Detection mode best suited for somewhat noisy,
 *                              lower quality audio.
 *   VAD.Mode.VERY_AGGRESSIVE - Detection mode with lowest miss-rate.
 *                              Works well for most inputs.
 */
const VAD_MODE = VAD.Mode.NORMAL;

// Time in milliseconds for debouncing speech active state
const DEBOUNCE_TIME = 20;

// Voice activity stream that the decoded audio is piped through
const vadOptions = {
	mode: VAD_MODE,
	audioFrequency: AUDIO_SAMPLE_RATE,
	debounceTime: DEBOUNCE_TIME
};
const VAD_STREAM = VAD.createStream(vadOptions);

// Spawn ffmpeg to decode the input into raw signed 16-bit little-endian
// mono PCM at AUDIO_SAMPLE_RATE; its stdout is consumed further down.
// All arguments are passed as strings, as child_process.spawn documents.
const ffmpeg = spawn('ffmpeg', [
	'-hide_banner',
	'-nostats',
	'-loglevel', 'fatal',
	'-i', args['audio'],
	'-vn',
	'-acodec', 'pcm_s16le',
	'-ac', '1',
	'-ar', String(AUDIO_SAMPLE_RATE),
	'-f', 's16le',
	'pipe:'
]);

// Without this handler a failed spawn (e.g. ffmpeg is not installed) would
// surface as an unhandled 'error' event; report it clearly instead.
ffmpeg.on('error', (err) => {
	console.error('Failed to run ffmpeg: %s', err.message);
	process.exit(1);
});

// Seconds of audio fed into the current stream; reset by finishStream().
let audioLength = 0;
// Current streaming context; replaced by intermediateDecode() after each decode.
let sctx = model.createStream(AUDIO_SAMPLE_RATE);

/**
 * Finishes the current stream, prints its transcription on stdout and the
 * inference timing on stderr, then resets the accumulated audio length.
 */
function finishStream() {
	const inferenceStartedAt = process.hrtime();
	console.error('Running inference.');

	const transcription = model.finishStream(sctx);
	console.log('Transcription: ', transcription);

	const inferenceElapsed = process.hrtime(inferenceStartedAt);
	console.error('Inference took %ds for %ds audio file.', totalTime(inferenceElapsed), audioLength.toPrecision(4));

	// The next segment starts counting from zero.
	audioLength = 0;
}

/**
 * Decodes what has been fed so far and starts a fresh stream, so that
 * subsequent audio is transcribed as a new segment.
 */
function intermediateDecode() {
	finishStream();
	// finishStream() consumed the old context; create a new one for further audio.
	sctx = model.createStream(AUDIO_SAMPLE_RATE);
}

/**
 * Feeds one chunk of audio into the current stream and adds its duration
 * to the running audio length.
 *
 * @param {Buffer} chunk - Raw 16-bit PCM audio data.
 */
function feedAudioContent(chunk) {
	// Two bytes per 16-bit sample.
	const sampleCount = chunk.length / 2;
	audioLength += sampleCount * (1 / AUDIO_SAMPLE_RATE);

	// NOTE(review): only the first `sampleCount` bytes (half of the chunk) are
	// forwarded; presumably this matches how the binding sizes its input -
	// confirm against the DeepSpeech version in use.
	const payload = chunk.slice(0, sampleCount);
	model.feedAudioContent(sctx, payload);
}

/**
 * Handles one event from the VAD stream: audio is fed to the model while
 * speech is starting or ongoing, and the segment is decoded when speech ends.
 *
 * @param {{speech: {start: *, state: *, end: *}, audioData: Buffer}} data
 */
function processVad(data) {
	const { speech, audioData } = data;
	const isSpeaking = speech.start || speech.state;

	// Neither speaking nor just finished: nothing to do for this chunk.
	if (!isSpeaking && !speech.end) {
		return;
	}

	feedAudioContent(audioData);

	// Speech just ended: decode the completed segment.
	if (!isSpeaking) {
		intermediateDecode();
	}
}

// Pipe the decoded audio through the VAD stream and handle each chunk.
ffmpeg.stdout.pipe(VAD_STREAM)
	.on('data', processVad)
	.on('end', () => {
		// If the audio ends while speech is still active, no speech-end event
		// arrives; decode whatever has been fed so the last segment is not lost.
		if (audioLength > 0) {
			finishStream();
		}
	});

Full source code available under ../examples/ffmpeg_vad_streaming/.