/*
 * ---------------------------------------------------------------------------
 * Recognizer.java
 *
 * Copyright 2007 Nuance Communications, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the 'License'); you may not
 * use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an 'AS IS' BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * ---------------------------------------------------------------------------
 */

package android.speech.srec;

import java.io.File;
import java.io.InputStream;
import java.io.IOException;
import java.util.Locale;

/**
 * Simple, synchronous speech recognizer, using the Nuance SREC package.
 * Usage proceeds as follows:
 *
 * <ul>
 * <li>Create a <code>Recognizer</code>.
 * <li>Create a <code>Recognizer.Grammar</code>.
 * <li>Setup the <code>Recognizer.Grammar</code>.
 * <li>Reset the <code>Recognizer.Grammar</code> slots, if needed.
 * <li>Fill the <code>Recognizer.Grammar</code> slots, if needed.
 * <li>Compile the <code>Recognizer.Grammar</code>, if needed.
 * <li>Save the filled <code>Recognizer.Grammar</code>, if needed.
 * <li>Start the <code>Recognizer</code>.
 * <li>Loop over <code>advance</code> and <code>putAudio</code> until recognition is complete.
 * <li>Stop the <code>Recognizer</code>.
 * <li>Destroy the <code>Recognizer</code>.
 * </ul>
 *
 * Below is example code:
 *
 * <pre>
 * // create and start audio input
 * InputStream audio = new MicrophoneInputStream(11025, 11025*5);
 * // create a Recognizer
 * String cdir = Recognizer.getConfigDir(null);
 * Recognizer recognizer = new Recognizer(cdir + "/baseline11k.par");
 * // create and load a Grammar
 * Recognizer.Grammar grammar = recognizer.new Grammar(cdir + "/grammars/VoiceDialer.g2g");
 * // setup the Grammar to work with the Recognizer
 * grammar.setupRecognizer();
 * // fill the Grammar slots with names and save, if required
 * grammar.resetAllSlots();
 * for (String name : names) grammar.addWordToSlot("@Names", name, null, 1, "V=1");
 * grammar.compile();
 * grammar.save(".../foo.g2g");
 * // start the Recognizer
 * recognizer.start();
 * // loop over Recognizer events
 * while (true) {
 *     switch (recognizer.advance()) {
 *         case Recognizer.EVENT_INCOMPLETE:
 *         case Recognizer.EVENT_STARTED:
 *         case Recognizer.EVENT_START_OF_VOICING:
 *         case Recognizer.EVENT_END_OF_VOICING:
 *             // let the Recognizer continue to run
 *             continue;
 *         case Recognizer.EVENT_RECOGNITION_RESULT:
 *             // success, so fetch results here!
 *             for (int i = 0; i < recognizer.getResultCount(); i++) {
 *                 String result = recognizer.getResult(i, Recognizer.KEY_LITERAL);
 *             }
 *             break;
 *         case Recognizer.EVENT_NEED_MORE_AUDIO:
 *             // put more audio in the Recognizer
 *             recognizer.putAudio(audio);
 *             continue;
 *         default:
 *             notifyFailure();
 *             break;
 *     }
 *     break;
 * }
 * // stop the Recognizer
 * recognizer.stop();
 * // destroy the Recognizer
 * recognizer.destroy();
 * // stop the audio device
 * audio.close();
 * </pre>
 */
public final class Recognizer {

    static {
        System.loadLibrary("srec_jni");
    }

    private static String TAG = "Recognizer";

    /**
     * Result key corresponding to confidence score.
     */
    public static final String KEY_CONFIDENCE = "conf";

    /**
     * Result key corresponding to literal text.
     */
    public static final String KEY_LITERAL = "literal";

    /**
     * Result key corresponding to semantic meaning text.
     */
    public static final String KEY_MEANING = "meaning";

    // handle to SR_Vocabulary object
    private long mVocabulary = 0;

    // handle to SR_Recognizer object
    private long mRecognizer = 0;

    // Grammar currently associated with Recognizer via SR_GrammarSetupRecognizer
    private Grammar mActiveGrammar = null;

    /**
     * Get the pathname of the SREC configuration directory corresponding to the
     * language indicated by the Locale. This directory contains dictionaries,
     * speech models, configuration files, and other data needed by the Recognizer.
     *
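     * <p>For example (a sketch; <code>null</code> is returned if the directory is
     * not present on the device):
     * <pre>
     * // Locale.US maps to "/system/usr/srec/config/en.us"
     * String cdir = Recognizer.getConfigDir(Locale.US);
     * </pre>
     *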
     * @param locale <code>Locale</code> corresponding to the desired language,
     * or null for default, currently <code>Locale.US</code>.
* @return Pathname of the configuration directory.
*/
public static String getConfigDir(Locale locale) {
if (locale == null) locale = Locale.US;
String dir = "/system/usr/srec/config/" +
locale.toString().replace('_', '.').toLowerCase(Locale.ROOT);
if ((new File(dir)).isDirectory()) return dir;
return null;
}
/**
* Create an instance of a SREC speech recognizer.
*
* @param configFile pathname of the baseline*.par configuration file,
* which in turn contains references to dictionaries, speech models,
* and other data needed to configure and operate the recognizer.
* A separate config file is needed for each audio sample rate.
* Two files, baseline11k.par and baseline8k.par, which correspond to
     * 11025 and 8000 Hz, are present in the directory indicated by
* {@link #getConfigDir}.
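     *
     * <p>For example (a sketch, assuming an 11025 Hz audio source and the default
     * configuration directory):
     * <pre>
     * String cdir = Recognizer.getConfigDir(null);
     * Recognizer recognizer = new Recognizer(cdir + "/baseline11k.par");
     * </pre>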
     * @throws IOException if the recognizer could not be configured from the file.
*/
public Recognizer(String configFile) throws IOException {
PMemInit();
SR_SessionCreate(configFile);
mRecognizer = SR_RecognizerCreate();
SR_RecognizerSetup(mRecognizer);
mVocabulary = SR_VocabularyLoad();
}
/**
* Represents a grammar loaded into the Recognizer.
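     *
     * <p>Typical lifecycle, sketched from the class example above:
     * <pre>
     * Recognizer.Grammar grammar = recognizer.new Grammar(cdir + "/grammars/VoiceDialer.g2g");
     * grammar.setupRecognizer();
     * // ... run recognition ...
     * grammar.destroy();
     * </pre>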
*/
public class Grammar {
private long mGrammar = 0;
/**
         * Create a <code>Grammar</code> instance.
* @param g2gFileName pathname of g2g file.
*/
public Grammar(String g2gFileName) throws IOException {
mGrammar = SR_GrammarLoad(g2gFileName);
SR_GrammarSetupVocabulary(mGrammar, mVocabulary);
}
/**
* Reset all slots.
*/
public void resetAllSlots() {
SR_GrammarResetAllSlots(mGrammar);
}
/**
* Add a word to a slot.
*
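         * <p>For example (a sketch, reusing the <code>@Names</code> slot and tag
         * format from the class example above, where <code>names</code> is the
         * caller's list of names):
         * <pre>
         * grammar.resetAllSlots();
         * for (String name : names) grammar.addWordToSlot("@Names", name, null, 1, "V=1");
         * grammar.compile();
         * </pre>
         *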
* @param slot slot name.
* @param word word to insert.
* @param pron pronunciation, or null to derive from word.
* @param weight weight to give the word. One is normal, 50 is low.
* @param tag semantic meaning tag string.
*/
public void addWordToSlot(String slot, String word, String pron, int weight, String tag) {
SR_GrammarAddWordToSlot(mGrammar, slot, word, pron, weight, tag);
}
/**
* Compile all slots.
*/
public void compile() {
SR_GrammarCompile(mGrammar);
}
/**
         * Setup the <code>Grammar</code> with the <code>Recognizer</code>.
*/
public void setupRecognizer() {
SR_GrammarSetupRecognizer(mGrammar, mRecognizer);
mActiveGrammar = this;
}
/**
         * Save the <code>Grammar</code> to a g2g file.
*
         * @param g2gFileName pathname of the g2g file to write.
* @throws IOException
*/
public void save(String g2gFileName) throws IOException {
SR_GrammarSave(mGrammar, g2gFileName);
}
/**
         * Release resources associated with this <code>Grammar</code>.
*/
public void destroy() {
// TODO: need to do cleanup and disassociation with Recognizer
if (mGrammar != 0) {
SR_GrammarDestroy(mGrammar);
mGrammar = 0;
}
}
/**
* Clean up resources.
*/
protected void finalize() {
if (mGrammar != 0) {
destroy();
throw new IllegalStateException("someone forgot to destroy Grammar");
}
}
}
/**
     * Start recognition.
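     *
     * <p>A <code>Grammar</code> must already be associated with this
     * <code>Recognizer</code> via <code>Grammar.setupRecognizer</code>, as in the
     * class example above (a sketch):
     * <pre>
     * grammar.setupRecognizer();
     * recognizer.start();
     * </pre>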
*/
public void start() {
// TODO: shouldn't be here?
SR_RecognizerActivateRule(mRecognizer, mActiveGrammar.mGrammar, "trash", 1);
SR_RecognizerStart(mRecognizer);
}
/**
* Process some audio and return the current status.
* @return recognition event, one of:
     * <ul>
     * <li><code>EVENT_INVALID</code>
     * <li><code>EVENT_NO_MATCH</code>
     * <li><code>EVENT_INCOMPLETE</code>
     * <li><code>EVENT_STARTED</code>
     * <li><code>EVENT_STOPPED</code>
     * <li><code>EVENT_START_OF_VOICING</code>
     * <li><code>EVENT_END_OF_VOICING</code>
     * <li><code>EVENT_SPOKE_TOO_SOON</code>
     * <li><code>EVENT_RECOGNITION_RESULT</code>
     * <li><code>EVENT_START_OF_UTTERANCE_TIMEOUT</code>
     * <li><code>EVENT_RECOGNITION_TIMEOUT</code>
     * <li><code>EVENT_NEED_MORE_AUDIO</code>
     * <li><code>EVENT_MAX_SPEECH</code>
     * </ul>
     */
    public int advance() {
        return SR_RecognizerAdvance(mRecognizer);
    }

    /**
     * Put audio samples into the <code>Recognizer</code>.
     *
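     * <p>A minimal sketch, where <code>readSamples</code> is a hypothetical
     * source of PCM bytes:
     * <pre>
     * byte[] buf = new byte[512];
     * int nbytes = readSamples(buf);
     * if (nbytes > 0) {
     *     recognizer.putAudio(buf, 0, nbytes, false);
     * } else {
     *     recognizer.putAudio(buf, 0, 0, true);  // signal end of audio
     * }
     * </pre>
     *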
* @param buf holds the audio samples.
* @param offset offset of the first sample.
* @param length number of bytes containing samples.
* @param isLast indicates no more audio data, normally false.
* @return number of bytes accepted.
*/
public int putAudio(byte[] buf, int offset, int length, boolean isLast) {
return SR_RecognizerPutAudio(mRecognizer, buf, offset, length, isLast);
}
/**
     * Read audio samples from an <code>InputStream</code> and put them in the
     * <code>Recognizer</code>.
     *
     * @param audio <code>InputStream</code> containing PCM audio samples.
*/
public void putAudio(InputStream audio) throws IOException {
// make sure the audio buffer is allocated
if (mPutAudioBuffer == null) mPutAudioBuffer = new byte[512];
// read some data
int nbytes = audio.read(mPutAudioBuffer);
// eof, so signal Recognizer
if (nbytes == -1) {
SR_RecognizerPutAudio(mRecognizer, mPutAudioBuffer, 0, 0, true);
}
// put it into the Recognizer
else if (nbytes != SR_RecognizerPutAudio(mRecognizer, mPutAudioBuffer, 0, nbytes, false)) {
throw new IOException("SR_RecognizerPutAudio failed nbytes=" + nbytes);
}
}
// audio buffer for putAudio(InputStream)
private byte[] mPutAudioBuffer = null;
/**
* Get the number of recognition results. Must be called after
     * <code>EVENT_RECOGNITION_RESULT</code> is returned by
     * <code>advance</code>, but before <code>stop</code>.
*
* @return number of results in nbest list.
*/
public int getResultCount() {
return SR_RecognizerResultGetSize(mRecognizer);
}
/**
* Get a set of keys for the result. Must be called after
     * <code>EVENT_RECOGNITION_RESULT</code> is returned by
     * <code>advance</code>, but before <code>stop</code>.
*
* @param index index of result.
* @return array of keys.
*/
public String[] getResultKeys(int index) {
return SR_RecognizerResultGetKeyList(mRecognizer, index);
}
/**
* Get a result value. Must be called after
     * <code>EVENT_RECOGNITION_RESULT</code> is returned by
     * <code>advance</code>, but before <code>stop</code>.
*
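     * <p>For example (a sketch), walking the n-best list after
     * <code>EVENT_RECOGNITION_RESULT</code>:
     * <pre>
     * for (int i = 0; i < recognizer.getResultCount(); i++) {
     *     String literal = recognizer.getResult(i, Recognizer.KEY_LITERAL);
     *     String meaning = recognizer.getResult(i, Recognizer.KEY_MEANING);
     *     String confidence = recognizer.getResult(i, Recognizer.KEY_CONFIDENCE);
     * }
     * </pre>
     *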
* @param index index of the result.
* @param key key of the result. This is typically one of
     * <code>KEY_CONFIDENCE</code>, <code>KEY_LITERAL</code>, or
     * <code>KEY_MEANING</code>, but the user can also define their own keys
     * in a grxml file, or in the <code>tag</code> slot of
     * <code>Grammar.addWordToSlot</code>.
* @return the result.
*/
public String getResult(int index, String key) {
return SR_RecognizerResultGetValue(mRecognizer, index, key);
}
/**
     * Stop the <code>Recognizer</code>.
*/
public void stop() {
SR_RecognizerStop(mRecognizer);
SR_RecognizerDeactivateRule(mRecognizer, mActiveGrammar.mGrammar, "trash");
}
/**
     * Reset the acoustic state vector to its default value.
*
* @hide
*/
public void resetAcousticState() {
SR_AcousticStateReset(mRecognizer);
}
/**
* Set the acoustic state vector.
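     *
     * <p>One possible use (a sketch): capture the state with
     * <code>getAcousticState</code> and restore it before a later session:
     * <pre>
     * String state = recognizer.getAcousticState();
     * // ... persist the string, then before the next session ...
     * recognizer.setAcousticState(state);
     * </pre>
     *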
* @param state String containing the acoustic state vector.
*
* @hide
*/
public void setAcousticState(String state) {
SR_AcousticStateSet(mRecognizer, state);
}
/**
* Get the acoustic state vector.
* @return String containing the acoustic state vector.
*
* @hide
*/
public String getAcousticState() {
return SR_AcousticStateGet(mRecognizer);
}
/**
* Clean up resources.
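     *
     * <p>As a sketch, this is typically paired with <code>stop</code> in a
     * <code>finally</code> block so the native resources are released even if
     * recognition fails:
     * <pre>
     * try {
     *     // ... run recognition ...
     * } finally {
     *     recognizer.stop();
     *     recognizer.destroy();
     * }
     * </pre>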
*/
public void destroy() {
try {
if (mVocabulary != 0) SR_VocabularyDestroy(mVocabulary);
} finally {
mVocabulary = 0;
try {
if (mRecognizer != 0) SR_RecognizerUnsetup(mRecognizer);
} finally {
try {
if (mRecognizer != 0) SR_RecognizerDestroy(mRecognizer);
} finally {
mRecognizer = 0;
try {
SR_SessionDestroy();
} finally {
PMemShutdown();
}
}
}
}
}
/**
* Clean up resources.
*/
protected void finalize() throws Throwable {
if (mVocabulary != 0 || mRecognizer != 0) {
destroy();
throw new IllegalStateException("someone forgot to destroy Recognizer");
}
}
/* an example session captured, for reference
void doall() {
if (PMemInit ( )
|| lhs_audioinOpen ( WAVE_MAPPER, SREC_TEST_DEFAULT_AUDIO_FREQUENCY, &audio_in_handle )
|| srec_test_init_application_data ( &applicationData, argc, argv )
|| SR_SessionCreate ( "/system/usr/srec/config/en.us/baseline11k.par" )
|| SR_RecognizerCreate ( &applicationData.recognizer )
|| SR_RecognizerSetup ( applicationData.recognizer)
|| ESR_SessionGetLCHAR ( L("cmdline.vocabulary"), filename, &flen )
|| SR_VocabularyLoad ( filename, &applicationData.vocabulary )
|| SR_VocabularyGetLanguage ( applicationData.vocabulary, &applicationData.locale )
|| (applicationData.nametag = NULL)
|| SR_NametagsCreate ( &applicationData.nametags )
|| (LSTRCPY ( applicationData.grammars [0].grammar_path, "/system/usr/srec/config/en.us/grammars/VoiceDialer.g2g" ), 0)
|| (LSTRCPY ( applicationData.grammars [0].grammarID, "BothTags" ), 0)
|| (LSTRCPY ( applicationData.grammars [0].ruleName, "trash" ), 0)
|| (applicationData.grammars [0].is_ve_grammar = ESR_FALSE, 0)
|| SR_GrammarLoad (applicationData.grammars [0].grammar_path, &applicationData.grammars [applicationData.grammarCount].grammar )
|| SR_GrammarSetupVocabulary ( applicationData.grammars [0].grammar, applicationData.vocabulary )
|| SR_GrammarSetupRecognizer( applicationData.grammars [0].grammar, applicationData.recognizer )
|| SR_GrammarSetDispatchFunction ( applicationData.grammars [0].grammar, L("myDSMCallback"), NULL, myDSMCallback )
|| (applicationData.grammarCount++, 0)
|| SR_RecognizerActivateRule ( applicationData.recognizer, applicationData.grammars [0].grammar,
applicationData.grammars [0].ruleName, 1 )
|| (applicationData.active_grammar_num = 0, 0)
|| lhs_audioinStart ( audio_in_handle )
|| SR_RecognizerStart ( applicationData.recognizer )
|| strl ( applicationData.grammars [0].grammar, &applicationData, audio_in_handle, &recognition_count )
|| SR_RecognizerStop ( applicationData.recognizer )
|| lhs_audioinStop ( audio_in_handle )
|| SR_RecognizerDeactivateRule ( applicationData.recognizer, applicationData.grammars [0].grammar, applicationData.grammars [0].ruleName )
|| (applicationData.active_grammar_num = -1, 0)
|| SR_GrammarDestroy ( applicationData.grammars [0].grammar )
|| (applicationData.grammarCount--, 0)
|| SR_NametagsDestroy ( applicationData.nametags )
|| (applicationData.nametags = NULL, 0)
|| SR_VocabularyDestroy ( applicationData.vocabulary )
|| (applicationData.vocabulary = NULL)
|| SR_RecognizerUnsetup ( applicationData.recognizer) // releases acoustic models
|| SR_RecognizerDestroy ( applicationData.recognizer )
|| (applicationData.recognizer = NULL)
|| SR_SessionDestroy ( )
|| srec_test_shutdown_application_data ( &applicationData )
|| lhs_audioinClose ( &audio_in_handle )
|| PMemShutdown ( )
}
*/
//
// PMem native methods
//
private static native void PMemInit();
private static native void PMemShutdown();
//
// SR_Session native methods
//
private static native void SR_SessionCreate(String filename);
private static native void SR_SessionDestroy();
//
// SR_Recognizer native methods
//
/**
* Reserved value.
*/
public final static int EVENT_INVALID = 0;
/**
     * <code>Recognizer</code> could not find a match for the utterance.
*/
public final static int EVENT_NO_MATCH = 1;
/**
     * <code>Recognizer</code> processed one frame of audio.
*/
public final static int EVENT_INCOMPLETE = 2;
/**
     * <code>Recognizer</code> has just been started.
*/
public final static int EVENT_STARTED = 3;
/**
     * <code>Recognizer</code> is stopped.
*/
public final static int EVENT_STOPPED = 4;
/**
* Beginning of speech detected.
*/
public final static int EVENT_START_OF_VOICING = 5;
/**
* End of speech detected.
*/
public final static int EVENT_END_OF_VOICING = 6;
/**
     * Beginning of utterance occurred too soon.
*/
public final static int EVENT_SPOKE_TOO_SOON = 7;
/**
* Recognition match detected.
*/
public final static int EVENT_RECOGNITION_RESULT = 8;
/**
     * Timeout occurred before beginning of utterance.
*/
public final static int EVENT_START_OF_UTTERANCE_TIMEOUT = 9;
/**
     * Timeout occurred before speech recognition could complete.
*/
public final static int EVENT_RECOGNITION_TIMEOUT = 10;
/**
* Not enough samples to process one frame.
*/
public final static int EVENT_NEED_MORE_AUDIO = 11;
/**
* More audio encountered than is allowed by 'swirec_max_speech_duration'.
*/
public final static int EVENT_MAX_SPEECH = 12;
/**
     * Produce a displayable string from an <code>advance</code> event.
     *
     * @param event recognition event, one of the <code>EVENT_</code> constants above.
* @return String representing the event.
*/
public static String eventToString(int event) {
switch (event) {
case EVENT_INVALID:
return "EVENT_INVALID";
case EVENT_NO_MATCH:
return "EVENT_NO_MATCH";
case EVENT_INCOMPLETE:
return "EVENT_INCOMPLETE";
case EVENT_STARTED:
return "EVENT_STARTED";
case EVENT_STOPPED:
return "EVENT_STOPPED";
case EVENT_START_OF_VOICING:
return "EVENT_START_OF_VOICING";
case EVENT_END_OF_VOICING:
return "EVENT_END_OF_VOICING";
case EVENT_SPOKE_TOO_SOON:
return "EVENT_SPOKE_TOO_SOON";
case EVENT_RECOGNITION_RESULT:
return "EVENT_RECOGNITION_RESULT";
case EVENT_START_OF_UTTERANCE_TIMEOUT:
return "EVENT_START_OF_UTTERANCE_TIMEOUT";
case EVENT_RECOGNITION_TIMEOUT:
return "EVENT_RECOGNITION_TIMEOUT";
case EVENT_NEED_MORE_AUDIO:
return "EVENT_NEED_MORE_AUDIO";
case EVENT_MAX_SPEECH:
return "EVENT_MAX_SPEECH";
}
return "EVENT_" + event;
}
//
// SR_Recognizer methods
//
private static native void SR_RecognizerStart(long recognizer);
private static native void SR_RecognizerStop(long recognizer);
private static native long SR_RecognizerCreate();
private static native void SR_RecognizerDestroy(long recognizer);
private static native void SR_RecognizerSetup(long recognizer);
private static native void SR_RecognizerUnsetup(long recognizer);
private static native boolean SR_RecognizerIsSetup(long recognizer);
private static native String SR_RecognizerGetParameter(long recognizer, String key);
private static native int SR_RecognizerGetSize_tParameter(long recognizer, String key);
private static native boolean SR_RecognizerGetBoolParameter(long recognizer, String key);
private static native void SR_RecognizerSetParameter(long recognizer, String key, String value);
private static native void SR_RecognizerSetSize_tParameter(long recognizer,
String key, int value);
private static native void SR_RecognizerSetBoolParameter(long recognizer, String key,
boolean value);
private static native void SR_RecognizerSetupRule(long recognizer, long grammar,
String ruleName);
private static native boolean SR_RecognizerHasSetupRules(long recognizer);
private static native void SR_RecognizerActivateRule(long recognizer, long grammar,
String ruleName, int weight);
private static native void SR_RecognizerDeactivateRule(long recognizer, long grammar,
String ruleName);
private static native void SR_RecognizerDeactivateAllRules(long recognizer);
private static native boolean SR_RecognizerIsActiveRule(long recognizer, long grammar,
String ruleName);
private static native boolean SR_RecognizerCheckGrammarConsistency(long recognizer,
long grammar);
private static native int SR_RecognizerPutAudio(long recognizer, byte[] buffer, int offset,
int length, boolean isLast);
private static native int SR_RecognizerAdvance(long recognizer);
// private static native void SR_RecognizerLoadUtterance(long recognizer,
// const LCHAR* filename);
// private static native void SR_RecognizerLoadWaveFile(long recognizer,
// const LCHAR* filename);
// private static native void SR_RecognizerSetLockFunction(long recognizer,
// SR_RecognizerLockFunction function, void* data);
private static native boolean SR_RecognizerIsSignalClipping(long recognizer);
private static native boolean SR_RecognizerIsSignalDCOffset(long recognizer);
private static native boolean SR_RecognizerIsSignalNoisy(long recognizer);
private static native boolean SR_RecognizerIsSignalTooQuiet(long recognizer);
private static native boolean SR_RecognizerIsSignalTooFewSamples(long recognizer);
private static native boolean SR_RecognizerIsSignalTooManySamples(long recognizer);
// private static native void SR_Recognizer_Change_Sample_Rate (size_t new_sample_rate);
//
// SR_AcousticState native methods
//
private static native void SR_AcousticStateReset(long recognizer);
private static native void SR_AcousticStateSet(long recognizer, String state);
private static native String SR_AcousticStateGet(long recognizer);
//
// SR_Grammar native methods
//
private static native void SR_GrammarCompile(long grammar);
private static native void SR_GrammarAddWordToSlot(long grammar, String slot,
String word, String pronunciation, int weight, String tag);
private static native void SR_GrammarResetAllSlots(long grammar);
// private static native void SR_GrammarAddNametagToSlot(long grammar, String slot,
// const struct SR_Nametag_t* nametag, int weight, String tag);
private static native void SR_GrammarSetupVocabulary(long grammar, long vocabulary);
// private static native void SR_GrammarSetupModels(long grammar, SR_AcousticModels* models);
private static native void SR_GrammarSetupRecognizer(long grammar, long recognizer);
private static native void SR_GrammarUnsetupRecognizer(long grammar);
// private static native void SR_GrammarGetModels(long grammar,SR_AcousticModels** models);
private static native long SR_GrammarCreate();
private static native void SR_GrammarDestroy(long grammar);
private static native long SR_GrammarLoad(String filename);
private static native void SR_GrammarSave(long grammar, String filename);
// private static native void SR_GrammarSetDispatchFunction(long grammar,
// const LCHAR* name, void* userData, SR_GrammarDispatchFunction function);
// private static native void SR_GrammarSetParameter(long grammar, const
// LCHAR* key, void* value);
// private static native void SR_GrammarSetSize_tParameter(long grammar,
// const LCHAR* key, size_t value);
// private static native void SR_GrammarGetParameter(long grammar, const
// LCHAR* key, void** value);
// private static native void SR_GrammarGetSize_tParameter(long grammar,
// const LCHAR* key, size_t* value);
// private static native void SR_GrammarCheckParse(long grammar, const LCHAR*
// transcription, SR_SemanticResult** result, size_t* resultCount);
private static native void SR_GrammarAllowOnly(long grammar, String transcription);
private static native void SR_GrammarAllowAll(long grammar);
//
// SR_Vocabulary native methods
//
// private static native int SR_VocabularyCreate();
private static native long SR_VocabularyLoad();
// private static native void SR_VocabularySave(SR_Vocabulary* self,
// const LCHAR* filename);
// private static native void SR_VocabularyAddWord(SR_Vocabulary* self,
// const LCHAR* word);
// private static native void SR_VocabularyGetLanguage(SR_Vocabulary* self,
// ESR_Locale* locale);
private static native void SR_VocabularyDestroy(long vocabulary);
private static native String SR_VocabularyGetPronunciation(long vocabulary, String word);
//
// SR_RecognizerResult native methods
//
private static native byte[] SR_RecognizerResultGetWaveform(long recognizer);
private static native int SR_RecognizerResultGetSize(long recognizer);
private static native int SR_RecognizerResultGetKeyCount(long recognizer, int nbest);
private static native String[] SR_RecognizerResultGetKeyList(long recognizer, int nbest);
private static native String SR_RecognizerResultGetValue(long recognizer,
int nbest, String key);
// private static native void SR_RecognizerResultGetLocale(long recognizer, ESR_Locale* locale);
}