diff --git a/README.md b/README.md index 9ce84cd10..8a7da7b8a 100644 --- a/README.md +++ b/README.md @@ -120,8 +120,10 @@ The commands are as follows: all subsequent ones are concatenated to make the text, to avoid surprises if you forget to quote it. You are responsible for normalizing the text to remove punctuation, uppercase, centipedes, - etc. For example: - + etc. Unless `align_use_main_beams` is set, forced alignment uses + `align_beam`, `align_pbeam`, and `align_wbeam` instead of `beam`, + `pbeam`, and `wbeam`; see `pocketsphinx help-config`. For example: + pocketsphinx align goforward.wav "go forward ten meters" By default, only word-level alignment is done. To get phone diff --git a/cython/_pocketsphinx.pyx b/cython/_pocketsphinx.pyx index d5ef2da64..83c898eb7 100644 --- a/cython/_pocketsphinx.pyx +++ b/cython/_pocketsphinx.pyx @@ -1668,6 +1668,10 @@ cdef class Decoder: segmentation in the usual manner. For phone-level alignment, see `set_alignment` and `get_alignment`. + Pruning for this pass uses ``align_beam``, ``align_pbeam``, and + ``align_wbeam`` by default; set ``align_use_main_beams`` to use the + main decoder ``beam``/``pbeam``/``wbeam`` instead. + Args: text(str): Sentence to align, as whitespace-separated words. All words must be present in the diff --git a/doxygen/pocketsphinx.1 b/doxygen/pocketsphinx.1 index 689e36d93..2fe7d6bbd 100644 --- a/doxygen/pocketsphinx.1 +++ b/doxygen/pocketsphinx.1 @@ -59,7 +59,12 @@ sequence, and write a JSON object in the same format described above. The first positional argument is the input, and all subsequent ones are concatenated to make the text, to avoid surprises if you forget to quote it. You are responsible for normalizing the text to remove -punctuation, uppercase, centipedes, etc. For example: +punctuation, uppercase, centipedes, etc. 
+Unless \fIalign_use_main_beams\fP is set, forced alignment uses +\fIalign_beam\fP, \fIalign_pbeam\fP, and \fIalign_wbeam\fP +instead of \fIbeam\fP, \fIpbeam\fP, and \fIwbeam\fP; defaults are +independent of LVCSR tuning (see \fBpocketsphinx help-config\fP). +For example: .EX pocketsphinx align goforward.wav "go forward ten meters" diff --git a/doxygen/pocketsphinx.1.in b/doxygen/pocketsphinx.1.in index 4ad4021dc..0f5b97f41 100644 --- a/doxygen/pocketsphinx.1.in +++ b/doxygen/pocketsphinx.1.in @@ -59,7 +59,12 @@ sequence, and write a JSON object in the same format described above. The first positional argument is the input, and all subsequent ones are concatenated to make the text, to avoid surprises if you forget to quote it. You are responsible for normalizing the text to remove -punctuation, uppercase, centipedes, etc. For example: +punctuation, uppercase, centipedes, etc. +Unless \fIalign_use_main_beams\fP is set, forced alignment uses +\fIalign_beam\fP, \fIalign_pbeam\fP, and \fIalign_wbeam\fP +instead of \fIbeam\fP, \fIpbeam\fP, and \fIwbeam\fP; defaults are +independent of LVCSR tuning (see \fBpocketsphinx help-config\fP). +For example: .EX pocketsphinx align goforward.wav "go forward ten meters" diff --git a/include/pocketsphinx/search.h b/include/pocketsphinx/search.h index 35cbff87b..62d85f078 100644 --- a/include/pocketsphinx/search.h +++ b/include/pocketsphinx/search.h @@ -336,6 +336,11 @@ int ps_add_allphone_file(ps_decoder_t *ps, const char *name, const char *path); * phoneme or state segmentations, you must subsequently call * ps_set_alignment() and re-run decoding. It's tough son, but it's life. * + * By default, pruning uses align_beam, align_pbeam, and align_wbeam + * rather than beam, pbeam, and wbeam. If align_use_main_beams is enabled, + * the main decoder beams are used instead. Defaults for align_* are tuned + * for forced alignment and are independent of LVCSR beam defaults. + * * @memberof ps_decoder_t * @param ps Decoder * @param words String containing whitespace-separated words for alignment. 
diff --git a/programs/pocketsphinx_main.c b/programs/pocketsphinx_main.c index 1c7a6c0cc..e53b8ccb2 100644 --- a/programs/pocketsphinx_main.c +++ b/programs/pocketsphinx_main.c @@ -753,6 +753,8 @@ usage_align(char *name) fprintf(stderr, " INPUT Audio file to align (or '-' for stdin)\n"); fprintf(stderr, " WORDS... Words to align to (will be concatenated)\n"); fprintf(stderr, "\nAlignment-specific options:\n"); + fprintf(stderr, " -align_beam FLOAT Beam for forced-alignment FSG (see also -align_pbeam, -align_wbeam)\n"); + fprintf(stderr, " -align_use_main_beams yes/no Use -beam/-pbeam/-wbeam for alignment FSG (default: no)\n"); fprintf(stderr, " -phone_align yes/no Run a second pass to align phones and print their durations\n"); fprintf(stderr, " (default: no)\n"); fprintf(stderr, " -state_align yes/no Run a second pass to align phones and states and print their\n"); diff --git a/src/config_macro.h b/src/config_macro.h index 769412ed1..9b3d0b284 100644 --- a/src/config_macro.h +++ b/src/config_macro.h @@ -61,6 +61,8 @@ POCKETSPHINX_FEAT_OPTIONS, \ POCKETSPHINX_ACMOD_OPTIONS, \ POCKETSPHINX_BEAM_OPTIONS, \ + POCKETSPHINX_ALIGN_BEAM_OPTIONS, \ + POCKETSPHINX_ALIGN_FSG_OPTIONS, \ POCKETSPHINX_SEARCH_OPTIONS, \ POCKETSPHINX_DICT_OPTIONS, \ POCKETSPHINX_NGRAM_OPTIONS, \ @@ -144,6 +146,32 @@ "3.0", \ "Weight for phoneme lookahead penalties" } \ +/** Beam widths for forced-alignment FSG (ps_set_align_text, align subcommand). */ +#define POCKETSPHINX_ALIGN_BEAM_OPTIONS \ +{ "align_beam", \ + ARG_FLOATING, \ + "1e-48", \ + "Beam width for each frame in forced-alignment FSG search " \ + "(independent of beam)" }, \ + { "align_pbeam", \ + ARG_FLOATING, \ + "1e-48", \ + "Phone-transition beam for forced-alignment FSG search " \ + "(independent of pbeam)" }, \ + { "align_wbeam", \ + ARG_FLOATING, \ + "1e-48", \ + "Word-exit beam for forced-alignment FSG search " \ + "(independent of wbeam)" } \ +/** Toggle: use main decoder beams for forced-alignment FSG instead of align_*. 
*/ +#define POCKETSPHINX_ALIGN_FSG_OPTIONS \ +{ "align_use_main_beams", \ + ARG_BOOLEAN, \ + "no", \ + "If yes, forced-alignment FSG uses beam, pbeam, wbeam instead of " \ + "align_beam, align_pbeam, align_wbeam" } \ + /** Options defining other parameters for tuning the search. */ #define POCKETSPHINX_SEARCH_OPTIONS \ { "compallsen", \ diff --git a/src/fsg_search.c b/src/fsg_search.c index d4db17c5f..4ee5a17d6 100644 --- a/src/fsg_search.c +++ b/src/fsg_search.c @@ -171,6 +171,23 @@ fsg_search_add_altpron(fsg_search_t *fsgs, fsg_model_t *fsg) return n_alt; } +static void +fsg_search_beam_config(ps_config_t *config, char const *name, + double *beam, double *pbeam, double *wbeam) +{ /* Reads align_beam/align_pbeam/align_wbeam when name is PS_DEFAULT_ALIGN_SEARCH and align_use_main_beams is off; otherwise beam/pbeam/wbeam. */ + if (name != NULL && strcmp(name, PS_DEFAULT_ALIGN_SEARCH) == 0 + && !ps_config_bool(config, "align_use_main_beams")) { + *beam = ps_config_float(config, "align_beam"); + *pbeam = ps_config_float(config, "align_pbeam"); + *wbeam = ps_config_float(config, "align_wbeam"); + } + else { + *beam = ps_config_float(config, "beam"); + *pbeam = ps_config_float(config, "pbeam"); + *wbeam = ps_config_float(config, "wbeam"); + } +} + ps_search_t * fsg_search_init(const char *name, fsg_model_t *fsg, @@ -196,16 +213,21 @@ fsg_search_init(const char *name, fsgs->frame = -1; /* Get search pruning parameters */ - fsgs->beam_factor = 1.0f; - fsgs->beam = fsgs->beam_orig - = (int32) logmath_log(acmod->lmath, ps_config_float(config, "beam")) - >> SENSCR_SHIFT; - fsgs->pbeam = fsgs->pbeam_orig - = (int32) logmath_log(acmod->lmath, ps_config_float(config, "pbeam")) - >> SENSCR_SHIFT; - fsgs->wbeam = fsgs->wbeam_orig - = (int32) logmath_log(acmod->lmath, ps_config_float(config, "wbeam")) - >> SENSCR_SHIFT; + { + double fl_beam, fl_pbeam, fl_wbeam; /* double, not float: 1e-48 beam values underflow to 0 in single precision */ + + fsgs->beam_factor = 1.0f; + fsg_search_beam_config(config, name, &fl_beam, &fl_pbeam, &fl_wbeam); + fsgs->beam = fsgs->beam_orig + = (int32) logmath_log(acmod->lmath, fl_beam) + >> SENSCR_SHIFT; + fsgs->pbeam = fsgs->pbeam_orig + = (int32) logmath_log(acmod->lmath, 
fl_pbeam) + >> SENSCR_SHIFT; + fsgs->wbeam = fsgs->wbeam_orig + = (int32) logmath_log(acmod->lmath, fl_wbeam) + >> SENSCR_SHIFT; + } /* LM related weights/penalties */ fsgs->lw = ps_config_float(config, "lw"); @@ -242,6 +264,11 @@ fsg_search_init(const char *name, if (ps_config_bool(config, "bestpath")) fsgs->bestpath = TRUE; #endif + /* Forced-alignment FSG: hyp() must list the full transcript. Lattice + * bestpath can return a shorter string than the Viterbi backtrace (see + * pocketsphinx_main align(), which disables bestpath). */ + if (name != NULL && strcmp(name, PS_DEFAULT_ALIGN_SEARCH) == 0) + fsgs->bestpath = FALSE; if (fsg_search_reinit(ps_search_base(fsgs), ps_search_dict(fsgs), diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 7833c19c4..59d227b60 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -40,6 +40,7 @@ set(TESTS test_vad test_vad_alloc test_word_align + test_align_fsg_beam test_endpointer test_endpointer_timestamp test_thread_local_compile diff --git a/test/unit/test_align_fsg_beam.c b/test/unit/test_align_fsg_beam.c new file mode 100644 index 000000000..5f352fe9a --- /dev/null +++ b/test/unit/test_align_fsg_beam.c @@ -0,0 +1,60 @@ +/* -*- c-basic-offset: 4 -*- */ +#include <pocketsphinx.h> + +#include "pocketsphinx_internal.h" +#include "fsg_search_internal.h" +#include "util/hash_table.h" +#include "test_macros.h" + +int +main(int argc, char *argv[]) +{ + ps_decoder_t *ps; + ps_config_t *config; + void *search_p; + fsg_search_t *fsgs; + + (void)argc; + (void)argv; + err_set_loglevel(ERR_INFO); + /* Stock asymmetry: wbeam differs from beam; forced-align FSG must use align_* */ + TEST_ASSERT(config = + ps_config_parse_json( + NULL, + "loglevel: INFO, bestpath: false," + "hmm: \"" MODELDIR "/en-us/en-us\"," + "dict: \"" MODELDIR "/en-us/cmudict-en-us.dict\"," + "samprate: 16000," + "wbeam: 7e-29")); + TEST_ASSERT(ps = ps_init(config)); + TEST_EQUAL(0, ps_set_align_text(ps, "go forward ten meters")); + TEST_EQUAL(0, 
hash_table_lookup(ps->searches, PS_DEFAULT_ALIGN_SEARCH, + &search_p)); + fsgs = (fsg_search_t *)search_p; + TEST_EQUAL(fsgs->wbeam_orig, fsgs->beam_orig); + TEST_EQUAL(fsgs->pbeam_orig, fsgs->beam_orig); + + ps_free(ps); + ps_config_free(config); + + /* With align_use_main_beams, FSG uses global wbeam (asymmetric from beam). */ + TEST_ASSERT(config = + ps_config_parse_json( + NULL, + "loglevel: ERROR, bestpath: false," + "hmm: \"" MODELDIR "/en-us/en-us\"," + "dict: \"" MODELDIR "/en-us/cmudict-en-us.dict\"," + "samprate: 16000," + "wbeam: 7e-29," + "align_use_main_beams: yes")); + TEST_ASSERT(ps = ps_init(config)); + TEST_EQUAL(0, ps_set_align_text(ps, "go forward ten meters")); + TEST_EQUAL(0, hash_table_lookup(ps->searches, PS_DEFAULT_ALIGN_SEARCH, + &search_p)); + fsgs = (fsg_search_t *)search_p; + TEST_ASSERT(fsgs->wbeam_orig != fsgs->beam_orig); + + ps_free(ps); + ps_config_free(config); + return 0; +}