|
Public Types |
typedef Trie< Array > | Trie |
typedef Trie::Iterator | Iterator |
typedef SymbolMap< std::string, int > | SymbolMap |
typedef std::vector< float > | FloatVec |
typedef std::vector< int > | IntVec |
typedef std::vector< int > | Ngram |
Public Member Functions |
| Kneser () |
| Default constructor.
|
void | set_d1_model (int model) |
| Set model used in d1 computation (0 = kn, 1 = abs).
|
void | set_d1_weight_model (int model) |
| Set model used as ngram weight in d1 computation (0 = kn, 1 = abs).
|
const SymbolMap & | symbol_map () const |
| Constant access to symbol map.
|
int | sentence_start_id () const |
| Index of the sentence start symbol.
|
int | sentence_end_id () const |
| Index of the sentence end symbol.
|
Iterator | root () const |
| Iterator at the root of the trie.
|
u64 | num_ngrams () const |
| Return the number of ngrams in the unpruned model.
|
u64 | num_active_ngrams () const |
| Return the number of active ngrams in the pruned model.
|
template<class T> |
Iterator | find (const std::vector< T > &vec) const |
| Find an ngram.
|
Iterator | find (const std::string &str) const |
| Find an ngram.
|
float | ngram_prob (Iterator it) const |
| Probability of ngram P(abc) = P(a) P(b|a) P(c|ab).
|
float | prob_beta_lower (Iterator it) const |
| Compute conditional probability P(w|h) using only lower-order probabilities of the beta distribution (with the interpolation weight of h).
|
Ngram | ngram (const std::string &str) const |
| Convert a string to vector of symbol indices.
|
float | prob_beta_full (const Iterator &it) const |
| Compute conditional probability P(w|h) for ngrams found explicitly in the model using the beta distribution.
|
float | prob_beta_full (Ngram ngram) const |
| Compute conditional probability P(w|h) for arbitrary ngram using the beta distribution (Warning: implementation may be quite slow).
|
float | prob_lower (Iterator it) const |
| Compute conditional probability P(w|h) using only lower-order probabilities (with the interpolation weight of h).
|
float | prob_full (const Iterator &it, float *lower_prob=NULL) const |
| Compute conditional probability P(w|h).
|
float | prob_abs_lower (Iterator it) const |
| Compute conditional probability P(w|h) using unmodified counts and using only lower-order probabilities (with the interpolation weight of h).
|
float | prob_abs_full (const Iterator &it, float *lower_prob=NULL) const |
| Compute conditional probability P(w|h) using unmodified counts.
|
bool | is_pruned (const Iterator &it) const |
| Check if iterator is pruned.
|
u32 | get_count (const Iterator &it) const |
| Get the ngram count at the iterator.
|
u32 | sum_gx (const Iterator &it) const |
u32 | sum_nonzero_xg (const Iterator &it) const |
u32 | sum_nonzero_gx (const Iterator &it) const |
u32 | sum_nonzero_xgx (const Iterator &it) const |
float | get_beta_numerator (const Iterator &it) const |
float | get_beta_denominator (const Iterator &it) const |
| Return the beta denominator (1 for highest-order ngrams).
|
float | get_beta_interpolation_numerator (const Iterator &it) const |
float | get_d1 (const Iterator &it) const |
| Return the d1 measure of an ngram.
|
float | get_d2 (const Iterator &it) const |
| Return the unnormalized d2 measure of an ngram.
|
int | num_active_children (Iterator it) const |
| Compute the number of active (not pruned) children.
|
template<class T> |
std::string | ngram_str (const std::vector< T > &ngram) const |
| Printable string of an ngram.
|
void | write_binary_counts (FILE *file) const |
| Write trie structure, counts, modified counts, sum_nonzero_xg, sum_nonzero_xgx and sum_nonzero_gx in a file.
|
void | read_binary_counts (FILE *file) |
| Read trie structure, counts, modified counts, sum_nonzero_xg, sum_nonzero_xgx and sum_nonzero_gx from a file.
|
void | write_binary_d1d2 (FILE *file) const |
| Write d1 and d2.
|
void | read_binary_d1d2 (FILE *file) |
| Read d1 and d2.
|
void | write_arpa (FILE *file) const |
| Write model in ARPA format.
|
void | write_beta_arpa (FILE *file) const |
| Write model in ARPA format.
|
void | reserve_orders (unsigned int orders) |
| Reserve space for orders to avoid reallocating.
|
void | read_counts (FILE *file, bool integer_symbols=false) |
| Read counts from an ASCII file.
|
void | compute_sums () |
| Compute Kneser-Ney modified counts.
|
void | compute_d1 () |
| Compute d1 measure for all ngrams (n > 1) using log10.
|
void | compute_d2_full () |
| Compute d2 measure for each node when no nodes are yet pruned.
|
void | compute_d2_trick () |
| Compute d2 measure for all ngrams (n > 1).
|
void | prune_ngram (Iterator it) |
| Prune ngram, modify parents' d2 measure, and remove children.
|
void | prune_threshold (float threshold) |
| Prune ngrams (n > 1) whose d2 is under threshold.
|
void | prune (unsigned int ngrams) |
| Prune ngrams (n > 1) according to the d2 measure.
|
void | compute_beta_numerator_terms () |
| Precompute numerator terms for Kneser's improved back-off distribution.
|
void | compute_beta_denominator () |
| Precompute denominators for Kneser's improved back-off distribution.
|
int | compute_active_children (Iterator it, int *pruned_counts=NULL) |
| Compute number of active children and sum of pruned child counts.
|
void | compute_beta_interpolation_numerator () |
| Compute beta distribution and interpolation.
|
Iterator | add (const std::vector< int > &vec, int value) |
| Increment count.
|
void | set_discount (unsigned int order, float value) |
| Set the discounting parameter.
|
float | get_discount (unsigned int order) const |
| Get the discount parameter for the given order.
|
float | get_beta_discount (unsigned int order) const |
| Get the beta discount parameter for the given order.
|
float | interpolation (const Iterator &it) const |
float | interpolation_abs (const Iterator &it) const |
void | set_count (const Iterator &it, u32 value) |
| Set the count of the ngram at the iterator.
|
std::string | debug_sum_nonzero_xg_str () |
void | debug_write_counts (FILE *file) |
Private Member Functions |
float | get_value (const std::vector< FloatVec > &arrays, const Iterator &it) const |
int | get_value (const std::vector< IntVec > &arrays, const Iterator &it) const |
u32 | get_value (const std::vector< Array > &arrays, const Iterator &it) const |
void | set_value (std::vector< FloatVec > &arrays, const Iterator &it, float value) |
void | set_value (std::vector< IntVec > &arrays, const Iterator &it, int value) |
void | set_value (std::vector< Array > &arrays, const Iterator &it, u32 value) |
void | add_value (std::vector< Array > &arrays, const Iterator &it, u32 value) |
void | add_value (std::vector< FloatVec > &arrays, const Iterator &it, float value) |
void | add_value (std::vector< IntVec > &arrays, const Iterator &it, int value) |
void | sub_value (std::vector< Array > &arrays, const Iterator &it, u32 value) |
Private Attributes |
int | m_progress_skip |
| Skip between progress reports.
|
Trie | m_trie |
| Trie containing the ngrams.
|
std::vector< int > | m_num_ngrams |
| Number of ngrams in the model (updated in pruning).
|
std::vector< IntVec > | m_counts |
| Real counts for each order.
|
std::vector< IntVec > | m_sum_gx |
| Sum of counts.
|
int | m_sum_gx0 |
| Sum of counts for unigrams.
|
std::vector< IntVec > | m_sum_nonzero_xg |
| Modified counts for each order.
|
std::vector< IntVec > | m_sum_nonzero_gx |
| Modified normalization counts for each order.
|
int | m_sum_nonzero_gx0 |
| Modified normalization counts for unigrams.
|
std::vector< IntVec > | m_sum_nonzero_xgx |
| Precomputed sum of modified counts for each order.
|
int | m_sum_nonzero_xgx0 |
| Precomputed sum of modified counts for unigrams.
|
std::vector< float > | m_discounts |
| Discount parameters for each order (index 0 is for unigrams).
|
std::vector< float > | m_beta_discounts |
| Beta discount parameters for each order (index 0 is for unigrams).
|
SymbolMap | m_symbol_map |
| Symbol set for mapping between symbols and integers.
|
std::string | m_sentence_start_str |
| String of the sentence start symbol.
|
std::string | m_sentence_end_str |
| String of the sentence end symbol.
|
int | m_sentence_start_id |
| Index of the sentence start symbol.
|
int | m_sentence_end_id |
| Index of the sentence end symbol.
|
|
int | m_d1_weight_model |
| Probability distribution used for computing ngram weight in d1 computation (0 = kneser-ney, 1 = absolute-discounting, 2 = counts).
|
int | m_d1_model |
| Probability distribution used in d1 computation (0 = kneser-ney, 1 = absolute-discounting).
|
|
std::vector< Array > | m_pruned |
std::vector< FloatVec > | m_d1 |
std::vector< FloatVec > | m_d2 |
std::vector< IntVec > | m_d2_norm |
std::vector< IntVec > | m_sum_xg_not_pruned |
std::vector< IntVec > | m_sum_nonzero_xg_not_pruned |
std::vector< FloatVec > | m_beta_denominator |
| Precomputed denominator for Kneser's improved back-off distribution.
|
float | m_beta_denominator0 |
| Precomputed unigram-denominator for Kneser's improved back-off distribution.
|
std::vector< FloatVec > | m_beta_interpolation_numerator |
| Precomputed interpolation weights for Kneser's improved back-off distribution.
|
float | m_beta_interpolation_numerator0 |
| Precomputed interpolation weight for zero-grams.
|
Classes |
struct | D2Norm |
struct | OrderIndex |
struct | PruneCompare |
Statistical Language Modeling Using a Variable Context Length. EUROSPEECH 1997.