bit::Kneser Class Reference

Pruning algorithm for building variable-order models with Kneser-Ney smoothing presented in: Reinhard Kneser. More...

#include <Kneser.hh>

List of all members.

Public Types

typedef Trie< Array > Trie
typedef Trie::Iterator Iterator
typedef SymbolMap< std::string, int > SymbolMap
typedef std::vector< float > FloatVec
typedef std::vector< int > IntVec
typedef std::vector< int > Ngram

Public Member Functions

 Kneser ()
 Default constructor.
void set_d1_model (int model)
 Set model used in d1 computation (0 = kn, 1 = abs).
void set_d1_weight_model (int model)
 Set model used as ngram weight in d1 computation (0 = kn, 1 = abs).
const SymbolMap & symbol_map () const
 Constant access to symbol map.
int sentence_start_id () const
 Index of the sentence start symbol.
int sentence_end_id () const
 Index of the sentence end symbol.
Iterator root () const
 Iterator at the root of the trie.
u64 num_ngrams () const
 Return the number of ngrams in the unpruned model.
u64 num_active_ngrams () const
 Return the number of active ngrams in the pruned model.
template<class T>
Iterator find (const std::vector< T > &vec) const
 Find an ngram.
Iterator find (const std::string &str) const
 Find an ngram.
float ngram_prob (Iterator it) const
 Probability of ngram P(abc) = P(a) P(b|a) P(c|ab).
float prob_beta_lower (Iterator it) const
 Compute conditional probability P(w|h) using only lower-order probabilities of the beta distribution (with the interpolation weight of h).
Ngram ngram (const std::string &str) const
 Convert a string to vector of symbol indices.
float prob_beta_full (const Iterator &it) const
 Compute conditional probability P(w|h) for ngrams found explicitly in the model using the beta distribution.
float prob_beta_full (Ngram ngram) const
 Compute conditional probability P(w|h) for arbitrary ngram using the beta distribution (Warning: implementation may be quite slow).
float prob_lower (Iterator it) const
 Compute conditional probability P(w|h) using only lower-order probabilities (with the interpolation weight of h).
float prob_full (const Iterator &it, float *lower_prob=NULL) const
 Compute conditional probability P(w|h).
float prob_abs_lower (Iterator it) const
 Compute conditional probability P(w|h) using unmodified counts and using only lower-order probabilities (with the interpolation weight of h).
float prob_abs_full (const Iterator &it, float *lower_prob=NULL) const
 Compute conditional probability P(w|h) using unmodified counts.
bool is_pruned (const Iterator &it) const
 Check if iterator is pruned.
u32 get_count (const Iterator &it) const
 Get the ngram count at the iterator.
u32 sum_gx (const Iterator &it) const
u32 sum_nonzero_xg (const Iterator &it) const
u32 sum_nonzero_gx (const Iterator &it) const
u32 sum_nonzero_xgx (const Iterator &it) const
float get_beta_numerator (const Iterator &it) const
float get_beta_denominator (const Iterator &it) const
 Return the beta denominator (1 for highest-order ngrams).
float get_beta_interpolation_numerator (const Iterator &it) const
float get_d1 (const Iterator &it) const
 Return the d1 measure of an ngram.
float get_d2 (const Iterator &it) const
 Return the unnormalized d2 measure of an ngram.
int num_active_children (Iterator it) const
 Compute the number of active (not pruned) children.
template<class T>
std::string ngram_str (const std::vector< T > &ngram) const
 Printable string of a ngram.
void write_binary_counts (FILE *file) const
 Write trie structure, counts, modified counts, sum_nonzero_xg, sum_nonzero_xgx and sum_nonzero_gx in a file.
void read_binary_counts (FILE *file)
 Read trie structure, counts, modified counts, sum_nonzero_xg, sum_nonzero_xgx and sum_nonzero_gx from a file.
void write_binary_d1d2 (FILE *file) const
 Write d1 and d2.
void read_binary_d1d2 (FILE *file)
 Read d1 and d2.
void write_arpa (FILE *file) const
 Write model in ARPA format.
void write_beta_arpa (FILE *file) const
 Write model in ARPA format.
void reserve_orders (unsigned int orders)
 Reserve space for orders to avoid reallocating.
void read_counts (FILE *file, bool integer_symbols=false)
 Read counts from an ASCII file.
void compute_sums ()
 Compute Kneser-Ney modified counts.
void compute_d1 ()
 Compute d1 measure for all ngrams (n > 1) using log10.
void compute_d2_full ()
 Compute d2 measure for each node when no nodes are yet pruned.
void compute_d2_trick ()
 Compute d2 measure for all ngrams (n > 1).
void prune_ngram (Iterator it)
 Prune ngram, modify parents' d2 measure, and remove children.
void prune_threshold (float threshold)
 Prune ngrams (n > 1) whose d2 is under threshold.
void prune (unsigned int ngrams)
 Prune ngrams (n > 1) according to the d2 measure.
void compute_beta_numerator_terms ()
 Precompute numerator terms for Kneser's improved back-off distribution.
void compute_beta_denominator ()
 Precompute denominators for Kneser's improved back-off distribution.
int compute_active_children (Iterator it, int *pruned_counts=NULL)
 Compute number of active children and sum of pruned child counts.
void compute_beta_interpolation_numerator ()
 Compute beta distribution and interpolation.
Iterator add (const std::vector< int > &vec, int value)
 Increment count.
void set_discount (unsigned int order, float value)
 Set the discounting parameter.
float get_discount (unsigned int order) const
 Get the discount parameters.
float get_beta_discount (unsigned int order) const
 Get the beta discount parameters.
float interpolation (const Iterator &it) const
float interpolation_abs (const Iterator &it) const
void set_count (const Iterator &it, u32 value)
 Set the count of the ngram at the iterator.
std::string debug_sum_nonzero_xg_str ()
void debug_write_counts (FILE *file)

Private Member Functions

float get_value (const std::vector< FloatVec > &arrays, const Iterator &it) const
int get_value (const std::vector< IntVec > &arrays, const Iterator &it) const
u32 get_value (const std::vector< Array > &arrays, const Iterator &it) const
void set_value (std::vector< FloatVec > &arrays, const Iterator &it, float value)
void set_value (std::vector< IntVec > &arrays, const Iterator &it, int value)
void set_value (std::vector< Array > &arrays, const Iterator &it, u32 value)
void add_value (std::vector< Array > &arrays, const Iterator &it, u32 value)
void add_value (std::vector< FloatVec > &arrays, const Iterator &it, float value)
void add_value (std::vector< IntVec > &arrays, const Iterator &it, int value)
void sub_value (std::vector< Array > &arrays, const Iterator &it, u32 value)

Private Attributes

int m_progress_skip
 Skip between progress reports.
Trie m_trie
 Trie containing the ngrams.
std::vector< int > m_num_ngrams
 Number of ngrams in the model (updated in pruning).
std::vector< IntVec > m_counts
 Real counts for each order.
std::vector< IntVec > m_sum_gx
 Sum of counts.
int m_sum_gx0
 Sum of counts for unigrams.
std::vector< IntVec > m_sum_nonzero_xg
 Modified counts for each order.
std::vector< IntVec > m_sum_nonzero_gx
 Modified normalization counts for each order.
int m_sum_nonzero_gx0
 Modified normalization counts for unigrams.
std::vector< IntVec > m_sum_nonzero_xgx
 Precomputed sum of modified counts for each order.
int m_sum_nonzero_xgx0
 Precomputed sum of modified counts for unigrams.
std::vector< float > m_discounts
 Discount parameters for each order (index 0 is for unigrams).
std::vector< float > m_beta_discounts
 Beta discount parameters for each order (index 0 is for unigrams).
SymbolMap m_symbol_map
 Symbol set for mapping between symbols and integers.
std::string m_sentence_start_str
 String of the sentence start symbol.
std::string m_sentence_end_str
 String of the sentence end symbol.
int m_sentence_start_id
 Index of the sentence start symbol.
int m_sentence_end_id
 Index of the sentence end symbol.
Configurable options
int m_d1_weight_model
 Probability distribution used for computing ngram weight in d1 computation (0 = kneser-ney, 1 = absolute-discounting, 2 = counts).
int m_d1_model
 Probability distribution used in d1 computation (0 = kneser-ney, 1 = absolute-discounting).
Variables and types used in pruning
std::vector< Array > m_pruned
std::vector< FloatVec > m_d1
std::vector< FloatVec > m_d2
std::vector< IntVec > m_d2_norm
std::vector< IntVec > m_sum_xg_not_pruned
std::vector< IntVec > m_sum_nonzero_xg_not_pruned
std::vector< FloatVec > m_beta_denominator
 Precomputed denominator for Kneser's improved back-off distribution.
float m_beta_denominator0
 Precomputed unigram-denominator for Kneser's improved back-off distribution.
std::vector< FloatVec > m_beta_interpolation_numerator
 Precomputed interpolation weights for Kneser's improved back-off distribution.
float m_beta_interpolation_numerator0
 Precomputed interpolation weight for zero-grams.

Classes

struct  D2Norm
struct  OrderIndex
struct  PruneCompare


Detailed Description

Pruning algorithm for building variable-order models with Kneser-Ney smoothing presented in: Reinhard Kneser.

Statistical Language Modeling Using a Variable Context Length. EUROSPEECH 1997.

Bug:
The test suite of the Kneser class is not very thorough.


Member Typedef Documentation

typedef std::vector<float> bit::Kneser::FloatVec
 

typedef std::vector<int> bit::Kneser::IntVec
 

typedef Trie::Iterator bit::Kneser::Iterator
 

typedef std::vector<int> bit::Kneser::Ngram
 

typedef SymbolMap<std::string, int> bit::Kneser::SymbolMap
 

typedef Trie<Array> bit::Kneser::Trie
 


Constructor & Destructor Documentation

bit::Kneser::Kneser  )  [inline]
 

Default constructor.


Member Function Documentation

Iterator bit::Kneser::add const std::vector< int > &  vec,
int  value
[inline]
 

Increment count.

Parameters:
vec = ngram whose count is incremented
value = the amount of increment
Returns:
iterator to the ngram
Exceptions:
bit::invalid_argument if ngram's parent was not in the model

void bit::Kneser::add_value std::vector< IntVec > &  arrays,
const Iterator it,
int  value
[inline, private]
 

void bit::Kneser::add_value std::vector< FloatVec > &  arrays,
const Iterator it,
float  value
[inline, private]
 

void bit::Kneser::add_value std::vector< Array > &  arrays,
const Iterator it,
u32  value
[inline, private]
 

int bit::Kneser::compute_active_children Iterator  it,
int *  pruned_counts = NULL
[inline]
 

Compute number of active children and sum of pruned child counts.

void bit::Kneser::compute_beta_denominator  )  [inline]
 

Precompute denominators for Kneser's improved back-off distribution.

void bit::Kneser::compute_beta_interpolation_numerator  )  [inline]
 

Compute beta distribution and interpolation.

Exceptions:
bit::out_of_range if a numerator was less than discount

void bit::Kneser::compute_beta_numerator_terms  )  [inline]
 

Precompute numerator terms for Kneser's improved back-off distribution.

Note that discounting weights are modified too.

void bit::Kneser::compute_d1  )  [inline]
 

Compute d1 measure for all ngrams (n > 1) using log10.

void bit::Kneser::compute_d2_full  )  [inline]
 

Compute d2 measure for each node when no nodes are yet pruned.

Method prune_threshold modifies d2 when pruning nodes.

void bit::Kneser::compute_d2_trick  )  [inline]
 

Compute d2 measure for all ngrams (n > 1).

For each ngram we can precompute what the d2 measure will be at the time when the node would be pruned. We can do this in a bottom-up fashion.

void bit::Kneser::compute_sums  )  [inline]
 

Compute Kneser-Ney modified counts.

The modified count stored at ngram (a,b,c) is the number of ngrams (x,a,b,c) which have positive count. If no such ngrams exist, the original count is used instead.

std::string bit::Kneser::debug_sum_nonzero_xg_str  )  [inline]
 

void bit::Kneser::debug_write_counts FILE *  file  )  [inline]
 

Iterator bit::Kneser::find const std::string &  str  )  const [inline]
 

Find an ngram.

Parameters:
str = white-space separated list of ngram symbols
Returns:
iterator to the ngram, or empty if not found

template<class T>
Iterator bit::Kneser::find const std::vector< T > &  vec  )  const [inline]
 

Find an ngram.

Parameters:
vec = ngram to find
Returns:
iterator to the ngram, or empty if not found

float bit::Kneser::get_beta_denominator const Iterator it  )  const [inline]
 

Return the beta denominator (1 for highest-order ngrams).

float bit::Kneser::get_beta_discount unsigned int  order  )  const [inline]
 

Get the beta discount parameters.

Parameters:
order = order for which the discount is fetched (0 is unigrams)
Returns:
the discount parameter
Exceptions:
bit::invalid_call if discount not set

float bit::Kneser::get_beta_interpolation_numerator const Iterator it  )  const [inline]
 

float bit::Kneser::get_beta_numerator const Iterator it  )  const [inline]
 

u32 bit::Kneser::get_count const Iterator it  )  const [inline]
 

Get the ngram count at the iterator.

It is safe to call this even for iterators for which the count has not been set. Zero is returned in that case.

Returns:
the count of the ngram

float bit::Kneser::get_d1 const Iterator it  )  const [inline]
 

Return the d1 measure of an ngram.

Exceptions:
bit::invalid_argument if called for unigrams (we never prune unigrams)

float bit::Kneser::get_d2 const Iterator it  )  const [inline]
 

Return the unnormalized d2 measure of an ngram.

Exceptions:
bit::invalid_argument if called for unigrams (we never prune unigrams)

float bit::Kneser::get_discount unsigned int  order  )  const [inline]
 

Get the discount parameters.

Parameters:
order = order for which the discount is fetched (0 is unigrams)
Returns:
the discount parameter
Exceptions:
bit::invalid_call if discount not set

u32 bit::Kneser::get_value const std::vector< Array > &  arrays,
const Iterator it
const [inline, private]
 

int bit::Kneser::get_value const std::vector< IntVec > &  arrays,
const Iterator it
const [inline, private]
 

float bit::Kneser::get_value const std::vector< FloatVec > &  arrays,
const Iterator it
const [inline, private]
 

float bit::Kneser::interpolation const Iterator it  )  const [inline]
 

float bit::Kneser::interpolation_abs const Iterator it  )  const [inline]
 

bool bit::Kneser::is_pruned const Iterator it  )  const [inline]
 

Check if iterator is pruned.

Ngram bit::Kneser::ngram const std::string &  str  )  const [inline]
 

Convert a string to vector of symbol indices.

float bit::Kneser::ngram_prob Iterator  it  )  const [inline]
 

Probability of ngram P(abc) = P(a) P(b|a) P(c|ab).

Parameters:
it = ngram iterator for which the probability is computed
Returns:
the probability of the ngram
Exceptions:
bit::invalid_argument if called at root

template<class T>
std::string bit::Kneser::ngram_str const std::vector< T > &  ngram  )  const [inline]
 

Printable string of a ngram.

Parameters:
ngram = ngram to convert
Returns:
string representing the ngram

int bit::Kneser::num_active_children Iterator  it  )  const [inline]
 

Compute the number of active (not pruned) children.

u64 bit::Kneser::num_active_ngrams  )  const [inline]
 

Return the number of active ngrams in the pruned model.

u64 bit::Kneser::num_ngrams  )  const [inline]
 

Return the number of ngrams in the unpruned model.

float bit::Kneser::prob_abs_full const Iterator it,
float *  lower_prob = NULL
const [inline]
 

Compute conditional probability P(w|h) using unmodified counts.

Parameters:
it = ngram iterator pointing to (h,w)
lower_prob = if non-null, the lower-order part (i.e. the whole backoff part) of the probability is stored here
Exceptions:
bit::invalid_argument if w is sentence start symbol

float bit::Kneser::prob_abs_lower Iterator  it  )  const [inline]
 

Compute conditional probability P(w|h) using unmodified counts and using only lower-order probabilities (with the interpolation weight of h).

Exceptions:
bit::invalid_argument if w is sentence start symbol

float bit::Kneser::prob_beta_full Ngram  ngram  )  const [inline]
 

Compute conditional probability P(w|h) for arbitrary ngram using the beta distribution (Warning: implementation may be quite slow).

Parameters:
ngram = std::vector of symbols
Exceptions:
bit::invalid_argument if w is sentence start symbol or ngram is empty

float bit::Kneser::prob_beta_full const Iterator it  )  const [inline]
 

Compute conditional probability P(w|h) for ngrams found explicitly in the model using the beta distribution.

Exceptions:
bit::invalid_argument if w is sentence start symbol

float bit::Kneser::prob_beta_lower Iterator  it  )  const [inline]
 

Compute conditional probability P(w|h) using only lower-order probabilities of the beta distribution (with the interpolation weight of h).

Exceptions:
bit::invalid_argument if w is sentence start symbol

float bit::Kneser::prob_full const Iterator it,
float *  lower_prob = NULL
const [inline]
 

Compute conditional probability P(w|h).

Parameters:
it = ngram iterator pointing to (h,w)
lower_prob = if non-null, the lower-order part (i.e. the whole backoff part) of the probability is stored here
Exceptions:
bit::invalid_argument if w is sentence start symbol

float bit::Kneser::prob_lower Iterator  it  )  const [inline]
 

Compute conditional probability P(w|h) using only lower-order probabilities (with the interpolation weight of h).

Exceptions:
bit::invalid_argument if w is sentence start symbol

void bit::Kneser::prune unsigned int  ngrams  )  [inline]
 

Prune ngrams (n > 1) according to the d2 measure.

The ngrams are not actually removed, but only marked pruned.

Parameters:
ngrams = number of ngrams to prune
Exceptions:
bit::invalid_argument if ngrams is greater than the number of ngrams with n > 1.

void bit::Kneser::prune_ngram Iterator  it  )  [inline]
 

Prune ngram, modify parents' d2 measure, and remove children.

Exceptions:
bit::invalid_argument if pruned already or ngram is shorter than 2-gram

void bit::Kneser::prune_threshold float  threshold  )  [inline]
 

Prune ngrams (n > 1) whose d2 is under threshold.

Children are processed before parents, and parents' d2 is modified if children are pruned.

void bit::Kneser::read_binary_counts FILE *  file  )  [inline]
 

Read trie structure, counts, modified counts, sum_nonzero_xg, sum_nonzero_xgx and sum_nonzero_gx from a file.

void bit::Kneser::read_binary_d1d2 FILE *  file  )  [inline]
 

Read d1 and d2.

void bit::Kneser::read_counts FILE *  file,
bool  integer_symbols = false
[inline]
 

Read counts from an ASCII file.

Each line contains a white space separated list of strings; the ngram symbols followed by the count.

Parameters:
file = file stream to read from
integer_symbols = the ngram symbols are integers
Exceptions:
bit::io_error if reading fails

void bit::Kneser::reserve_orders unsigned int  orders  )  [inline]
 

Reserve space for orders to avoid reallocating.

Parameters:
orders = number of orders used

Iterator bit::Kneser::root  )  const [inline]
 

Iterator at the root of the trie.

int bit::Kneser::sentence_end_id  )  const [inline]
 

Index of the sentence end symbol.

int bit::Kneser::sentence_start_id  )  const [inline]
 

Index of the sentence start symbol.

void bit::Kneser::set_count const Iterator it,
u32  value
[inline]
 

Set the count of the ngram at the iterator.

Returns:
the count of the ngram

void bit::Kneser::set_d1_model int  model  )  [inline]
 

Set model used in d1 computation (0 = kn, 1 = abs).

void bit::Kneser::set_d1_weight_model int  model  )  [inline]
 

Set model used as ngram weight in d1 computation (0 = kn, 1 = abs).

void bit::Kneser::set_discount unsigned int  order,
float  value
[inline]
 

Set the discounting parameter.

Parameters:
order = order for which the discount is set. Zero is the discount for unigrams.
value = value of the discount

void bit::Kneser::set_value std::vector< Array > &  arrays,
const Iterator it,
u32  value
[inline, private]
 

void bit::Kneser::set_value std::vector< IntVec > &  arrays,
const Iterator it,
int  value
[inline, private]
 

void bit::Kneser::set_value std::vector< FloatVec > &  arrays,
const Iterator it,
float  value
[inline, private]
 

void bit::Kneser::sub_value std::vector< Array > &  arrays,
const Iterator it,
u32  value
[inline, private]
 

u32 bit::Kneser::sum_gx const Iterator it  )  const [inline]
 

u32 bit::Kneser::sum_nonzero_gx const Iterator it  )  const [inline]
 

u32 bit::Kneser::sum_nonzero_xg const Iterator it  )  const [inline]
 

u32 bit::Kneser::sum_nonzero_xgx const Iterator it  )  const [inline]
 

const SymbolMap& bit::Kneser::symbol_map  )  const [inline]
 

Constant access to symbol map.

void bit::Kneser::write_arpa FILE *  file  )  const [inline]
 

Write model in ARPA format.

void bit::Kneser::write_beta_arpa FILE *  file  )  const [inline]
 

Write model in ARPA format.

void bit::Kneser::write_binary_counts FILE *  file  )  const [inline]
 

Write trie structure, counts, modified counts, sum_nonzero_xg, sum_nonzero_xgx and sum_nonzero_gx in a file.

void bit::Kneser::write_binary_d1d2 FILE *  file  )  const [inline]
 

Write d1 and d2.


Member Data Documentation

std::vector<FloatVec> bit::Kneser::m_beta_denominator [private]
 

Precomputed denominator for Kneser's improved back-off distribution.

float bit::Kneser::m_beta_denominator0 [private]
 

Precomputed unigram-denominator for Kneser's improved back-off distribution.

std::vector<float> bit::Kneser::m_beta_discounts [private]
 

Beta discount parameters for each order (index 0 is for unigrams).

In beta distribution the lower order discounts should be multiplied by the higher order discounts.

std::vector<FloatVec> bit::Kneser::m_beta_interpolation_numerator [private]
 

Precomputed interpolation weights for Kneser's improved back-off distribution.

float bit::Kneser::m_beta_interpolation_numerator0 [private]
 

Precomputed interpolation weight for zero-grams.

std::vector<IntVec> bit::Kneser::m_counts [private]
 

Real counts for each order.

std::vector<FloatVec> bit::Kneser::m_d1 [private]
 

int bit::Kneser::m_d1_model [private]
 

Probability distribution used in d1 computation (0 = kneser-ney, 1 = absolute-discounting).

int bit::Kneser::m_d1_weight_model [private]
 

Probability distribution used for computing ngram weight in d1 computation (0 = kneser-ney, 1 = absolute-discounting, 2 = counts).

std::vector<FloatVec> bit::Kneser::m_d2 [private]
 

std::vector<IntVec> bit::Kneser::m_d2_norm [private]
 

std::vector<float> bit::Kneser::m_discounts [private]
 

Discount parameters for each order (index 0 is for unigrams).

std::vector<int> bit::Kneser::m_num_ngrams [private]
 

Number of ngrams in the model (updated in pruning).

int bit::Kneser::m_progress_skip [private]
 

Skip between progress reports.

std::vector<Array> bit::Kneser::m_pruned [private]
 

int bit::Kneser::m_sentence_end_id [private]
 

Index of the sentence end symbol.

std::string bit::Kneser::m_sentence_end_str [private]
 

String of the sentence end symbol.

int bit::Kneser::m_sentence_start_id [private]
 

Index of the sentence start symbol.

std::string bit::Kneser::m_sentence_start_str [private]
 

String of the sentence start symbol.

std::vector<IntVec> bit::Kneser::m_sum_gx [private]
 

Sum of counts.

int bit::Kneser::m_sum_gx0 [private]
 

Sum of counts for unigrams.

std::vector<IntVec> bit::Kneser::m_sum_nonzero_gx [private]
 

Modified normalization counts for each order.

int bit::Kneser::m_sum_nonzero_gx0 [private]
 

Modified normalization counts for unigrams.

std::vector<IntVec> bit::Kneser::m_sum_nonzero_xg [private]
 

Modified counts for each order.

std::vector<IntVec> bit::Kneser::m_sum_nonzero_xg_not_pruned [private]
 

std::vector<IntVec> bit::Kneser::m_sum_nonzero_xgx [private]
 

Precomputed sum of modified counts for each order.

int bit::Kneser::m_sum_nonzero_xgx0 [private]
 

Precomputed sum of modified counts for unigrams.

std::vector<IntVec> bit::Kneser::m_sum_xg_not_pruned [private]
 

SymbolMap bit::Kneser::m_symbol_map [private]
 

Symbol set for mapping between symbols and integers.

Trie bit::Kneser::m_trie [private]
 

Trie containing the ngrams.


The documentation for this class was generated from the following file: Kneser.hh
Generated on Mon Jan 8 15:51:04 2007 for bit by  doxygen 1.4.6