![Uwe Rathmann](/assets/img/avatar_default.png)
The 3rdparty files are now compiled as part of the corresponding input method, so that the project files can be written without using platform specific linker flags.
97 lines
2.8 KiB
C++
97 lines
2.8 KiB
C++
/*
|
|
* Copyright (C) 2009 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef PINYINIME_INCLUDE_NGRAM_H__
|
|
#define PINYINIME_INCLUDE_NGRAM_H__
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "./dictdef.h"
|
|
|
|
namespace ime_pinyin {
|
|
|
|
typedef unsigned char CODEBOOK_TYPE;
|
|
|
|
static const size_t kCodeBookSize = 256;
|
|
|
|
class NGram {
|
|
public:
|
|
// The maximum score of a lemma item.
|
|
static const LmaScoreType kMaxScore = 0x3fff;
|
|
|
|
// In order to reduce the storage size, the original log value is amplified by
|
|
// kScoreAmplifier, and we use LmaScoreType to store.
|
|
// After this process, an item with a lower score has a higher frequency.
|
|
static const int kLogValueAmplifier = -800;
|
|
|
|
// System words' total frequency. It is not the real total frequency, instead,
|
|
// It is only used to adjust system lemmas' scores when the user dictionary's
|
|
// total frequency changes.
|
|
// In this version, frequencies of system lemmas are fixed. We are considering
|
|
// to make them changable in next version.
|
|
static const size_t kSysDictTotalFreq = 100000000;
|
|
|
|
private:
|
|
|
|
static NGram* instance_;
|
|
|
|
bool initialized_;
|
|
uint32 idx_num_;
|
|
|
|
size_t total_freq_none_sys_;
|
|
|
|
// Score compensation for system dictionary lemmas.
|
|
// Because after user adds some user lemmas, the total frequency changes, and
|
|
// we use this value to normalize the score.
|
|
float sys_score_compensation_;
|
|
|
|
#ifdef ___BUILD_MODEL___
|
|
double *freq_codes_df_;
|
|
#endif
|
|
LmaScoreType *freq_codes_;
|
|
CODEBOOK_TYPE *lma_freq_idx_;
|
|
|
|
public:
|
|
NGram();
|
|
~NGram();
|
|
|
|
static NGram& get_instance();
|
|
|
|
bool save_ngram(FILE *fp);
|
|
bool load_ngram(FILE *fp);
|
|
|
|
// Set the total frequency of all none system dictionaries.
|
|
void set_total_freq_none_sys(size_t freq_none_sys);
|
|
|
|
float get_uni_psb(LemmaIdType lma_id);
|
|
|
|
// Convert a probability to score. Actually, the score will be limited to
|
|
// kMaxScore, but at runtime, we also need float expression to get accurate
|
|
// value of the score.
|
|
// After the conversion, a lower score indicates a higher probability of the
|
|
// item.
|
|
static float convert_psb_to_score(double psb);
|
|
|
|
#ifdef ___BUILD_MODEL___
|
|
// For constructing the unigram mode model.
|
|
bool build_unigram(LemmaEntry *lemma_arr, size_t num,
|
|
LemmaIdType next_idx_unused);
|
|
#endif
|
|
};
|
|
}
|
|
|
|
#endif // PINYINIME_INCLUDE_NGRAM_H__
|