![Uwe Rathmann](/assets/img/avatar_default.png)
The 3rdparty files are now compiled as part of the corresponding input method, so that the project files can be written without using platform specific linker flags.
1982 lines
60 KiB
C++
1982 lines
60 KiB
C++
/*
|
|
* Copyright (C) 2009 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include "../include/lpicache.h"
|
|
#include "../include/matrixsearch.h"
|
|
#include "../include/mystdlib.h"
|
|
#include "../include/ngram.h"
|
|
#include "../include/userdict.h"
|
|
|
|
namespace ime_pinyin {
|
|
|
|
#define PRUMING_SCORE 8000.0
|
|
|
|
MatrixSearch::MatrixSearch() {
|
|
inited_ = false;
|
|
spl_trie_ = SpellingTrie::get_cpinstance();
|
|
|
|
reset_pointers_to_null();
|
|
|
|
pys_decoded_len_ = 0;
|
|
mtrx_nd_pool_used_ = 0;
|
|
dmi_pool_used_ = 0;
|
|
xi_an_enabled_ = false;
|
|
dmi_c_phrase_ = false;
|
|
|
|
assert(kMaxSearchSteps > 0);
|
|
max_sps_len_ = kMaxSearchSteps - 1;
|
|
max_hzs_len_ = kMaxSearchSteps;
|
|
}
|
|
|
|
MatrixSearch::~MatrixSearch() {
|
|
free_resource();
|
|
}
|
|
|
|
void MatrixSearch::reset_pointers_to_null() {
|
|
dict_trie_ = NULL;
|
|
user_dict_ = NULL;
|
|
spl_parser_ = NULL;
|
|
|
|
share_buf_ = NULL;
|
|
|
|
// The following four buffers are used for decoding, and they are based on
|
|
// share_buf_, no need to delete them.
|
|
mtrx_nd_pool_ = NULL;
|
|
dmi_pool_ = NULL;
|
|
matrix_ = NULL;
|
|
dep_ = NULL;
|
|
|
|
// Based on share_buf_, no need to delete them.
|
|
npre_items_ = NULL;
|
|
}
|
|
|
|
bool MatrixSearch::alloc_resource() {
|
|
free_resource();
|
|
|
|
dict_trie_ = new DictTrie();
|
|
user_dict_ = static_cast<AtomDictBase*>(new UserDict());
|
|
spl_parser_ = new SpellingParser();
|
|
|
|
size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
|
|
mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t);
|
|
size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
|
|
dmi_size = align_to_size_t(dmi_size) / sizeof(size_t);
|
|
size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum;
|
|
matrix_size = align_to_size_t(matrix_size) / sizeof(size_t);
|
|
size_t dep_size = sizeof(DictExtPara);
|
|
dep_size = align_to_size_t(dep_size) / sizeof(size_t);
|
|
|
|
// share_buf's size is determined by the buffers for search.
|
|
share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size];
|
|
|
|
if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
|
|
NULL == share_buf_)
|
|
return false;
|
|
|
|
// The buffers for search are based on the share buffer
|
|
mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
|
|
dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
|
|
matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
|
|
dep_ = reinterpret_cast<DictExtPara*>
|
|
(share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
|
|
|
|
// The prediction buffer is also based on the share buffer.
|
|
npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
|
|
npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
|
|
sizeof(size_t) / sizeof(NPredictItem);
|
|
return true;
|
|
}
|
|
|
|
void MatrixSearch::free_resource() {
|
|
if (NULL != dict_trie_)
|
|
delete dict_trie_;
|
|
|
|
if (NULL != user_dict_)
|
|
delete user_dict_;
|
|
|
|
if (NULL != spl_parser_)
|
|
delete spl_parser_;
|
|
|
|
if (NULL != share_buf_)
|
|
delete [] share_buf_;
|
|
|
|
reset_pointers_to_null();
|
|
}
|
|
|
|
bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
|
|
if (NULL == fn_sys_dict || NULL == fn_usr_dict)
|
|
return false;
|
|
|
|
if (!alloc_resource())
|
|
return false;
|
|
|
|
if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
|
|
return false;
|
|
|
|
// If engine fails to load the user dictionary, reset the user dictionary
|
|
// to NULL.
|
|
if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
|
|
delete user_dict_;
|
|
user_dict_ = NULL;
|
|
} else{
|
|
user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
|
|
}
|
|
|
|
reset_search0();
|
|
|
|
inited_ = true;
|
|
return true;
|
|
}
|
|
|
|
bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
|
|
const char *fn_usr_dict) {
|
|
if (NULL == fn_usr_dict)
|
|
return false;
|
|
|
|
if (!alloc_resource())
|
|
return false;
|
|
|
|
if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
|
|
return false;
|
|
|
|
if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
|
|
delete user_dict_;
|
|
user_dict_ = NULL;
|
|
} else {
|
|
user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
|
|
}
|
|
|
|
reset_search0();
|
|
|
|
inited_ = true;
|
|
return true;
|
|
}
|
|
|
|
void MatrixSearch::init_user_dictionary(const char *fn_usr_dict) {
|
|
assert(inited_);
|
|
|
|
if (NULL != user_dict_) {
|
|
delete user_dict_;
|
|
user_dict_ = NULL;
|
|
}
|
|
|
|
if (NULL != fn_usr_dict) {
|
|
user_dict_ = static_cast<AtomDictBase*>(new UserDict());
|
|
if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
|
|
delete user_dict_;
|
|
user_dict_ = NULL;
|
|
}
|
|
}
|
|
|
|
reset_search0();
|
|
}
|
|
|
|
bool MatrixSearch::is_user_dictionary_enabled() const {
|
|
return NULL != user_dict_;
|
|
}
|
|
|
|
void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
|
|
if (0 != max_sps_len)
|
|
max_sps_len_ = max_sps_len;
|
|
if (0 != max_hzs_len)
|
|
max_hzs_len_ = max_hzs_len;
|
|
}
|
|
|
|
void MatrixSearch::close() {
|
|
flush_cache();
|
|
free_resource();
|
|
inited_ = false;
|
|
}
|
|
|
|
void MatrixSearch::flush_cache() {
|
|
if (NULL != user_dict_)
|
|
user_dict_->flush_cache();
|
|
}
|
|
|
|
void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
|
|
xi_an_enabled_ = xi_an_enabled;
|
|
}
|
|
|
|
bool MatrixSearch::get_xi_an_switch() {
|
|
return xi_an_enabled_;
|
|
}
|
|
|
|
bool MatrixSearch::reset_search() {
|
|
if (!inited_)
|
|
return false;
|
|
return reset_search0();
|
|
}
|
|
|
|
bool MatrixSearch::reset_search0() {
|
|
if (!inited_)
|
|
return false;
|
|
|
|
pys_decoded_len_ = 0;
|
|
mtrx_nd_pool_used_ = 0;
|
|
dmi_pool_used_ = 0;
|
|
|
|
// Get a MatrixNode from the pool
|
|
matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
|
|
matrix_[0].mtrx_nd_num = 1;
|
|
mtrx_nd_pool_used_ += 1;
|
|
|
|
// Update the node, and make it to be a starting node
|
|
MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
|
|
node->id = 0;
|
|
node->score = 0;
|
|
node->from = NULL;
|
|
node->step = 0;
|
|
node->dmi_fr = (PoolPosType)-1;
|
|
|
|
matrix_[0].dmi_pos = 0;
|
|
matrix_[0].dmi_num = 0;
|
|
matrix_[0].dmi_has_full_id = 1;
|
|
matrix_[0].mtrx_nd_fixed = node;
|
|
|
|
lma_start_[0] = 0;
|
|
fixed_lmas_ = 0;
|
|
spl_start_[0] = 0;
|
|
fixed_hzs_ = 0;
|
|
|
|
dict_trie_->reset_milestones(0, 0);
|
|
if (NULL != user_dict_)
|
|
user_dict_->reset_milestones(0, 0);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step,
|
|
bool clear_dmi_this_step,
|
|
bool clear_mtrx_this_step) {
|
|
if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
|
|
return false;
|
|
|
|
if (0 == ch_pos) {
|
|
reset_search0();
|
|
} else {
|
|
// Prepare mile stones of this step to clear.
|
|
MileStoneHandle *dict_handles_to_clear = NULL;
|
|
if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
|
|
dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
|
|
}
|
|
|
|
// If there are more steps, and this step is not allowed to clear, find
|
|
// milestones of next step.
|
|
if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
|
|
dict_handles_to_clear = NULL;
|
|
if (matrix_[ch_pos + 1].dmi_num > 0) {
|
|
dict_handles_to_clear =
|
|
dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
|
|
}
|
|
}
|
|
|
|
if (NULL != dict_handles_to_clear) {
|
|
dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
|
|
if (NULL != user_dict_)
|
|
user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
|
|
}
|
|
|
|
pys_decoded_len_ = ch_pos;
|
|
|
|
if (clear_dmi_this_step) {
|
|
dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
|
|
+ matrix_[ch_pos - 1].dmi_num;
|
|
matrix_[ch_pos].dmi_num = 0;
|
|
} else {
|
|
dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
|
|
}
|
|
|
|
if (clear_mtrx_this_step) {
|
|
mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
|
|
+ matrix_[ch_pos - 1].mtrx_nd_num;
|
|
matrix_[ch_pos].mtrx_nd_num = 0;
|
|
} else {
|
|
mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
|
|
+ matrix_[ch_pos].mtrx_nd_num;
|
|
}
|
|
|
|
// Modify fixed_hzs_
|
|
if (fixed_hzs_ > 0 &&
|
|
((kLemmaIdComposing != lma_id_[0]) ||
|
|
(kLemmaIdComposing == lma_id_[0] &&
|
|
spl_start_[c_phrase_.length] <= ch_pos))) {
|
|
size_t fixed_ch_pos = ch_pos;
|
|
if (clear_fixed_this_step)
|
|
fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
|
|
while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
|
|
fixed_ch_pos--;
|
|
|
|
fixed_lmas_ = 0;
|
|
fixed_hzs_ = 0;
|
|
if (fixed_ch_pos > 0) {
|
|
while (spl_start_[fixed_hzs_] < fixed_ch_pos)
|
|
fixed_hzs_++;
|
|
assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
|
|
|
|
while (lma_start_[fixed_lmas_] < fixed_hzs_)
|
|
fixed_lmas_++;
|
|
assert(lma_start_[fixed_lmas_] == fixed_hzs_);
|
|
}
|
|
|
|
// Re-search the Pinyin string for the unlocked lemma
|
|
// which was previously fixed.
|
|
//
|
|
// Prepare mile stones of this step to clear.
|
|
MileStoneHandle *dict_handles_to_clear = NULL;
|
|
if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
|
|
matrix_[fixed_ch_pos].dmi_num > 0) {
|
|
dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
|
|
}
|
|
|
|
// If there are more steps, and this step is not allowed to clear, find
|
|
// milestones of next step.
|
|
if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
|
|
dict_handles_to_clear = NULL;
|
|
if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
|
|
dict_handles_to_clear =
|
|
dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
|
|
}
|
|
}
|
|
|
|
if (NULL != dict_handles_to_clear) {
|
|
dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
|
|
if (NULL != user_dict_)
|
|
user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
|
|
}
|
|
|
|
|
|
pys_decoded_len_ = fixed_ch_pos;
|
|
|
|
if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
|
|
dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
|
|
+ matrix_[fixed_ch_pos - 1].dmi_num;
|
|
matrix_[fixed_ch_pos].dmi_num = 0;
|
|
} else {
|
|
dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
|
|
matrix_[fixed_ch_pos].dmi_num;
|
|
}
|
|
|
|
if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
|
|
mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
|
|
+ matrix_[fixed_ch_pos - 1].mtrx_nd_num;
|
|
matrix_[fixed_ch_pos].mtrx_nd_num = 0;
|
|
} else {
|
|
mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
|
|
+ matrix_[fixed_ch_pos].mtrx_nd_num;
|
|
}
|
|
|
|
for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
|
|
add_char(pys_[re_pos]);
|
|
}
|
|
} else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
|
|
for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
|
|
uint16 splpos_begin = c_phrase_.sublma_start[subpos];
|
|
uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
|
|
for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
|
|
// If ch_pos is in this spelling
|
|
uint16 spl_start = c_phrase_.spl_start[splpos];
|
|
uint16 spl_end = c_phrase_.spl_start[splpos + 1];
|
|
if (ch_pos >= spl_start && ch_pos < spl_end) {
|
|
// Clear everything after this position
|
|
c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
|
|
c_phrase_.sublma_start[subpos + 1] = splpos;
|
|
c_phrase_.sublma_num = subpos + 1;
|
|
c_phrase_.length = splpos;
|
|
|
|
if (splpos == splpos_begin) {
|
|
c_phrase_.sublma_num = subpos;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extend the composing phrase.
|
|
reset_search0();
|
|
dmi_c_phrase_ = true;
|
|
uint16 c_py_pos = 0;
|
|
while (c_py_pos < spl_start_[c_phrase_.length]) {
|
|
bool b_ac_tmp = add_char(pys_[c_py_pos]);
|
|
assert(b_ac_tmp);
|
|
c_py_pos++;
|
|
}
|
|
dmi_c_phrase_ = false;
|
|
|
|
lma_id_num_ = 1;
|
|
fixed_lmas_ = 1;
|
|
fixed_lmas_no1_[0] = 0; // A composing string is always modified.
|
|
fixed_hzs_ = c_phrase_.length;
|
|
lma_start_[1] = fixed_hzs_;
|
|
lma_id_[0] = kLemmaIdComposing;
|
|
matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
|
|
matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void MatrixSearch::del_in_pys(size_t start, size_t len) {
|
|
while (start < kMaxRowNum - len && '\0' != pys_[start]) {
|
|
pys_[start] = pys_[start + len];
|
|
start++;
|
|
}
|
|
}
|
|
|
|
size_t MatrixSearch::search(const char *py, size_t py_len) {
|
|
if (!inited_ || NULL == py)
|
|
return 0;
|
|
|
|
// If the search Pinyin string is too long, it will be truncated.
|
|
if (py_len > kMaxRowNum - 1)
|
|
py_len = kMaxRowNum - 1;
|
|
|
|
// Compare the new string with the previous one. Find their prefix to
|
|
// increase search efficiency.
|
|
size_t ch_pos = 0;
|
|
for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
|
|
if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
|
|
break;
|
|
}
|
|
|
|
bool clear_fix = true;
|
|
if (ch_pos == pys_decoded_len_)
|
|
clear_fix = false;
|
|
|
|
reset_search(ch_pos, clear_fix, false, false);
|
|
|
|
memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
|
|
pys_[py_len] = '\0';
|
|
|
|
while ('\0' != pys_[ch_pos]) {
|
|
if (!add_char(py[ch_pos])) {
|
|
pys_decoded_len_ = ch_pos;
|
|
break;
|
|
}
|
|
ch_pos++;
|
|
}
|
|
|
|
// Get spelling ids and starting positions.
|
|
get_spl_start_id();
|
|
|
|
// If there are too many spellings, remove the last letter until the spelling
|
|
// number is acceptable.
|
|
while (spl_id_num_ > 9) {
|
|
py_len--;
|
|
reset_search(py_len, false, false, false);
|
|
pys_[py_len] = '\0';
|
|
get_spl_start_id();
|
|
}
|
|
|
|
prepare_candidates();
|
|
|
|
if (kPrintDebug0) {
|
|
printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
|
|
printf("--DMI Pool Used: %d\n", dmi_pool_used_);
|
|
|
|
if (kPrintDebug1) {
|
|
for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
|
|
debug_print_dmi(pos, 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
return ch_pos;
|
|
}
|
|
|
|
size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
|
|
bool clear_fixed_this_step) {
|
|
if (!inited_)
|
|
return 0;
|
|
|
|
size_t reset_pos = pos;
|
|
|
|
// Out of range for both Pinyin mode and Spelling id mode.
|
|
if (pys_decoded_len_ <= pos) {
|
|
del_in_pys(pos, 1);
|
|
|
|
reset_pos = pys_decoded_len_;
|
|
// Decode the string after the un-decoded position
|
|
while ('\0' != pys_[reset_pos]) {
|
|
if (!add_char(pys_[reset_pos])) {
|
|
pys_decoded_len_ = reset_pos;
|
|
break;
|
|
}
|
|
reset_pos++;
|
|
}
|
|
get_spl_start_id();
|
|
prepare_candidates();
|
|
return pys_decoded_len_;
|
|
}
|
|
|
|
// Spelling id mode, but out of range.
|
|
if (is_pos_in_splid && spl_id_num_ <= pos)
|
|
return pys_decoded_len_;
|
|
|
|
// Begin to handle two modes respectively.
|
|
// Pinyin mode by default
|
|
size_t c_py_len = 0; // The length of composing phrase's Pinyin
|
|
size_t del_py_len = 1;
|
|
if (!is_pos_in_splid) {
|
|
// Pinyin mode is only allowed to delete beyond the fixed lemmas.
|
|
if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
|
|
return pys_decoded_len_;
|
|
|
|
del_in_pys(pos, 1);
|
|
|
|
// If the deleted character is just the one after the last fixed lemma
|
|
if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
|
|
// If all fixed lemmas have been merged, and the caller of the function
|
|
// request to unlock the last fixed lemma.
|
|
if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
|
|
// Unlock the last sub lemma in the composing phrase. Because it is not
|
|
// easy to unlock it directly. Instead, we re-decode the modified
|
|
// composing phrase.
|
|
c_phrase_.sublma_num--;
|
|
c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
|
|
reset_pos = spl_start_[c_phrase_.length];
|
|
c_py_len = reset_pos;
|
|
}
|
|
}
|
|
} else {
|
|
del_py_len = spl_start_[pos + 1] - spl_start_[pos];
|
|
|
|
del_in_pys(spl_start_[pos], del_py_len);
|
|
|
|
if (pos >= lma_start_[fixed_lmas_]) {
|
|
c_py_len = 0;
|
|
reset_pos = spl_start_[pos + 1] - del_py_len;
|
|
} else {
|
|
c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
|
|
reset_pos = c_py_len;
|
|
if (c_py_len > 0)
|
|
merge_fixed_lmas(pos);
|
|
}
|
|
}
|
|
|
|
if (c_py_len > 0) {
|
|
assert(c_phrase_.length > 0 && c_py_len ==
|
|
c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
|
|
// The composing phrase is valid, reset all search space,
|
|
// and begin a new search which will only extend the composing
|
|
// phrase.
|
|
reset_search0();
|
|
|
|
dmi_c_phrase_ = true;
|
|
// Extend the composing phrase.
|
|
uint16 c_py_pos = 0;
|
|
while (c_py_pos < c_py_len) {
|
|
bool b_ac_tmp = add_char(pys_[c_py_pos]);
|
|
assert(b_ac_tmp);
|
|
c_py_pos++;
|
|
}
|
|
dmi_c_phrase_ = false;
|
|
|
|
// Fixd the composing phrase as the first choice.
|
|
lma_id_num_ = 1;
|
|
fixed_lmas_ = 1;
|
|
fixed_lmas_no1_[0] = 0; // A composing string is always modified.
|
|
fixed_hzs_ = c_phrase_.length;
|
|
lma_start_[1] = fixed_hzs_;
|
|
lma_id_[0] = kLemmaIdComposing;
|
|
matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
|
|
matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
|
|
} else {
|
|
// Reseting search only clear pys_decoded_len_, but the string is kept.
|
|
reset_search(reset_pos, clear_fixed_this_step, false, false);
|
|
}
|
|
|
|
// Decode the string after the delete position.
|
|
while ('\0' != pys_[reset_pos]) {
|
|
if (!add_char(pys_[reset_pos])) {
|
|
pys_decoded_len_ = reset_pos;
|
|
break;
|
|
}
|
|
reset_pos++;
|
|
}
|
|
|
|
get_spl_start_id();
|
|
prepare_candidates();
|
|
return pys_decoded_len_;
|
|
}
|
|
|
|
size_t MatrixSearch::get_candidate_num() {
|
|
if (!inited_ || 0 == pys_decoded_len_ ||
|
|
0 == matrix_[pys_decoded_len_].mtrx_nd_num)
|
|
return 0;
|
|
|
|
return 1 + lpi_total_;
|
|
}
|
|
|
|
char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str,
|
|
size_t max_len) {
|
|
if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
|
|
return NULL;
|
|
|
|
if (0 == cand_id) {
|
|
return get_candidate0(cand_str, max_len, NULL, false);
|
|
} else {
|
|
cand_id--;
|
|
}
|
|
|
|
// For this case: the current sentence is a word only, and the user fixed it,
|
|
// so the result will be fixed to the sentence space, and
|
|
// lpi_total_ will be set to 0.
|
|
if (0 == lpi_total_) {
|
|
return get_candidate0(cand_str, max_len, NULL, false);
|
|
}
|
|
|
|
LemmaIdType id = lpi_items_[cand_id].id;
|
|
char16 s[kMaxLemmaSize + 1];
|
|
|
|
uint16 s_len = lpi_items_[cand_id].lma_len;
|
|
if (s_len > 1) {
|
|
s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
|
|
} else {
|
|
// For a single character, Hanzi is ready.
|
|
s[0] = lpi_items_[cand_id].hanzi;
|
|
s[1] = static_cast<char16>(0);
|
|
}
|
|
|
|
if (s_len > 0 && max_len > s_len) {
|
|
utf16_strncpy(cand_str, s, s_len);
|
|
cand_str[s_len] = (char16)'\0';
|
|
return cand_str;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
void MatrixSearch::update_dict_freq() {
|
|
if (NULL != user_dict_) {
|
|
// Update the total frequency of all lemmas, including system lemmas and
|
|
// user dictionary lemmas.
|
|
size_t total_freq = user_dict_->get_total_lemma_count();
|
|
dict_trie_->set_total_lemma_count_of_others(total_freq);
|
|
}
|
|
}
|
|
|
|
bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
|
|
float score) {
|
|
if (lma_to - lma_fr <= 1 || NULL == user_dict_)
|
|
return false;
|
|
|
|
char16 word_str[kMaxLemmaSize + 1];
|
|
uint16 spl_ids[kMaxLemmaSize];
|
|
|
|
uint16 spl_id_fr = 0;
|
|
|
|
for (uint16 pos = lma_fr; pos < lma_to; pos++) {
|
|
LemmaIdType lma_id = lma_id_[pos];
|
|
if (is_user_lemma(lma_id)) {
|
|
user_dict_->update_lemma(lma_id, 1, true);
|
|
}
|
|
uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
|
|
utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
|
|
|
|
uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
|
|
kMaxLemmaSize + 1 - spl_id_fr);
|
|
assert(tmp == lma_len);
|
|
|
|
tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
|
|
if (tmp != lma_len) {
|
|
return false;
|
|
}
|
|
|
|
spl_id_fr += lma_len;
|
|
}
|
|
|
|
assert(spl_id_fr <= kMaxLemmaSize);
|
|
|
|
return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
|
|
spl_id_fr, 1);
|
|
}
|
|
|
|
void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
|
|
if (dmi_pos >= dmi_pool_used_) return;
|
|
|
|
DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
|
|
|
|
if (1 == nest_level) {
|
|
printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
|
|
}
|
|
if (dmi->dict_level > 1) {
|
|
debug_print_dmi(dmi->dmi_fr, nest_level + 1);
|
|
}
|
|
printf("---%d\n", dmi->dict_level);
|
|
printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
|
|
printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
|
|
get_spelling_str(dmi->spl_id), dmi->spl_id);
|
|
printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
|
|
if (1 == nest_level) {
|
|
printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
|
|
}
|
|
}
|
|
|
|
bool MatrixSearch::try_add_cand0_to_userdict() {
|
|
size_t new_cand_num = get_candidate_num();
|
|
if (fixed_hzs_ > 0 && 1 == new_cand_num) {
|
|
float score_from = 0;
|
|
uint16 lma_id_from = 0;
|
|
uint16 pos = 0;
|
|
bool modified = false;
|
|
while (pos < fixed_lmas_) {
|
|
if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
|
|
static_cast<uint16>(kMaxLemmaSize)) {
|
|
float score_to_add =
|
|
mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
|
|
.mtrx_nd_pos].score - score_from;
|
|
if (modified) {
|
|
score_to_add += 1.0;
|
|
if (score_to_add > NGram::kMaxScore) {
|
|
score_to_add = NGram::kMaxScore;
|
|
}
|
|
add_lma_to_userdict(lma_id_from, pos, score_to_add);
|
|
}
|
|
lma_id_from = pos;
|
|
score_from += score_to_add;
|
|
|
|
// Clear the flag for next user lemma.
|
|
modified = false;
|
|
}
|
|
|
|
if (0 == fixed_lmas_no1_[pos]) {
|
|
modified = true;
|
|
}
|
|
pos++;
|
|
}
|
|
|
|
// Single-char word is not allowed to add to userdict.
|
|
if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
|
|
float score_to_add =
|
|
mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
|
|
.mtrx_nd_pos].score - score_from;
|
|
if (modified) {
|
|
score_to_add += 1.0;
|
|
if (score_to_add > NGram::kMaxScore) {
|
|
score_to_add = NGram::kMaxScore;
|
|
}
|
|
add_lma_to_userdict(lma_id_from, pos, score_to_add);
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Choose a candidate, and give new candidates for next step.
|
|
// If user finishes selection, we will try to communicate with user dictionary
|
|
// to add new items or update score of some existing items.
|
|
//
|
|
// Basic rule:
|
|
// 1. If user selects the first choice:
|
|
// 1.1. If the first choice is not a sentence, instead, it is a lemma:
|
|
// 1.1.1. If the first choice is a user lemma, notify the user
|
|
// dictionary that a user lemma is hit, and add occuring count
|
|
// by 1.
|
|
// 1.1.2. If the first choice is a system lemma, do nothing.
|
|
// 1.2. If the first choice is a sentence containing more than one lemma:
|
|
// 1.2.1. The whole sentence will be added as a user lemma. If the
|
|
// sentence contains user lemmas, -> hit, and add occuring count
|
|
// by 1.
|
|
size_t MatrixSearch::choose(size_t cand_id) {
|
|
if (!inited_ || 0 == pys_decoded_len_)
|
|
return 0;
|
|
|
|
if (0 == cand_id) {
|
|
fixed_hzs_ = spl_id_num_;
|
|
matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
|
|
matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
|
|
for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) {
|
|
fixed_lmas_no1_[pos] = 1;
|
|
}
|
|
fixed_lmas_ = lma_id_num_;
|
|
lpi_total_ = 0; // Clean all other candidates.
|
|
|
|
// 1. It is the first choice
|
|
if (1 == lma_id_num_) {
|
|
// 1.1. The first choice is not a sentence but a lemma
|
|
if (is_user_lemma(lma_id_[0])) {
|
|
// 1.1.1. The first choice is a user lemma, notify the user dictionary
|
|
// that it is hit.
|
|
if (NULL != user_dict_)
|
|
user_dict_->update_lemma(lma_id_[0], 1, true);
|
|
} else {
|
|
// 1.1.2. do thing for a system lemma.
|
|
}
|
|
} else {
|
|
// 1.2. The first choice is a sentence.
|
|
// 1.2.1 Try to add the whole sentence to user dictionary, the whole
|
|
// sentence may be splitted into many items.
|
|
if (NULL != user_dict_) {
|
|
try_add_cand0_to_userdict();
|
|
}
|
|
}
|
|
update_dict_freq();
|
|
return 1;
|
|
} else {
|
|
cand_id--;
|
|
}
|
|
|
|
// 2. It is not the full sentence candidate.
|
|
// Find the length of the candidate.
|
|
LemmaIdType id_chosen = lpi_items_[cand_id].id;
|
|
LmaScoreType score_chosen = lpi_items_[cand_id].psb;
|
|
size_t cand_len = lpi_items_[cand_id].lma_len;
|
|
|
|
assert(cand_len > 0);
|
|
|
|
// Notify the atom dictionary that this item is hit.
|
|
if (is_user_lemma(id_chosen)) {
|
|
if (NULL != user_dict_) {
|
|
user_dict_->update_lemma(id_chosen, 1, true);
|
|
}
|
|
update_dict_freq();
|
|
}
|
|
|
|
// 3. Fixed the chosen item.
|
|
// 3.1 Get the steps number.
|
|
size_t step_fr = spl_start_[fixed_hzs_];
|
|
size_t step_to = spl_start_[fixed_hzs_ + cand_len];
|
|
|
|
// 3.2 Save the length of the original string.
|
|
size_t pys_decoded_len = pys_decoded_len_;
|
|
|
|
// 3.2 Reset the space of the fixed part.
|
|
reset_search(step_to, false, false, true);
|
|
|
|
// 3.3 For the last character of the fixed part, the previous DMI
|
|
// information will be kept, while the MTRX information will be re-extended,
|
|
// and only one node will be extended.
|
|
matrix_[step_to].mtrx_nd_num = 0;
|
|
|
|
LmaPsbItem lpi_item;
|
|
lpi_item.psb = score_chosen;
|
|
lpi_item.id = id_chosen;
|
|
|
|
PoolPosType step_to_dmi_fr = match_dmi(step_to,
|
|
spl_id_ + fixed_hzs_, cand_len);
|
|
//assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
|
|
|
|
extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
|
|
step_to_dmi_fr, step_to);
|
|
|
|
matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
|
|
mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
|
|
matrix_[step_to].mtrx_nd_num;
|
|
|
|
if (id_chosen == lma_id_[fixed_lmas_])
|
|
fixed_lmas_no1_[fixed_lmas_] = 1;
|
|
else
|
|
fixed_lmas_no1_[fixed_lmas_] = 0;
|
|
lma_id_[fixed_lmas_] = id_chosen;
|
|
lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
|
|
fixed_lmas_++;
|
|
fixed_hzs_ = fixed_hzs_ + cand_len;
|
|
|
|
while (step_to != pys_decoded_len) {
|
|
bool b = add_char(pys_[step_to]);
|
|
assert(b);
|
|
step_to++;
|
|
}
|
|
|
|
if (fixed_hzs_ < spl_id_num_) {
|
|
prepare_candidates();
|
|
} else {
|
|
lpi_total_ = 0;
|
|
if (NULL != user_dict_) {
|
|
try_add_cand0_to_userdict();
|
|
}
|
|
}
|
|
|
|
return get_candidate_num();
|
|
}
|
|
|
|
size_t MatrixSearch::cancel_last_choice() {
|
|
if (!inited_ || 0 == pys_decoded_len_)
|
|
return 0;
|
|
|
|
size_t step_start = 0;
|
|
if (fixed_hzs_ > 0) {
|
|
size_t step_end = spl_start_[fixed_hzs_];
|
|
MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
|
|
assert(NULL != end_node);
|
|
|
|
step_start = end_node->from->step;
|
|
|
|
if (step_start > 0) {
|
|
DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
|
|
fixed_hzs_ -= dmi->dict_level;
|
|
} else {
|
|
fixed_hzs_ = 0;
|
|
}
|
|
|
|
reset_search(step_start, false, false, false);
|
|
|
|
while (pys_[step_start] != '\0') {
|
|
bool b = add_char(pys_[step_start]);
|
|
assert(b);
|
|
step_start++;
|
|
}
|
|
|
|
prepare_candidates();
|
|
}
|
|
return get_candidate_num();
|
|
}
|
|
|
|
size_t MatrixSearch::get_fixedlen() {
|
|
if (!inited_ || 0 == pys_decoded_len_)
|
|
return 0;
|
|
return fixed_hzs_;
|
|
}
|
|
|
|
bool MatrixSearch::prepare_add_char(char ch) {
|
|
if (pys_decoded_len_ >= kMaxRowNum - 1 ||
|
|
(!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
|
|
return false;
|
|
|
|
if (dmi_pool_used_ >= kDmiPoolSize) return false;
|
|
|
|
pys_[pys_decoded_len_] = ch;
|
|
pys_decoded_len_++;
|
|
|
|
MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
|
|
mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
|
|
mtrx_this_row->mtrx_nd_num = 0;
|
|
mtrx_this_row->dmi_pos = dmi_pool_used_;
|
|
mtrx_this_row->dmi_num = 0;
|
|
mtrx_this_row->dmi_has_full_id = 0;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool MatrixSearch::is_split_at(uint16 pos) {
|
|
return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
|
|
}
|
|
|
|
void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
|
|
PoolPosType dmi_fr, uint16 spl_id,
|
|
uint16 node_num, unsigned char dict_level,
|
|
bool splid_end_split, unsigned char splstr_len,
|
|
unsigned char all_full_id) {
|
|
dmi->dict_handles[0] = handles[0];
|
|
dmi->dict_handles[1] = handles[1];
|
|
dmi->dmi_fr = dmi_fr;
|
|
dmi->spl_id = spl_id;
|
|
dmi->dict_level = dict_level;
|
|
dmi->splid_end_split = splid_end_split ? 1 : 0;
|
|
dmi->splstr_len = splstr_len;
|
|
dmi->all_full_id = all_full_id;
|
|
dmi->c_phrase = 0;
|
|
}
|
|
|
|
bool MatrixSearch::add_char(char ch) {
|
|
if (!prepare_add_char(ch))
|
|
return false;
|
|
return add_char_qwerty();
|
|
}
|
|
|
|
bool MatrixSearch::add_char_qwerty() {
|
|
matrix_[pys_decoded_len_].mtrx_nd_num = 0;
|
|
|
|
bool spl_matched = false;
|
|
uint16 longest_ext = 0;
|
|
// Extend the search matrix, from the oldest unfixed row. ext_len means
|
|
// extending length.
|
|
for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
|
|
if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
|
|
continue;
|
|
|
|
// Refer to the declaration of the variable dmi_has_full_id for the
|
|
// explanation of this piece of code. In one word, it is used to prevent
|
|
// from the unwise extending of "shoud ou" but allow the reasonable
|
|
// extending of "heng ao", "lang a", etc.
|
|
if (ext_len > 1 && 0 != longest_ext &&
|
|
0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
|
|
if (xi_an_enabled_)
|
|
continue;
|
|
else
|
|
break;
|
|
}
|
|
|
|
uint16 oldrow = pys_decoded_len_ - ext_len;
|
|
|
|
// 0. If that row is before the last fixed step, ignore.
|
|
if (spl_start_[fixed_hzs_] > oldrow)
|
|
continue;
|
|
|
|
// 1. Check if that old row has valid MatrixNode. If no, means that row is
|
|
// not a boundary, either a word boundary or a spelling boundary.
|
|
// If it is for extending composing phrase, it's OK to ignore the 0.
|
|
if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
|
|
continue;
|
|
|
|
// 2. Get spelling id(s) for the last ext_len chars.
|
|
uint16 spl_idx;
|
|
bool is_pre = false;
|
|
spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
|
|
ext_len, &is_pre);
|
|
if (is_pre)
|
|
spl_matched = true;
|
|
|
|
if (0 == spl_idx)
|
|
continue;
|
|
|
|
bool splid_end_split = is_split_at(oldrow + ext_len);
|
|
|
|
// 3. Extend the DMI nodes of that old row
|
|
// + 1 is to extend an extra node from the root
|
|
for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
|
|
dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
|
|
dmi_pos++) {
|
|
DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
|
|
if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
|
|
dmi = NULL; // The last one, NULL means extending from the root.
|
|
} else {
|
|
// If the dmi is covered by the fixed arrange, ignore it.
|
|
if (fixed_hzs_ > 0 &&
|
|
pys_decoded_len_ - ext_len - dmi->splstr_len <
|
|
spl_start_[fixed_hzs_]) {
|
|
continue;
|
|
}
|
|
// If it is not in mode for composing phrase, and the source DMI node
|
|
// is marked for composing phrase, ignore this node.
|
|
if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// For example, if "gao" is extended, "g ao" is not allowed.
|
|
// or "zh" has been passed, "z h" is not allowed.
|
|
// Both word and word-connection will be prevented.
|
|
if (longest_ext > ext_len) {
|
|
if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
|
|
continue;
|
|
}
|
|
|
|
// "z h" is not allowed.
|
|
if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
dep_->splids_extended = 0;
|
|
if (NULL != dmi) {
|
|
uint16 prev_ids_num = dmi->dict_level;
|
|
if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
|
|
(dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) {
|
|
continue;
|
|
}
|
|
|
|
DictMatchInfo *d = dmi;
|
|
while (d) {
|
|
dep_->splids[--prev_ids_num] = d->spl_id;
|
|
if ((PoolPosType)-1 == d->dmi_fr)
|
|
break;
|
|
d = dmi_pool_ + d->dmi_fr;
|
|
}
|
|
assert(0 == prev_ids_num);
|
|
dep_->splids_extended = dmi->dict_level;
|
|
}
|
|
dep_->splids[dep_->splids_extended] = spl_idx;
|
|
dep_->ext_len = ext_len;
|
|
dep_->splid_end_split = splid_end_split;
|
|
|
|
dep_->id_num = 1;
|
|
dep_->id_start = spl_idx;
|
|
if (spl_trie_->is_half_id(spl_idx)) {
|
|
// Get the full id list
|
|
dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
|
|
assert(dep_->id_num > 0);
|
|
}
|
|
|
|
uint16 new_dmi_num;
|
|
|
|
new_dmi_num = extend_dmi(dep_, dmi);
|
|
|
|
if (new_dmi_num > 0) {
|
|
if (dmi_c_phrase_) {
|
|
dmi_pool_[dmi_pool_used_].c_phrase = 1;
|
|
}
|
|
matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
|
|
dmi_pool_used_ += new_dmi_num;
|
|
|
|
if (!spl_trie_->is_half_id(spl_idx))
|
|
matrix_[pys_decoded_len_].dmi_has_full_id = 1;
|
|
}
|
|
|
|
// If get candiate lemmas, try to extend the path
|
|
if (lpi_total_ > 0) {
|
|
uint16 fr_row;
|
|
if (NULL == dmi) {
|
|
fr_row = oldrow;
|
|
} else {
|
|
assert(oldrow >= dmi->splstr_len);
|
|
fr_row = oldrow - dmi->splstr_len;
|
|
}
|
|
for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
|
|
mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
|
|
matrix_[fr_row].mtrx_nd_num;
|
|
mtrx_nd_pos++) {
|
|
MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
|
|
|
|
extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
|
|
dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
|
|
if (longest_ext == 0)
|
|
longest_ext = ext_len;
|
|
}
|
|
}
|
|
} // for dmi_pos
|
|
} // for ext_len
|
|
mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
|
|
|
|
if (dmi_c_phrase_)
|
|
return true;
|
|
|
|
return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
|
|
}
|
|
|
|
void MatrixSearch::prepare_candidates() {
|
|
// Get candiates from the first un-fixed step.
|
|
uint16 lma_size_max = kMaxLemmaSize;
|
|
if (lma_size_max > spl_id_num_ - fixed_hzs_)
|
|
lma_size_max = spl_id_num_ - fixed_hzs_;
|
|
|
|
uint16 lma_size = lma_size_max;
|
|
|
|
// If the full sentense candidate's unfixed part may be the same with a normal
|
|
// lemma. Remove the lemma candidate in this case.
|
|
char16 fullsent[kMaxLemmaSize + 1];
|
|
char16 *pfullsent = NULL;
|
|
uint16 sent_len;
|
|
pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
|
|
|
|
// If the unfixed part contains more than one ids, it is not necessary to
|
|
// check whether a lemma's string is the same to the unfixed part of the full
|
|
// sentence candidate, so, set it to NULL;
|
|
if (sent_len > kMaxLemmaSize)
|
|
pfullsent = NULL;
|
|
|
|
lpi_total_ = 0;
|
|
size_t lpi_num_full_match = 0; // Number of items which are fully-matched.
|
|
while (lma_size > 0) {
|
|
size_t lma_num;
|
|
lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
|
|
lpi_items_ + lpi_total_,
|
|
size_t(kMaxLmaPsbItems - lpi_total_),
|
|
pfullsent, lma_size == lma_size_max);
|
|
|
|
if (lma_num > 0) {
|
|
lpi_total_ += lma_num;
|
|
// For next lemma candidates which are not the longest, it is not
|
|
// necessary to compare with the full sentence candiate.
|
|
pfullsent = NULL;
|
|
}
|
|
if (lma_size == lma_size_max) {
|
|
lpi_num_full_match = lpi_total_;
|
|
}
|
|
lma_size--;
|
|
}
|
|
|
|
// Sort those partially-matched items by their unified scores.
|
|
myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
|
|
sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
|
|
|
|
if (kPrintDebug0) {
|
|
printf("-----Prepare candidates, score:\n");
|
|
for (size_t a = 0; a < lpi_total_; a++) {
|
|
printf("[%03d]%d ", a, lpi_items_[a].psb);
|
|
if ((a + 1) % 6 == 0) printf("\n");
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
if (kPrintDebug0) {
|
|
printf("--- lpi_total_ = %d\n", lpi_total_);
|
|
}
|
|
}
|
|
|
|
const char* MatrixSearch::get_pystr(size_t *decoded_len) {
|
|
if (!inited_ || NULL == decoded_len)
|
|
return NULL;
|
|
|
|
*decoded_len = pys_decoded_len_;
|
|
return pys_;
|
|
}
|
|
|
|
void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) {
|
|
if (fixed_lmas_ == 0)
|
|
return;
|
|
// Update spelling segmentation information first.
|
|
spl_id_num_ -= 1;
|
|
uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
|
|
for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) {
|
|
spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
|
|
if (pos == spl_id_num_)
|
|
break;
|
|
spl_id_[pos] = spl_id_[pos + 1];
|
|
}
|
|
|
|
// Begin to merge.
|
|
uint16 phrase_len = 0;
|
|
|
|
// Update the spelling ids to the composing phrase.
|
|
// We need to convert these ids into full id in the future.
|
|
memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
|
|
memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
|
|
|
|
// If composing phrase has not been created, first merge all fixed
|
|
// lemmas into a composing phrase without deletion.
|
|
if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
|
|
uint16 bp = 1; // Begin position of real fixed lemmas.
|
|
// There is no existing composing phrase.
|
|
if (kLemmaIdComposing != lma_id_[0]) {
|
|
c_phrase_.sublma_num = 0;
|
|
bp = 0;
|
|
}
|
|
|
|
uint16 sub_num = c_phrase_.sublma_num;
|
|
for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
|
|
c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
|
|
if (lma_start_[pos] > del_spl_pos) {
|
|
c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
|
|
}
|
|
|
|
if (pos == fixed_lmas_)
|
|
break;
|
|
|
|
uint16 lma_len;
|
|
char16 *lma_str = c_phrase_.chn_str +
|
|
c_phrase_.sublma_start[sub_num] + phrase_len;
|
|
|
|
lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
|
|
assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
|
|
phrase_len += lma_len;
|
|
}
|
|
assert(phrase_len == lma_start_[fixed_lmas_]);
|
|
c_phrase_.length = phrase_len; // will be deleted by 1
|
|
c_phrase_.sublma_num += fixed_lmas_ - bp;
|
|
} else {
|
|
for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
|
|
if (c_phrase_.sublma_start[pos] > del_spl_pos) {
|
|
c_phrase_.sublma_start[pos] -= 1;
|
|
}
|
|
}
|
|
phrase_len = c_phrase_.length;
|
|
}
|
|
|
|
assert(phrase_len > 0);
|
|
if (1 == phrase_len) {
|
|
// After the only one is deleted, nothing will be left.
|
|
fixed_lmas_ = 0;
|
|
return;
|
|
}
|
|
|
|
// Delete the Chinese character in the merged phrase.
|
|
// The corresponding elements in spl_ids and spl_start of the
|
|
// phrase have been deleted.
|
|
char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
|
|
for (uint16 pos = 0;
|
|
pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
|
|
pos++) {
|
|
chn_str[pos] = chn_str[pos + 1];
|
|
}
|
|
c_phrase_.length -= 1;
|
|
|
|
// If the deleted spelling id is in a sub lemma which contains more than
|
|
// one id, del_a_sub will be false; but if the deleted id is in a sub lemma
|
|
// which only contains 1 id, the whole sub lemma needs to be deleted, so
|
|
// del_a_sub will be true.
|
|
bool del_a_sub = false;
|
|
for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
|
|
if (c_phrase_.sublma_start[pos - 1] ==
|
|
c_phrase_.sublma_start[pos]) {
|
|
del_a_sub = true;
|
|
}
|
|
if (del_a_sub) {
|
|
c_phrase_.sublma_start[pos - 1] =
|
|
c_phrase_.sublma_start[pos];
|
|
}
|
|
}
|
|
if (del_a_sub)
|
|
c_phrase_.sublma_num -= 1;
|
|
|
|
return;
|
|
}
|
|
|
|
void MatrixSearch::get_spl_start_id() {
|
|
lma_id_num_ = 0;
|
|
lma_start_[0] = 0;
|
|
|
|
spl_id_num_ = 0;
|
|
spl_start_[0] = 0;
|
|
if (!inited_ || 0 == pys_decoded_len_ ||
|
|
0 == matrix_[pys_decoded_len_].mtrx_nd_num)
|
|
return;
|
|
|
|
// Calculate number of lemmas and spellings
|
|
// Only scan those part which is not fixed.
|
|
lma_id_num_ = fixed_lmas_;
|
|
spl_id_num_ = fixed_hzs_;
|
|
|
|
MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
|
|
while (mtrx_nd != mtrx_nd_pool_) {
|
|
if (fixed_hzs_ > 0) {
|
|
if (mtrx_nd->step <= spl_start_[fixed_hzs_])
|
|
break;
|
|
}
|
|
|
|
// Update the spelling segamentation information
|
|
unsigned char word_splstr_len = 0;
|
|
PoolPosType dmi_fr = mtrx_nd->dmi_fr;
|
|
if ((PoolPosType)-1 != dmi_fr)
|
|
word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
|
|
|
|
while ((PoolPosType)-1 != dmi_fr) {
|
|
spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
|
|
(word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
|
|
spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
|
|
spl_id_num_++;
|
|
dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
|
|
}
|
|
|
|
// Update the lemma segmentation information
|
|
lma_start_[lma_id_num_ + 1] = spl_id_num_;
|
|
lma_id_[lma_id_num_] = mtrx_nd->id;
|
|
lma_id_num_++;
|
|
|
|
mtrx_nd = mtrx_nd->from;
|
|
}
|
|
|
|
// Reverse the result of spelling info
|
|
for (size_t pos = fixed_hzs_;
|
|
pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
|
|
if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
|
|
spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
|
|
spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
|
|
spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
|
|
|
|
spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
|
|
spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
|
|
spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
|
|
}
|
|
}
|
|
|
|
// Reverse the result of lemma info
|
|
for (size_t pos = fixed_lmas_;
|
|
pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
|
|
assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
|
|
|
|
if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
|
|
lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
|
|
lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
|
|
lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
|
|
|
|
lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
|
|
lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
|
|
lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
|
|
}
|
|
}
|
|
|
|
for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
|
|
if (pos < lma_id_num_)
|
|
lma_start_[pos] = lma_start_[pos - 1] +
|
|
(lma_start_[pos] - lma_start_[pos + 1]);
|
|
else
|
|
lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
|
|
lma_start_[fixed_lmas_];
|
|
}
|
|
|
|
// Find the last fixed position
|
|
fixed_hzs_ = 0;
|
|
for (size_t pos = spl_id_num_; pos > 0; pos--) {
|
|
if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
|
|
fixed_hzs_ = pos;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) {
|
|
get_spl_start_id();
|
|
spl_start = spl_start_;
|
|
return spl_id_num_;
|
|
}
|
|
|
|
size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
|
|
if (dmi_pool_used_ >= kDmiPoolSize) return 0;
|
|
|
|
if (dmi_c_phrase_)
|
|
return extend_dmi_c(dep, dmi_s);
|
|
|
|
LpiCache& lpi_cache = LpiCache::get_instance();
|
|
uint16 splid = dep->splids[dep->splids_extended];
|
|
|
|
bool cached = false;
|
|
if (0 == dep->splids_extended)
|
|
cached = lpi_cache.is_cached(splid);
|
|
|
|
// 1. If this is a half Id, get its corresponding full starting Id and
|
|
// number of full Id.
|
|
size_t ret_val = 0;
|
|
PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node
|
|
|
|
lpi_total_ = 0;
|
|
|
|
MileStoneHandle from_h[3];
|
|
from_h[0] = 0;
|
|
from_h[1] = 0;
|
|
|
|
if (0 != dep->splids_extended) {
|
|
from_h[0] = dmi_s->dict_handles[0];
|
|
from_h[1] = dmi_s->dict_handles[1];
|
|
}
|
|
|
|
// 2. Begin exgtending in the system dictionary
|
|
size_t lpi_num = 0;
|
|
MileStoneHandle handles[2];
|
|
handles[0] = handles[1] = 0;
|
|
if (from_h[0] > 0 || NULL == dmi_s) {
|
|
handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
|
|
kMaxLmaPsbItems, &lpi_num);
|
|
}
|
|
if (handles[0] > 0)
|
|
lpi_total_ = lpi_num;
|
|
|
|
if (NULL == dmi_s) { // from root
|
|
assert(0 != handles[0]);
|
|
mtrx_dmi_fr = dmi_pool_used_;
|
|
}
|
|
|
|
// 3. Begin extending in the user dictionary
|
|
if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
|
|
handles[1] = user_dict_->extend_dict(from_h[1], dep,
|
|
lpi_items_ + lpi_total_,
|
|
kMaxLmaPsbItems - lpi_total_,
|
|
&lpi_num);
|
|
if (handles[1] > 0) {
|
|
if (kPrintDebug0) {
|
|
for (size_t t = 0; t < lpi_num; t++) {
|
|
printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
|
|
lpi_items_[lpi_total_ + t].psb);
|
|
}
|
|
}
|
|
lpi_total_ += lpi_num;
|
|
}
|
|
}
|
|
|
|
if (0 != handles[0] || 0 != handles[1]) {
|
|
if (dmi_pool_used_ >= kDmiPoolSize) return 0;
|
|
|
|
DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
|
|
if (NULL == dmi_s) {
|
|
fill_dmi(dmi_add, handles,
|
|
(PoolPosType)-1, splid,
|
|
1, 1, dep->splid_end_split, dep->ext_len,
|
|
spl_trie_->is_half_id(splid) ? 0 : 1);
|
|
} else {
|
|
fill_dmi(dmi_add, handles,
|
|
dmi_s - dmi_pool_, splid, 1,
|
|
dmi_s->dict_level + 1, dep->splid_end_split,
|
|
dmi_s->splstr_len + dep->ext_len,
|
|
spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
|
|
}
|
|
|
|
ret_val = 1;
|
|
}
|
|
|
|
if (!cached) {
|
|
if (0 == lpi_total_)
|
|
return ret_val;
|
|
|
|
if (kPrintDebug0) {
|
|
printf("--- lpi_total_ = %d\n", lpi_total_);
|
|
}
|
|
|
|
myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
|
|
if (NULL == dmi_s && spl_trie_->is_half_id(splid))
|
|
lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
|
|
} else {
|
|
assert(spl_trie_->is_half_id(splid));
|
|
lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
|
|
}
|
|
|
|
return ret_val;
|
|
}
|
|
|
|
size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
|
|
lpi_total_ = 0;
|
|
|
|
uint16 pos = dep->splids_extended;
|
|
assert(dmi_c_phrase_);
|
|
if (pos >= c_phrase_.length)
|
|
return 0;
|
|
|
|
uint16 splid = dep->splids[pos];
|
|
if (splid == c_phrase_.spl_ids[pos]) {
|
|
DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
|
|
MileStoneHandle handles[2]; // Actually never used.
|
|
if (NULL == dmi_s)
|
|
fill_dmi(dmi_add, handles,
|
|
(PoolPosType)-1, splid,
|
|
1, 1, dep->splid_end_split, dep->ext_len,
|
|
spl_trie_->is_half_id(splid) ? 0 : 1);
|
|
else
|
|
fill_dmi(dmi_add, handles,
|
|
dmi_s - dmi_pool_, splid, 1,
|
|
dmi_s->dict_level + 1, dep->splid_end_split,
|
|
dmi_s->splstr_len + dep->ext_len,
|
|
spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
|
|
|
|
if (pos == c_phrase_.length - 1) {
|
|
lpi_items_[0].id = kLemmaIdComposing;
|
|
lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score.
|
|
lpi_total_ = 1;
|
|
}
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
|
|
size_t lpi_num, PoolPosType dmi_fr,
|
|
size_t res_row) {
|
|
assert(NULL != mtrx_nd);
|
|
matrix_[res_row].mtrx_nd_fixed = NULL;
|
|
|
|
if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
|
|
return 0;
|
|
|
|
if (0 == mtrx_nd->step) {
|
|
// Because the list is sorted, if the source step is 0, it is only
|
|
// necessary to pick up the first kMaxNodeARow items.
|
|
if (lpi_num > kMaxNodeARow)
|
|
lpi_num = kMaxNodeARow;
|
|
}
|
|
|
|
MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
|
|
for (size_t pos = 0; pos < lpi_num; pos++) {
|
|
float score = mtrx_nd->score + lpi_items[pos].psb;
|
|
if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
|
|
break;
|
|
|
|
// Try to add a new node
|
|
size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
|
|
MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
|
|
bool replace = false;
|
|
// Find its position
|
|
while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
|
|
if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
|
|
*mtrx_nd_res = *(mtrx_nd_res - 1);
|
|
mtrx_nd_res--;
|
|
replace = true;
|
|
}
|
|
if (replace || (mtrx_nd_num < kMaxNodeARow &&
|
|
matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
|
|
mtrx_nd_res->id = lpi_items[pos].id;
|
|
mtrx_nd_res->score = score;
|
|
mtrx_nd_res->from = mtrx_nd;
|
|
mtrx_nd_res->dmi_fr = dmi_fr;
|
|
mtrx_nd_res->step = res_row;
|
|
if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
|
|
matrix_[res_row].mtrx_nd_num++;
|
|
}
|
|
}
|
|
return matrix_[res_row].mtrx_nd_num;
|
|
}
|
|
|
|
PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[],
|
|
uint16 spl_id_num) {
|
|
if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
|
|
return static_cast<PoolPosType>(-1);
|
|
}
|
|
|
|
for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
|
|
DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
|
|
|
|
if (dmi->dict_level != spl_id_num)
|
|
continue;
|
|
|
|
bool matched = true;
|
|
for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
|
|
if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
|
|
matched = false;
|
|
break;
|
|
}
|
|
|
|
dmi = dmi_pool_ + dmi->dmi_fr;
|
|
}
|
|
if (matched) {
|
|
return matrix_[step_to].dmi_pos + dmi_pos;
|
|
}
|
|
}
|
|
|
|
return static_cast<PoolPosType>(-1);
|
|
}
|
|
|
|
char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len,
|
|
uint16 *retstr_len,
|
|
bool only_unfixed) {
|
|
if (pys_decoded_len_ == 0 ||
|
|
matrix_[pys_decoded_len_].mtrx_nd_num == 0)
|
|
return NULL;
|
|
|
|
LemmaIdType idxs[kMaxRowNum];
|
|
size_t id_num = 0;
|
|
|
|
MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
|
|
|
|
if (kPrintDebug0) {
|
|
printf("--- sentence score: %f\n", mtrx_nd->score);
|
|
}
|
|
|
|
if (kPrintDebug1) {
|
|
printf("==============Sentence DMI (reverse order) begin===========>>\n");
|
|
}
|
|
|
|
while (mtrx_nd != NULL) {
|
|
idxs[id_num] = mtrx_nd->id;
|
|
id_num++;
|
|
|
|
if (kPrintDebug1) {
|
|
printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
|
|
mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
|
|
debug_print_dmi(mtrx_nd->dmi_fr, 1);
|
|
}
|
|
|
|
mtrx_nd = mtrx_nd->from;
|
|
}
|
|
|
|
if (kPrintDebug1) {
|
|
printf("<<==============Sentence DMI (reverse order) end=============\n");
|
|
}
|
|
|
|
size_t ret_pos = 0;
|
|
do {
|
|
id_num--;
|
|
if (0 == idxs[id_num])
|
|
continue;
|
|
|
|
char16 str[kMaxLemmaSize + 1];
|
|
uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
|
|
if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
|
|
(only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
|
|
if (!only_unfixed)
|
|
utf16_strncpy(cand_str + ret_pos, str, str_len);
|
|
else if (ret_pos >= fixed_hzs_)
|
|
utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
|
|
|
|
ret_pos += str_len;
|
|
} else {
|
|
return NULL;
|
|
}
|
|
} while (id_num != 0);
|
|
|
|
if (!only_unfixed) {
|
|
if (NULL != retstr_len)
|
|
*retstr_len = ret_pos;
|
|
cand_str[ret_pos] = (char16)'\0';
|
|
} else {
|
|
if (NULL != retstr_len)
|
|
*retstr_len = ret_pos - fixed_hzs_;
|
|
cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
|
|
}
|
|
return cand_str;
|
|
}
|
|
|
|
size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len,
|
|
LmaPsbItem* lma_buf, size_t max_lma_buf,
|
|
const char16 *pfullsent, bool sort_by_psb) {
|
|
if (splid_str_len > kMaxLemmaSize)
|
|
return 0;
|
|
|
|
size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
|
|
lma_buf, max_lma_buf);
|
|
size_t num2 = 0;
|
|
if (NULL != user_dict_) {
|
|
num2 = user_dict_->get_lpis(splid_str, splid_str_len,
|
|
lma_buf + num1, max_lma_buf - num1);
|
|
}
|
|
|
|
size_t num = num1 + num2;
|
|
|
|
if (0 == num)
|
|
return 0;
|
|
|
|
// Remove repeated items.
|
|
if (splid_str_len > 1) {
|
|
LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
|
|
size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
|
|
sizeof(LmaPsbStrItem);
|
|
//assert(lpsi_num > num);
|
|
if (num > lpsi_num) num = lpsi_num;
|
|
lpsi_num = num;
|
|
|
|
for (size_t pos = 0; pos < lpsi_num; pos++) {
|
|
lpsis[pos].lpi = lma_buf[pos];
|
|
get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
|
|
}
|
|
|
|
myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
|
|
|
|
size_t remain_num = 0;
|
|
for (size_t pos = 0; pos < lpsi_num; pos++) {
|
|
if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
|
|
if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
|
|
assert(remain_num > 0);
|
|
lma_buf[remain_num - 1] = lpsis[pos].lpi;
|
|
}
|
|
continue;
|
|
}
|
|
if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
|
|
continue;
|
|
|
|
lma_buf[remain_num] = lpsis[pos].lpi;
|
|
remain_num++;
|
|
}
|
|
|
|
// Update the result number
|
|
num = remain_num;
|
|
} else {
|
|
// For single character, some characters have more than one spelling, for
|
|
// example, "de" and "di" are all valid for a Chinese character, so when
|
|
// the user input "d", repeated items are generated.
|
|
// For single character lemmas, Hanzis will be gotten
|
|
for (size_t pos = 0; pos < num; pos++) {
|
|
char16 hanzis[2];
|
|
get_lemma_str(lma_buf[pos].id, hanzis, 2);
|
|
lma_buf[pos].hanzi = hanzis[0];
|
|
}
|
|
|
|
myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
|
|
|
|
size_t remain_num = 0;
|
|
for (size_t pos = 0; pos < num; pos++) {
|
|
if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
|
|
if (NULL != pfullsent &&
|
|
static_cast<char16>(0) == pfullsent[1] &&
|
|
lma_buf[pos].hanzi == pfullsent[0])
|
|
continue;
|
|
|
|
if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
|
|
assert(remain_num > 0);
|
|
assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
|
|
lma_buf[remain_num - 1] = lma_buf[pos];
|
|
}
|
|
continue;
|
|
}
|
|
if (NULL != pfullsent &&
|
|
static_cast<char16>(0) == pfullsent[1] &&
|
|
lma_buf[pos].hanzi == pfullsent[0])
|
|
continue;
|
|
|
|
lma_buf[remain_num] = lma_buf[pos];
|
|
remain_num++;
|
|
}
|
|
|
|
num = remain_num;
|
|
}
|
|
|
|
if (sort_by_psb) {
|
|
myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
|
|
}
|
|
return num;
|
|
}
|
|
|
|
uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
|
|
uint16 str_max) {
|
|
uint16 str_len = 0;
|
|
|
|
if (is_system_lemma(id_lemma)) {
|
|
str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
|
|
} else if (is_user_lemma(id_lemma)) {
|
|
if (NULL != user_dict_) {
|
|
str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
|
|
} else {
|
|
str_len = 0;
|
|
str_buf[0] = static_cast<char16>('\0');
|
|
}
|
|
} else if (is_composing_lemma(id_lemma)) {
|
|
if (str_max <= 1)
|
|
return 0;
|
|
str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
|
|
if (str_len > str_max - 1)
|
|
str_len = str_max - 1;
|
|
utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
|
|
str_buf[str_len] = (char16)'\0';
|
|
return str_len;
|
|
}
|
|
|
|
return str_len;
|
|
}
|
|
|
|
uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
|
|
uint16 splids_max, bool arg_valid) {
|
|
uint16 splid_num = 0;
|
|
|
|
if (arg_valid) {
|
|
for (splid_num = 0; splid_num < splids_max; splid_num++) {
|
|
if (spl_trie_->is_half_id(splids[splid_num]))
|
|
break;
|
|
}
|
|
if (splid_num == splids_max)
|
|
return splid_num;
|
|
}
|
|
|
|
if (is_system_lemma(id_lemma)) {
|
|
splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
|
|
arg_valid);
|
|
} else if (is_user_lemma(id_lemma)) {
|
|
if (NULL != user_dict_) {
|
|
splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
|
|
arg_valid);
|
|
} else {
|
|
splid_num = 0;
|
|
}
|
|
} else if (is_composing_lemma(id_lemma)) {
|
|
if (c_phrase_.length > splids_max) {
|
|
return 0;
|
|
}
|
|
for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
|
|
splids[pos] = c_phrase_.spl_ids[pos];
|
|
if (spl_trie_->is_half_id(splids[pos])) {
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
return splid_num;
|
|
}
|
|
|
|
size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
|
|
char16 predict_buf[][kMaxPredictSize + 1],
|
|
size_t buf_len) {
|
|
size_t res_total = 0;
|
|
memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
|
|
// In order to shorten the comments, j-character candidates predicted by
|
|
// i-character prefix are called P(i,j). All candiates predicted by
|
|
// i-character prefix are called P(i,*)
|
|
// Step 1. Get P(kMaxPredictSize, *) and sort them, here
|
|
// P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
|
|
for (size_t len = fixed_len; len >0; len--) {
|
|
// How many blank items are available
|
|
size_t this_max = npre_items_len_ - res_total;
|
|
size_t res_this;
|
|
// If the history is longer than 1, and we can not get prediction from
|
|
// lemmas longer than 2, in this case, we will add lemmas with
|
|
// highest scores as the prediction result.
|
|
if (fixed_len > 1 && 1 == len && 0 == res_total) {
|
|
// Try to find if recent n (n>1) characters can be a valid lemma in system
|
|
// dictionary.
|
|
bool nearest_n_word = false;
|
|
for (size_t nlen = 2; nlen <= fixed_len; nlen++) {
|
|
if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
|
|
nearest_n_word = true;
|
|
break;
|
|
}
|
|
}
|
|
res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
|
|
npre_items_ + res_total,
|
|
this_max, res_total);
|
|
res_total += res_this;
|
|
}
|
|
|
|
// How many blank items are available
|
|
this_max = npre_items_len_ - res_total;
|
|
res_this = 0;
|
|
if (!kOnlyUserDictPredict) {
|
|
res_this =
|
|
dict_trie_->predict(fixed_buf + fixed_len - len, len,
|
|
npre_items_ + res_total, this_max,
|
|
res_total);
|
|
}
|
|
|
|
if (NULL != user_dict_) {
|
|
res_this = res_this +
|
|
user_dict_->predict(fixed_buf + fixed_len - len, len,
|
|
npre_items_ + res_total + res_this,
|
|
this_max - res_this, res_total + res_this);
|
|
}
|
|
|
|
if (kPredictLimitGt1) {
|
|
myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
|
|
cmp_npre_by_score);
|
|
|
|
if (len > 3) {
|
|
if (res_this > kMaxPredictNumByGt3)
|
|
res_this = kMaxPredictNumByGt3;
|
|
} else if (3 == len) {
|
|
if (res_this > kMaxPredictNumBy3)
|
|
res_this = kMaxPredictNumBy3;
|
|
} else if (2 == len) {
|
|
if (res_this > kMaxPredictNumBy2)
|
|
res_this = kMaxPredictNumBy2;
|
|
}
|
|
}
|
|
|
|
res_total += res_this;
|
|
}
|
|
|
|
res_total = remove_duplicate_npre(npre_items_, res_total);
|
|
|
|
if (kPreferLongHistoryPredict) {
|
|
myqsort(npre_items_, res_total, sizeof(NPredictItem),
|
|
cmp_npre_by_hislen_score);
|
|
} else {
|
|
myqsort(npre_items_, res_total, sizeof(NPredictItem),
|
|
cmp_npre_by_score);
|
|
}
|
|
|
|
if (buf_len < res_total) {
|
|
res_total = buf_len;
|
|
}
|
|
|
|
if (kPrintDebug2) {
|
|
printf("/////////////////Predicted Items Begin////////////////////>>\n");
|
|
for (size_t i = 0; i < res_total; i++) {
|
|
printf("---");
|
|
for (size_t j = 0; j < kMaxPredictSize; j++) {
|
|
printf("%d ", npre_items_[i].pre_hzs[j]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
printf("<<///////////////Predicted Items End////////////////////////\n");
|
|
}
|
|
|
|
for (size_t i = 0; i < res_total; i++) {
|
|
utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
|
|
kMaxPredictSize);
|
|
predict_buf[i][kMaxPredictSize] = '\0';
|
|
}
|
|
|
|
return res_total;
|
|
}
|
|
|
|
size_t MatrixSearch::get_predicts(const char16 fixed_buf[],
|
|
char16 predict_buf[][kMaxPredictSize + 1],
|
|
size_t buf_len) {
|
|
size_t fixed_len = utf16_strlen(fixed_buf);
|
|
if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
|
|
return 0;
|
|
|
|
return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
|
|
}
|
|
|
|
} // namespace ime_pinyin
|