![Uwe Rathmann](/assets/img/avatar_default.png)
The 3rdparty files are now compiled as part of the corresponding input method, so that the project files can be written without using platform specific linker flags.
447 lines
12 KiB
C++
447 lines
12 KiB
C++
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include <assert.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "../include/dictlist.h"
|
|
#include "../include/mystdlib.h"
|
|
#include "../include/ngram.h"
|
|
#include "../include/searchutility.h"
|
|
|
|
namespace ime_pinyin {
|
|
|
|
// Builds an empty, uninitialized list: no buffers are owned yet, the spelling
// trie singleton is cached, and the per-length Hanzi comparators are
// installed so that cmp_func_[n - 1] compares the first n characters.
DictList::DictList() {
  typedef int (*HanziCmp)(const void *, const void *);

  // One comparator per supported lemma length (1..8 characters).
  static const HanziCmp kComparators[] = {
    cmp_hanzis_1, cmp_hanzis_2, cmp_hanzis_3, cmp_hanzis_4,
    cmp_hanzis_5, cmp_hanzis_6, cmp_hanzis_7, cmp_hanzis_8
  };
  const size_t kComparatorNum = sizeof(kComparators) / sizeof(kComparators[0]);

  initialized_ = false;
  scis_num_ = 0;
  scis_hz_ = NULL;
  scis_splid_ = NULL;
  buf_ = NULL;
  spl_trie_ = SpellingTrie::get_cpinstance();

  // The comparator table above is written for exactly eight lengths.
  assert(kMaxLemmaSize == 8);
  for (size_t idx = 0; idx < kComparatorNum; idx++)
    cmp_func_[idx] = kComparators[idx];
}
|
|
|
|
// Releases all heap buffers held by the list (see free_resource()).
DictList::~DictList() {
  free_resource();
}
|
|
|
|
// Allocates the three buffers backing the list.
//   buf_size - number of char16 elements for the lemma buffer buf_.
//   scis_num - number of single-character items; stored in scis_num_ and used
//              to size scis_hz_ and scis_splid_.
// Returns false as soon as one allocation fails. Buffers allocated before the
// failure are kept in their members; they are released by free_resource().
bool DictList::alloc_resource(size_t buf_size, size_t scis_num) {
  void *lemma_mem = malloc(buf_size * sizeof(char16));
  buf_ = static_cast<char16*>(lemma_mem);
  if (NULL == buf_)
    return false;

  scis_num_ = scis_num;

  void *hz_mem = malloc(scis_num_ * sizeof(char16));
  scis_hz_ = static_cast<char16*>(hz_mem);
  if (NULL == scis_hz_)
    return false;

  void *splid_mem = malloc(scis_num_ * sizeof(SpellingId));
  scis_splid_ = static_cast<SpellingId*>(splid_mem);

  return NULL != scis_splid_;
}
|
|
|
|
// Frees buf_, scis_hz_ and scis_splid_ and resets each pointer to NULL, so
// the function can be called repeatedly. free() accepts NULL, hence no
// explicit checks are required.
void DictList::free_resource() {
  free(buf_);
  buf_ = NULL;

  free(scis_hz_);
  scis_hz_ = NULL;

  free(scis_splid_);
  scis_splid_ = NULL;
}
|
|
|
|
#ifdef ___BUILD_MODEL___
|
|
bool DictList::init_list(const SingleCharItem *scis, size_t scis_num,
|
|
const LemmaEntry *lemma_arr, size_t lemma_num) {
|
|
if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num)
|
|
return false;
|
|
|
|
initialized_ = false;
|
|
|
|
if (NULL != buf_)
|
|
free(buf_);
|
|
|
|
// calculate the size
|
|
size_t buf_size = calculate_size(lemma_arr, lemma_num);
|
|
if (0 == buf_size)
|
|
return false;
|
|
|
|
if (!alloc_resource(buf_size, scis_num))
|
|
return false;
|
|
|
|
fill_scis(scis, scis_num);
|
|
|
|
// Copy the related content from the array to inner buffer
|
|
fill_list(lemma_arr, lemma_num);
|
|
|
|
initialized_ = true;
|
|
return true;
|
|
}
|
|
|
|
size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) {
|
|
size_t last_hz_len = 0;
|
|
size_t list_size = 0;
|
|
size_t id_num = 0;
|
|
|
|
for (size_t i = 0; i < lemma_num; i++) {
|
|
if (0 == i) {
|
|
last_hz_len = lemma_arr[i].hz_str_len;
|
|
|
|
assert(last_hz_len > 0);
|
|
assert(lemma_arr[0].idx_by_hz == 1);
|
|
|
|
id_num++;
|
|
start_pos_[0] = 0;
|
|
start_id_[0] = id_num;
|
|
|
|
last_hz_len = 1;
|
|
list_size += last_hz_len;
|
|
} else {
|
|
size_t current_hz_len = lemma_arr[i].hz_str_len;
|
|
|
|
assert(current_hz_len >= last_hz_len);
|
|
|
|
if (current_hz_len == last_hz_len) {
|
|
list_size += current_hz_len;
|
|
id_num++;
|
|
} else {
|
|
for (size_t len = last_hz_len; len < current_hz_len - 1; len++) {
|
|
start_pos_[len] = start_pos_[len - 1];
|
|
start_id_[len] = start_id_[len - 1];
|
|
}
|
|
|
|
start_pos_[current_hz_len - 1] = list_size;
|
|
|
|
id_num++;
|
|
start_id_[current_hz_len - 1] = id_num;
|
|
|
|
last_hz_len = current_hz_len;
|
|
list_size += current_hz_len;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) {
|
|
if (0 == i) {
|
|
start_pos_[0] = 0;
|
|
start_id_[0] = 1;
|
|
} else {
|
|
start_pos_[i] = list_size;
|
|
start_id_[i] = id_num;
|
|
}
|
|
}
|
|
|
|
return start_pos_[kMaxLemmaSize];
|
|
}
|
|
|
|
void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) {
|
|
assert(scis_num_ == scis_num);
|
|
|
|
for (size_t pos = 0; pos < scis_num_; pos++) {
|
|
scis_hz_[pos] = scis[pos].hz;
|
|
scis_splid_[pos] = scis[pos].splid;
|
|
}
|
|
}
|
|
|
|
void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) {
|
|
size_t current_pos = 0;
|
|
|
|
utf16_strncpy(buf_, lemma_arr[0].hanzi_str,
|
|
lemma_arr[0].hz_str_len);
|
|
|
|
current_pos = lemma_arr[0].hz_str_len;
|
|
|
|
size_t id_num = 1;
|
|
|
|
for (size_t i = 1; i < lemma_num; i++) {
|
|
utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str,
|
|
lemma_arr[i].hz_str_len);
|
|
|
|
id_num++;
|
|
current_pos += lemma_arr[i].hz_str_len;
|
|
}
|
|
|
|
assert(current_pos == start_pos_[kMaxLemmaSize]);
|
|
assert(id_num == start_id_[kMaxLemmaSize]);
|
|
}
|
|
|
|
// Locates, inside the two-character section of buf_, the first word whose
// leading character equals hz_char. Returns NULL when no such word exists.
char16* DictList::find_pos2_startedbyhz(char16 hz_char) {
  char16 *const section = buf_ + start_pos_[1];

  void *hit = mybsearch(&hz_char, section,
                        (start_pos_[2] - start_pos_[1]) / 2,
                        sizeof(char16) * 2, cmp_hanzis_1);
  char16 *word = static_cast<char16*>(hit);
  if (NULL == word)
    return NULL;

  // The binary search may land on any match; step back towards the start of
  // the section, one two-character word at a time.
  for (; word > section && *word == *(word - 1); word -= 2) {
  }

  return word;
}
|
|
#endif // ___BUILD_MODEL___
|
|
|
|
// Searches the section of buf_ holding words of word_len characters for the
// first word that cmp_func reports as equal to last_hzs.
//   last_hzs - key characters handed to cmp_func.
//   word_len - length of the words in the section to search (must be > 0).
//   cmp_func - comparator used both for the binary search and for stepping
//              back to the first match.
// Returns a pointer into buf_, or NULL when nothing matches.
char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[],
    size_t word_len, int (*cmp_func)(const void *, const void *)) {
  char16 *const section = buf_ + start_pos_[word_len - 1];

  void *hit = mybsearch(last_hzs, section,
                        (start_pos_[word_len] - start_pos_[word_len - 1])
                        / word_len,
                        sizeof(char16) * word_len, cmp_func);
  char16 *word = static_cast<char16*>(hit);
  if (NULL == word)
    return NULL;

  // Rewind to the first of a run of equal words.
  for (; word > section && cmp_func(word, word - word_len) == 0;
       word -= word_len) {
  }

  return word;
}
|
|
|
|
size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len,
|
|
NPredictItem *npre_items, size_t npre_max,
|
|
size_t b4_used) {
|
|
assert(hzs_len <= kMaxPredictSize && hzs_len > 0);
|
|
|
|
// 1. Prepare work
|
|
int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1];
|
|
|
|
NGram& ngram = NGram::get_instance();
|
|
|
|
size_t item_num = 0;
|
|
|
|
// 2. Do prediction
|
|
for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len;
|
|
pre_len++) {
|
|
uint16 word_len = hzs_len + pre_len;
|
|
char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func);
|
|
if (NULL == w_buf)
|
|
continue;
|
|
while (w_buf < buf_ + start_pos_[word_len] &&
|
|
cmp_func(w_buf, last_hzs) == 0 &&
|
|
item_num < npre_max) {
|
|
memset(npre_items + item_num, 0, sizeof(NPredictItem));
|
|
utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len);
|
|
npre_items[item_num].psb =
|
|
ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1])
|
|
/ word_len + start_id_[word_len - 1]);
|
|
npre_items[item_num].his_len = hzs_len;
|
|
item_num++;
|
|
w_buf += word_len;
|
|
}
|
|
}
|
|
|
|
size_t new_num = 0;
|
|
for (size_t i = 0; i < item_num; i++) {
|
|
// Try to find it in the existing items
|
|
size_t e_pos;
|
|
for (e_pos = 1; e_pos <= b4_used; e_pos++) {
|
|
if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs,
|
|
kMaxPredictSize) == 0)
|
|
break;
|
|
}
|
|
if (e_pos <= b4_used)
|
|
continue;
|
|
|
|
// If not found, append it to the buffer
|
|
npre_items[new_num] = npre_items[i];
|
|
new_num++;
|
|
}
|
|
|
|
return new_num;
|
|
}
|
|
|
|
// Copies the Hanzi string of lemma id_lemma into str_buf and appends a
// terminating zero.
//   str_max - capacity of str_buf in char16, terminator included.
// Returns the number of characters copied (terminator excluded), or 0 when
// the list is not initialized, the id is out of range, str_buf is NULL, or
// the buffer is too small.
uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
                               uint16 str_max) {
  if (!initialized_)
    return 0;
  if (id_lemma >= start_id_[kMaxLemmaSize])
    return 0;
  if (NULL == str_buf || str_max <= 1)
    return 0;

  // Find the length section whose id range contains id_lemma.
  for (uint16 i = 0; i < kMaxLemmaSize; i++) {
    const uint16 lemma_len = i + 1;

    // Lemmas of this length (and any longer one) no longer fit.
    if (lemma_len > str_max - 1)
      return 0;

    if (id_lemma < start_id_[i] || id_lemma >= start_id_[i + 1])
      continue;

    const size_t id_span = id_lemma - start_id_[i];
    const char16 *src = buf_ + start_pos_[i] + id_span * lemma_len;

    char16 *dst = str_buf;
    for (uint16 left = lemma_len; left > 0; left--)
      *dst++ = *src++;
    *dst = (char16)'\0';

    return lemma_len;
  }

  return 0;
}
|
|
|
|
// Collects the full spelling ids recorded for the character hanzi.
//   half_splid - 0 accepts every spelling. Otherwise, spellings whose stored
//                half id equals half_splid are returned; if there is no such
//                spelling, those accepted by
//                SpellingTrie::half_full_compatible() are returned instead.
//   splids     - output array with room for max_splids entries.
// Returns the number of ids written. Returns 0 when the character is not in
// the single-character table. Never writes beyond max_splids entries.
uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid,
                                      uint16 *splids, uint16 max_splids) {
  char16 *hz_found = static_cast<char16*>
      (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1));
  assert(NULL != hz_found && hanzi == *hz_found);
  // The assert disappears in release builds; do not dereference NULL there.
  if (NULL == hz_found)
    return 0;

  // Move to the first one.
  while (hz_found > scis_hz_ && hanzi == *(hz_found - 1))
    hz_found--;

  const char16 *const hz_end = scis_hz_ + scis_num_;

  // First find out whether any entry matches strictly.
  bool strict = false;
  for (const char16 *hz_f = hz_found; hz_f < hz_end && hanzi == *hz_f;
       hz_f++) {
    // size_t index: a 16-bit index would wrap for tables above 65535 items.
    const size_t pos = static_cast<size_t>(hz_f - scis_hz_);
    if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) {
      strict = true;
      break;
    }
  }

  uint16 found_num = 0;
  for (; hz_found < hz_end && hanzi == *hz_found; hz_found++) {
    const size_t pos = static_cast<size_t>(hz_found - scis_hz_);
    if (0 == half_splid ||
        (strict && scis_splid_[pos].half_splid == half_splid) ||
        (!strict && spl_trie_->half_full_compatible(half_splid,
        scis_splid_[pos].full_splid))) {
      assert(found_num + 1 < max_splids);
      // Hard limit for builds without asserts: stop instead of overflowing
      // the caller's buffer.
      if (found_num >= max_splids)
        break;
      splids[found_num] = scis_splid_[pos].full_splid;
      found_num++;
    }
  }

  return found_num;
}
|
|
|
|
// Looks up the lemma id of the str_len-character string str.
// Returns 0 when str is NULL, str_len is 0 or greater than kMaxLemmaSize, or
// the string is not in the list.
LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) {
  // A zero length must be rejected here: it would index cmp_func_[-1] and
  // divide by zero below.
  if (NULL == str || 0 == str_len || str_len > kMaxLemmaSize)
    return 0;

  char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]);
  if (NULL == found)
    return 0;

  // The very first lemma is stored at buf_ itself (start_pos_[0] is 0), so
  // equality is a valid result.
  assert(found >= buf_);
  assert(static_cast<size_t>(found - buf_) >= start_pos_[str_len - 1]);

  // Id = id of the section's first lemma + index of the word in the section.
  return static_cast<LemmaIdType>
      (start_id_[str_len - 1] +
       (found - buf_ - start_pos_[str_len - 1]) / str_len);
}
|
|
|
|
// Replaces, in place, each of the str_len elements of str by scis_hz_[value],
// i.e. the element is used as an index into the single-character table.
// No range check is performed on the index.
void DictList::convert_to_hanzis(char16 *str, uint16 str_len) {
  assert(NULL != str);

  uint16 idx = 0;
  while (idx < str_len) {
    const char16 sci_index = str[idx];
    str[idx] = scis_hz_[sci_index];
    idx++;
  }
}
|
|
|
|
// Overwrites each of the str_len elements of str with the constant 0x100.
// NOTE(review): no per-character lookup is done here; this looks like a
// placeholder implementation - confirm against callers.
void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) {
  assert(NULL != str);

  uint16 idx = 0;
  while (idx < str_len) {
    str[idx] = 0x100;
    idx++;
  }
}
|
|
|
|
bool DictList::save_list(FILE *fp) {
|
|
if (!initialized_ || NULL == fp)
|
|
return false;
|
|
|
|
if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] ||
|
|
NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_)
|
|
return false;
|
|
|
|
if (fwrite(&scis_num_, sizeof(uint32), 1, fp) != 1)
|
|
return false;
|
|
|
|
if (fwrite(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
|
|
kMaxLemmaSize + 1)
|
|
return false;
|
|
|
|
if (fwrite(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
|
|
kMaxLemmaSize + 1)
|
|
return false;
|
|
|
|
if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_)
|
|
return false;
|
|
|
|
if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_)
|
|
return false;
|
|
|
|
if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) !=
|
|
start_pos_[kMaxLemmaSize])
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool DictList::load_list(FILE *fp) {
|
|
if (NULL == fp)
|
|
return false;
|
|
|
|
initialized_ = false;
|
|
|
|
if (fread(&scis_num_, sizeof(uint32), 1, fp) != 1)
|
|
return false;
|
|
|
|
if (fread(start_pos_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
|
|
kMaxLemmaSize + 1)
|
|
return false;
|
|
|
|
if (fread(start_id_, sizeof(uint32), kMaxLemmaSize + 1, fp) !=
|
|
kMaxLemmaSize + 1)
|
|
return false;
|
|
|
|
free_resource();
|
|
|
|
if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_))
|
|
return false;
|
|
|
|
if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_)
|
|
return false;
|
|
|
|
if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_)
|
|
return false;
|
|
|
|
if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) !=
|
|
start_pos_[kMaxLemmaSize])
|
|
return false;
|
|
|
|
initialized_ = true;
|
|
return true;
|
|
}
|
|
} // namespace ime_pinyin
|