![Uwe Rathmann](/assets/img/avatar_default.png)
The 3rdparty files are now compiled as part of the corresponding input method, so that the project files can be written without using platform specific linker flags.
2291 lines
58 KiB
C++
2291 lines
58 KiB
C++
/*
|
|
* Copyright (C) 2009 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "../include/userdict.h"
|
|
#include "../include/splparser.h"
|
|
#include "../include/ngram.h"
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#ifdef ___DEBUG_PERF___
|
|
#include <cutils/log.h>
|
|
#endif
|
|
#ifdef _WIN32
|
|
#include <io.h>
|
|
#else
|
|
#include <unistd.h>
|
|
#endif
|
|
#include <fcntl.h>
|
|
#include <sys/stat.h>
|
|
#include <assert.h>
|
|
#include <ctype.h>
|
|
#include <sys/types.h>
|
|
#ifndef _WIN32
|
|
#include <sys/time.h>
|
|
#endif
|
|
#include <time.h>
|
|
#ifdef _WIN32
|
|
#undef max
|
|
#undef min
|
|
#include <QDateTime>
|
|
#include <QMutex>
|
|
#else
|
|
#include <pthread.h>
|
|
#endif
|
|
#include <math.h>
|
|
|
|
namespace ime_pinyin {
|
|
|
|
#ifdef _WIN32
|
|
static int gettimeofday(struct timeval *tp, void *) {
|
|
const qint64 current_msecs_since_epoch = QDateTime::currentMSecsSinceEpoch();
|
|
tp->tv_sec = (long)(current_msecs_since_epoch / 1000);
|
|
tp->tv_usec = (long)((current_msecs_since_epoch % 1000) * 1000);
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
#ifdef ___DEBUG_PERF___
|
|
static uint64 _ellapse_ = 0;
|
|
static struct timeval _tv_start_, _tv_end_;
|
|
#define DEBUG_PERF_BEGIN \
|
|
do { \
|
|
gettimeofday(&_tv_start_, NULL); \
|
|
} while (0)
|
|
#define DEBUG_PERF_END \
|
|
do { \
|
|
gettimeofday(&_tv_end_, NULL); \
|
|
_ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
|
|
(_tv_end_.tv_usec - _tv_start_.tv_usec); \
|
|
} while (0)
|
|
#define LOGD_PERF(message) \
|
|
ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
|
|
#else
|
|
#define DEBUG_PERF_BEGIN
|
|
#define DEBUG_PERF_END
|
|
#define LOGD_PERF(message)
|
|
#endif
|
|
|
|
// XXX File load and write are thread-safe by g_mutex_
|
|
#ifdef _WIN32
|
|
static QMutex g_mutex_;
|
|
#define pthread_mutex_lock(MUTEX) ((MUTEX)->lock())
|
|
#define pthread_mutex_unlock(MUTEX) ((MUTEX)->unlock())
|
|
#define pthread_mutex_trylock(MUTEX) (!(MUTEX)->tryLock(0))
|
|
#else
|
|
static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
|
|
#endif
|
|
static struct timeval g_last_update_ = {0, 0};
|
|
|
|
// Expected on-disk size of a dictionary described by *info:
// 4-byte version + lemma pool + per-lemma offset and score arrays
// (4 bytes each, hence lemma_count << 3) + optional predict/sync index
// arrays + the trailing UserDictInfo header. Used by validate() to
// cross-check the real file size.
inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
  return (4 + info->lemma_size + (info->lemma_count << 3)
#ifdef ___PREDICT_ENABLED___
          + (info->lemma_count << 2)
#endif
#ifdef ___SYNC_ENABLED___
          + (info->sync_count << 2)
#endif
          + sizeof(*info));
}
|
|
|
|
// Convert a stored raw score (packed frequency + last-modified-time
// index) into an NGram-style log value, decaying the weight of lemmas
// that have not been used recently.
inline LmaScoreType UserDict::translate_score(int raw_score) {
  // 1) ori_freq: original user frequency
  uint32 ori_freq = extract_score_freq(raw_score);
  // 2) lmt_off: lmt index (week offset for example)
  uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
  if (kUserDictLMTBitWidth < 16) {
    // NOTE(review): this clears only bit kUserDictLMTBitWidth rather
    // than all bits above it -- presumably dead code while the
    // configured width is 16; confirm before changing the width.
    uint64 mask = ~(1 << kUserDictLMTBitWidth);
    lmt_off &= mask;
  }
  // 3) now_off: current time index (current week offset for example)
  // assuming load_time_ is around current time
  uint64 now_off = load_time_.tv_sec;
  now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
  // Truncate the index to kUserDictLMTBitWidth bits via shift up/down.
  now_off = (now_off << (64 - kUserDictLMTBitWidth));
  now_off = (now_off >> (64 - kUserDictLMTBitWidth));
  // 4) factor: decide expand-factor
  int delta = now_off - lmt_off;
  if (delta > 4)
    delta = 4;
  // 80 for fresh lemmas, shrinking by 16 per elapsed time unit (min 16).
  int factor = 80 - (delta << 4);

  double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
  return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
                        * NGram::kLogValueAmplifier);
}
|
|
|
|
// The user frequency occupies the low 16 bits of a raw score.
inline int UserDict::extract_score_freq(int raw_score) {
  return raw_score & 0x0000ffff;
}
|
|
|
|
// Decode the last-modified time (seconds) from the high 16 bits of a
// raw score: index * granularity + epoch base.
inline uint64 UserDict::extract_score_lmt(int raw_score) {
  uint64 lmt_index = ((uint32)raw_score) >> 16;
  if (kUserDictLMTBitWidth < 16) {
    uint64 mask = ~(1 << kUserDictLMTBitWidth);
    lmt_index &= mask;
  }
  return lmt_index * kUserDictLMTGranularity + kUserDictLMTSince;
}
|
|
|
|
// Pack a raw score: high 16 bits carry the time index derived from lmt,
// low 16 bits carry the frequency.
inline int UserDict::build_score(uint64 lmt, int freq) {
  // Convert absolute seconds to an index since the epoch base, then
  // truncate it to kUserDictLMTBitWidth bits via shift up/down.
  lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
  lmt <<= (64 - kUserDictLMTBitWidth);
  lmt >>= (64 - kUserDictLMTBitWidth);
  const uint16 lmt16 = (uint16)lmt;
  return (lmt16 << 16) | (freq & 0x0000ffff);
}
|
|
|
|
// Parse a decimal integer (optional leading '+'/'-') from a UTF16-LE
// string of at most len code units. Stops at the first non-digit.
// Returns 0 for empty input.
inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
  int64 ret = 0;
  if (len <= 0)
    return ret;

  int flag = 1;
  const uint16 * endp = s + len;
  if (*s == '-') {
    flag = -1;
    s++;
  } else if (*s == '+') {
    s++;
  }

  // The bounds check must come first: a lone sign leaves s == endp and
  // the old order (*s tested before s < endp) read past the buffer.
  while (s < endp && *s >= '0' && *s <= '9') {
    // Standard decimal accumulation. The previous "ret += ret * 10 + ..."
    // produced wrong values for any number with more than one digit.
    ret = ret * 10 + (*s - '0');
    s++;
  }
  return ret * flag;
}
|
|
|
|
// Format v as a decimal UTF16-LE string into s (no NUL terminator).
// Returns the number of code units written, or 0 if s is NULL or the
// buffer is too small.
// NOTE(review): v == 0 emits no digits and returns 0, and v == INT64_MIN
// overflows on "v *= -1" -- both quirks predate this revision; confirm
// with callers before changing them.
inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
  if (!s || size <= 0)
    return 0;
  uint16 *endp = s + size;
  int ret_len = 0;
  if (v < 0) {
    *(s++) = '-';
    ++ret_len;
    v *= -1;
  }

  // Emit digits least-significant first.
  uint16 *b = s;
  while (s < endp && v != 0) {
    *(s++) = '0' + (v % 10);
    v = v / 10;
    ++ret_len;
  }

  // Ran out of room before all digits were written.
  if (v != 0)
    return 0;

  --s;

  // Reverse the digit run in place. The previous code only did
  // "*b = *s" without swapping back, corrupting the output.
  while (b < s) {
    uint16 t = *b;
    *b = *s;
    *s = t;
    ++b, --s;
  }

  return ret_len;
}
|
|
|
|
// OR flag bits into the first byte of the lemma record at offset
// (control bits in the offset are masked off first).
inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
  const uint32 idx = offset & kUserDictOffsetMask;
  lemmas_[idx] |= flag;
}
|
|
|
|
// The first byte of a lemma record holds its flag byte.
inline char UserDict::get_lemma_flag(uint32 offset) {
  const uint32 idx = offset & kUserDictOffsetMask;
  return (char)(lemmas_[idx]);
}
|
|
|
|
// The second byte of a lemma record holds its character count.
inline char UserDict::get_lemma_nchar(uint32 offset) {
  const uint32 idx = offset & kUserDictOffsetMask;
  return (char)(lemmas_[idx + 1]);
}
|
|
|
|
// Spelling ids start right after the 2-byte (flag, nchar) record prefix.
inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
  const uint32 idx = offset & kUserDictOffsetMask;
  return (uint16 *)(lemmas_ + idx + 2);
}
|
|
|
|
// The word (hanzi string) follows the spelling-id array, which holds
// nchar uint16 entries after the 2-byte record prefix.
inline uint16 * UserDict::get_lemma_word(uint32 offset) {
  const uint32 idx = offset & kUserDictOffsetMask;
  const uint8 nchar = get_lemma_nchar(idx);
  return (uint16 *)(lemmas_ + idx + 2 + (nchar << 1));
}
|
|
|
|
// Highest lemma id currently allocated. Ids are handed out sequentially
// from start_id_; deleted lemmas keep their ids (never reclaimed, for
// simplicity and performance).
inline LemmaIdType UserDict::get_max_lemma_id() {
  return start_id_ + dict_info_.lemma_count - 1;
}
|
|
|
|
// A lemma id is valid when it lies in [start_id_, get_max_lemma_id()].
inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
  return id >= start_id_ && id <= get_max_lemma_id();
}
|
|
|
|
inline bool UserDict::is_valid_state() {
|
|
if (state_ == USER_DICT_NONE)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
// Construct an empty, closed dictionary. All buffers start NULL;
// load_dict() performs the real allocation and file I/O.
UserDict::UserDict()
    : start_id_(0),
      version_(0),
      lemmas_(NULL),
      offsets_(NULL),
      scores_(NULL),
      ids_(NULL),
#ifdef ___PREDICT_ENABLED___
      predicts_(NULL),
#endif
#ifdef ___SYNC_ENABLED___
      syncs_(NULL),
      sync_count_size_(0),
#endif
      offsets_by_id_(NULL),
      lemma_count_left_(0),
      lemma_size_left_(0),
      dict_file_(NULL),
      state_(USER_DICT_NONE) {
  memset(&dict_info_, 0, sizeof(dict_info_));
  memset(&load_time_, 0, sizeof(load_time_));
#ifdef ___CACHE_ENABLED___
  cache_init();
#endif
}
|
|
|
|
UserDict::~UserDict() {
  // close_dict() writes back pending changes (when safe) and frees
  // every buffer.
  close_dict();
}
|
|
|
|
// Open (and if necessary recreate) the user dictionary file and load it
// into memory. Lemma ids are assigned from start_id.
// NOTE(review): end_id is accepted but unused here -- presumably kept
// for interface symmetry with other dictionaries; confirm with callers.
bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
                         LemmaIdType end_id) {
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  // Keep a private copy of the path for later flush/reload.
  dict_file_ = strdup(file_name);
  if (!dict_file_)
    return false;

  start_id_ = start_id;

  // If the file is missing or corrupt, try to recreate an empty one.
  if (false == validate(file_name) && false == reset(file_name)) {
    goto error;
  }
  if (false == load(file_name, start_id)) {
    goto error;
  }

  state_ = USER_DICT_SYNC;

  // Record the load time; used later to detect external updates.
  gettimeofday(&load_time_, NULL);

#ifdef ___DEBUG_PERF___
  DEBUG_PERF_END;
  LOGD_PERF("load_dict");
#endif
  return true;
error:
  free((void*)dict_file_);
  dict_file_ = NULL;
  start_id_ = 0;
  return false;
}
|
|
|
|
bool UserDict::close_dict() {
|
|
if (state_ == USER_DICT_NONE)
|
|
return true;
|
|
if (state_ == USER_DICT_SYNC)
|
|
goto out;
|
|
|
|
// If dictionary is written back by others,
|
|
// we can not simply write back here
|
|
// To do a safe flush, we have to discard all newly added
|
|
// lemmas and try to reload dict file.
|
|
pthread_mutex_lock(&g_mutex_);
|
|
if (load_time_.tv_sec > g_last_update_.tv_sec ||
|
|
(load_time_.tv_sec == g_last_update_.tv_sec &&
|
|
load_time_.tv_usec > g_last_update_.tv_usec)) {
|
|
write_back();
|
|
gettimeofday(&g_last_update_, NULL);
|
|
}
|
|
pthread_mutex_unlock(&g_mutex_);
|
|
|
|
out:
|
|
free((void*)dict_file_);
|
|
free(lemmas_);
|
|
free(offsets_);
|
|
free(offsets_by_id_);
|
|
free(scores_);
|
|
free(ids_);
|
|
#ifdef ___PREDICT_ENABLED___
|
|
free(predicts_);
|
|
#endif
|
|
|
|
version_ = 0;
|
|
dict_file_ = NULL;
|
|
lemmas_ = NULL;
|
|
#ifdef ___SYNC_ENABLED___
|
|
syncs_ = NULL;
|
|
sync_count_size_ = 0;
|
|
#endif
|
|
offsets_ = NULL;
|
|
offsets_by_id_ = NULL;
|
|
scores_ = NULL;
|
|
ids_ = NULL;
|
|
#ifdef ___PREDICT_ENABLED___
|
|
predicts_ = NULL;
|
|
#endif
|
|
|
|
memset(&dict_info_, 0, sizeof(dict_info_));
|
|
lemma_count_left_ = 0;
|
|
lemma_size_left_ = 0;
|
|
state_ = USER_DICT_NONE;
|
|
|
|
return true;
|
|
}
|
|
|
|
// Lemma count from the in-memory header. Entries flagged as removed
// still count: deletion only increments free_count/free_size and never
// decrements lemma_count.
size_t UserDict::number_of_lemmas() {
  return dict_info_.lemma_count;
}
|
|
|
|
// Intentionally a no-op: the user dict keeps no milestone state, and
// extend_dict() returns only a constant pseudo-handle (0 or 1).
void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
  return;
}
|
|
|
|
// Look up candidates for the (extended) spelling-id sequence in dep.
// Returns a pseudo milestone handle: 1 if anything matched or a longer
// lemma shares this prefix (so further extension is worthwhile), else 0.
MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
                                      const DictExtPara *dep,
                                      LmaPsbItem *lpi_items,
                                      size_t lpi_max, size_t *lpi_num) {
  if (is_valid_state() == false)
    return 0;

  // Set by _get_lpis when longer lemmas share this prefix.
  bool need_extend = false;

#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
                       lpi_items, lpi_max, &need_extend);
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_END;
  LOGD_PERF("extend_dict");
#endif
  return ((*lpi_num > 0 || need_extend) ? 1 : 0);
}
|
|
|
|
int UserDict::is_fuzzy_prefix_spell_id(
|
|
const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
|
|
if (len1 < searchable->splids_len)
|
|
return 0;
|
|
|
|
SpellingTrie &spl_trie = SpellingTrie::get_instance();
|
|
uint32 i = 0;
|
|
for (i = 0; i < searchable->splids_len; i++) {
|
|
const char py1 = *spl_trie.get_spelling_str(id1[i]);
|
|
uint16 off = 8 * (i % 4);
|
|
const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
|
|
if (py1 == py2)
|
|
continue;
|
|
return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
int UserDict::fuzzy_compare_spell_id(
|
|
const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
|
|
if (len1 < searchable->splids_len)
|
|
return -1;
|
|
if (len1 > searchable->splids_len)
|
|
return 1;
|
|
|
|
SpellingTrie &spl_trie = SpellingTrie::get_instance();
|
|
uint32 i = 0;
|
|
for (i = 0; i < len1; i++) {
|
|
const char py1 = *spl_trie.get_spelling_str(id1[i]);
|
|
uint16 off = 8 * (i % 4);
|
|
const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
|
|
if (py1 == py2)
|
|
continue;
|
|
if (py1 > py2)
|
|
return 1;
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
bool UserDict::is_prefix_spell_id(
|
|
const uint16 * fullids, uint16 fulllen,
|
|
const UserDictSearchable *searchable) {
|
|
if (fulllen < searchable->splids_len)
|
|
return false;
|
|
|
|
uint32 i = 0;
|
|
for (; i < searchable->splids_len; i++) {
|
|
uint16 start_id = searchable->splid_start[i];
|
|
uint16 count = searchable->splid_count[i];
|
|
if (fullids[i] >= start_id && fullids[i] < start_id + count)
|
|
continue;
|
|
else
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool UserDict::equal_spell_id(
|
|
const uint16 * fullids, uint16 fulllen,
|
|
const UserDictSearchable *searchable) {
|
|
if (fulllen != searchable->splids_len)
|
|
return false;
|
|
|
|
uint32 i = 0;
|
|
for (; i < fulllen; i++) {
|
|
uint16 start_id = searchable->splid_start[i];
|
|
uint16 count = searchable->splid_count[i];
|
|
if (fullids[i] >= start_id && fullids[i] < start_id + count)
|
|
continue;
|
|
else
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Binary search offsets_ (sorted by spelling) for the leftmost entry
// that is a fuzzy prefix of the query. Returns -1 when nothing matches.
int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
  int32 begin = 0;
  int32 end = dict_info_.lemma_count - 1;
  int32 middle = -1;

  int32 first_prefix = middle;
  // (Removed write-only local "last_matched": it was assigned but
  // never read; only first_prefix is returned.)

  while (begin <= end) {
    middle = (begin + end) >> 1;
    uint32 offset = offsets_[middle];
    uint8 nchar = get_lemma_nchar(offset);
    const uint16 * splids = get_lemma_spell_ids(offset);
    int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
    int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);

    if (pre)
      first_prefix = middle;

    if (cmp < 0) {
      begin = middle + 1;
    } else {
      // On equality keep moving left so the leftmost prefix wins.
      end = middle - 1;
    }
  }

  return first_prefix;
}
|
|
|
|
// Precompute the search descriptor for a spelling-id string: the
// per-position full-id ranges (half ids expand to a range) and a packed
// first-letter signature (4 letters per uint32) used by the fuzzy
// comparison routines.
void UserDict::prepare_locate(UserDictSearchable *searchable,
                              const uint16 *splid_str,
                              uint16 splid_str_len) {
  searchable->splids_len = splid_str_len;
  memset(searchable->signature, 0, sizeof(searchable->signature));

  SpellingTrie &spl_trie = SpellingTrie::get_instance();
  uint32 i = 0;
  for (; i < splid_str_len; i++) {
    if (spl_trie.is_half_id(splid_str[i])) {
      // A half id matches a contiguous range of full ids.
      searchable->splid_count[i] =
          spl_trie.half_to_full(splid_str[i],
                                &(searchable->splid_start[i]));
    } else {
      searchable->splid_count[i] = 1;
      searchable->splid_start[i] = splid_str[i];
    }
    // Pack the spelling's first letter into the signature.
    const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
    searchable->signature[i>>2] |= (py << (8 * (i % 4)));
  }
}
|
|
|
|
// Public lookup entry point; callers here never need the
// "could extend further" flag, so it is passed as NULL.
size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                          LmaPsbItem *lpi_items, size_t lpi_max) {
  return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
}
|
|
|
|
// Core lookup: fill lpi_items (up to lpi_max) with scored lemmas whose
// spelling exactly matches splid_str, and report via *need_extend
// whether longer lemmas share this prefix. Reloads the dictionary first
// if another instance has updated the file on disk.
size_t UserDict::_get_lpis(const uint16 *splid_str,
                           uint16 splid_str_len, LmaPsbItem *lpi_items,
                           size_t lpi_max, bool * need_extend) {
  bool tmp_extend;
  if (!need_extend)
    need_extend = &tmp_extend;

  *need_extend = false;

  if (is_valid_state() == false)
    return 0;
  if (lpi_max <= 0)
    return 0;

  // Best-effort freshness check; skipped entirely when the lock is
  // contended.
  if (0 == pthread_mutex_trylock(&g_mutex_)) {
    if (load_time_.tv_sec < g_last_update_.tv_sec ||
      (load_time_.tv_sec == g_last_update_.tv_sec &&
       load_time_.tv_usec < g_last_update_.tv_usec)) {
      // Others updated disk file, have to reload
      pthread_mutex_unlock(&g_mutex_);
      flush_cache();
    } else {
      pthread_mutex_unlock(&g_mutex_);
    }
  } else {
  }

  UserDictSearchable searchable;
  prepare_locate(&searchable, splid_str, splid_str_len);

  uint32 max_off = dict_info_.lemma_count;
#ifdef ___CACHE_ENABLED___
  // A cache hit narrows the scan to a known [start, start + count) run.
  int32 middle;
  uint32 start, count;
  bool cached = cache_hit(&searchable, &start, &count);
  if (cached) {
    middle = start;
    max_off = start + count;
  } else {
    middle = locate_first_in_offsets(&searchable);
    start = middle;
  }
#else
  int32 middle = locate_first_in_offsets(&searchable);
#endif

  if (middle == -1) {
#ifdef ___CACHE_ENABLED___
    // Remember the miss so the binary search is skipped next time.
    if (!cached)
      cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
#endif
    return 0;
  }

  size_t lpi_current = 0;

  // Scan forward through the fuzzy-matching run collecting exact hits.
  bool fuzzy_break = false;
  bool prefix_break = false;
  while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
    if (lpi_current >= lpi_max)
      break;
    uint32 offset = offsets_[middle];
    // Ignore deleted lemmas
    if (offset & kUserDictOffsetFlagRemove) {
      middle++;
      continue;
    }
    uint8 nchar = get_lemma_nchar(offset);
    uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
    if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#else
    if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
#endif
      fuzzy_break = true;
    }

    if (prefix_break == false) {
      if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
        if (*need_extend == false &&
            is_prefix_spell_id(splids, nchar, &searchable)) {
          *need_extend = true;
        }
      } else {
        prefix_break = true;
      }
    }

    if (equal_spell_id(splids, nchar, &searchable) == true) {
      lpi_items[lpi_current].psb = translate_score(scores_[middle]);
      lpi_items[lpi_current].id = ids_[middle];
      lpi_items[lpi_current].lma_len = nchar;
      lpi_current++;
    }
    middle++;
  }

#ifdef ___CACHE_ENABLED___
  // Record the scanned run for future lookups of the same query.
  if (!cached) {
    count = middle - start;
    cache_push(USER_DICT_CACHE, &searchable, start, count);
  }
#endif

  return lpi_current;
}
|
|
|
|
// Copy the lemma's hanzi string into str_buf (always NUL-terminated).
// Returns the number of characters copied, excluding the terminator.
uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
                               uint16 str_max) {
  if (is_valid_state() == false)
    return 0;
  if (is_valid_lemma_id(id_lemma) == false)
    return 0;
  // Guard added: with str_max == 0 the "str_max - 1" below evaluates to
  // -1 and the clamp selected 65535, overflowing the caller's buffer.
  if (str_buf == NULL || str_max == 0)
    return 0;
  uint32 offset = offsets_by_id_[id_lemma - start_id_];
  uint8 nchar = get_lemma_nchar(offset);
  char16 * str = get_lemma_word(offset);
  // Clamp to the buffer, leaving room for the terminator.
  uint16 m = nchar < str_max - 1 ? nchar : str_max - 1;
  int i = 0;
  for (; i < m; i++) {
    str_buf[i] = str[i];
  }
  str_buf[i] = 0;
  return m;
}
|
|
|
|
// Copy up to splids_max spelling ids of the lemma into splids.
// Returns the number of ids copied (0 for an invalid id).
uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                                  uint16 splids_max, bool arg_valid) {
  if (is_valid_lemma_id(id_lemma) == false)
    return 0;
  const uint32 offset = offsets_by_id_[id_lemma - start_id_];
  const uint8 nchar = get_lemma_nchar(offset);
  const uint16 * ids = get_lemma_spell_ids(offset);
  int copied = 0;
  while (copied < nchar && copied < splids_max) {
    splids[copied] = ids[copied];
    ++copied;
  }
  return copied;
}
|
|
|
|
// Fill npre_items with lemmas whose word begins with last_hzs, emitting
// only the continuation after the matched history. Relies on predicts_
// being sorted by word content, so the scan stops at the first
// non-matching entry. Returns the number of items added.
size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
                         NPredictItem *npre_items, size_t npre_max,
                         size_t b4_used) {
  uint32 new_added = 0;
#ifdef ___PREDICT_ENABLED___
  int32 end = dict_info_.lemma_count - 1;
  int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
  if (j == -1)
    return 0;

  while (j <= end) {
    uint32 offset = predicts_[j];
    // Ignore deleted lemmas
    if (offset & kUserDictOffsetFlagRemove) {
      j++;
      continue;
    }
    uint32 nchar = get_lemma_nchar(offset);
    uint16 * words = get_lemma_word(offset);
    uint16 * splids = get_lemma_spell_ids(offset);

    // Need at least one character beyond the history to predict.
    if (nchar <= hzs_len) {
      j++;
      continue;
    }

    if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
      if (new_added >= npre_max) {
        return new_added;
      }
      // Bytes to copy: at most kMaxPredictSize chars of the word,
      // minus the history that was already matched.
      uint32 cpy_len =
          (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
          - (hzs_len << 1);
      npre_items[new_added].his_len = hzs_len;
      npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
      memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
      // NUL-terminate when there is room in the fixed-size buffer.
      if ((cpy_len >> 1) < kMaxPredictSize) {
        npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
      }
      new_added++;
    } else {
      // Sorted order: the first mismatch ends the matching run.
      break;
    }

    j++;
  }
#endif
  return new_added;
}
|
|
|
|
// Find the offsets_ index of the exact (word, spelling) pair, skipping
// entries flagged as removed. Returns -1 when not found.
int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
                                  uint16 lemma_len) {
  int32 max_off = dict_info_.lemma_count;

  UserDictSearchable searchable;
  prepare_locate(&searchable, splid_str, lemma_len);
#ifdef ___CACHE_ENABLED___
  // A cache hit narrows the scan to a known [start, start + count) run.
  int32 off;
  uint32 start, count;
  bool cached = load_cache(&searchable, &start, &count);
  if (cached) {
    off = start;
    max_off = start + count;
  } else {
    off = locate_first_in_offsets(&searchable);
    start = off;
  }
#else
  int32 off = locate_first_in_offsets(&searchable);
#endif

  if (off == -1) {
    return off;
  }

  // Scan forward over the fuzzy-matching run looking for an exact hit.
  while (off < max_off) {
    uint32 offset = offsets_[off];
    if (offset & kUserDictOffsetFlagRemove) {
      off++;
      continue;
    }
    uint16 * splids = get_lemma_spell_ids(offset);
#ifdef ___CACHE_ENABLED___
    if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
      break;
#else
    if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
      break;
#endif
    if (equal_spell_id(splids, lemma_len, &searchable) == true) {
      // Spelling matches; verify the hanzi string character by character.
      uint16 * str = get_lemma_word(offset);
      uint32 i = 0;
      for (i = 0; i < lemma_len; i++) {
        if (str[i] == lemma_str[i])
          continue;
        break;
      }
      if (i < lemma_len) {
        off++;
        continue;
      }
#ifdef ___CACHE_ENABLED___
      // No need to save_cache here, since current function is invoked by
      // put_lemma. It's rarely possible for a user input same lemma twice.
      // That means first time user type a new lemma, it is newly added into
      // user dictionary, then it's possible that user type the same lemma
      // again.
      // Another reason save_cache can not be invoked here is this function
      // aborts when lemma is found, and it never knows the count.
#endif
      return off;
    }
    off++;
  }

  return -1;
}
|
|
|
|
#ifdef ___PREDICT_ENABLED___
|
|
// Binary search for where a word belongs in the word-sorted predicts_
// index, returning the index of the closest ordered position.
uint32 UserDict::locate_where_to_insert_in_predicts(
    const uint16 * words, int lemma_len) {
  int32 begin = 0;
  int32 end = dict_info_.lemma_count - 1;
  int32 middle = end;

  uint32 last_matched = middle;

  while (begin <= end) {
    middle = (begin + end) >> 1;
    // BUG FIX: this routine orders entries by word content, so it must
    // probe the word-sorted predicts_ array. It previously indexed
    // offsets_ (sorted by spelling ids), making the search meaningless.
    uint32 offset = predicts_[middle];
    uint8 nchar = get_lemma_nchar(offset);
    const uint16 * ws = get_lemma_word(offset);

    // Lexicographic compare on the common prefix, then by length.
    uint32 minl = nchar < lemma_len ? nchar : lemma_len;
    uint32 k = 0;
    int cmp = 0;

    for (; k < minl; k++) {
      if (ws[k] < words[k]) {
        cmp = -1;
        break;
      } else if (ws[k] > words[k]) {
        cmp = 1;
        break;
      }
    }
    if (cmp == 0) {
      if (nchar < lemma_len)
        cmp = -1;
      else if (nchar > lemma_len)
        cmp = 1;
    }

    if (cmp < 0) {
      begin = middle + 1;
      last_matched = middle;
    } else if (cmp > 0) {
      end = middle - 1;
    } else {
      end = middle - 1;
      last_matched = middle;
    }
  }

  return last_matched;
}
|
|
|
|
// Binary search the word-sorted predicts_ index for the first entry
// whose word has "words" (lemma_len chars) as a prefix. Returns -1 when
// no entry matches.
int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
  int32 begin = 0;
  int32 end = dict_info_.lemma_count - 1;
  int32 middle = -1;

  int32 last_matched = middle;

  while (begin <= end) {
    middle = (begin + end) >> 1;
    // BUG FIX: must probe the word-sorted predicts_ array; indexing the
    // spelling-sorted offsets_ array (as before) sent the binary search
    // to arbitrary entries.
    uint32 offset = predicts_[middle];
    uint8 nchar = get_lemma_nchar(offset);
    const uint16 * ws = get_lemma_word(offset);

    // Lexicographic compare on the common prefix, then by length.
    uint32 minl = nchar < lemma_len ? nchar : lemma_len;
    uint32 k = 0;
    int cmp = 0;

    for (; k < minl; k++) {
      if (ws[k] < words[k]) {
        cmp = -1;
        break;
      } else if (ws[k] > words[k]) {
        cmp = 1;
        break;
      }
    }
    if (cmp == 0) {
      // An equal-or-longer candidate sharing the prefix is a match;
      // keep searching left for the first such entry.
      if (nchar >= lemma_len)
        last_matched = middle;
      if (nchar < lemma_len)
        cmp = -1;
      else if (nchar > lemma_len)
        cmp = 1;
    }

    if (cmp < 0) {
      begin = middle + 1;
    } else if (cmp > 0) {
      end = middle - 1;
    } else {
      end = middle - 1;
    }
  }

  return last_matched;
}
|
|
|
|
#endif
|
|
|
|
// Resolve a (word, spelling) pair to its lemma id; 0 when absent.
LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
                                   uint16 lemma_len) {
  const int32 index = locate_in_offsets(lemma_str, splids, lemma_len);
  return index == -1 ? 0 : ids_[index];
}
|
|
|
|
// Translated (time-decayed) score for a lemma id; 0 when unavailable.
LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
  if (!is_valid_state() || !is_valid_lemma_id(lemma_id))
    return 0;

  return translate_score(_get_lemma_score(lemma_id));
}
|
|
|
|
// Translated (time-decayed) score for a (word, spelling) pair;
// 0 when the dictionary is closed or the pair is absent.
LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
                                       uint16 lemma_len) {
  if (!is_valid_state())
    return 0;
  return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
}
|
|
|
|
// Raw packed (freq + time) score for a lemma id; 0 when unavailable.
// Resolves the id to its record, then re-locates it in offsets_ to get
// the score slot index.
int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
  if (!is_valid_state() || !is_valid_lemma_id(lemma_id))
    return 0;

  const uint32 offset = offsets_by_id_[lemma_id - start_id_];

  const uint32 nchar = get_lemma_nchar(offset);
  uint16 * spl = get_lemma_spell_ids(offset);
  uint16 * wrd = get_lemma_word(offset);

  const int32 index = locate_in_offsets(wrd, spl, nchar);
  return index == -1 ? 0 : scores_[index];
}
|
|
|
|
// Raw packed score for an exact (word, spelling) pair; 0 when absent.
int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
                               uint16 lemma_len) {
  if (!is_valid_state())
    return 0;

  const int32 index = locate_in_offsets(lemma_str, splids, lemma_len);
  return index == -1 ? 0 : scores_[index];
}
|
|
|
|
#ifdef ___SYNC_ENABLED___
|
|
// Drop the sync-list entry referring to this lemma offset, if present,
// by swapping the last entry into its slot (order is not preserved).
void UserDict::remove_lemma_from_sync_list(uint32 offset) {
  offset &= kUserDictOffsetMask;
  for (uint32 i = 0; i < dict_info_.sync_count; i++) {
    if ((syncs_[i] & kUserDictOffsetMask) == offset) {
      syncs_[i] = syncs_[dict_info_.sync_count - 1];
      dict_info_.sync_count--;
      return;
    }
  }
}
|
|
#endif
|
|
|
|
#ifdef ___PREDICT_ENABLED___
|
|
// Mark the predicts_ entry referring to this lemma offset as removed
// (lazy deletion -- the array is not compacted).
void UserDict::remove_lemma_from_predict_list(uint32 offset) {
  offset &= kUserDictOffsetMask;
  for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
    if ((predicts_[i] & kUserDictOffsetMask) == offset) {
      predicts_[i] |= kUserDictOffsetFlagRemove;
      return;
    }
  }
}
|
|
#endif
|
|
|
|
bool UserDict::remove_lemma_by_offset_index(int offset_index) {
|
|
if (is_valid_state() == false)
|
|
return 0;
|
|
|
|
int32 off = offset_index;
|
|
if (off == -1) {
|
|
return false;
|
|
}
|
|
|
|
uint32 offset = offsets_[off];
|
|
uint32 nchar = get_lemma_nchar(offset);
|
|
|
|
offsets_[off] |= kUserDictOffsetFlagRemove;
|
|
|
|
#ifdef ___SYNC_ENABLED___
|
|
// Remove corresponding sync item
|
|
remove_lemma_from_sync_list(offset);
|
|
#endif
|
|
|
|
#ifdef ___PREDICT_ENABLED___
|
|
remove_lemma_from_predict_list(offset);
|
|
#endif
|
|
dict_info_.free_count++;
|
|
dict_info_.free_size += (2 + (nchar << 2));
|
|
|
|
if (state_ < USER_DICT_OFFSET_DIRTY)
|
|
state_ = USER_DICT_OFFSET_DIRTY;
|
|
return true;
|
|
}
|
|
|
|
// Remove a lemma by id: resolve its record, re-locate it in offsets_,
// then flag-delete it. Returns true on success.
bool UserDict::remove_lemma(LemmaIdType lemma_id) {
  if (is_valid_state() == false)
    return false;  // was "return 0"; same value, spelled as bool
  if (is_valid_lemma_id(lemma_id) == false)
    return false;
  uint32 offset = offsets_by_id_[lemma_id - start_id_];

  uint32 nchar = get_lemma_nchar(offset);
  uint16 * spl = get_lemma_spell_ids(offset);
  uint16 * wrd = get_lemma_word(offset);

  int32 off = locate_in_offsets(wrd, spl, nchar);

  return remove_lemma_by_offset_index(off);
}
|
|
|
|
// Discard in-memory state and reload the dictionary from disk. Used
// when another instance has updated the file (see _get_lpis).
void UserDict::flush_cache() {
  LemmaIdType start_id = start_id_;
  if (!dict_file_)
    return;
  // Duplicate the path first: close_dict() frees dict_file_.
  const char * file = strdup(dict_file_);
  if (!file)
    return;
  close_dict();
  load_dict(file, start_id, kUserDictIdEnd);
  free((void*)file);
#ifdef ___CACHE_ENABLED___
  cache_init();
#endif
  return;
}
|
|
|
|
// Create a brand-new, empty dictionary file: a 4-byte version stamp
// followed immediately by a zeroed UserDictInfo trailer. Returns false
// (and removes the partial file) on any write failure.
bool UserDict::reset(const char *file) {
  FILE *fp = fopen(file, "w+");
  if (!fp) {
    return false;
  }
  uint32 version = kUserDictVersion;
  size_t wred = fwrite(&version, 1, 4, fp);
  UserDictInfo info;
  memset(&info, 0, sizeof(info));
  // By default, no limitation for lemma count and size
  // thereby, reclaim_ratio is never used
  wred += fwrite(&info, 1, sizeof(info), fp);
  if (wred != sizeof(info) + sizeof(version)) {
    // Partial write: remove the corrupt file rather than leave it behind.
    fclose(fp);
    unlink(file);
    return false;
  }
  fclose(fp);
  return true;
}
|
|
|
|
bool UserDict::validate(const char *file) {
|
|
// b is ignored in POSIX compatible os including Linux
|
|
// while b is important flag for Windows to specify binary mode
|
|
FILE *fp = fopen(file, "rb");
|
|
if (!fp) {
|
|
return false;
|
|
}
|
|
|
|
size_t size;
|
|
size_t readed;
|
|
uint32 version;
|
|
UserDictInfo dict_info;
|
|
|
|
// validate
|
|
int err = fseek(fp, 0, SEEK_END);
|
|
if (err) {
|
|
goto error;
|
|
}
|
|
|
|
size = ftell(fp);
|
|
if (size < 4 + sizeof(dict_info)) {
|
|
goto error;
|
|
}
|
|
|
|
err = fseek(fp, 0, SEEK_SET);
|
|
if (err) {
|
|
goto error;
|
|
}
|
|
|
|
readed = fread(&version, 1, sizeof(version), fp);
|
|
if (readed < sizeof(version)) {
|
|
goto error;
|
|
}
|
|
if (version != kUserDictVersion) {
|
|
goto error;
|
|
}
|
|
|
|
err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
|
|
if (err) {
|
|
goto error;
|
|
}
|
|
|
|
readed = fread(&dict_info, 1, sizeof(dict_info), fp);
|
|
if (readed != sizeof(dict_info)) {
|
|
goto error;
|
|
}
|
|
|
|
if (size != get_dict_file_size(&dict_info)) {
|
|
goto error;
|
|
}
|
|
|
|
fclose(fp);
|
|
return true;
|
|
|
|
error:
|
|
fclose(fp);
|
|
return false;
|
|
}
|
|
|
|
bool UserDict::load(const char *file, LemmaIdType start_id) {
|
|
if (0 != pthread_mutex_trylock(&g_mutex_)) {
|
|
return false;
|
|
}
|
|
// b is ignored in POSIX compatible os including Linux
|
|
// while b is important flag for Windows to specify binary mode
|
|
FILE *fp = fopen(file, "rb");
|
|
if (!fp) {
|
|
pthread_mutex_unlock(&g_mutex_);
|
|
return false;
|
|
}
|
|
|
|
size_t readed, toread;
|
|
UserDictInfo dict_info;
|
|
uint8 *lemmas = NULL;
|
|
uint32 *offsets = NULL;
|
|
#ifdef ___SYNC_ENABLED___
|
|
uint32 *syncs = NULL;
|
|
#endif
|
|
uint32 *scores = NULL;
|
|
uint32 *ids = NULL;
|
|
uint32 *offsets_by_id = NULL;
|
|
#ifdef ___PREDICT_ENABLED___
|
|
uint32 *predicts = NULL;
|
|
#endif
|
|
size_t i;
|
|
int err;
|
|
|
|
err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
|
|
if (err) goto error;
|
|
|
|
readed = fread(&dict_info, 1, sizeof(dict_info), fp);
|
|
if (readed != sizeof(dict_info)) goto error;
|
|
|
|
lemmas = (uint8 *)malloc(
|
|
dict_info.lemma_size +
|
|
(kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
|
|
|
|
if (!lemmas) goto error;
|
|
|
|
offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
|
|
if (!offsets) goto error;
|
|
|
|
#ifdef ___PREDICT_ENABLED___
|
|
predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
|
|
if (!predicts) goto error;
|
|
#endif
|
|
|
|
#ifdef ___SYNC_ENABLED___
|
|
syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
|
|
if (!syncs) goto error;
|
|
#endif
|
|
|
|
scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
|
|
if (!scores) goto error;
|
|
|
|
ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
|
|
if (!ids) goto error;
|
|
|
|
offsets_by_id = (uint32 *)malloc(
|
|
(dict_info.lemma_count + kUserDictPreAlloc) << 2);
|
|
if (!offsets_by_id) goto error;
|
|
|
|
err = fseek(fp, 4, SEEK_SET);
|
|
if (err) goto error;
|
|
|
|
readed = 0;
|
|
while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
|
|
readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
|
|
}
|
|
if (readed < dict_info.lemma_size)
|
|
goto error;
|
|
|
|
toread = (dict_info.lemma_count << 2);
|
|
readed = 0;
|
|
while (readed < toread && !ferror(fp) && !feof(fp)) {
|
|
readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
|
|
}
|
|
if (readed < toread)
|
|
goto error;
|
|
|
|
#ifdef ___PREDICT_ENABLED___
|
|
toread = (dict_info.lemma_count << 2);
|
|
readed = 0;
|
|
while (readed < toread && !ferror(fp) && !feof(fp)) {
|
|
readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
|
|
}
|
|
if (readed < toread)
|
|
goto error;
|
|
#endif
|
|
|
|
readed = 0;
|
|
while (readed < toread && !ferror(fp) && !feof(fp)) {
|
|
readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
|
|
}
|
|
if (readed < toread)
|
|
goto error;
|
|
|
|
#ifdef ___SYNC_ENABLED___
|
|
toread = (dict_info.sync_count << 2);
|
|
readed = 0;
|
|
while (readed < toread && !ferror(fp) && !feof(fp)) {
|
|
readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
|
|
}
|
|
if (readed < toread)
|
|
goto error;
|
|
#endif
|
|
|
|
for (i = 0; i < dict_info.lemma_count; i++) {
|
|
ids[i] = start_id + i;
|
|
offsets_by_id[i] = offsets[i];
|
|
}
|
|
|
|
lemmas_ = lemmas;
|
|
offsets_ = offsets;
|
|
#ifdef ___SYNC_ENABLED___
|
|
syncs_ = syncs;
|
|
sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
|
|
#endif
|
|
offsets_by_id_ = offsets_by_id;
|
|
scores_ = scores;
|
|
ids_ = ids;
|
|
#ifdef ___PREDICT_ENABLED___
|
|
predicts_ = predicts;
|
|
#endif
|
|
lemma_count_left_ = kUserDictPreAlloc;
|
|
lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
|
|
memcpy(&dict_info_, &dict_info, sizeof(dict_info));
|
|
state_ = USER_DICT_SYNC;
|
|
|
|
fclose(fp);
|
|
|
|
pthread_mutex_unlock(&g_mutex_);
|
|
return true;
|
|
|
|
error:
|
|
if (lemmas) free(lemmas);
|
|
if (offsets) free(offsets);
|
|
#ifdef ___SYNC_ENABLED___
|
|
if (syncs) free(syncs);
|
|
#endif
|
|
if (scores) free(scores);
|
|
if (ids) free(ids);
|
|
if (offsets_by_id) free(offsets_by_id);
|
|
#ifdef ___PREDICT_ENABLED___
|
|
if (predicts) free(predicts);
|
|
#endif
|
|
fclose(fp);
|
|
pthread_mutex_unlock(&g_mutex_);
|
|
return false;
|
|
}
|
|
|
|
// Flush in-memory dictionary state to the dictionary file, choosing the
// cheapest write strategy for the current dirtiness level, then truncate
// the file to the written size and mark the dictionary clean (SYNC).
void UserDict::write_back() {
  // XXX write back is only allowed from close_dict due to thread-safe sake
  if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
    return;
  int fd = open(dict_file_, O_WRONLY);
  if (fd == -1)
    return;
  // Each write_back_* helper seeks to its own region and ends with the
  // trailing dict_info record, so the file stays self-consistent.
  switch (state_) {
    case USER_DICT_DEFRAGMENTED:
      // Lemma block was compacted; everything must be rewritten.
      write_back_all(fd);
      break;
    case USER_DICT_LEMMA_DIRTY:
      write_back_lemma(fd);
      break;
    case USER_DICT_OFFSET_DIRTY:
      write_back_offset(fd);
      break;
    case USER_DICT_SCORE_DIRTY:
      write_back_score(fd);
      break;
#ifdef ___SYNC_ENABLED___
    case USER_DICT_SYNC_DIRTY:
      write_back_sync(fd);
      break;
#endif
    default:
      break;
  }
  // It seems truncate is not need on Linux, Windows except Mac
  // I am doing it here anyway for safety.
  // The helpers leave the file position just past the last byte written.
  off_t cur = lseek(fd, 0, SEEK_CUR);
#ifndef _WIN32
  ftruncate(fd, cur);
#endif
  close(fd);
  state_ = USER_DICT_SYNC;
}
|
|
|
|
#ifdef ___SYNC_ENABLED___
|
|
void UserDict::write_back_sync(int fd) {
|
|
int err = lseek(fd, 4 + dict_info_.lemma_size
|
|
+ (dict_info_.lemma_count << 3)
|
|
#ifdef ___PREDICT_ENABLED___
|
|
+ (dict_info_.lemma_count << 2)
|
|
#endif
|
|
, SEEK_SET);
|
|
if (err == -1)
|
|
return;
|
|
write(fd, syncs_, dict_info_.sync_count << 2);
|
|
write(fd, &dict_info_, sizeof(dict_info_));
|
|
}
|
|
#endif
|
|
|
|
// Rewrite everything after the lemma block: offsets, optional predicts,
// scores, optional sync queue, and the trailing dict_info record.
// Used when offsets changed (e.g. after reclaim) but lemma bytes did not.
void UserDict::write_back_offset(int fd) {
  // Seek past the 4-byte header and the raw lemma block.
  int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
  if (err == -1)
    return;
  // Each array is uint32-per-lemma, hence the << 2 byte counts.
  write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
  write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}
|
|
|
|
// Rewrite only the score array (and what follows it): scores, optional
// sync queue, and the trailing dict_info record. Cheapest path when only
// frequencies/timestamps changed.
void UserDict::write_back_score(int fd) {
  // Skip header, lemma block, offsets array, and (if enabled) predicts.
  int err = lseek(fd, 4 + dict_info_.lemma_size
                  + (dict_info_.lemma_count << 2)
#ifdef ___PREDICT_ENABLED___
                  + (dict_info_.lemma_count << 2)
#endif
                  , SEEK_SET);
  if (err == -1)
    return;
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}
|
|
|
|
// Append newly added lemma bytes and rewrite the index arrays that follow
// the lemma block. New lemmas are only ever appended, so the unchanged
// prefix of the lemma block is skipped rather than rewritten.
void UserDict::write_back_lemma(int fd) {
  int err = lseek(fd, 4, SEEK_SET);
  if (err == -1)
    return;
  // New lemmas are always appended, no need to write whole lemma block.
  // Bytes consumed from the pre-allocated pool == bytes to append.
  size_t need_write = kUserDictPreAlloc *
      (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
  // Seek to the start of the appended region within the lemma block.
  err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
  if (err == -1)
    return;
  write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);

  // Everything after the lemma block shifts, so rewrite it all.
  write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
  write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}
|
|
|
|
// Rewrite the entire dictionary file after the 4-byte header. Used after
// defragmentation, when the lemma block itself has been compacted.
void UserDict::write_back_all(int fd) {
  // XXX lemma_size is handled differently in writeall
  // and writelemma. I update lemma_size and lemma_count in different
  // places for these two cases. Should fix it to make it consistent.
  int err = lseek(fd, 4, SEEK_SET);
  if (err == -1)
    return;
  // Layout: [lemmas][offsets][predicts?][scores][syncs?][dict_info].
  write(fd, lemmas_, dict_info_.lemma_size);
  write(fd, offsets_, dict_info_.lemma_count << 2);
#ifdef ___PREDICT_ENABLED___
  write(fd, predicts_, dict_info_.lemma_count << 2);
#endif
  write(fd, scores_, dict_info_.lemma_count << 2);
#ifdef ___SYNC_ENABLED___
  write(fd, syncs_, dict_info_.sync_count << 2);
#endif
  write(fd, &dict_info_, sizeof(dict_info_));
}
|
|
|
|
#ifdef ___CACHE_ENABLED___
|
|
// Look up a previous search result for this query in the per-length
// ring-buffer cache. On a hit, *offset/*length receive the cached search
// window and true is returned; otherwise false.
bool UserDict::load_cache(UserDictSearchable *searchable,
                          uint32 *offset, uint32 *length) {
  UserDictCache *cache = &caches_[searchable->splids_len - 1];
  // head == tail means the ring is empty.
  if (cache->head == cache->tail)
    return false;

  const uint16 sig_len = kMaxLemmaSize / 4;
  uint16 idx = cache->head;
  do {
    // Compare the stored signature word-by-word with the query's.
    uint16 k = 0;
    while (k < sig_len &&
           cache->signatures[idx][k] == searchable->signature[k]) {
      k++;
    }
    if (k == sig_len) {
      // Full signature match: return the cached window.
      *offset = cache->offsets[idx];
      *length = cache->lengths[idx];
      return true;
    }
    // Advance through the ring with wrap-around.
    idx++;
    if (idx >= kUserDictCacheSize)
      idx -= kUserDictCacheSize;
  } while (idx != cache->tail);
  return false;
}
|
|
|
|
// Store a search result in the per-length ring-buffer cache, evicting the
// oldest entry when the ring is full.
void UserDict::save_cache(UserDictSearchable *searchable,
                          uint32 offset, uint32 length) {
  UserDictCache *cache = &caches_[searchable->splids_len - 1];
  const uint16 slot = cache->tail;

  // Record the search window and the query's signature in the tail slot.
  cache->offsets[slot] = offset;
  cache->lengths[slot] = length;
  const uint16 sig_len = kMaxLemmaSize / 4;
  for (uint16 k = 0; k < sig_len; k++) {
    cache->signatures[slot][k] = searchable->signature[k];
  }

  // Advance tail with wrap-around.
  uint16 new_tail = slot + 1;
  if (new_tail >= kUserDictCacheSize) {
    new_tail -= kUserDictCacheSize;
  }
  // Ring full: drop the oldest entry by advancing head too.
  if (new_tail == cache->head) {
    cache->head++;
    if (cache->head >= kUserDictCacheSize) {
      cache->head -= kUserDictCacheSize;
    }
  }
  cache->tail = new_tail;
}
|
|
|
|
// Empty the positive-lookup caches: zeroing each ring makes head == tail,
// which load_cache treats as "empty".
void UserDict::reset_cache() {
  memset(caches_, 0, sizeof(caches_));
}
|
|
|
|
// Return true when this query's signature is recorded in the miss cache,
// i.e. a previous identical search is known to have found nothing.
bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
  UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
  // head == tail means the ring is empty.
  if (cache->head == cache->tail)
    return false;

  const uint16 sig_len = kMaxLemmaSize / 4;
  uint16 idx = cache->head;
  do {
    // Compare the stored signature word-by-word with the query's.
    uint16 k = 0;
    while (k < sig_len &&
           cache->signatures[idx][k] == searchable->signature[k]) {
      k++;
    }
    if (k == sig_len) {
      // Full match: this query previously missed.
      return true;
    }
    // Advance through the ring with wrap-around.
    idx++;
    if (idx >= kUserDictMissCacheSize)
      idx -= kUserDictMissCacheSize;
  } while (idx != cache->tail);
  return false;
}
|
|
|
|
// Record this query's signature in the miss cache, evicting the oldest
// entry when the ring is full.
void UserDict::save_miss_cache(UserDictSearchable *searchable) {
  UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
  const uint16 slot = cache->tail;

  // Only the signature is stored — a miss carries no payload.
  const uint16 sig_len = kMaxLemmaSize / 4;
  for (uint16 k = 0; k < sig_len; k++) {
    cache->signatures[slot][k] = searchable->signature[k];
  }

  // Advance tail with wrap-around.
  uint16 new_tail = slot + 1;
  if (new_tail >= kUserDictMissCacheSize) {
    new_tail -= kUserDictMissCacheSize;
  }
  // Ring full: drop the oldest entry by advancing head too.
  if (new_tail == cache->head) {
    cache->head++;
    if (cache->head >= kUserDictMissCacheSize) {
      cache->head -= kUserDictMissCacheSize;
    }
  }
  cache->tail = new_tail;
}
|
|
|
|
// Empty the miss caches: zeroing each ring makes head == tail, which
// load_miss_cache treats as "empty".
void UserDict::reset_miss_cache() {
  memset(miss_caches_, 0, sizeof(miss_caches_));
}
|
|
|
|
// Bring both caches (hit and miss) to the empty state. The two resets are
// independent, so their order does not matter.
void UserDict::cache_init() {
  reset_miss_cache();
  reset_cache();
}
|
|
|
|
// Consult both caches for this query. A recorded miss yields an empty
// window (*offset = *length = 0) and true; a positive hit fills the real
// window; otherwise false.
bool UserDict::cache_hit(UserDictSearchable *searchable,
                         uint32 *offset, uint32 *length) {
  // Miss cache first: known-absent queries short-circuit the search.
  if (load_miss_cache(searchable)) {
    *offset = 0;
    *length = 0;
    return true;
  }
  // Fall back to the positive cache.
  return load_cache(searchable, offset, length);
}
|
|
|
|
// Record a search outcome in the appropriate cache: a miss stores only the
// signature, a hit stores the (offset, length) window. Unknown cache types
// are ignored, as before.
void UserDict::cache_push(UserDictCacheType type,
                          UserDictSearchable *searchable,
                          uint32 offset, uint32 length) {
  if (type == USER_DICT_MISS_CACHE) {
    save_miss_cache(searchable);
  } else if (type == USER_DICT_CACHE) {
    save_cache(searchable, offset, length);
  }
}
|
|
|
|
#endif
|
|
|
|
void UserDict::defragment(void) {
|
|
#ifdef ___DEBUG_PERF___
|
|
DEBUG_PERF_BEGIN;
|
|
#endif
|
|
if (is_valid_state() == false)
|
|
return;
|
|
// Fixup offsets_, set REMOVE flag to lemma's flag if needed
|
|
size_t first_freed = 0;
|
|
size_t first_inuse = 0;
|
|
while (first_freed < dict_info_.lemma_count) {
|
|
// Find first freed offset
|
|
while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
|
|
first_freed < dict_info_.lemma_count) {
|
|
first_freed++;
|
|
}
|
|
if (first_freed < dict_info_.lemma_count) {
|
|
// Save REMOVE flag to lemma flag
|
|
int off = offsets_[first_freed];
|
|
set_lemma_flag(off, kUserDictLemmaFlagRemove);
|
|
} else {
|
|
break;
|
|
}
|
|
// Find first inuse offse after first_freed
|
|
first_inuse = first_freed + 1;
|
|
while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
|
|
(first_inuse < dict_info_.lemma_count)) {
|
|
// Save REMOVE flag to lemma flag
|
|
int off = offsets_[first_inuse];
|
|
set_lemma_flag(off, kUserDictLemmaFlagRemove);
|
|
first_inuse++;
|
|
}
|
|
if (first_inuse >= dict_info_.lemma_count) {
|
|
break;
|
|
}
|
|
// Swap offsets_
|
|
int tmp = offsets_[first_inuse];
|
|
offsets_[first_inuse] = offsets_[first_freed];
|
|
offsets_[first_freed] = tmp;
|
|
// Move scores_, no need to swap
|
|
tmp = scores_[first_inuse];
|
|
scores_[first_inuse] = scores_[first_freed];
|
|
scores_[first_freed] = tmp;
|
|
// Swap ids_
|
|
LemmaIdType tmpid = ids_[first_inuse];
|
|
ids_[first_inuse] = ids_[first_freed];
|
|
ids_[first_freed] = tmpid;
|
|
// Go on
|
|
first_freed++;
|
|
}
|
|
#ifdef ___PREDICT_ENABLED___
|
|
// Fixup predicts_
|
|
first_freed = 0;
|
|
first_inuse = 0;
|
|
while (first_freed < dict_info_.lemma_count) {
|
|
// Find first freed offset
|
|
while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
|
|
first_freed < dict_info_.lemma_count) {
|
|
first_freed++;
|
|
}
|
|
if (first_freed >= dict_info_.lemma_count)
|
|
break;
|
|
// Find first inuse offse after first_freed
|
|
first_inuse = first_freed + 1;
|
|
while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
|
|
&& (first_inuse < dict_info_.lemma_count)) {
|
|
first_inuse++;
|
|
}
|
|
if (first_inuse >= dict_info_.lemma_count) {
|
|
break;
|
|
}
|
|
// Swap offsets_
|
|
int tmp = predicts_[first_inuse];
|
|
predicts_[first_inuse] = predicts_[first_freed];
|
|
predicts_[first_freed] = tmp;
|
|
// Go on
|
|
first_freed++;
|
|
}
|
|
#endif
|
|
dict_info_.lemma_count = first_freed;
|
|
// Fixup lemmas_
|
|
size_t begin = 0;
|
|
size_t end = 0;
|
|
size_t dst = 0;
|
|
int total_size = dict_info_.lemma_size + lemma_size_left_;
|
|
int total_count = dict_info_.lemma_count + lemma_count_left_;
|
|
size_t real_size = total_size - lemma_size_left_;
|
|
while (dst < real_size) {
|
|
unsigned char flag = get_lemma_flag(dst);
|
|
unsigned char nchr = get_lemma_nchar(dst);
|
|
if ((flag & kUserDictLemmaFlagRemove) == 0) {
|
|
dst += nchr * 4 + 2;
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
if (dst >= real_size)
|
|
return;
|
|
|
|
end = dst;
|
|
while (end < real_size) {
|
|
begin = end + get_lemma_nchar(end) * 4 + 2;
|
|
repeat:
|
|
// not used any more
|
|
if (begin >= real_size)
|
|
break;
|
|
unsigned char flag = get_lemma_flag(begin);
|
|
unsigned char nchr = get_lemma_nchar(begin);
|
|
if (flag & kUserDictLemmaFlagRemove) {
|
|
begin += nchr * 4 + 2;
|
|
goto repeat;
|
|
}
|
|
end = begin + nchr * 4 + 2;
|
|
while (end < real_size) {
|
|
unsigned char eflag = get_lemma_flag(end);
|
|
unsigned char enchr = get_lemma_nchar(end);
|
|
if ((eflag & kUserDictLemmaFlagRemove) == 0) {
|
|
end += enchr * 4 + 2;
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
|
|
for (size_t j = 0; j < dict_info_.lemma_count; j++) {
|
|
if (offsets_[j] >= begin && offsets_[j] < end) {
|
|
offsets_[j] -= (begin - dst);
|
|
offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
|
|
}
|
|
#ifdef ___PREDICT_ENABLED___
|
|
if (predicts_[j] >= begin && predicts_[j] < end) {
|
|
predicts_[j] -= (begin - dst);
|
|
}
|
|
#endif
|
|
}
|
|
#ifdef ___SYNC_ENABLED___
|
|
for (size_t j = 0; j < dict_info_.sync_count; j++) {
|
|
if (syncs_[j] >= begin && syncs_[j] < end) {
|
|
syncs_[j] -= (begin - dst);
|
|
}
|
|
}
|
|
#endif
|
|
dst += (end - begin);
|
|
}
|
|
|
|
dict_info_.free_count = 0;
|
|
dict_info_.free_size = 0;
|
|
dict_info_.lemma_size = dst;
|
|
lemma_size_left_ = total_size - dict_info_.lemma_size;
|
|
lemma_count_left_ = total_count - dict_info_.lemma_count;
|
|
|
|
// XXX Without following code,
|
|
// offsets_by_id_ is not reordered.
|
|
// That's to say, all removed lemmas' ids are not collected back.
|
|
// There may not be room for addition of new lemmas due to
|
|
// offsests_by_id_ reason, although lemma_size_left_ is fixed.
|
|
// By default, we do want defrag as fast as possible, because
|
|
// during defrag procedure, other peers can not write new lemmas
|
|
// to user dictionary file.
|
|
// XXX If write-back is invoked immediately after
|
|
// this defragment, no need to fix up following in-mem data.
|
|
for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
|
|
ids_[i] = start_id_ + i;
|
|
offsets_by_id_[i] = offsets_[i];
|
|
}
|
|
|
|
state_ = USER_DICT_DEFRAGMENTED;
|
|
|
|
#ifdef ___DEBUG_PERF___
|
|
DEBUG_PERF_END;
|
|
LOGD_PERF("defragment");
|
|
#endif
|
|
}
|
|
|
|
#ifdef ___SYNC_ENABLED___
|
|
void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
|
|
if (is_valid_state() == false)
|
|
return;
|
|
if (end > dict_info_.sync_count)
|
|
end = dict_info_.sync_count;
|
|
memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
|
|
dict_info_.sync_count -= (end - start);
|
|
if (state_ < USER_DICT_SYNC_DIRTY)
|
|
state_ = USER_DICT_SYNC_DIRTY;
|
|
}
|
|
|
|
int UserDict::get_sync_count() {
|
|
if (is_valid_state() == false)
|
|
return 0;
|
|
return dict_info_.sync_count;
|
|
}
|
|
|
|
// Insert or update a lemma WITHOUT queueing it for sync (used when the
// lemma came FROM the sync peer). Returns the lemma id, or 0 on failure.
// If the insert fails because a configured limit was hit, reclaims and
// defragments once, then retries exactly once.
LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
                                        uint16 lemma_len, uint16 count, uint64 lmt) {
  int again = 0;
begin:
  LemmaIdType id;
  // Temporarily hide syncs_ so _put_lemma's sync-queueing branch
  // (which tests `syncs_ != NULL`) is skipped, then restore it.
  uint32 * syncs_bak = syncs_;
  syncs_ = NULL;
  id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
  syncs_ = syncs_bak;
  if (id == 0 && again == 0) {
    // Retry only when the failure is attributable to a capacity limit.
    if ((dict_info_.limit_lemma_count > 0 &&
        dict_info_.lemma_count >= dict_info_.limit_lemma_count)
        || (dict_info_.limit_lemma_size > 0 &&
            dict_info_.lemma_size + (2 + (lemma_len << 2))
            > dict_info_.limit_lemma_size)) {
      // XXX Always reclaim and defrag in sync code path
      // sync thread is background thread and ok with heavy work
      reclaim();
      defragment();
      flush_cache();
      again = 1;
      goto begin;
    }
  }
  return id;
}
|
|
|
|
// Parse a UTF-16LE buffer of records of the form
//   "<pinyin tokens separated by 0x20>,<hanzi>,<frequency>,<last_mod>;"
// and insert each record without queueing it for sync (sync-import path).
// Returns the number of records inserted. Parsing stops at the first
// malformed record.
// FIXES vs the original:
//  * the SpellingParser was heap-allocated with `new` and never deleted
//    (leak on every call) — it is now a stack object;
//  * every field-scan loop dereferenced *p BEFORE checking
//    `(p - ptr) < len`, reading one char16 past the end of the buffer —
//    the bounds check now comes first.
int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
  int newly_added = 0;

  SpellingParser spl_parser;
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  char16 *ptr = lemmas;

  // Extract pinyin,words,frequence,last_mod_time
  char16 * p = ptr, * py16 = ptr;
  char16 * hz16 = NULL;
  int py16_len = 0;
  uint16 splid[kMaxLemmaSize];
  int splid_len = 0;
  int hz16_len = 0;
  char16 * fr16 = NULL;
  int fr16_len = 0;

  while (p - ptr < len) {
    // Pinyin field: spellings separated by ' ' (0x20), ends at ',' (0x2c).
    py16 = p;
    splid_len = 0;
    while ((p - ptr) < len && *p != 0x2c) {
      if (*p == 0x20)
        splid_len++;
      p++;
    }
    splid_len++;
    if (p - ptr == len)
      break;
    py16_len = p - py16;
    if (kMaxLemmaSize < splid_len) {
      break;
    }
    bool is_pre;
    int splidl = spl_parser.splstr16_to_idxs_f(
        py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
    // Record is malformed unless every token parsed to a spelling id.
    if (splidl != splid_len)
      break;
    // Phrase field: one hanzi per spelling, ends at ','.
    hz16 = ++p;
    while ((p - ptr) < len && *p != 0x2c) {
      p++;
    }
    hz16_len = p - hz16;
    if (hz16_len != splid_len)
      break;
    // Frequency field, ends at ','.
    fr16 = ++p;
    fr16_len = 0;
    while ((p - ptr) < len && *p != 0x2c) {
      p++;
    }
    fr16_len = p - fr16;
    uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
    // Last-modified-time field, ends at ';' (0x3b).
    fr16 = ++p;
    fr16_len = 0;
    while ((p - ptr) < len && *p != 0x3b) {
      p++;
    }
    fr16_len = p - fr16;
    uint64 last_mod = utf16le_atoll(fr16, fr16_len);

    put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
    newly_added++;

    p++;
  }

#ifdef ___DEBUG_PERF___
  DEBUG_PERF_END;
  LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
#endif
  return newly_added;
}
|
|
|
|
// Serialize queued-for-sync lemmas from the front of the queue into `str`
// (UTF-16LE, capacity `size` char16s) as records
//   "<pinyin tokens>,<hanzi>,<frequency>,<last_mod>;"
// Stops when the next record would not fit. Writes the number of records
// emitted to *count and returns the number of char16s written.
// FIX: the scratch-buffer bounds checks used `temp + sizeof(temp)`, where
// `temp` is char16[256] — element-typed pointer arithmetic advanced by the
// BYTE count (512), i.e. a limit 2x past the end of the array, so every
// check permitted a buffer overflow. The limit is now computed in elements.
// Also removed `score_temp`/`pscore_temp`, which were assigned but never
// otherwise used.
int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
    char16 * str, int size, int * count) {
  int len = 0;
  *count = 0;

  int left_len = size;

  if (is_valid_state() == false)
    return len;

  SpellingTrie * spl_trie = &SpellingTrie::get_instance();
  if (!spl_trie) {
    return 0;
  }

  uint32 i;
  for (i = 0; i < dict_info_.sync_count; i++) {
    int offset = syncs_[i];
    uint32 nchar = get_lemma_nchar(offset);
    uint16 *spl = get_lemma_spell_ids(offset);
    uint16 *wrd = get_lemma_word(offset);
    int score = _get_lemma_score(wrd, spl, nchar);

    // NOTE(review): static scratch buffer keeps this function
    // non-reentrant / not thread-safe, as in the original.
    static char16 temp[256], *ptemp = temp;
    // One-past-the-end of temp, in elements (the actual fix).
    char16 * const temp_end = temp + sizeof(temp) / sizeof(temp[0]);

    ptemp = temp;

    uint32 j;
    // Add pinyin
    for (j = 0; j < nchar; j++) {
      int ret_len = spl_trie->get_spelling_str16(
          spl[j], ptemp, temp_end - ptemp);
      if (ret_len <= 0)
        break;
      ptemp += ret_len;
      if (ptemp < temp_end - 1) {
        *(ptemp++) = ' ';
      } else {
        // Buffer exhausted: force the "incomplete" path below.
        j = 0;
        break;
      }
    }
    if (j < nchar) {
      continue;
    }
    // Replace the trailing space with the field separator.
    ptemp--;
    if (ptemp < temp_end - 1) {
      *(ptemp++) = ',';
    } else {
      continue;
    }
    // Add phrase
    for (j = 0; j < nchar; j++) {
      if (ptemp < temp_end - 1) {
        *(ptemp++) = wrd[j];
      } else {
        break;
      }
    }
    if (j < nchar) {
      continue;
    }
    if (ptemp < temp_end - 1) {
      *(ptemp++) = ',';
    } else {
      continue;
    }
    // Add frequency
    uint32 intf = extract_score_freq(score);
    int ret_len = utf16le_lltoa(intf, ptemp, temp_end - ptemp);
    if (ret_len <= 0)
      continue;
    ptemp += ret_len;
    if (ptemp < temp_end - 1) {
      *(ptemp++) = ',';
    } else {
      continue;
    }
    // Add last modified time
    uint64 last_mod = extract_score_lmt(score);
    ret_len = utf16le_lltoa(last_mod, ptemp, temp_end - ptemp);
    if (ret_len <= 0)
      continue;
    ptemp += ret_len;
    if (ptemp < temp_end - 1) {
      *(ptemp++) = ';';
    } else {
      continue;
    }

    // Write to string
    int need_len = ptemp - temp;
    if (need_len > left_len)
      break;
    memcpy(str + len, temp, need_len * 2);
    left_len -= need_len;

    len += need_len;
    (*count)++;
  }

  if (len > 0) {
    if (state_ < USER_DICT_SYNC_DIRTY)
      state_ = USER_DICT_SYNC_DIRTY;
  }
  return len;
}
|
|
|
|
#endif
|
|
|
|
// Fill *stat with a snapshot of the dictionary's statistics.
// Returns false when the dictionary is not loaded or stat is NULL.
bool UserDict::state(UserDictStat * stat) {
  if (is_valid_state() == false)
    return false;
  if (!stat)
    return false;
  stat->version = version_;
  stat->file_name = dict_file_;
  stat->load_time.tv_sec = load_time_.tv_sec;
  stat->load_time.tv_usec = load_time_.tv_usec;
  // g_last_update_ is shared across instances; read it under the lock.
  pthread_mutex_lock(&g_mutex_);
  stat->last_update.tv_sec = g_last_update_.tv_sec;
  stat->last_update.tv_usec = g_last_update_.tv_usec;
  pthread_mutex_unlock(&g_mutex_);
  stat->disk_size = get_dict_file_size(&dict_info_);
  stat->lemma_count = dict_info_.lemma_count;
  stat->lemma_size = dict_info_.lemma_size;
  // "free" entries are logically-deleted lemmas awaiting defragmentation.
  stat->delete_count = dict_info_.free_count;
  stat->delete_size = dict_info_.free_size;
#ifdef ___SYNC_ENABLED___
  stat->sync_count = dict_info_.sync_count;
#endif
  stat->limit_lemma_count = dict_info_.limit_lemma_count;
  stat->limit_lemma_size = dict_info_.limit_lemma_size;
  stat->reclaim_ratio = dict_info_.reclaim_ratio;
  return true;
}
|
|
|
|
// Configure capacity caps (0 means unlimited) and the reclaim percentage,
// which is clamped to at most 100.
void UserDict::set_limit(uint32 max_lemma_count,
                         uint32 max_lemma_size, uint32 reclaim_ratio) {
  dict_info_.limit_lemma_count = max_lemma_count;
  dict_info_.limit_lemma_size = max_lemma_size;
  dict_info_.reclaim_ratio = (reclaim_ratio > 100) ? 100 : reclaim_ratio;
}
|
|
|
|
// Remove the reclaim_ratio% lowest-scoring lemmas (by count only, not
// size). Selection uses a max-heap of size rc holding the rc smallest
// scores seen so far; its contents at the end are the victims.
void UserDict::reclaim() {
  if (is_valid_state() == false)
    return;

  switch (dict_info_.reclaim_ratio) {
    case 0:
      // Reclaiming disabled.
      return;
    case 100:
      // TODO: CLEAR to be implemented
      assert(false);
      return;
    default:
      break;
  }

  // XXX Reclaim is only based on count, not size
  uint32 count = dict_info_.lemma_count;
  int rc = count * dict_info_.reclaim_ratio / 100;

  UserDictScoreOffsetPair * score_offset_pairs = NULL;
  score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
      sizeof(UserDictScoreOffsetPair) * rc);
  if (score_offset_pairs == NULL) {
    return;
  }

  // Seed the heap with the first rc entries.
  for (int i = 0; i < rc; i++) {
    int s = scores_[i];
    score_offset_pairs[i].score = s;
    score_offset_pairs[i].offset_index = i;
  }

  // Heapify (max-heap on score; root holds the largest kept score).
  for (int i = (rc + 1) / 2; i >= 0; i--)
    shift_down(score_offset_pairs, i, rc);

  // Stream the remaining entries: anything smaller than the current
  // largest kept score replaces the root.
  for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
    int s = scores_[i];
    if (s < score_offset_pairs[0].score) {
      score_offset_pairs[0].score = s;
      score_offset_pairs[0].offset_index = i;
      shift_down(score_offset_pairs, 0, rc);
    }
  }

  // The heap now holds the rc lowest-scoring entries; remove them.
  for (int i = 0; i < rc; i++) {
    int off = score_offset_pairs[i].offset_index;
    remove_lemma_by_offset_index(off);
  }
  if (rc > 0) {
    if (state_ < USER_DICT_OFFSET_DIRTY)
      state_ = USER_DICT_OFFSET_DIRTY;
  }

  free(score_offset_pairs);
}
|
|
|
|
// Exchange entries i and j of the score/offset-index heap array.
inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
  const int saved_score = sop[j].score;
  const int saved_index = sop[j].offset_index;
  sop[j].score = sop[i].score;
  sop[j].offset_index = sop[i].offset_index;
  sop[i].score = saved_score;
  sop[i].offset_index = saved_index;
}
|
|
|
|
// Sift the entry at index i down a max-heap (on score) of n elements,
// restoring the heap property along its path.
// FIX: the original compared children with strict `>` in both directions,
// so when the two children had EQUAL scores both greater than the parent,
// neither branch fired and the sift stopped early, leaving the heap
// property violated (reclaim could then evict the wrong lemmas). Picking
// the larger child first (ties go left) removes that hole.
void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
  int par = i;
  while (par < n) {
    int left = par * 2 + 1;
    int right = left + 1;
    // No children at all: position is final.
    if (left >= n)
      break;
    // Choose the larger child; left wins ties.
    int child = left;
    if (right < n && sop[right].score > sop[left].score)
      child = right;
    if (sop[child].score > sop[par].score) {
      swap(sop, child, par);
      par = child;
    } else {
      break;
    }
  }
}
|
|
|
|
// Public insert/update entry point: stamps the lemma with the current
// wall-clock time and delegates to the internal implementation.
LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
                                uint16 lemma_len, uint16 count) {
  uint64 now = time(NULL);
  return _put_lemma(lemma_str, splids, lemma_len, count, now);
}
|
|
|
|
// Insert a new lemma or refresh an existing one's score/timestamp.
// Returns the lemma id, or 0 when the dictionary is not loaded, a
// configured limit is hit, or appending fails.
LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
                                 uint16 lemma_len, uint16 count, uint64 lmt) {
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  if (is_valid_state() == false)
    return 0;
  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
  if (off != -1) {
    // Existing lemma: overwrite its score with the new (lmt, count) pair
    // and adjust the running total frequency by the delta.
    int delta_score = count - scores_[off];
    dict_info_.total_nfreq += delta_score;
    scores_[off] = build_score(lmt, count);
    if (state_ < USER_DICT_SCORE_DIRTY)
      state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
    DEBUG_PERF_END;
    LOGD_PERF("_put_lemma(update)");
#endif
    return ids_[off];
  } else {
    // New lemma: refuse when a configured count/size cap would be exceeded.
    if ((dict_info_.limit_lemma_count > 0 &&
        dict_info_.lemma_count >= dict_info_.limit_lemma_count)
        || (dict_info_.limit_lemma_size > 0 &&
            dict_info_.lemma_size + (2 + (lemma_len << 2))
            > dict_info_.limit_lemma_size)) {
      // XXX Don't defragment here, it's too time-consuming.
      return 0;
    }
    int flushed = 0;
    if (lemma_count_left_ == 0 ||
        lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {

      // XXX When there is no space for new lemma, we flush to disk
      // flush_cache() may be called by upper user
      // and a better place should be found instead of here
      flush_cache();
      flushed = 1;
      // Or simply return and do nothing
      // return 0;
    }
#ifdef ___DEBUG_PERF___
    DEBUG_PERF_END;
    LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
#endif
    LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
#ifdef ___SYNC_ENABLED___
    // syncs_ is NULLed by put_lemma_no_sync to suppress queueing here.
    if (syncs_ && id != 0) {
      queue_lemma_for_sync(id);
    }
#endif
    return id;
  }
  return 0;
}
|
|
|
|
#ifdef ___SYNC_ENABLED___
// Append lemma `id`'s storage offset to the pending-sync queue, growing
// the queue by kUserDictPreAlloc slots when it is full. If the grow fails,
// the entry is silently dropped (same as the original behavior).
void UserDict::queue_lemma_for_sync(LemmaIdType id) {
  if (dict_info_.sync_count >= sync_count_size_) {
    // Queue is full: try to enlarge it before appending.
    uint32 * grown = (uint32*)realloc(
        syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
    if (!grown)
      return;
    syncs_ = grown;
    sync_count_size_ += kUserDictPreAlloc;
  }
  syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
}
#endif
|
|
|
|
// Adjust an existing lemma's frequency by delta_count (clamped so the
// total never exceeds kUserDictMaxFrequency or wraps), refreshing its
// timestamp when the user actually selected it. Returns the lemma id, or
// 0 when the dictionary is not loaded or the id is invalid/not found.
LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
                                   bool selected) {
#ifdef ___DEBUG_PERF___
  DEBUG_PERF_BEGIN;
#endif
  if (is_valid_state() == false)
    return 0;
  if (is_valid_lemma_id(lemma_id) == false)
    return 0;
  // Recover the lemma's stored spelling ids and word from its offset.
  uint32 offset = offsets_by_id_[lemma_id - start_id_];
  uint8 lemma_len = get_lemma_nchar(offset);
  char16 * lemma_str = get_lemma_word(offset);
  uint16 * splids = get_lemma_spell_ids(offset);

  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
  if (off != -1) {
    // score packs (last-modified-time, frequency); unpack both parts.
    int score = scores_[off];
    int count = extract_score_freq(score);
    uint64 lmt = extract_score_lmt(score);
    // Clamp: cap at the max frequency and guard against wrap-around.
    if (count + delta_count > kUserDictMaxFrequency ||
        count + delta_count < count) {
      delta_count = kUserDictMaxFrequency - count;
    }
    count += delta_count;
    dict_info_.total_nfreq += delta_count;
    if (selected) {
      // Only an explicit user selection refreshes the timestamp.
      lmt = time(NULL);
    }
    scores_[off] = build_score(lmt, count);
    if (state_ < USER_DICT_SCORE_DIRTY)
      state_ = USER_DICT_SCORE_DIRTY;
#ifdef ___DEBUG_PERF___
    DEBUG_PERF_END;
    LOGD_PERF("update_lemma");
#endif
#ifdef ___SYNC_ENABLED___
    queue_lemma_for_sync(ids_[off]);
#endif
    return ids_[off];
  }
  return 0;
}
|
|
|
|
// Total accumulated frequency across all lemmas in this dictionary
// (maintained incrementally by the put/update paths).
size_t UserDict::get_total_lemma_count() {
  return dict_info_.total_nfreq;
}
|
|
|
|
// Record the total lemma frequency owned by the other dictionaries,
// presumably for cross-dictionary score normalization — confirm at callers.
void UserDict::set_total_lemma_count_of_others(size_t count) {
  total_other_nfreq_ = count;
}
|
|
|
|
// Append a brand-new lemma record to the in-memory arrays and keep the
// index arrays sorted. Record layout in lemmas_:
//   [flag:1][nchar:1][splids: nchar*2 bytes][word: nchar*2 bytes].
// offsets_/scores_/ids_ stay sorted by spelling (for binary search);
// predicts_ stays sorted by word. Returns the new id, or 0 when the
// lemma block offset no longer fits in the offset encoding.
LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
                                     uint16 lemma_len, uint16 count, uint64 lmt) {
  LemmaIdType id = get_max_lemma_id() + 1;
  size_t offset = dict_info_.lemma_size;
  // Offsets share their top bits with flags; reject unencodable offsets.
  if (offset > kUserDictOffsetMask)
    return 0;

  // Write the record: flag byte, length byte, spell ids, then the word.
  lemmas_[offset] = 0;
  lemmas_[offset + 1] = (uint8)lemma_len;
  for (size_t i = 0; i < lemma_len; i++) {
    *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
    *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
        = lemma_str[i];
  }
  // Provisionally append to the tail of each index array.
  uint32 off = dict_info_.lemma_count;
  offsets_[off] = offset;
  scores_[off] = build_score(lmt, count);
  ids_[off] = id;
#ifdef ___PREDICT_ENABLED___
  predicts_[off] = offset;
#endif

  offsets_by_id_[id - start_id_] = offset;

  dict_info_.lemma_count++;
  dict_info_.lemma_size += (2 + (lemma_len << 2));
  lemma_count_left_--;
  lemma_size_left_ -= (2 + (lemma_len << 2));

  // Sort

  UserDictSearchable searchable;
  prepare_locate(&searchable, splids, lemma_len);

  // Find the insertion point that keeps offsets_ sorted by spelling.
  size_t i = 0;
  while (i < off) {
    offset = offsets_[i];
    uint32 nchar = get_lemma_nchar(offset);
    uint16 * spl = get_lemma_spell_ids(offset);

    if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
      break;
    i++;
  }
  // Rotate the tail entry into position i across all three arrays.
  if (i != off) {
    uint32 temp = offsets_[off];
    memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
    offsets_[i] = temp;

    temp = scores_[off];
    memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
    scores_[i] = temp;

    temp = ids_[off];
    memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
    ids_[i] = temp;
  }

#ifdef ___PREDICT_ENABLED___
  // Keep predicts_ sorted by word for prefix prediction.
  uint32 j = 0;
  uint16 * words_new = get_lemma_word(predicts_[off]);
  j = locate_where_to_insert_in_predicts(words_new, lemma_len);
  if (j != off) {
    uint32 temp = predicts_[off];
    memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
    predicts_[j] = temp;
  }
#endif

  if (state_ < USER_DICT_LEMMA_DIRTY)
    state_ = USER_DICT_LEMMA_DIRTY;

#ifdef ___CACHE_ENABLED___
  // Cached search windows are stale after insertion; drop them all.
  cache_init();
#endif

  dict_info_.total_nfreq += count;
  return id;
}
|
|
}
|