![Uwe Rathmann](/assets/img/avatar_default.png)
The 3rdparty files are now compiled as part of the corresponding input method, so that the project files can be written without using platform specific linker flags.
158 lines
4.8 KiB
C++
158 lines
4.8 KiB
C++
/*
|
|
* Copyright (C) 2009 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef PINYINIME_INCLUDE_DICTDEF_H__
|
|
#define PINYINIME_INCLUDE_DICTDEF_H__
|
|
|
|
#include <stdlib.h>
|
|
#include "./utf16char.h"
|
|
|
|
namespace ime_pinyin {
|
|
|
|
// Enable the following line when building the binary dictionary model.
|
|
// #define ___BUILD_MODEL___
|
|
|
|
// Integer aliases named by signedness and bit width.
// NOTE(review): the names assume 8-bit char, 16-bit short, 32-bit int and
// 64-bit long long; nothing in this header verifies those widths.
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;

typedef signed char int8;
typedef short int16;
typedef int int32;
typedef long long int64;
typedef unsigned long long uint64;
|
|
|
|
// Compile-time debug switches; all three are disabled.
// NOTE(review): presumably each one gates a different level of diagnostic
// output; the code that reads them is not part of this header.
const bool kPrintDebug0 = false;
const bool kPrintDebug1 = false;
const bool kPrintDebug2 = false;
|
|
|
|
// The max length of a lemma.
const size_t kMaxLemmaSize = 8;

// The max length of a Pinyin (spelling).
const size_t kMaxPinyinSize = 6;
|
|
|
|
// The number of half spelling ids. For Chinese Pinyin, there are 30 half ids.
// See SpellingTrie.h for details.
// NOTE(review): the comment says 30 while the constant is 29 -- presumably
// id 0 is not counted (compare the "-1" remark below); confirm against
// SpellingTrie.h.
const size_t kHalfSpellingIdNum = 29;

// The maximum number of full spellings. For Chinese Pinyin, there are only
// about 410 spellings.
// If this value is made bigger (so that it needs more bits), please also
// update other structures like SpellingNode, to make sure that a spelling id
// can be stored.
// The -1 is because 0 is never used.
const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;
const size_t kMaxSearchSteps = 40;
|
|
|
|
// One character predicts its following characters.
|
|
const size_t kMaxPredictSize = (kMaxLemmaSize - 1);
|
|
|
|
// LemmaIdType must always be size_t.
typedef size_t LemmaIdType;

// Actually, an id occupies 3 bytes in storage.
const size_t kLemmaIdSize = 3;

// 0xffffff is the largest value that fits in kLemmaIdSize (3) bytes.
// NOTE(review): presumably reserved as the id of the lemma currently being
// composed -- confirm against the code that uses it.
const size_t kLemmaIdComposing = 0xffffff;
|
|
|
|
// Score types; both are aliases of uint16.
// NOTE(review): presumably LmaScoreType scores lemmas and KeyScoreType scores
// key input -- their users are not visible in this header.
typedef uint16 LmaScoreType;
typedef uint16 KeyScoreType;
|
|
|
|
// Number of highest-scoring items kept for prediction purposes.
const size_t kTopScoreLemmaNum = 10;

// Prediction count limits.
// NOTE(review): judging by the names, these presumably cap the number of
// predictions made from a history longer than 3, of exactly 3, and of
// exactly 2 characters respectively -- confirm against the prediction code.
const size_t kMaxPredictNumByGt3 = 1;
const size_t kMaxPredictNumBy3 = 2;
const size_t kMaxPredictNumBy2 = 2;
|
|
|
|
// The last lemma id (included) for the system dictionary. The system
|
|
// dictionary's ids always start from 1.
|
|
const LemmaIdType kSysDictIdEnd = 500000;
|
|
|
|
// The first lemma id for the user dictionary.
|
|
const LemmaIdType kUserDictIdStart = 500001;
|
|
|
|
// The last lemma id (included) for the user dictionary.
|
|
const LemmaIdType kUserDictIdEnd = 600000;
|
|
|
|
typedef struct {
|
|
uint16 half_splid:5;
|
|
uint16 full_splid:11;
|
|
} SpellingId, *PSpellingId;
|
|
|
|
|
|
/**
|
|
* We use different node types for different layers
|
|
* Statistical data of the building result for a testing dictionary:
|
|
* root, level 0, level 1, level 2, level 3
|
|
* max son num of one node: 406 280 41 2 -
|
|
* max homo num of one node: 0 90 23 2 2
|
|
* total node num of a layer: 1 406 31766 13516 993
|
|
* total homo num of a layer: 9 5674 44609 12667 995
|
|
*
|
|
* The node number for root and level 0 won't be larger than 500
|
|
* According to the information above, two kinds of nodes can be used; one for
|
|
* root and level 0, the other for these layers deeper than 0.
|
|
*
|
|
* LE = less and equal,
|
|
* A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K
|
|
*/
|
|
struct LmaNodeLE0 {
|
|
uint32 son_1st_off;
|
|
uint32 homo_idx_buf_off;
|
|
uint16 spl_idx;
|
|
uint16 num_of_son;
|
|
uint16 num_of_homo;
|
|
};
|
|
|
|
/**
|
|
* GE = great and equal
|
|
* A node occupies 8 bytes.
|
|
*/
|
|
struct LmaNodeGE1 {
|
|
uint16 son_1st_off_l; // Low bits of the son_1st_off
|
|
uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1
|
|
uint16 spl_idx;
|
|
unsigned char num_of_son; // number of son nodes
|
|
unsigned char num_of_homo; // number of homo words
|
|
unsigned char son_1st_off_h; // high bits of the son_1st_off
|
|
unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off
|
|
};
|
|
|
|
#ifdef ___BUILD_MODEL___
|
|
struct SingleCharItem {
|
|
float freq;
|
|
char16 hz;
|
|
SpellingId splid;
|
|
};
|
|
|
|
struct LemmaEntry {
|
|
LemmaIdType idx_by_py;
|
|
LemmaIdType idx_by_hz;
|
|
char16 hanzi_str[kMaxLemmaSize + 1];
|
|
|
|
// The SingleCharItem id for each Hanzi.
|
|
uint16 hanzi_scis_ids[kMaxLemmaSize];
|
|
|
|
uint16 spl_idx_arr[kMaxLemmaSize + 1];
|
|
char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
|
|
unsigned char hz_str_len;
|
|
float freq;
|
|
};
|
|
#endif // ___BUILD_MODEL___
|
|
|
|
} // namespace ime_pinyin
|
|
|
|
#endif // PINYINIME_INCLUDE_DICTDEF_H__
|