3rdparty moved to inputcontext/3rdparty.

The 3rdparty files are now compiled as part of the corresponding input
method, so that the project files can be written without using platform
specific linker flags.
This commit is contained in:
Uwe Rathmann 2018-03-30 18:31:13 +02:00
parent 9bf518145d
commit 59b516a118
131 changed files with 172 additions and 36510 deletions

View File

@ -1,2 +0,0 @@
# Aggregating project: builds nothing itself, only descends into the
# sub-projects listed in SUBDIRS.
TEMPLATE = subdirs
SUBDIRS = hunspell pinyin

View File

@ -1,7 +0,0 @@
# Module registration for the bundled hunspell sources.
# NOTE(review): addModule is not defined in this file; presumably it comes
# from functions.pri included below -- confirm its argument contract there.
include( $${GLOBAL_PRI_FOLDER}/functions.pri )
# No dependencies and no extra include paths are declared for this module.
DEPS =
INCS =
# Appends the value returned by addModule (name, version, this file, INCS, DEPS)
# to REGISTERED_MODULES.
REGISTERED_MODULES += $$addModule( inputcontext_hunspell, 0.0.0, $${_FILE_}, $$INCS, $$DEPS )

View File

@ -1,49 +0,0 @@
# Builds the bundled hunspell sources as the static library
# "qskinputcontext_hunspell".

# Build-tree root, two levels above this project's output directory.
QSK_OUT_ROOT = $${OUT_PWD}/../..
# NOTE(review): QSK_PLUGIN_DIR is set here but not referenced again in this file.
QSK_PLUGIN_DIR = $${QSK_OUT_ROOT}/plugins
TEMPLATE = lib
TARGET = qskinputcontext_hunspell
CONFIG += static
# warn_off: compiler warnings are disabled for this third-party code.
CONFIG += precompile_header warn_off
MODULE_INCLUDEPATH = $$PWD/src
# The library is written next to the platform input context plugins.
DESTDIR = $${QSK_OUT_ROOT}/plugins/platforminputcontexts
# NOTE(review): DESTDIR_LIBS is not defined in this file -- confirm where it is set.
QMAKE_RPATHDIR *= $${DESTDIR_LIBS}
# hunspell implementation files
SOURCES += \
src/hunspell/affentry.cxx \
src/hunspell/affixmgr.cxx \
src/hunspell/csutil.cxx \
src/hunspell/filemgr.cxx \
src/hunspell/hashmgr.cxx \
src/hunspell/hunspell.cxx \
src/hunspell/hunzip.cxx \
src/hunspell/phonet.cxx \
src/hunspell/replist.cxx \
src/hunspell/suggestmgr.cxx
# hunspell headers
HEADERS += \
src/hunspell/affentry.hxx \
src/hunspell/affixmgr.hxx \
src/hunspell/atypes.hxx \
src/hunspell/baseaffix.hxx \
src/hunspell/csutil.hxx \
src/hunspell/filemgr.hxx \
src/hunspell/hashmgr.hxx \
src/hunspell/htypes.hxx \
src/hunspell/hunspell.h \
src/hunspell/hunspell.hxx \
src/hunspell/hunvisapi.h \
src/hunspell/hunzip.hxx \
src/hunspell/langnum.hxx \
src/hunspell/phonet.hxx \
src/hunspell/replist.hxx \
src/hunspell/suggestmgr.hxx \
src/hunspell/w_char.hxx
# Listed for the project tree only; OTHER_FILES entries are not compiled.
# Note that utf_info.cxx is listed here rather than in SOURCES.
OTHER_FILES +=\
src/hunspell/license.hunspell \
src/hunspell/license.myspell \
src/hunspell/utf_info.cxx

View File

@ -1,983 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include "affentry.hxx"
#include "csutil.hxx"
// Releases the heap buffers referenced by this entry:
//  - c.l.conds2 when the aeLONGCOND option bit is set,
//  - morphcode unless the aeALIASM option bit is set,
//  - contclass unless the aeALIASF option bit is set.
// NOTE(review): the alias bits presumably mean the buffer is owned
// elsewhere -- confirm against the code that sets them.
AffEntry::~AffEntry() {
  const bool hasLongCondition = (opts & aeLONGCOND) != 0;
  const bool morphIsAliased = (opts & aeALIASM) != 0;
  const bool contIsAliased = (opts & aeALIASF) != 0;

  if (hasLongCondition) {
    free(c.l.conds2);
  }
  if (!morphIsAliased && morphcode) {
    free(morphcode);
  }
  if (!contIsAliased && contclass) {
    free(contclass);
  }
}
// Creates a prefix entry bound to the affix manager `pmgr`.
// All four link pointers (next, nexteq, nextne, flgnxt) start out NULL.
PfxEntry::PfxEntry(AffixMgr* pmgr)
    : pmyMgr(pmgr),  // register affix manager
      next(NULL),
      nexteq(NULL),
      nextne(NULL),
      flgnxt(NULL) {}
// Applies this prefix to `word` (of length `len`).
//
// Returns the prefixed word, or an empty string when the entry does not
// apply. The entry applies only if, in this order:
//  - the word is longer than the strip string (or is empty and the
//    manager allows full stripping),
//  - the word has at least numconds characters,
//  - the entry's condition pattern matches the word,
//  - the word starts with the strip string (when there is one).
std::string PfxEntry::add(const char* word, size_t len) {
  std::string prefixed;
  const size_t stripLen = strip.size();

  if (!(len > stripLen || (len == 0 && pmyMgr->get_fullstrip())))
    return prefixed;
  if (!(len >= numconds))
    return prefixed;
  if (!test_condition(word))
    return prefixed;
  if (stripLen != 0 && strncmp(word, strip.c_str(), stripLen) != 0)
    return prefixed;

  // Match: drop the stripped characters and put the prefix in front.
  prefixed = appnd;
  prefixed += word + stripLen;
  return prefixed;
}
// Steps a cursor one byte forward inside the condition pattern.
//
// Returns the new cursor, or NULL when `p` is NULL, when the end of the
// pattern storage is reached, or when the next byte is the terminating '\0'.
// With aeLONGCOND set, stepping onto c.conds + MAXCONDLEN_1 continues at
// c.l.conds2; otherwise stepping onto c.conds + MAXCONDLEN ends the pattern.
inline char* PfxEntry::nextchar(char* p) {
  if (!p)
    return NULL;

  char* const following = p + 1;
  if (opts & aeLONGCOND) {
    // jump to the 2nd part of the condition
    if (following == c.conds + MAXCONDLEN_1)
      return c.l.conds2;
  } else if (following == c.conds + MAXCONDLEN) {
    // end of the MAXCONDLEN length condition
    return NULL;
  }

  if (*following == '\0')
    return NULL;
  return following;
}
// Tests the beginning of `st` against this entry's condition pattern.
//
// Returns 1 when the pattern matches (or when numconds == 0), 0 otherwise.
// The pattern is read from c.conds via nextchar(). Bytes handled specially:
//   '['  opens a character group and remembers the current input position
//   '^'  sets the negation flag
//   ']'  closes the group and decides whether it matched
//   '.'  outside a group, consumes one input character
// Any other byte is compared literally against the input. When the aeUTF8
// option bit is set, bytes of the form 10xxxxxx are treated as continuation
// bytes and consumed together with the byte that precedes them.
inline int PfxEntry::test_condition(const char* st) {
const char* pos = NULL; // group with pos input position
bool neg = false; // complementer
bool ingroup = false; // character in the group
if (numconds == 0)
return 1;
char* p = c.conds;
while (1) {
switch (*p) {
case '\0':
return 1;
case '[': {
// start of a group: reset group state and remember where the input was
neg = false;
ingroup = false;
p = nextchar(p);
pos = st;
break;
}
case '^': {
p = nextchar(p);
neg = true;
break;
}
case ']': {
// fail if a negated group matched, or a plain group did not match
if ((neg && ingroup) || (!neg && !ingroup))
return 0;
pos = NULL;
p = nextchar(p);
// skip the next character
if (!ingroup && *st)
for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
;
if (*st == '\0' && p)
return 0; // word <= condition
break;
}
case '.':
if (!pos) { // dots are not metacharacters in groups: [.]
p = nextchar(p);
// skip the next character
for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++)
;
if (*st == '\0' && p)
return 0; // word <= condition
break;
}
/* FALLTHROUGH */
default: {
if (*st == *p) {
st++;
p = nextchar(p);
if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte
while (p && (*p & 0xc0) == 0x80) { // character
if (*p != *st) {
// mismatch in a continuation byte: fatal outside a group,
// inside a group rewind the input to the group start
if (!pos)
return 0;
st = pos;
break;
}
p = nextchar(p);
st++;
}
if (pos && st != pos) {
// group member matched: skip the rest of the group up to ']'
ingroup = true;
while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
}
}
} else if (pos) {
// single-byte group member matched: skip the rest of the group up to ']'
ingroup = true;
while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
}
}
} else if (pos) { // group
p = nextchar(p);
} else
return 0;
}
}
// pattern exhausted without a mismatch
if (!p)
return 1;
}
}
// Checks whether `word` (length `len`) is a dictionary root carrying this
// prefix.
//
// On entry the prefix is either empty or already known to match the start of
// the word. The candidate root is the strip string followed by the word
// without the prefix; see the appendix at the end of this file for what the
// condition test checks.
//
// Returns the first homonym of the root that
//  - carries this prefix's flag,
//  - is not ruled out by the needaffix flag in this prefix's continuation
//    class, and
//  - satisfies `needflag` (via the root's flags or the continuation class).
// If no homonym qualifies and this prefix allows cross products, the result
// of the manager's suffix check on the root is returned instead. Returns NULL
// when nothing matches. `in_compound` is only forwarded to that suffix check.
struct hentry* PfxEntry::checkword(const char* word,
                                   int len,
                                   char in_compound,
                                   const FLAG needflag) {
  int rootLen = len - appnd.size();  // length of the word without the prefix
  if (!(rootLen > 0 || (rootLen == 0 && pmyMgr->get_fullstrip())))
    return NULL;

  // Rebuild the root: put the stripped characters back in front of the
  // remainder of the word.
  std::string root(strip);
  root.append(word + appnd.size());

  if (!test_condition(root.c_str()))
    return NULL;

  rootLen += strip.size();

  for (struct hentry* entry = pmyMgr->lookup(root.c_str()); entry != NULL;
       entry = entry->next_homonym) {
    if (TESTAFF(entry->astr, aflag, entry->alen) &&
        // forbid single prefixes with needaffix flag
        !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
        // needflag
        ((!needflag) || TESTAFF(entry->astr, needflag, entry->alen) ||
         (contclass && TESTAFF(contclass, needflag, contclasslen))))
      return entry;
  }

  // Prefix matched but no root word was found: if cross products are
  // allowed, try again cross checked combined with a suffix.
  if (opts & aeXPRODUCT) {
    return pmyMgr->suffix_check(root.c_str(), rootLen, aeXPRODUCT, this,
                                FLAG_NULL, needflag, in_compound);
  }
  return NULL;
}
// Checks whether `word` is this prefix combined with a two-level suffix.
//
// On entry the prefix is either empty or already known to match the start of
// the word. The candidate root (strip string + word without the prefix) must
// be long enough for numconds and pass the condition test; see the appendix
// at the end of this file. The manager's two-suffix check is then run on it,
// but only when this prefix allows cross products and `in_compound` is not
// IN_CPD_BEGIN. Returns that check's result, or NULL.
struct hentry* PfxEntry::check_twosfx(const char* word,
                                      int len,
                                      char in_compound,
                                      const FLAG needflag) {
  int rootLen = len - appnd.size();  // length of the word without the prefix
  if (!(rootLen > 0 || (rootLen == 0 && pmyMgr->get_fullstrip())))
    return NULL;
  if (!(rootLen + strip.size() >= numconds))
    return NULL;

  // Rebuild the root: stripped characters followed by the rest of the word.
  std::string root(strip);
  root.append(word + appnd.size());

  if (!test_condition(root.c_str()))
    return NULL;

  rootLen += strip.size();

  // Cross check combined with a suffix, if cross products are allowed.
  if (!(opts & aeXPRODUCT) || in_compound == IN_CPD_BEGIN)
    return NULL;

  return pmyMgr->suffix_check_twosfx(root.c_str(), rootLen, aeXPRODUCT, this,
                                     needflag);
}
// Morphological-analysis counterpart of check_twosfx().
//
// Performs the same length and condition tests on the candidate root (strip
// string + word without the prefix). When this prefix allows cross products
// and `in_compound` is not IN_CPD_BEGIN, returns the string produced by the
// manager's suffix_check_twosfx_morph(); otherwise returns an empty string.
std::string PfxEntry::check_twosfx_morph(const char* word,
                                         int len,
                                         char in_compound,
                                         const FLAG needflag) {
  std::string analysis;

  int rootLen = len - appnd.size();  // length of the word without the prefix
  if (!(rootLen > 0 || (rootLen == 0 && pmyMgr->get_fullstrip())))
    return analysis;
  if (!(rootLen + strip.size() >= numconds))
    return analysis;

  // Rebuild the root: stripped characters followed by the rest of the word.
  std::string root(strip);
  root.append(word + appnd.size());

  if (!test_condition(root.c_str()))
    return analysis;

  rootLen += strip.size();

  // Cross check combined with a suffix, if cross products are allowed.
  if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
    analysis = pmyMgr->suffix_check_twosfx_morph(root.c_str(), rootLen,
                                                 aeXPRODUCT, this, needflag);
  }
  return analysis;
}
// Morphological-analysis counterpart of checkword().
//
// For every homonym of the candidate root that accepts this prefix (same
// tests as checkword()), one line is appended to the result:
//   - " " + morphcode if this entry has one, otherwise the prefix key,
//   - " " + MORPH_STEM + root word, unless the entry already has a stem field,
//   - " " + the entry's morphological data if present, otherwise
//     " " + MORPH_FLAG + the encoded flag of this prefix,
//   - a terminating newline.
// If this prefix allows cross products and `in_compound` is not
// IN_CPD_BEGIN, the output of the manager's suffix_check_morph() on the root
// is appended as well. Returns an empty string when nothing matches.
std::string PfxEntry::check_morph(const char* word,
                                  int len,
                                  char in_compound,
                                  const FLAG needflag) {
  std::string analysis;

  int rootLen = len - appnd.size();  // length of the word without the prefix
  if (!(rootLen > 0 || (rootLen == 0 && pmyMgr->get_fullstrip())))
    return analysis;
  if (!(rootLen + strip.size() >= numconds))
    return analysis;

  // Rebuild the root: stripped characters followed by the rest of the word.
  std::string root(strip);
  root.append(word + appnd.size());

  // See the appendix at the end of this file for what is being tested.
  if (!test_condition(root.c_str()))
    return analysis;

  rootLen += strip.size();

  for (struct hentry* entry = pmyMgr->lookup(root.c_str()); entry != NULL;
       entry = entry->next_homonym) {
    const bool accepted =
        TESTAFF(entry->astr, aflag, entry->alen) &&
        // forbid single prefixes with needaffix flag
        !TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&
        // needflag
        ((!needflag) || TESTAFF(entry->astr, needflag, entry->alen) ||
         (contclass && TESTAFF(contclass, needflag, contclasslen)));
    if (!accepted)
      continue;

    if (morphcode) {
      analysis.append(" ");
      analysis.append(morphcode);
    } else {
      analysis.append(getKey());
    }

    if (!HENTRY_FIND(entry, MORPH_STEM)) {
      analysis.append(" ");
      analysis.append(MORPH_STEM);
      analysis.append(HENTRY_WORD(entry));
    }

    if (HENTRY_DATA(entry)) {
      // morphological data stored with the hash entry
      analysis.append(" ");
      analysis.append(HENTRY_DATA2(entry));
    } else {
      // no data: emit the flag as debug information
      char* encoded = pmyMgr->encode_flag(getFlag());
      analysis.append(" ");
      analysis.append(MORPH_FLAG);
      analysis.append(encoded);
      free(encoded);
    }
    analysis.append("\n");
  }

  // If cross products are allowed, also try the root cross checked
  // combined with a suffix.
  if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
    const std::string suffixed = pmyMgr->suffix_check_morph(
        root.c_str(), rootLen, aeXPRODUCT, this, FLAG_NULL, needflag);
    if (!suffixed.empty())
      analysis.append(suffixed);
  }
  return analysis;
}
// Creates a suffix entry bound to the affix manager `pmgr`.
// All link pointers (next, nexteq, nextne, flgnxt, l_morph, r_morph,
// eq_morph) start out NULL.
SfxEntry::SfxEntry(AffixMgr* pmgr)
    : pmyMgr(pmgr),  // register affix manager
      next(NULL),
      nexteq(NULL),
      nextne(NULL),
      flgnxt(NULL),
      l_morph(NULL),
      r_morph(NULL),
      eq_morph(NULL) {}
// Applies this suffix to `word` (of length `len`).
//
// Returns the suffixed word, or an empty string when the entry does not
// apply. The entry applies only if, in this order:
//  - the word is longer than the strip string (or is empty and the
//    manager allows full stripping),
//  - the word has at least numconds characters,
//  - the entry's condition pattern matches the end of the word,
//  - the word ends with the strip string (when there is one).
std::string SfxEntry::add(const char* word, size_t len) {
  std::string suffixed;
  const size_t stripLen = strip.size();

  if (!(len > stripLen || (len == 0 && pmyMgr->get_fullstrip())))
    return suffixed;
  if (!(len >= numconds))
    return suffixed;
  if (!test_condition(word + len, word))
    return suffixed;
  if (stripLen != 0 && strcmp(word + len - stripLen, strip.c_str()) != 0)
    return suffixed;

  // Match: replace everything from the strip position onwards by the suffix.
  suffixed = word;
  suffixed.replace(len - stripLen, std::string::npos, appnd);
  return suffixed;
}
// Steps a cursor one byte forward inside the condition pattern.
//
// Returns the new cursor, or NULL when `p` is NULL, when the end of the
// pattern storage is reached, or when the next byte is the terminating '\0'.
// With aeLONGCOND set, stepping onto c.l.conds1 + MAXCONDLEN_1 continues at
// c.l.conds2; otherwise stepping onto c.conds + MAXCONDLEN ends the pattern.
inline char* SfxEntry::nextchar(char* p) {
  if (!p)
    return NULL;

  char* const following = p + 1;
  if (opts & aeLONGCOND) {
    // jump to the 2nd part of the condition
    if (following == c.l.conds1 + MAXCONDLEN_1)
      return c.l.conds2;
  } else if (following == c.conds + MAXCONDLEN) {
    // end of the MAXCONDLEN length condition
    return NULL;
  }

  if (*following == '\0')
    return NULL;
  return following;
}
// Tests the end of a word against this entry's condition pattern.
//
// `st` points one past the last character to test and `beg` at the first
// character of the word; the input is scanned backwards (st is decremented)
// while the pattern cursor moves forward through c.conds via nextchar().
// NOTE(review): this presumably relies on suffix conditions being stored in
// reversed order -- confirm against the code that fills c.conds.
//
// Returns 1 when the pattern matches (or when numconds == 0), 0 otherwise.
// Pattern bytes handled specially: '[' opens a group, '^' sets the negation
// flag, ']' closes a group, '.' outside a group consumes one character; any
// other byte is compared literally. `i` counts the conditions processed so
// far and allows an early success once it equals numconds. When the aeUTF8
// option bit is set, bytes of the form 10xxxxxx are treated as continuation
// bytes of a multibyte character.
inline int SfxEntry::test_condition(const char* st, const char* beg) {
const char* pos = NULL; // group with pos input position
bool neg = false; // complementer
bool ingroup = false; // character in the group
if (numconds == 0)
return 1;
char* p = c.conds;
// move to the last character of the word
st--;
int i = 1;
while (1) {
switch (*p) {
case '\0':
return 1;
case '[':
// start of a group: remember the current input position
p = nextchar(p);
pos = st;
break;
case '^':
p = nextchar(p);
neg = true;
break;
case ']':
// a plain (non-negated) group that matched nothing fails
if (!neg && !ingroup)
return 0;
i++;
// skip the next character
if (!ingroup) {
for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--)
;
st--;
}
// reset the group state for the next condition
pos = NULL;
neg = false;
ingroup = false;
p = nextchar(p);
if (st < beg && p)
return 0; // word <= condition
break;
case '.':
if (!pos) {
// dots are not metacharacters in groups: [.]
p = nextchar(p);
// skip the next character
for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80;
st--)
;
if (st < beg) { // word <= condition
if (p)
return 0;
else
return 1;
}
if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character
st--;
if (st < beg) { // word <= condition
if (p)
return 0;
else
return 1;
}
}
break;
}
/* FALLTHROUGH */
default: {
if (*st == *p) {
p = nextchar(p);
if ((opts & aeUTF8) && (*st & 0x80)) {
// multibyte character: compare the remaining bytes going backwards
st--;
while (p && (st >= beg)) {
if (*p != *st) {
// mismatch: fatal outside a group, inside a group rewind the
// input to the group start
if (!pos)
return 0;
st = pos;
break;
}
// first byte of the UTF-8 multibyte character
if ((*p & 0xc0) != 0x80)
break;
p = nextchar(p);
st--;
}
if (pos && st != pos) {
// a group member matched
if (neg)
return 0;
else if (i == numconds)
return 1;
ingroup = true;
// skip the rest of the group up to ']'
while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
}
st--;
}
if (p && *p != ']')
p = nextchar(p);
} else if (pos) {
// a single-byte group member matched
if (neg)
return 0;
else if (i == numconds)
return 1;
ingroup = true;
// skip the rest of the group up to ']'
while (p && *p != ']' && ((p = nextchar(p)) != NULL)) {
}
// if (p && *p != ']') p = nextchar(p);
st--;
}
if (!pos) {
// literal outside a group matched: one more condition done
i++;
st--;
}
if (st < beg && p && *p != ']')
return 0; // word <= condition
} else if (pos) { // group
p = nextchar(p);
} else
return 0;
}
}
// pattern exhausted without a mismatch
if (!p)
return 1;
}
}
// see if this suffix is present in the word
struct hentry* SfxEntry::checkword(const char* word,
int len,
int optflags,
PfxEntry* ppfx,
const FLAG cclass,
const FLAG needflag,
const FLAG badflag) {
struct hentry* he; // hash entry pointer
PfxEntry* ep = ppfx;
// if this suffix is being cross checked with a prefix
// but it does not support cross products skip it
if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
return NULL;
// upon entry suffix is 0 length or already matches the end of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
int tmpl = len - appnd.size(); // length of tmpword
// the second condition is not enough for UTF-8 strings
// it checked in test_condition()
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
// generate new root word by removing suffix and adding
// back any characters that would have been stripped or
// or null terminating the shorter string
std::string tmpstring(word, tmpl);
if (strip.size()) {
tmpstring.append(strip);
}
const char* tmpword = tmpstring.c_str();
const char* endword = tmpword + tmpstring.size();
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
// this file for more info on exactly what is being
// tested
// if all conditions are met then check if resulting
// root word in the dictionary
if (test_condition(endword, tmpword)) {
#ifdef SZOSZABLYA_POSSIBLE_ROOTS
fprintf(stdout, "%s %s %c\n", word, tmpword, aflag);
#endif
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
do {
// check conditional suffix (enabled by prefix)
if ((TESTAFF(he->astr, aflag, he->alen) ||
(ep && ep->getCont() &&
TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
(((optflags & aeXPRODUCT) == 0) ||
(ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||
// enabled by prefix
((contclass) &&
(ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))) &&
// handle cont. class
((!cclass) ||
((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
// check only in compound homonyms (bad flags)
(!badflag || !TESTAFF(he->astr, badflag, he->alen)) &&
// handle required flag
((!needflag) ||
(TESTAFF(he->astr, needflag, he->alen) ||
((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
return he;
he = he->next_homonym; // check homonyms
} while (he);
}
}
}
return NULL;
}
// see if two-level suffix is present in the word
struct hentry* SfxEntry::check_twosfx(const char* word,
int len,
int optflags,
PfxEntry* ppfx,
const FLAG needflag) {
PfxEntry* ep = ppfx;
// if this suffix is being cross checked with a prefix
// but it does not support cross products skip it
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
return NULL;
// upon entry suffix is 0 length or already matches the end of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
int tmpl = len - appnd.size(); // length of tmpword
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
// generate new root word by removing suffix and adding
// back any characters that would have been stripped or
// or null terminating the shorter string
std::string tmpword(word);
tmpword.resize(tmpl);
tmpword.append(strip);
tmpl += strip.size();
const char* beg = tmpword.c_str();
const char* end = beg + tmpl;
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
// this file for more info on exactly what is being
// tested
// if all conditions are met then recall suffix_check
if (test_condition(end, beg)) {
struct hentry* he; // hash entry pointer
if (ppfx) {
// handle conditional suffix
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
(FLAG)aflag, needflag, IN_CPD_NOT);
else
he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, optflags, ppfx,
(FLAG)aflag, needflag, IN_CPD_NOT);
} else {
he = pmyMgr->suffix_check(tmpword.c_str(), tmpl, 0, NULL,
(FLAG)aflag, needflag, IN_CPD_NOT);
}
if (he)
return he;
}
}
return NULL;
}
// see if two-level suffix is present in the word
std::string SfxEntry::check_twosfx_morph(const char* word,
int len,
int optflags,
PfxEntry* ppfx,
const FLAG needflag) {
PfxEntry* ep = ppfx;
std::string result;
// if this suffix is being cross checked with a prefix
// but it does not support cross products skip it
if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
return result;
// upon entry suffix is 0 length or already matches the end of the word.
// So if the remaining root word has positive length
// and if there are enough chars in root word and added back strip chars
// to meet the number of characters conditions, then test it
int tmpl = len - appnd.size(); // length of tmpword
if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&
(tmpl + strip.size() >= numconds)) {
// generate new root word by removing suffix and adding
// back any characters that would have been stripped or
// or null terminating the shorter string
std::string tmpword(word);
tmpword.resize(tmpl);
tmpword.append(strip);
tmpl += strip.size();
const char* beg = tmpword.c_str();
const char* end = beg + tmpl;
// now make sure all of the conditions on characters
// are met. Please see the appendix at the end of
// this file for more info on exactly what is being
// tested
// if all conditions are met then recall suffix_check
if (test_condition(end, beg)) {
if (ppfx) {
// handle conditional suffix
if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag,
needflag);
if (!st.empty()) {
if (ppfx->getMorph()) {
result.append(ppfx->getMorph());
result.append(" ");
}
result.append(st);
mychomp(result);
}
} else {
std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, optflags, ppfx, aflag,
needflag);
if (!st.empty()) {
result.append(st);
mychomp(result);
}
}
} else {
std::string st = pmyMgr->suffix_check_morph(tmpword.c_str(), tmpl, 0, NULL, aflag, needflag);
if (!st.empty()) {
result.append(st);
mychomp(result);
}
}
}
}
return result;
}
// get next homonym with same affix
struct hentry* SfxEntry::get_next_homonym(struct hentry* he,
int optflags,
PfxEntry* ppfx,
const FLAG cclass,
const FLAG needflag) {
PfxEntry* ep = ppfx;
FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;
while (he->next_homonym) {
he = he->next_homonym;
if ((TESTAFF(he->astr, aflag, he->alen) ||
(ep && ep->getCont() &&
TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
((optflags & aeXPRODUCT) == 0 || TESTAFF(he->astr, eFlag, he->alen) ||
// handle conditional suffix
((contclass) && TESTAFF(contclass, eFlag, contclasslen))) &&
// handle cont. class
((!cclass) ||
((contclass) && TESTAFF(contclass, cclass, contclasslen))) &&
// handle required flag
((!needflag) ||
(TESTAFF(he->astr, needflag, he->alen) ||
((contclass) && TESTAFF(contclass, needflag, contclasslen)))))
return he;
}
return NULL;
}
// Fills rappnd with a copy of the suffix string (appnd) and passes that copy
// to reverseword(), which modifies it in place.
void SfxEntry::initReverseWord() {
  rappnd.assign(appnd);
  reverseword(rappnd);
}
#if 0
Appendix: Understanding Affix Code
An affix is either a prefix or a suffix attached to root words to make
other words.
Basically a Prefix or a Suffix is a set of AffEntry objects
which store information about the prefix or suffix along
with supporting routines to check if a word has a particular
prefix or suffix or a combination.
The structure affentry is defined as follows:
struct affentry
{
unsigned short aflag; // ID used to represent the affix
std::string strip; // string to strip before adding affix
std::string appnd; // the affix string to add
char numconds; // the number of conditions that must be met
char opts; // flag: aeXPRODUCT- combine both prefix and suffix
char conds[SETSIZE]; // array which encodes the conditions to be met
};
Here is a suffix borrowed from the en_US.aff file. This file
is whitespace delimited.
SFX D Y 4
SFX D 0 d e
SFX D y ied [^aeiou]y
SFX D 0 ed [^ey]
SFX D 0 ed [aeiou]y
This information can be interpreted as follows:
The first line has 4 fields
Field
-----
1 SFX - indicates this is a suffix
2 D - is the name of the character flag which represents this suffix
3 Y - indicates it can be combined with prefixes (cross product)
4 4 - indicates that sequence of 4 affentry structures are needed to
properly store the affix information
The remaining lines describe the unique information for the 4 SfxEntry
objects that make up this affix. Each line can be interpreted
as follows: (note fields 1 and 2 are as a check against line 1 info)
Field
-----
1 SFX - indicates this is a suffix
2 D - is the name of the character flag for this affix
3 y - the string of chars to strip off before adding affix
(a 0 here indicates the NULL string)
4 ied - the string of affix characters to add
5 [^aeiou]y - the conditions which must be met before the affix
can be applied
Field 5 is interesting. Since this is a suffix, field 5 tells us that
there are 2 conditions that must be met. The first condition is that
the next to the last character in the word must *NOT* be any of the
following "a", "e", "i", "o" or "u". The second condition is that
the last character of the word must end in "y".
So how can we encode this information concisely and be able to
test for both conditions in a fast manner? The answer is found
by studying the wonderful ispell code of Geoff Kuenning, et al.
(now available under a normal BSD license).
If we set up a conds array of 256 bytes indexed (0 to 255) and access it
using a character (cast to an unsigned char) of a string, we have 8 bits
of information we can store about that character. Specifically we
could use each bit to say if that character is allowed in any of the
last (or first for prefixes) 8 characters of the word.
Basically, each character at one end of the word (up to the number
of conditions) is used to index into the conds array and the resulting
value found there says whether that character is valid for a
specific character position in the word.
For prefixes, it does this by setting bit 0 if that char is valid
in the first position, bit 1 if valid in the second position, and so on.
If a bit is not set, then that char is not valid for that position in the
word.
If working with suffixes bit 0 is used for the character closest
to the front, bit 1 for the next character towards the end, ...,
with bit numconds-1 representing the last char at the end of the string.
Note: since entries in the conds[] are 8 bits, only 8 conditions
(read that only 8 character positions) can be examined at one
end of a word (the beginning for prefixes and the end for suffixes).
So to make this clearer, lets encode the conds array values for the
first two affentries for the suffix D described earlier.
For the first affentry:
numconds = 1 (only examine the last character)
conds['e'] = (1 << 0) (the word must end in an E)
all others are all 0
For the second affentry:
numconds = 2 (only examine the last two characters)
conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
where X is all characters *but* a, e, i, o, or u
conds['y'] = (1 << 1) (the last char must be a y)
all other bits for all other entries in the conds array are zero
#endif

View File

@ -1,223 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef AFFIX_HXX_
#define AFFIX_HXX_
#include "atypes.hxx"
#include "baseaffix.hxx"
#include "affixmgr.hxx"
/*
 * A Prefix Entry.
 *
 * Extends AffEntry with a pointer to an AffixMgr and four link pointers
 * (next, nexteq, nextne, flgnxt) to other PfxEntry objects. The copy
 * constructor and copy assignment are declared private, so entries cannot be
 * copied from outside the class.
 *
 * All trivial accessors are const: they only read members and never modify
 * the entry, so they can also be used through a const PfxEntry.
 */
class PfxEntry : public AffEntry {
 private:
  // Copying is disabled.
  PfxEntry(const PfxEntry&);
  PfxEntry& operator=(const PfxEntry&);

 private:
  AffixMgr* pmyMgr;  // manager passed to the constructor (presumably the owner -- confirm)
  PfxEntry* next;
  PfxEntry* nexteq;
  PfxEntry* nextne;
  PfxEntry* flgnxt;

 public:
  explicit PfxEntry(AffixMgr* pmgr);

  // True when the aeXPRODUCT bit is set in opts.
  bool allowCross() const { return ((opts & aeXPRODUCT) != 0); }

  // Word checks; defined out of line.
  struct hentry* checkword(const char* word,
                           int len,
                           char in_compound,
                           const FLAG needflag = FLAG_NULL);
  struct hentry* check_twosfx(const char* word,
                              int len,
                              char in_compound,
                              const FLAG needflag = FLAG_NULL);
  std::string check_morph(const char* word,
                          int len,
                          char in_compound,
                          const FLAG needflag = FLAG_NULL);
  std::string check_twosfx_morph(const char* word,
                                 int len,
                                 char in_compound,
                                 const FLAG needflag = FLAG_NULL);

  // Read-only accessors.
  FLAG getFlag() const { return aflag; }
  const char* getKey() const { return appnd.c_str(); }
  std::string add(const char* word, size_t len);
  // The interface returns short; the narrowing from size_t is made explicit.
  inline short getKeyLen() const { return static_cast<short>(appnd.size()); }
  inline const char* getMorph() const { return morphcode; }
  inline const unsigned short* getCont() const { return contclass; }
  inline short getContLen() const { return contclasslen; }
  inline PfxEntry* getNext() const { return next; }
  inline PfxEntry* getNextNE() const { return nextne; }
  inline PfxEntry* getNextEQ() const { return nexteq; }
  inline PfxEntry* getFlgNxt() const { return flgnxt; }

  // Link setters.
  inline void setNext(PfxEntry* ptr) { next = ptr; }
  inline void setNextNE(PfxEntry* ptr) { nextne = ptr; }
  inline void setNextEQ(PfxEntry* ptr) { nexteq = ptr; }
  inline void setFlgNxt(PfxEntry* ptr) { flgnxt = ptr; }

  // Declared inline here; definitions are not part of this header.
  inline char* nextchar(char* p);
  inline int test_condition(const char* st);
};
/*
 * A Suffix Entry.
 *
 * Extends AffEntry with a pointer to an AffixMgr, a second string (rappnd)
 * and link pointers to other SfxEntry objects. The copy constructor and copy
 * assignment are declared private, so entries cannot be copied from outside
 * the class.
 *
 * Note the asymmetry between the accessors: getKey() returns rappnd, while
 * getAffix() and getKeyLen() are based on appnd.
 *
 * All trivial accessors are const: they only read members and never modify
 * the entry, so they can also be used through a const SfxEntry.
 */
class SfxEntry : public AffEntry {
 private:
  // Copying is disabled.
  SfxEntry(const SfxEntry&);
  SfxEntry& operator=(const SfxEntry&);

 private:
  AffixMgr* pmyMgr;    // manager passed to the constructor (presumably the owner -- confirm)
  std::string rappnd;  // presumably appnd reversed, see initReverseWord() -- confirm

  SfxEntry* next;
  SfxEntry* nexteq;
  SfxEntry* nextne;
  SfxEntry* flgnxt;

  SfxEntry* l_morph;
  SfxEntry* r_morph;
  SfxEntry* eq_morph;

 public:
  explicit SfxEntry(AffixMgr* pmgr);

  // True when the aeXPRODUCT bit is set in opts.
  bool allowCross() const { return ((opts & aeXPRODUCT) != 0); }

  // Word checks; defined out of line.
  struct hentry* checkword(const char* word,
                           int len,
                           int optflags,
                           PfxEntry* ppfx,
                           const FLAG cclass,
                           const FLAG needflag,
                           const FLAG badflag);
  struct hentry* check_twosfx(const char* word,
                              int len,
                              int optflags,
                              PfxEntry* ppfx,
                              const FLAG needflag = FLAG_NULL);
  std::string check_twosfx_morph(const char* word,
                                 int len,
                                 int optflags,
                                 PfxEntry* ppfx,
                                 const FLAG needflag = FLAG_NULL);
  struct hentry* get_next_homonym(struct hentry* he);
  struct hentry* get_next_homonym(struct hentry* word,
                                  int optflags,
                                  PfxEntry* ppfx,
                                  const FLAG cclass,
                                  const FLAG needflag);

  // Read-only accessors.
  FLAG getFlag() const { return aflag; }
  const char* getKey() const { return rappnd.c_str(); }
  std::string add(const char* word, size_t len);
  inline const char* getMorph() const { return morphcode; }
  inline const unsigned short* getCont() const { return contclass; }
  inline short getContLen() const { return contclasslen; }
  inline const char* getAffix() const { return appnd.c_str(); }
  // The interface returns short; the narrowing from size_t is made explicit.
  inline short getKeyLen() const { return static_cast<short>(appnd.size()); }
  inline SfxEntry* getNext() const { return next; }
  inline SfxEntry* getNextNE() const { return nextne; }
  inline SfxEntry* getNextEQ() const { return nexteq; }
  inline SfxEntry* getLM() const { return l_morph; }
  inline SfxEntry* getRM() const { return r_morph; }
  inline SfxEntry* getEQM() const { return eq_morph; }
  inline SfxEntry* getFlgNxt() const { return flgnxt; }

  // Link setters.
  inline void setNext(SfxEntry* ptr) { next = ptr; }
  inline void setNextNE(SfxEntry* ptr) { nextne = ptr; }
  inline void setNextEQ(SfxEntry* ptr) { nexteq = ptr; }
  inline void setFlgNxt(SfxEntry* ptr) { flgnxt = ptr; }

  void initReverseWord();

  // Declared inline here; definitions are not part of this header.
  inline char* nextchar(char* p);
  inline int test_condition(const char* st, const char* begin);
};
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,369 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef AFFIXMGR_HXX_
#define AFFIXMGR_HXX_
#include <stdio.h>
#include <string>
#include <vector>
#include "atypes.hxx"
#include "baseaffix.hxx"
#include "hashmgr.hxx"
#include "phonet.hxx"
#include "replist.hxx"
// check flag duplication
#define dupSFX (1 << 0)
#define dupPFX (1 << 1)
class PfxEntry;
class SfxEntry;
// Affix manager: holds the prefix/suffix entry tables and the option values
// declared below, and declares the affix checking, compound checking,
// morphology and parsing entry points. Only declarations live here; the
// member function definitions are not part of this header.
// NOTE(review): where a member has no comment, its meaning is only implied by
// its name -- confirm against the implementation before relying on it.
class AffixMgr {
// Prefix/suffix entry tables, SETSIZE slots each.
PfxEntry* pStart[SETSIZE];
SfxEntry* sStart[SETSIZE];
PfxEntry* pFlag[SETSIZE];
SfxEntry* sFlag[SETSIZE];
// Dictionaries handed to the constructor (held by reference, not copied).
const std::vector<HashMgr*>& alldic;
const HashMgr* pHMgr;
std::string keystring;
std::string trystring;
std::string encoding;
struct cs_info* csconv;
int utf8;
int complexprefixes;
// Compounding related flags and switches.
FLAG compoundflag;
FLAG compoundbegin;
FLAG compoundmiddle;
FLAG compoundend;
FLAG compoundroot;
FLAG compoundforbidflag;
FLAG compoundpermitflag;
int compoundmoresuffixes;
int checkcompounddup;
int checkcompoundrep;
int checkcompoundcase;
int checkcompoundtriple;
int simplifiedtriple;
FLAG forbiddenword;
FLAG nosuggest;
FLAG nongramsuggest;
FLAG needaffix;
int cpdmin;
// Tables with a companion "parsed..." bool each (presumably set once the
// corresponding table has been read -- confirm).
bool parsedrep;
std::vector<replentry> reptable;
RepList* iconvtable;
RepList* oconvtable;
bool parsedmaptable;
std::vector<mapentry> maptable;
bool parsedbreaktable;
std::vector<std::string> breaktable;
bool parsedcheckcpd;
std::vector<patentry> checkcpdtable;
int simplifiedcpd;
bool parseddefcpd;
std::vector<flagentry> defcpdtable;
phonetable* phone;
// Suggestion related limits and switches.
int maxngramsugs;
int maxcpdsugs;
int maxdiff;
int onlymaxdiff;
int nosplitsugs;
int sugswithdots;
int cpdwordmax;
int cpdmaxsyllable;
std::string cpdvowels; // vowels (for calculating the Hungarian compounding limit)
std::vector<w_char> cpdvowels_utf16; // vowels for UTF-8 encoding, stored as w_char
std::string cpdsyllablenum; // syllable count incrementing flag
// Per-call scratch state kept in the object (hence the "not stateless" notes).
const char* pfxappnd; // BUG: not stateless
const char* sfxappnd; // BUG: not stateless
int sfxextra; // BUG: not stateless
FLAG sfxflag; // BUG: not stateless
char* derived; // BUG: not stateless
SfxEntry* sfx; // BUG: not stateless
PfxEntry* pfx; // BUG: not stateless
int checknum;
std::string wordchars; // letters + spec. word characters
std::vector<w_char> wordchars_utf16;
std::string ignorechars; // characters to ignore (presumably stripped from words -- confirm)
std::vector<w_char> ignorechars_utf16;
std::string version; // affix and dictionary file version string
std::string lang; // language
int langnum;
FLAG lemma_present;
FLAG circumfix;
FLAG onlyincompound;
FLAG keepcase;
FLAG forceucase;
FLAG warn;
int forbidwarn;
FLAG substandard;
int checksharps;
int fullstrip;
int havecontclass; // boolean variable
char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold
// affix)
public:
// affpath: path of the affix file; ptr: dictionaries (stored by reference in
// alldic, so the vector must outlive this object); key: optional, defaults to
// NULL.
AffixMgr(const char* affpath, const std::vector<HashMgr*>& ptr, const char* key = NULL);
~AffixMgr();
// --- affix checks returning a dictionary entry ---
struct hentry* affix_check(const char* word,
int len,
const unsigned short needflag = (unsigned short)0,
char in_compound = IN_CPD_NOT);
struct hentry* prefix_check(const char* word,
int len,
char in_compound,
const FLAG needflag = FLAG_NULL);
inline int isSubset(const char* s1, const char* s2);
struct hentry* prefix_check_twosfx(const char* word,
int len,
char in_compound,
const FLAG needflag = FLAG_NULL);
inline int isRevSubset(const char* s1, const char* end_of_s2, int len);
struct hentry* suffix_check(const char* word,
int len,
int sfxopts,
PfxEntry* ppfx,
const FLAG cclass = FLAG_NULL,
const FLAG needflag = FLAG_NULL,
char in_compound = IN_CPD_NOT);
struct hentry* suffix_check_twosfx(const char* word,
int len,
int sfxopts,
PfxEntry* ppfx,
const FLAG needflag = FLAG_NULL);
// --- "_morph" counterparts returning a std::string ---
std::string affix_check_morph(const char* word,
int len,
const FLAG needflag = FLAG_NULL,
char in_compound = IN_CPD_NOT);
std::string prefix_check_morph(const char* word,
int len,
char in_compound,
const FLAG needflag = FLAG_NULL);
std::string suffix_check_morph(const char* word,
int len,
int sfxopts,
PfxEntry* ppfx,
const FLAG cclass = FLAG_NULL,
const FLAG needflag = FLAG_NULL,
char in_compound = IN_CPD_NOT);
std::string prefix_check_twosfx_morph(const char* word,
int len,
char in_compound,
const FLAG needflag = FLAG_NULL);
std::string suffix_check_twosfx_morph(const char* word,
int len,
int sfxopts,
PfxEntry* ppfx,
const FLAG needflag = FLAG_NULL);
std::string morphgen(const char* ts,
int wl,
const unsigned short* ap,
unsigned short al,
const char* morph,
const char* targetmorph,
int level);
int expand_rootword(struct guessword* wlst,
int maxn,
const char* ts,
int wl,
const unsigned short* ap,
unsigned short al,
const char* bad,
int,
const char*);
// --- compound handling ---
short get_syllable(const std::string& word);
int cpdrep_check(const char* word, int len);
int cpdpat_check(const char* word,
int len,
hentry* r1,
hentry* r2,
const char affixed);
int defcpd_check(hentry*** words,
short wnum,
hentry* rv,
hentry** rwords,
char all);
int cpdcase_check(const char* word, int len);
inline int candidate_check(const char* word, int len);
void setcminmax(int* cmin, int* cmax, const char* word, int len);
struct hentry* compound_check(const std::string& word,
short wordnum,
short numsyllable,
short maxwordnum,
short wnum,
hentry** words,
hentry** rwords,
char hu_mov_rule,
char is_sug,
int* info);
int compound_check_morph(const char* word,
int len,
short wordnum,
short numsyllable,
short maxwordnum,
short wnum,
hentry** words,
hentry** rwords,
char hu_mov_rule,
std::string& result,
const std::string* partresult);
std::vector<std::string> get_suffix_words(short unsigned* suff,
int len,
const char* root_word);
struct hentry* lookup(const char* word);
// --- accessors ---
// Note: get_encoding() and get_key_string() are the only accessors here that
// are not declared const.
const std::vector<replentry>& get_reptable() const;
RepList* get_iconvtable() const;
RepList* get_oconvtable() const;
struct phonetable* get_phonetable() const;
const std::vector<mapentry>& get_maptable() const;
const std::vector<std::string>& get_breaktable() const;
const std::string& get_encoding();
int get_langnum() const;
char* get_key_string();
char* get_try_string() const;
const std::string& get_wordchars() const;
const std::vector<w_char>& get_wordchars_utf16() const;
const char* get_ignore() const;
const std::vector<w_char>& get_ignore_utf16() const;
int get_compound() const;
FLAG get_compoundflag() const;
FLAG get_forbiddenword() const;
FLAG get_nosuggest() const;
FLAG get_nongramsuggest() const;
FLAG get_needaffix() const;
FLAG get_onlyincompound() const;
const char* get_derived() const;
const std::string& get_version() const;
int have_contclass() const;
int get_utf8() const;
int get_complexprefixes() const;
char* get_suffixed(char) const;
int get_maxngramsugs() const;
int get_maxcpdsugs() const;
int get_maxdiff() const;
int get_onlymaxdiff() const;
int get_nosplitsugs() const;
int get_sugswithdots(void) const;
FLAG get_keepcase(void) const;
FLAG get_forceucase(void) const;
FLAG get_warn(void) const;
int get_forbidwarn(void) const;
int get_checksharps(void) const;
char* encode_flag(unsigned short aflag) const;
int get_fullstrip() const;
private:
// --- affix file parsing helpers ---
int parse_file(const char* affpath, const char* key);
bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af);
bool parse_num(const std::string& line, int* out, FileMgr* af);
bool parse_cpdsyllable(const std::string& line, FileMgr* af);
bool parse_reptable(const std::string& line, FileMgr* af);
bool parse_convtable(const std::string& line,
FileMgr* af,
RepList** rl,
const std::string& keyword);
bool parse_phonetable(const std::string& line, FileMgr* af);
bool parse_maptable(const std::string& line, FileMgr* af);
bool parse_breaktable(const std::string& line, FileMgr* af);
bool parse_checkcpdtable(const std::string& line, FileMgr* af);
bool parse_defcpdtable(const std::string& line, FileMgr* af);
bool parse_affix(const std::string& line, const char at, FileMgr* af, char* dupflags);
void reverse_condition(std::string&);
std::string& debugflag(std::string& result, unsigned short flag);
int condlen(const char*);
int encodeit(AffEntry& entry, const char* cs);
// --- construction and ordering of the prefix/suffix entry structures ---
int build_pfxtree(PfxEntry* pfxptr);
int build_sfxtree(SfxEntry* sfxptr);
int process_pfx_order();
int process_sfx_order();
PfxEntry* process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr);
SfxEntry* process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr);
int process_pfx_tree_to_list();
int process_sfx_tree_to_list();
int redundant_condition(char, const char* strip, int stripl, const char* cond, int);
void finishFileMgr(FileMgr* afflst);
};
#endif

View File

@ -1,119 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef ATYPES_HXX_
#define ATYPES_HXX_
// HUNSPELL_WARNING(stream, format, ...): diagnostic output hook.
// Unless the including code has already defined HUNSPELL_WARNING, it becomes
// fprintf when HUNSPELL_WARNING_ON is defined and a no-op function otherwise.
#ifndef HUNSPELL_WARNING
#include <stdio.h>
#ifdef HUNSPELL_WARNING_ON
#define HUNSPELL_WARNING fprintf
#else
// empty inline function to switch off warnings (instead of the C99 standard
// variadic macros); arguments are still evaluated at the call site
static inline void HUNSPELL_WARNING(FILE*, const char*, ...) {}
#endif
#endif
// HUNSTEM def. (defined unconditionally; no value attached)
#define HUNSTEM
#include "w_char.hxx"
#include <algorithm>
#include <string>
#include <vector>
// Table sizes.
#define SETSIZE 256
#define CONTSIZE 65536

// AffEntry options (bit flags)
#define aeXPRODUCT (1 << 0)
#define aeUTF8 (1 << 1)
#define aeALIASF (1 << 2)
#define aeALIASM (1 << 3)
#define aeLONGCOND (1 << 4)

// compound options
#define IN_CPD_NOT 0
#define IN_CPD_BEGIN 1
#define IN_CPD_END 2
#define IN_CPD_OTHER 3

// info options (bit flags)
#define SPELL_COMPOUND (1 << 0)
#define SPELL_FORBIDDEN (1 << 1)
#define SPELL_ALLCAP (1 << 2)
#define SPELL_NOCAP (1 << 3)
#define SPELL_INITCAP (1 << 4)
#define SPELL_ORIGCAP (1 << 5)
#define SPELL_WARN (1 << 6)

#define MINCPDLEN 3
#define MAXCOMPOUND 10
#define MAXCONDLEN 20
// MAXCONDLEN minus the size of one pointer (value has type size_t).
#define MAXCONDLEN_1 (MAXCONDLEN - sizeof(char*))

#define MAXACC 1000

// Flag type and its "no flag" value.
#define FLAG unsigned short
#define FLAG_NULL 0x00

// Reset a flag to 0. The argument and the whole expansion are parenthesized
// so that the macro is a single expression and also works when `a` is itself
// an expression (for example a conditional expression).
#define FREE_FLAG(a) ((a) = 0)

// True when value b occurs in the array a of c elements. Uses
// std::binary_search, so the array must be sorted in ascending order.
// Every argument is parenthesized so that operators inside an argument cannot
// re-associate with the surrounding pointer arithmetic.
#define TESTAFF(a, b, c) (std::binary_search((a), (a) + (c), (b)))
// Plain record with two C-string pointers and a bool.
// NOTE(review): the header does not show who allocates or frees `word` and
// `orig`, and the members have no initializers -- confirm ownership with the
// code that fills this struct.
struct guessword {
char* word;
bool allow;
char* orig;
};
// A map entry is a list of strings; a flag entry is a list of FLAG values.
typedef std::vector<std::string> mapentry;
typedef std::vector<FLAG> flagentry;
// Record with three pattern strings and two condition flags.
// A default-constructed entry has empty patterns and both conditions set to
// FLAG_NULL.
struct patentry {
std::string pattern;
std::string pattern2;
std::string pattern3;
FLAG cond;
FLAG cond2;
patentry()
: cond(FLAG_NULL)
, cond2(FLAG_NULL) {
}
};
#endif

View File

@ -1,74 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef BASEAFF_HXX_
#define BASEAFF_HXX_
#include <string>
/*
 * Common base class holding the data of one affix entry.
 *
 * All data members are public. The copy constructor and copy assignment are
 * declared private, so entries cannot be copied from outside the class. The
 * destructor is virtual and defined out of line.
 *
 * This header relies on MAXCONDLEN, MAXCONDLEN_1 and NULL being defined by
 * whatever is included before it.
 */
class AffEntry {
 private:
  // Copying is disabled.
  AffEntry(const AffEntry&);
  AffEntry& operator=(const AffEntry&);

 public:
  // Every scalar and pointer member starts out as zero/NULL. The condition
  // union `c` is value-initialized as well, so that its bytes are never
  // indeterminate before conditions are stored in it.
  AffEntry()
      : numconds(0),
        opts(0),
        aflag(0),
        c(),
        morphcode(NULL),
        contclass(NULL),
        contclasslen(0) {}
  virtual ~AffEntry();

  std::string appnd;
  std::string strip;
  unsigned char numconds;
  char opts;  // bit set, tested against the ae* option masks by derived classes
  unsigned short aflag;
  // Condition storage: either MAXCONDLEN bytes inline, or MAXCONDLEN_1 bytes
  // inline followed by a pointer (conds2).
  // NOTE(review): which layout is active is presumably selected via `opts` --
  // confirm against the code that fills this union.
  union {
    char conds[MAXCONDLEN];
    struct {
      char conds1[MAXCONDLEN_1];
      char* conds2;
    } l;
  } c;
  char* morphcode;
  unsigned short* contclass;  // array of contclasslen elements
  short contclasslen;
};
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,314 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef CSUTIL_HXX_
#define CSUTIL_HXX_
#include "hunvisapi.h"
// First some base level utility routines
#include <fstream>
#include <string>
#include <vector>
#include <string.h>
#include "w_char.hxx"
#include "htypes.hxx"
#ifdef MOZILLA_CLIENT
#include "nscore.h" // for mozalloc headers
#endif
// casing
#define NOCAP 0
#define INITCAP 1
#define ALLCAP 2
#define HUHCAP 3
#define HUHINITCAP 4

// default encoding and keystring
#define SPELL_ENCODING "ISO8859-1"
#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"

// default morphological fields
#define MORPH_STEM "st:"
#define MORPH_ALLOMORPH "al:"
#define MORPH_POS "po:"
#define MORPH_DERI_PFX "dp:"
#define MORPH_INFL_PFX "ip:"
#define MORPH_TERM_PFX "tp:"
#define MORPH_DERI_SFX "ds:"
#define MORPH_INFL_SFX "is:"
#define MORPH_TERM_SFX "ts:"
#define MORPH_SURF_PFX "sp:"
#define MORPH_FREQ "fr:"
#define MORPH_PHON "ph:"
#define MORPH_HYPH "hy:"
#define MORPH_PART "pa:"
#define MORPH_FLAG "fl:"
#define MORPH_HENTRY "_H:"
// Length of a morphological field tag, measured on MORPH_STEM (every tag
// above has the same length). Computed with sizeof on the literal, minus the
// terminating NUL, so it is a compile-time constant of type size_t instead of
// a strlen() call at every use.
#define MORPH_TAG_LEN (sizeof(MORPH_STEM) - 1)

// separators: between fields, between records, between alternatives
#define MSEP_FLD ' '
#define MSEP_REC '\n'
#define MSEP_ALT '\v'

// default flags
#define DEFAULTFLAGS 65510
#define FORBIDDENWORD 65510
#define ONLYUPCASEFLAG 65511
// fix long pathname problem of WIN32 by using w_char std::fstream::open override
LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
std::ios_base::openmode mode);
// convert UTF-16 characters to UTF-8
LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
const std::vector<w_char>& src);
// convert UTF-8 characters to UTF-16
LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
const std::string& src);
// remove end of line char(s)
LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
// duplicate string
LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
// parse into tokens with char delimiter
LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
std::string::const_iterator& start);
// replace pat by rep in word and return word
LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
const std::string& search,
const std::string& replace);
// append apd to the end of every line in str
LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
const std::string& apd);
// tokenize into lines with new line
LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
char breakchar);
// tokenize into lines with new line and uniq in place
LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
// reverse word
LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
// reverse word (UTF variant)
LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
// remove duplicates
LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
// character encoding information
// Per-character record of a character-set table: three unsigned bytes.
// NOTE(review): presumably ccase is a case indicator and clower/cupper are the
// lowercase/uppercase counterparts of the character -- confirm with the code
// that builds and reads these tables.
struct cs_info {
unsigned char ccase;
unsigned char clower;
unsigned char cupper;
};
LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
int langnum);
LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
int langnum);
LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
// get language identifiers of language codes
LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
// get characters of the given 8bit encoding with lower- and uppercase forms
LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
// convert std::string to all caps
LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
const struct cs_info* csconv);
// convert std::string to all lowercase
LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
const struct cs_info* csconv);
// convert first letter of string to lowercase
LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
const struct cs_info* csconv);
// convert first letter of string to capital
LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
const struct cs_info* csconv);
// convert first letter of a w_char (UTF-16) string to capital
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
mkinitcap_utf(std::vector<w_char>& u, int langnum);
// convert a w_char (UTF-16) string to lowercase
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
mkallsmall_utf(std::vector<w_char>& u, int langnum);
// convert first letter of a w_char (UTF-16) string to lowercase
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
mkinitsmall_utf(std::vector<w_char>& u, int langnum);
// convert a w_char (UTF-16) string to all caps
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
mkallcap_utf(std::vector<w_char>& u, int langnum);
// get type of capitalization
LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
// get type of capitalization (UTF-8 mode; the word is passed as w_char)
LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
// strip all ignored characters in the string
LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
std::string& word,
const std::vector<w_char>& ignored_chars);
// strip all ignored characters in the string
LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
std::string& word,
const std::string& ignored_chars);
LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
std::string& out,
int ln);
LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
std::string& out,
std::vector<w_char>& out_utf16,
int utf8,
int ln);
LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
const std::string& morph,
const std::string& var);
// conversion function for protected memory
LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
// conversion function for protected memory
LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
// hash entry macros
// Returns the data stored behind the word of a hash entry, or NULL when the
// entry has no option bits set (h->var == 0).
// The data starts h->blen + 1 bytes after the start of the word. When the
// H_OPT_ALIASM bit is set, those bytes are passed through
// get_stored_pointer() and its result is returned; otherwise the address
// itself is returned.
LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
  if (!h->var)
    return NULL;

  char* tail = HENTRY_WORD(h) + h->blen + 1;
  if (h->var & H_OPT_ALIASM)
    return get_stored_pointer(tail);
  return tail;
}
// Const overload: returns the data stored behind the word of a hash entry, or
// NULL when the entry has no option bits set (h->var == 0).
// The data starts h->blen + 1 bytes after the start of the word. When the
// H_OPT_ALIASM bit is set, those bytes are passed through
// get_stored_pointer() and its result is returned; otherwise the address
// itself is returned.
LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
    const struct hentry* h) {
  if (!h->var)
    return NULL;

  const char* tail = HENTRY_WORD(h) + h->blen + 1;
  if (h->var & H_OPT_ALIASM)
    return get_stored_pointer(tail);
  return tail;
}
// NULL-free version for warning-free OOo build: same lookup as the const
// HENTRY_DATA overload, except that an entry without option bits
// (h->var == 0) yields the empty string "" instead of NULL.
LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(
    const struct hentry* h) {
  if (!h->var)
    return "";

  const char* tail = HENTRY_WORD(h) + h->blen + 1;
  if (h->var & H_OPT_ALIASM)
    return get_stored_pointer(tail);
  return tail;
}
// Searches the data of hash entry h for the first occurrence of the
// NUL-terminated string p (strstr semantics).
// Returns a pointer to the match, or NULL when the entry has no data or p
// does not occur in it. The entry data is looked up once and reused for both
// the NULL check and the search.
LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h,
                                                  const char* p) {
  char* data = HENTRY_DATA(h);
  if (!data)
    return NULL;
  return strstr(data, p);
}
#endif

View File

@ -1,117 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "filemgr.hxx"
#include "csutil.hxx"
// Writes an error message to stderr and returns -1.
// `err` is passed to fprintf() as the format string with `par` as its only
// argument, so it must be a trusted format taking at most one string
// argument (the constructor passes MSG_OPEN with the file name).
int FileMgr::fail(const char* err, const char* par) {
fprintf(stderr, err, par);
return -1;
}
// Opens `file` for reading. If the plain file cannot be opened, falls back
// to `file` + HZIP_EXTENSION read through a Hunzip object created with `key`.
// When neither source can be opened, MSG_OPEN is reported through fail();
// the object is still constructed.
FileMgr::FileMgr(const char* file, const char* key) : hin(NULL), linenum(0) {
  in[0] = '\0';
  myopen(fin, file, std::ios_base::in);
  if (fin.is_open())
    return;  // plain file available, no hzip reader needed

  // check hzipped file
  std::string hz_path(file);
  hz_path.append(HZIP_EXTENSION);
  hin = new Hunzip(hz_path.c_str(), key);
  if (!hin->is_open())
    fail(MSG_OPEN, file);
}
// Releases the Hunzip reader. `hin` is still NULL when the constructor
// opened the plain file, in which case the delete is a no-op.
FileMgr::~FileMgr() {
delete hin;
}
// Reads the next line into `dest`, from the plain stream when it is open and
// otherwise from the hzip reader when that one is open.
// Returns true when a line was read; the line counter only advances for
// lines that were actually read.
bool FileMgr::getline(std::string& dest) {
  bool got_line = false;
  if (fin.is_open()) {
    got_line = static_cast<bool>(std::getline(fin, dest));
  } else if (hin->is_open()) {
    got_line = hin->getline(dest);
  }
  if (got_line)
    ++linenum;
  return got_line;
}
// Returns the number of lines successfully read so far through getline().
int FileMgr::getlinenum() {
return linenum;
}

View File

@ -1,98 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* file manager class - read lines of files [filename] OR [filename.hz] */
#ifndef FILEMGR_HXX_
#define FILEMGR_HXX_
#include "hunzip.hxx"
#include <stdio.h>
#include <string>
#include <fstream>
// Line-oriented reader over either a plain file (`fin`) or, when that cannot
// be opened, its hzip counterpart (`hin`).
class FileMgr {
private:
// Copy operations are declared private (no definition is visible here), so
// FileMgr objects cannot be copied by client code.
FileMgr(const FileMgr&);
FileMgr& operator=(const FileMgr&);
protected:
std::ifstream fin; // plain file stream, tried first
Hunzip* hin; // hzip reader; NULL unless the plain file failed to open
char in[BUFSIZE + 50]; // input buffer
// Prints `err` (a printf-style format) with `par` to stderr; returns -1.
int fail(const char* err, const char* par);
int linenum; // number of lines read so far
public:
// `key` is forwarded to the Hunzip reader when the hzip fallback is used.
FileMgr(const char* filename, const char* key = NULL);
~FileMgr();
// Reads the next line into the argument; returns false when no line was read.
bool getline(std::string&);
// Returns the current line count.
int getlinenum();
};
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,145 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef HASHMGR_HXX_
#define HASHMGR_HXX_
#include <stdio.h>
#include <string>
#include <vector>
#include "htypes.hxx"
#include "filemgr.hxx"
#include "w_char.hxx"
// Flag encoding modes; HashMgr keeps the active one in `flag_mode`.
// NOTE(review): presumably single-character, two-character ("long"), numeric
// and Unicode flag notations - confirm against hashmgr.cxx.
enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI };
// Dictionary word table: holds a table of hentry pointers together with the
// flag, encoding and alias settings declared below. Only the interface is
// declared here; the member functions are implemented elsewhere.
class HashMgr {
int tablesize;
struct hentry** tableptr;
flag flag_mode;
int complexprefixes;
int utf8;
unsigned short forbiddenword;
int langnum;
std::string enc;
std::string lang;
struct cs_info* csconv;
std::string ignorechars;
std::vector<w_char> ignorechars_utf16;
int numaliasf; // flag vector `compression' with aliases
unsigned short** aliasf;
unsigned short* aliasflen;
int numaliasm; // morphological description `compression' with aliases
char** aliasm;
public:
// NOTE(review): tpath/apath are presumably the dictionary and affix file
// paths (compare the private load_tables()/load_config()) - confirm in
// hashmgr.cxx.
HashMgr(const char* tpath, const char* apath, const char* key = NULL);
~HashMgr();
struct hentry* lookup(const char*) const;
int hash(const char*) const;
struct hentry* walk_hashtable(int& col, struct hentry* hp) const;
// Run-time modification of the word table.
int add(const std::string& word);
int add_with_affix(const std::string& word, const std::string& pattern);
int remove(const std::string& word);
// Flag decoding/encoding; two decode_flags overloads exist, one filling a
// caller-provided pointer and one filling a std::vector.
int decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const;
bool decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const;
unsigned short decode_flag(const char* flag) const;
char* encode_flag(unsigned short flag) const;
// Alias table accessors (see aliasf/aliasm above).
int is_aliasf() const;
int get_aliasf(int index, unsigned short** fvec, FileMgr* af) const;
int is_aliasm() const;
char* get_aliasm(int index) const;
private:
int get_clen_and_captype(const std::string& word, int* captype);
int get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf);
int load_tables(const char* tpath, const char* key);
int add_word(const std::string& word,
int wcl,
unsigned short* ap,
int al,
const std::string* desc,
bool onlyupcase);
int load_config(const char* affpath, const char* key);
bool parse_aliasf(const std::string& line, FileMgr* af);
int add_hidden_capitalized_word(const std::string& word,
int wcl,
unsigned short* flags,
int al,
const std::string* dp,
int captype);
bool parse_aliasm(const std::string& line, FileMgr* af);
int remove_forbidden_flag(const std::string& word);
};
#endif

View File

@ -1,68 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef HTYPES_HXX_
#define HTYPES_HXX_
// Rotation distance constant.
// NOTE(review): presumably the `q` passed to ROTATE by the hash function -
// confirm in hashmgr.cxx.
#define ROTATE_LEN 5
// Rotates `v` left by `q` bits in place, treating `v` as a 32-bit value:
// the bits shifted out at the top are masked and re-inserted at the bottom.
// Both arguments are fully parenthesized so that expressions such as
// ROTATE(x, a + b) expand correctly.
// The trailing semicolon is part of the macro and is kept on purpose so that
// existing call sites keep compiling unchanged.
#define ROTATE(v, q) \
  (v) = ((v) << (q)) | (((v) >> (32 - (q))) & ((1 << (q)) - 1));
// hentry options
#define H_OPT (1 << 0)
#define H_OPT_ALIASM (1 << 1)
#define H_OPT_PHON (1 << 2)
// see also csutil.hxx
// Pointer to the first byte of the entry's word. The argument is
// parenthesized so that pointer expressions (e.g. `p + 1`) can be passed.
#define HENTRY_WORD(h) (&((h)->word[0]))
// approx. number of user defined words
#define USERWORD 1000
// One dictionary entry. Entries are linked through `next` and
// `next_homonym`, and the word text begins at `word`.
struct hentry {
unsigned char blen; // word length in bytes
unsigned char clen; // word length in characters (different for UTF-8 enc.)
short alen; // length of affix flag vector
unsigned short* astr; // affix flag vector
struct hentry* next; // next word with same hash code
struct hentry* next_homonym; // next homonym word (with same hash code)
char var; // variable fields (only for special pronunciation yet); tested against the H_OPT* bits
char word[1]; // variable-length word (8-bit or UTF-8 encoding)
};
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,162 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Hunspell, based on MySpell.
*
* The Initial Developers of the Original Code are
* Kevin Hendricks (MySpell) and Németh László (Hunspell).
* Portions created by the Initial Developers are Copyright (C) 2002-2005
* the Initial Developers. All Rights Reserved.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef MYSPELLMGR_H_
#define MYSPELLMGR_H_
#include "hunvisapi.h"
#ifdef __cplusplus
extern "C" {
#endif
/* opaque handle type; every function of this C API takes it as the first
 * argument, except the two create functions which return it */
typedef struct Hunhandle Hunhandle;
/* create a spell checker from an affix file path (affpath) and a dictionary
 * file path (dpath); the returned handle is later passed to
 * Hunspell_destroy() */
LIBHUNSPELL_DLL_EXPORTED Hunhandle* Hunspell_create(const char* affpath,
const char* dpath);
/* same as Hunspell_create() with an additional key argument
 * NOTE(review): presumably the key for hzip-compressed dictionary files -
 * confirm in the implementation */
LIBHUNSPELL_DLL_EXPORTED Hunhandle* Hunspell_create_key(const char* affpath,
const char* dpath,
const char* key);
LIBHUNSPELL_DLL_EXPORTED void Hunspell_destroy(Hunhandle* pHunspell);
/* load extra dictionaries (only dic files)
* output: 0 = additional dictionary slots available, 1 = slots are now full*/
LIBHUNSPELL_DLL_EXPORTED int Hunspell_add_dic(Hunhandle* pHunspell,
const char* dpath);
/* spell(word) - spellcheck word
* output: 0 = bad word, not 0 = good word
*/
LIBHUNSPELL_DLL_EXPORTED int Hunspell_spell(Hunhandle* pHunspell, const char*);
LIBHUNSPELL_DLL_EXPORTED char* Hunspell_get_dic_encoding(Hunhandle* pHunspell);
/* suggest(suggestions, word) - search suggestions
* input: pointer to an array of strings pointer and the (bad) word
* array of strings pointer (here *slst) may not be initialized
* output: number of suggestions in string array, and suggestions in
* a newly allocated array of strings (*slst will be NULL when number
* of suggestion equals 0.)
*/
LIBHUNSPELL_DLL_EXPORTED int Hunspell_suggest(Hunhandle* pHunspell,
char*** slst,
const char* word);
/* morphological functions */
/* analyze(result, word) - morphological analysis of the word */
LIBHUNSPELL_DLL_EXPORTED int Hunspell_analyze(Hunhandle* pHunspell,
char*** slst,
const char* word);
/* stem(result, word) - stemmer function */
LIBHUNSPELL_DLL_EXPORTED int Hunspell_stem(Hunhandle* pHunspell,
char*** slst,
const char* word);
/* stem(result, analysis, n) - get stems from a morph. analysis
* example (h is the Hunhandle* of the spell checker):
* char **result, **result2;
* int n1 = Hunspell_analyze(h, &result, "words");
* int n2 = Hunspell_stem2(h, &result2, result, n1);
*/
LIBHUNSPELL_DLL_EXPORTED int Hunspell_stem2(Hunhandle* pHunspell,
char*** slst,
char** desc,
int n);
/* generate(result, word, word2) - morphological generation by example(s) */
LIBHUNSPELL_DLL_EXPORTED int Hunspell_generate(Hunhandle* pHunspell,
char*** slst,
const char* word,
const char* word2);
/* generate(result, word, desc, n) - generation by morph. description(s)
* example (h is the Hunhandle* of the spell checker):
* char ** result;
* char * affix = "is:plural"; // description depends from dictionaries, too
* int n = Hunspell_generate2(h, &result, "word", &affix, 1);
* for (int i = 0; i < n; i++) printf("%s\n", result[i]);
*/
LIBHUNSPELL_DLL_EXPORTED int Hunspell_generate2(Hunhandle* pHunspell,
char*** slst,
const char* word,
char** desc,
int n);
/* functions for run-time modification of the dictionary */
/* add word to the run-time dictionary */
LIBHUNSPELL_DLL_EXPORTED int Hunspell_add(Hunhandle* pHunspell,
const char* word);
/* add word to the run-time dictionary with affix flags of
* the example (a dictionary word): Hunspell will recognize
* affixed forms of the new word, too.
*/
LIBHUNSPELL_DLL_EXPORTED int Hunspell_add_with_affix(Hunhandle* pHunspell,
const char* word,
const char* example);
/* remove word from the run-time dictionary */
LIBHUNSPELL_DLL_EXPORTED int Hunspell_remove(Hunhandle* pHunspell,
const char* word);
/* free suggestion lists */
LIBHUNSPELL_DLL_EXPORTED void Hunspell_free_list(Hunhandle* pHunspell,
char*** slst,
int n);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -1,229 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef MYSPELLMGR_HXX_
#define MYSPELLMGR_HXX_
#include "hunvisapi.h"
#include "w_char.hxx"
#include "atypes.hxx"
#include <string>
#include <vector>
// Marker string.
// NOTE(review): presumably the prefix that identifies XML-formatted input -
// confirm in hunspell.cxx.
#define SPELL_XML "<?xml?>"
// Numeric limits; neither is referenced elsewhere in this header.
// NOTE(review): their exact use is defined by the implementation - confirm
// in hunspell.cxx.
#define MAXSUGGESTION 15
#define MAXSHARPS 5
// MAXWORDLEN defaults to 100 unless it was already defined before this point.
#ifndef MAXWORDLEN
#define MAXWORDLEN 100
#endif
// H_DEPRECATED marks a declaration as deprecated: it expands to the GCC
// attribute on GCC >= 3.1, to __declspec(deprecated) on MSVC >= 1300, and
// to nothing on other compilers.
#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
# define H_DEPRECATED __attribute__((__deprecated__))
#elif defined(_MSC_VER) && (_MSC_VER >= 1300)
# define H_DEPRECATED __declspec(deprecated)
#else
# define H_DEPRECATED
#endif
class HunspellImpl;
// Public C++ interface of the spell checker. All state lives behind the
// HunspellImpl pointer `m_Impl`. Most operations come in two forms: a
// std::string / std::vector overload and a C-style overload marked
// H_DEPRECATED.
class LIBHUNSPELL_DLL_EXPORTED Hunspell {
private:
// Copy operations are declared private (no definition is visible here), so
// Hunspell objects cannot be copied by client code.
Hunspell(const Hunspell&);
Hunspell& operator=(const Hunspell&);
private:
HunspellImpl* m_Impl;
public:
/* Hunspell(aff, dic) - constructor of Hunspell class
* input: path of affix file and dictionary file
*
* In WIN32 environment, use UTF-8 encoded paths started with the long path
* prefix \\\\?\\ to handle system-independent character encoding and very
* long path names (without the long path prefix Hunspell will use fopen()
* with system-dependent character encoding instead of _wfopen()).
*/
Hunspell(const char* affpath, const char* dpath, const char* key = NULL);
~Hunspell();
/* load extra dictionaries (only dic files) */
int add_dic(const char* dpath, const char* key = NULL);
/* spell(word) - spellcheck word
* output: false = bad word, true = good word
*
* plus output:
* info: information bit array, fields:
* SPELL_COMPOUND = a compound word
* SPELL_FORBIDDEN = an explicit forbidden word
* root: root (stem), when input is a word with affix(es)
*/
bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
H_DEPRECATED int spell(const char* word, int* info = NULL, char** root = NULL);
/* suggest(suggestions, word) - search suggestions
* input: pointer to an array of strings pointer and the (bad) word
* array of strings pointer (here *slst) may not be initialized
* output: number of suggestions in string array, and suggestions in
* a newly allocated array of strings (*slst will be NULL when number
* of suggestion equals 0.)
*/
std::vector<std::string> suggest(const std::string& word);
H_DEPRECATED int suggest(char*** slst, const char* word);
/* Suggest words from suffix rules
* suffix_suggest(suggestions, root_word)
* input: pointer to an array of strings pointer and the word
* array of strings pointer (here *slst) may not be initialized
* output: number of suggestions in string array, and suggestions in
* a newly allocated array of strings (*slst will be NULL when number
* of suggestion equals 0.)
*/
std::vector<std::string> suffix_suggest(const std::string& root_word);
H_DEPRECATED int suffix_suggest(char*** slst, const char* root_word);
/* deallocate suggestion lists */
H_DEPRECATED void free_list(char*** slst, int n);
const std::string& get_dict_encoding() const;
char* get_dic_encoding();
/* morphological functions */
/* analyze(result, word) - morphological analysis of the word */
std::vector<std::string> analyze(const std::string& word);
H_DEPRECATED int analyze(char*** slst, const char* word);
/* stem(word) - stemmer function */
std::vector<std::string> stem(const std::string& word);
H_DEPRECATED int stem(char*** slst, const char* word);
/* stem(analysis, n) - get stems from a morph. analysis
* example:
* char **result, **result2;
* int n1 = analyze(&result, "words");
* int n2 = stem(&result2, result, n1);
*/
std::vector<std::string> stem(const std::vector<std::string>& morph);
H_DEPRECATED int stem(char*** slst, char** morph, int n);
/* generate(result, word, word2) - morphological generation by example(s) */
std::vector<std::string> generate(const std::string& word, const std::string& word2);
H_DEPRECATED int generate(char*** slst, const char* word, const char* word2);
/* generate(result, word, desc, n) - generation by morph. description(s)
* example:
* char ** result;
* char * affix = "is:plural"; // description depends from dictionaries, too
* int n = generate(&result, "word", &affix, 1);
* for (int i = 0; i < n; i++) printf("%s\n", result[i]);
*/
std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl);
H_DEPRECATED int generate(char*** slst, const char* word, char** desc, int n);
/* functions for run-time modification of the dictionary */
/* add word to the run-time dictionary */
int add(const std::string& word);
/* add word to the run-time dictionary with affix flags of
* the example (a dictionary word): Hunspell will recognize
* affixed forms of the new word, too.
*/
int add_with_affix(const std::string& word, const std::string& example);
/* remove word from the run-time dictionary */
int remove(const std::string& word);
/* other */
/* get extra word characters defined in affix file for tokenization */
const char* get_wordchars() const;
const std::string& get_wordchars_cpp() const;
const std::vector<w_char>& get_wordchars_utf16() const;
struct cs_info* get_csconv();
const char* get_version() const;
const std::string& get_version_cpp() const;
int get_langnum() const;
/* need for putdic */
bool input_conv(const std::string& word, std::string& dest);
H_DEPRECATED int input_conv(const char* word, char* dest, size_t destsize);
};
#endif

View File

@ -1,18 +0,0 @@
/* Defines LIBHUNSPELL_DLL_EXPORTED, the symbol-visibility decoration placed
 * on the public Hunspell declarations. */
#ifndef HUNSPELL_VISIBILITY_H_
#define HUNSPELL_VISIBILITY_H_
/* HUNSPELL_STATIC defined: no decoration */
#if defined(HUNSPELL_STATIC)
# define LIBHUNSPELL_DLL_EXPORTED
/* MSVC: dllexport while building the library, dllimport for its users */
#elif defined(_MSC_VER)
# if defined(BUILDING_LIBHUNSPELL)
# define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport)
# else
# define LIBHUNSPELL_DLL_EXPORTED __declspec(dllimport)
# endif
/* other compilers: default visibility while building the library.
 * NOTE(review): the literal 1 is presumably the value substituted for the
 * HAVE_VISIBILITY placeholder of the template this file was generated from. */
#elif defined(BUILDING_LIBHUNSPELL) && 1
# define LIBHUNSPELL_DLL_EXPORTED __attribute__((__visibility__("default")))
/* everything else: no decoration */
#else
# define LIBHUNSPELL_DLL_EXPORTED
#endif
#endif

View File

@ -1,18 +0,0 @@
/* Template variant of the visibility header: defines
 * LIBHUNSPELL_DLL_EXPORTED, the symbol-visibility decoration placed on the
 * public Hunspell declarations.
 * NOTE(review): @HAVE_VISIBILITY@ is presumably replaced with 0 or 1 by the
 * build configuration step; this file is not valid preprocessor input until
 * that substitution has happened. */
#ifndef HUNSPELL_VISIBILITY_H_
#define HUNSPELL_VISIBILITY_H_
/* HUNSPELL_STATIC defined: no decoration */
#if defined(HUNSPELL_STATIC)
# define LIBHUNSPELL_DLL_EXPORTED
/* MSVC: dllexport while building the library, dllimport for its users */
#elif defined(_MSC_VER)
# if defined(BUILDING_LIBHUNSPELL)
# define LIBHUNSPELL_DLL_EXPORTED __declspec(dllexport)
# else
# define LIBHUNSPELL_DLL_EXPORTED __declspec(dllimport)
# endif
/* other compilers: default visibility while building the library, when the
 * configured HAVE_VISIBILITY value is non-zero */
#elif defined(BUILDING_LIBHUNSPELL) && @HAVE_VISIBILITY@
# define LIBHUNSPELL_DLL_EXPORTED __attribute__((__visibility__("default")))
/* everything else: no decoration */
#else
# define LIBHUNSPELL_DLL_EXPORTED
#endif
#endif

View File

@ -1,256 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "hunzip.hxx"
#include "csutil.hxx"
// NOTE(review): CODELEN is not referenced by the code in this file - confirm it is still needed.
#define CODELEN 65536
// Initial size of the decoding table `dec`, and the step by which it grows.
#define BASEBITREC 5000
// NOTE(review): UNCOMPRESSED is not referenced by the code in this file - confirm.
#define UNCOMPRESSED '\002'
// File signature of a plain hzip file (compared against the first 3 bytes).
#define MAGIC "hz0"
// File signature of a password-protected hzip file.
#define MAGIC_ENCRYPT "hz1"
// Signature length in bytes, excluding the terminating NUL.
#define MAGICLEN (sizeof(MAGIC) - 1)
// Report an error on stderr and hand back the failure code.
// `err` is used as the printf format string and `par` as its single argument;
// the callers in this file pass one of the MSG_* formats and the file name.
// Always returns -1 so callers can write `return fail(...)`.
int Hunzip::fail(const char* err, const char* par) {
  const int failure = -1;
  fprintf(stderr, err, par);
  return failure;
}
// Open `file` for decoding. `key` is the optional password handed to getcode().
// The code table is loaded first; on failure bufsiz is set to -1, otherwise the
// first chunk is decoded and bufsiz holds the value returned by getbuf().
Hunzip::Hunzip(const char* file, const char* key)
    : bufsiz(0), lastbit(0), inc(0), inbits(0), outc(0) {
  // Start with all three buffers holding empty strings.
  line[0] = '\0';
  out[0] = '\0';
  in[0] = '\0';

  filename = mystrdup(file);

  bufsiz = (getcode(key) == -1) ? -1 : getbuf();
}
// Open the file and load the code table into `dec`.
//
// key: password for files carrying the MAGIC_ENCRYPT signature; may be NULL
//      for plain files. It is ignored (reset to NULL) when the file is plain.
// Returns 0 on success and -1 on any failure (missing file name, file cannot
// be opened, bad signature, bad/missing password, truncated table).
//
// Layout read here: 3-byte signature; for encrypted files one check byte;
// a 2-byte record count (high byte first); then per record 2 payload bytes,
// a 1-byte code length in bits and (length / 8 + 1) bytes of code bits.
// When a key is active every byte after the check byte is XOR-ed with the
// key bytes, cycling through the key in read order.
int Hunzip::getcode(const char* key) {
unsigned char c[2];
int i, j, n;
int allocatedbit = BASEBITREC;
const char* enc = key;
if (!filename)
return -1;
myopen(fin, filename, std::ios_base::in | std::ios_base::binary);
if (!fin.is_open())
return -1;
// read magic number
if (!fin.read(in, 3) ||
!(strncmp(MAGIC, in, MAGICLEN) == 0 ||
strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0)) {
return fail(MSG_FORMAT, filename);
}
// check encryption
if (strncmp(MAGIC_ENCRYPT, in, MAGICLEN) == 0) {
unsigned char cs;
if (!key)
return fail(MSG_KEY, filename);
if (!fin.read(reinterpret_cast<char*>(c), 1))
return fail(MSG_FORMAT, filename);
// the stored check byte must equal the XOR of all key bytes
for (cs = 0; *enc; enc++)
cs ^= *enc;
if (cs != c[0])
return fail(MSG_KEY, filename);
// rewind the key cursor for decrypting the table
enc = key;
} else
key = NULL;
// read record count
if (!fin.read(reinterpret_cast<char*>(c), 2))
return fail(MSG_FORMAT, filename);
if (key) {
c[0] ^= *enc;
if (*(++enc) == '\0')
enc = key;
c[1] ^= *enc;
}
n = ((int)c[0] << 8) + c[1];
// node 0 is the root of the binary decoding tree; a child index of 0 means "no child"
dec.resize(BASEBITREC);
dec[0].v[0] = 0;
dec[0].v[1] = 0;
// read codes
for (i = 0; i < n; i++) {
unsigned char l;
// c: the two payload bytes this code decodes to
if (!fin.read(reinterpret_cast<char*>(c), 2))
return fail(MSG_FORMAT, filename);
if (key) {
if (*(++enc) == '\0')
enc = key;
c[0] ^= *enc;
if (*(++enc) == '\0')
enc = key;
c[1] ^= *enc;
}
// l: length of the code in bits
if (!fin.read(reinterpret_cast<char*>(&l), 1))
return fail(MSG_FORMAT, filename);
if (key) {
if (*(++enc) == '\0')
enc = key;
l ^= *enc;
}
// the code bits themselves, stored in l / 8 + 1 bytes
if (!fin.read(in, l / 8 + 1))
return fail(MSG_FORMAT, filename);
if (key)
for (j = 0; j <= l / 8; j++) {
if (*(++enc) == '\0')
enc = key;
in[j] ^= *enc;
}
// walk the code most-significant bit first, creating tree nodes as needed
int p = 0;
for (j = 0; j < l; j++) {
int b = (in[j / 8] & (1 << (7 - (j % 8)))) ? 1 : 0;
int oldp = p;
p = dec[p].v[b];
if (p == 0) {
// no child for this bit yet: append a new node and link it in
lastbit++;
if (lastbit == allocatedbit) {
allocatedbit += BASEBITREC;
dec.resize(allocatedbit);
}
dec[lastbit].v[0] = 0;
dec[lastbit].v[1] = 0;
dec[oldp].v[b] = lastbit;
p = lastbit;
}
}
// the node reached by the full code carries the payload bytes
dec[p].c[0] = c[0];
dec[p].c[1] = c[1];
}
return 0;
}
// Release the file-name copy made by the constructor.
Hunzip::~Hunzip() {
  // free() accepts a null pointer, so no separate null test is required.
  free(filename);
}
// Decode the next chunk of input into `out`.
//
// Returns the number of bytes written to `out`, or -1 (via fail()) when the
// input ends without reaching the terminating code. The bit cursor `inc` and
// the bit count `inbits` are members, so decoding resumes where the previous
// call stopped.
int Hunzip::getbuf() {
int p = 0;
int o = 0;
do {
// refill the input buffer only when the previous one was fully consumed
if (inc == 0) {
fin.read(in, BUFSIZE);
inbits = fin.gcount() * 8;
}
for (; inc < inbits; inc++) {
// take the next bit, most-significant bit of each byte first
int b = (in[inc / 8] & (1 << (7 - (inc % 8)))) ? 1 : 0;
int oldp = p;
p = dec[p].v[b];
if (p == 0) {
// no child for this bit: oldp is a complete code
if (oldp == lastbit) {
// the last node created by getcode() acts as the end-of-data code
fin.close();
// add last odd byte
if (dec[lastbit].c[0])
out[o++] = dec[lastbit].c[1];
return o;
}
out[o++] = dec[oldp].c[0];
out[o++] = dec[oldp].c[1];
// output full: return without advancing inc, so the current bit is
// processed again as the first bit of the next code on the next call
if (o == BUFSIZE)
return o;
// restart at the root (p is 0 here) with the current bit
p = dec[p].v[b];
}
}
inc = 0;
// a short read means the input is exhausted
} while (inbits == BUFSIZE * 8);
return fail(MSG_FORMAT, filename);
}
// Produce the next text line in `dest`.
//
// Returns false when bufsiz is -1 (error or no more data), true otherwise.
// Lines are stored relative to the previous line, which is kept in the member
// `line`: the first `left` bytes of the previous line are kept as a prefix,
// and when `right` is non-zero the final right + 1 bytes of the previous line
// are appended as a suffix.
// NOTE(review): no bounds check of `left`/`right` against the length of the
// previous line is visible here - confirm the format guarantees valid values.
bool Hunzip::getline(std::string& dest) {
char linebuf[BUFSIZE];
int l = 0, eol = 0, left = 0, right = 0;
if (bufsiz == -1)
return false;
while (l < bufsiz && !eol) {
linebuf[l++] = out[outc];
switch (out[outc]) {
// tab and space are below 47 but are ordinary text, so they are exempted
// from the control-byte handling in the default branch
case '\t':
break;
case 31: { // escape
// the byte following the escape is copied literally
if (++outc == bufsiz) {
bufsiz = getbuf();
outc = 0;
}
linebuf[l - 1] = out[outc];
break;
}
case ' ':
break;
default:
// any other byte below 47 ends the line and carries the prefix/suffix counts
if (((unsigned char)out[outc]) < 47) {
if (out[outc] > 32) {
// bytes 33..46 encode the suffix count; the prefix count follows in the next byte
right = out[outc] - 31;
if (++outc == bufsiz) {
bufsiz = getbuf();
outc = 0;
}
}
// 30 stands for a prefix count of 9
if (out[outc] == 30)
left = 9;
else
left = out[outc];
linebuf[l - 1] = '\n';
eol = 1;
}
}
if (++outc == bufsiz) {
outc = 0;
// once getbuf() has closed the file there is nothing more to decode
bufsiz = fin.is_open() ? getbuf() : -1;
}
}
// overwrite the '\n' just stored with the tail of the previous line
if (right)
strcpy(linebuf + l - 1, line + strlen(line) - right - 1);
else
linebuf[l] = '\0';
// keep `left` bytes of the previous line and append the new text after them
strcpy(line + left, linebuf);
dest.assign(line);
return true;
}

View File

@ -1,87 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* hunzip: file decompression for sorted dictionaries with optional encryption,
* algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
#ifndef HUNZIP_HXX_
#define HUNZIP_HXX_
#include "hunvisapi.h"
#include <stdio.h>
#include <fstream>
#include <vector>
// Size in bytes of the Hunzip input buffer; the output and line buffers are
// sized from it as well.
#define BUFSIZE 65536
// File-name suffix of hzip files.
#define HZIP_EXTENSION ".hz"
// printf-style error formats; each takes the file name as its only argument.
#define MSG_OPEN "error: %s: cannot open\n"
#define MSG_FORMAT "error: %s: not in hzip format\n"
#define MSG_MEMORY "error: %s: missing memory\n"
#define MSG_KEY "error: %s: missing or bad password\n"
// One node of the Hunzip binary decoding tree.
struct bit {
// the two output bytes this node decodes to
unsigned char c[2];
// indices of the child nodes for bit 0 and bit 1; 0 means "no child"
int v[2];
};
// Line-oriented reader for hzip-compressed (optionally password-protected)
// dictionary files.
class LIBHUNSPELL_DLL_EXPORTED Hunzip {
private:
// copy construction and assignment are declared private to forbid copying
Hunzip(const Hunzip&);
Hunzip& operator=(const Hunzip&);
protected:
char* filename; // heap copy of the file name, released in the destructor
std::ifstream fin; // the compressed input stream
// bufsiz:  number of valid bytes in `out`, or -1 on error / end of data
// lastbit: index of the last node created in `dec`
// inc:     current bit position inside `in`
// inbits:  number of valid bits in `in`
// outc:    read position inside `out`
int bufsiz, lastbit, inc, inbits, outc;
std::vector<bit> dec; // code table
char in[BUFSIZE]; // input buffer
char out[BUFSIZE + 1]; // Huffman-decoded buffer
char line[BUFSIZE + 50]; // decoded line
// loads the code table; returns 0 on success, -1 on failure
int getcode(const char* key);
// decodes the next chunk into `out`; returns the byte count or -1
int getbuf();
// prints `err` (a printf format) with `par` to stderr and returns -1
int fail(const char* err, const char* par);
public:
// key: optional password, only needed for encrypted files
Hunzip(const char* filename, const char* key = NULL);
~Hunzip();
bool is_open() { return fin.is_open(); }
// stores the next line in dest; returns false when no data is available
bool getline(std::string& dest);
};
#endif

View File

@ -1,75 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef LANGNUM_HXX_
#define LANGNUM_HXX_
/*
language numbers for language specific codes
see https://wiki.openoffice.org/w/index.php?title=Languages&oldid=230199
*/
// Numeric language identifiers, keyed by two-letter language code.
// Entries marked "custom number" are local additions to the referenced list.
// All values are written as plain decimal literals (a leading zero would make
// the literal octal).
enum {
  LANG_ar = 96,
  LANG_az = 100,  // custom number
  LANG_bg = 41,
  LANG_ca = 37,
  LANG_cs = 42,
  LANG_da = 45,
  LANG_de = 49,
  LANG_el = 30,
  LANG_en = 1,
  LANG_es = 34,
  LANG_eu = 10,
  LANG_fr = 2,
  LANG_gl = 38,
  LANG_hr = 78,
  LANG_hu = 36,
  LANG_it = 39,
  LANG_la = 99,   // custom number
  LANG_lv = 101,  // custom number
  LANG_nl = 31,
  LANG_pl = 48,
  LANG_pt = 3,
  LANG_ru = 7,
  LANG_sv = 50,
  LANG_tr = 90,
  LANG_uk = 80,
  LANG_xx = 999
};
#endif

View File

@ -1,270 +0,0 @@
/* phonetic.c - generic replacement algorithms for phonetic transformation
Copyright (C) 2000 Bjoern Jacke
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 2.1 as published by the Free Software Foundation;
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see
<http://www.gnu.org/licenses/>.
Changelog:
2000-01-05 Bjoern Jacke <bjoern at j3e.de>
Initial Release inspired by the article about phonetic
transformations out of c't 25/1999
2007-07-26 Bjoern Jacke <bjoern at j3e.de>
Released under MPL/GPL/LGPL tri-license for Hunspell
2007-08-23 Laszlo Nemeth <nemeth at OOo>
Porting from Aspell to Hunspell using C-like structs
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include "csutil.hxx"
#include "phonet.hxx"
// Build the first-byte lookup table of a phonetic rule set.
// After the call parms.hash[b] is the index of the first visited rule whose
// text starts with byte b, or -1 when there is none. Rules are visited in
// steps of two, and the walk stops at the first visited rule that is empty,
// so the table must be terminated by an empty entry at an even index.
void init_phonet_hash(phonetable& parms) {
  for (int slot = 0; slot < HASHSIZE; ++slot)
    parms.hash[slot] = -1;

  for (int rule = 0; parms.rules[rule][0] != '\0'; rule += 2) {
    /** set hash value **/
    const int first = (unsigned char)parms.rules[rule][0];
    if (parms.hash[first] >= 0)
      continue;  // keep the earliest rule for this byte
    parms.hash[first] = rule;
  }
}
// like strcpy but safe if the strings overlap
// but only if dest < src
// Copy the NUL-terminated string at src to dest, including the terminator.
// Works on overlapping ranges as long as dest lies before src, because bytes
// are copied front to back.
static inline void strmove(char* dest, char* src) {
  for (; *src != '\0'; ++src, ++dest)
    *dest = *src;
  *dest = '\0';
}
// isalpha() variant for bytes of possibly multi-byte text: every byte with a
// value of 128 or above counts as alphabetic (returns 1); bytes below 128 are
// classified by isalpha().
static int myisalpha(char ch) {
  const unsigned char byte = (unsigned char)ch;
  return (byte >= 128) ? 1 : isalpha(ch);
}
/* Do phonetic transformation. */
/* phonetic transcription algorithm */
/* see: http://aspell.net/man-html/Phonetic-Code.html */
/* convert string to uppercase before this call */
/**
 * Compute the phonetic transcription of `inword` using the rules in `parms`.
 *
 * The rule syntax is the one described at
 * http://aspell.net/man-html/Phonetic-Code.html. `parms.rules` is read as
 * consecutive (search pattern, replacement) pairs: rules[n] is matched against
 * the word and rules[n + 1] supplies the replacement. `parms.hash` (see
 * init_phonet_hash) gives, per byte, the index of the first rule starting
 * with that byte, or a negative value when there is none.
 *
 * @param inword  word to transcribe; the caller must upper-case it first.
 * @param parms   rule table with an initialised hash.
 * @return the phonetic code; an empty string when `inword` is longer than
 *         MAXPHONETUTF8LEN bytes.
 */
std::string phonet(const std::string& inword, phonetable& parms) {
  int i, k = 0, p, z;
  int k0, n0, p0 = -333;
  char c;
  typedef unsigned char uchar;

  size_t len = inword.size();
  if (len > MAXPHONETUTF8LEN)
    return std::string();
  // Work on a private, mutable copy: '<' rules rewrite the word in place.
  char word[MAXPHONETUTF8LEN + 1];
  strncpy(word, inword.c_str(), MAXPHONETUTF8LEN);
  word[MAXPHONETUTF8LEN] = '\0';

  std::string target;
  /** check word **/
  i = z = 0;
  while ((c = word[i]) != '\0') {
    int n = parms.hash[(uchar)c];
    int z0 = 0;

    if (n >= 0 && !parms.rules[n].empty()) {
      /** check all rules for the same letter **/
      while (parms.rules[n][0] == c) {
        /** check whole string **/
        k = 1; /** number of found letters **/
        p = 5; /** default priority **/
        const char* s = parms.rules[n].c_str();
        s++; /** important for (see below) "*(s-1)" **/

        while (*s != '\0' && word[i + k] == *s && !isdigit((unsigned char)*s) &&
               strchr("(-<^$", *s) == NULL) {
          k++;
          s++;
        }
        if (*s == '(') {
          /** check letters in "(..)" **/
          if (myisalpha(word[i + k])  // ...could be implied?
              && strchr(s + 1, word[i + k]) != NULL) {
            k++;
            // Stop at the terminator as well: a rule with an unterminated
            // "(" must not be scanned past the end of its string (same guard
            // as in the follow-up rule scan below).
            while (*s != ')' && *s != '\0')
              s++;
            if (*s == ')')
              s++;
          }
        }
        p0 = (int)*s;
        k0 = k;
        while (*s == '-' && k > 1) {
          k--;
          s++;
        }
        if (*s == '<')
          s++;
        if (isdigit((unsigned char)*s)) {
          /** determine priority **/
          p = *s - '0';
          s++;
        }
        if (*s == '^' && *(s + 1) == '^')
          s++;

        if (*s == '\0' || (*s == '^' && (i == 0 || !myisalpha(word[i - 1])) &&
                           (*(s + 1) != '$' || (!myisalpha(word[i + k0])))) ||
            (*s == '$' && i > 0 && myisalpha(word[i - 1]) &&
             (!myisalpha(word[i + k0])))) {
          /** search for followup rules, if: **/
          /** parms.followup and k > 1 and NO '-' in searchstring **/
          char c0 = word[i + k - 1];
          n0 = parms.hash[(uchar)c0];

          // if (parms.followup && k > 1 && n0 >= 0
          if (k > 1 && n0 >= 0 && p0 != (int)'-' && word[i + k] != '\0' && !parms.rules[n0].empty()) {
            /** test follow-up rule for "word[i+k]" **/
            while (parms.rules[n0][0] == c0) {
              /** check whole string **/
              k0 = k;
              p0 = 5;
              s = parms.rules[n0].c_str();
              s++;
              while (*s != '\0' && word[i + k0] == *s &&
                     !isdigit((unsigned char)*s) &&
                     strchr("(-<^$", *s) == NULL) {
                k0++;
                s++;
              }
              if (*s == '(') {
                /** check letters **/
                if (myisalpha(word[i + k0]) &&
                    strchr(s + 1, word[i + k0]) != NULL) {
                  k0++;
                  while (*s != ')' && *s != '\0')
                    s++;
                  if (*s == ')')
                    s++;
                }
              }
              while (*s == '-') {
                /** "k0" gets NOT reduced **/
                /** because "if (k0 == k)" **/
                s++;
              }
              if (*s == '<')
                s++;
              if (isdigit((unsigned char)*s)) {
                p0 = *s - '0';
                s++;
              }

              if (*s == '\0'
                  /** *s == '^' cuts **/
                  || (*s == '$' && !myisalpha(word[i + k0]))) {
                if (k0 == k) {
                  /** this is just a piece of the string **/
                  n0 += 2;
                  continue;
                }

                if (p0 < p) {
                  /** priority too low **/
                  n0 += 2;
                  continue;
                }
                /** rule fits; stop search **/
                break;
              }
              n0 += 2;
            } /** End of "while (parms.rules[n0][0] == c0)" **/

            // A follow-up rule of at least equal priority matched: skip the
            // current rule and try the next one for this letter.
            if (p0 >= p && parms.rules[n0][0] == c0) {
              n += 2;
              continue;
            }
          } /** end of follow-up stuff **/

          /** replace string **/
          s = parms.rules[n + 1].c_str();
          p0 = (!parms.rules[n].empty() &&
                strchr(parms.rules[n].c_str() + 1, '<') != NULL)
                   ? 1
                   : 0;
          if (p0 == 1 && z == 0) {
            /** rule with '<' is used **/
            if (!target.empty() && *s != '\0' &&
                (target[target.size() - 1] == c || target[target.size() - 1] == *s)) {
              target.erase(target.size() - 1);
            }
            z0 = 1;
            z = 1;
            k0 = 0;
            // Write the replacement over the matched part of the word ...
            while (*s != '\0' && word[i + k0] != '\0') {
              word[i + k0] = *s;
              k0++;
              s++;
            }
            // ... and close the gap when the replacement is shorter.
            if (k > k0)
              strmove(&word[0] + i + k0, &word[0] + i + k);

            /** new "actual letter" **/
            c = word[i];
          } else { /** no '<' rule used **/
            i += k - 1;
            z = 0;
            // Append all but the last replacement character, skipping a
            // character equal to the one just written.
            while (*s != '\0' && *(s + 1) != '\0' && target.size() < len) {
              if (target.empty() || target[target.size() - 1] != *s) {
                target.push_back(*s);
              }
              s++;
            }
            /** new "actual letter" **/
            c = *s;
            if (!parms.rules[n].empty() &&
                strstr(parms.rules[n].c_str() + 1, "^^") != NULL) {
              if (c != '\0') {
                target.push_back(c);
              }
              // "^^" rule: drop the consumed prefix and restart at the front.
              strmove(&word[0], &word[0] + i + 1);
              i = 0;
              z0 = 1;
            }
          }
          break;
        } /** end of "rule matches" **/
        n += 2;
      } /** end of while (parms.rules[n][0] == c) **/
    }   /** end of if (n >= 0) **/
    if (z0 == 0) {
      if (k && !p0 && target.size() < len && c != '\0') {
        /** condense only double letters **/
        target.push_back(c);
        /// printf("\n setting \n");
      }

      i++;
      z = 0;
      k = 0;
    }
  } /** end of while ((c = word[i]) != '\0') **/

  return target;
} /** end of function "phonet" **/

View File

@ -1,50 +0,0 @@
/* phonetic.c - generic replacement algorithms for phonetic transformation
Copyright (C) 2000 Bjoern Jacke
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 2.1 as published by the Free Software Foundation;
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see
<http://www.gnu.org/licenses/>.
Changelog:
2000-01-05 Bjoern Jacke <bjoern at j3e.de>
Initial Release inspired by the article about phonetic
transformations out of c't 25/1999
2007-07-26 Bjoern Jacke <bjoern at j3e.de>
Released under MPL/GPL/LGPL tri-license for Hunspell
2007-08-23 Laszlo Nemeth <nemeth at OOo>
Porting from Aspell to Hunspell using C-like structs
*/
#ifndef PHONET_HXX_
#define PHONET_HXX_
// Number of entries in phonetable::hash (one per possible byte value).
#define HASHSIZE 256
// NOTE(review): presumably the maximum word length in characters - confirm.
#define MAXPHONETLEN 256
// Maximum input length in bytes; phonet() returns an empty string for longer words.
#define MAXPHONETUTF8LEN (MAXPHONETLEN * 4)
#include "hunvisapi.h"
// Rule table for the phonetic transcription done by phonet().
// NOTE(review): this header uses std::vector and std::string but includes
// neither <vector> nor <string> itself; it relies on the including file.
struct phonetable {
// NOTE(review): presumably a flag telling whether the rules are UTF-8 - confirm.
char utf8;
// rule strings; read in steps of two, terminated by an empty string
std::vector<std::string> rules;
// per byte: index into `rules` of the first rule starting with it, or -1
int hash[HASHSIZE];
};
// Fills parms.hash from parms.rules; call it once the rules are loaded.
LIBHUNSPELL_DLL_EXPORTED void init_phonet_hash(phonetable& parms);
// Returns the phonetic code of inword (which must already be upper-cased),
// or an empty string when inword exceeds MAXPHONETUTF8LEN bytes.
LIBHUNSPELL_DLL_EXPORTED std::string phonet(const std::string& inword,
phonetable& phone);
#endif

View File

@ -1,196 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <limits>
#include "replist.hxx"
#include "csutil.hxx"
// Create an empty list with room for n entries.
// If the allocation fails the capacity is recorded as 0, so every later
// add() is rejected.
RepList::RepList(int n) {
  pos = 0;
  dat = (replentry**)malloc(sizeof(replentry*) * n);
  size = (dat != 0) ? n : 0;
}
// Destroy every stored entry, then release the pointer array itself.
RepList::~RepList() {
  int idx = 0;
  while (idx < pos) {
    delete dat[idx];
    ++idx;
  }
  free(dat);
}
// Return the entry stored at index n. The index is not range-checked.
replentry* RepList::item(int n) {
  replentry* entry = dat[n];
  return entry;
}
// Binary search for a pattern that is a prefix of `word`.
// Returns the index of the matching entry, or -1 when no pattern matches.
// After a hit the search continues in the upper half, so the match found
// last - the one furthest to the right in the sorted list - is returned.
int RepList::find(const char* word) {
  int lo = 0;
  int hi = pos - 1;
  int best = -1;
  while (lo <= hi) {
    const int mid = ((unsigned)lo + (unsigned)hi) >> 1;
    const int cmp =
        strncmp(word, dat[mid]->pattern.c_str(), dat[mid]->pattern.size());
    if (cmp < 0) {
      hi = mid - 1;
      continue;
    }
    if (cmp == 0)
      best = mid;  // remember the hit, then look right for a longer match
    lo = mid + 1;
  }
  return best;
}
std::string RepList::replace(const char* word, int ind, bool atstart) {
int type = atstart ? 1 : 0;
if (ind < 0)
return std::string();
if (strlen(word) == dat[ind]->pattern.size())
type = atstart ? 3 : 2;
while (type && dat[ind]->outstrings[type].empty())
type = (type == 2 && !atstart) ? 0 : type - 1;
return dat[ind]->outstrings[type];
}
// Register the replacement `pat2` for the pattern `in_pat1`.
// A leading '_' in the pattern marks a start-of-word rule (+1), a trailing
// '_' an end-of-word rule (+2); both markers are stripped, and every
// remaining '_' in pattern and replacement becomes a space.
// Returns 0 on success, 1 when the list is full or either argument is empty.
int RepList::add(const std::string& in_pat1, const std::string& pat2) {
  if (in_pat1.empty() || pat2.empty() || pos >= size)
    return 1;

  // analyse word context
  std::string pat1(in_pat1);
  int type = 0;
  if (pat1[0] == '_') {
    type = 1;
    pat1.erase(0, 1);
  }
  if (!pat1.empty() && pat1[pat1.size() - 1] == '_') {
    pat1.erase(pat1.size() - 1);
    type += 2;
  }
  mystrrep(pat1, "_", " ");

  // an entry for exactly this pattern already exists: only set its context slot
  const int hit = find(pat1.c_str());
  if (hit >= 0 && dat[hit]->pattern == pat1) {
    dat[hit]->outstrings[type] = pat2;
    mystrrep(dat[hit]->outstrings[type], "_", " ");
    return 0;
  }

  // make a new entry if none exists
  replentry* r = new replentry;
  if (r == NULL)
    return 1;
  r->pattern = pat1;
  r->outstrings[type] = pat2;
  mystrrep(r->outstrings[type], "_", " ");

  // insert at the sorted position: shift larger patterns one slot up
  int slot = pos++;
  while (slot > 0 &&
         strcmp(r->pattern.c_str(), dat[slot - 1]->pattern.c_str()) < 0) {
    dat[slot] = dat[slot - 1];
    --slot;
  }
  dat[slot] = r;
  return 0;
}
// Apply the replacement list to `in_word` and store the result in `dest`.
// At every position the matching pattern (see find()) is looked up; when it
// has a replacement for the current context, the replacement is appended and
// the matched bytes are skipped, otherwise the current byte is copied.
// Returns true when at least one replacement was made.
bool RepList::conv(const std::string& in_word, std::string& dest) {
  dest.clear();

  const size_t wordlen = in_word.size();
  const char* word = in_word.c_str();

  bool change = false;
  for (size_t i = 0; i < wordlen; ++i) {
    const int n = find(word + i);
    const std::string l = replace(word + i, n, i == 0);
    if (l.empty()) {
      dest.push_back(word[i]);
      continue;
    }
    dest.append(l);
    // An empty pattern can be stored (add("_", ...) strips the marker and
    // leaves nothing). For it, size() - 1 would wrap around and cancel the
    // loop increment, so the loop would never advance. In that case only the
    // current byte is consumed.
    if (!dat[n]->pattern.empty())
      i += dat[n]->pattern.size() - 1;
    change = true;
  }

  return change;
}

View File

@ -1,100 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* string replacement list class */
#ifndef REPLIST_HXX_
#define REPLIST_HXX_
#include "w_char.hxx"
#include <string>
#include <vector>
// Sorted list of string replacement rules with a fixed capacity.
class RepList {
private:
// copy construction and assignment are declared private to forbid copying
RepList(const RepList&);
RepList& operator=(const RepList&);
protected:
// array of entry pointers, kept sorted by pattern
replentry** dat;
// capacity of `dat` (0 when its allocation failed)
int size;
// number of entries currently stored
int pos;
public:
// n: maximum number of entries the list can hold
explicit RepList(int n);
~RepList();
// adds or updates a rule; returns 0 on success, 1 on failure
int add(const std::string& pat1, const std::string& pat2);
// returns the entry at index n (not range-checked)
replentry* item(int n);
// returns the index of a pattern that is a prefix of word, or -1
int find(const char* word);
// returns the replacement text of entry n for the given context
std::string replace(const char* word, int n, bool atstart);
// converts word into dest; returns true when anything was replaced
bool conv(const std::string& word, std::string& dest);
};
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,188 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef SUGGESTMGR_HXX_
#define SUGGESTMGR_HXX_
// Numeric limits used by the suggestion engine. Their consumers live in the
// implementation file, which is not visible from this header.
// NOTE(review): presumably caps on candidate roots / words / guesses and on the
// number of n-gram, phonetic and compound suggestions - confirm in suggestmgr.cxx.
#define MAX_ROOTS 100
#define MAX_WORDS 100
#define MAX_GUESS 200
#define MAXNGRAMSUGS 4
#define MAXPHONSUGS 2
#define MAXCOMPOUNDSUGS 3
// timelimit: max ~1/4 sec (process time on Linux) for a time consuming function
// (CLOCKS_PER_SEC >> 2 is a quarter of the clock() ticks in one second)
#define TIMELIMIT (CLOCKS_PER_SEC >> 2)
#define MINTIMER 100
#define MAXPLUSTIMER 100
// Option bits: each value occupies a distinct bit, so they can be OR-ed together.
// NOTE(review): presumably passed as the `opt` argument of SuggestMgr::ngram().
#define NGRAM_LONGER_WORSE (1 << 0)
#define NGRAM_ANY_MISMATCH (1 << 1)
#define NGRAM_LOWERING (1 << 2)
#define NGRAM_WEIGHTED (1 << 3)
#include "atypes.hxx"
#include "affixmgr.hxx"
#include "hashmgr.hxx"
#include "langnum.hxx"
#include <time.h>
// Direction markers; NOTE(review): presumably the back-pointer values written by
// SuggestMgr::lcs() into its result table - confirm in the implementation.
enum { LCS_UP, LCS_LEFT, LCS_UPLEFT };
// Suggestion manager: declares the public entry points that produce spelling
// suggestions plus the private helpers that generate and score candidates.
// Only declarations are visible here; all definitions live in the .cxx file.
class SuggestMgr {
private:
// Copy constructor and copy assignment are declared private (and not defined in
// this header), which makes the class non-copyable for outside code.
SuggestMgr(const SuggestMgr&);
SuggestMgr& operator=(const SuggestMgr&);
private:
// Each of ckey / ctry is kept as a raw char buffer with its length and as a
// w_char vector. NOTE(review): presumably the KEY (keyboard layout) and TRY
// strings of the affix file - confirm in the constructor.
char* ckey;
size_t ckeyl;
std::vector<w_char> ckey_utf;
char* ctry;
size_t ctryl;
std::vector<w_char> ctry_utf;
// Affix manager handed to the constructor; ownership is not visible from here.
AffixMgr* pAMgr;
unsigned int maxSug;
struct cs_info* csconv;
int utf8;
int langnum;
int nosplitsugs;
int maxngramsugs;
int maxcpdsugs;
int complexprefixes;
public:
SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr);
~SuggestMgr();
// Public entry points. Results are appended to / returned through the given
// containers; exact semantics are defined in the implementation file.
void suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug);
void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr);
std::string suggest_morph(const std::string& word);
std::string suggest_gen(const std::vector<std::string>& pl, const std::string& pattern);
private:
// Candidate testing helpers.
void testsug(std::vector<std::string>& wlst,
const std::string& candidate,
int cpdsuggest,
int* timer,
clock_t* timelimit);
int checkword(const std::string& word, int, int*, clock_t*);
int check_forbidden(const char*, int);
// Candidate generators working on byte strings (const char*).
void capchars(std::vector<std::string>&, const char*, int);
int replchars(std::vector<std::string>&, const char*, int);
int doubletwochars(std::vector<std::string>&, const char*, int);
int forgotchar(std::vector<std::string>&, const char*, int);
int swapchar(std::vector<std::string>&, const char*, int);
int longswapchar(std::vector<std::string>&, const char*, int);
int movechar(std::vector<std::string>&, const char*, int);
int extrachar(std::vector<std::string>&, const char*, int);
int badcharkey(std::vector<std::string>&, const char*, int);
int badchar(std::vector<std::string>&, const char*, int);
int twowords(std::vector<std::string>&, const char*, int);
// The *_utf counterparts take a w_char array together with its length.
void capchars_utf(std::vector<std::string>&, const w_char*, int wl, int);
int doubletwochars_utf(std::vector<std::string>&, const w_char*, int wl, int);
int forgotchar_utf(std::vector<std::string>&, const w_char*, int wl, int);
int extrachar_utf(std::vector<std::string>&, const w_char*, int wl, int);
int badcharkey_utf(std::vector<std::string>&, const w_char*, int wl, int);
int badchar_utf(std::vector<std::string>&, const w_char*, int wl, int);
int swapchar_utf(std::vector<std::string>&, const w_char*, int wl, int);
int longswapchar_utf(std::vector<std::string>&, const w_char*, int, int);
int movechar_utf(std::vector<std::string>&, const w_char*, int, int);
int mapchars(std::vector<std::string>&, const char*, int);
int map_related(const char*,
std::string&,
int,
std::vector<std::string>& wlst,
int,
const std::vector<mapentry>&,
int*,
clock_t*);
// Similarity helpers, overloaded for w_char vectors and for byte strings.
// `opt` presumably takes the NGRAM_* bits defined above - TODO confirm.
int ngram(int n, const std::vector<w_char>& su1,
const std::vector<w_char>& su2, int opt);
int ngram(int n, const std::string& s1, const std::string& s2, int opt);
int mystrlen(const char* word);
int leftcommonsubstring(const std::vector<w_char>& su1,
const std::vector<w_char>& su2);
int leftcommonsubstring(const char* s1, const char* s2);
int commoncharacterpositions(const char* s1, const char* s2, int* is_swap);
void bubblesort(char** rwd, char** rwd2, int* rsc, int n);
void lcs(const char* s, const char* s2, int* l1, int* l2, char** result);
int lcslen(const char* s, const char* s2);
int lcslen(const std::string& s, const std::string& s2);
std::string suggest_hentry_gen(hentry* rv, const char* pattern);
};
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,72 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef W_CHAR_HXX_
#define W_CHAR_HXX_
#include <string>
// One UCS-2 code unit stored as two separate bytes: `l` is the low byte and
// `h` the high byte. When the GCC macro is defined the struct is additionally
// marked packed.
#ifndef GCC
struct w_char {
#else
struct __attribute__((packed)) w_char {
#endif
    unsigned char l;
    unsigned char h;

    // Orders by the 16-bit value (h << 8) + l: the high byte decides first,
    // the low byte breaks ties.
    friend bool operator<(const w_char a, const w_char b) {
        if (a.h != b.h) {
            return a.h < b.h;
        }
        return a.l < b.l;
    }

    // Equal when both bytes match.
    friend bool operator==(const w_char a, const w_char b) {
        return a.l == b.l && a.h == b.h;
    }

    // Different when at least one byte differs.
    friend bool operator!=(const w_char a, const w_char b) {
        return a.l != b.l || a.h != b.h;
    }
};
// One replacement rule: the pattern to look for and up to four output strings,
// one slot per variant listed in the comment on `outstrings`.
// NOTE(review): "med, ini, fin, isol" presumably stands for the medial, initial,
// final and isolated position of the pattern within a word - confirm with the
// code that fills the table.
struct replentry {
std::string pattern;
std::string outstrings[4]; // med, ini, fin, isol
};
#endif

View File

@ -1,6 +0,0 @@
BasedOnStyle: LLVM
IndentWidth: 8
UseTab: ForIndentation
BreakBeforeBraces: Stroustrup
PointerAlignment: Left
AlwaysBreakTemplateDeclarations: true

View File

@ -1,378 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#include "aff_manager.hxx"
#include "string_utils.hxx"
#include <algorithm>
#include <iostream>
#include <sstream>
#include <unordered_map>
namespace hunspell {
using namespace std;
namespace {
// Handles one line of a "counted table" command (header line announcing the
// number of entries, followed by that many entry lines).
//
// in            stream positioned just after the command word
// command       command name; used as the key into `counts` and in the warning
// counts        remaining-entry counter per command, shared across lines
// vec           destination for parsed entries
// parseLineFunc callable (istream&, T&) that fills one entry from the stream
//
// The first time a command is seen the announced count is read and stored
// (0 when it is missing or negative). Later lines each consume one slot: the
// entry is parsed in place and dropped again if the stream failed. Once the
// slots are used up, further lines only produce a warning on stderr.
template <class T, class Func>
auto parse_vector_of_T(istream& in, const string& command,
                       unordered_map<string, int>& counts, vector<T>& vec,
                       Func parseLineFunc) -> void
{
    const auto slot = counts.find(command);

    // Header line: remember how many entries were announced.
    if (slot == counts.end()) {
        int announced;
        in >> announced;
        counts[command] = (in && announced >= 0) ? announced : 0;
        return;
    }

    int& remaining = slot->second;

    // More entry lines than the header promised.
    if (remaining == 0) {
        cerr << "Hunspell warning: extra entries of " << command << '\n';
        return;
    }

    // Regular entry line: parse into a fresh element, roll back on failure.
    vec.emplace_back();
    parseLineFunc(in, vec.back());
    if (in.fail()) {
        vec.pop_back();
    }
    --remaining;
}
// Reads one flag token from `in` and decodes it according to flag type `t`.
//
// in  stream positioned at the flags; expected to actually contain flags
// t   encoding of the flags (see flag_type_t)
// cv  UTF-8 -> UCS-2 converter, only used for utf8_flag. Flags can only be in
//     the BMP, hence 16-bit code units.
//
// Returns the decoded flags, one char16_t per flag. If there are no flags in
// the stream (e.g. the stream is at eof) or the format of the flags is
// incorrect, the stream failbit will be set.
// NOTE(review): error behaviour of cv.from_bytes on malformed UTF-8 depends on
// utf8_to_ucs2_converter, which is declared elsewhere - confirm.
auto decode_flags(std::istream& in, flag_type_t t, utf8_to_ucs2_converter& cv)
    -> u16string
{
    string s;
    u16string ret;
    switch (t) {
    case single_char_flag:
        // every byte of the token is one flag
        in >> s;
        ret.resize(s.size());
        transform(s.begin(), s.end(), ret.begin(),
                  cast_lambda<unsigned char>());
        break;
    case double_char_flag: {
        // every pair of bytes forms one flag: first byte high, second byte low
        in >> s;
        auto i = s.begin();
        auto e = s.end();
        if (s.size() & 1) {
            --e; // odd length: stop the pair loop before the last byte
        }
        for (; i != e; i += 2) {
            char16_t c1 = (unsigned char)*i;
            char16_t c2 = (unsigned char)*(i + 1);
            ret.push_back((c1 << 8) | c2);
        }
        if (i != s.end()) {
            // lone trailing byte becomes a flag on its own
            ret.push_back((unsigned char)*i);
        }
        break;
    }
    case number_flag: {
        // comma separated decimal numbers: "12,345,6"
        unsigned short flag;
        if (in >> flag) {
            ret.push_back(flag);
        }
        else {
            // err no flag at all
            cerr << "Hunspell error: missing flag\n";
            break;
        }
        // peek can set failbit
        while (in.good() && in.peek() == ',') {
            in.get();
            if (in >> flag) {
                ret.push_back(flag);
            }
            else {
                // err, comma and no number after that
                cerr << "Hunspell error: long flag, no number "
                        "after comma\n";
                break;
            }
        }
        break;
    }
    case utf8_flag:
        // The token has to be extracted first; without this read the
        // converter was always handed an empty string, so UTF-8 flags
        // decoded to nothing and the flag text stayed in the stream.
        in >> s;
        ret = cv.from_bytes(s);
        break;
    }
    return ret;
}
// Parses the remainder of one PFX/SFX line.
//
// ss        stream positioned just after the "PFX"/"SFX" word
// command   the command word; the two bytes of the affix flag are appended to
//           it so it can serve as a per-flag key into `cmd_affix`
// vec       destination (prefixes or suffixes) for parsed affix entries
// cmd_affix per-flag state: (cross product allowed, entries still expected)
// cv        UTF-8 -> UCS-2 converter forwarded to the flag decoders
// thiss     affix data providing the flag decoders
//
// The first line seen for a flag is the header ("<flag> Y|N <count>"); each
// following line for that flag is one entry
// ("<flag> <strip> <affix>[/<flags>] <condition> [morph fields...]").
auto parse_affix(istream& ss, string& command, vector<aff_data::affix>& vec,
                 unordered_map<string, pair<bool, int>>& cmd_affix,
                 utf8_to_ucs2_converter& cv, aff_data& thiss) -> void
{
    char16_t f = thiss.decode_single_flag(ss, cv);
    if (f == 0) {
        // err
        return;
    }
    char f1 = f & 0xff;
    char f2 = (f >> 8) & 0xff;
    command.push_back(f1);
    command.push_back(f2);
    auto dat = cmd_affix.find(command);
    // note: the current affix parser does not allow the same flag
    // to be used once with cross product and again without;
    // one flag is tied to one cross product value
    if (dat == cmd_affix.end()) {
        // Both fields get defaults: when the line ends right after the
        // flag the extraction fails and leaves cross_char untouched, and
        // it is compared against 'Y' regardless.
        char cross_char = 'N'; // 'Y' or 'N'
        int cnt = 0;
        ss >> cross_char >> cnt;
        bool cross = cross_char == 'Y';
        if (!ss || cnt < 0) {
            cnt = 0; // err
        }
        cmd_affix[command] = make_pair(cross, cnt);
    }
    else if (dat->second.second) {
        vec.emplace_back();
        auto& elem = vec.back();
        elem.flag = f;
        elem.cross_product = dat->second.first;
        ss >> elem.stripping;
        // the affix text may carry continuation flags after a slash
        if (read_to_slash_or_space(ss, elem.affix)) {
            elem.new_flags = thiss.decode_flags(ss, cv);
        }
        ss >> elem.condition;
        if (ss.fail()) {
            // incomplete entry: discard it, but it still uses up a slot
            vec.pop_back();
        }
        else {
            parse_morhological_fields(ss,
                                      elem.morphological_fields);
        }
        dat->second.second--;
    }
    else {
        // substr(0, 3) strips the two flag bytes appended above
        cerr << "Hunspell warning: extra entries of "
             << command.substr(0, 3) << '\n';
    }
}
}
// Decodes the flags found at the current position of `in`, using this object's
// flag_type as the encoding. Thin forwarder to the free function
// hunspell::decode_flags; `cv` is passed through unchanged.
auto aff_data::decode_flags(istream& in, utf8_to_ucs2_converter& cv) const
-> u16string
{
return hunspell::decode_flags(in, flag_type, cv);
}
// Decodes the flags at the current stream position and returns only the first
// one; yields 0 when no flag could be decoded.
auto aff_data::decode_single_flag(istream& in, utf8_to_ucs2_converter& cv) const
    -> char16_t
{
    const u16string decoded = decode_flags(in, cv);
    return decoded.empty() ? char16_t(0) : decoded.front();
}
// Parses an affix (.aff) file from `in` into this object.
//
// The stream is read line by line. Empty lines and lines whose first
// non-blank character is '#' are skipped. The first word of every other line
// is upper-cased (ASCII) and dispatched through the lookup tables built below,
// each of which maps a command name to the member that receives its value.
// Commands that match no table and no explicit branch are silently ignored.
//
// Returns true when the whole stream was consumed (getline stopped at eof).
// Per-line parse errors are only reported on stderr; they do not change the
// return value.
//
// Note: flag_type is reset to single_char_flag at the start; no other member
// is reset here, and the boolean options are only ever set to true.
auto aff_data::parse(std::istream& in) -> bool
{
// commands taking a single string argument
unordered_map<string, string*> command_strings = {
{"SET", &encoding}, {"LANG", &language_code},
{"IGNORE", &ignore_chars},
{"KEY", &keyboard_layout}, {"TRY", &try_chars},
{"WORDCHARS", &wordchars}};
// commands without argument: their presence switches the option on
unordered_map<string, bool*> command_bools = {
{"COMPLEXPREFIXES", &complex_prefixes},
{"ONLYMAXDIFF", &only_max_diff},
{"NOSPLITSUGS", &no_split_suggestions},
{"SUGSWITHDOTS", &suggest_with_dots},
{"FORBIDWARN", &forbid_warn},
{"COMPOUNDMORESUFFIXES", &compound_more_suffixes},
{"CHECKCOMPOUNDDUP", &compound_check_up},
{"CHECKCOMPOUNDREP", &compound_check_rep},
{"CHECKCOMPOUNDCASE", &compound_check_case},
{"CHECKCOMPOUNDTRIPLE", &compound_check_triple},
{"SIMPLIFIEDTRIPLE", &compound_simplified_triple},
{"FULLSTRIP", &fullstrip},
{"CHECKSHARPS", &checksharps}};
// counted tables whose entries are single strings
unordered_map<string, vector<string>*> command_vec_str = {
{"BREAK", &break_patterns},
{"MAP", &map_related_chars}, // maybe add special parsing code
{"COMPOUNDRULE", &compound_rules}};
// commands taking a single small integer
unordered_map<string, short*> command_shorts = {
{"MAXCPDSUGS", &max_compound_suggestions},
{"MAXNGRAMSUGS", &max_ngram_suggestions},
{"MAXDIFF", &max_diff_factor},
{"COMPOUNDMIN", &compoud_minimum},
{"COMPOUNDWORDMAX", &compound_word_max}};
// counted tables whose entries are pairs of strings
unordered_map<string, vector<pair<string, string>>*> command_vec_pair =
{{"REP", &replacements},
{"PHONE", &phonetic_replacements},
{"ICONV", &input_conversion},
{"OCONV", &output_conversion}};
// commands taking a single flag, decoded according to flag_type
unordered_map<string, char16_t*> command_flag = {
{"NOSUGGEST", &nosuggest_flag},
{"WARN", &warn_flag},
{"COMPOUNDFLAG", &compound_flag},
{"COMPOUNDBEGIN", &compound_begin_flag},
{"COMPOUNDLAST", &compound_last_flag},
{"COMPOUNDMIDDLE", &compound_middle_flag},
{"ONLYINCOMPOUND", &compound_onlyin_flag},
{"COMPOUNDPERMITFLAG", &compound_permit_flag},
{"COMPOUNDFORBIDFLAG", &compound_forbid_flag},
{"COMPOUNDROOT", &compound_root_flag},
{"FORCEUCASE", &compound_force_uppercase},
{"CIRCUMFIX", &circumfix_flag},
{"FORBIDDENWORD", &forbiddenword_flag},
{"KEEPCASE", &keepcase_flag},
{"NEEDAFFIX", &need_affix_flag},
{"SUBSTANDARD", &substandard_flag}};
// keeps count for each vector
unordered_map<string, int> cmd_with_vec_cnt;
// per affix flag: (cross product, remaining entries), see parse_affix
unordered_map<string, pair<bool, int>> cmd_affix;
utf8_to_ucs2_converter cv;
string line;
string command;
int line_number = 0;
// default flag encoding until a FLAG command says otherwise
flag_type = single_char_flag;
while (getline(in, line)) {
line_number++;
istringstream ss(line);
ss >> ws;
if (ss.eof() || ss.peek() == '#') {
continue; // skip comment or empty lines
}
ss >> command;
toupper_ascii(command);
ss >> ws;
if (command == "PFX" || command == "SFX") {
// parse_affix appends the flag bytes to `command`
auto& vec = command[0] == 'P' ? prefixes : suffixes;
parse_affix(ss, command, vec, cmd_affix, cv, *this);
}
else if (command_strings.count(command)) {
auto& str = *command_strings[command];
ss >> str;
// only the encoding name is normalised to upper case
if (&str == &encoding) {
toupper_ascii(str);
}
}
else if (command_bools.count(command)) {
*command_bools[command] = true;
}
else if (command_shorts.count(command)) {
ss >> *command_shorts[command];
}
else if (command_flag.count(command)) {
*command_flag[command] = decode_single_flag(ss, cv);
}
else if (command_vec_str.count(command)) {
auto& vec = *command_vec_str[command];
auto func = [&](istream& in, string& p) { in >> p; };
parse_vector_of_T(ss, command, cmd_with_vec_cnt, vec,
func);
}
else if (command_vec_pair.count(command)) {
auto& vec = *command_vec_pair[command];
auto func = [&](istream& in, pair<string, string>& p) {
in >> p.first >> p.second;
};
parse_vector_of_T(ss, command, cmd_with_vec_cnt, vec,
func);
}
else if (command == "FLAG") {
// selects how flags are encoded from here on; an unrecognised
// value leaves flag_type unchanged
string p;
ss >> p;
toupper_ascii(p);
if (p == "LONG")
flag_type = double_char_flag;
else if (p == "NUM")
flag_type = number_flag;
else if (p == "UTF-8")
flag_type = utf8_flag;
}
else if (command == "AF") {
// flag aliases: each entry is a complete flag set
auto& vec = flag_aliases;
auto func = [&](istream& inn, u16string& p) {
p = decode_flags(inn, cv);
};
parse_vector_of_T(ss, command, cmd_with_vec_cnt, vec,
func);
}
else if (command == "AM") {
// morphological aliases: each entry is a list of fields
auto& vec = morphological_aliases;
parse_vector_of_T(ss, command, cmd_with_vec_cnt, vec,
parse_morhological_fields);
}
else if (command == "CHECKCOMPOUNDPATTERN") {
// entry: endchars[/flag] beginchars[/flag] [replacement]
auto& vec = compound_check_patterns;
auto func = [&](istream& in,
compound_check_pattern& p) {
if (read_to_slash_or_space(in, p.end_chars)) {
p.end_flag = decode_single_flag(in, cv);
}
if (read_to_slash_or_space(in, p.begin_chars)) {
p.begin_flag =
decode_single_flag(in, cv);
}
if (in.fail()) {
return;
}
// the replacement is optional, so a failed read is cleared again
in >> p.replacement;
reset_failbit_istream(in);
};
parse_vector_of_T(ss, command, cmd_with_vec_cnt, vec,
func);
}
else if (command == "COMPOUNDSYLLABLE") {
ss >> compound_syllable_max >> compound_syllable_vowels;
}
else if (command == "SYLLABLENUM") {
compound_syllable_num = decode_flags(ss, cv);
}
// any extraction failure on this line is reported, parsing continues
if (ss.fail()) {
cerr << "Hunspell aff error in line " << line_number
<< ": " << line << endl;
}
}
return in.eof(); // success if we reached eof
}
}

View File

@ -1,142 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#ifndef HUNSPELL_AFF_MANAGER_HXX
#define HUNSPELL_AFF_MANAGER_HXX
#include "string_utils.hxx"
#include <istream>
#include <string>
#include <utility>
#include <vector>
namespace hunspell {
// How flags are written in the affix and dictionary files: one byte per flag,
// two bytes per flag, comma separated decimal numbers, or UTF-8 characters.
enum flag_type_t { single_char_flag, double_char_flag, number_flag, utf8_flag };
// In-memory representation of an affix (.aff) file, filled by parse().
//
// Every scalar member carries a default member initializer. parse() only
// assigns the options that actually occur in the file (and only ever sets the
// boolean ones to true), so without the initializers a default-initialized
// object would hold indeterminate values for every option the file omits.
struct aff_data {
    using string = std::string;
    using u16string = std::u16string;
    using istream = std::istream;
    template <class T>
    using vector = std::vector<T>;
    template <class T, class U>
    using pair = std::pair<T, U>;

    // general options
    string encoding;
    flag_type_t flag_type = single_char_flag;
    bool complex_prefixes = false;
    string language_code;
    string ignore_chars;
    vector<u16string> flag_aliases;
    vector<vector<string>> morphological_aliases;

    // suggestion options
    string keyboard_layout;
    string try_chars;
    char16_t nosuggest_flag = 0;
    short max_compound_suggestions = 0;
    short max_ngram_suggestions = 0;
    short max_diff_factor = 0;
    bool only_max_diff = false;
    bool no_split_suggestions = false;
    bool suggest_with_dots = false;
    vector<pair<string, string>> replacements;
    vector<string> map_related_chars;
    vector<pair<string, string>> phonetic_replacements;
    char16_t warn_flag = 0;
    bool forbid_warn = false;

    // compounding options
    vector<string> break_patterns;
    vector<string> compound_rules;
    short compoud_minimum = 0; // (sic) name kept, it is part of the interface
    char16_t compound_flag = 0;
    char16_t compound_begin_flag = 0;
    char16_t compound_last_flag = 0;
    char16_t compound_middle_flag = 0;
    char16_t compound_onlyin_flag = 0;
    char16_t compound_permit_flag = 0;
    char16_t compound_forbid_flag = 0;
    bool compound_more_suffixes = false;
    char16_t compound_root_flag = 0;
    short compound_word_max = 0;
    bool compound_check_up = false;
    bool compound_check_rep = false;
    bool compound_check_case = false;
    bool compound_check_triple = false;
    bool compound_simplified_triple = false;

    // One CHECKCOMPOUNDPATTERN entry. Left without member initializers so
    // it stays an aggregate in every language mode.
    struct compound_check_pattern {
        string end_chars;
        char16_t end_flag;
        string begin_chars;
        char16_t begin_flag;
        string replacement;
    };
    vector<compound_check_pattern> compound_check_patterns;
    char16_t compound_force_uppercase = 0;
    short compound_syllable_max = 0;
    string compound_syllable_vowels;
    u16string compound_syllable_num;

    // affix creation
    // One PFX/SFX entry. Left without member initializers so it stays an
    // aggregate in every language mode.
    struct affix {
        char16_t flag;
        bool cross_product;
        string stripping;
        string affix;
        u16string new_flags;
        string condition;
        vector<string> morphological_fields;
    };
    vector<affix> prefixes;
    vector<affix> suffixes;

    // others
    char16_t circumfix_flag = 0;
    char16_t forbiddenword_flag = 0;
    bool fullstrip = false;
    char16_t keepcase_flag = 0;
    vector<pair<string, string>> input_conversion;
    vector<pair<string, string>> output_conversion;
    char16_t need_affix_flag = 0;
    char16_t substandard_flag = 0;
    string wordchars;
    bool checksharps = false;

    // methods

    // Parses a whole affix file; true when the stream was read up to eof.
    auto parse(std::istream& in) -> bool;

    // Decodes all flags at the stream position using this object's flag_type.
    auto decode_flags(istream& in, utf8_to_ucs2_converter& cv) const
        -> u16string;
    // u16string decode_flags(istream& in);

    // Like decode_flags but returns only the first flag, 0 if there is none.
    auto decode_single_flag(istream& in, utf8_to_ucs2_converter& cv) const
        -> char16_t;
    // char16_t decode_single_flag(istream& in);
};
}
#endif

View File

@ -1 +0,0 @@
# Reformat every .cxx and .hxx file in the current directory in place (-i),
# using the style read from the .clang-format file (-style=file).
clang-format -style=file -i *.cxx *.hxx

View File

@ -1,100 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#include "dic_manager.hxx"
#include "string_utils.hxx"
#include <algorithm>
#include <iostream>
#include <limits>
#include <sstream>
namespace hunspell {
using namespace std;
// Parses a dictionary (.dic) file from `in` into `words` and `morph_data`.
//
// in   dictionary stream; its first line holds the approximate word count
// aff  already parsed affix data; supplies the flag encoding and, when
//      present, the flag alias table (AF) that entries refer to by number
//
// Each following line is "word[/flags] [morphological fields...]". With an
// alias table the part after the slash is a 1-based index into
// aff.flag_aliases instead of literal flags. Flags of repeated words are
// merged, and finally every flag string is sorted and de-duplicated.
//
// Returns false when the first line does not start with a number, otherwise
// true when the whole stream was consumed. Malformed lines are skipped.
auto dic_data::parse(std::istream& in, const aff_data& aff) -> bool
{
    size_t approximate_size;
    if (in >> approximate_size) {
        words.reserve(approximate_size);
        // drop the rest of the count line
        in.ignore(numeric_limits<streamsize>::max(), '\n');
    }
    else {
        return false;
    }
    string line;
    string word;
    vector<string> morphs;
    u16string flags;
    istringstream ss;
    utf8_to_ucs2_converter cv;
    while (getline(in, line)) {
        // reuse the stream object: new content, cleared state bits
        ss.str(line);
        ss.clear();
        word.clear();
        flags.clear();
        morphs.clear();
        if (line.find('/') == line.npos) {
            // no slash, treat word until first space
            ss >> word;
            if (ss.fail()) {
                // probably all whitespace
                continue;
            }
        }
        else { // slash found, word until slash
            read_to_slash(ss, word);
            if (ss.fail() || word.empty()) {
                continue;
            }
            if (aff.flag_aliases.empty()) {
                flags = aff.decode_flags(ss, cv);
            }
            else {
                size_t flag_alias_idx;
                ss >> flag_alias_idx;
                // Aliases are numbered from 1. Index 0 has to be
                // rejected as well: it passes the upper-bound test
                // and `0 - 1` would wrap around to an out-of-range
                // subscript.
                if (ss.fail() || flag_alias_idx == 0 ||
                    flag_alias_idx > aff.flag_aliases.size()) {
                    continue;
                }
                flags = aff.flag_aliases[flag_alias_idx - 1];
            }
        }
        parse_morhological_fields(ss, morphs);
        words[word].append(flags);
        if (morphs.size()) {
            auto& vec = morph_data[word];
            vec.insert(vec.end(), morphs.begin(), morphs.end());
        }
    }
    for (auto& wd : words) {
        // sort unique flag vectors
        auto& vec = wd.second;
        sort(vec.begin(), vec.end());
        vec.erase(unique(vec.begin(), vec.end()), vec.end());
    }
    return in.eof(); // success if we reached eof
}
}

View File

@ -1,52 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#ifndef HUNSPELL_DIC_MANAGER_HXX
#define HUNSPELL_DIC_MANAGER_HXX
#include "aff_manager.hxx"
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>
namespace hunspell {
// In-memory representation of a dictionary (.dic) file, filled by parse().
struct dic_data {
// word and flag vector
// efficient for short flag vectors
// for long flag vectors like in Korean dict
// we should keep pointers to the string in the affix aliases vector
// for now we will leave it like this
std::unordered_map<std::string, std::u16string> words;
// word and morphological data
// we keep them separate because morph data is generally absent
std::unordered_map<std::string, std::vector<std::string>> morph_data;
// methods
// parses the dic data to hashtable
// `aff` is the affix data belonging to this dictionary; returns true when the
// stream was read up to eof
auto parse(std::istream& in, const aff_data& aff) -> bool;
};
}
#endif

View File

@ -1,395 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#include "dict_finder.hxx"
#include <algorithm>
#include <array>
#include <iterator>
#include <sstream>
#include <unordered_set>
#include <utility>
#if !defined(_WIN32) && (defined(__unix__) || defined(__unix) || \
(defined(__APPLE__) && defined(__MACH__)))
#include <unistd.h>
#ifdef _POSIX_VERSION
#include <dirent.h>
#include <glob.h>
#include <sys/stat.h>
#include <sys/types.h>
#endif
const char PATHSEP = ':';
#elif defined(_WIN32)
#ifdef __MINGW32__
#include <dirent.h>
//#include <glob.h> //not present in mingw-w64. present in vanilla mingw
#include <sys/stat.h>
#include <sys/types.h>
#endif //__MINGW32__
#endif
using namespace std;
namespace hunspell {
// Splits `s` at every occurrence of `sep` and writes the pieces to `out`, in
// order. Empty pieces between adjacent separators are kept; a separator at the
// very end does not produce a trailing empty piece, and an empty input
// produces nothing. Returns the advanced output iterator.
template <class CharT, class OutIt>
auto split(const basic_string<CharT>& s, CharT sep, OutIt out) -> OutIt
{
    basic_istringstream<CharT> stream(s);
    for (basic_string<CharT> piece; getline(stream, piece, sep); ++out) {
        *out = piece;
    }
    return out;
}
// Character that separates the entries of a search path list: ';' on Windows,
// ':' everywhere else. Used to split the DICPATH environment variable.
#ifdef _WIN32
const char PATHSEP = ';';
#else
const char PATHSEP = ':';
#endif
// Writes the default dictionary search directories to `out`, in priority
// order, and returns the advanced iterator:
//   1. the current directory
//   2. every entry of the DICPATH environment variable
//   3. /mingw64/share/hunspell (added unconditionally)
//   4. POSIX: share/hunspell, share/myspell and share/myspell/dicts below
//      $HOME/.local/ (or "/" without HOME), /usr/local/ and /usr/
//   5. macOS: $HOME/Library/Spelling and /Library/Spelling
//   6. Windows: %LOCALAPPDATA%/hunspell and %PROGRAMDATA%/hunspell
// Directories are not checked for existence.
template <class OutIt>
auto get_default_search_directories(OutIt out) -> OutIt
{
    *out++ = ".";
    if (char* dicpath = getenv("DICPATH")) {
        out = split(string(dicpath), PATHSEP, out);
    }
    *out++ = "/mingw64/share/hunspell";
    char* home = getenv("HOME");
#ifdef _POSIX_VERSION
    const string roots[] = {home ? string(home) + "/.local/" : "/",
                            "/usr/local/", "/usr/"};
    const char* const subdirs[] = {"share/hunspell", "share/myspell",
                                   "share/myspell/dicts"};
    // sub-directory is the outer loop: all roots for share/hunspell first
    for (const char* subdir : subdirs) {
        for (const string& root : roots) {
            *out = root + subdir;
            ++out;
        }
    }
#endif
#if defined(__APPLE__) && defined(__MACH__)
    const string spelling = "/Library/Spelling";
    if (home) {
        *out++ = home + spelling;
    }
    *out++ = spelling;
#endif
#ifdef _WIN32
    char* const data_roots[] = {getenv("LOCALAPPDATA"),
                                getenv("PROGRAMDATA")};
    for (char* root : data_roots) {
        if (root == nullptr) {
            continue;
        }
        *out++ = string(root) + "/hunspell";
    }
#endif
    return out;
}
// Convenience overload: collects the default search directories produced by
// the iterator-based overload into a vector and returns it.
auto get_default_search_directories() -> vector<string>
{
vector<string> v;
get_default_search_directories(back_inserter(v));
return v;
}
#ifdef _POSIX_VERSION
// Owning wrapper around POSIX glob(3): runs a pattern, exposes the matched
// paths as a range and releases them with globfree() on destruction.
class Globber {
private:
    glob_t g;
    int ret; // result of the most recent ::glob call, 0 on success

public:
    // Runs `pattern` immediately; check the outcome through
    // copy_glob_paths() or by iterating.
    Globber(const char* pattern) : g{}
    {
        ret = ::glob(pattern, 0, nullptr, &g);
    }
    Globber(const string& pattern) : Globber(pattern.c_str()) {}

    // The object owns the glob_t buffers; a copy would call globfree() on
    // the same buffers twice, so copying is disabled.
    Globber(const Globber&) = delete;
    Globber& operator=(const Globber&) = delete;

    // Discards the previous result and runs a new pattern.
    // Returns true when ::glob reported success.
    auto glob(const char* pattern) -> bool
    {
        globfree(&g);
        ret = ::glob(pattern, 0, nullptr, &g);
        return ret == 0;
    }
    auto glob(const string& pattern) -> bool
    {
        return glob(pattern.c_str());
    }

    // Range over the matched paths of the most recent call.
    auto begin() -> const char* const* { return g.gl_pathv; }
    auto end() -> const char* const* { return begin() + g.gl_pathc; }

    // Copies the matched paths to `out`, but only if the most recent
    // ::glob call succeeded. Returns the advanced iterator.
    template <class OutIt>
    auto copy_glob_paths(OutIt out) -> OutIt
    {
        if (ret == 0) {
            out = copy(begin(), end(), out);
        }
        return out;
    }
    ~Globber() { globfree(&g); }
};
#else
// unimplemented
// Stand-in with the same interface for platforms without glob(3): never
// matches anything.
struct Globber {
    Globber(const char* pattern) {}
    Globber(const string& pattern) {}
    auto glob(const char* pattern) -> bool { return false; }
    auto glob(const string& pattern) -> bool { return false; }
    auto begin() -> char** { return nullptr; }
    auto end() -> char** { return nullptr; }
    template <class OutIt>
    auto copy_glob_paths(OutIt out) -> OutIt
    {
        return out;
    }
};
#endif
// Writes the directories in which Mozilla Firefox keeps dictionaries to `out`
// and returns the advanced iterator.
//
// POSIX:   the two system-wide firefox/dictionaries folders, each only if it
//          exists as a real directory (lstat does not follow symlinks, so a
//          symlink is skipped), followed by the per-profile extension
//          dictionaries below $HOME/.mozilla/firefox.
// Windows: "Mozilla Firefox/dictionaries" below both Program Files folders
//          (not checked for existence), followed by the per-profile extension
//          dictionaries below %APPDATA%.
// Elsewhere nothing is written.
template <class OutIt>
auto get_mozilla_directories(OutIt out) -> OutIt
{
#ifdef _POSIX_VERSION
    // add Mozilla linux global directory
    const char* const global_dirs[] = {"/usr/local/lib/firefox/dictionaries",
                                       "/usr/lib/firefox/dictionaries"};
    for (const char* candidate : global_dirs) {
        struct stat info;
        if (lstat(candidate, &info) != 0) {
            continue;
        }
        if (!S_ISDIR(info.st_mode)) {
            continue; // if SYMLINK do not add
        }
        *out++ = candidate;
    }

    // add Mozilla linux user directory
    const char* user_home = getenv("HOME");
    if (user_home == nullptr) {
        return out;
    }
    Globber profiles(string(user_home) +
                     "/.mozilla/firefox/*/extensions/*/dictionaries");
    return profiles.copy_glob_paths(out);
#elif defined(_WIN32)
    // add Mozilla windows global directory
    char* const program_roots[] = {getenv("PROGRAMFILES"),
                                   getenv("PROGRAMFILES(x86)")};
    for (char* root : program_roots) {
        if (root == nullptr) {
            continue;
        }
        *out++ = string(root) + "/Mozilla Firefox/dictionaries";
    }

    // add Mozilla windows local directory
    const char* app_data = getenv("APPDATA");
    if (app_data == nullptr) {
        return out;
    }
    Globber profiles(
        string(app_data) +
        "/Mozilla/Firefox/Profiles/*/extensions/*/dictionaries");
    return profiles.copy_glob_paths(out);
#else
    return out;
#endif
}
// Convenience overload: appends the Mozilla dictionary directories to `out`
// by forwarding to the iterator-based overload.
auto get_mozilla_directories(vector<string>& out) -> void
{
get_mozilla_directories(back_inserter(out));
}
// Writes the directories in which LibreOffice keeps dictionaries to `out` and
// returns the advanced iterator.
//
// First the system-wide "share/extensions/dict-*" folders of the known
// installation prefixes are globbed. Then the per-user extension cache
// ($HOME/.config/libreoffice on POSIX, %APPDATA%/libreoffice on Windows) is
// searched: every "dictionaries" folder inside an unpacked .oxt is added, and
// so is the folder of every .aff file lying directly inside an .oxt.
// On other platforms nothing is written.
template <class OutIt>
auto get_libreoffice_directories(OutIt out) -> OutIt
{
    string lo_user_glob;
#ifdef _POSIX_VERSION
    // add Libreoffice linux global directories
    array<const char*, 3> prefixes = {"/usr/local/lib/libreoffice",
                                      "/usr/lib/libreoffice",
                                      "/opt/libreoffice*"};
    for (auto& p : prefixes) {
        Globber g(string(p) + "/share/extensions/dict-*");
        out = g.copy_glob_paths(out);
    }
    // add Libreoffice linux local
    char* home = getenv("HOME");
    if (home == nullptr) {
        return out;
    }
    lo_user_glob = home;
    lo_user_glob += "/.config/libreoffice/?/user/uno_packages/cache"
                    "/uno_packages/*/*.oxt/";
#elif defined(_WIN32)
    // add Libreoffice windows global directories
    array<char*, 2> prefixes = {getenv("PROGRAMFILES"),
                                getenv("PROGRAMFILES(x86)")};
    for (auto& p : prefixes) {
        if (p == nullptr) {
            continue;
        }
        // The separator between the Program Files folder and the product
        // folder was missing, which glued both names together.
        // NOTE(review): the folder name "Libre Office ?" is kept as it
        // was; confirm it matches the real installation directory name.
        Globber g(string(p) +
                  "/Libre Office ?/share/extensions/dict-*");
        out = g.copy_glob_paths(out);
    }
    char* home = getenv("APPDATA");
    if (home == nullptr) {
        return out;
    }
    lo_user_glob = home;
    lo_user_glob += "/libreoffice/?/user/uno_packages/cache"
                    "/uno_packages/*/*.oxt/";
#else
    return out;
#endif
    // finish adding LO user directory dicts (linux and windows)
    Globber g(lo_user_glob + "dictionaries");
    out = g.copy_glob_paths(out);
    // .aff files lying directly in the .oxt folder: add their folder
    g.glob(lo_user_glob + "*.aff");
    string path_str;
    for (auto& path : g) {
        path_str = path;
        auto last_slash = path_str.rfind('/');
        if (last_slash == path_str.npos) {
            // cannot happen for matches of the pattern above, but
            // erase(npos) would throw
            continue;
        }
        path_str.erase(last_slash);
        *out = path_str;
        ++out;
    }
    return out;
}
// Convenience overload: appends the LibreOffice dictionary directories to
// `out` by forwarding to the iterator-based overload.
auto get_libreoffice_directories(std::vector<std::string>& out) -> void
{
get_libreoffice_directories(back_inserter(out));
}
#if defined(_POSIX_VERSION) || defined(__MINGW32__)
// Owning wrapper around a DIR* stream for listing the entries of a directory.
// Usage: open(), then next() until it returns false, reading entry_name()
// after each successful next(). The stream is closed on destruction.
class Directory {
    DIR* dp = nullptr;               // open stream, nullptr when closed
    struct dirent* ent_p = nullptr;  // entry delivered by the last next()

public:
    Directory() {}
    Directory(const Directory& d) = delete;
    void operator=(const Directory& d) = delete;

    // Opens `dirname`, closing a previously opened directory first.
    // Returns true on success.
    auto open(const string& dirname) -> bool
    {
        close();
        dp = opendir(dirname.c_str());
        return dp != nullptr;
    }

    // Advances to the next entry. Returns false at the end of the
    // directory, on a read error, or when no directory is open.
    // readdir() replaces the deprecated readdir_r(); the returned entry
    // belongs to this object's own stream.
    auto next() -> bool
    {
        if (dp == nullptr) {
            return false;
        }
        ent_p = readdir(dp);
        return ent_p != nullptr;
    }

    // Name of the current entry; only valid after next() returned true.
    auto entry_name() -> const char* { return ent_p->d_name; }

    // Closes the stream if one is open. Safe to call repeatedly; the
    // destructor relies on this for objects that were never opened or
    // whose open() failed, because closedir(nullptr) is not allowed.
    auto close() -> void
    {
        if (dp != nullptr) {
            (void)closedir(dp);
            dp = nullptr;
        }
        ent_p = nullptr;
    }
    ~Directory() { close(); }
};
#else
// Stand-in with the same interface for platforms without <dirent.h>: opening
// always fails and there are never any entries.
struct Directory {
    Directory() {}
    Directory(const Directory& d) = delete;
    void operator=(const Directory& d) = delete;
    auto open(const string& dirname) -> bool { return false; }
    auto next() -> bool { return false; }
    auto entry_name() -> const char* { return nullptr; }
    auto close() -> void {}
};
#endif
template <class OutIt>
auto search_dir_for_dicts(const string& dir, OutIt out) -> OutIt
{
Directory d;
if (d.open(dir) == false) {
return out;
}
unordered_set<string> dics;
string file_name;
while (d.next()) {
file_name = d.entry_name();
auto sz = file_name.size();
if (sz < 4) {
continue;
}
if (file_name.find(".dic", sz - 4) != file_name.npos) {
dics.insert(file_name);
file_name.resize(sz - 4);
file_name += ".aff";
if (dics.count(file_name)) {
file_name.resize(sz - 4);
auto full_path = dir + '/' + file_name;
*out = make_pair(file_name, full_path);
out++;
}
}
else if (file_name.find(".aff", sz - 4) != file_name.npos) {
dics.insert(file_name);
file_name.resize(sz - 4);
file_name += ".dic";
if (dics.count(file_name)) {
file_name.resize(sz - 4);
auto full_path = dir + '/' + file_name;
*out = make_pair(file_name, full_path);
out++;
}
}
}
return out;
}
/**
 * Runs search_dir_for_dicts() over every directory in `dirs`, in order, and
 * collects all reported (name, path) pairs into one vector.
 */
auto search_dirs_for_dicts(const vector<string>& dirs)
    -> vector<pair<string, string>>
{
	vector<pair<string, string>> found;
	for (auto it = dirs.begin(); it != dirs.end(); ++it) {
		search_dir_for_dicts(*it, back_inserter(found));
	}
	return found;
}
}

View File

@ -1,40 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#ifndef HUNSPELL_DIC_FINDER_HXX
#define HUNSPELL_DIC_FINDER_HXX
#include <string>
#include <utility>
#include <vector>
namespace hunspell {
// Presumably returns the directories that are searched for dictionaries by
// default -- TODO confirm against the implementation.
auto get_default_search_directories() -> std::vector<std::string>;
// Takes the directory list by reference; presumably appends the Mozilla
// dictionary directories to it -- TODO confirm against the implementation.
auto get_mozilla_directories(std::vector<std::string>& out) -> void;
// Appends the LibreOffice dictionary directories to the end of `out`.
auto get_libreoffice_directories(std::vector<std::string>& out) -> void;
// Scans every directory in `dirs` and returns one (name, path) pair for each
// dictionary for which both a ".dic" and an ".aff" file were found; `name` is
// the file name without extension and `path` is "<dir>/<name>".
auto search_dirs_for_dicts(const std::vector<std::string>& dirs)
-> std::vector<std::pair<std::string, std::string>>;
}
#endif

View File

@ -1,117 +0,0 @@
#include <string>
namespace hunspell {
// Result type returned by all spell* member functions of class hunspell.
// bad_word is the only value that does not denote an accepted word.
enum spell_result {
bad_word,
good_word,
affixed_good_word,
compound_good_word
};
// Spell-checker interface. The member functions differ only in the encoding
// of the input word and of the dictionary; the numbers in parentheses are
// used by the comments below to cross-reference the overloads.
class hunspell {
public:
using string = std::string;
using u16string = std::u16string;
private:
/* (0)
All the major work is done here.
(1) and (2) are the lowest level specializations.
The rest just do some conversions and delegate to them.
(1) will simply call this
with ConversionIterator set to string::iterator
(2) will call this with u8_u32 on the fly conversion iterator.
*/
template <class ConvIter>
auto spell(ConvIter start, ConvIter end, const string& s)
-> spell_result;
/**
(1) This should be called when the input and the dictionary
are in the same encoding and that encoding is a single-byte encoding.
*/
auto spell_singlechar_input_singlechar_dict(const string& word)
-> spell_result;
/**
(2) This should be called when the input and the dictionary
are in the same encoding and that encoding is UTF-8.
*/
auto spell_u8_input_u8_dict(const string& word) -> spell_result;
/*
(3) This should be called when the input is a UTF-8 string
and the dictionary is in a single-byte encoding. A lossy conversion
from UTF-8 to single byte should happen, and then (1) should be called.
*/
auto spell_u8_input_singlechar_dict(const string& word) -> spell_result;
/*
(4) This should be called when the input is a
single-byte narrow OR multi-byte narrow string
and the dictionary is UTF-8.
The input can be anything so we must use some info about the input
encoding, a C locale or C++ locale object.
One can do narrow -> u16 -> u8 like this:
get old C locale,
set C locale to loc,
call mbrtoc16,
revert old C locale,
then codecvt<char16_t, char, mbstate_t>
There is no C++ way to go mbr to u16, we're limited to mbrtoc16.
For that reason we will do a similar conversion,
but in a more high-level public function.
This function should be UNUSED.
*/
// spell_result spell_narrow_input_u8_dict(const string& word);
public:
/**
(5) This should be called when the input and the dictionary
are in the same encoding which can be single byte or UTF-8.
Simply calls (1) or (2).
This is the same as spell() in v1.
*/
auto spell(const string& word) -> spell_result;
/**
(6) Unknown narrow input (single byte or multi byte).
Use current C locale and mbrtoc16 to convert it to known.
Do a conversion mbr -> u16 -> u8.
Use mbrtoc16, codecvt<char16_t, char, mbstate_t>
We can check if the current locale is already utf-8 to skip this.
Once we know we have a u8 string, just call (7).
This should be the recommended way to interface with the command line
utility. Before calling this function, one should call
setlocale(LC_ALL, "") or locale::global(locale("")).
If we use std::cin, we should imbue it with cin.imbue(locale())
*/
auto spell_narrow_input(const string& word) -> spell_result;
/**
(7) UTF-8 input. Will delegate either to (2) or (3).
*/
auto spell_u8_input(const string& word) -> spell_result;
private:
/** (8) UTF-16 input, single-byte dictionary. */
auto spell_u16_input_singlechar_dict(const u16string& word)
-> spell_result;
/** (9) UTF-16 input, UTF-8 dictionary. */
auto spell_u16_input_u8_dict(const u16string& word) -> spell_result;
public:
/** (10) UTF-16 input; presumably delegates to (8) or (9) -- TODO confirm. */
auto spell_u16_input(const u16string& word) -> spell_result;
};
}

View File

@ -1,178 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#include "aff_manager.hxx"
#include "dic_manager.hxx"
#include "dict_finder.hxx"
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#if defined(__MINGW32__) || defined(__unix__) || defined(__unix) || \
(defined(__APPLE__) && defined(__MACH__))
#include <unistd.h>
#endif
using namespace std;
// Result of parse_args(): the parsed command line.
struct args_t {
// Option letter -> option argument. Flag options map to an empty string;
// the keys ':' and '?' collect the offending option letters on errors.
unordered_map<char, string> options;
// The arguments that remain after option parsing (argv[optind..argc)).
vector<string> operands;
};
/**
 * Parses the command line with getopt().
 *
 * usage
 *   hunspell -d dict [-l|-G]
 *   hunspell [-D]
 *
 * @return the options (letter -> argument, empty for flags) together with
 *         the remaining operands. On a missing option argument or an unknown
 *         option a message is written to cerr and the offending letter is
 *         appended to the entry stored under ':' or '?' respectively.
 *         Platforms without getopt() support get an empty result.
 */
auto parse_args(int argc, char* argv[]) -> args_t
{
#if defined(_POSIX_VERSION) || defined(__MINGW32__)
	unordered_map<char, string> ret;
	int c;
	// The leading ':' makes getopt() report a missing argument as ':'
	// instead of '?'.
	while ((c = getopt(argc, argv, ":d:DGl")) != -1) {
		switch (c) {
		case 'd':
			ret[c] = optarg;
			break;
		case 'D':
		case 'G':
		case 'l':
			ret[c]; // flag: only record that it was given
			break;
		case ':': /* -d without operand */
			ret[c] += static_cast<char>(optopt);
			cerr << "Option -" << static_cast<char>(optopt)
			     << " requires an operand\n";
			break;
		case '?':
			ret[c] += static_cast<char>(optopt);
			cerr << "Unrecognized option: '-"
			     << static_cast<char>(optopt) << "'\n";
			break;
		}
	}
	return {ret, vector<string>(argv + optind, argv + argc)};
#else
	(void)argc;
	(void)argv;
	return {};
#endif
}
int main(int argc, char* argv[])
{
auto args1 = parse_args(argc, argv);
auto& args = args1.options;
auto v = hunspell::get_default_search_directories();
hunspell::get_mozilla_directories(v);
hunspell::get_libreoffice_directories(v);
auto dics = hunspell::search_dirs_for_dicts(v);
if (args.empty() || args.count('D')) {
for (auto& a : v) {
cout << a << endl;
}
for (auto& a : dics) {
cout << a.first << '\t' << a.second << endl;
}
return 0;
}
if (args.count('d') == 0) {
return 0;
}
string filename;
for (auto& a : dics) {
if (a.first == args['d']) {
filename = a.second;
break;
}
}
if (filename.empty()) {
return 1;
}
/*
locale::global(locale(""));
cin.imbue(locale());
hunspell::hunspell dic(filename);
string word;
if (args.count('l')) {
while (cin >> word) {
auto res = dic.spell_narrow_input(word);
switch (res) {
case bad_word:
cout << word << '\n';
}
}
}
else if (args.count('G')) {
while (cin >> word) {
auto res = dic.spell_narrow_input(word);
switch (res) {
case bad_word:
break;
default:
cout << word << '\n';
}
}
}
else {
while (cin >> word) {
auto res = dic.spell_narrow_input(word);
switch (res) {
case bad_word:
case good_word:
case affixed_good_word:
case compound_good_word:
}
}
}
*/
ifstream affstream(filename + ".aff");
ifstream dicstream(filename + ".dic");
hunspell::aff_data aff;
aff.parse(affstream);
hunspell::dic_data dic;
dic.parse(dicstream, aff);
std::cout << aff.encoding << endl;
std::cout << aff.try_chars << endl;
for (auto& a : aff.compound_rules) {
cout << a << endl;
}
for (auto& a : aff.suffixes) {
cout << (char)a.flag << ' ' << (a.cross_product ? 'Y' : 'N')
<< ' ' << a.stripping << ' ' << a.affix
<< (a.new_flags.size() ? "/ " : " ") << a.condition;
cout << endl;
}
for (auto& wd : dic.words) {
cout << wd.first;
if (wd.second.size()) {
cout << '/';
for (auto& flag : wd.second) {
cout << flag << ',';
}
}
cout << endl;
}
return 0;
}

View File

@ -1,106 +0,0 @@
/* Copyright 2016-2017 Dimitrij Mijoski
*
* This file is part of Hunspell-2.
*
* Hunspell-2 is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Hunspell-2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Hunspell-2. If not, see <http://www.gnu.org/licenses/>.
*
* Hunspell 2 is based on Hunspell v1 and MySpell.
* Hunspell v1 is Copyright (C) 2002-2017 Németh László
* MySpell is Copyright (C) 2002 Kevin Hendricks.
*/
#ifndef HUNSPELL_STRING_UTILS_HXX
#define HUNSPELL_STRING_UTILS_HXX
#include <codecvt>
#include <istream>
#include <locale>
#include <string>
#include <vector>
#include <cctype>
namespace hunspell {
// Converter between UTF-8 encoded std::string and char16_t strings.
// NOTE(review): std::wstring_convert and std::codecvt_utf8 are deprecated
// since C++17; expect deprecation warnings with newer toolchains.
using utf8_to_ucs2_converter =
std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t>;
/**
 * Upper-cases `s` in place, one byte at a time, using toupper() from
 * <cctype>.
 *
 * Each byte is converted to unsigned char first: passing a negative char
 * (any byte above 0x7F where char is signed) to toupper() is undefined
 * behaviour.
 */
inline void toupper_ascii(std::string& s)
{
	for (auto& c : s) {
		c = static_cast<char>(toupper(static_cast<unsigned char>(c)));
	}
}
/**
 * Clears failbit on `in` and leaves every other state flag (eofbit, badbit)
 * exactly as it was.
 */
inline void reset_failbit_istream(std::istream& in)
{
	const std::ios_base::iostate kept =
	    in.rdstate() & ~std::ios_base::failbit;
	in.clear(kept);
}
/**
 * Function object that converts its argument to `To` with static_cast.
 * Handy as the operation of algorithms such as std::transform.
 */
template <class To>
struct cast_lambda {
	/** @return `value` converted to `To`. */
	template <class From>
	auto operator()(From& value) const -> To
	{
		return static_cast<To>(value);
	}
};
/**
 * Skips leading whitespace, then appends characters from `in` to `out`
 * until a '/', a whitespace character (per the stream's locale) or the end
 * of the stream is reached. The terminating character is consumed but not
 * stored.
 *
 * failbit is cleared when at least one character was stored or a '/' was
 * hit.
 *
 * @return true when reading stopped at a '/'.
 */
inline bool read_to_slash_or_space(std::istream& in, std::string& out)
{
	in >> std::ws;
	bool stored_any = false;
	bool hit_slash = false;
	for (;;) {
		const int ch = in.get();
		if (ch == std::istream::traits_type::eof()) {
			break;
		}
		if (ch == '/') {
			hit_slash = true;
			break;
		}
		if (std::isspace(static_cast<char>(ch), in.getloc())) {
			break;
		}
		out.push_back(static_cast<char>(ch));
		stored_any = true;
	}
	if (stored_any || hit_slash) {
		reset_failbit_istream(in);
	}
	return hit_slash;
}
/**
 * Skips leading whitespace, then appends characters from `in` to `out`
 * until a '/' or the end of the stream is reached. Unlike
 * read_to_slash_or_space(), whitespace inside the text is kept. The '/' is
 * consumed but not stored.
 *
 * failbit is cleared when at least one character was stored or a '/' was
 * hit.
 *
 * @return true when reading stopped at a '/'.
 */
inline bool read_to_slash(std::istream& in, std::string& out)
{
	in >> std::ws;
	bool stored_any = false;
	bool hit_slash = false;
	for (;;) {
		const int ch = in.get();
		if (ch == std::istream::traits_type::eof()) {
			break;
		}
		if (ch == '/') {
			hit_slash = true;
			break;
		}
		out.push_back(static_cast<char>(ch));
		stored_any = true;
	}
	if (stored_any || hit_slash) {
		reset_failbit_istream(in);
	}
	return hit_slash;
}
/**
 * Appends every remaining whitespace-separated field of `in` to `vecOut`.
 *
 * Nothing happens when the stream is not in a good state on entry.
 * Otherwise the failbit raised by the final, unsuccessful extraction is
 * cleared before returning.
 */
inline void parse_morhological_fields(std::istream& in,
                                      std::vector<std::string>& vecOut)
{
	if (in.good()) {
		for (std::string field; in >> field;) {
			vecOut.push_back(field);
		}
		reset_failbit_istream(in);
	}
}
}
#endif

View File

@ -1 +0,0 @@
testparser

View File

@ -1,65 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "firstparser.hxx"
#ifndef W32
using namespace std;
#endif
// Constructs the parser; `wordchars` is passed unchanged to the TextParser
// base class.
FirstParser::FirstParser(const char* wordchars)
: TextParser(wordchars) {
}
// No resources of its own to release.
FirstParser::~FirstParser() {}
// Yields the part of the current line that precedes its first tab.
// `t` is always cleared first. A token is produced only when the line
// contains a tab positioned after `token`; `token` is then moved to the tab,
// so a second call on the same line returns false.
bool FirstParser::next_token(std::string& t) {
  t.clear();
  const auto& current = line[actual];
  const size_t tab = current.find('\t');
  if (tab == std::string::npos || tab <= token) {
    return false;
  }
  token = tab;
  t = current.substr(0, tab);
  return true;
}

View File

@ -1,56 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef FIRSTPARSER_HXX_
#define FIRSTPARSER_HXX_
#include "textparser.hxx"
/*
* Check first word of the input line
*
*/
// TextParser specialisation whose next_token() returns the text in front of
// the first tab character of the current line.
class FirstParser : public TextParser {
public:
// `wc` is forwarded to the TextParser constructor.
explicit FirstParser(const char* wc);
virtual ~FirstParser();
// Stores the next token in the argument; returns false when there is none.
virtual bool next_token(std::string&);
};
#endif

View File

@ -1,84 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "htmlparser.hxx"
#ifndef W32
using namespace std;
#endif
// Pairs of {opening text, closing text} handed to XMLParser::next_token() as
// its first pattern table. Presumably the text between the two markers is
// excluded from spell checking -- TODO confirm in XMLParser.
static const char* PATTERN[][2] = {{"<script", "</script>"},
{"<style", "</style>"},
{"<code", "</code>"},
{"<samp", "</samp>"},
{"<kbd", "</kbd>"},
{"<var", "</var>"},
{"<listing", "</listing>"},
{"<address", "</address>"},
{"<pre", "</pre>"},
{"<!--", "-->"},
// NOTE(review): the closing "]]>" is the CDATA terminator, but a CDATA
// section opens with "<![CDATA[", not "<[cdata[" -- verify how XMLParser
// matches this entry.
{"<[cdata[", "]]>"}, // XML comment
{"<", ">"}};
// Number of rows in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / (sizeof(char*) * 2))
// Pairs of {tag start, attribute name} handed to XMLParser::next_token() as
// its second pattern table.
static const char* PATTERN2[][2] = {
{"<img", "alt="}, // ALT and TITLE attrib handled spec.
{"<img", "title="},
{"<a ", "title="}};
// Number of rows in PATTERN2.
#define PATTERN_LEN2 (sizeof(PATTERN2) / (sizeof(char*) * 2))
// Constructs the parser from a narrow word-character set; the argument is
// forwarded to XMLParser.
HTMLParser::HTMLParser(const char* wordchars)
: XMLParser(wordchars) {
}
// Constructs the parser from `len` w_char word characters; both arguments
// are forwarded to XMLParser.
HTMLParser::HTMLParser(const w_char* wordchars, int len)
: XMLParser(wordchars, len) {
}
// Delegates to XMLParser::next_token() with the HTML pattern tables of this
// file.
bool HTMLParser::next_token(std::string& t) {
return XMLParser::next_token(PATTERN, PATTERN_LEN, PATTERN2, PATTERN_LEN2, t);
}
// No resources of its own to release.
HTMLParser::~HTMLParser() {}

View File

@ -1,56 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef HTMLPARSER_HXX_
#define HTMLPARSER_HXX_
#include "xmlparser.hxx"
/*
* HTML Parser
*
*/
// XMLParser specialisation that supplies HTML-specific pattern tables.
class HTMLParser : public XMLParser {
public:
// `wc` is forwarded to the XMLParser constructor.
explicit HTMLParser(const char* wc);
// Variant taking `len` w_char word characters, also forwarded to XMLParser.
HTMLParser(const w_char* wordchars, int len);
// Stores the next token in the argument; the return value is that of
// XMLParser::next_token().
virtual bool next_token(std::string&);
virtual ~HTMLParser();
};
#endif

View File

@ -1,261 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "latexparser.hxx"
#ifndef W32
using namespace std;
#endif
// Table of LaTeX constructs that next_token() treats specially.
//
// pat[0] is the text that starts the construct. When pat[1] is not NULL it
// is the text that ends it, and everything in between is skipped (parser
// state 2). When pat[1] is NULL the entry is a command, and `arg` is the
// number of brace arguments after which the parser returns to normal text
// (state 4).
//
// look_pattern() returns the FIRST entry that matches, so the order of the
// rows is significant.
// NOTE(review): several rows are prefixes of later rows (e.g. "\\color"
// before "\\colorbox", "\\bibliography" before "\\bibliographystyle"), so
// the later rows look unreachable; "\\vspace" is also listed twice. Confirm
// whether this is intended.
static struct {
const char* pat[2];
int arg;
} PATTERN[] = {{{"\\(", "\\)"}, 0},
{{"$$", "$$"}, 0},
{{"$", "$"}, 0},
{{"\\begin{math}", "\\end{math}"}, 0},
{{"\\[", "\\]"}, 0},
{{"\\begin{displaymath}", "\\end{displaymath}"}, 0},
{{"\\begin{equation}", "\\end{equation}"}, 0},
{{"\\begin{equation*}", "\\end{equation*}"}, 0},
{{"\\cite", NULL}, 1},
{{"\\nocite", NULL}, 1},
{{"\\index", NULL}, 1},
{{"\\label", NULL}, 1},
{{"\\ref", NULL}, 1},
{{"\\pageref", NULL}, 1},
{{"\\autoref", NULL}, 1},
{{"\\parbox", NULL}, 1},
{{"\\begin{verbatim}", "\\end{verbatim}"}, 0},
{{"\\verb+", "+"}, 0},
{{"\\verb|", "|"}, 0},
{{"\\verb#", "#"}, 0},
{{"\\verb*", "*"}, 0},
{{"\\documentstyle", "\\begin{document}"}, 0},
{{"\\documentclass", "\\begin{document}"}, 0},
// { { "\\documentclass", NULL } , 1 },
{{"\\usepackage", NULL}, 1},
{{"\\includeonly", NULL}, 1},
{{"\\include", NULL}, 1},
{{"\\input", NULL}, 1},
{{"\\vspace", NULL}, 1},
{{"\\setlength", NULL}, 2},
{{"\\addtolength", NULL}, 2},
{{"\\settowidth", NULL}, 2},
{{"\\rule", NULL}, 2},
{{"\\hspace", NULL}, 1},
{{"\\vspace", NULL}, 1},
{{"\\\\[", "]"}, 0},
{{"\\pagebreak[", "]"}, 0},
{{"\\nopagebreak[", "]"}, 0},
{{"\\enlargethispage", NULL}, 1},
{{"\\begin{tabular}", NULL}, 1},
{{"\\addcontentsline", NULL}, 2},
{{"\\begin{thebibliography}", NULL}, 1},
{{"\\bibliography", NULL}, 1},
{{"\\bibliographystyle", NULL}, 1},
{{"\\bibitem", NULL}, 1},
{{"\\begin", NULL}, 1},
{{"\\end", NULL}, 1},
{{"\\pagestyle", NULL}, 1},
{{"\\pagenumbering", NULL}, 1},
{{"\\thispagestyle", NULL}, 1},
{{"\\newtheorem", NULL}, 2},
{{"\\newcommand", NULL}, 2},
{{"\\renewcommand", NULL}, 2},
{{"\\setcounter", NULL}, 2},
{{"\\addtocounter", NULL}, 1},
{{"\\stepcounter", NULL}, 1},
{{"\\selectlanguage", NULL}, 1},
{{"\\inputencoding", NULL}, 1},
{{"\\hyphenation", NULL}, 1},
{{"\\definecolor", NULL}, 3},
{{"\\color", NULL}, 1},
{{"\\textcolor", NULL}, 1},
{{"\\pagecolor", NULL}, 1},
{{"\\colorbox", NULL}, 2},
{{"\\fcolorbox", NULL}, 2},
{{"\\declaregraphicsextensions", NULL}, 1},
{{"\\psfig", NULL}, 1},
{{"\\url", NULL}, 1},
{{"\\eqref", NULL}, 1},
{{"\\vskip", NULL}, 1},
{{"\\vglue", NULL}, 1},
{{"\'\'", NULL}, 1}};
// Number of rows in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))
// Constructs the parser from a narrow word-character set (forwarded to
// TextParser) and zeroes the LaTeX-specific bookkeeping members.
LaTeXParser::LaTeXParser(const char* wordchars)
: TextParser(wordchars)
, pattern_num(0), depth(0), arg(0), opt(0) {
}
// Same as above for `len` w_char word characters.
LaTeXParser::LaTeXParser(const w_char* wordchars, int len)
: TextParser(wordchars, len)
, pattern_num(0), depth(0), arg(0), opt(0) {
}
// No resources of its own to release.
LaTeXParser::~LaTeXParser() {}
// Looks for a PATTERN entry whose column `col` (0 = opening text,
// 1 = closing text) occurs at the current position (`head`) of the current
// line. The input is lower-cased before comparison; the table text is
// compared as written.
//
// Returns the index of the first matching entry, or -1 when none matches.
// Entries whose selected column is NULL are skipped.
int LaTeXParser::look_pattern(int col) {
  // The text to match does not change between candidates.
  const char* const text = line[actual].c_str() + head;
  for (unsigned int i = 0; i < PATTERN_LEN; i++) {
    const char* k = PATTERN[i].pat[col];
    if (!k)
      continue;
    const char* j = text;
    // tolower() needs a value representable as unsigned char; a plain
    // (possibly negative) char is undefined behaviour for non-ASCII bytes.
    // The loop cannot run past the end of the line: the terminating '\0'
    // never equals a non-'\0' pattern character.
    while ((*k != '\0') && (tolower(static_cast<unsigned char>(*j)) == *k)) {
      j++;
      k++;
    }
    if (*k == '\0')
      return i;
  }
  return -1;
}
/*
* LaTeXParser
*
* state 0: not wordchar
* state 1: wordchar
* state 2: comments
* state 3: commands
* state 4: commands with arguments
* state 5: % comment
*
*/
// Extracts the next word of the current line into `t`, skipping LaTeX
// markup. Returns true when a word was stored, false when next_char()
// reports that the line is exhausted.
//
// The parser is a state machine over `state` (see the list of states above)
// that examines one position (`head`) per loop iteration. `state`, `head`,
// `token`, `depth`, `arg`, `opt` and `pattern_num` are members and survive
// between calls; `slash` and `apostrophe` are locals and do not.
bool LaTeXParser::next_token(std::string& t) {
t.clear();
int i;
int slash = 0;
int apostrophe;
for (;;) {
// fprintf(stderr,"depth: %d, state: %d, , arg: %d, token:
// %s\n",depth,state,arg,line[actual]+head);
switch (state) {
case 0: // non word chars
if ((pattern_num = look_pattern(0)) != -1) {
// Entries with a closing text are skipped as a region (state 2);
// the others are commands whose arguments are skipped (state 4).
if (PATTERN[pattern_num].pat[1]) {
state = 2;
} else {
state = 4;
depth = 0;
arg = 0;
opt = 1;
}
// Step onto the last character of the opening text; next_char() below
// moves past it.
head += strlen(PATTERN[pattern_num].pat[0]) - 1;
} else if (line[actual][head] == '%') {
state = 5;
} else if (is_wordchar(line[actual].c_str() + head)) {
state = 1;
token = head;
} else if (line[actual][head] == '\\') {
if (line[actual][head + 1] == '\\' || // \\ (linebreak)
(line[actual][head + 1] == '$') || // \$ (dollar sign)
(line[actual][head + 1] == '%')) { // \% (percent)
head++;
break;
}
state = 3;
}
break;
case 1: // wordchar
apostrophe = 0;
// The word ends at the first non-word character or at a '' pair;
// ++apostrophe records that the second reason applied.
if (!is_wordchar(line[actual].c_str() + head) ||
(line[actual][head] == '\'' && line[actual][head + 1] == '\'' &&
++apostrophe)) {
state = 0;
bool ok = alloc_token(token, &head, t);
if (apostrophe)
head += 2;
if (ok)
return true;
}
break;
case 2: // comment, labels, etc
// Leave the region only when the closing text found here is the one that
// belongs to the opening text that started it.
if (((i = look_pattern(1)) != -1) &&
(strcmp(PATTERN[i].pat[1], PATTERN[pattern_num].pat[1]) == 0)) {
state = 0;
head += strlen(PATTERN[pattern_num].pat[1]) - 1;
}
break;
case 3: // command
// A command name ends at the first character outside a-z (after
// lower-casing); that character is re-examined in state 0.
if ((tolower(line[actual][head]) < 'a') ||
(tolower(line[actual][head]) > 'z')) {
state = 0;
head--;
}
break;
case 4: // command with arguments
// `slash` marks that the previous character was a backslash, so the
// current one is skipped without being interpreted.
if (slash && (line[actual][head] != '\0')) {
slash = 0;
head++;
break;
} else if (line[actual][head] == '\\') {
slash = 1;
} else if ((line[actual][head] == '{') ||
((opt) && (line[actual][head] == '['))) {
depth++;
opt = 0;
} else if (line[actual][head] == '}') {
depth--;
if (depth == 0) {
opt = 1;
arg++;
}
// Done once the expected number of arguments has been closed, or when
// the braces are unbalanced.
if (((depth == 0) && (arg == PATTERN[pattern_num].arg)) ||
(depth < 0)) {
state = 0; // XXX not handles the last optional arg.
}
} else if (line[actual][head] == ']')
depth--;
} // case
// A true result from next_char() ends the scan of this line; a % comment
// (state 5) ends together with it.
if (next_char(line[actual].c_str(), &head)) {
if (state == 5)
state = 0;
return false;
}
}
}

View File

@ -1,65 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef LATEXPARSER_HXX_
#define LATEXPARSER_HXX_
#include "textparser.hxx"
/*
* LaTeX Parser
*
*/
// TextParser specialisation that skips LaTeX markup.
class LaTeXParser : public TextParser {
int pattern_num; // index of the pattern-table entry currently being processed
int depth; // nesting depth of the { } / [ ] groups of the current command
int arg; // number of arguments of the current command completed so far
int opt; // non-zero while a '[' would open an optional argument
public:
// `wc` is forwarded to the TextParser constructor.
explicit LaTeXParser(const char* wc);
// Variant taking `len` w_char word characters.
LaTeXParser(const w_char* wordchars, int len);
virtual ~LaTeXParser();
// Stores the next word in the argument; returns false at the end of the line.
virtual bool next_token(std::string&);
private:
// Index of the first pattern-table entry whose column `col` matches at the
// current position, or -1.
int look_pattern(int col);
};
#endif

View File

@ -1,98 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "manparser.hxx"
#ifndef W32
using namespace std;
#endif
// Build a man-page parser from an 8-bit word-character list; the list is
// handed unchanged to the TextParser base class.
ManParser::ManParser(const char* wordchars)
    : TextParser(wordchars)
{
}

// Build a man-page parser from a table of `len` UTF-16 word characters; the
// table is handed unchanged to the TextParser base class.
ManParser::ManParser(const w_char* wordchars, int len)
    : TextParser(wordchars, len)
{
}

// ManParser adds no state of its own, so there is nothing to release.
ManParser::~ManParser()
{
}
// Extract the next word of the current line into `t`.
//
// Returns true as soon as alloc_token() accepts a word, and false once the
// end of the line is reached (the automaton is then put back into state 0).
// `state` survives between calls and takes these values:
//   0 - first character of a line has not been examined yet
//   1 - line began with '.', skipping up to the first blank
//   2 - between words
//   3 - inside a word that started at `token`
bool ManParser::next_token(std::string& t) {
for (;;) {
switch (state) {
case 1: // command arguments
// everything before the first blank of a dot line is skipped
if (line[actual][head] == ' ')
state = 2;
break;
case 0: // dot in begin of line
// note: tests column 0 of the line, not the current head position
if (line[actual][0] == '.') {
state = 1;
break;
} else {
state = 2;
}
// no break: deliberately falls through so that the current character is
// examined by state 2 right away
case 2: // non word chars
if (is_wordchar(line[actual].c_str() + head)) {
state = 3;
token = head;
} else if ((line[actual][head] == '\\') &&
(line[actual][head + 1] == 'f') &&
(line[actual][head + 2] != '\0')) {
// "\f" followed by one more character (presumably a roff font escape
// such as \fB - confirm): skip it; next_char() below steps over the
// third character
head += 2;
}
break;
case 3: // wordchar
if (!is_wordchar(line[actual].c_str() + head)) {
state = 2;
// the word spans [token, head); alloc_token() may still reject it
if (alloc_token(token, &head, t))
return true;
}
break;
}
// advance one character; a non-zero result means the terminating NUL
// was reached
if (next_char(line[actual].c_str(), &head)) {
state = 0;
return false;
}
}
}

View File

@ -1,58 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef MANPARSER_HXX_
#define MANPARSER_HXX_
#include "textparser.hxx"
/*
* Man page parser
*
*/
// Tokenizer for man-page sources: a TextParser whose next_token() is
// replaced by ManParser::next_token().
class ManParser : public TextParser {
protected:
public:
// Construct from an 8-bit word-character list.
explicit ManParser(const char* wc);
// Construct from a table of `len` UTF-16 word characters.
ManParser(const w_char* wordchars, int len);
virtual ~ManParser();
// Store the next word of the current line in the argument; returns false
// when the line is exhausted.
virtual bool next_token(std::string&);
};
#endif

View File

@ -1,76 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "odfparser.hxx"
#ifndef W32
using namespace std;
#endif
// Opening/closing delimiter pairs that ODFParser::next_token() passes to
// XMLParser::next_token() as its first pattern table.
static const char* PATTERN[][2] = {
    {"<office:meta>", "</office:meta>"},
    {"<office:settings>", "</office:settings>"},
    {"<office:binary-data>", "</office:binary-data>"},
    {"<!--", "-->"},      // XML comment
    {"<[cdata[", "]]>"},  // CDATA-like block
    {"<", ">"}            // any other tag
};
// Number of delimiter pairs in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))
// Second pattern table passed to XMLParser::next_token(); empty for ODF.
static const char* (*PATTERN2)[2] = NULL;
#define PATTERN_LEN2 0
// Build an ODF parser from an 8-bit word-character list (forwarded to
// XMLParser).
ODFParser::ODFParser(const char* wordchars)
    : XMLParser(wordchars)
{
}

// Build an ODF parser from a table of `len` UTF-16 word characters
// (forwarded to XMLParser).
ODFParser::ODFParser(const w_char* wordchars, int len)
    : XMLParser(wordchars, len)
{
}

// Fetch the next word into `t` by running the generic XML tokenizer with the
// ODF pattern tables defined in this file.
bool ODFParser::next_token(std::string& t) {
  const bool found =
      XMLParser::next_token(PATTERN, PATTERN_LEN, PATTERN2, PATTERN_LEN2, t);
  return found;
}

// ODFParser holds no resources of its own.
ODFParser::~ODFParser()
{
}

View File

@ -1,56 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef ODFPARSER_HXX_
#define ODFPARSER_HXX_
#include "xmlparser.hxx"
/*
* ODF Parser
*
*/
// Tokenizer for ODF documents: an XMLParser whose next_token() is replaced
// by ODFParser::next_token().
class ODFParser : public XMLParser {
public:
// Construct from an 8-bit word-character list.
explicit ODFParser(const char* wc);
// Construct from a table of `len` UTF-16 word characters.
ODFParser(const w_char* wordchars, int len);
// Store the next word of the current line in the argument; returns false
// when the line is exhausted.
virtual bool next_token(std::string&);
virtual ~ODFParser();
};
#endif

View File

@ -1,86 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include "textparser.hxx"
#include "htmlparser.hxx"
#include "latexparser.hxx"
#include "xmlparser.hxx"
#ifndef W32
using namespace std;
#endif
int main(int argc, char** argv) {
FILE* f;
/* first parse the command line options */
if (argc < 2) {
fprintf(stderr, "correct syntax is:\n");
fprintf(stderr, "testparser file\n");
fprintf(stderr, "example: testparser /dev/stdin\n");
exit(1);
}
/* open the words to check list */
f = fopen(argv[1], "r");
if (!f) {
fprintf(stderr, "Error - could not open file of words to check\n");
exit(1);
}
TextParser* p = new TextParser(
"qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM");
char buf[MAXLNLEN];
while (fgets(buf, MAXLNLEN, f)) {
p->put_line(buf);
p->set_url_checking(1);
std::string next;
while (p->next_token(next)) {
fprintf(stdout, "token: %s\n", next.c_str());
}
}
delete p;
fclose(f);
return 0;
}

View File

@ -1,298 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "textparser.hxx"
#include <algorithm>
#ifndef W32
using namespace std;
#endif
// ISO-8859-1 HTML character entities
// HTML character entities that TextParser::get_latin1() recognises as part
// of a word. The order is significant: get_latin1() returns the first entry
// that is a prefix of the input.
static const char* LATIN1[] = {
    // capital letters
    "&Agrave;", "&Atilde;", "&Aring;",  "&AElig;",  "&Egrave;",
    "&Ecirc;",  "&Igrave;", "&Iuml;",   "&ETH;",    "&Ntilde;",
    "&Ograve;", "&Oslash;", "&Ugrave;", "&THORN;",
    // small letters
    "&agrave;", "&atilde;", "&aring;",  "&aelig;",  "&egrave;",
    "&ecirc;",  "&igrave;", "&iuml;",   "&eth;",    "&ntilde;",
    "&ograve;", "&oslash;", "&ugrave;", "&thorn;",  "&yuml;"};
// Number of entries in LATIN1.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(LATIN1[0]))
// Spellings of the apostrophe: XML entity, UTF-8 encoded U+2019, ASCII.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"
// Build a parser for 8-bit text; all set-up is delegated to
// init(const char*).
TextParser::TextParser(const char* wordchars)
{
  init(wordchars);
}

// Build a parser for UTF-8 text from a table of `len` UTF-16 word
// characters; all set-up is delegated to init(const w_char*, int).
TextParser::TextParser(const w_char* wordchars, int len)
{
  init(wordchars, len);
}

// All members clean up after themselves.
TextParser::~TextParser()
{
}
int TextParser::is_wordchar(const char* w) {
if (*w == '\0')
return 0;
if (utf8) {
std::vector<w_char> wc;
unsigned short idx;
u8_u16(wc, w);
if (wc.empty())
return 0;
idx = (wc[0].h << 8) + wc[0].l;
return (unicodeisalpha(idx) ||
(wordchars_utf16 &&
std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0])));
} else {
return wordcharacters[(*w + 256) % 256];
}
}
// If the text at `s` begins with one of the entities in LATIN1, return that
// table entry (the first one that matches); otherwise return NULL.
const char* TextParser::get_latin1(const char* s) {
  if (s[0] != '&')
    return NULL;

  for (unsigned int k = 0; k < LATIN1_LEN; ++k) {
    const char* entity = LATIN1[k];
    if (strncmp(entity, s, strlen(entity)) == 0)
      return entity;
  }
  return NULL;
}
// Set up the parser for 8-bit text.
//
// Resets the position and state members, switches UTF-8 handling off and
// fills the 256-entry `wordcharacters` table with 1 for every byte that
// occurs in `wordchars`. A NULL `wordchars` selects the built-in list of
// ASCII letters.
void TextParser::init(const char* wordchars) {
  actual = 0;
  head = 0;
  token = 0;
  state = 0;
  utf8 = 0;
  checkurl = 0;
  wordchars_utf16 = NULL;
  wclen = 0;
  wordcharacters.resize(256, 0);
  if (!wordchars)
    wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
  // Walk to the terminating NUL instead of calling strlen() in the loop
  // condition, which rescanned the whole list on every iteration.
  for (const char* c = wordchars; *c != '\0'; ++c) {
    // (+256) % 256 maps a possibly negative char to the index 0..255
    wordcharacters[(*c + 256) % 256] = 1;
  }
}
// Set up the parser for UTF-8 text.
//
// Remembers the caller's table of `len` UTF-16 word characters (the pointer
// is stored, the table is not copied), switches UTF-8 handling on and resets
// the position and state members.
void TextParser::init(const w_char* wc, int len) {
  // word-character table and encoding
  wordchars_utf16 = wc;
  wclen = len;
  utf8 = 1;
  checkurl = 0;

  // scanning position and automaton state
  actual = 0;
  head = 0;
  token = 0;
  state = 0;
}
// Move `*pos` to the start of the next character of `ln`.
//
// Returns 1 without moving when `*pos` already is at the terminating NUL,
// 0 otherwise. In UTF-8 mode a byte with the high bit set is followed across
// all of its continuation bytes (10xxxxxx); in every other case the position
// advances by exactly one byte.
int TextParser::next_char(const char* ln, size_t* pos) {
  size_t& p = *pos;
  if (ln[p] == '\0')
    return 1;

  const bool multibyte = utf8 && (ln[p] >> 7);
  ++p;
  if (multibyte) {
    // jump to next UTF-8 character
    while ((ln[p] & 0xc0) == 0x80)
      ++p;
  }
  return 0;
}
// Make `word` the current line.
//
// The line is stored in the next slot of the `line` ring (overwriting the
// oldest entry), scanning restarts at column 0 and the URL mask is
// recomputed. The automaton state is left as it was.
void TextParser::put_line(const char* word) {
  const int slot = (actual + 1) % MAXPREVLINE;
  actual = slot;
  line[slot].assign(word);
  head = 0;
  token = 0;
  check_urls();
}
// Return a copy of the line stored `n` lines before the current one
// (n == 0 is the current line).
//
// Only MAXPREVLINE lines are kept, so `n` is taken modulo MAXPREVLINE.
// The index is normalised explicitly: the previous expression
// (actual + MAXPREVLINE - n) % MAXPREVLINE became negative for
// n > actual + MAXPREVLINE and then indexed `line` out of bounds.
std::string TextParser::get_prevline(int n) const {
  // reduce n first so that the subtraction cannot overflow
  int idx = (actual - n % MAXPREVLINE) % MAXPREVLINE;
  if (idx < 0)
    idx += MAXPREVLINE;
  return line[idx];
}
// Return a copy of the current line, i.e. the line zero steps back.
std::string TextParser::get_line() const {
  const int stepsBack = 0;
  return get_prevline(stepsBack);
}
// Extract the next word of the current line into `t`.
//
// Returns true as soon as alloc_token() accepts a word and false when the end
// of the line is reached. `state` survives between calls:
//   0 - between words
//   1 - inside a word that started at `token`
bool TextParser::next_token(std::string &t) {
const char* latin1;
for (;;) {
switch (state) {
case 0: // non word chars
if (is_wordchar(line[actual].c_str() + head)) {
state = 1;
token = head;
} else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
// a word may also begin with one of the LATIN1 entities
state = 1;
token = head;
// step over the entity text; next_char() below advances once more
head += strlen(latin1);
}
break;
case 1: // wordchar
if ((latin1 = get_latin1(line[actual].c_str() + head))) {
// entity inside the word: keep it as part of the word
head += strlen(latin1);
} else if ((is_wordchar((char*)APOSTROPHE) ||
(is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
!line[actual].empty() && line[actual][head] == '\'' &&
is_wordchar(line[actual].c_str() + head + 1)) {
// ASCII apostrophe followed by a word character stays inside the word,
// provided an apostrophe is configured as a word character
head++;
} else if (is_utf8() &&
is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe
// to the WORDCHARS, if
// needed
strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
0 &&
is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
// UTF-8 encoded U+2019 followed by a word character: move to its last
// byte, next_char() below steps past it
head += strlen(UTF8_APOS) - 1;
} else if (!is_wordchar(line[actual].c_str() + head)) {
state = 0;
// the word spans [token, head); alloc_token() may still reject it
if (alloc_token(token, &head, t))
return true;
}
break;
}
// advance one character; a non-zero result means the terminating NUL
// was reached
if (next_char(line[actual].c_str(), &head))
return false;
}
}
size_t TextParser::get_tokenpos() {
return token;
}
// Replace the text between `token` and `head` in the current line by `word`.
//
// Afterwards `head` is moved back to `token`, so scanning resumes at the
// start of the replacement. Returns 1 on success and 0 (line untouched) when
// `word` is NULL.
int TextParser::change_token(const char* word) {
  if (!word)
    return 0;

  std::string& current = line[actual];
  const std::string tail(current.substr(head));  // text after the old token
  current.resize(token);                          // keep text before the token
  current.append(word);
  current.append(tail);
  head = token;
  return 1;
}
// Recompute `urlline`, the per-byte mask of the current line that marks
// URLs, e-mail addresses and file paths.
//
// The line is scanned once, including its terminating NUL. A candidate
// starts at a word character or at '/', and extends over word characters,
// digits and the punctuation listed below. It is marked when it started with
// '/' or contains '@', ":\" or "://".
void TextParser::check_urls() {
  // Start from an all-false mask. resize() kept the bits of the previous
  // line, and in UTF-8 mode the bytes skipped by next_char() were never
  // reset, so stale marks could survive.
  urlline.assign(line[actual].size() + 1, false);
  int url_state = 0;
  size_t url_head = 0;
  size_t url_token = 0;
  int url = 0;
  for (;;) {
    switch (url_state) {
      case 0:  // non word chars
        if (is_wordchar(line[actual].c_str() + url_head)) {
          url_state = 1;
          url_token = url_head;
          // Unix path
        } else if (line[actual][url_head] == '/') {
          url_state = 1;
          url_token = url_head;
          url = 1;
        }
        break;
      case 1: {  // wordchar
        char ch = line[actual][url_head];
        // e-mail address
        if ((ch == '@') ||
            // MS-DOS, Windows path
            (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) ||
            // URL
            (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {
          url = 1;
        } else if (!(is_wordchar(line[actual].c_str() + url_head) ||
                     (ch == '-') || (ch == '_') || (ch == '\\') ||
                     (ch == '.') || (ch == ':') || (ch == '/') ||
                     (ch == '~') || (ch == '%') || (ch == '*') ||
                     (ch == '$') || (ch == '[') || (ch == ']') ||
                     (ch == '?') || (ch == '!') ||
                     ((ch >= '0') && (ch <= '9')))) {
          // the candidate ended just before url_head
          url_state = 0;
          if (url == 1) {
            for (size_t i = url_token; i < url_head; ++i) {
              urlline[i] = true;
            }
          }
          url = 0;
        }
        break;
      }
    }
    // The mask position of the current character is already false: ranges
    // are only ever marked behind url_head.
    if (next_char(line[actual].c_str(), &url_head))
      return;
  }
}
// Report whether the token at `token_pos` has to be skipped as a URL.
//
// `*hd` is first moved forward over every byte marked in `urlline`, stopping
// at the end of the line. The result is 0 when URL checking is enabled
// (`checkurl` non-zero); otherwise it is the mask bit at `token_pos`.
int TextParser::get_url(size_t token_pos, size_t* hd) {
  size_t i = *hd;
  while (i < line[actual].size() && urlline[i])
    ++i;
  *hd = i;

  if (checkurl)
    return 0;
  return urlline[token_pos];
}
void TextParser::set_url_checking(int check) {
checkurl = check;
}
// Copy the token that spans [tokn, *hd) of the current line into `t`.
//
// Returns false when get_url() reports the token as a URL (`t` is then left
// untouched), or when the token consists of a single ':' only. A trailing
// ':' is removed from the token. `*hd` itself is never modified.
bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {
  size_t scratch = *hd;  // get_url() may advance its argument; keep *hd intact
  if (get_url(tokn, &scratch))
    return false;

  t.assign(line[actual], tokn, *hd - tokn);
  if (t.empty() || t[t.size() - 1] != ':')
    return true;

  // remove colon for Finnish and Swedish language
  t.erase(t.size() - 1);
  return !t.empty();
}

View File

@ -1,98 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef TEXTPARSER_HXX_
#define TEXTPARSER_HXX_
// number of lines kept by the parser: the current line plus the previous ones
#define MAXPREVLINE 4
#ifndef MAXLNLEN
#define MAXLNLEN 8192
#endif
#include "../hunspell/w_char.hxx"
#include <vector>
/*
* Base Text Parser
*
*/
// Splits lines of plain text into words. A line is supplied with put_line()
// and the words are then fetched one by one with next_token(). Derived
// parsers replace next_token() to skip markup.
class TextParser {
protected:
std::vector<int> wordcharacters;// 8-bit mode: 256 entries, non-zero for bytes that belong to words
std::string line[MAXPREVLINE]; // ring of the current and the previous lines
std::vector<bool> urlline; // per-byte mask of the current line, true inside URLs/paths
int checkurl; // non-zero: URL-like tokens are returned like ordinary tokens
int actual; // index of the current line in `line`
size_t head; // scanning position in the current line
size_t token;// begin of the current token
int state; // state of the tokenizer automaton, kept between calls
int utf8; // non-zero when the text is handled as UTF-8
// Advance *pos by one character of `line`; returns 1 at the terminating NUL.
int next_char(const char* line, size_t* pos);
const w_char* wordchars_utf16; // UTF-8 mode: extra word characters (not owned)
int wclen; // number of entries in wordchars_utf16
public:
// UTF-8 parser with a table of `len` UTF-16 word characters.
TextParser(const w_char* wordchars, int len);
// 8-bit parser; NULL selects the built-in list of ASCII letters.
explicit TextParser(const char* wc);
virtual ~TextParser();
// Make `line` the current line and restart scanning at its beginning.
void put_line(const char* line);
// Copy of the current line.
std::string get_line() const;
// Copy of the line `n` lines before the current one.
std::string get_prevline(int n) const;
// Store the next word in the argument; false when the line is exhausted.
virtual bool next_token(std::string&);
// Replace the current token by `word`; returns 0 when `word` is NULL.
virtual int change_token(const char* word);
// Set the flag stored in `checkurl`.
void set_url_checking(int check);
// Byte offset of the current token.
size_t get_tokenpos();
// Non-zero when the character at `w` belongs to a word.
int is_wordchar(const char* w);
inline int is_utf8() { return utf8; }
// Entity from the LATIN1 table that `s` starts with, or NULL.
const char* get_latin1(const char* s);
// NOTE(review): no definition of the next two members appears among the
// TextParser member definitions of this change - confirm whether they are
// dead declarations.
char* next_char();
int tokenize_urls();
// Recompute `urlline` for the current line.
void check_urls();
// Non-zero when the token at token_pos must be skipped as a URL.
int get_url(size_t token_pos, size_t* head);
// Copy the token [token, *head) into `out`; false when it is rejected.
bool alloc_token(size_t token, size_t* head, std::string& out);
private:
// Set-up shared by the constructors (8-bit and UTF-8 variant).
void init(const char*);
void init(const w_char* wordchars, int len);
};
#endif

View File

@ -1,213 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "xmlparser.hxx"
#ifndef W32
using namespace std;
#endif
// States of the tokenizer automaton in XMLParser::next_token().
enum {
  ST_NON_WORD,
  ST_WORD,
  ST_TAG,
  ST_CHAR_ENTITY,
  ST_OTHER_TAG,
  ST_ATTRIB
};
// Default opening/closing delimiter pairs used by
// XMLParser::next_token(std::string&).
static const char* __PATTERN__[][2] = {
    {"<!--", "-->"},      // XML comment
    {"<[cdata[", "]]>"},  // CDATA-like block
    {"<", ">"}            // any other tag
};
// Number of delimiter pairs in __PATTERN__.
#define __PATTERN_LEN__ (sizeof(__PATTERN__) / sizeof(__PATTERN__[0]))
// Default second pattern table: empty.
static const char* (*__PATTERN2__)[2] = NULL;
#define __PATTERN_LEN2__ 0
// Spellings of the apostrophe: XML entity, UTF-8 encoded U+2019, ASCII.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"
// Build an XML parser from an 8-bit word-character list; the tag-tracking
// members all start at zero.
XMLParser::XMLParser(const char* wordchars)
    : TextParser(wordchars),
      pattern_num(0),
      pattern2_num(0),
      prevstate(0),
      checkattr(0),
      quotmark(0)
{
}

// Build an XML parser from a table of `len` UTF-16 word characters; the
// tag-tracking members all start at zero.
XMLParser::XMLParser(const w_char* wordchars, int len)
    : TextParser(wordchars, len),
      pattern_num(0),
      pattern2_num(0),
      prevstate(0),
      checkattr(0),
      quotmark(0)
{
}

// XMLParser holds no resources of its own.
XMLParser::~XMLParser()
{
}
// Find the first of the `len` rows of `p` whose entry in `column`
// (0 = opening, 1 = closing delimiter) matches the text at the current
// head position.
//
// The text is lower-cased before the comparison, so the table entries have
// to be lower case. Returns the row index, or -1 when no row matches (always
// the case for len == 0, where `p` is not dereferenced).
int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) {
  for (unsigned int i = 0; i < len; i++) {
    const char* j = line[actual].c_str() + head;
    const char* k = p[i][column];
    // tolower() is only defined for values representable as unsigned char
    // (or EOF); passing a plain char is undefined for non-ASCII bytes when
    // char is signed, hence the casts.
    while ((*k != '\0') &&
           (tolower(static_cast<unsigned char>(*j)) ==
            static_cast<unsigned char>(*k))) {
      j++;
      k++;
    }
    if (*k == '\0')
      return i;
  }
  return -1;
}
/*
* XML parser
*
*/
// Extract the next word of the current line into `t`, skipping markup.
//
// PATTERN / PATTERN_LEN   delimiter pairs of regions to skip (tags, comments)
// PATTERN2 / PATTERN_LEN2 delimiter pairs that switch attribute checking on;
//                         PATTERN2 may be NULL when PATTERN_LEN2 is 0
//
// Returns true as soon as alloc_token() accepts a word and false when the end
// of the line is reached. `state`, `prevstate`, `checkattr`, `quotmark` and
// the pattern indices survive between calls, so markup may span lines.
bool XMLParser::next_token(const char* PATTERN[][2],
                           unsigned int PATTERN_LEN,
                           const char* PATTERN2[][2],
                           unsigned int PATTERN_LEN2,
                           std::string& t) {
  t.clear();
  const char* latin1;

  for (;;) {
    switch (state) {
      case ST_NON_WORD:  // non word chars
        prevstate = ST_NON_WORD;
        if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
          checkattr = 0;
          if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
            checkattr = 1;
          }
          state = ST_TAG;
        } else if (is_wordchar(line[actual].c_str() + head)) {
          state = ST_WORD;
          token = head;
        } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          state = ST_WORD;
          token = head;
          head += strlen(latin1);
        } else if (line[actual][head] == '&') {
          state = ST_CHAR_ENTITY;
        }
        break;

      case ST_WORD:  // wordchar
        if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          head += strlen(latin1);
        } else if ((is_wordchar((char*)APOSTROPHE) ||
                    (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
                   strncmp(line[actual].c_str() + head, ENTITY_APOS,
                           strlen(ENTITY_APOS)) == 0 &&
                   is_wordchar(line[actual].c_str() + head +
                               strlen(ENTITY_APOS))) {
          // "&apos;" followed by a word character stays inside the word
          head += strlen(ENTITY_APOS) - 1;
        } else if (is_utf8() &&
                   is_wordchar((char*)APOSTROPHE) &&  // add Unicode apostrophe
                                                      // to the WORDCHARS, if
                                                      // needed
                   strncmp(line[actual].c_str() + head, UTF8_APOS,
                           strlen(UTF8_APOS)) == 0 &&
                   is_wordchar(line[actual].c_str() + head +
                               strlen(UTF8_APOS))) {
          head += strlen(UTF8_APOS) - 1;
        } else if (!is_wordchar(line[actual].c_str() + head)) {
          // word ended: go back to where it was entered from (text or
          // attribute value)
          state = prevstate;
          if (alloc_token(token, &head, t))
            return true;
        }
        break;

      case ST_TAG: {  // comment, labels, etc
        int i;
        if ((checkattr == 1) &&
            ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) &&
            (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) {
          checkattr = 2;
        } else if ((checkattr > 0) && (line[actual][head] == '>')) {
          state = ST_NON_WORD;
        } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
                   (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) {
          // closing delimiter of the region that was opened
          state = ST_NON_WORD;
          head += strlen(PATTERN[pattern_num][1]) - 1;
        } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) &&
                   ((line[actual][head] == '"') ||
                    (line[actual][head] == '\''))) {
          // quoted attribute value inside an ordinary tag
          quotmark = line[actual][head];
          state = ST_ATTRIB;
        }
        break;
      }

      case ST_ATTRIB:  // non word chars
        prevstate = ST_ATTRIB;
        if (line[actual][head] == quotmark) {
          state = ST_TAG;
          if (checkattr == 2)
            checkattr = 1;
          // for IMG ALT
        } else if (is_wordchar(line[actual].c_str() + head) &&
                   (checkattr == 2)) {
          state = ST_WORD;
          token = head;
        } else if (line[actual][head] == '&') {
          state = ST_CHAR_ENTITY;
        }
        break;

      case ST_CHAR_ENTITY:  // SGML element
        // Compare the byte directly: passing a plain (possibly negative)
        // char to tolower() is undefined, and ';' has no upper-case form.
        if (line[actual][head] == ';') {
          state = prevstate;
          // Look at the ';' again in the restored state. The former
          // "head--" followed by next_char() had the same effect, but
          // wrapped around when head was 0, which happens when a new line
          // begins with ';' while an entity is still open from the
          // previous line.
          continue;
        }
        break;
    }

    if (next_char(line[actual].c_str(), &head))
      return false;
  }
  // not reached: the loop above is only left through a return statement
}
// Fetch the next word into `t` using the default pattern tables of this file
// (__PATTERN__ and the empty __PATTERN2__).
bool XMLParser::next_token(std::string& t) {
  const bool found = next_token(__PATTERN__, __PATTERN_LEN__,
                                __PATTERN2__, __PATTERN_LEN2__, t);
  return found;
}
// Replace the current token by `word`, escaping the characters that are
// special in XML (& ' " > <) with the corresponding entities first.
//
// Returns the result of TextParser::change_token(): 1 on success, 0 when
// `word` is NULL.
int XMLParser::change_token(const char* word) {
  // The base class accepts NULL and reports failure; hand it over before
  // strstr()/strchr() below would dereference it.
  if (!word)
    return TextParser::change_token(word);

  if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL ||
      strchr(word, '&') != NULL || strchr(word, '<') != NULL ||
      strchr(word, '>') != NULL) {
    std::string r(word);
    // '&' is replaced in two steps via a placeholder that contains no '&'
    // (presumably so that mystrrep() cannot match the '&' of the "&amp;" it
    // has just inserted - confirm against mystrrep()).
    mystrrep(r, "&", "__namp;__");
    mystrrep(r, "__namp;__", "&amp;");
    mystrrep(r, APOSTROPHE, ENTITY_APOS);
    mystrrep(r, "\"", "&quot;");
    mystrrep(r, ">", "&gt;");
    mystrrep(r, "<", "&lt;");
    return TextParser::change_token(r.c_str());
  }
  return TextParser::change_token(word);
}

View File

@ -1,70 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef XMLPARSER_HXX_
#define XMLPARSER_HXX_
#include "textparser.hxx"
/*
* XML Parser
*
*/
// Tokenizer for XML-like text: a TextParser that skips the regions described
// by tables of opening/closing delimiter pairs.
class XMLParser : public TextParser {
public:
// Construct from an 8-bit word-character list.
explicit XMLParser(const char* wc);
// Construct from a table of `len` UTF-16 word characters.
XMLParser(const w_char* wordchars, int len);
// Tokenize with caller-supplied tables: `p` (len rows) lists the regions to
// skip, `p2` (len2 rows) the delimiters that enable attribute checking.
bool next_token(const char* p[][2],
unsigned int len,
const char* p2[][2],
unsigned int len2,
std::string&);
// Tokenize with the default tables of xmlparser.cxx.
virtual bool next_token(std::string&);
// Replace the current token by `word` after escaping XML special characters.
int change_token(const char* word);
virtual ~XMLParser();
private:
// Row of `p` whose entry in `column` matches the text at head, or -1.
int look_pattern(const char* p[][2], unsigned int len, int column);
int pattern_num; // row of the first table matched when the current tag was opened
int pattern2_num; // row of the second table matched at the same position, or -1
int prevstate; // state to return to after a word or a character entity
int checkattr; // attribute checking: 0 off, 1 armed, 2 active
char quotmark; // quote character that opened the current attribute value
};
#endif

View File

@ -1,9 +0,0 @@
analyze
bulkcheck
chmorph
example
hunspell
hunzip
hzip
munch
unmunch

View File

@ -1,195 +0,0 @@
#!/bin/sh
# affix compressor utility for Hunspell
# 2008 (c) László Németh, version 0.3
# usage: affixcompress sorted_word_list_file [max_affix_rules]
# Called without arguments: print the usage text and exit successfully.
if [ $# -eq 0 ]; then
    echo "affixcompress - compress a huge sorted word list to Hunspell format
Usage:
LC_ALL=C sort word_list >sorted_word_list
affixcompress sorted_word_list [max_affix_rules]
Default value of max_affix_rules = 5000
Note: output may need manually added affix parameters (SET character_encoding,
TRY suggestion_characters etc., see man(4) hunspell)"
    exit 0
fi
# Maximum number of affix rules: second argument, 5000 when it is omitted.
MAXAFFIX=${2:-5000}
# profiling
#AWK="pgawk --profile"
# Use gawk when it is installed, the system awk otherwise. "command -v" is
# the POSIX way to test for a command and, unlike "which", is silenced here
# so that the gawk path no longer ends up on stdout.
AWK="awk"
if command -v gawk >/dev/null 2>&1; then
    AWK="gawk"
fi
# Remove the results of an earlier run. The names are quoted so that a word
# list path containing blanks or glob characters is handled correctly.
rm -f "$1.aff" "$1.dic"
# Stage 1: count candidate suffixes.
# For every word, each prefix of length 2..len-1 is looked up among the words
# seen so far (A) and among the "word minus last character" keys (B). A hit in
# A counts the remainder in sfx; a hit in B counts it in sfy together with the
# character stored in B. Both tables are reset whenever the first two
# characters of the word change. The END rule prints "suffix 0 count" for sfx
# and "suffix<SUBSEP>char count" for sfy; tr turns the SUBSEP byte (octal 034)
# into a space so that every line has three fields.
cat $1 | $AWK '
{
# calculate frequent suffixes
A[$1] = 1
len = length($1)
if (len > 2) {
# print $1, substr($1, 1, len - 1), substr($1, len, 1) >"/dev/stderr"
B[substr($1, 1, len - 1)] = substr($1, len, 1);
}
for(i = 2; i < len; i++) {
r = substr($1, 1, i)
if (i == 2) {
if (prev != r) {
delete A
delete B
print "Deleted roots: ", prev > "/dev/stderr"
A[$1] = 1
}
prev = r
}
if (A[r]) {
# print $1 ": " r " és "substr($1, i + 1, len - i + 1) >"/dev/stderr"
sfx[substr($1, i + 1, len - i + 1)]++
} else if (B[r] && B[r] != substr($1, i + 1, 1)) {
r2 = substr($1, i + 1, len - i + 1)
sfy[r2,B[r]]++
}
}
}
END {
for (i in sfx) print i, 0, sfx[i]
for (i in sfy) print i, sfy[i]
}
' | tr '\034' ' ' >affixcompress0.tmp
# Keep the MAXAFFIX most frequent suffixes: sort by the count in field 3
# (numeric, descending), drop lines whose count is below 1, then truncate.
sort -rnk 3 affixcompress0.tmp | $AWK '$3 >= 1{print $0}' |
head -$MAXAFFIX >affixcompress1.tmp
# Stage 2: choose a root (and suffix flag) for every word.
# awk reads two inputs: the suffix table from standard input ("-") and then the
# word list given as the first script argument. Records from "-" fill
# sfx/sfxfr (plain suffixes, flag number = NR) and sfy/sfyfr/sfz (suffixes that
# replace a final character). Words are collected in W per group of identical
# first three characters; potential_roots() is run when the group changes and
# once more in END. Standard output ("root [flag]" lines) goes to
# affixcompress2.tmp, and the END rule writes the "<word_list>.aff" file.
# Note: potential_roots() also writes every word to files named "word" and
# "word2" in the current directory.
cat affixcompress1.tmp |
$AWK '
function potential_roots() {
# potential roots with most frequent suffixes
for(word in W) if (W[word]==1) {
print word >"word"
len = length(word);
for(i = 2; i < len; i++) {
root = substr(word, 1, i)
suff = substr(word, i + 1, len - i + 1)
if ((W[root]!="") && (sfxfr[suff] > 100)) C[root]++
if (sfz[suff]) {
l = split(sfz[suff], a)
for (k=1; k <= l; k++) if ((W[root a[k]]!="") && (sfyfr[root a[k]] > 100)) {
C[root a[k]]++
}
}
}
}
# calculate roots
for(word in W) if (W[word]==1) {
print word >"word2"
len = length(word);
z = 0
# choose most frequent root (maybe the original word)
max = C[word]
maxword = word
maxsuff = 0
for(i = 2; i < len; i++) {
root = substr(word, 1, i)
suff = substr(word, i + 1, len - i + 1)
if ((sfx[suff] != "") && (C[root] > max)) {
max = C[root]
maxword = root
maxsuff = sfx[suff]
}
if (sfz[suff] != "") {
l = split(sfz[suff], a)
for (k=1; k <= l; k++) if (C[root a[k]] > max) {
max = C[root a[k]]
maxword = root a[k]
maxsuff = sfy[suff,a[k]]
}
}
}
if (max > 0) {
if (maxsuff > 0) print maxword, maxsuff; else print maxword
A[maxword]++
z=1
} else {
for(i = 2; i < len; i++) {
root = substr(word, 1, i)
suff = substr(word, i + 1, len - i + 1)
if ((A[root] > 0) && sfx[suff]!="") {
print root, sfx[suff]
z = 1
break
}
if (sfz[suff]) {
l = split(sfz[suff], a)
for (k=1; k <= l; k++) if (A[root a[k]]!="") {
print root a[k], sfy[suff,a[k]]
z = 1
break
}
}
}
}
if (z == 0) {
print word
A[word]++
}
}
delete A
delete C
}
FILENAME == "-" {
if ($2 == 0) {
sfx[$1] = NR
sfxfr[$1] = $3
} else {
sfy[$1,$2] = NR
sfyfr[$1,$2] = $3
sfz[$1] = sfz[$1] " " $2
}
maxsuf = NR
next
}
{
cap = substr($1, 1, 3)
if (cap != prev) {
potential_roots()
delete W
print "Deleted class:", prev > "/dev/stderr"
}
prev = cap
W[$1] = 1
}
END {
potential_roots()
# write out frequent suffixes
out=FILENAME ".aff"
print "FLAG num" >out
for (i in sfx) if (sfx[i] > 0) {
print "SFX", sfx[i], "Y 1" >out
print "SFX", sfx[i], "0", i, "." >out
}
for (i in sfy) if (sfy[i] > 0) {
print "SFX", sfy[i], "Y 1" >out
split(i, c, "\034");
print "SFX", sfy[i], c[2], c[1], c[2] >out
}
}
' - $1 >affixcompress2.tmp
# Stage 3: build the .dic file.
# The "root [flag]" lines are sorted numerically on the flag field, then awk
# merges all flags of the same root into a comma separated list. awk itself
# writes the entry count as the first line of <word_list>.dic (variable "out"),
# while the entries ("root" or "root/flags") are printed to standard output,
# sorted, and appended to the same file by the shell.
sort -nk 2 affixcompress2.tmp >affixcompress3.tmp
cat affixcompress3.tmp | $AWK -v out="$1.dic" '
{
if (A[$1]=="") A[$1]=$2;
else if ($2!="") A[$1] = A[$1] "," $2
}
END {
for (i in A) n++
print n >out
for (i in A) {
if (A[i]=="") print i
else print i "/" A[i]
}
}
' | sort >>$1.dic

View File

@ -1,103 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include "hunspell.hxx"
#ifndef WIN32
using namespace std;
#endif
/**
 * analyze - demo of Hunspell's morphological analysis and generation.
 *
 * Usage: analyze affix_file dictionary_file file_of_words_to_check
 *
 * Each non-empty input line is either a single word (spell-checked, then
 * analysed and stemmed when it is known) or two words separated by a space
 * (morphological generation of the first word using the second).
 *
 * @return 0 on success; the process exits with status 1 on usage or file errors.
 */
int main(int, char** argv) {
  /* first parse the command line options */
  // argv[1], argv[2] AND argv[3] are all used below, so all three have to be
  // present. argv is NULL-terminated, and the loop stops at the first missing
  // argument, so it never reads past the terminator.
  for (int i = 1; i <= 3; ++i)
    if (!argv[i]) {
      fprintf(stderr, "correct syntax is:\nanalyze affix_file");
      fprintf(stderr, " dictionary_file file_of_words_to_check\n");
      fprintf(stderr, "use two words per line for morphological generation\n");
      exit(1);
    }

  /* open the words to check list */
  FILE* wtclst = fopen(argv[3], "r");
  if (!wtclst) {
    fprintf(stderr, "Error - could not open file to check\n");
    exit(1);
  }

  Hunspell* pMS = new Hunspell(argv[1], argv[2]);

  // Lines longer than the buffer are processed in several pieces.
  char buf[100];
  while (fgets(buf, sizeof(buf), wtclst)) {
    buf[strcspn(buf, "\n")] = 0;  // strip the trailing newline, if any
    if (*buf == '\0')
      continue;

    // morphgen demo
    char* s = strchr(buf, ' ');
    if (s) {
      // Split "word pattern" in place: buf is the word, s + 1 the pattern.
      *s = '\0';
      std::vector<std::string> result = pMS->generate(buf, s + 1);
      for (size_t i = 0; i < result.size(); ++i) {
        fprintf(stdout, "generate(%s, %s) = %s\n", buf, s + 1, result[i].c_str());
      }
      if (result.empty())
        fprintf(stdout, "generate(%s, %s) = NO DATA\n", buf, s + 1);
    } else {
      int dp = pMS->spell(std::string(buf));
      fprintf(stdout, "> %s\n", buf);
      if (dp) {
        std::vector<std::string> result = pMS->analyze(buf);
        for (size_t i = 0; i < result.size(); ++i) {
          fprintf(stdout, "analyze(%s) = %s\n", buf, result[i].c_str());
        }
        result = pMS->stem(buf);
        for (size_t i = 0; i < result.size(); ++i) {
          fprintf(stdout, "stem(%s) = %s\n", buf, result[i].c_str());
        }
      } else {
        fprintf(stdout, "Unknown word.\n");
      }
    }
  }

  delete pMS;
  fclose(wtclst);
  return 0;
}

View File

@ -1,196 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László for original code example.cxx
* Copyright (C) 2017 Pander for new code bulkcheck.cxx
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstring>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iomanip>
#include "config.h" // for macro VERSION
#include "hunspell.hxx"
using namespace std;
/**
 * bulkcheck - spell-check every line of a word list and print statistics.
 *
 * Usage: bulkcheck affix_file dictionary_file(s) file_of_words_to_check
 *
 * The LAST argument is the word list; argv[2] is the main dictionary and any
 * arguments between it and the word list are loaded as extra dictionaries.
 * For each input line one tab separated record is written to standard output:
 * the spell result, the word, and (for misspelled words) the suggestions
 * joined with ';'. Summary statistics are written to standard error.
 *
 * @return 0 on success; the process exits with status 1 on usage errors, when
 *         the word list cannot be opened, or when it contains no lines.
 */
int main(int argc, char** argv) {
  /* first parse the command line options */
  if (argc < 4) {
    //TODO refactor to use a library for this
    fprintf(stderr, "bulkcheck (now it works with more dictionary files):\n");
    // No result_file is accepted: the last argument is always opened as the
    // word list, so advertising a fourth positional argument was misleading.
    fprintf(stderr,
            "bulkcheck affix_file dictionary_file(s) file_of_words_to_check\n");
    exit(1);
  }

  /* open the words to check list, word is expected on each line */
  ifstream input_file(argv[argc - 1], ios_base::in);
  if (!input_file.is_open()) {
    fprintf(stderr, "Error - could not open file of words to check %s\n", argv[argc - 1]);
    exit(1);
  }

  Hunspell* hunspell = new Hunspell(argv[1], argv[2]);

  // load extra dictionaries, such as medical dictionaries or personal dictionaries that do not have affix file
  //TODO This should go into the documentation
  // The loop is empty when only one dictionary was given (argc == 4).
  for (int k = 3; k < argc - 1; ++k)
    hunspell->add_dic(argv[k]);

  // /* open output file */
  // string filename = string(argv[argc - 1]) + "-bulkcheck.tsv";
  // ofstream output_file(filename, ios_base::out);
  // if (!output_file.is_open()) {
  // fprintf(stderr, "Error - could not open result file\n");
  // exit(1);
  // }

  /* declare variables for iteration */
  string word;
  int num = 0;
  int num_space = 0;
  int num_correct = 0;
  int num_correct_space = 0;

  /* iterate all lines in input file */
  while (getline(input_file, word)) {
    /* count number of words and words with space */
    num++;
    bool has_space = false;
    if (count(word.begin(), word.end(), ' ') > 0 ) {
      has_space = true;
      num_space++;
    }

    bool is_correct = hunspell->spell(word);
    // output_file << is_success << "\t" << is_correct << "\t" << has_space << "\t" << has_suggest
    // << "\t" << word.c_str() << "\t" << expect.c_str() << "\t";
    cout << is_correct << "\t" << word.c_str() << "\t";

    bool is_first = true;
    if (is_correct) {
      num_correct++;
      if (has_space) {
        num_correct_space++;
      }
    } else {
      // Misspelled: append the suggestions, separated by ';'.
      vector<string> suggestions = hunspell->suggest(word.c_str());
      for (size_t i = 0; i < suggestions.size(); ++i) {
        if (is_first) {
          cout << suggestions[i];
          is_first = false;
        } else {
          cout << ";" << suggestions[i];
        }
      }
    }
    cout << endl;
  }
  input_file.close();

  if (num == 0) {
    // Report the word list itself (last argument), not the argument before it.
    fprintf(stderr, "ERROR: No words to check in file %s:\n", argv[argc - 1]);
    exit(1);
  }

  // Derived counts; num > 0 is guaranteed here, the per-class denominators
  // are guarded individually below.
  int num_nospace = num - num_space;
  int num_incorrect = num - num_correct;
  int num_incorrect_space = num_space - num_correct_space;
  int num_correct_nospace = num_correct - num_correct_space;
  int num_incorrect_nospace = num_nospace - num_correct_nospace;

  float per_nospace = 100.0 * num_nospace / num;
  float per_correct = 100.0 * num_correct / num;
  float per_correct_space = 100.0;
  if (num_space != 0) {
    per_correct_space = 100.0 * num_correct_space / num_space;
  }
  float per_correct_nospace = 100.0;
  if (num_nospace != 0) {
    per_correct_nospace = 100.0 * num_correct_nospace / num_nospace;
  }
  float per_space = 100.0 - per_nospace;
  float per_incorrect = 100.0 - per_correct;
  float per_incorrect_space = 100.0 - per_correct_space;
  float per_incorrect_nospace = 100.0 - per_correct_nospace;

  cerr << "Hunspell version\t" << VERSION << endl;
  cerr << "Hunspell affix\t" << argv[1] << endl;
  cerr << "Hunspell dict\t" << argv[2] << endl;
  cerr << "wordlist\t" << argv[argc - 1] << endl;
  cerr << "percentage of words without space\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_nospace << endl;
  cerr << "percentage of words with space\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_space << endl;
  cerr << "number of words\t" << num << endl;
  cerr << "number of correct words\t" << num_correct << endl;
  cerr << "number of incorect words\t" << num_incorrect << endl;
  cerr << "percentage of correct words\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_correct << endl;
  cerr << "percentage of incorrect words\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_incorrect << endl;
  cerr << "number of words without space\t" << num_nospace << endl;
  cerr << "number of correct words without space\t" << num_correct_nospace << endl;
  cerr << "number of incorrect words without space\t" << num_incorrect_nospace << endl;
  cerr << "percentage of correct words without space\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_correct_nospace << endl;
  cerr << "percentage of incorrect words without space\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_incorrect_nospace << endl;
  cerr << "number of words with space\t" << num_space << endl;
  cerr << "number of correct words with space\t" << num_correct_space << endl;
  cerr << "number of incorrect words with space\t" << num_incorrect_space << endl;
  cerr << "percentage of correct words with space\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_correct_space << endl;
  cerr << "percentage of incorrect words with space\t" << fixed << setw(6)
       << setprecision(2) << setfill('0') << per_incorrect_space << endl;

  // output_file.close();
  delete hunspell;
  return 0;
}

View File

@ -1,115 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include "hunspell.hxx"
#include "textparser.hxx"
#ifndef W32
using namespace std;
#endif
/**
 * chmorph - change affixes by morphological analysis and generation.
 *
 * Usage: chmorph affix_file dictionary_file file_to_convert STRING1 STRING2
 *
 * Every token of the input text is analysed; in each analysis the first
 * occurrence of STRING1 (argv[4]) is replaced by STRING2 (argv[5]). When at
 * least one analysis was changed, a new word form is generated from the
 * modified descriptions and substituted into the line, which is then printed.
 *
 * @return 0 on success; the process exits with status 1 on usage or file errors.
 */
int main(int, char** argv) {
  FILE* f;
  /* first parse the command line options */
  // argv[1]..argv[5] are all used below; argv is NULL-terminated and the
  // loop stops at the first missing argument.
  for (int i = 1; i < 6; i++)
    if (!argv[i]) {
      fprintf(
          stderr,
          "chmorph - change affixes by morphological analysis and generation\n"
          "correct syntax is:\nchmorph affix_file "
          "dictionary_file file_to_convert STRING1 STRING2\n"
          "STRINGS may be arbitrary parts of the morphological descriptions\n"
          "example: chmorph hu.aff hu.dic hu.txt SG_2 SG_3 "
          " (convert informal Hungarian second person texts to formal third "
          "person texts)\n");
      exit(1);
    }

  /* open the words to check list */
  f = fopen(argv[3], "r");
  if (!f) {
    fprintf(stderr, "Error - could not open file to check\n");
    exit(1);
  }

  Hunspell* pMS = new Hunspell(argv[1], argv[2]);
  TextParser* p = new TextParser(
      "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM");

  const size_t from_len = strlen(argv[4]);
  char buf[MAXLNLEN];
  while (fgets(buf, MAXLNLEN, f)) {
    p->put_line(buf);
    std::string next;
    while (p->next_token(next)) {
      std::vector<std::string> pl = pMS->analyze(next);
      if (!pl.empty()) {
        int gen = 0;
        for (size_t i = 0; i < pl.size(); ++i) {
          // Replace the first occurrence of STRING1 in place. The previous
          // code built the result with std::string(str, pos), which is the
          // "substring starting at pos" constructor and therefore dropped
          // the text before the match instead of keeping it.
          const std::string::size_type at = pl[i].find(argv[4]);
          if (at != std::string::npos) {
            pl[i].replace(at, from_len, argv[5]);
            gen = 1;
          }
        }
        if (gen) {
          std::vector<std::string> pl2 = pMS->generate(next, pl);
          if (!pl2.empty()) {
            p->change_token(pl2[0].c_str());
            // jump over the (possibly un)modified word
            (void)p->next_token(next);
          }
        }
      }
    }
    fprintf(stdout, "%s\n", p->get_line().c_str());
  }

  delete p;
  delete pMS;  // was leaked before
  fclose(f);
  return 0;
}

View File

@ -1,93 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstring>
#include <cstdlib>
#include <fstream>
#include "hunspell.hxx"
using namespace std;
int main(int argc, char** argv) {
/* first parse the command line options */
if (argc < 4) {
fprintf(stderr, "example (now it works with more dictionary files):\n");
fprintf(stderr,
"example affix_file dictionary_file(s) file_of_words_to_check\n");
exit(1);
}
/* open the words to check list */
std::ifstream wtclst(argv[argc - 1], std::ios_base::in);
if (!wtclst.is_open()) {
fprintf(stderr, "Error - could not open file of words to check\n");
exit(1);
}
Hunspell* pMS = new Hunspell(argv[1], argv[2]);
// load extra dictionaries
if (argc > 4)
for (int k = 3; k < argc - 1; ++k)
pMS->add_dic(argv[k]);
std::string buf;
while (std::getline(wtclst, buf)) {
int dp = pMS->spell(buf);
if (dp) {
fprintf(stdout, "\"%s\" is okay\n", buf.c_str());
fprintf(stdout, "\n");
} else {
fprintf(stdout, "\"%s\" is incorrect!\n", buf.c_str());
fprintf(stdout, " suggestions:\n");
std::vector<std::string> wlst = pMS->suggest(buf.c_str());
for (size_t i = 0; i < wlst.size(); ++i) {
fprintf(stdout, " ...\"%s\"\n", wlst[i].c_str());
}
fprintf(stdout, "\n");
}
// for the same of testing this code path
// do an analysis here and throw away the results
pMS->analyze(buf);
}
delete pMS;
return 0;
}

View File

@ -1,78 +0,0 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <hunspell/hunspell.hxx>
#include <sys/types.h>
#include <dirent.h>
#include <string.h>
#include <libgen.h>
// Hunspell instances shared by the fuzzer entry points: filled once by
// LLVMFuzzerInitialize and iterated by LLVMFuzzerTestOneInput. The pointers
// are not deleted anywhere in the fuzzer code shown here.
std::vector<Hunspell*> dictionaries;
// Returns true when `str` ends with `suffix` (an empty suffix always matches).
bool endswith(const std::string &str, const std::string &suffix)
{
    // A suffix longer than the string can never match.
    if (suffix.size() > str.size())
        return false;

    // Compare the tail of `str`, starting where the suffix would begin.
    const std::string::size_type tail_start = str.size() - suffix.size();
    return str.compare(tail_start, std::string::npos, suffix) == 0;
}
// libFuzzer start-up hook: creates one Hunspell instance for every "*.aff"
// file found in the directory of the fuzzer executable, pairing it with the
// ".dic" file of the same base name. Always returns 0; when the directory
// cannot be determined or opened, no dictionaries are loaded.
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv)
{
    char* exe_path = (*argv)[0];
    // dirname() can modify its argument.
    char* exe_path_copy = strdup(exe_path);
    if (exe_path_copy == NULL)
        return 0;
    // dirname() may return a pointer into the copy or into static storage,
    // so take our own copy of the result before releasing the buffer.
    const std::string dir(dirname(exe_path_copy));
    free(exe_path_copy);

    DIR* d = opendir(dir.c_str());
    if (d == NULL)
        return 0;  // readdir(NULL) would be undefined behaviour

    struct dirent *direntry;
    while ((direntry = readdir(d)) != NULL)
    {
        std::string entry(direntry->d_name);
        if (endswith(entry, ".aff"))
        {
            // d_name is relative to `dir`, not to the current working
            // directory, so the directory has to be prepended.
            const std::string aff = dir + "/" + entry;
            const std::string dic = aff.substr(0, aff.size() - 4) + ".dic";
            dictionaries.push_back(new Hunspell(aff.c_str(), dic.c_str()));
        }
    }
    closedir(d);
    return 0;
}
extern "C" int LLVMFuzzerTestOneInput(const char* data, size_t size)
{
std::string word(data, size);
for (std::vector<Hunspell*>::const_iterator it = dictionaries.begin(); it != dictionaries.end(); ++it)
{
Hunspell *dict = *it;
if (!dict->spell(word))
dict->suggest(word);
}
return 0;
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

File diff suppressed because it is too large Load Diff

View File

@ -1,60 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "hunzip.hxx"
#define DESC \
"hunzip - decompress a hzip file to the standard output\n" \
"Usage: hunzip file.hz [password]\n"
/**
 * hunzip - decompress a hzip file to the standard output.
 *
 * Usage: hunzip file.hz [password]
 *
 * @return 0 when the file was opened and written out; 1 when help was
 *         requested, arguments are missing, or the file could not be opened.
 */
int main(int argc, char** argv) {
  // argc < 2 also covers argc == 0, where argv[1] must not be read.
  if (argc < 2 || strcmp(argv[1], "-h") == 0) {
    // DESC contains no conversion specifications; fputs avoids using it as a
    // format string.
    fputs(DESC, stderr);
    return 1;
  }
  Hunzip h(argv[1], (argc > 2) ? argv[2] : NULL);
  if (!h.is_open())
    return 1;  // report the failure to the caller instead of exiting with 0
  std::string s;
  while (h.getline(s))
    printf("%s", s.c_str());
  return 0;
}

View File

@ -1,419 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* hzip: file compression for sorted dictionaries with optional encryption,
* algorithm: prefix-suffix encoding and 16-bit Huffman encoding */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <string>
#include <sys/stat.h>
/* number of distinct 16-bit symbols; index CODELEN of the code table holds
 * the terminal code */
#define CODELEN 65536
/* size in bytes of the line and bit buffers */
#define BUFSIZE 65536
/* suffix appended to the input file name to form the output file name */
#define EXTENSION ".hz"
/* byte written by prefixcompress() in front of bytes that need escaping */
#define ESCAPE 31
/* 3-byte file headers: plain and password-protected output */
#define MAGIC "hz0"
#define MAGIC_ENCRYPTED "hz1"
/* usage text */
#define DESC \
"hzip - dictionary compression utility\n" \
"Usage: hzip [-h | -P password ] [file1 file2 ..]\n" \
" -P password encrypted compression\n" \
" -h display this help and exit\n"
/* node kinds of the code tree: a leaf carrying a 16-bit symbol, the single
 * terminal entry, and an internal node created when two entries are merged */
enum { code_LEAF, code_TERM, code_NODE };
/* one entry of the frequency list / node of the code tree */
struct item {
/* 16-bit symbol (set for leaves by get_freqdata(), 0 otherwise) */
unsigned short word;
/* occurrence count; for merged nodes the sum of both children */
int count;
/* one of code_LEAF, code_TERM, code_NODE */
char type;
/* children; both NULL unless the node was created by a merge */
struct item* left;
struct item* right;
};
/* Print the message `err` (a printf-style format taking at most one string
 * argument, `par`) to standard error and return the failure status 1. */
int fail(const char* err, const char* par) {
  const int failure_status = 1;
  (void)fprintf(stderr, err, par);
  return failure_status;
}
/* Walk the code tree and store, for every non-internal node, its path as a
 * string of '1' (left branch) and '0' (right branch) characters in `table`.
 * Leaves are stored at the index of their symbol, the terminal entry at index
 * CODELEN. Call with code == NULL and deep == 0; the scratch buffer for the
 * path is then allocated here and released before returning. */
void code2table(struct item* tree, char** table, char* code, int deep) {
  /* the outermost call owns the scratch buffer */
  const int owns_buffer = (code == NULL);
  if (owns_buffer)
    code = (char*)malloc(CODELEN);

  /* left subtree: extend the path with '1' */
  code[deep] = '1';
  if (tree->left != NULL)
    code2table(tree->left, table, code, deep + 1);

  /* leaf or terminal entry: record the path collected so far */
  if (tree->type != code_NODE) {
    const int slot = (tree->type == code_TERM) ? CODELEN /* terminal code */
                                               : tree->word;
    code[deep] = '\0';
    table[slot] = (char*)malloc(deep + 1);
    strcpy(table[slot], code);
  }

  /* right subtree: extend the path with '0' */
  code[deep] = '0';
  if (tree->right != NULL)
    code2table(tree->right, table, code, deep + 1);

  if (owns_buffer)
    free(code);
}
/* Allocate a tree entry with count `c`, children `l` and `r`, and kind `t`;
 * its symbol is initialised to 0. The caller owns the returned memory. */
struct item* newitem(int c, struct item* l, struct item* r, int t) {
  struct item* node = (struct item*)malloc(sizeof(*node));
  node->left = l;
  node->right = r;
  node->count = c;
  node->word = 0;
  node->type = t;
  return node;
}
/* Count how often each 16-bit unit (pair of consecutive bytes) occurs in `f`
 * and build the list of tree leaves from it.
 *
 * dest     - receives a malloc'ed array of item pointers: one leaf per unit
 *            that occurs at least once, followed by the terminal entry
 * f        - input stream, read to its end
 * termword - receives the terminal word: bytes {1, last byte} when the input
 *            has an odd number of bytes, {0, 0} otherwise
 *
 * return length of the freq array (number of entries stored in *dest), or -1
 * when the array cannot be allocated */
int get_freqdata(struct item*** dest, FILE* f, unsigned short* termword) {
int freq[CODELEN];
int i, j, k, n;
/* two bytes viewed as one 16-bit value (in host byte order) */
union {
char c[2];
unsigned short word;
} u;
for (i = 0; i < CODELEN; i++)
freq[i] = 0;
/* read the input two bytes at a time; k is not read once j hit end of file */
while ((j = getc(f)) != -1 && (k = getc(f)) != -1) {
u.c[0] = j;
u.c[1] = k;
freq[u.word]++;
}
/* j != -1 here means the second read failed: one unpaired byte is left */
if (j != -1) {
u.c[0] = 1;
u.c[1] = j;
} else {
u.c[0] = 0;
u.c[1] = 0;
}
*dest = (struct item**)malloc((CODELEN + 1) * sizeof(struct item*));
if (!*dest)
return -1;
/* one leaf for every unit that actually occurs */
for (i = 0, n = 0; i < CODELEN; i++)
if (freq[i]) {
(*dest)[n] = newitem(freq[i], NULL, NULL, code_LEAF);
(*dest)[n]->word = i;
n++;
}
/* terminal sequence (also contains the last odd byte of the file) */
(*dest)[n] = newitem(1, NULL, NULL, code_TERM);
*termword = u.word;
return n + 1;
}
/* Build the code tree from the `n` entries of list `l` and fill `table` with
 * the resulting bit strings (see code2table()).
 * Each round selects the two entries with the smallest counts, replaces one of
 * them with a new code_NODE whose count is their sum and whose children are
 * the two entries, and removes the other from the list, until one entry (the
 * root) is left. The list `l` is modified in place. */
void get_codetable(struct item** l, int n, char** table) {
int i;
while (n > 1) {
/* indexes of the smallest (min) and second smallest (mi2) counts */
int min = 0;
int mi2 = 1;
for (i = 1; i < n; i++) {
if (l[i]->count < l[min]->count) {
mi2 = min;
min = i;
} else if (l[i]->count < l[mi2]->count)
mi2 = i;
}
/* merge both entries into slot min ... */
l[min] = newitem(l[min]->count + l[mi2]->count, l[min], l[mi2], code_NODE);
/* ... and close the gap left at slot mi2 */
for (i = mi2 + 1; i < n; i++)
l[i - 1] = l[i];
n--;
}
code2table(l[0], table, NULL, 0);
}
/* Append the bits described by `code` (a string of '0'/'1' characters) to
 * `bitbuf`, most significant bit first. `*bits` is the number of bits already
 * in the buffer and is advanced accordingly; whenever the buffer is completely
 * full it is written to `f` and `*bits` restarts at 0.
 * Returns 0 on success, 1 when writing to `f` fails. */
int write_bits(FILE* f, char* bitbuf, int* bits, char* code) {
  for (const char* digit = code; *digit != '\0'; ++digit) {
    const int byte_pos = *bits / 8;
    const int bit_pos = *bits % 8;
    const int shifted = (*digit - '0') << (7 - bit_pos);

    /* the first bit of a byte overwrites it, later bits are merged in */
    if (bit_pos == 0)
      bitbuf[byte_pos] = shifted;
    else
      bitbuf[byte_pos] |= shifted;

    ++*bits;
    if (*bits == BUFSIZE * 8) {
      /* buffer full: flush it and start over */
      if (fwrite(bitbuf, 1, BUFSIZE, f) != BUFSIZE)
        return 1;
      *bits = 0;
    }
  }
  return 0;
}
/* Write the compressed file to `f2`: header, code table, then the encoded
 * contents of `f`.
 *
 * table - bit strings indexed by 16-bit unit, index CODELEN = terminal code
 * n     - number of code records, stored in the header as two bytes
 * f     - input stream positioned at the data to encode
 * f2    - output stream
 * tw    - terminal word written as the code id of the terminal record
 * key   - password, or NULL for unencrypted output; when set, header fields
 *         and code records are XOR-ed with successive key bytes, wrapping to
 *         the start of the key at its end
 *
 * Returns 0 on success, 1 when a write fails. */
int encode_file(char** table,
int n,
FILE* f,
FILE* f2,
unsigned short tw,
char* key) {
char bitbuf[BUFSIZE];
int i, bits = 0;
unsigned char cl, ch;
int cx[2];
union {
char c[2];
unsigned short word;
} u;
/* current position in the key */
char* enc = key;
/* header and codes */
fprintf(f2, "%s", (key ? MAGIC_ENCRYPTED : MAGIC)); /* 3-byte HEADER */
cl = (unsigned char)(n & 0x00ff);
ch = (unsigned char)(n >> 8);
if (key) {
unsigned char cs;
/* check sum = XOR of all key bytes */
for (cs = 0; *enc; enc++)
cs ^= *enc;
fprintf(f2, "%c", cs); /* 1-byte check sum */
/* restart the key and mask both bytes of the record count */
enc = key;
ch ^= *enc;
if ((*(++enc)) == '\0')
enc = key;
cl ^= *enc;
}
fprintf(f2, "%c%c", ch, cl); /* upper and lower byte of record count */
for (i = 0; i < BUFSIZE; i++)
bitbuf[i] = '\0';
/* one record per table entry: code id, code length in bits, code bits */
for (i = 0; i < CODELEN + 1; i++)
if (table[i]) {
size_t nmemb;
u.word = (unsigned short)i;
/* the terminal entry is identified by the terminal word */
if (i == CODELEN)
u.word = tw;
if (key) {
if (*(++enc) == '\0')
enc = key;
u.c[0] ^= *enc;
if (*(++enc) == '\0')
enc = key;
u.c[1] ^= *enc;
}
fprintf(f2, "%c%c", u.c[0], u.c[1]); /* 2-character code id */
/* pack the code of this entry from the start of bitbuf */
bits = 0;
if (write_bits(f2, bitbuf, &bits, table[i]) != 0)
return 1;
if (key) {
if (*(++enc) == '\0')
enc = key;
fprintf(f2, "%c", ((unsigned char)bits) ^ *enc);
/* mask every byte of the packed code that is written below */
for (cl = 0; cl <= bits / 8; cl++) {
if (*(++enc) == '\0')
enc = key;
bitbuf[cl] ^= *enc;
}
} else
fprintf(f2, "%c", (unsigned char)bits); /* 1-byte code length */
nmemb = bits / 8 + 1;
if (fwrite(bitbuf, 1, bits / 8 + 1, f2) != nmemb) /* x-byte code */
return 1;
}
/* file encoding */
bits = 0;
/* encode the input two bytes at a time; an unpaired last byte is not
 * encoded here */
while ((cx[0] = getc(f)) != -1 && (cx[1] = getc(f)) != -1) {
u.c[0] = cx[0];
u.c[1] = cx[1];
if (write_bits(f2, bitbuf, &bits, table[u.word]) != 0)
return 1;
}
/* terminal suffixes */
if (write_bits(f2, bitbuf, &bits, table[CODELEN]) != 0)
return 1;
/* flush the bits still held in the buffer */
if (bits > 0) {
size_t nmemb = bits / 8 + 1;
if (fwrite(bitbuf, 1, nmemb, f2) != nmemb)
return 1;
}
return 0;
}
/*
 * Line-oriented pre-compression of `f` into `tempfile`.
 *
 * Each line read with fgets is compared with the previous one. For a line
 * that ends in '\n' the code drops the leading characters shared with the
 * previous line (at most 29) and a shared trailing run (at most 15), then
 * writes the remaining characters, an optional suffix marker and finally a
 * byte holding the shared-prefix length in place of the newline.
 * A chunk without a trailing '\n' is copied without prefix/suffix removal.
 *
 * Returns 0 on success, 1 when writing to `tempfile` fails.
 */
int prefixcompress(FILE* f, FILE* tempfile) {
char buf[BUFSIZE];
char buf2[BUFSIZE * 2];
char prev[BUFSIZE];
int prevlen = 0;
while (fgets(buf, BUFSIZE, f)) {
int i, j, k, m, c = 0;
int pfx = prevlen;
char* p = buf2;
m = j = 0;
/* j counts leading characters equal to the previous line; counting stops
 * at the first mismatch because pfx is cleared. i ends as the length. */
for (i = 0; buf[i]; i++) {
if ((pfx > 0) && (buf[i] == prev[i])) {
j++;
} else
pfx = 0;
}
if (i > 0 && buf[i - 1] == '\n') {
if (j == i)
j--; /* line duplicate */
if (j > 29)
j = 29;
c = j;
/* a prefix length equal to the tab character code is stored as 30 */
if (c == '\t')
c = 30;
/* common suffix */
for (; (m < i - j - 1) && (m < 15) && (prevlen - m - 2 >= 0) &&
buf[i - m - 2] == prev[prevlen - m - 2];
m++)
;
/* a one-character shared suffix is not encoded */
if (m == 1)
m = 0;
} else {
/* no newline: keep everything (m = -1 makes the copy below run to i) */
j = 0;
m = -1;
}
/* copy the part between shared prefix and shared suffix, escaping bytes
 * below 47 other than tab and space */
for (k = j; k < i - m - 1; k++, p++) {
if (((unsigned char)buf[k]) < 47 && buf[k] != '\t' && buf[k] != ' ') {
*p = ESCAPE;
p++;
}
*p = buf[k];
}
if (m > 0) {
*p = m + 31; /* 33-46 */
p++;
}
if (i > 0 && buf[i - 1] == '\n') {
/* the prefix-length byte c terminates the record */
size_t nmemb = p - buf2 + 1;
*p = c;
if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
return 1;
} else {
size_t nmemb = p - buf2;
if (fwrite(buf2, 1, nmemb, tempfile) != nmemb)
return 1;
}
/* remember this line (without terminator) for the next comparison */
memcpy(prev, buf, i);
prevlen = i;
}
return 0;
}
/*
 * Compress `filename` into `filename` + EXTENSION.
 *
 * The input is first prefix-compressed into a private temporary file, then
 * the frequency data and code table are built from that file and the final
 * encoded output is written. `key` is the optional password (NULL for
 * none) and is passed through to encode_file().
 *
 * Returns 0 on success, otherwise the value returned by fail().
 */
int hzip(const char* filename, char* key) {
  struct item** list;
  char* table[CODELEN + 1];
  int n;
  unsigned short termword;
  FILE* f = fopen(filename, "r");
  if (!f)
    return fail("hzip: %s: Permission denied\n", filename);

  /* create the temporary file readable/writable by the owner only */
  char tmpfiletemplate[] = "/tmp/hunspellXXXXXX";
  mode_t mask = umask(S_IXUSR | S_IRWXG | S_IRWXO);
  int tempfileno = mkstemp(tmpfiletemplate);
  umask(mask);
  if (tempfileno == -1) {
    fclose(f);
    return fail("hzip: cannot create temporary file\n", NULL);
  }

  /* "rw" is not a valid stdio mode (it yields a read-only stream on glibc);
   * the file is written first and read back after rewind(), so use "w+" */
  FILE* tempfile = fdopen(tempfileno, "w+");
  if (!tempfile) {
    close(tempfileno);
    unlink(tmpfiletemplate);
    fclose(f);
    return fail("hzip: cannot create temporary file\n", NULL);
  }

  std::string out(filename);
  out.append(EXTENSION);
  FILE* f2 = fopen(out.c_str(), "wb");
  if (!f2) {
    fclose(tempfile);
    fclose(f);
    unlink(tmpfiletemplate);
    return fail("hzip: %s: Permission denied\n", out.c_str());
  }

  /* clear every slot, including table[CODELEN] which encode_file() reads */
  for (n = 0; n <= CODELEN; n++)
    table[n] = NULL;

  if (prefixcompress(f, tempfile) != 0) {
    fclose(f2);
    fclose(tempfile);
    fclose(f);
    unlink(tmpfiletemplate);
    return fail("hzip: cannot write file\n", NULL);
  }

  /* first pass over the temporary file: statistics and code table */
  rewind(tempfile);
  n = get_freqdata(&list, tempfile, &termword);
  get_codetable(list, n, table);

  /* second pass: emit the encoded output */
  rewind(tempfile);
  n = encode_file(table, n, tempfile, f2, termword, key);

  free(list);
  fclose(f2);
  fclose(tempfile);
  fclose(f);
  unlink(tmpfiletemplate);
  if (n != 0)
    return fail("hzip: cannot write file\n", NULL);
  return n;
}
/*
 * Command line driver: every non-option argument is compressed with hzip().
 * "-h" prints the description, "-P <password>" sets the key used for the
 * files that follow it. Any other option is an error.
 */
int main(int argc, char** argv) {
  char* key = NULL;
  bool processed_file = false;

  for (int pos = 1; pos < argc; ++pos) {
    char* arg = argv[pos];

    if (arg[0] != '-') {
      if (hzip(arg, key) != 0)
        return 1;
      processed_file = true;
      continue;
    }

    const char option = arg[1];
    if (option == 'h')
      return fail(DESC, NULL);
    if (option != 'P')
      return fail("hzip: no such option: %s\n", arg);
    if (pos + 1 == argc)
      return fail("hzip: missing password\n", NULL);

    /* consume the password argument as well */
    key = argv[++pos];
  }

  if (!processed_file)
    return fail("hzip: need a filename parameter\n", NULL);
  return 0;
}

View File

@ -1,471 +0,0 @@
#!/usr/bin/perl -w
# -*- coding: iso-8859-1 -*-
# $Id$
#
# (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Print the command line help and terminate the program.
# The charset list names every charset handled by mylc (latin0..latin3).
sub usage {
    print "ispellaff2myspell: A program to convert ispell affix tables to myspell format
(C) 2002-2005 Agustin Martin Domingo <agustin.martin\@hispalinux.es> License: GPL
Usage:
ispellaff2myspell [options] <affixfile>
Options:
--affixfile=s Affix file
--bylocale Use current locale setup for upper/lowercase
conversion
--charset=s Use specified charset for upper/lowercase
conversion (defaults to latin1)
--debug Print debugging info
--extraflags Allow some non alphabetic flags
--lowercase=s Lowercase string
--myheader=s Header file
--printcomments Print commented lines in output
--replacements=s Replacements file
--split=i Split flags with more than i entries
--uppercase=s Uppercase string
--wordlist=s Still unused
Currently allowed values for charset are: latin0, latin1, latin2, latin3
This script does not create the dict file. Something like
( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
should do the work, with mydict.words+ being the ispell munched wordlist
";
    exit;
}
# Write the arguments (joined by spaces) to STDERR, but only when the
# --debug option has been given.
sub debugprint {
    print STDERR "@_" if $debug;
}
# Print the affix entries collected in @flag_array for the current flag,
# preceded by a "<affix> <flag> <combine> <count>" header line, then reset
# the per-flag state (@flag_array, $flagname, $flagcombine).
# With --split the entries are emitted in chunks of $split lines, each with
# its own header; every chunk except the last gets a trailing " S" on its
# header line.
sub shipoutflag{
my $flag_entries=scalar @flag_array;
if ( $flag_entries != 0 ){
if ( $split ){
while ( @flag_array ){
# take the next chunk; @flag_array shrinks on every iteration
my @flag_subarray=splice(@flag_array,0,$split);
my $subflag_entries=scalar @flag_subarray;
if ( scalar @flag_array ){
print "$myaffix $flagname $flagcombine $subflag_entries S\n";
} else {
print "$myaffix $flagname $flagcombine $subflag_entries\n";
}
print join("\n",@flag_subarray);
print "\n\n";
}
} else {
print "$myaffix $flagname $flagcombine $flag_entries\n";
print join("\n",@flag_array);
print "\n\n";
}
}
@flag_array=();
$flagname='';
$flagcombine='';
}
# Return a lowercased copy of the argument.
# With --bylocale the conversion uses lc under "use locale"; otherwise a
# tr/// built from $uppercase and $lowercase is applied. For the known
# charsets those two strings are set here, for any other charset they must
# have been supplied with --lowercase/--uppercase or the script dies.
# NOTE(review): the non-ASCII characters in the charset literals below look
# mis-encoded in this copy (the file declares iso-8859-1) - verify them
# against the original bytes before relying on them.
sub mylc{
my $inputstring=shift;
my $outputstring;
if ( $bylocale ){
{
use locale;
$outputstring = lc $inputstring;
}
} else {
if ( $charset eq "latin0" ){
$lowercase='a-z珀矣粤肄蓍裨跋鈿韵鴦<E99FB5><E9B4A6>巐鄕<E5B790><EFBFBD>';
$uppercase='A-Z請唾津毒班碧麺力佰厶壞嶷掣桀毳痔<E6AFB3>';
} elsif ( $charset eq "latin1" ){
$lowercase='a-z珀矣粤肄蓍裨跋鈿韵鴦<E99FB5><E9B4A6>巐鄕<E5B790>';
$uppercase='A-Z請唾津毒班碧麺力佰厶壞嶷掣桀毳';
} elsif ( $charset eq "latin2" ){
$lowercase='a-z嘘偽杭纂梢珀矣粤肄蓍裨跋鈿韵鴦<E99FB5><E9B4A6>巐鄕<E5B790>';
$uppercase='A-Z。ウ<E38082><E382A6><EFBFBD>請唾津毒班碧麺力佰厶壞嶷掣桀毳';
} elsif ( $charset eq "latin3" ){
$lowercase='a-z蔚杭纂逗痰粤肄蓍裨跋鈿髓齡<E9AB93><E9BDA1>巐鄕<E5B790>';
$uppercase='A-Z・<5A><E383BB><EFBFBD>疎津毒班碧麺力冫嘖孛忤掣桀毳';
# } elsif ( $charset eq "other_charset" ){
# die "latin2 still unimplemented";
} else {
if ( not $lowercase and not $uppercase ){
die "Unsupported charset [$charset]
Explicitly use --lowercase=string and --uppercase=string
options. Remember that both string must match exactly, but
case changed.
";
}
}
$outputstring=$inputstring;
# tr/// does not interpolate variables, so the statement is built as a
# string and evaluated
eval "\$outputstring=~tr/$uppercase/$lowercase/";
}
return $outputstring;
}
# Check a flag name taken from the ispell affix file.
# Returns the flag itself when it contains at least one ASCII letter.
# When extra flags are enabled, a flag starting with one of the configured
# prefixes (keys of %theextraflags, used as regular expressions) is returned
# with that prefix removed. Otherwise returns the empty string.
#
# The former empty prototype "()" declared a sub without arguments although
# one is required; it has been dropped so plain calls compile as well.
sub validate_flag {
    my $flag = shift;

    if ($flag=~m/[a-zA-Z]+/){
        return $flag;
    } elsif ( $hasextraflags ){
        foreach ( keys %theextraflags ){
            if ($flag =~ m/^$_/){
                $flag =~ s/^$_//;
                return $flag;
            }
        }
    }
    return '';
}
# Copy the replacement table from $file to standard output.
# Only lines starting with "REP" followed by a non-digit are kept; an
# existing "REP <number>" count line is skipped and a fresh count line is
# printed in front of the collected entries. Line endings (CR, CRLF, LF)
# are removed from every entry.
# Dies when the file cannot be opened.
sub process_replacements{
    my $file = shift;
    my @replaces = ();

    # three-argument open: the file name is never interpreted as a mode
    open (my $replace_fh, '<', $file) ||
        die "Error: Could not open replacements file: $file\n";
    while (<$replace_fh>){
        next unless m/^REP[\s\t]*\D.*/;
        next if m/^REP\s+[0-9]+/;
        s/\015\012//;
        s/\015//;
        chomp;
        push @replaces, $_;
    }
    close $replace_fh;

    my $number = scalar @replaces;
    print "REP $number\n";
    foreach ( @replaces ){
        print $_ . "\n";
    }
}
# -----------------------------------------------------------
# Now the program starts, after the functions are defined
# -----------------------------------------------------------
use Getopt::Long;

# Initializing option values
$affixfile     = '';
$bylocale      = '';
$charset       = '';
$debug         = '';
$lowercase     = '';
$myheader      = '';
$printcomments = '';
$replacements  = '';
$split         = '';
$uppercase     = '';
$wordlist      = '';
$hasextraflags = '';
@flag_array    = ();
%theextraflags = ();

# Initializing root values
$rootremove = "0";
$rootname   = '';
$addtoroot  = '';
$comment    = '';

# Initializing flag values
$flagname    = '';
$flagcombine = '';
$inflags     = '';

# --extraflags takes an optional value; every value given is recorded as an
# allowed flag prefix.
GetOptions ('affixfile=s'   => \$affixfile,
            'bylocale'      => \$bylocale,
            'charset=s'     => \$charset,
            'debug'         => \$debug,
            'extraflags:s'  => sub {
                $hasextraflags = 1;
                shift;
                $theflag = shift;
                $theextraflags{$theflag}++ if $theflag},
            'lowercase=s'   => \$lowercase,
            'myheader=s'    => \$myheader,
            'printcomments' => \$printcomments,
            'replacements=s'=> \$replacements,
            'split=i'       => \$split,
            'uppercase=s'   => \$uppercase,
            'wordlist=s'    => \$wordlist) or usage;

# The affix file may also be given as a plain argument.
if ( not $affixfile ){
    $affixfile=shift or usage;
}

if ( $charset and ( $lowercase or $uppercase )){
    die "Error: charset and lowercase/uppercase options
are incompatible. Use either charset or lowercase/uppercase options to
specify the patterns
"
} elsif ( not $lowercase and not $uppercase and not $charset ){
    $charset="latin1";
}

# --extraflags without a value: allow flags escaped with a backslash.
if ( scalar(keys %theextraflags) == 0 && $hasextraflags ){
    $theextraflags{"\\\\"}++;
}

debugprint "$affixfile $charset";

# three-argument open: the file name is never interpreted as a mode
open (AFFIXFILE, '<', $affixfile) ||
    die "Error: Could not open affix file: $affixfile";

# Copy the user supplied header in front of the generated affix table.
# The file is read directly instead of through a shell "cat", so special
# characters in the name are harmless and a missing file is reported.
if ( $myheader ){
    open (my $header_fh, '<', $myheader) ||
        die "Error: Could not open header file: $myheader\n";
    my $myspell_header = do { local $/; <$header_fh> };
    close $header_fh;
    $myspell_header = '' unless defined $myspell_header;
    print $myspell_header . "\n";
}
# Main conversion loop: read the ispell affix file line by line.
# "prefixes"/"suffixes" lines select the affix type, "flag" lines start a
# new flag (the previous one is printed by shipoutflag), and every other
# line inside a flag is turned into one myspell affix entry that is
# collected in @flag_array.
while (<AFFIXFILE>){
chomp;
if (/^\s*\#.*/){
debugprint "Ignoring line $.\n";
print "$_\n" if $printcomments;
} elsif (/^\s*$/){
debugprint "Ignoring line $.\n";
} elsif (/^\s*prefixes/){
debugprint "Prefixes starting in line $.\n";
$affix="PFX";
} elsif (/^\s*suffixes/){
debugprint "Suffixes starting in line $.\n";
$affix="SFX";
} elsif (/^[\s\t]*flag.*/){
next if not $affix; # In case we are still in the preamble
shipoutflag if $inflags;
$inflags="yes";
# reduce the line to the bare flag name
s/^[\s\t]*flag[\s\t]*//;
s/[\s\t]*:.*$//;
debugprint "Found flag $_ in line $.\n";
# a "*" marks a flag that may be combined (myspell cross product "Y")
if (/\*/){
s/[\*\s]//g;
$flagcombine="Y";
debugprint "Flag renamed to $_ with combine=$flagcombine\n";
} else {
$flagcombine="N";
}
# invalid flags are still emitted, but with the affix type commented out
if ( $flagname = &validate_flag($_) ){
$myaffix = $affix;
} else {
$myaffix = "\# $affix";
$flagname = $_;
print STDERR "Ignoring invalid flag $flagname in line $.\n";
}
} elsif ( $affix and $inflags ) {
# entry line: "<condition> > [-<strip>,]<append>  # comment"
($rootname,@comments) = split('#',$_);
$comment = '# ' . join('#',@comments);
$rootname =~ s/\s*//g;
$rootname = mylc $rootname;
($rootname,$addtoroot) = split('>',$rootname);
# a leading "-" introduces the characters to strip from the root
if ( $addtoroot =~ s/^\-//g ){
($rootremove,$addtoroot) = split(',',$addtoroot);
$addtoroot = "0" unless $addtoroot;
$addtoroot = "0" if ( $addtoroot eq "-");
} else {
$rootremove = "0";
}
$addtoroot =~ s/\\\-/\-/g; # prefix ANTI\- to anti-
# an unconditional entry that strips characters uses them as condition
if ( $rootname eq '.' && $rootremove ne "0" ){
$rootname = $rootremove;
}
debugprint "$rootname, $addtoroot, $rootremove\n";
if ( $printcomments ){
$affix_line=sprintf("%s %s %-5s %-11s %-24s %s",
$myaffix, $flagname, $rootremove,
$addtoroot, $rootname, $comment);
} else {
$affix_line=sprintf("%s %s %-5s %-11s %s",
$myaffix, $flagname, $rootremove,
$addtoroot, $rootname);
}
# reset the per-entry state before the next line
$rootremove = "0";
$rootname = '';
$addtoroot = '';
$comment = '';
@comments = ();
push @flag_array,$affix_line;
debugprint "$affix_line\n";
} else {
#
}
}
# print the entries of the last flag
shipoutflag;
close AFFIXFILE;
# optionally append the replacement table
if ( $replacements ){
&process_replacements($replacements);
}
__END__
=head1 NAME
B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.
=head1 SYNOPSIS
ispellaff2myspell [options] <affixfile> --myheader your_header
Options:
--affixfile=s Affix file
--bylocale Use current locale setup for upper/lowercase
conversion
--charset=s Use specified charset for upper/lowercase
conversion (defaults to latin1)
--debug Print debugging info
--extraflags=s Allow some non alphabetic flags
--lowercase=s Lowercase string
--myheader=s Header file
--printcomments Print commented lines in output
--replacements=s Replacements file
--split=i Split flags with more than i entries
--uppercase=s Uppercase string
=head1 DESCRIPTION
B<ispellaff2myspell> is a script that will convert ispell affix tables
to myspell format in a more or less successful way.
This script does not create the dict file. Something like
( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
should do the work, with mydict.words+ being the munched wordlist
=head1 OPTIONS
=over 8
=item B<--affixfile=s>
Affix file. You can put it directly in the command line.
=item B<--bylocale>
Use current locale setup for upper/lowercase conversion. Make sure
that the selected locale matches the one of the dictionary, or you might
get into trouble.
=item B<--charset=s>
Use specified charset for upper/lowercase conversion (defaults to latin1).
Currently allowed values for charset are: latin0, latin1, latin2, latin3.
=item B<--debug>
Print some debugging info.
=item B<--extraflags:s>
Allows some non alphabetic flags.
When invoked with no value the supported flags are currently those
corresponding to chars represented with the escape char B<\> as
first char. B<\> will be stripped.
When given with the flag prefix will allow that flag and strip the
given prefix. Be careful when giving the prefix to properly escape chars,
e.g. you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to
B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all
flags and pass them unmodified.
You will need a call to -e for each flag type, e.g.,
B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).
When a prefix is explicitly set, the default value (anything starting with B<\>)
is disabled and you need to enable it explicitly as in the previous example.
=item B<--lowercase=s>
Lowercase string. Manually set the string of lowercase chars. This
requires B<--uppercase> having exactly that string but uppercase.
=item B<--myheader=s>
Header file. The myspell aff header. You need to write it
manually. This can contain everything you want to be before the affix table
=item B<--printcomments>
Print commented lines in output.
=item B<--replacements=file>
Add a pre-defined replacements table taken from 'file' to the .aff file.
Will skip lines not beginning with REP, and set the replacements number
appropriately.
=item B<--split=i>
Split flags with more than i entries. This can be of interest for flags
having a lot of entries. Will split the flag into chunks containing B<i>
entries.
=item B<--uppercase=s>
Uppercase string. Manually set the string of uppercase chars. This
requires B<--lowercase> having exactly that string but lowercase.
=back
If your encoding is currently unsupported you can send me a file with
the two strings of lower and uppercase chars. Note that they must match
exactly but case changed. It will look something like
$lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
$uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
=head1 SEE ALSO
The OpenOffice.org Lingucomponent Project home page
L<http://lingucomponent.openoffice.org/index.html>
and the document
L<http://lingucomponent.openoffice.org/affix.readme>
that provides information about the basics of the myspell affix file format.
You can also take a look at
/usr/share/doc/libmyspell-dev/affix.readme.gz
/usr/share/doc/libmyspell-dev/README.compoundwords
/usr/share/doc/libmyspell-dev/README.replacetable
in your Debian system.
=head1 AUTHORS
Agustin Martin <agustin.martin@hispalinux.es>
=cut

View File

@ -1,115 +0,0 @@
#!/bin/sh
# makealias: make alias compressed dic and aff files
# Usage: makealias file.dic file.aff (not makealias file.aff file.dic!)
# Version: 2007-10-26

# Both file arguments are required; report misuse on stderr and fail.
case $# in
0|1)
  echo 'makealias: make alias compressed dic and aff files
Usage: makealias file.dic file.aff (not makealias file.aff file.dic!)' >&2
  exit 1;;
esac

# Output names are derived from the input names without their extension.
# The arguments are quoted so paths containing spaces keep working.
DIC=`basename "$1" .dic`
AFF=`basename "$2" .aff`

# FLAG type definition must be before alias definitions
grep '^FLAG' "$2" >"${AFF}_alias.aff"
# Rewrite both input files in a single awk run.
#  - Flag strings (the part after "/") are replaced by a running number;
#    a[] maps flag string -> number, b[] is the reverse list.
#  - Additional fields after the first one are replaced by a second running
#    number (a2[] / c[]).
#  - Rewritten .dic lines go to standard output (the new .dic file);
#    rewritten .aff lines go to /dev/stderr, which is redirected into a
#    temporary file named with the shell PID.
#  - The END block appends the "AF" and "AM" alias tables to the new .aff
#    file, directly after the FLAG line written earlier.
awk 'BEGIN{n=1;m=1}
function cutslash(st) {
if (split(st,t,"/") > 1) return t[1]
return st
}
function ltrim(st) {
sub(/^ +/,"",st)
return st
}
FILENAME ~ /.dic$/ && $1 ~ "/[^ \t]" {
split($1,t,"/")
if(!a[t[2]]){
a[t[2]]=n
b[n]=t[2]
n++
}
if (NF > 1) {
$1 = ""
if(!a2[$0]){
a2[$0]=m
c[m]=$0
m++
}
print t[1]"/"a[t[2]] "\t" a2[$0]
} else {
print t[1]"/"a[t[2]]
}
next
}
FILENAME ~ /.dic$/ && NF > 1 {
x = $1
$1 = ""
if(!a2[$0]){
a2[$0]=m
c[m]=$0
m++
}
print cutslash(x) "\t" a2[$0]
next
}
FILENAME ~ /.dic$/ { print cutslash($1) }
FILENAME ~ /.aff$/ && /^[PS]FX/ && ($4 ~ /\/[^ ]/) && NF > 4 {
split($4,t,"/")
if(!a[t[2]]){
a[t[2]]=n
b[n]=t[2]
n++
}
begin = $1 " " $2 " " $3 " " (t[1]"/"a[t[2]]) " " $5
if ($6!="") ok = 1; else ok = 0;
$1 = ""
$2 = ""
$3 = ""
$4 = ""
$5 = ""
if(ok){
if(!a2[$0]){
a2[$0]=m
c[m]=$0
m++
}
print begin " " a2[$0] >>"/dev/stderr"
} else print begin >>"/dev/stderr"
next
}
FILENAME ~ /.aff$/ && /^[PS]FX/ && NF > 4 {
begin = $1 " " $2 " " $3 " " cutslash($4) " " $5
if ($6!="") ok = 1; else ok = 0;
$1 = ""
$2 = ""
$3 = ""
$4 = ""
$5 = ""
if(ok) {
if (!a2[$0]){
a2[$0]=m
c[m]=$0
m++
}
print begin " " a2[$0] >>"/dev/stderr"
} else print begin >>"/dev/stderr"
next
}
FILENAME ~ /.aff$/ { print $0 >>"/dev/stderr" }
END{
if (n>1) {
print "AF", n-1 >>"'${AFF}_alias.aff'"
for(i=1;i<n;i++) print "AF", b[i],"#",i >>"'${AFF}_alias.aff'"
}
if (m>1) {
print "AM", m-1 >>"'${AFF}_alias.aff'"
for(i=1;i<m;i++) print "AM " ltrim(c[i]) >>"'${AFF}_alias.aff'"
}
}' $1 $2 >${DIC}_alias.dic 2>${AFF}_alias.$$
# Append the rewritten affix lines (FLAG was already written first) and
# remove the temporary file.
grep -v '^FLAG' ${AFF}_alias.$$ >>${AFF}_alias.aff
echo "output: ${DIC}_alias.dic, ${AFF}_alias.aff"
rm ${AFF}_alias.$$

View File

@ -1,868 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* Munch a word list and generate a smaller root word list with affixes*/
#include <ctype.h>
#include <string.h>
#include <string>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <limits>
#include "munch.h"
/*
 * munch word_list_file affix_file
 *
 * Loads the affix file and the word list, then for every word looks for
 * root words plus affixes that would generate it. A root/affix choice is
 * accepted only when every word generated by the resulting affix string is
 * itself in the word list; among accepted candidates the shortest root
 * wins. Words that could not be attached to a root are kept as they are.
 * Finally the number of kept words and the kept words (with their affix
 * flags) are written to stdout. Progress messages go to stderr.
 */
int main(int argc, char** argv) {
int i, j, k, n;
int rl, p, nwl;
int al;
FILE* wrdlst;
FILE* afflst;
char *nword, *wf, *af;
char as[(MAX_PREFIXES + MAX_SUFFIXES)];
char* ap;
struct hentry* ep;
struct hentry* ep1;
struct affent* pfxp;
struct affent* sfxp;
(void)argc;
/* first parse the command line options */
/* arg1 - wordlist, arg2 - affix file */
if (argv[1]) {
wf = mystrdup(argv[1]);
} else {
fprintf(stderr, "correct syntax is:\n");
fprintf(stderr, "munch word_list_file affix_file\n");
exit(1);
}
if (argv[2]) {
af = mystrdup(argv[2]);
} else {
fprintf(stderr, "correct syntax is:\n");
fprintf(stderr, "munch word_list_file affix_file\n");
exit(1);
}
/* open the affix file */
afflst = fopen(af, "r");
if (!afflst) {
fprintf(stderr, "Error - could not open affix description file\n");
exit(1);
}
/* step one is to parse the affix file building up the internal
affix data structures */
numpfx = 0;
numsfx = 0;
if (parse_aff_file(afflst)) {
fprintf(stderr, "Error - in affix file loading\n");
exit(1);
}
fclose(afflst);
fprintf(stderr, "parsed in %d prefixes and %d suffixes\n", numpfx, numsfx);
/* affix file is now parsed so create hash table of wordlist on the fly */
/* open the wordlist */
wrdlst = fopen(wf, "r");
if (!wrdlst) {
fprintf(stderr, "Error - could not open word list file\n");
exit(1);
}
if (load_tables(wrdlst)) {
fprintf(stderr, "Error building hash tables\n");
exit(1);
}
fclose(wrdlst);
/* visit every word: each table slot starts a chain linked through next */
for (i = 0; i < tablesize; i++) {
ep = &tableptr[i];
if (ep->word == NULL)
continue;
for (; ep != NULL; ep = ep->next) {
/* aff_chk fills roots[] / numroots with the candidates for this word */
numroots = 0;
aff_chk(ep->word, strlen(ep->word));
if (numroots) {
/* now there might be a number of combinations */
/* of prefixes and suffixes that might match this */
/* word. So how to choose? As a first shot look */
/* for the shortest remaining root word to */
/* to maximize the combinatorial power */
/* but be careful, do not REQUIRE a specific combination */
/* of a prefix and a suffix to generate the word since */
/* that violates the rule that the root word with just */
/* the prefix or just the suffix must also exist in the */
/* wordlist as well */
/* in fact because of the cross product issue, this not a */
/* simple choice since some combinations of previous */
/* prefixes and new suffixes may not be valid. */
/* The only way to know is to simply try them all */
/* rl: shortest accepted root length so far, p: its index in roots[] */
rl = 1000;
p = -1;
for (j = 0; j < numroots; j++) {
/* first collect the root word info and build up */
/* the potential new affix string */
nword = (roots[j].hashent)->word;
nwl = strlen(nword);
*as = '\0';
ap = as;
if (roots[j].prefix)
*ap++ = (roots[j].prefix)->achar;
if (roots[j].suffix)
*ap++ = (roots[j].suffix)->achar;
if ((roots[j].hashent)->affstr) {
strcpy(ap, (roots[j].hashent)->affstr);
} else {
*ap = '\0';
}
al = strlen(as);
/* now expand the potential affix string to generate */
/* all legal words and make sure they all exist in the */
/* word list */
numwords = 0;
wlist[numwords].word = mystrdup(nword);
wlist[numwords].pallow = 0;
numwords++;
n = 0;
if (al)
expand_rootword(nword, nwl, as);
/* count the generated words found in the list and release them */
for (k = 0; k < numwords; k++) {
if (lookup(wlist[k].word))
n++;
free(wlist[k].word);
wlist[k].word = NULL;
wlist[k].pallow = 0;
}
/* if all exist in word list then okay */
if (n == numwords) {
if (nwl < rl) {
rl = nwl;
p = j;
}
}
}
if (p != -1) {
/* attach the affix flags to the chosen root and keep the root */
ep1 = roots[p].hashent;
pfxp = roots[p].prefix;
sfxp = roots[p].suffix;
ep1->keep = 1;
if (pfxp != NULL)
add_affix_char(ep1, pfxp->achar);
if (sfxp != NULL)
add_affix_char(ep1, sfxp->achar);
} else {
ep->keep = 1;
}
} else {
ep->keep = 1;
}
}
}
/* now output only the words to keep along with affixes info */
/* first count how many words that is */
k = 0;
for (i = 0; i < tablesize; i++) {
ep = &tableptr[i];
if (ep->word == NULL)
continue;
for (; ep != NULL; ep = ep->next) {
if (ep->keep > 0)
k++;
}
}
fprintf(stdout, "%d\n", k);
/* then print each kept word, followed by "/flags" when it has any */
for (i = 0; i < tablesize; i++) {
ep = &tableptr[i];
if (ep->word == NULL)
continue;
for (; ep != NULL; ep = ep->next) {
if (ep->keep > 0) {
if (ep->affstr != NULL) {
fprintf(stdout, "%s/%s\n", ep->word, ep->affstr);
} else {
fprintf(stdout, "%s\n", ep->word);
}
}
}
}
return 0;
}
int parse_aff_file(FILE* afflst) {
int i, j;
int numents = 0;
char achar = '\0';
short ff = 0;
struct affent* ptr = NULL;
struct affent* nptr = NULL;
char* line = (char*)malloc(MAX_LN_LEN);
while (fgets(line, MAX_LN_LEN, afflst)) {
mychomp(line);
char ft = ' ';
fprintf(stderr, "parsing line: %s\n", line);
if (strncmp(line, "PFX", 3) == 0)
ft = 'P';
if (strncmp(line, "SFX", 3) == 0)
ft = 'S';
if (ft != ' ') {
char* tp = line;
char* piece;
i = 0;
ff = 0;
while ((piece = mystrsep(&tp, ' '))) {
if (*piece != '\0') {
switch (i) {
case 0:
break;
case 1: {
achar = *piece;
break;
}
case 2: {
if (*piece == 'Y')
ff = XPRODUCT;
break;
}
case 3: {
numents = atoi(piece);
if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
sizeof(struct affent)) < static_cast<size_t>(numents))) {
fprintf(stderr, "Error: too many entries: %d\n", numents);
numents = 0;
} else {
ptr = (struct affent*)malloc(numents * sizeof(struct affent));
ptr->achar = achar;
ptr->xpflg = ff;
fprintf(stderr, "parsing %c entries %d\n", achar, numents);
}
break;
}
default:
break;
}
i++;
}
free(piece);
}
/* now parse all of the sub entries*/
nptr = ptr;
for (j = 0; j < numents; j++) {
if (!fgets(line, MAX_LN_LEN, afflst))
return 1;
mychomp(line);
tp = line;
i = 0;
while ((piece = mystrsep(&tp, ' '))) {
if (*piece != '\0') {
switch (i) {
case 0: {
if (nptr != ptr) {
nptr->achar = ptr->achar;
nptr->xpflg = ptr->xpflg;
}
break;
}
case 1:
break;
case 2: {
nptr->strip = mystrdup(piece);
nptr->stripl = strlen(nptr->strip);
if (strcmp(nptr->strip, "0") == 0) {
free(nptr->strip);
nptr->strip = mystrdup("");
nptr->stripl = 0;
}
break;
}
case 3: {
nptr->appnd = mystrdup(piece);
nptr->appndl = strlen(nptr->appnd);
if (strcmp(nptr->appnd, "0") == 0) {
free(nptr->appnd);
nptr->appnd = mystrdup("");
nptr->appndl = 0;
}
break;
}
case 4: {
encodeit(nptr, piece);
}
fprintf(stderr, " affix: %s %d, strip: %s %d\n", nptr->appnd,
nptr->appndl, nptr->strip, nptr->stripl);
// no break
default:
break;
}
i++;
}
free(piece);
}
nptr++;
}
if (ft == 'P') {
if (numpfx < MAX_PREFIXES) {
ptable[numpfx].aep = ptr;
ptable[numpfx].num = numents;
fprintf(stderr, "ptable %d num is %d\n", numpfx, ptable[numpfx].num);
numpfx++;
} else {
fprintf(stderr, "prefix buffer ptable is full\n");
}
} else {
if (numsfx < MAX_SUFFIXES) {
stable[numsfx].aep = ptr;
stable[numsfx].num = numents;
fprintf(stderr, "stable %d num is %d\n", numsfx, stable[numsfx].num);
numsfx++;
} else {
fprintf(stderr, "suffix buffer stable is full\n");
}
}
ptr = NULL;
nptr = NULL;
numents = 0;
achar = '\0';
}
}
free(line);
return 0;
}
/*
 * Translate the condition string `cs` of an affix entry into the bit table
 * ptr->conds and the condition count ptr->numconds.
 *
 * Each condition position n owns bit n: conds[ch] has bit n set when the
 * character ch is acceptable at position n. A position is either a single
 * character, '.' (any character) or a group "[...]"; a group starting with
 * '^' accepts every character except the listed ones. The string "." alone
 * means "no condition".
 *
 * NOTE(review): the number of positions is limited by the bit width of the
 * conds element type, which is declared outside this file - confirm.
 */
void encodeit(struct affent* ptr, char* cs) {
  int nc;
  int neg;
  int grp;
  int n;
  int ec;
  int nm;
  int i, j, k;
  unsigned char mbr[MAX_WD_LEN];

  /* now clear the conditions array */
  for (i = 0; i < SET_SIZE; i++)
    ptr->conds[i] = (unsigned char)0;

  /* now parse the string to create the conds array */
  nc = strlen(cs);
  neg = 0; /* complement indicator */
  grp = 0; /* group indicator */
  n = 0;   /* number of conditions */
  ec = 0;  /* end condition indicator */
  nm = 0;  /* number of member in group */
  i = 0;
  if (strcmp(cs, ".") == 0) {
    ptr->numconds = 0;
    return;
  }
  while (i < nc) {
    unsigned char c = *((unsigned char*)(cs + i));
    /* c is cleared as soon as the character has been consumed */
    if (c == '[') {
      grp = 1;
      c = 0;
    }
    if ((grp == 1) && (c == '^')) {
      neg = 1;
      c = 0;
    }
    if (c == ']') {
      ec = 1;
      c = 0;
    }
    if ((grp == 1) && (c != 0)) {
      /* collect a group member; members that do not fit into mbr are
       * dropped instead of being written past the end of the buffer */
      if (nm < MAX_WD_LEN) {
        *(mbr + nm) = c;
        nm++;
      }
      c = 0;
    }
    if (c != 0) {
      ec = 1;
    }
    if (ec) {
      if (grp == 1) {
        if (neg == 0) {
          for (j = 0; j < nm; j++) {
            k = (unsigned int)mbr[j];
            ptr->conds[k] = ptr->conds[k] | (1 << n);
          }
        } else {
          /* complemented group: allow everything, then remove the members */
          for (j = 0; j < SET_SIZE; j++)
            ptr->conds[j] = ptr->conds[j] | (1 << n);
          for (j = 0; j < nm; j++) {
            k = (unsigned int)mbr[j];
            ptr->conds[k] = ptr->conds[k] & ~(1 << n);
          }
        }
        neg = 0;
        grp = 0;
        nm = 0;
      } else {
        /* not a group so just set the proper bit for this char */
        /* but first handle special case of . inside condition */
        if (c == '.') {
          /* wild card character so set them all */
          for (j = 0; j < SET_SIZE; j++)
            ptr->conds[j] = ptr->conds[j] | (1 << n);
        } else {
          ptr->conds[(unsigned int)c] = ptr->conds[(unsigned int)c] | (1 << n);
        }
      }
      n++;
      ec = 0;
    }
    i++;
  }
  ptr->numconds = n;
  return;
}
/*
 * Try every prefix entry of one prefix class on `word`.
 * For each entry whose append string starts the word, the prefix is removed,
 * the strip string is put back in front and the entry's conditions are
 * checked against the result. When the result is in the word list it is
 * recorded in roots[] as a candidate root (prefix set, suffix NULL).
 */
void pfx_chk(const char* word, int len, struct affent* ep, int num) {
  for (int idx = 0; idx < num; ++idx) {
    struct affent* entry = ep + idx;
    const int rest_len = len - entry->appndl;

    if (rest_len <= 0)
      continue;
    if (entry->appndl != 0 &&
        strncmp(entry->appnd, word, entry->appndl) != 0)
      continue;
    if (rest_len + entry->stripl < entry->numconds)
      continue;

    /* candidate root = strip string + word without the prefix */
    std::string candidate(entry->strip);
    candidate.append(word + entry->appndl);

    /* every condition must accept the character at its position */
    const unsigned char* chars = (const unsigned char*)candidate.c_str();
    bool accepted = true;
    for (int pos = 0; pos < entry->numconds; ++pos) {
      if ((entry->conds[chars[pos]] & (1 << pos)) == 0) {
        accepted = false;
        break;
      }
    }
    if (!accepted)
      continue;

    struct hentry* found = lookup(candidate.c_str());
    if (found != NULL && numroots < MAX_ROOTS) {
      roots[numroots].hashent = found;
      roots[numroots].prefix = entry;
      roots[numroots].suffix = NULL;
      numroots++;
    }
  }
}
/*
 * Try every suffix entry of one suffix class on `word`.
 * For each entry whose append string ends the word, the suffix is removed,
 * the strip string is appended and the entry's conditions are checked
 * against the end of the result. When the result is in the word list it is
 * recorded in roots[] with `pfxent` as prefix and the entry as suffix.
 * With XPRODUCT set in `cpflag` only entries that allow the cross product
 * are considered.
 */
void suf_chk(const char* word,
             int len,
             struct affent* ep,
             int num,
             struct affent* pfxent,
             int cpflag) {
  const bool need_xproduct = (cpflag & XPRODUCT) != 0;

  for (int idx = 0; idx < num; ++idx) {
    struct affent* entry = ep + idx;

    if (need_xproduct && (entry->xpflg & XPRODUCT) == 0)
      continue;

    const int stem_len = len - entry->appndl;
    if (stem_len <= 0)
      continue;
    if (entry->appndl != 0 && strcmp(entry->appnd, word + stem_len) != 0)
      continue;
    if (stem_len + entry->stripl < entry->numconds)
      continue;

    /* candidate root = word without the suffix + strip string */
    std::string candidate(word);
    candidate.resize(stem_len);
    candidate.append(entry->strip);

    /* conditions are matched backwards from the end of the candidate */
    const unsigned char* cursor =
        (const unsigned char*)(candidate.c_str() + candidate.size());
    bool accepted = true;
    for (int pos = entry->numconds - 1; pos >= 0; --pos) {
      --cursor;
      if ((entry->conds[*cursor] & (1 << pos)) == 0) {
        accepted = false;
        break;
      }
    }
    if (!accepted)
      continue;

    struct hentry* found = lookup(candidate.c_str());
    if (found != NULL && numroots < MAX_ROOTS) {
      roots[numroots].hashent = found;
      roots[numroots].prefix = pfxent;
      roots[numroots].suffix = entry;
      numroots++;
    }
  }
}
/*
 * Collect in roots[] every root candidate for `word`:
 *   1. roots reachable by removing a prefix,
 *   2. for those prefix roots whose prefix allows the cross product, roots
 *      reachable by additionally removing a suffix,
 *   3. roots reachable by removing a suffix only.
 * Words shorter than 4 characters are not examined.
 */
void aff_chk(const char* word, int len) {
  if (len < 4)
    return;

  for (int p = 0; p < numpfx; ++p)
    pfx_chk(word, len, ptable[p].aep, ptable[p].num);

  /* only the roots found by the prefix pass are expanded further */
  const int prefix_roots = numroots;
  for (int r = 0; r < prefix_roots; ++r) {
    if ((roots[r].prefix->xpflg & XPRODUCT) == 0)
      continue;

    char* stem = mystrdup((roots[r].hashent)->word);
    const int stem_len = strlen(stem);
    for (int s = 0; s < numsfx; ++s) {
      suf_chk(stem, stem_len, stable[s].aep, stable[s].num, roots[r].prefix,
              XPRODUCT);
    }
    free(stem);
  }

  for (int s = 0; s < numsfx; ++s)
    suf_chk(word, len, stable[s].aep, stable[s].num, NULL, 0);
}
/*
 * Find `word` in the hash table.
 * Returns the matching entry, or NULL when the word is not stored.
 */
struct hentry* lookup(const char* word) {
  struct hentry* node = &tableptr[hash(word)];

  /* an unused slot has no word and no chain */
  if (node->word == NULL)
    return NULL;

  while (node != NULL) {
    if (strcmp(word, node->word) == 0)
      return node;
    node = node->next;
  }
  return NULL;
}
/*
 * Add a word to the hash table. The table stores the pointer itself, so the
 * caller must pass storage that stays valid.
 *
 * The first word of a bucket lives in the table slot; further words are
 * appended to the slot's chain in newly allocated entries.
 *
 * Returns 0 on success and 1 when `word` is NULL or no memory is available
 * for a chain entry (previously both cases dereferenced a NULL pointer).
 */
int add_word(char* word) {
  if (word == NULL)
    return 1;

  struct hentry* dp = &tableptr[hash(word)];

  if (dp->word == NULL) {
    /* empty bucket: fill the slot in place, no allocation needed */
    dp->word = word;
    dp->affstr = NULL;
    dp->keep = 0;
    dp->next = NULL;
    return 0;
  }

  struct hentry* hp = (struct hentry*)malloc(sizeof(struct hentry));
  if (hp == NULL)
    return 1;
  hp->word = word;
  hp->affstr = NULL;
  hp->keep = 0;
  hp->next = NULL;

  /* append at the end of the chain */
  while (dp->next != NULL)
    dp = dp->next;
  dp->next = hp;
  return 0;
}
/* load a word list and build a hash table on the fly */
int load_tables(FILE* wdlst) {
char ts[MAX_LN_LEN];
int nExtra = 5;
/* first read the first line of file to get hash table size */
if (!fgets(ts, MAX_LN_LEN - 1, wdlst))
return 2;
mychomp(ts);
tablesize = atoi(ts);
if (tablesize <= 0 ||
(tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) / (int)sizeof(struct hentry*))) {
return 3;
}
tablesize += nExtra;
if ((tablesize % 2) == 0)
tablesize++;
/* allocate the hash table */
tableptr = (struct hentry*)calloc(tablesize, sizeof(struct hentry));
if (!tableptr)
return 3;
/* loop thorugh all words on much list and add to hash
* table and store away word and affix strings in tmpfile
*/
while (fgets(ts, MAX_LN_LEN - 1, wdlst)) {
mychomp(ts);
char* ap = mystrdup(ts);
add_word(ap);
}
return 0;
}
/* the hash function is a simple load and rotate
 * algorithm borrowed
 *
 * Returns the table index for `word`, a value in [0, tablesize).
 * The first (up to) four characters are packed into hv one byte at a time;
 * every further character is mixed in with ROTATE followed by XOR.
 * NOTE(review): characters are used as plain char, so the result for bytes
 * >= 0x80 presumably depends on whether char is signed - confirm.
 */
int hash(const char* word) {
int i;
long hv = 0;
for (i = 0; i < 4 && *word != 0; i++)
hv = (hv << 8) | (*word++);
while (*word != 0) {
ROTATE(hv, ROTATE_LEN);
hv ^= (*word++);
}
/* reduce as unsigned so the index is never negative */
return (unsigned long)hv % tablesize;
}
/*
 * Add the affix flag `ac` to the flag string of `ep`.
 * The string is created on first use; a flag that is already present is
 * not added again. The string is reallocated when it grows.
 */
void add_affix_char(struct hentry* ep, char ac) {
  char* current = ep->affstr;

  if (current == NULL) {
    /* first flag: string of length one */
    char* fresh = (char*)malloc(2);
    fresh[0] = ac;
    fresh[1] = '\0';
    ep->affstr = fresh;
    return;
  }

  const int used = strlen(current);
  if (memchr(current, ac, used) != NULL)
    return; /* flag already recorded */

  /* copy the old flags and append the new one */
  char* grown = (char*)calloc(used + 2, 1);
  memcpy(grown, current, used);
  grown[used] = ac;
  grown[used + 1] = '\0';
  free(current);
  ep->affstr = grown;
}
/*
 * Apply every entry of one prefix class to `word`.
 * For each entry whose conditions match the start of the word, the strip
 * string is removed from the front, the prefix is put in front and the
 * result is appended to wlist[] (as long as there is room).
 *
 * The new word is built in a std::string; the former fixed buffer of
 * MAX_WD_LEN bytes could be overrun by a long prefix plus word.
 */
void pfx_add(const char* word, int len, struct affent* ep, int num) {
  struct affent* aent = ep;

  for (int i = num; i > 0; aent++, i--) {
    /* the word must be longer than the strip string and long enough to
     * carry all conditions */
    if ((len <= aent->stripl) || (len < aent->numconds))
      continue;

    /* now make sure all conditions match */
    const unsigned char* cp = (const unsigned char*)word;
    int cond;
    for (cond = 0; cond < aent->numconds; cond++) {
      if ((aent->conds[*cp++] & (1 << cond)) == 0)
        break;
    }
    if (cond < aent->numconds)
      continue;

    /* we have a match so add prefix */
    std::string tword;
    if (aent->appndl)
      tword.assign(aent->appnd);
    tword.append(word + aent->stripl);

    if (numwords < MAX_WORDS) {
      wlist[numwords].word = mystrdup(tword.c_str());
      wlist[numwords].pallow = 0;
      numwords++;
    }
  }
}
/* add a suffix to a word
 *
 * word - the word to derive from
 * len  - length of word
 * ep   - first of num consecutive suffix entries
 * num  - number of entries to try
 *
 * For every entry whose conditions match the end of word the last stripl
 * characters are cut off, appnd is attached and the result is appended to
 * wlist (as long as wlist holds fewer than MAX_WORDS words). pallow of the
 * new word is the entry's XPRODUCT bit. Forms that do not fit into the
 * MAX_WD_LEN scratch buffer are skipped with a warning instead of being
 * written past the end of the buffer.
 */
void suf_add(const char* word, int len, struct affent* ep, int num) {
  char tword[MAX_WD_LEN];

  for (int i = 0; i < num; i++) {
    struct affent* aent = ep + i;

    /* the word has to be longer than the strip string and must offer
     * one character per condition */
    if ((len <= aent->stripl) || (len < aent->numconds))
      continue;

    /* conditions are tested from the last character backwards: bit `cond`
     * of conds[c] tells whether character c is accepted at position `cond` */
    const unsigned char* cp = (const unsigned char*)(word + len);
    int cond;
    for (cond = aent->numconds; --cond >= 0;) {
      if ((aent->conds[*--cp] & (1 << cond)) == 0)
        break;
    }
    if (cond >= 0)
      continue;

    /* we have a matching condition - make sure the result fits */
    const int tlen = len - aent->stripl;
    const size_t addlen = aent->appndl ? strlen(aent->appnd) : 0;
    if (tlen < 0 || static_cast<size_t>(tlen) + addlen >=
                        static_cast<size_t>(MAX_WD_LEN)) {
      fprintf(stderr, "Warning: suffixed form of '%s' is too long, skipped\n",
              word);
      continue;
    }

    /* strip off strip string and add suffix */
    strncpy(tword, word, MAX_WD_LEN - 1);
    tword[MAX_WD_LEN - 1] = '\0';
    char* pp = tword + tlen;
    if (aent->appndl) {
      strcpy(pp, aent->appnd);
    } else {
      *pp = '\0';
    }

    if (numwords < MAX_WORDS) {
      wlist[numwords].word = mystrdup(tword);
      wlist[numwords].pallow = (aent->xpflg & XPRODUCT);
      numwords++;
    }
  }
}
int expand_rootword(const char* ts, int wl, const char* ap) {
int i;
int nh = 0;
for (i = 0; i < numsfx; i++) {
if (strchr(ap, (stable[i].aep)->achar)) {
suf_add(ts, wl, stable[i].aep, stable[i].num);
}
}
nh = numwords;
if (nh > 1) {
for (int j = 1; j < nh; j++) {
if (wlist[j].pallow) {
for (i = 0; i < numpfx; i++) {
if (strchr(ap, (ptable[i].aep)->achar)) {
if ((ptable[i].aep)->xpflg & XPRODUCT) {
int nwl = strlen(wlist[j].word);
pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num);
}
}
}
}
}
}
for (i = 0; i < numpfx; i++) {
if (strchr(ap, (ptable[i].aep)->achar)) {
pfx_add(ts, wl, ptable[i].aep, ptable[i].num);
}
}
return 0;
}
/* Cut the next token off *stringp, using the single character delim as
 * separator (like strsep(), but with a delimiter character instead of a
 * delimiter string).
 *
 * The token is returned as a newly malloc'ed string that the caller has to
 * free. *stringp is moved behind the delimiter, or to the terminating NUL
 * when the rest of the string holds no delimiter. NULL is returned when
 * nothing is left or when the allocation fails.
 */
char* mystrsep(char** stringp, const char delim) {
  char* start = *stringp;
  const int remaining = strlen(start);
  if (remaining <= 0)
    return NULL;

  char* hit = (char*)memchr(start, (int)((unsigned char)delim), remaining);
  const ptrdiff_t toklen = hit ? (hit - start) : (ptrdiff_t)remaining;

  /* with a delimiter the position advances even if the copy fails below */
  if (hit)
    *stringp = hit + 1;

  char* token = (char*)malloc(toklen + 1);
  if (!token)
    return NULL;
  memcpy(token, start, toklen);
  token[toklen] = '\0';

  /* last token: park the position on the terminating NUL */
  if (!hit)
    *stringp = start + remaining;

  return token;
}
/* Return a malloc'ed copy of s (to be freed by the caller), or NULL when s
 * is NULL or the allocation fails.
 */
char* mystrdup(const char* s) {
  if (!s)
    return NULL;

  /* length including the terminating NUL */
  const int bytes = strlen(s) + 1;
  char* copy = (char*)malloc(bytes);
  if (copy)
    memcpy(copy, s, bytes);
  return copy;
}
/* Remove the line terminator from the end of s, in place.
 *
 * A trailing "\n" or "\r" is removed, together with a "\r" directly in front
 * of it. Text that does not end in a line terminator is left untouched, so
 * the last line of a file without a final newline - or a line that fgets()
 * had to cut short - keeps its last character.
 */
void mychomp(char* s) {
  const size_t k = strlen(s);
  if (k > 0 && (s[k - 1] == '\n' || s[k - 1] == '\r')) {
    s[k - 1] = '\0';
    if ((k > 1) && (s[k - 2] == '\r'))
      s[k - 2] = '\0';
  }
}

View File

@ -1,156 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Hunspell, based on MySpell.
*
* The Initial Developers of the Original Code are
* Kevin Hendricks (MySpell) and Németh László (Hunspell).
* Portions created by the Initial Developers are Copyright (C) 2002-2005
* the Initial Developers. All Rights Reserved.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* munch header file */
/* capacity limits of the fixed-size buffers and tables declared below */
#define MAX_LN_LEN 200 /* size of the buffers input lines are read into */
#define MAX_WD_LEN 200 /* size of the scratch buffer a derived word is built in */
#define MAX_PREFIXES 2048 /* number of entries in ptable */
#define MAX_SUFFIXES 2048 /* number of entries in stable */
#define MAX_ROOTS 20 /* number of entries in roots */
#define MAX_WORDS 5000 /* number of entries in wlist */
#define ROTATE_LEN 5 /* bit count handed to ROTATE by hash() */
/* Shift v left by q bits and put the q bits found from bit (32 - q) upwards
 * back into the low end, i.e. a left rotation for a 32 bit value; the result
 * is assigned to v.
 *
 * v is evaluated more than once and has to be an lvalue. q is fully
 * parenthesised so that expressions can be passed, and the macro expands to
 * a plain expression (no trailing semicolon), so "ROTATE(v, q);" is exactly
 * one statement and can be used in an if/else without braces.
 */
#define ROTATE(v, q) \
  ((v) = ((v) << (q)) | (((v) >> (32 - (q))) & ((1 << (q)) - 1)))
/* number of entries in affent::conds, which is indexed by an unsigned char */
#define SET_SIZE 256
/* bit in affent::xpflg: the affix takes part in cross products, i.e. a
 * prefix may be applied to a word that was produced by a suffix */
#define XPRODUCT (1 << 0)
/* the affix table entry */
struct affent {
char* appnd; /* string attached to the word by this affix */
char* strip; /* string taken off the word; pfx_add/suf_add only use its length */
short appndl; /* length of appnd */
short stripl; /* length of strip */
char achar; /* flag character naming the affix, looked up in a word's flag string */
char xpflg; /* XPRODUCT or 0 */
short numconds; /* number of character positions that carry a condition */
char conds[SET_SIZE]; /* bit n of conds[c] is set when character c is accepted at condition position n */
};
/* one slot of the prefix or suffix table: a run of affix entries */
struct affixptr {
struct affent* aep; /* first entry of the run; its achar identifies the affix */
int num; /* number of consecutive entries starting at aep */
};
/* the prefix and suffix table */
/* NOTE(review): the variables in this header are definitions, not extern
 * declarations - including it from more than one translation unit of the
 * same program would define them twice */
int numpfx; /* Number of prefixes in table */
int numsfx; /* Number of suffixes in table */
/* the prefix table */
struct affixptr ptable[MAX_PREFIXES];
/* the suffix table */
struct affixptr stable[MAX_SUFFIXES];
/* data structure to store results of lookups */
/* NOTE(review): presumably filled by the *_chk routines declared further
 * down - confirm against their definitions */
struct matches {
struct hentry* hashent; /* hash table entry */
struct affent* prefix; /* Prefix used, or NULL */
struct affent* suffix; /* Suffix used, or NULL */
};
int numroots; /* number of root words found */
struct matches roots[MAX_ROOTS]; /* list of root words found */
/* hashing stuff */
/* one word of the hash table; entries that hash to the same slot are
 * chained through next */
struct hentry {
char* word; /* the word itself; NULL marks an unused slot of the table */
char* affstr; /* affix flag characters collected for the word, or NULL */
struct hentry* next; /* next entry with the same hash value, or NULL */
int keep; /* initialised to 0 by add_word(); NOTE(review): meaning is decided by code outside this header - confirm */
};
int tablesize; /* number of slots in tableptr, set by load_tables() */
struct hentry* tableptr; /* the hash table, allocated by load_tables() */
/* unmunch stuff */
int numwords; /* number of words found */
/* one word form generated from a root word */
struct dwords {
char* word; /* malloc'ed word form (created with mystrdup) */
int pallow; /* non-zero when prefixes may still be applied (cross product) */
};
struct dwords wlist[MAX_WORDS]; /* list words found */
/* the routines */
/* reading the affix file */
int parse_aff_file(FILE* afflst);
void encodeit(struct affent* ptr, char* cs);
/* the hash table of words: load_tables() reads the table size from the first
 * line of the word list and adds every further line with add_word() */
int load_tables(FILE* wrdlst);
int hash(const char*);
int add_word(char*);
struct hentry* lookup(const char*);
/* affix checks of a word - NOTE(review): presumably these fill roots[];
 * confirm against the definitions */
void aff_chk(const char* word, int len);
void pfx_chk(const char* word, int len, struct affent* ep, int num);
void suf_chk(const char* word,
int len,
struct affent* ep,
int num,
struct affent* pfxent,
int cpflag);
/* adds the flag character ac to hent->affstr unless it is already there */
void add_affix_char(struct hentry* hent, char ac);
/* expansion of a root word into affixed forms; results are appended to wlist */
int expand_rootword(const char*, int, const char*);
void pfx_add(const char* word, int len, struct affent* ep, int num);
void suf_add(const char* word, int len, struct affent* ep, int num);
/* string helpers; mystrsep() and mystrdup() return malloc'ed strings that
 * the caller has to free */
char* mystrsep(char** stringp, const char delim);
char* mystrdup(const char* s);
void mychomp(char* s);

View File

@ -1,3 +0,0 @@
./usr
./var
./debs

View File

@ -1,213 +0,0 @@
#!/usr/bin/env python3
from os import listdir, path
from datetime import datetime
def report(output, desc, dikt_has, options, doc, option_count, all_dikts=False):
    """Write one Markdown section of the option usage report.

    The section consists of a heading, a one-sentence summary and a table
    with one row per option and one column per dictionary.

    output       -- writable text stream the Markdown is written to
    desc         -- name of the option category, e.g. 'General'
    dikt_has     -- dictionaries that use at least one option of the category
    options      -- option names of the category, in row order
    doc          -- mapping dictionary -> {option: number of occurrences}
    option_count -- mapping option -> number of occurrences in all dictionaries
    all_dikts    -- if true the table has a column for every dictionary in
                    doc, otherwise only for the dictionaries in dikt_has
    """
    # header
    output.write('## {} Options\n\n'.format(desc))

    # how many options of this category occur in at least one dictionary
    difopt = sum(1 for option in options if option in option_count)
    summary = 'A total of {} {} different options are recognised by Hunspell. '.format(
        len(options), desc.lower())
    if difopt == 0:
        summary += 'None of these options are used'
    elif difopt == len(options):
        summary += 'All of these options are used'
    elif difopt == 1:
        summary += 'Of these, only 1 option is used'
    else:
        summary += 'Of these, only {} different options are used'.format(difopt)
    output.write(summary)

    if len(dikt_has) == 0:
        output.write('\n\n')
    else:
        scope = 'all' if len(dikt_has) == len(doc) else 'only'
        if len(dikt_has) == 1:
            output.write(' in {} 1 dictionary.\n\n'.format(scope))
        else:
            output.write(' in {} {} dictionaries.\n\n'.format(scope, len(dikt_has)))

    # the same dictionaries, in the same order, form the columns of all rows
    columns = sorted(doc) if all_dikts else sorted(dikt_has)

    output.write('| {} vs. Dictionary'.format(desc))
    for dikt in columns:
        # underscores are escaped so that Markdown does not read them as emphasis
        output.write(' | {}'.format(dikt.replace('_', '\\_')))
    output.write(' |\n')

    # format: option column without alignment, count columns right-aligned
    output.write('|---' + '|--:' * len(columns) + '|\n')

    # content: an empty cell means the dictionary does not use the option
    for option in options:
        output.write('| {}'.format(option))
        for dikt in columns:
            counts = doc[dikt]
            if option in counts:
                output.write(' | {}'.format(counts[option]))
            else:
                output.write(' |')
        output.write(' |\n')
    output.write('\n\n')
# The following lists were compiled manually from the output of
# $ man -K 5 hunspell
# Each tuple is one category of the generated report; an option must not
# appear in more than one of them.
options_general = ('SET', 'FLAG', 'COMPLEXPREFIXES',
'LANG', 'IGNORE', 'AF', 'AM', )
options_suggest = ('KEY', 'TRY', 'NOSUGGEST', 'MAXCPDSUGS', 'MAXNGRAMSUGS', 'MAXDIFF',
'ONLYMAXDIFF', 'NOSPLITSUGS', 'SUGSWITHDOTS', 'REP', 'MAP', 'PHONE', 'WARN', 'FORBIDWARN', )
options_compounding = ('BREAK', 'COMPOUNDRULE', 'COMPOUNDMIN', 'COMPOUNDFLAG', 'COMPOUNDBEGIN', 'COMPOUNDLAST', 'COMPOUNDMIDDLE', 'ONLYINCOMPOUND', 'COMPOUNDPERMITFLAG', 'COMPOUNDFORBIDFLAG', 'COMPOUNDMORESUFFIXES', 'COMPOUNDROOT',
'COMPOUNDWORDMAX', 'CHECKCOMPOUNDDUP', 'CHECKCOMPOUNDREP', 'CHECKCOMPOUNDCASE', 'CHECKCOMPOUNDTRIPLE', 'SIMPLIFIEDTRIPLE', 'CHECKCOMPOUNDPATTERN', 'FORCEUCASE', 'COMPOUNDSYLLABLE', 'SYLLABLENUM', ) # 'COMPOUND',
options_affix = ('PFX', 'SFX', 'CIRCUMFIX', 'FORBIDDENWORD', 'FULLSTRIP', 'KEEPCASE',
'ICONV', 'OCONV', 'NEEDAFFIX', 'SUBSTANDARD', 'WORDCHARS', 'CHECKSHARPS', )
options_deprecated = ('LEMMA_PRESENT', 'PSEUDOROOT', )
# self-check: no option may be listed in more than one category, otherwise
# the if/elif classification of the options would silently prefer the first
# category. Every pair of categories is compared once.
_categories = (
    ('general', options_general),
    ('suggest', options_suggest),
    ('compounding', options_compounding),
    ('affix', options_affix),
    ('deprecated', options_deprecated),
)
for _index, (_name, _members) in enumerate(_categories):
    for _other_name, _other_members in _categories[_index + 1:]:
        if set(_members) & set(_other_members):
            print('ERROR: Overlap {} and {}'.format(_name, _other_name))
            raise SystemExit(1)
options_found = []  # every option seen, in order of first appearance
options_undocumented = []  # options seen that belong to none of the categories
option_count = {}  # option / count
doc = {}  # dictionary / option / count
options = []
# per category: the dictionaries using at least one option of that category
dikt_has_general = []
dikt_has_suggest = []
dikt_has_compounding = []
dikt_has_affix = []
dikt_has_deprecated = []
dikt_has_undocumented = []

# affix files that are opened as ISO-8859-1 instead of the default encoding
latin1_affixes = ('de_AT_frami.aff', 'de_CH_frami.aff', 'de_DE_frami.aff', 'de_DE.aff', 'en_US.aff', 'pt_BR.aff', 'sl_SI.aff', 'th_TH.aff', 'ru_RU.aff', 'nn_NO.aff', 'an_ES.aff', 'af_ZA.aff', 'el_GR.aff', 'bg_BG.aff', 'de_CH.aff', 'it_IT.aff', 'hu_HU.aff', 'pl_PL.aff', 'cs_CZ.aff', 'eu.aff', 'lt_LT.aff', 'nb_NO.aff', 'oc_FR.aff', 'bs_BA.aff', 'de_AT.aff', )

directory = 'usr/share/hunspell/'
for filename in listdir(directory):
    filepath = directory + filename
    # diagnostics: affix files that are links and word lists that are not
    if filename.endswith('.aff') and path.islink(filepath):
        print('XX', filename)
    if filename.endswith('.dic') and not path.islink(filepath):
        print('YY', filename)
    # only real (non-link) affix files are analysed
    if not filename.endswith('.aff') or path.islink(filepath) or filename in ('kk_KZ.aff', ):  # FIXME kk_KZ.aff has invalid first character
        continue
    print(filename)

    encoding = 'ISO-8859-1' if filename in latin1_affixes else None
    dikt = filename.replace('.aff', '')
    doc[dikt] = {}
    oc = doc[dikt]
    # the with statement closes the file again; the original left every
    # affix file open until the end of the script
    with open(filepath, 'r', encoding=encoding) as aff:
        for line in aff:
            line = line.strip()
            if line == '' or line.startswith('#'):
                continue
            # the option is the first field; split() without argument
            # accepts any run of blanks and tabs as separator
            option = line.split()[0]

            if option not in options_found:
                options_found.append(option)
            oc[option] = oc.get(option, 0) + 1

            # remember that this dictionary uses the option's category
            if option in options_general:
                users = dikt_has_general
            elif option in options_suggest:
                users = dikt_has_suggest
            elif option in options_compounding:
                users = dikt_has_compounding
            elif option in options_affix:
                users = dikt_has_affix
            elif option in options_deprecated:
                users = dikt_has_deprecated
            else:
                users = dikt_has_undocumented
                if option not in options_undocumented:
                    options_undocumented.append(option)
            if dikt not in users:
                users.append(dikt)

            option_count[option] = option_count.get(option, 0) + 1
# Write the report; the with statement makes sure the file is flushed and
# closed even if one of the sections fails.
with open('option-usage.md', 'w') as output:
    output.write('# Hunspell Option Usage per Dictionary\n\n')
    output.write('This page has been generated at {}. Do not edit this page manually.\n\n'.format(
        datetime.now().strftime('%Y-%m-%d at %H:%M:%S')))
    print('dictionaries found', len(doc))
    print('options found', len(options_found))
    # only the first section has a column for every dictionary
    report(output, 'General', dikt_has_general, options_general, doc, option_count, all_dikts=True)
    report(output, 'Suggest', dikt_has_suggest, options_suggest, doc, option_count)
    report(output, 'Compounding', dikt_has_compounding, options_compounding, doc, option_count)
    report(output, 'Affix', dikt_has_affix, options_affix, doc, option_count)
    report(output, 'Deprecated', dikt_has_deprecated, options_deprecated, doc, option_count)
    report(output, 'Undocumented', dikt_has_undocumented, options_undocumented, doc, option_count)

View File

@ -1,44 +0,0 @@
#!/usr/bin/env bash
# Download all hunspell dictionary packages known to apt and unpack them
# into the current directory (which creates ./usr). The download directory
# and the unpacked parts that are not needed (var, usr/share/myspell,
# usr/share/doc) are removed again.

# names of the packages that start with "hunspell" and mention "dict"
mapfile -t PACKAGES < <(apt-cache search hunspell | grep '^hunspell' | grep dict | awk '{print $1}')
if [ "${#PACKAGES[@]}" -eq 0 ]
then
  echo 'No hunspell dictionary packages found' >&2
  exit 1
fi
printf '%s\n' "${PACKAGES[@]}"

# start from scratch; rm -rf does not complain about missing paths
rm -rf usr var debs

mkdir debs || exit 1
# Stop if the directory cannot be entered: the "cd .." and "rm -rf" further
# down would otherwise work on the wrong directory.
cd debs || exit 1
apt-get download "${PACKAGES[@]}"
for deb in *.deb
do
  # the pattern is left as it is when nothing has been downloaded
  [ -e "$deb" ] || continue
  dpkg -x "$deb" ..
done
cd .. || exit 1

# keep only the unpacked dictionaries
rm -rf debs var usr/share/myspell usr/share/doc

View File

@ -1,121 +0,0 @@
# Hunspell Option Usage per Dictionary
This page has been generated at 2017-04-25 at 23:17:56. Do not edit this page manually.
## General Options
A total of 7 general different options are recognised by Hunspell. Of these, only 6 different options are used in all 63 dictionaries.
| General vs. Dictionary | af\_ZA | an\_ES | ar | be\_BY | bg\_BG | bn\_BD | bo | br\_FR | bs\_BA | ca | ca\_ES-valencia | cs\_CZ | da\_DK | de\_AT\_frami | de\_CH\_frami | de\_DE\_frami | el\_GR | en\_AU | en\_CA | en\_GB | en\_US | en\_ZA | es\_ES | eu | fr | gd\_GB | gl\_ES | gu\_IN | he\_IL | hi\_IN | hr\_HR | hu\_HU | is\_IS | it\_IT | kmr\_Latn | ko | lo\_LA | lt\_LT | ml\_IN | nb\_NO | ne\_NP | nl\_NL | nn\_NO | oc\_FR | pl\_PL | pt\_BR | pt\_PT | ro\_RO | ru\_RU | se | si\_LK | sk\_SK | sl\_SI | sr\_Latn\_RS | sr\_RS | sv\_FI | sv\_SE | sw\_TZ | te\_IN | th\_TH | uk\_UA | uz\_UZ | vi\_VN |
|---|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|
| SET | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| FLAG | | | 1 | | | | | 1 | | 1 | 1 | | 1 | | | | | | | | | | | 1 | 1 | | 1 | | | | 1 | | 1 | | | 1 | | | | | 1 | 1 | | | | | | | | 1 | 1 | | | | | | | | | | | | |
| COMPLEXPREFIXES | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| LANG | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | | 1 | | 1 | | | | | | | | | | | 1 | | | | | | | 1 | 1 | | | | | | | | |
| IGNORE | | | 1 | | | | 1 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | | |
| AF | | | 512 | | | | | | | | | | | | | | | | | | | | | | 277 | | | | | | 99 | 1307 | | | | 56 | | | | | | | | | | | | | | 3501 | | | | | | | | | | | | | |
| AM | | | 23625 | | | | | | | | | | | | | | | | | | | | | | 485 | | | | | | | 23051 | 701 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
## Suggest Options
A total of 14 suggest different options are recognised by Hunspell. Of these, only 12 different options are used in only 60 dictionaries.
| Suggest vs. Dictionary | af\_ZA | an\_ES | ar | be\_BY | bg\_BG | bn\_BD | bo | br\_FR | bs\_BA | ca | ca\_ES-valencia | cs\_CZ | da\_DK | de\_AT\_frami | de\_CH\_frami | de\_DE\_frami | el\_GR | en\_AU | en\_CA | en\_GB | en\_US | en\_ZA | es\_ES | eu | fr | gd\_GB | gl\_ES | gu\_IN | he\_IL | hi\_IN | hr\_HR | hu\_HU | is\_IS | it\_IT | kmr\_Latn | ko | lt\_LT | nb\_NO | ne\_NP | nl\_NL | nn\_NO | oc\_FR | pl\_PL | pt\_BR | pt\_PT | ro\_RO | ru\_RU | se | si\_LK | sk\_SK | sl\_SI | sr\_Latn\_RS | sr\_RS | sv\_FI | sv\_SE | sw\_TZ | te\_IN | th\_TH | uk\_UA | vi\_VN |
|---|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|
| KEY | | | 1 | | | | | | | 1 | 1 | | 1 | | | | | | | | 1 | | | | 1 | | 1 | | | | | 1 | 1 | | | | | | | 1 | | | | | 1 | 1 | | | | | | 1 | 1 | | | | | | | |
| TRY | 1 | 1 | 1 | 1 | 1 | 1 | | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | 1 | 1 |
| NOSUGGEST | | | | | | | | | | | | | 1 | 1 | 1 | 1 | | 1 | 1 | 1 | 1 | 1 | | | 1 | | | | | | 1 | 1 | | | | | | | | 1 | | | | 1 | | | | | | | | | | 1 | 1 | | | | | |
| MAXCPDSUGS | | | | | | | | | | | | | 1 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | 1 | | | | | |
| MAXNGRAMSUGS | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | | | | | | | | | | | | | | | | |
| MAXDIFF | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | | | | | | | | | | 1 | 1 | | | | | |
| ONLYMAXDIFF | | | | | | | | | | | | | 1 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | | | | | | | | | | 1 | 1 | | | | | |
| NOSPLITSUGS | 1 | | | | | | 1 | | | | | | 1 | | | | | | | | | | | 1 | | | 1 | | | | | | 1 | | | | | | | 1 | | | | | | | | | | | | | | 1 | 1 | | | | | |
| SUGSWITHDOTS | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| REP | 28 | 114 | 81 | 36 | | | 7 | | | 49 | 49 | | 113 | 29 | 29 | 29 | 523 | 115 | 91 | 28 | 98 | 37 | 21 | 37 | 83 | 49 | 3737 | | | | 100 | 125 | 58 | 5 | 37 | 60 | | | 23 | 488 | | | 65 | 1056 | 26 | 5 | | 74 | 26 | 53 | | 2 | 9 | 60 | 60 | | | 5 | 6 | 19 |
| MAP | 7 | 37 | 17 | 21 | 27 | | | | | 25 | 25 | | | | | | 6 | | | | | | 6 | | 26 | 6 | 22 | | 11 | | | 6 | | 6 | | 13 | | | | 6 | | 6 | 9 | 7 | 12 | 9 | | | | | | 5 | 5 | 3 | 3 | | | | | 19 |
| PHONE | | | | | | | | | | | | | | | | | | 105 | | | 106 | 105 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| WARN | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | | | | | | | | | | | | | 1 | | | | | | | | | | | | | | | | |
| FORBIDWARN | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
## Compounding Options
A total of 22 compounding different options are recognised by Hunspell. Of these, only 21 different options are used in only 25 dictionaries.
| Compounding vs. Dictionary | be\_BY | da\_DK | de\_AT\_frami | de\_CH\_frami | de\_DE\_frami | en\_AU | en\_CA | en\_GB | en\_US | en\_ZA | eu | fr | gd\_GB | he\_IL | hr\_HR | hu\_HU | ko | nb\_NO | nl\_NL | nn\_NO | pt\_BR | se | sv\_FI | sv\_SE | uk\_UA |
|---|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|
| BREAK | 3 | | 3 | 3 | 3 | 4 | | | | 4 | 2 | 8 | 2 | 4 | 3 | 5 | | | 2 | | 65 | 2 | 4 | 4 | 2 |
| COMPOUNDRULE | | | | | | 3 | 3 | 3 | 3 | | | | | | 8 | 3 | 7 | | | | | | 13 | 13 | |
| COMPOUNDMIN | | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | | | | 1 | 1 | 1 | 1 | 1 | 1 | | | 1 | 1 | |
| COMPOUNDFLAG | | | | | | | | | | | | | | | 1 | 1 | | 1 | | 1 | | | | | |
| COMPOUNDBEGIN | | 1 | 1 | 1 | 1 | | | | | | | | | | 1 | 1 | | | 1 | | | 1 | 1 | 1 | |
| COMPOUNDLAST | | | | | | | | | | | | | | | | 1 | | | | | | | | | |
| COMPOUNDMIDDLE | | 1 | 1 | 1 | 1 | | | | | | | | | | 1 | | | | 1 | | | 1 | 1 | 1 | |
| ONLYINCOMPOUND | | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | | | | 1 | 1 | | | 1 | | | 1 | 1 | 1 | |
| COMPOUNDPERMITFLAG | | 1 | 1 | 1 | 1 | | | | | | | | | | 1 | 1 | | | 1 | | | 1 | 1 | 1 | |
| COMPOUNDFORBIDFLAG | | | | | | | | | | | | | | | 1 | 1 | | | | | | 1 | | | |
| COMPOUNDMORESUFFIXES | | | | | | | | | | | | | | | | | | | | | | | | | |
| COMPOUNDROOT | | | | | | | | | | | | | | | | 1 | | | | | | | | | |
| COMPOUNDWORDMAX | | 1 | | | | | | | | | | | | | | 1 | | | | | | | | | |
| CHECKCOMPOUNDDUP | | | | | | | | | | | | | | | | 1 | | | 1 | | | | 1 | 1 | |
| CHECKCOMPOUNDREP | | | | | | | | | | | | | | | | 1 | | | | | | | 1 | 1 | |
| CHECKCOMPOUNDCASE | | | | | | | | | | | | | | | | 1 | | | 1 | | | | | | |
| CHECKCOMPOUNDTRIPLE | | | | | | | | | | | | | | | | 1 | | 1 | | | | | 1 | 1 | |
| SIMPLIFIEDTRIPLE | | | | | | | | | | | | | | | | | | 1 | | | | | 1 | 1 | |
| CHECKCOMPOUNDPATTERN | | | | | | | | | | | | | | | | 8 | | | 43 | | | | | | |
| FORCEUCASE | | | | | | | | | | | | | | | | | | | | | | | 1 | 1 | |
| COMPOUNDSYLLABLE | | | | | | | | | | | | | | | | 1 | | | | | | | | | |
| SYLLABLENUM | | | | | | | | | | | | | | | | 1 | | | | | | | | | |
## Affix Options
A total of 12 affix different options are recognised by Hunspell. All of these options are used in only 54 dictionaries.
| Affix vs. Dictionary | af\_ZA | an\_ES | ar | be\_BY | bg\_BG | bo | br\_FR | bs\_BA | ca | ca\_ES-valencia | cs\_CZ | da\_DK | de\_AT\_frami | de\_CH\_frami | de\_DE\_frami | el\_GR | en\_AU | en\_CA | en\_GB | en\_US | en\_ZA | es\_ES | eu | fr | gd\_GB | gl\_ES | he\_IL | hr\_HR | hu\_HU | is\_IS | it\_IT | kmr\_Latn | ko | lt\_LT | nb\_NO | ne\_NP | nl\_NL | nn\_NO | oc\_FR | pl\_PL | pt\_BR | pt\_PT | ro\_RO | ru\_RU | se | si\_LK | sk\_SK | sl\_SI | sr\_Latn\_RS | sr\_RS | sv\_FI | sv\_SE | sw\_TZ | uk\_UA |
|---|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|
| PFX | 20 | 58 | 194 | 10 | | | 27 | | 132 | 132 | 19 | 264 | 68 | 68 | 68 | 10 | 38 | 14 | 36 | 14 | 36 | 78 | 12 | 216 | 34 | | 3335 | | 366 | | 489 | 36 | | 90 | 41 | 2 | 154 | 41 | 16 | 2 | 162 | 38 | 26 | | | | 4 | 10 | | | | | 5 | 6 |
| SFX | 75 | 996 | 1609 | 879 | 1652 | 40 | 355 | 998 | 13318 | 13368 | 2551 | 1034 | 437 | 437 | 437 | 179 | 1078 | 59 | 1078 | 59 | 1078 | 6767 | 94118 | 9735 | 7 | 8586 | | 534 | 24052 | 13282 | 2744 | 80 | 55722 | 2586 | 495 | 525 | 447 | 383 | 708 | 7102 | 25770 | 1302 | 1624 | 1606 | 371599 | 10000 | 2443 | 526 | | | 492 | 492 | | 4477 |
| CIRCUMFIX | | | | | | | | | | | | | 1 | 1 | 1 | | | | | | | | | 1 | | | | 1 | | | | | | | | | | | | | | | | | | | | | | | | | | |
| FORBIDDENWORD | | | | | | | | | 1 | 1 | | 1 | 1 | 1 | 1 | | | | | | | | | 1 | | | | 1 | 1 | | | | 1 | | | | 1 | | | | 1 | | | | | | | | | | 1 | 1 | | |
| FULLSTRIP | | | | | | | | | 1 | 1 | | | | | | | | | | | | | | 1 | | | | | | 1 | | | | | | | | | | | | | | | | | | | | | 1 | 1 | | |
| KEEPCASE | | | | | | | | | | | | | 1 | 1 | 1 | | | | | | | | | 1 | | 1 | | 1 | 1 | | | | | | | | 1 | | | | | | | | | | | | | | | | | |
| ICONV | | | 10 | | | | | | | | | | | | | | 7 | 2 | | | 7 | | | 42 | 2 | | | 29 | 6 | | | | 11173 | | | | 10 | | | | | | | | | | | | 5 | 4 | | | | |
| OCONV | | | | | | | | | | | | | | | | | 2 | | | | 2 | | | 2 | | | | | | | | | 11173 | | | | 3 | | | | | | | | | | | | | | | | | |
| NEEDAFFIX | | | | | | 1 | | | | | | 1 | 1 | 1 | 1 | | | | | | | | | 1 | | 1 | 1 | 1 | 1 | | | | | | | | | | | | | | | | 1 | | | | | | 1 | 1 | | |
| SUBSTANDARD | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | 1 | | | | | | | | | | | | | | | | | | | | | | | | | |
| WORDCHARS | | | | 1 | | | 1 | | 1 | 1 | | 1 | 1 | 1 | 1 | | 1 | 1 | 1 | 1 | | | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | | 1 | | | | 1 | | | 1 | | 1 | | | 1 | | | | | | 1 | 1 | | 1 |
| CHECKSHARPS | | | | | | | | | | | | | 1 | 1 | 1 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
## Deprecated Options
A total of 2 deprecated different options are recognised by Hunspell. Of these, only 1 option is used in only 1 dictionary.
| Deprecated vs. Dictionary | hu\_HU |
|---|--:|
| LEMMA_PRESENT | 1 |
| PSEUDOROOT | |
## Undocumented Options
A total of 12 undocumented different options are recognised by Hunspell. All of these options are used in only 15 dictionaries.
| Undocumented vs. Dictionary | bo | da\_DK | de\_AT\_frami | de\_CH\_frami | de\_DE\_frami | en\_AU | gd\_GB | hr\_HR | hu\_HU | it\_IT | ko | nl\_NL | se | sv\_FI | sv\_SE |
|---|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|--:|
| LANGCODE | 1 | | | | | | | | | | | | | | |
| NAME | | | | | | | | | 1 | 1 | | | | | |
| HOME | | | | | | | | | 1 | 1 | | | | | |
| VERSION | | | | | | | | | 1 | 1 | 1 | | | | |
| COMPOUNDFIRST | | | | | | | | | 1 | | | | | | |
| ONLYROOT | | | | | | | | | 1 | | | | | | |
| HU_KOTOHANGZO | | | | | | | | | 1 | | | | | | |
| COMPOUNDEND | | 1 | 1 | 1 | 1 | | | 1 | 1 | | | 1 | 1 | 1 | 1 |
| GENERATE | | | | | | | | | 1 | | | | | | |
| LEFTHYPHENMIN | | | | | | | 1 | | | | | | | | |
| MIDWORD | | | | | | 1 | | | | | | | | | |
| BAD | | | | | | 1 | | | | | | | | | |

View File

@ -1,550 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* Un-munch a root word list with affix tags
* to recreate the original word list
*/
#include <ctype.h>
#include <string.h>
#include <string>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <limits>
#include "unmunch.h"
/* unmunch dic_file affix_file
 *
 * Parses the affix file, then reads the munched word list line by line,
 * expands every root word with its affix flags and prints the resulting
 * words to stdout, one per line.
 *
 * Exit status: 0 on success, 1 on usage errors or when a file cannot be
 * opened or parsed, 2 when the first line of the word list cannot be read.
 */
int main(int argc, char** argv) {
  char ts[MAX_LN_LEN];

  /* first parse the command line options */
  /* arg1 - munched wordlist, arg2 - affix file */
  if (argc < 3 || !argv[1] || !argv[2]) {
    fprintf(stderr, "correct syntax is:\n");
    fprintf(stderr, "unmunch dic_file affix_file\n");
    exit(1);
  }
  /* argv stays valid for the whole run, so no copies are needed */
  const char* wf = argv[1];
  const char* af = argv[2];

  /* open the affix file */
  FILE* afflst = fopen(af, "r");
  if (!afflst) {
    fprintf(stderr, "Error - could not open affix description file\n");
    exit(1);
  }

  /* step one is to parse the affix file building up the internal
     affix data structures */
  numpfx = 0;
  numsfx = 0;
  fullstrip = 0;

  if (parse_aff_file(afflst)) {
    fprintf(stderr, "Error - in affix file loading\n");
    exit(1);
  }
  fclose(afflst);

  fprintf(stderr, "parsed in %d prefixes and %d suffixes\n", numpfx, numsfx);

  /* affix file is now parsed so create hash table of wordlist on the fly */

  /* open the wordlist */
  FILE* wrdlst = fopen(wf, "r");
  if (!wrdlst) {
    fprintf(stderr, "Error - could not open word list file\n");
    exit(1);
  }

  /* skip over the hash table size */
  if (!fgets(ts, MAX_LN_LEN - 1, wrdlst)) {
    fclose(wrdlst);
    return 2;
  }

  while (fgets(ts, MAX_LN_LEN - 1, wrdlst)) {
    mychomp(ts);

    /* split each line into word and affix char strings */
    char* ap = strchr(ts, '/');
    int al = 0;
    if (ap) {
      *ap = '\0';
      ap++;
      al = strlen(ap);
    }
    const int wl = strlen(ts);

    /* the root word itself is always the first result */
    numwords = 0;
    wlist[numwords].word = mystrdup(ts);
    wlist[numwords].pallow = 0;
    numwords++;

    if (al)
      expand_rootword(ts, wl, ap);

    /* print the results and reset the list for the next root word */
    for (int i = 0; i < numwords; i++) {
      /* a word is NULL when its copy could not be allocated */
      if (wlist[i].word)
        fprintf(stdout, "%s\n", wlist[i].word);
      free(wlist[i].word);
      wlist[i].word = NULL;
      wlist[i].pallow = 0;
    }
  }

  fclose(wrdlst);
  return 0;
}
int parse_aff_file(FILE* afflst) {
int i, j;
int numents = 0;
char achar = '\0';
short ff = 0;
struct affent* ptr = NULL;
struct affent* nptr = NULL;
char* line = (char*)malloc(MAX_LN_LEN);
while (fgets(line, MAX_LN_LEN, afflst)) {
mychomp(line);
char ft = ' ';
fprintf(stderr, "parsing line: %s\n", line);
if (strncmp(line, "FULLSTRIP", 9) == 0)
fullstrip = 1;
if (strncmp(line, "PFX", 3) == 0)
ft = 'P';
if (strncmp(line, "SFX", 3) == 0)
ft = 'S';
if (ft != ' ') {
char* tp = line;
char* piece;
ff = 0;
i = 0;
while ((piece = mystrsep(&tp, ' '))) {
if (*piece != '\0') {
switch (i) {
case 0:
break;
case 1: {
achar = *piece;
break;
}
case 2: {
if (*piece == 'Y')
ff = XPRODUCT;
break;
}
case 3: {
numents = atoi(piece);
if ((numents <= 0) || ((std::numeric_limits<size_t>::max() /
sizeof(struct affent)) < static_cast<size_t>(numents))) {
fprintf(stderr, "Error: too many entries: %d\n", numents);
numents = 0;
} else {
ptr = (struct affent*)malloc(numents * sizeof(struct affent));
ptr->achar = achar;
ptr->xpflg = ff;
fprintf(stderr, "parsing %c entries %d\n", achar, numents);
}
break;
}
default:
break;
}
i++;
}
free(piece);
}
/* now parse all of the sub entries*/
nptr = ptr;
for (j = 0; j < numents; j++) {
if (!fgets(line, MAX_LN_LEN, afflst))
return 1;
mychomp(line);
tp = line;
i = 0;
while ((piece = mystrsep(&tp, ' '))) {
if (*piece != '\0') {
switch (i) {
case 0: {
if (nptr != ptr) {
nptr->achar = ptr->achar;
nptr->xpflg = ptr->xpflg;
}
break;
}
case 1:
break;
case 2: {
nptr->strip = mystrdup(piece);
nptr->stripl = strlen(nptr->strip);
if (strcmp(nptr->strip, "0") == 0) {
free(nptr->strip);
nptr->strip = mystrdup("");
nptr->stripl = 0;
}
break;
}
case 3: {
nptr->appnd = mystrdup(piece);
nptr->appndl = strlen(nptr->appnd);
if (strcmp(nptr->appnd, "0") == 0) {
free(nptr->appnd);
nptr->appnd = mystrdup("");
nptr->appndl = 0;
}
if (strchr(nptr->appnd, '/')) {
char* addseparator =
(char*)realloc(nptr->appnd, nptr->appndl + 2);
if (addseparator) {
nptr->appndl++;
addseparator[nptr->appndl - 1] = '|';
addseparator[nptr->appndl] = '\0';
nptr->appnd = addseparator;
}
}
break;
}
case 4: {
encodeit(nptr, piece);
}
fprintf(stderr, " affix: %s %d, strip: %s %d\n", nptr->appnd,
nptr->appndl, nptr->strip, nptr->stripl);
// no break
default:
break;
}
i++;
}
free(piece);
}
nptr++;
}
if (ptr) {
if (ft == 'P') {
ptable[numpfx].aep = ptr;
ptable[numpfx].num = numents;
fprintf(stderr, "ptable %d num is %d flag %c\n", numpfx,
ptable[numpfx].num, ptr->achar);
numpfx++;
} else if (ft == 'S') {
stable[numsfx].aep = ptr;
stable[numsfx].num = numents;
fprintf(stderr, "stable %d num is %d flag %c\n", numsfx,
stable[numsfx].num, ptr->achar);
numsfx++;
}
ptr = NULL;
}
nptr = NULL;
numents = 0;
achar = '\0';
}
}
free(line);
return 0;
}
void encodeit(struct affent* ptr, char* cs) {
int nc;
int neg;
int grp;
int n;
int ec;
int nm;
int i, j, k;
unsigned char mbr[MAX_WD_LEN];
/* now clear the conditions array */
for (i = 0; i < SET_SIZE; i++)
ptr->conds[i] = (unsigned char)0;
/* now parse the string to create the conds array */
nc = strlen(cs);
neg = 0; /* complement indicator */
grp = 0; /* group indicator */
n = 0; /* number of conditions */
ec = 0; /* end condition indicator */
nm = 0; /* number of member in group */
i = 0;
if (strcmp(cs, ".") == 0) {
ptr->numconds = 0;
return;
}
while (i < nc) {
unsigned char c = *((unsigned char*)(cs + i));
if (c == '[') {
grp = 1;
c = 0;
}
if ((grp == 1) && (c == '^')) {
neg = 1;
c = 0;
}
if (c == ']') {
ec = 1;
c = 0;
}
if ((grp == 1) && (c != 0)) {
*(mbr + nm) = c;
nm++;
c = 0;
}
if (c != 0) {
ec = 1;
}
if (ec) {
if (grp == 1) {
if (neg == 0) {
for (j = 0; j < nm; j++) {
k = (unsigned int)mbr[j];
ptr->conds[k] = ptr->conds[k] | (1 << n);
}
} else {
for (j = 0; j < SET_SIZE; j++)
ptr->conds[j] = ptr->conds[j] | (1 << n);
for (j = 0; j < nm; j++) {
k = (unsigned int)mbr[j];
ptr->conds[k] = ptr->conds[k] & ~(1 << n);
}
}
neg = 0;
grp = 0;
nm = 0;
} else {
/* not a group so just set the proper bit for this char */
/* but first handle special case of . inside condition */
if (c == '.') {
/* wild card character so set them all */
for (j = 0; j < SET_SIZE; j++)
ptr->conds[j] = ptr->conds[j] | (1 << n);
} else {
ptr->conds[(unsigned int)c] = ptr->conds[(unsigned int)c] | (1 << n);
}
}
n++;
ec = 0;
}
i++;
}
ptr->numconds = n;
return;
}
/* apply every matching prefix entry of one class to a word
 *
 * word - the word to be prefixed
 * len  - its length
 * ep   - first entry of the prefix class
 * num  - number of entries in the class
 *
 * Each generated form is appended to wlist, as long as there is room for it.
 */
void pfx_add(const char* word, int len, struct affent* ep, int num) {
  for (int idx = 0; idx < num; idx++) {
    struct affent* entry = ep + idx;

    /* the strip string must leave something of the word (unless
     * fullstrip is set) and the word must cover all conditions */
    if (len + fullstrip <= entry->stripl)
      continue;
    if (len < entry->numconds)
      continue;
    /* the word has to start with the strip string */
    if ((entry->stripl != 0) &&
        (strncmp(entry->strip, word, entry->stripl) != 0))
      continue;

    /* condition n is checked against the n-th character of the word */
    const unsigned char* scan = (const unsigned char*)word;
    int passed = 0;
    while (passed < entry->numconds) {
      if ((entry->conds[*scan++] & (1 << passed)) == 0)
        break;
      passed++;
    }
    if (passed < entry->numconds)
      continue;

    /* all conditions hold: prefix + word without the strip string */
    std::string derived;
    if (entry->appndl) {
      derived.append(entry->appnd);
    }
    derived.append(word + entry->stripl);

    if (numwords < MAX_WORDS) {
      wlist[numwords].word = mystrdup(derived.c_str());
      wlist[numwords].pallow = 0;
      numwords++;
    }
  }
}
/* apply every matching suffix entry of one class to a word
 *
 * word - the word to be suffixed
 * len  - its length
 * ep   - first entry of the suffix class
 * num  - number of entries in the class
 *
 * Each generated form is appended to wlist, as long as there is room for it.
 * Its pallow field records whether the class allows cross products, i.e.
 * whether the form may additionally receive a prefix.
 */
void suf_add(const char* word, int len, struct affent* ep, int num) {
  for (int idx = 0; idx < num; idx++) {
    struct affent* entry = ep + idx;

    /* the strip string must leave something of the word (unless
     * fullstrip is set) and the word must cover all conditions */
    if (len + fullstrip <= entry->stripl)
      continue;
    if (len < entry->numconds)
      continue;
    /* the word has to end with the strip string */
    if ((entry->stripl != 0) &&
        (strcmp(entry->strip, word + len - entry->stripl) != 0))
      continue;

    /* the conditions are checked backwards from the end of the word:
     * the last condition against the last character, and so on */
    const unsigned char* scan = (const unsigned char*)(word + len);
    int pending = entry->numconds;
    while (pending > 0) {
      --scan;
      if ((entry->conds[*scan] & (1 << (pending - 1))) == 0)
        break;
      --pending;
    }
    if (pending > 0)
      continue;

    /* all conditions hold: word without the strip string + suffix */
    std::string derived(word);
    derived.resize(len - entry->stripl);
    derived.append(entry->appnd);

    if (numwords < MAX_WORDS) {
      wlist[numwords].word = mystrdup(derived.c_str());
      wlist[numwords].pallow = (entry->xpflg & XPRODUCT);
      numwords++;
    }
  }
}
int expand_rootword(const char* ts, int wl, const char* ap) {
int i;
int nh = 0;
for (i = 0; i < numsfx; i++) {
if (strchr(ap, (stable[i].aep)->achar)) {
suf_add(ts, wl, stable[i].aep, stable[i].num);
}
}
nh = numwords;
if (nh > 1) {
for (int j = 1; j < nh; j++) {
if (wlist[j].pallow) {
for (i = 0; i < numpfx; i++) {
if (strchr(ap, (ptable[i].aep)->achar)) {
if ((ptable[i].aep)->xpflg & XPRODUCT) {
int nwl = strlen(wlist[j].word);
pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num);
}
}
}
}
}
}
for (i = 0; i < numpfx; i++) {
if (strchr(ap, (ptable[i].aep)->achar)) {
pfx_add(ts, wl, ptable[i].aep, ptable[i].num);
}
}
return 0;
}
/* cut the next token off a string, using a single delimiter character
 *
 * Works like strsep(), with two differences: the delimiter is one char
 * instead of a set of chars, and the token is returned as a freshly
 * allocated copy (to be released with free()) while the input string is
 * left unmodified.
 *
 * stringp - in: start of the text to scan; out: start of the remainder
 * delim   - the delimiter character
 * returns - the token (empty when the text starts with a delimiter), or
 *           NULL when no text is left or the allocation failed
 */
char* mystrsep(char** stringp, const char delim) {
  char* start = *stringp;
  const int remaining = strlen(start);
  if (remaining <= 0)
    return NULL;

  char* hit = (char*)memchr(start, (int)((unsigned char)delim), remaining);
  if (hit != NULL) {
    /* token ends in front of the delimiter, scanning goes on behind it */
    const ptrdiff_t toklen = hit - start;
    *stringp = hit + 1;
    char* token = (char*)malloc(toklen + 1);
    if (token != NULL) {
      memcpy(token, start, toklen);
      token[toklen] = '\0';
    }
    return token;
  }

  /* no delimiter left: the rest of the text is the last token */
  char* token = (char*)malloc(remaining + 1);
  if (token != NULL) {
    memcpy(token, start, remaining);
    token[remaining] = '\0';
    *stringp = start + remaining;
  }
  return token;
}
/* duplicate a string into memory obtained from malloc()
 *
 * returns NULL when s is NULL or when the allocation fails; otherwise the
 * copy, which the caller has to release with free()
 */
char* mystrdup(const char* s) {
  if (s == NULL)
    return NULL;

  /* the terminating NUL is copied along with the characters */
  const int bytes = strlen(s) + 1;
  char* copy = (char*)malloc(bytes);
  if (copy != NULL)
    memcpy(copy, s, bytes);
  return copy;
}
/* remove the line terminator ("\n" or "\r\n") from the end of a string
 *
 * The string is modified in place. A '\r' is only removed as part of a
 * "\r\n" pair: a line without a trailing '\n' (e.g. the last line of a file)
 * is left untouched, even when its last but one character is a '\r'.
 */
void mychomp(char* s) {
  size_t k = strlen(s);
  if ((k > 0) && (s[k - 1] == '\n')) {
    s[--k] = '\0';
    /* the '\n' is gone, drop the '\r' of a DOS line ending as well */
    if ((k > 0) && (s[k - 1] == '\r'))
      s[k - 1] = '\0';
  }
}

View File

@ -1,109 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* unmunch header file */

/* size of the buffers lines of the word list and affix file are read into */
#define MAX_LN_LEN 200
/* size of the buffer encodeit collects the members of a group in */
#define MAX_WD_LEN 200
/* number of slots in the prefix table (ptable) */
#define MAX_PREFIXES 256
/* number of slots in the suffix table (stable) */
#define MAX_SUFFIXES 256
/* number of slots in the list of generated words (wlist) */
#define MAX_WORDS 500000

/* NOTE(review): presumably the default rotation for ROTATE - confirm */
#define ROTATE_LEN 5

/* rotate the 32 bit value v left by q bits, in place
 * Both arguments are evaluated more than once. The expansion ends with a
 * semicolon of its own, which is kept for the sake of existing uses. */
#define ROTATE(v, q) \
  (v) = ((v) << (q)) | (((v) >> (32 - (q))) & ((1 << (q)) - 1));

/* number of slots of a conditions array: one per unsigned char value */
#define SET_SIZE 256

/* bit in affent.xpflg: the affix class allows cross products */
#define XPRODUCT (1 << 0)
/* the affix table entry: one rule (one entry line) of a PFX/SFX class.
 * parse_aff_file fills the fields, pfx_add and suf_add apply the rule. */
struct affent {
/* text attached to the word; empty when the affix file says "0" */
char* appnd;
/* text removed from the word first; empty when the affix file says "0" */
char* strip;
/* length of appnd */
short appndl;
/* length of strip */
short stripl;
/* flag character of the class (first character of the header's flag field) */
char achar;
/* XPRODUCT when the cross product field of the header starts with 'Y' */
char xpflg;
/* number of conditions stored in conds; 0 for the condition "." */
short numconds;
/* conds[c] has bit n set when character c satisfies condition n,
 * see encodeit */
char conds[SET_SIZE];
};
/* one slot of the prefix/suffix table: the entries of one affix class */
struct affixptr {
/* first element of the array of entries */
struct affent* aep;
/* number of entries in that array */
int num;
};
/* the prefix and suffix table */
/* NOTE(review): these are definitions, not extern declarations - the header
 * can be included by one translation unit only */
int numpfx; /* Number of prefixes in table */
int numsfx; /* Number of suffixes in table */
/* the prefix table */
struct affixptr ptable[MAX_PREFIXES];
/* the suffix table */
struct affixptr stable[MAX_SUFFIXES];
/* set to 1 when the affix file has a FULLSTRIP line: pfx_add and suf_add
 * may then strip a word completely */
int fullstrip;
int numwords; /* number of words found */
/* one generated word form */
struct dwords {
/* the word, allocated with mystrdup */
char* word;
/* non-zero when the form came from a suffix class that allows cross
 * products, i.e. expand_rootword may put a prefix in front of it */
int pallow;
};
struct dwords wlist[MAX_WORDS]; /* list words found */
/* the routines */
/* read the affix file into ptable/stable; returns 0 on success, 1 on error */
int parse_aff_file(FILE* afflst);
/* encode the condition string cs into ptr->conds and ptr->numconds */
void encodeit(struct affent* ptr, char* cs);
/* append all affixed forms of a root word (word, length, flags) to wlist */
int expand_rootword(const char*, int, const char*);
/* append the forms the prefix entries ep[0..num-1] generate for word */
void pfx_add(const char* word, int len, struct affent* ep, int num);
/* append the forms the suffix entries ep[0..num-1] generate for word */
void suf_add(const char* word, int len, struct affent* ep, int num);
/* return a malloc'ed copy of the next delim separated token, or NULL when
 * nothing is left; *stringp is advanced behind the token */
char* mystrsep(char** stringp, const char delim);
/* return a malloc'ed copy of s, or NULL when s is NULL */
char* mystrdup(const char* s);
/* cut the line terminator off the end of s */
void mychomp(char* s);

View File

@ -1,35 +0,0 @@
#!/bin/sh
# wordforms - print affixed forms of a word
#
# Usage: wordforms [-s | -p] dictionary.aff dictionary.dic word
#
# The awk program below collects the SFX/PFX rules of the affix file and
# applies them blindly to the word: prefixed forms, suffixed forms and, when
# neither -s nor -p is given, the combinations of both. The candidates are
# piped to hunspell, which runs with a dictionary holding only the entries
# of the query word (see hunspell(1) for the -G and -l options).
#
# NOTE: the work files have fixed names below /tmp, so concurrent runs
# overwrite each other's files.

fx=0
case "$1" in
-s) fx=1; shift;;
-p) fx=2; shift;;
esac

# checked after the option has been consumed, so that "wordforms -s aff dic"
# is rejected instead of running with an empty word
if [ $# -lt 3 ]; then
echo "Usage: wordforms [-s | -p] dictionary.aff dictionary.dic word
-s: print only suffixed forms
-p: print only prefixed forms
"; exit 1
fi

# the link needs an absolute target: only relative paths get $PWD in front
case "$1" in
/*) aff=$1;;
*) aff=$PWD/$1;;
esac

test -h /tmp/wordforms.aff && rm /tmp/wordforms.aff
# without the link the pipeline below would read a stale or missing file
ln -s "$aff" /tmp/wordforms.aff || exit 1

# prepared dic only with the query word
echo 1 >/tmp/wordforms.dic
grep "^$3/" "$2" >>/tmp/wordforms.dic

printf '%s\n' "$3" | awk -v "fx=$fx" '
fx!=2 && FILENAME!="-" && /^SFX/ && NF > 4{split($4,a,"/");clen=($3=="0") ? 0 : length($3);sfx[a[1],clen]=a[1];sfxc[a[1],clen]=clen;next}
fx!=1 && FILENAME!="-" && /^PFX/ && NF > 4{split($4,a,"/");clen=($3=="0") ? 0 : length($3);pfx[a[1],clen]=a[1];pfxc[a[1],clen]=clen;next}
FILENAME=="-"{
wlen=length($1)
if (fx==0 || fx==2) {
for (j in pfx) {if (wlen<=pfxc[j]) continue; print (pfx[j]=="0" ? "" : pfx[j]) substr($1, pfxc[j]+1)}
}
if (fx==0 || fx==1) {
for(i in sfx){clen=sfxc[i];if (wlen<=clen) continue; print substr($1, 1, wlen-clen) (sfx[i]=="0" ? "": sfx[i]) }
}
if (fx==0) {
for (j in pfx) {if (wlen<=pfxc[j]) continue;
for(i in sfx){clen=sfxc[i];if (wlen<=clen || wlen <= (clen + pfxc[j]))continue;
print (pfx[j]=="0" ? "" : pfx[j]) substr($1, pfxc[j]+1, wlen-clen-pfxc[j]) (sfx[i]=="0" ? "": sfx[i]) }}
}
}
' /tmp/wordforms.aff - | hunspell -d /tmp/wordforms -G -l

View File

@ -1,37 +0,0 @@
#!/bin/sh
#
# (C) 2008 Caolán McNamara <caolanm@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# This creates a LANG_TERRITORY .aff & .dic from a wordlist.
# It is only a simple wordlist spellchecking dictionary output, no
# knowledge of language rules can be extrapolated to shrink the
# wordlist or provide .aff rules for extending wordstems
#
# $1 - the wordlist, one word per line
# $2 - the locale, used as base name of the files written to the
#      current directory

if [ $# -lt 2 ]; then
echo "Usage: wordlist2hunspell wordlist_file locale"
echo "e.g. wordlist2hunspell breton.words br_FR to create br_FR.dic and br_FR.aff in cwd"
exit 1
fi

export LANG="$2.utf8"

echo "# A basic .aff for a raw wordlist, created through wordlist2hunspell" > "$2.aff"
echo SET UTF-8 >> "$2.aff"

# TRY lists the characters of the wordlist, the most frequent one first:
# one character per line, counted with uniq -c, sorted by count.
#see https://bugzilla.redhat.com/show_bug.cgi?id=462184 for the "C" hacks
try=$(sed 's/./&\n/g' "$1" | sed '/^$/d' | LC_ALL=C sort -n | LC_ALL=C uniq -c | LC_ALL=C sort -rn | tr -s ' ' | cut -d ' ' -f 3 | tr -d '\n')
# printf with a quoted argument: the characters are written as they are,
# without pathname expansion (e.g. "*") or echo's handling of backslashes
printf 'TRY%s\n' "${try:+ $try}" >> "$2.aff"

# the .dic file starts with the number of words, followed by the sorted words
sed '/^$/d' "$1" | wc -l > "$2.dic"
LC_ALL=C sort "$1" | sed '/^$/d' >> "$2.dic"

echo "Basic $2.dic and $2.aff created"

Some files were not shown because too many files have changed in this diff Show More