/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: GPL 2.0
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License. You should have
* received a copy of the GPL license along with this program; if you
* did not, you can find it at http://www.gnu.org/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Coreseek.com code.
*
* Copyright (C) 2007-2008. All Rights Reserved.
*
* Author:
* Li monan
*
* ***** END LICENSE BLOCK ***** */
#include "UnigramCorpusReader.h"
#include "UnigramDict.h"
namespace css {
/**
 * Replace the current double array with the one stored in a file.
 *
 * Any data already held by m_da is cleared before the file is opened.
 *
 * @param filename path handed to m_da.open()
 * @return the status returned by m_da.open()
 */
int UnigramDict::load(const char* filename)
{
	// Drop the previous contents first so the dictionary never holds a mix
	// of old and new data.
	m_da.clear();

	const int status = m_da.open(filename);
	return status;
}
/**
 * Debug-only helper: intended to be used in Debug mode only.
 *
 * The current implementation ignores the id and always returns an empty
 * string.
 *
 * @param id word id (unused)
 * @return an empty string
 */
std::string UnigramDict::getString(int id)
{
	return std::string();
}
/**
 * Run a common-prefix search for buf against the dictionary.
 *
 * The work is delegated to m_da.commonPrefixSearch(); presumably (per Darts)
 * it reports the dictionary entries that are prefixes of buf - confirm
 * against the bundled darts.h.
 *
 * @param buf        text to search with
 * @param result     caller-supplied array that receives the hits
 * @param result_len capacity of result, passed through unchanged
 * @param keylen     length of buf, passed through unchanged
 * @return total number of items found, or 0 when no dictionary is loaded
 */
int UnigramDict::findHits(const char* buf, result_pair_type *result, size_t result_len, int keylen)
{
	if(m_da.array()){
		const int hits = m_da.commonPrefixSearch(buf, result, result_len, keylen);
		return hits;
	}
	// Nothing has been loaded or built yet.
	return 0;
}
/**
 * Build the double array from the records supplied by a UnigramCorpusReader.
 *
 * For each record it keeps, the loop appends a pointer to the record's key to
 * `key` and the record's count to `value`; m_da is then cleared and rebuilt
 * from those two parallel arrays.
 *
 * NOTE(review): this copy of the function looks damaged. The element types of
 * both std::vector declarations are missing, and the loop header runs
 * straight into the key-pointer expression, so the loop bound, the record
 * fetch and the opening braces are not visible here. Restore the block from
 * the upstream source before building - TODO confirm.
 *
 * @param ur reader that provides the UnigramRecord entries
 * @return the value returned by m_da.build()
 */
int UnigramDict::import(UnigramCorpusReader &ur)
{
// Parallel arrays handed to m_da.build(): key pointers and their values.
std::vector key;
std::vector value;
int i = 0;
UnigramRecord* rec = NULL;
// NOTE(review): loop header is truncated in this copy (see function comment).
for(i=0;ikey[0];
key.push_back(ptr);
value.push_back(rec->count);
}
}//end for
//build da
m_da.clear();
//1st 0 is the length array.
//return m_da.build(key.size(), &key[0], 0, 0, &progress_bar) ;
// NOTE(review): &key[0] and &value[0] are taken even if nothing was pushed;
// verify that callers never import an empty corpus.
return m_da.build(key.size(), &key[0], 0, &value[0] ) ;
}
/**
 * Write the double array to a file.
 *
 * The status of m_da.save() is passed back to the caller instead of being
 * discarded, so a failed write is no longer reported as success. This matches
 * load(), which already returns the status of m_da.open().
 *
 * @param filename path handed to m_da.save()
 * @return the status returned by m_da.save(); presumably 0 on success and
 *         non-zero on failure, as in stock Darts - confirm against the
 *         bundled darts.h
 */
int UnigramDict::save(const char* filename)
{
	return m_da.save(filename);
}
/**
 * Report whether the double array currently holds data.
 *
 * @return 1 when m_da.array() is non-null, 0 otherwise
 */
int UnigramDict::isLoad()
{
	return m_da.array() ? 1 : 0;
}
/**
 * Look up key as an exact dictionary entry.
 *
 * @param key NUL-terminated string to look up; the single-character fallback
 *            below treats it as UTF-8.
 * @param id  optional output, may be NULL. Receives rs.pos from the search.
 *            If the search leaves pos at 0 and key is exactly one character,
 *            a negative id is stored instead: -(byte value) for one ASCII
 *            byte, -(code point) for one well-formed 2- or 3-byte sequence.
 *            In every other case (empty key, longer key, malformed bytes) id
 *            keeps the value of rs.pos.
 * @return rs.value as filled in by m_da.exactMatchSearch()
 */
int UnigramDict::exactMatch(const char* key, int *id)
{
	Darts::DoubleArray::result_pair_type rs;
	m_da.exactMatchSearch(key,rs);
	if(id)
		*id = rs.pos;
	if(rs.pos)
		return rs.value;

	// FIXME: ids are a mixture. A single character gets an id > 0 if it
	// occurs in the unigram input text and an id < 0 if it does not, so the
	// UCS2 code cannot simply be used as a character's id.
	// Fixed in the prof. version by changing the unigram-dictionary format.

	// The fallback only ever writes *id, so there is nothing to do without it.
	if(!id)
		return rs.value;

	// Only keys of 1..3 bytes can be a single character handled here. The
	// empty key must be rejected explicitly: it used to pass the
	// "byte count == length" test below and read past the terminating NUL.
	const size_t len = strlen(key);
	if(len == 0 || len >= 4)
		return rs.value;

	// Work on unsigned bytes so that values >= 0x80 compare and shift
	// predictably regardless of whether plain char is signed.
	const unsigned char* bytes = reinterpret_cast<const unsigned char*>(key);
	unsigned char lead = bytes[0];

	if(len == 1){
		// Single ASCII character: id is the negated byte value. A lone byte
		// >= 0x80 is not a complete character and leaves id untouched.
		if(lead < 0x80)
			*id = -static_cast<int>(lead);
		return rs.value;
	}

	// Count the leading 1-bits of the first byte: for a UTF-8 lead byte this
	// is the length of the sequence. The shifts leave the payload bits of the
	// lead byte in the upper part of `lead`.
	size_t seqLen = 0;
	while(lead & 0x80){
		++seqLen;
		lead = static_cast<unsigned char>(lead << 1);
	}
	// Not a single character unless the sequence spans the whole key.
	if(seqLen != len)
		return rs.value;

	// Decode: payload of the lead byte, then 6 bits per continuation byte.
	int code = lead >> seqLen;
	for(size_t i = 1; i < len; ++i){
		if((bytes[i] & 0xC0) != 0x80){
			// Malformed continuation byte: give up, keep id as it is.
			code = 0;
			break;
		}
		code = (code << 6) + (bytes[i] & 0x3F);
	}
	if(code)
		*id = -code;
	return rs.value;
}
} /* End of namespace css */