/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: GPL 2.0
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License. You should have
* received a copy of the GPL license along with this program; if you
* did not, you can find it at http://www.gnu.org/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Coreseek.com code.
*
* Copyright (C) 2007-2008. All Rights Reserved.
*
* Author:
* Li monan
*
* ***** END LICENSE BLOCK ***** */
#include "Segmenter.h"
#include "SegmenterManager.h"
extern "C"{
#include "iniparser/iniparser.h"
}
namespace css {
const char g_ngram_unigram_dict_name[] = "uni.lib";
const char g_kword_unigram_dict_name[] = "kw.lib";
const char g_wordweight_unigram_dict_name[] = "weight.lib";
const char g_synonyms_dict_name[] = "synonyms.dat";
const char g_thesaurus_dict_name[] = "thesaurus.lib";
const char g_config_name[] = "mmseg.ini";
/**
* Return a newly created segmenter
*/
Segmenter *SegmenterManager::getSegmenter( bool bFromPool)
{
Segmenter* seg = NULL;
if(m_method == SEG_METHOD_NGRAM){
if(bFromPool)
seg = seg_freelist_.alloc();
else
seg = new Segmenter();
//init seg
seg->m_unidict = &m_uni;
seg->m_symdict = &m_sym;
if(m_kw.isLoad())
seg->m_kwdict = &m_kw;
if(m_weight.isLoad())
seg->m_weightdict = &m_weight;
if(m_thesaurus.isLoad())
seg->m_thesaurus = &m_thesaurus;
seg->m_config = &m_config;
}
return seg;
}
void SegmenterManager::loadconfig(const char* confile)
{
if(confile == NULL)
return;
dictionary * ini;
char * s;
int sl = 0;
//m_config
ini = iniparser_load(confile);
if (ini==NULL) {
return; // not exist or not a valid ini file
}
/*
u1 merge_number_and_ascii;
u1 seperate_number_ascii;
u1 compress_space;
u1 number_and_ascii_joint[512];
*/
m_config.merge_number_and_ascii =
iniparser_getboolean(ini, "mmseg:merge_number_and_ascii", 0);
m_config.seperate_number_ascii =
iniparser_getboolean(ini, "mmseg:seperate_number_ascii", 0);
m_config.compress_space =
iniparser_getboolean(ini, "mmseg:compress_space", 0);
s =
iniparser_getstring(ini, "mmseg:number_and_ascii_joint", NULL);
if(s){
sl = strlen(s);
if(sl>511){
memcpy(m_config.number_and_ascii_joint,s,sl);
m_config.number_and_ascii_joint[511] = 0;
}else{
memcpy(m_config.number_and_ascii_joint,s,sl);
m_config.number_and_ascii_joint[sl] = 0;
}
}
}
int SegmenterManager::init(const char* path, u1 method)
{
if( method != SEG_METHOD_NGRAM)
return -4; //unsupport segmethod.
if( m_inited )
return 0; //only can be init once.
char buf[1024];
memset(buf,0,sizeof(buf));
if(!path)
memcpy(buf,".",1);
else
memcpy(buf,path,strlen(path));
int nLen = (int)strlen(path);
//check is end.
#ifdef WIN32
if(buf[nLen-1] != '\\'){
buf[nLen] = '\\';
nLen++;
}
#else
if(buf[nLen-1] != '/'){
buf[nLen] = '/';
nLen++;
}
#endif
m_method = method;
int nRet = 0;
if(method == SEG_METHOD_NGRAM) {
seg_freelist_.set_size(64);
memcpy(&buf[nLen],g_ngram_unigram_dict_name,strlen(g_ngram_unigram_dict_name));
nRet = m_uni.load(buf);
if(nRet!=0){
printf("Unigram dictionary load Error\n");
return nRet;
}
//no needs to care kwformat
memcpy(&buf[nLen],g_kword_unigram_dict_name,strlen(g_kword_unigram_dict_name));
buf[nLen+strlen(g_kword_unigram_dict_name)] = 0;
nRet = m_kw.load(buf);
if(nRet!=0 && nRet!=-1 ){
//m_kw not exist or format error.
printf("Keyword dictionary load Error\n");
return nRet;
}
//try to load weight dict
memcpy(&buf[nLen],g_wordweight_unigram_dict_name,strlen(g_wordweight_unigram_dict_name));
buf[nLen+strlen(g_wordweight_unigram_dict_name)] = 0;
nRet = m_weight.load(buf);
if(nRet!=0 && nRet!=-1 ){
//m_kw not exist or format error.
printf("Keyword dictionary load Error\n");
return nRet;
}
memcpy(&buf[nLen],g_synonyms_dict_name,strlen(g_synonyms_dict_name));
buf[nLen+strlen(g_synonyms_dict_name)] = 0;
//load g_synonyms_dict_name, we do not care the load in right or not
nRet = m_sym.load(buf);
if(nRet!=0 && nRet != -1){
printf("Synonyms dictionary format Error\n");
}
memcpy(&buf[nLen],g_thesaurus_dict_name,strlen(g_thesaurus_dict_name));
buf[nLen+strlen(g_thesaurus_dict_name)] = 0;
//load g_synonyms_dict_name, we do not care the load in right or not
nRet = m_thesaurus.load(buf);
if(nRet!=0 && nRet != -1){
printf("Thesaurus dictionary format Error\n");
}
//read config
memcpy(&buf[nLen],g_config_name,strlen(g_config_name));
buf[nLen+strlen(g_config_name)] = 0;
loadconfig(buf);
nRet = 0;
m_inited = 1;
return nRet;
}
return -1;
}
void SegmenterManager::clear()
{
seg_freelist_.free();
}
SegmenterManager::SegmenterManager()
:m_inited(0)
{
m_method = SEG_METHOD_NGRAM;
}
SegmenterManager::~SegmenterManager()
{
clear();
}
} /* End of namespace css */