/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: GPL 2.0
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License. You should have
 * received a copy of the GPL license along with this program; if you
 * did not, you can find it at http://www.gnu.org/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Coreseek.com code.
 *
 * Copyright (C) 2007-2008. All Rights Reserved.
 *
 * Author:
 *   Li monan
 *
 * ***** END LICENSE BLOCK ***** */

#include "Segmenter.h"
#include "SegmenterManager.h"

#include <stdio.h>
#include <string.h>

extern "C"{
#include "iniparser/iniparser.h"
}

namespace css {

// File names that init() looks up inside the dictionary directory.
const char g_ngram_unigram_dict_name[] = "uni.lib";
const char g_kword_unigram_dict_name[] = "kw.lib";
const char g_wordweight_unigram_dict_name[] = "weight.lib";
const char g_synonyms_dict_name[] = "synonyms.dat";
const char g_thesaurus_dict_name[] = "thesaurus.lib";
const char g_config_name[] = "mmseg.ini";

// Largest number of bytes of "mmseg:number_and_ascii_joint" that loadconfig()
// stores; one further byte is used for the terminating NUL (the field is
// documented in loadconfig() as u1[512]).
static const size_t g_joint_max_len = 511;

// Writes `name` (including its terminating NUL) into `buf` right after the
// first `dirLen` bytes. The caller guarantees that the result fits.
static void set_file_name(char* buf, size_t dirLen, const char* name)
{
  memcpy(buf + dirLen, name, strlen(name) + 1);
}

/**
 * Return a segmenter wired to this manager's dictionaries and configuration.
 *
 * @param bFromPool when true the instance is taken from seg_freelist_.alloc(),
 *                  otherwise it is created with `new Segmenter()`.
 * @return the segmenter, or NULL when m_method is not SEG_METHOD_NGRAM or no
 *         instance could be obtained.
 */
Segmenter *SegmenterManager::getSegmenter( bool bFromPool)
{
  if(m_method != SEG_METHOD_NGRAM)
    return NULL;

  Segmenter* seg = bFromPool ? seg_freelist_.alloc() : new Segmenter();
  // NOTE(review): guards against alloc() handing back NULL; whether it ever
  // does depends on the free-list implementation, which is not visible here.
  if(seg == NULL)
    return NULL;

  //init seg
  seg->m_unidict = &m_uni;
  seg->m_symdict = &m_sym;
  // The optional dictionaries are attached only when they were loaded.
  if(m_kw.isLoad())
    seg->m_kwdict = &m_kw;
  if(m_weight.isLoad())
    seg->m_weightdict = &m_weight;
  if(m_thesaurus.isLoad())
    seg->m_thesaurus = &m_thesaurus;
  seg->m_config = &m_config;
  return seg;
}

/**
 * Read the [mmseg] section of an ini file into m_config.
 *
 * Does nothing when `confile` is NULL or iniparser_load() fails (file missing
 * or not a valid ini file). Boolean keys default to 0 when absent;
 * number_and_ascii_joint is left untouched when its key is absent.
 *
 * @param confile path of the ini file, may be NULL.
 */
void SegmenterManager::loadconfig(const char* confile)
{
  if(confile == NULL)
    return;

  //m_config
  dictionary* ini = iniparser_load(confile);
  if(ini == NULL)
    return; // not exist or not a valid ini file

  /*
    u1 merge_number_and_ascii;
    u1 seperate_number_ascii;
    u1 compress_space;
    u1 number_and_ascii_joint[512];
  */
  m_config.merge_number_and_ascii = iniparser_getboolean(ini, "mmseg:merge_number_and_ascii", 0);
  m_config.seperate_number_ascii = iniparser_getboolean(ini, "mmseg:seperate_number_ascii", 0);
  m_config.compress_space = iniparser_getboolean(ini, "mmseg:compress_space", 0);

  const char* joint = iniparser_getstring(ini, "mmseg:number_and_ascii_joint", NULL);
  if(joint){
    // Clamp BEFORE copying: over-long values are truncated rather than
    // written past the end of the fixed-size field.
    size_t len = strlen(joint);
    if(len > g_joint_max_len)
      len = g_joint_max_len;
    memcpy(m_config.number_and_ascii_joint, joint, len);
    m_config.number_and_ascii_joint[len] = 0;
  }

  // `joint` points into the dictionary, so release it only after the copy.
  iniparser_freedict(ini);
}

/**
 * Load the dictionaries and configuration found in directory `path`.
 *
 * @param path   dictionary directory; NULL or "" means the current directory.
 *               A trailing path separator is appended when missing.
 * @param method segmentation method; only SEG_METHOD_NGRAM is supported.
 * @return 0 on success or when the manager is already initialised;
 *         -4 when `method` is unsupported;
 *         -1 when `path` is too long for the internal path buffer;
 *         otherwise the non-zero code returned by a failing dictionary load().
 */
int SegmenterManager::init(const char* path, u1 method)
{
  if( method != SEG_METHOD_NGRAM)
    return -4; //unsupport segmethod.
  if( m_inited )
    return 0; //only can be init once.

  const char* dir = (path != NULL && path[0] != 0) ? path : ".";
  size_t nLen = strlen(dir);

  // Longest file name that will be appended to the directory prefix.
  const char* const names[] = {
    g_ngram_unigram_dict_name, g_kword_unigram_dict_name,
    g_wordweight_unigram_dict_name, g_synonyms_dict_name,
    g_thesaurus_dict_name, g_config_name
  };
  size_t maxName = 0;
  for(size_t i = 0; i < sizeof(names)/sizeof(names[0]); i++){
    const size_t l = strlen(names[i]);
    if(l > maxName)
      maxName = l;
  }

  char buf[1024];
  // Required room: directory + separator + longest name + NUL.
  if(nLen + 1 + maxName + 1 > sizeof(buf)){
    printf("Dictionary path too long\n");
    return -1;
  }
  memset(buf,0,sizeof(buf));
  memcpy(buf,dir,nLen);

  //check is end.
#ifdef WIN32
  const char sep = '\\';
#else
  const char sep = '/';
#endif
  if(buf[nLen-1] != sep){
    buf[nLen] = sep;
    nLen++;
  }

  m_method = method;
  seg_freelist_.set_size(64);

  // The unigram dictionary is mandatory: any load error aborts init.
  set_file_name(buf, nLen, g_ngram_unigram_dict_name);
  int nRet = m_uni.load(buf);
  if(nRet!=0){
    printf("Unigram dictionary load Error\n");
    return nRet;
  }

  //no needs to care kwformat
  // For the optional dictionaries below a result of -1 is tolerated
  // (presumably "file does not exist" -- confirm against load()).
  set_file_name(buf, nLen, g_kword_unigram_dict_name);
  nRet = m_kw.load(buf);
  if(nRet!=0 && nRet!=-1 ){
    //m_kw not exist or format error.
    printf("Keyword dictionary load Error\n");
    return nRet;
  }

  //try to load weight dict
  set_file_name(buf, nLen, g_wordweight_unigram_dict_name);
  nRet = m_weight.load(buf);
  if(nRet!=0 && nRet!=-1 ){
    //m_weight not exist or format error.
    printf("Weight dictionary load Error\n");
    return nRet;
  }

  //load g_synonyms_dict_name, we do not care the load in right or not
  set_file_name(buf, nLen, g_synonyms_dict_name);
  nRet = m_sym.load(buf);
  if(nRet!=0 && nRet != -1){
    printf("Synonyms dictionary format Error\n");
  }

  //load g_thesaurus_dict_name, we do not care the load in right or not
  set_file_name(buf, nLen, g_thesaurus_dict_name);
  nRet = m_thesaurus.load(buf);
  if(nRet!=0 && nRet != -1){
    printf("Thesaurus dictionary format Error\n");
  }

  //read config
  set_file_name(buf, nLen, g_config_name);
  loadconfig(buf);

  m_inited = 1;
  return 0;
}

/** Release the pooled segmenters by calling seg_freelist_.free(). */
void SegmenterManager::clear()
{
  seg_freelist_.free();
}

/** Create an uninitialised manager that defaults to SEG_METHOD_NGRAM. */
SegmenterManager::SegmenterManager()
  :m_inited(0)
{
  m_method = SEG_METHOD_NGRAM;
}

/** Calls clear() on destruction. */
SegmenterManager::~SegmenterManager()
{
  clear();
}

} /* End of namespace css */