/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* ***** BEGIN LICENSE BLOCK ***** * Version: GPL 2.0 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License. You should have * received a copy of the GPL license along with this program; if you * did not, you can find it at http://www.gnu.org/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Coreseek.com code. * * Copyright (C) 2007-2008. All Rights Reserved. * * Author: * Li monan * * ***** END LICENSE BLOCK ***** */ #include "csr_typedefs.h" #include "SegmentPkg.h" #include //#include #include #include "Segmenter.h" #include "csr_assert.h" namespace css { void ChineseCharTaggerImpl::init() { /** table format. 3part. 1 ASCII < 256 2 CJK u'\u4e00' < x < u'\u9fff' 3 Symbols 0xFF?? and iCode >= 0x3000)&&(iCode <= 0x303F 2 -> CJK part. 81 entry [9f-4e], each entry is 256 size. so the offset is [(high-byte - 4E) * 256 + low-byte] total size is 256*(9f-4e) */ u1 ansipage[]= { '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 's', 'w', 'w', 's', 's', 's', 's', 'w', 's', 's', 's', 's', 'w', 's', 'w', 's', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'w', 'w', 's', 's', 's', 'w', 's', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 's', 's', 's', 's', 's', 's', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 's', 's', 's', 's', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', '\0', }; //0xFF?? u1 sym1[]= { '\0', 'w', 'w', 's', 's', 's', 's', 'w', 's', 's', 's', 's', 'w', 's', 's', 's', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'w', 's', 's', 's', 's', 'w', 's', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 's', 's', 's', 's', 's', 's', 'w', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 's', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', }; // 0x30?? u1 sym2[]= { 's', 's', 'w', 'w', 's', 's', 's', 'm', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', }; //unicode Chinese number u2 chs_number[] ={ 0x4e00, 0x4e03, 0x4e09, 0x4e2a, 0x4e07, 0x9646, 0x4e5d, 0x767e, 0x8086, 0x4e8c, 0x4e94, 0x4ebf, 0x56db, 0x4edf, 0x96f6, 0x58f9, 0x62fe, 0x4f0d, 0x8d30, 0x5341, 0x5343, 0x5146, 0x634c, 0x516b, 0x516d, 0x4f70, 0x7396, 0x53c1, 0x67d2 }; memset(index_map,0,sizeof(index_map)); memcpy(ansi_map,ansipage,sizeof(ansipage)); index_map[0] = ansi_map; u1* sym = new u1[256]; memcpy(sym,sym2,sizeof(sym2)); //0x30 index_map[0x30] = sym; sym = new u1[256]; memcpy(sym,sym1,sizeof(sym1)); //0xff index_map[0xFF] = sym; //make cjk-map for(int i=0; i< sizeof(chs_number)/sizeof(u2); i++){ u1 idx = chs_number[i]>>8; u1* ptr = index_map[idx]; if(!ptr) { ptr = new u1[256]; memset(ptr,0,256); index_map[idx] = ptr; } ptr[(u1)chs_number[i]] = 'n'; //change tag from m -> n used by pure Chinese number } //unicode Chinese Symb //0x2018-0x201f && 0x2025-0x2027 && 0x2032-0x2037, //Not in Chinese Char range, used as sep. Generation Sep. //# NOTE 0x22xx, 0x23xx is number symbol, { u1* ptr = new u1[256]; memset(ptr,'s',256); index_map[0x22] = ptr; index_map[0x23] = ptr; ptr = new u1[256]; memset(ptr,0,256); for(u2 i = 0x2018; i<= 0x201f; i++) ptr[(u1)i] = 'w'; for(u2 i = 0x2025; i<= 0x2027; i++) ptr[(u1)i] = 'w'; for(u2 i = 0x2032; i<= 0x2037; i++) ptr[(u1)i] = 'w'; index_map[0x20] = ptr; } } /** if fact we use low-byte only, high byte is reserved. */ u2 ChineseCharTaggerImpl::tagUnicode(u2 iCode, u1 length){ //check length #ifdef DEBUG ASSERT(length<4); #else if(length>3) return 0; #endif //u'\u4e00' < 到 >8; if(index_map[idx]) { c_type = index_map[idx][(u1)iCode]; } if(c_type) goto DONE; if ((iCode >= 0x4E00)&&(iCode <= 0x9FA5)) //Chinese Char c_type = 'c'; else if((iCode >= 0x3000)&&(iCode <= 0x303F)) //Chinese Symbols and Punctuation c_type = 's'; else //Unknown char. c_type = 'u'; if(c_type) goto DONE; return 0; DONE: return flags[length-1]|(c_type - 'a' + 1); } SegmentPkg::SegmentPkg() // don't delete the following line as it's needed to preserve source code of this autogenerated element // section 127-0-0-1-20b1ad92:11734020486:-8000:0000000000000918 begin { init(); } void SegmentPkg::init() { m_length = 0; m_buf = NULL; m_tag = NULL; m_size = 0; m_remains_bytes = 0; m_used = 0; m_Own = 1; m_tagger = ChineseCharTagger::Get(); } // section 127-0-0-1-20b1ad92:11734020486:-8000:0000000000000918 end // don't delete the previous line as it's needed to preserve source code of this autogenerated element SegmentPkg::~SegmentPkg() { if(m_buf) free((void*)m_buf); m_buf = NULL; if(m_tag) free((void*)m_tag); m_tag = NULL; } /*********************************************************************************/ /* Note:We do NOT needs the iCode(UCS2), for we use UTF-8 as key lookup in dict. */ /* Change m_wTagList as a side effect /*********************************************************************************/ //the total offset of buf[0] int SegmentPkg::tagData(const char* buf, u1* tag, int l, int offset) { int length = l; if(!length) length = (int)strlen(buf); /* Utf8_Iter iter8; iter8.set((const unsigned char*)buf,length,Utf8_16::eUtf16LittleEndian); for (; iter8; ++iter8) { if (iter8.canGet()) { u2 val = iter8.get(); //check the unicode value. } }*/ const char* tm_pCur = buf; const char* tm_pCurFine = buf; const char* tm_pEnd = &buf[length]; u1* tm_pTagCur = tag; u2 iCode = 0; for ( ;(tm_pCur-buf)tagUnicode(iCode,1); if ((*tm_pTagCur&0x3F) == ('w' - 'a' + 1)) //check tag is w m_wTagList.push_back(offset + tm_pTagCur - tag); tm_pCurFine = tm_pCur; continue; } // get number of bytes int iBytes = 0, iBytesLength = 0; while ( v & 0x80 ) { iBytes++; v <<= 1; } // check for valid number of bytes if ( iBytes<2 || iBytes>4 ) continue; iCode = ( v>>iBytes ); iBytesLength = iBytes; iBytes--; do { if ( !(*tm_pCur) ) break; if ( ((*tm_pCur) & 0xC0)!=0x80 ) break; iCode = ( iCode<<6 ) + ( (*tm_pCur) & 0x3F ); iBytes--; tm_pCur++; if(tm_pEnd == tm_pCur) break; } while ( iBytes ); // return code point if there were no errors // ignore and continue scanning otherwise if(iCode == 0xFEFF) { //win32's utf-8 text header. m_used = iBytesLength; continue; } if ( !iBytes ) { *tm_pTagCur = (u1)m_tagger->tagUnicode(iCode,iBytesLength); u1 t1 = 'w'; u1 t2 = (*tm_pTagCur&0x3F); if ((*tm_pTagCur&0x3F) == ('w' - 'a' + 1)) m_wTagList.push_back(offset + tm_pTagCur - tag); tm_pCurFine = tm_pCur; } } ///FIXME: Count token(char) type only once might be a bit faster? /* u1* tm_pTagPtr = tag; u1* tm_pTagPrevPtr = NULL; u1 prev_tag = 0; while(tm_pTagPtr !=tm_pTagCur) { if (*tm_pTagPtr) { //here is a char. u1 tag = (*tm_pTagPtr)&0x3F; //0011 1111 if(tag == 'm' || tag == 'e') { //NOTE: we do not treate 'n' tag(Chinese number now). if(prev_tag != tag) { //char type changed } }//end if tag. } tm_pTagPtr ++; } */ return (int)(length - (tm_pCurFine - buf)); } int SegmentPkg::tagData(const char* buf,int length) { return tagData(buf,&m_tag[m_length-m_remains_bytes],length); } int SegmentPkg::feedData(const char* buf,int length ) { //check is enough if((m_size - m_length) > (length+1)){ //good.enough space memcpy((void*)&m_buf[m_length],buf,length*sizeof(char)); m_remains_bytes = tagData(&m_buf[m_length-m_remains_bytes],&m_tag[m_length-m_remains_bytes], length+m_remains_bytes, m_length-m_remains_bytes); m_length += length; return 0; }else if(m_length==0){ //new pkg, realloc. //check is larger than default length. if(DEFAULT_PACKAGE_LENGTH < length){ m_buf = (char*)malloc(length+1); m_tag = (u1*)malloc((length+1)*sizeof(u1)); m_size = length + 1; }else{ m_buf = (char*)malloc(DEFAULT_PACKAGE_LENGTH+1); m_tag = (u1*)malloc((DEFAULT_PACKAGE_LENGTH+1)*sizeof(u1)); m_size = DEFAULT_PACKAGE_LENGTH+1; } if(!m_buf||!m_tag) throw std::bad_alloc(); memcpy((void*)m_buf,buf,length*sizeof(char)); memset(m_tag,0,sizeof(u1)*(m_size)); m_used = 0; m_remains_bytes = tagData(buf,length); m_length = length; return 0; }else{ //not a new pkg, and new arrived data is larger than new feed data. } //can not append data to current pkg. return -1; } void SegmentPkg::setSize(int length) { if(m_buf && m_size > length) return; //clear if(m_buf) free((void*)m_buf); m_buf = NULL; if(m_tag) free((void*)m_tag); m_tag = NULL; //create new if(DEFAULT_PACKAGE_LENGTH < length){ m_buf = (char*)malloc(length+1); m_tag = (u1*)malloc((length+1)*sizeof(u1)); m_size = length + 1; }else{ m_buf = (char*)malloc(DEFAULT_PACKAGE_LENGTH+1); m_tag = (u1*)malloc((DEFAULT_PACKAGE_LENGTH+1)*sizeof(u1)); m_size = DEFAULT_PACKAGE_LENGTH+1; } m_length = 0; m_used = 0; } } /* End of namespace css */