/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* ***** BEGIN LICENSE BLOCK ***** * Version: GPL 2.0 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License. You should have * received a copy of the GPL license along with this program; if you * did not, you can find it at http://www.gnu.org/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Coreseek.com code. * * Copyright (C) 2007-2008. All Rights Reserved. * * Author: * Li monan * * ***** END LICENSE BLOCK ***** */ #ifndef _MM_THUNK_H_ #define _MM_THUNK_H_ #include #include #include "UnigramDict.h" #include "freelist.h" #define CHUNK_BUFFER_SIZE 1024 #define CHUNK_DEBUG 0 namespace css { class Chunk{ public: Chunk():m_free_score(0.0),total_length(0){} float m_free_score; int total_length; std::vector tokens; std::vector freqs; inline void pushToken(u2 len, u2 freq) { #if CHUNK_DEBUG printf("pt:%d, %d;\t",len, freq); #endif tokens.push_back(len); total_length += len; freqs.push_back(freq); //m_free_score += log((float)freq) * 100; } inline float get_free(){ //m_free_score float score = 0.0; std::vector::iterator it; float freq = 0; for(it = freqs.begin(); it < freqs.end(); it++){ freq = ((float)*it) + 1; score+= log(freq) * 100; } return score; } inline float get_avl() { float avg = (float)1.0*total_length/tokens.size(); return avg; } inline float get_avg(){ float avg = (float)1.0*total_length/tokens.size(); std::vector::iterator it; float total = 0; for(it = tokens.begin(); it < tokens.end(); it++){ float diff = ((*it) - avg); total += diff*diff; } return (float)1.0*total/(tokens.size() -1); } inline void popup() { if(tokens.size()) { total_length -= tokens[tokens.size() - 1]; tokens.pop_back(); freqs.pop_back(); } } inline void reset() { tokens.clear(); freqs.clear(); total_length = 0; } }; class ChunkQueue { public: ChunkQueue():max_length(0) {}; public: void push(Chunk& ck) { if(ck.total_length < max_length) return; //rule:1 if(ck.total_length > max_length) { max_length = ck.total_length; m_chunks.clear(); } m_chunks.push_back(ck); }; u2 getToken(){ size_t num_chunk = m_chunks.size(); if(!num_chunk) return 0; if(num_chunk == 1) return m_chunks[0].tokens[0]; //debug use->dump chunk #if CHUNK_DEBUG for(size_t i = 0; i avg_length){ avg_length = avl; k_ptr = remains; *k_ptr = (u4)i; k_ptr++; }else if(avl == avg_length){ *k_ptr = (u4)i; k_ptr++; } } if((k_ptr - remains) == 1) return m_chunks[remains[0]].tokens[0]; //match by rule2 //apply rule 3 u4 remains_r3[256]; u4* k_ptr_r3 = remains_r3; avg_length = 1024*64; //an unreachable avg for(size_t i = 0; imax_score){ max_score = score; idx = remains_r3[i]; } } return m_chunks[idx].tokens[0]; //return 0; }; inline void reset() { m_chunks.clear(); max_length = 0; }; protected: std::vector m_chunks; i4 max_length; }; class item_info { public: item_info(): //length(0), freq(0){ }; public: //u4 length; u4 freq; std::vector items; }; class MMThunk { public: MMThunk():base_offset(0), m_max_length(-1), m_length(0) { memset(m_charinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE); memset(m_kwinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE); item_list.set_size(CHUNK_BUFFER_SIZE*2); }; ~MMThunk() {}; void setItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results); void setKwItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results); void advance(u2 step) { base_offset += step; }; //peek the current token u1* peekToken(u2& length); u2 popupToken(); u1* peekKwToken(u2& pos, u2& length); u2 popupKwToken(); int Tokenize(); void pushToken(u2 aSize, i4 base); void reset(); u4 length() { return m_length; }; protected: u2 base_offset; CRFPP::FreeList item_list; item_info* m_charinfos[CHUNK_BUFFER_SIZE]; std::vector tokens; item_info* m_kwinfos[CHUNK_BUFFER_SIZE]; i4 m_kw_pos; i4 m_kw_ipos; i4 m_max_length; u4 m_length; ChunkQueue m_queue; protected: void pushChunk(Chunk& ck); }; } #endif