/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: GPL 2.0
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License. You should have
* received a copy of the GPL license along with this program; if you
* did not, you can find it at http://www.gnu.org/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Coreseek.com code.
*
* Copyright (C) 2007-2008. All Rights Reserved.
*
* Author:
* Li monan
*
* ***** END LICENSE BLOCK ***** */
#ifndef _MM_THUNK_H_
#define _MM_THUNK_H_
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <vector>

#include "UnigramDict.h"
#include "freelist.h"
// Number of slots in MMThunk's m_charinfos and m_kwinfos pointer tables;
// MMThunk also sizes its item_list to twice this value.
#define CHUNK_BUFFER_SIZE 1024
// Set to a non-zero value to compile in the printf tracing guarded by
// `#if CHUNK_DEBUG`.
#define CHUNK_DEBUG 0
namespace css {
class Chunk{
public:
Chunk():m_free_score(0.0),total_length(0){}
float m_free_score;
int total_length;
std::vector tokens;
std::vector freqs;
inline void pushToken(u2 len, u2 freq) {
#if CHUNK_DEBUG
printf("pt:%d, %d;\t",len, freq);
#endif
tokens.push_back(len);
total_length += len;
freqs.push_back(freq);
//m_free_score += log((float)freq) * 100;
}
inline float get_free(){
//m_free_score
float score = 0.0;
std::vector::iterator it;
float freq = 0;
for(it = freqs.begin(); it < freqs.end(); it++){
freq = ((float)*it) + 1;
score+= log(freq) * 100;
}
return score;
}
inline float get_avl() {
float avg = (float)1.0*total_length/tokens.size();
return avg;
}
inline float get_avg(){
float avg = (float)1.0*total_length/tokens.size();
std::vector::iterator it;
float total = 0;
for(it = tokens.begin(); it < tokens.end(); it++){
float diff = ((*it) - avg);
total += diff*diff;
}
return (float)1.0*total/(tokens.size() -1);
}
inline void popup() {
if(tokens.size()) {
total_length -= tokens[tokens.size() - 1];
tokens.pop_back();
freqs.pop_back();
}
}
inline void reset() {
tokens.clear();
freqs.clear();
total_length = 0;
}
};
// Holds candidate chunks and selects one of them.
// push() keeps only the chunks whose total_length equals the largest value
// offered so far; getToken() returns the first token of the selected chunk.
//
// NOTE(review): the text of getToken() below is damaged (see the notes inside
// it) and must be restored from version control before this class can be
// compiled or modified safely.
class ChunkQueue
{
public:
ChunkQueue():max_length(0) {};
public:
// Offers a candidate chunk; the chunk is copied into the queue.
// A chunk shorter than the current maximum is ignored. A chunk longer than the
// current maximum becomes the new maximum and discards everything stored so
// far. A chunk of equal length is appended.
// NOTE(review): a chunk with total_length 0 is accepted while max_length is 0,
// and getToken() then reads tokens[0] from it — confirm callers never push an
// empty chunk.
void push(Chunk& ck) {
if(ck.total_length < max_length)
return; //rule:1
if(ck.total_length > max_length) {
max_length = ck.total_length;
m_chunks.clear();
}
m_chunks.push_back(ck);
};
// Returns the first token of the selected chunk.
// Returns 0 when the queue is empty, and the first token of the only chunk when
// exactly one chunk is stored. With several chunks, tie-breaking rules are
// applied in sequence (only partly visible here, see below).
u2 getToken(){
size_t num_chunk = m_chunks.size();
if(!num_chunk)
return 0;
if(num_chunk == 1)
return m_chunks[0].tokens[0];
//debug use->dump chunk
// NOTE(review): text is missing starting in the `for` header below. The header
// runs straight into the body of a tie-breaking branch, this `#if` has no
// matching `#endif` inside the function, and `remains`, `k_ptr`, `avg_length`
// and `avl` are used without any visible declaration.
#if CHUNK_DEBUG
for(size_t i = 0; i avg_length){
// Visible part of the rule 2 selection: a strictly larger `avl` restarts the
// candidate list with index i ...
avg_length = avl;
k_ptr = remains;
*k_ptr = (u4)i;
k_ptr++;
}else
// ... and an equal `avl` appends index i to the list.
if(avl == avg_length){
*k_ptr = (u4)i;
k_ptr++;
}
}
// Exactly one candidate left: it is the answer.
if((k_ptr - remains) == 1)
return m_chunks[remains[0]].tokens[0]; //match by rule2
//apply rule 3
// Fixed-capacity index list for the candidates that survive rule 3.
u4 remains_r3[256];
u4* k_ptr_r3 = remains_r3;
avg_length = 1024*64; //an unreachable avg
// NOTE(review): text is missing in the `for` header below as well. The rule 3
// loop and the start of the following loop are gone; `max_score`, `score` and
// `idx` are used without any visible declaration.
for(size_t i = 0; imax_score){
// Visible part of the last selection: remember the candidate with the largest
// score seen so far.
max_score = score;
idx = remains_r3[i];
}
}
return m_chunks[idx].tokens[0];
//return 0;
};
// Empties the queue and resets the maximum length to 0.
inline void reset() {
m_chunks.clear();
max_length = 0;
};
protected:
// Stored candidates.
// NOTE(review): the element type is missing from this declaration; push()
// stores Chunk objects, so presumably std::vector<Chunk> — confirm.
std::vector m_chunks;
// Largest total_length among the chunks offered since the last reset().
i4 max_length;
};
// Record holding a frequency value and a list of items. MMThunk keeps arrays
// of pointers to these records (m_charinfos, m_kwinfos).
class item_info
{
public:
// Starts with freq set to 0 and an empty item list.
item_info():
//length(0),
freq(0){
};
public:
//u4 length;
// Frequency value; 0 until assigned.
u4 freq;
// NOTE(review): the element type is missing from this declaration and cannot
// be determined from this header alone — restore it from version control.
std::vector items;
};
class MMThunk
{
public:
MMThunk():base_offset(0), m_max_length(-1), m_length(0)
{
memset(m_charinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE);
memset(m_kwinfos, 0, sizeof(item_info*)*CHUNK_BUFFER_SIZE);
item_list.set_size(CHUNK_BUFFER_SIZE*2);
};
~MMThunk() {};
void setItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results);
void setKwItems(i4 idx, u2 rs_count, UnigramDict::result_pair_type* results);
void advance(u2 step) { base_offset += step; };
//peek the current token
u1* peekToken(u2& length);
u2 popupToken();
u1* peekKwToken(u2& pos, u2& length);
u2 popupKwToken();
int Tokenize();
void pushToken(u2 aSize, i4 base);
void reset();
u4 length() { return m_length; };
protected:
u2 base_offset;
CRFPP::FreeList item_list;
item_info* m_charinfos[CHUNK_BUFFER_SIZE];
std::vector tokens;
item_info* m_kwinfos[CHUNK_BUFFER_SIZE];
i4 m_kw_pos;
i4 m_kw_ipos;
i4 m_max_length;
u4 m_length;
ChunkQueue m_queue;
protected:
void pushChunk(Chunk& ck);
};
}
#endif