//
// $Id$
//

//
// Copyright (c) 2001-2011, Andrew Aksyonoff
// Copyright (c) 2008-2011, Sphinx Technologies Inc
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//

#include "sphinx.h"
#include "sphinxexpr.h"
#include "sphinxutils.h"
#include "sphinxquery.h"
#include "sphinxrt.h"
#include "sphinxint.h"
#include "sphinxstem.h"
#include <math.h>

#define SNOWBALL 0
#define CROSSCHECK 0
#define PORTER1 0

#if SNOWBALL
#include "header.h"
#include "api.c"
#include "utilities.c"
#include "stem.c"
#endif

#if PORTER1
#include "porter1.c"
#endif

//////////////////////////////////////////////////////////////////////////

const char * g_sTmpfile = "__libsphinxtest.tmp";

//////////////////////////////////////////////////////////////////////////

bool CreateSynonymsFile ( const char * sMagic )
{
	FILE * fp = fopen ( g_sTmpfile, "w+" );
	if ( !fp )
		return false;

	fprintf ( fp,
		"AT&T      => AT&T\n"
		"   AT & T => AT & T  \n"
		"standarten fuehrer => Standartenfuehrer\n"
		"standarten fuhrer  => Standartenfuehrer\n"
		"OS/2 => OS/2\n"
		"Ms-Dos => MS-DOS\n"
		"MS DOS => MS-DOS\n"
		"feat. => featuring\n"
		"U.S. => US\n"
		"U.S.A. => USA\n"
		"U.S.B. => USB\n"
		"U.S.D. => USD\n"
		"U.S.P. => USP\n"
		"U.S.A.F. => USAF\n"
		);
	if ( sMagic )
		fprintf ( fp, "%s => test\n", sMagic );
	fclose ( fp );
	return true;
}


ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms, bool bEscaped = false )
{
	CSphString sError;
	CSphTokenizerSettings tSettings;
	tSettings.m_iType = bUTF8 ? TOKENIZER_UTF8 : TOKENIZER_SBCS;
	tSettings.m_iMinWordLen = 2;
	ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tSettings, sError );
	assert ( pTokenizer->SetCaseFolding ( "-, 0..9, A..Z->a..z, _, a..z, U+80..U+FF", sError ) );
	pTokenizer->AddSpecials ( "!-" );
	pTokenizer->EnableQueryParserMode ( true );
	if ( bSynonyms )
		assert ( pTokenizer->LoadSynonyms ( g_sTmpfile, sError ) );

	if ( bEscaped )
	{
		ISphTokenizer * pOldTokenizer = pTokenizer;
		pTokenizer = pTokenizer->Clone ( true );
		SafeDelete ( pOldTokenizer );
	}

	return pTokenizer;
}


void TestTokenizer ( bool bUTF8 )
{
	const char * sPrefix = bUTF8
		? "testing UTF8 tokenizer"
		: "testing SBCS tokenizer";

	for ( int iRun=1; iRun<=3; iRun++ )
	{
		// simple "one-line" tests
		const char * sMagic = bUTF8
			? "\xD1\x82\xD0\xB5\xD1\x81\xD1\x82\xD1\x82\xD1\x82" // valid UTF-8
			: "\xC0\xC1\xF5\xF6"; // valid SBCS but invalid UTF-8

		assert ( CreateSynonymsFile ( sMagic ) );
		bool bExceptions = ( iRun>=2 );
		bool bEscaped = ( iRun==3 );
		ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, bExceptions, bEscaped );

		const char * dTests[] =
		{
			"1", "",							NULL,								// test that empty strings work
			"1", "this is my rifle",			"this", "is", "my", "rifle", NULL,	// test that tokenizing works
			"1", "This is MY rifle",			"this", "is", "my", "rifle", NULL,	// test that folding works
			"1", "i-phone",						"i-phone", NULL,					// test that duals (specials in the middle of the word) work ok
			"1", "i phone",						"phone", NULL,						// test that short words are skipped
			"1", "this is m",					"this", "is", NULL,					// test that short words at the end are skipped
			"1", "the -phone",					"the", "-", "phone", NULL,			// test that specials work
			"1", "the!phone",					"the", "!", "phone", NULL,			// test that specials work
			"1", "i!phone",						"!", "phone", NULL,					// test that short words preceding specials are skipped
			"1", "/-hi",						"-", "hi", NULL,					// test that synonym-dual but folded-special chars work ok
			"2", "AT&T",						"AT&T", NULL,						// test that synonyms work
			"2", "AT & T",						"AT & T", NULL,						// test that synonyms with spaces work
			"2", "AT    &  T",					"AT & T", NULL,						// test that synonyms with continuous spaces work
			"2", "-AT&T",						"-", "AT&T", NULL,					// test that synonyms with specials work
			"2", "AT&",							"at", NULL,							// test that synonyms prefixes are not lost on eof
			"2", "AT&tee.yo",					"at", "tee", "yo", NULL,			// test that non-synonyms with partially matching prefixes work
			"2", "standarten fuehrer",			"Standartenfuehrer", NULL,
			"2", "standarten fuhrer",			"Standartenfuehrer", NULL,
			"2", "standarten fuehrerr",			"standarten", "fuehrerr", NULL,
			"2", "standarten fuehrer Stirlitz",	"Standartenfuehrer", "stirlitz", NULL,
			"2", "OS/2 vs OS/360 vs Ms-Dos",	"OS/2", "vs", "os", "360", "vs", "MS-DOS", NULL,
			"2", "AT ",							"at", NULL,							// test that prefix-whitespace-eof combo does not hang
			"2", "AT&T&TT",						"AT&T", "tt", NULL,
			"2", "http://OS/2",					"http", "OS/2", NULL,
			"2", "AT*&*T",						"at", NULL,
			"2", "# OS/2's system install",		"OS/2", "system", "install", NULL,
			"2", "IBM-s/OS/2/Merlin",			"ibm-s", "OS/2", "merlin", NULL,
			"2", "U.S.A",						"US", NULL,
			"2", "AT&T!",						"AT&T", "!", NULL,					// exceptions vs specials
			"2", "AT&T!!!",						"AT&T", "!", "!", "!", NULL,		// exceptions vs specials
			"2", "U.S.A.!",						"USA", "!", NULL,					// exceptions vs specials
			"2", "MS DOSS feat.Deskview.MS DOS",			"ms", "doss", "featuring", "deskview", "MS-DOS", NULL,
			"2", sMagic,									"test", NULL,
			"2", "U.S. U.S.A. U.S.A.F.",					"US", "USA", "USAF", NULL,
			"2", "U.S.AB U.S.A. U.S.B.U.S.D.U.S.U.S.A.F.",	"US", "ab", "USA", "USB", "USD", "US", "USAF", NULL,
			"3", "phon\\e",						"phone", NULL,
			"3", "\\thephone",					"thephone", NULL,
			"3", "the\\!phone",					"the", "phone", NULL,
			"3", "\\!phone",					"phone", NULL,
			"3", "\\\\phone",					"phone", NULL,						// the correct behavior if '\' is not in charset
			"3", "pho\\\\ne",					"pho", "ne", NULL,
			"3", "phon\\\\e",					"phon", NULL,
			"3", "trailing\\",					"trailing", NULL,
			NULL
		};

		for ( int iCur=0; dTests[iCur] && atoi ( dTests[iCur++] )<=iRun; )
		{
			printf ( "%s, run=%d, line=%s\n", sPrefix, iRun, dTests[iCur] );
			pTokenizer->SetBuffer ( (BYTE*)dTests[iCur], strlen ( dTests[iCur] ) );
			iCur++;

			for ( BYTE * pToken=pTokenizer->GetToken(); pToken; pToken=pTokenizer->GetToken() )
			{
				assert ( dTests[iCur] && strcmp ( (const char*)pToken, dTests[iCur] )==0 );
				iCur++;
			}

			assert ( dTests[iCur]==NULL );
			iCur++;
		}

		// test misc SBCS-only and UTF8-only one-liners
		const char * dTests2[] =
		{
			"0", "\x80\x81\x82",				"\x80\x81\x82", NULL,
			"1", "\xC2\x80\xC2\x81\xC2\x82",	"\xC2\x80\xC2\x81\xC2\x82", NULL,
			NULL
		};

		for ( int iCur=0; dTests2[iCur] && atoi ( dTests2[iCur++] )==int(bUTF8); )
		{
			printf ( "%s, run=%d, line=%s\n", sPrefix, iRun, dTests2[iCur] );
			pTokenizer->SetBuffer ( (BYTE*)dTests2[iCur], strlen ( dTests2[iCur] ) );
			iCur++;

			for ( BYTE * pToken=pTokenizer->GetToken(); pToken; pToken=pTokenizer->GetToken() )
			{
				assert ( dTests2[iCur] && strcmp ( (const char*)pToken, dTests2[iCur] )==0 );
				iCur++;
			}

			assert ( dTests2[iCur]==NULL );
			iCur++;
		}


		// test that decoder does not go over the buffer boundary on errors in UTF-8
		if ( bUTF8 )
		{
			printf ( "%s for proper UTF-8 error handling\n", sPrefix );
			const char * sLine3 = "hi\xd0\xffh";

			pTokenizer->SetBuffer ( (BYTE*)sLine3, 4 );
			assert ( !strcmp ( (char*)pTokenizer->GetToken(), "hi" ) );
		}

		// test uberlong tokens
		printf ( "%s for uberlong token handling\n", sPrefix );

		const int UBERLONG = 4096;
		char * sLine4 = new char [ UBERLONG+1 ];
		memset ( sLine4, 'a', UBERLONG );
		sLine4[UBERLONG] = '\0';

		char sTok4[SPH_MAX_WORD_LEN+1];
		memset ( sTok4, 'a', SPH_MAX_WORD_LEN );
		sTok4[SPH_MAX_WORD_LEN] = '\0';

		pTokenizer->SetBuffer ( (BYTE*)sLine4, strlen(sLine4) );
		assert ( !strcmp ( (char*)pTokenizer->GetToken(), sTok4 ) );
		assert ( pTokenizer->GetToken()==NULL );

		// test short word callbacks
		printf ( "%s for short token handling\n", sPrefix );
		ISphTokenizer * pShortTokenizer = pTokenizer->Clone ( bEscaped );

		CSphRemapRange tStar ( '*', '*', '*' );
		pShortTokenizer->AddCaseFolding ( tStar );

		CSphTokenizerSettings tSettings = pShortTokenizer->GetSettings();
		tSettings.m_iMinWordLen = 5;
		pShortTokenizer->Setup ( tSettings );

		pShortTokenizer->EnableQueryParserMode ( true );

		const char * dTestsShort[] =
		{
			"ab*",		"ab*",		NULL,
			"*ab",		"*ab",		NULL,
			"abcdef",	"abcdef",	NULL,
			"ab *ab* abc", "*ab*",	NULL,
			NULL
		};

		for ( int iCur=0; dTestsShort[iCur]; )
		{
			pShortTokenizer->SetBuffer ( (BYTE*)(dTestsShort [iCur]), strlen ( (const char*)dTestsShort [iCur] ) );
			iCur++;
			for ( BYTE * pToken=pShortTokenizer->GetToken(); pToken; pToken=pShortTokenizer->GetToken() )
			{
				assert ( dTestsShort[iCur] && strcmp ( (const char*)pToken, dTestsShort[iCur] )==0 );
				iCur++;
			}

			assert ( !dTestsShort [iCur] );
			iCur++;
		}

		SafeDelete ( pShortTokenizer );

		// test uberlong synonym-only tokens
		if ( iRun==2 )
		{
			printf ( "%s for uberlong synonym-only char token handling\n", sPrefix );

			memset ( sLine4, '/', UBERLONG );
			sLine4[UBERLONG] = '\0';

			pTokenizer->SetBuffer ( (BYTE*)sLine4, strlen(sLine4) );
			assert ( pTokenizer->GetToken()==NULL );

			printf ( "%s for uberlong synonym token handling\n", sPrefix );

			for ( int i=0; i<UBERLONG-3; i+=3 )
			{
				sLine4[i+0] = 'a';
				sLine4[i+1] = 'a';
				sLine4[i+2] = '/';
				sLine4[i+3] = '\0';
			}

			pTokenizer->SetBuffer ( (BYTE*)sLine4, strlen(sLine4) );
			for ( int i=0; i<UBERLONG-3; i+=3 )
				assert ( !strcmp ( (char*)pTokenizer->GetToken(), "aa" ) );
			assert ( pTokenizer->GetToken()==NULL );
		}

		SafeDeleteArray ( sLine4 );

		// test boundaries
		printf ( "%s for boundaries handling, run=%d\n", sPrefix, iRun );

		CSphString sError;
		assert ( pTokenizer->SetBoundary ( "?", sError ) );

		char sLine5[] = "hello world? testing boundaries?";
		pTokenizer->SetBuffer ( (BYTE*)sLine5, strlen(sLine5) );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "hello" ) ); assert ( !pTokenizer->GetBoundary() );
		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "world" ) ); assert ( !pTokenizer->GetBoundary() );
		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "testing" ) ); assert ( pTokenizer->GetBoundary() );
		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "boundaries" ) ); assert ( !pTokenizer->GetBoundary() );

		// test specials vs token start/end ptrs
		printf ( "%s vs specials vs token start/end ptrs\n", sPrefix );

		char sLine6[] = "abc!def";
		pTokenizer->SetBuffer ( (BYTE*)sLine6, strlen(sLine6) );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "abc" ) );
		assert ( *pTokenizer->GetTokenStart()=='a' );
		assert ( *pTokenizer->GetTokenEnd()=='!' );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "!" ) );
		assert ( *pTokenizer->GetTokenStart()=='!' );
		assert ( *pTokenizer->GetTokenEnd()=='d' );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "def" ) );
		assert ( *pTokenizer->GetTokenStart()=='d' );
		assert ( *pTokenizer->GetTokenEnd()=='\0' );

		// done
		SafeDelete ( pTokenizer );
	}

	// test blended
	printf ( "%s vs escaping vs blend_chars edge cases\n", sPrefix );

	CSphString sError;
	ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, false, true ); // no exceptions, but escaped
	pTokenizer->AddSpecials ( "()!-\"" );
	assert ( pTokenizer->SetBlendChars ( ".", sError ) );

	char sTest1[] = "(texas.\\\")";
	pTokenizer->SetBuffer ( (BYTE*)sTest1, strlen(sTest1) );

	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "(" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "texas." ) );
	assert ( pTokenizer->TokenIsBlended() );
	pTokenizer->SkipBlended ();
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), ")" ) );
	assert ( pTokenizer->GetToken()==NULL );

	char sTest2[] = "\"series 2003\\-\\\"\"";
	printf ( "test %s\n", sTest2 );
	pTokenizer->SetBuffer ( (BYTE*)sTest2, strlen(sTest2) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "\"" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "series" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "2003-" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "\"" ) );
	assert ( pTokenizer->GetToken()==NULL );

	char sTest3[] = "aa lock.up bb";
	printf ( "test %s\n", sTest3 );
	pTokenizer->SetBuffer ( (BYTE*)sTest3, strlen(sTest3) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "aa" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( !pTokenizer->TokenIsBlendedPart() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "lock.up" ) );
	assert ( pTokenizer->TokenIsBlended() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "lock" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( pTokenizer->TokenIsBlendedPart() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "up" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( pTokenizer->TokenIsBlendedPart() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "bb" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( !pTokenizer->TokenIsBlendedPart() );

	SafeDelete ( pTokenizer );
}


void BenchTokenizer ( bool bUTF8 )
{
	printf ( "benchmarking %s tokenizer\n", bUTF8 ? "UTF8" : "SBCS" );
	if ( !CreateSynonymsFile ( NULL ) )
	{
		printf ( "benchmark failed: error writing temp synonyms file\n" );
		return;
	}


	const char * sTestfile = "./configure";
	for ( int iRun=1; iRun<=2; iRun++ )
	{
		FILE * fp = fopen ( sTestfile, "rb" );
		if ( !fp )
		{
			printf ( "benchmark failed: error opening %s\n", sTestfile );
			return;
		}
		const int MAX_DATA = 10485760;
		char * sData = new char [ MAX_DATA ];
		int iData = fread ( sData, 1, MAX_DATA, fp );
		fclose ( fp );
		if ( iData<=0 )
		{
			printf ( "benchmark failed: error reading %s\n", sTestfile );
			SafeDeleteArray ( sData );
			return;
		}

		CSphString sError;
		ISphTokenizer * pTokenizer = bUTF8 ? sphCreateUTF8Tokenizer () : sphCreateSBCSTokenizer ();
		pTokenizer->SetCaseFolding ( "-, 0..9, A..Z->a..z, _, a..z", sError );
		if ( iRun==2 )
			pTokenizer->LoadSynonyms ( g_sTmpfile, sError );
		pTokenizer->AddSpecials ( "!-" );

		const int iPasses = 10;
		int iTokens = 0;

		int64_t tmTime = -sphMicroTimer();
		for ( int iPass=0; iPass<iPasses; iPass++ )
		{
			pTokenizer->SetBuffer ( (BYTE*)sData, iData );
			while ( pTokenizer->GetToken() ) iTokens++;
		}
		tmTime += sphMicroTimer();

		iTokens /= iPasses;
		tmTime /= iPasses;

		printf ( "run %d: %d bytes, %d tokens, %d.%03d ms, %.3f MB/sec\n", iRun, iData, iTokens, (int)(tmTime/1000), (int)(tmTime%1000), float(iData)/tmTime );
		SafeDeleteArray ( sData );
	}
}

//////////////////////////////////////////////////////////////////////////

void TestStripper ()
{
	const char * sTests[][4] =
	{
		// source-data, index-attrs, remove-elements, expected-results
		{ "<html>trivial test</html>", "", "", " trivial test " },
		{ "<html>lets <img src=\"g/smth.jpg\" alt=\"nice picture\">index attrs</html>", "img=alt", "", " lets nice picture index attrs " },
		{ "<html>   lets  also<script> whatever here; a<b</script>remove scripts", "", "script, style", " lets also remove scripts" },
		{ "testing in<b><font color='red'>line</font> ele<em>men</em>ts", "", "", "testing inline elements" },
		{ "testing non<p>inline</h1>elements", "", "", "testing non inline elements" },
		{ "testing&nbsp;entities&amp;stuff", "", "", "testing entities&stuff" },
		{ "testing &#1040;&#1041;&#1042; utf encoding", "", "", "testing \xD0\x90\xD0\x91\xD0\x92 utf encoding" },
		{ "testing <1 <\" <\x80 <\xe0 <\xff </3 malformed tags", "", "", "testing <1 <\" <\x80 <\xe0 <\xff </3 malformed tags" },
		{ "testing comm<!--comm-->ents", "", "", "testing comments" },
		{ "&lt; &gt; &thetasym; &somethingverylong; &the", "", "", "< > \xCF\x91 &somethingverylong; &the" },
		{ "testing <img src=\"g/smth.jpg\" alt=\"nice picture\" rel=anotherattr junk=throwaway>inline tags vs attr indexing", "img=alt,rel", "", "testing nice picture anotherattr inline tags vs attr indexing" },
		{ "this <?php $code = \"must be stripped\"; ?> away", "", "", "this away" },
		{ "<a href=\"http://www.com\">content1</a>", "a=title", "", "content1" },
		{ "<a href=\"http://www.com\" title=\"my test title\">content2</a>", "a=title", "", "my test title content2" },
		{ "testing <img src=\"g/smth.jpg\" alt=\"nice picture\" rel=anotherattr junk=\"throwaway\">inline tags vs attr indexing", "img=alt,rel", "", "testing nice picture anotherattr inline tags vs attr indexing" },
		{ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html>test</html>", "", "", " test " },
		{ "<!smth \"that>can<break\"><html>test</html>", "", "", " test " },
		{ "<TABLE CLASS=\"MSONORMALTABLE\" STYLE=\"BORDER-COLLAPSE: COLLAPSE; MARGIN-LEFT: ID=\"TABLE76\"><TR><TD>ohai</TD></TR></TABLE>", "", "", " ohai " },
		{ "ohai2<table class", "", "", "ohai2 " },
		{ "ohai<table class>3", "", "", "ohai 3" },
		{ "ohai<table class >4", "", "", "ohai 4" },
		{ "ohai<table class =>5", "", "", "ohai 5" },
		{ "ohai<table class =\"smth><tr><td>6</td></tr></table> some more content", "", "", "ohai 6 some more content" },
		{ "ohai<table nowrap class=\"a>b\">7", "", "", "ohai 7" },
		{ "ohai<table nowrap class =\"a>b\">8", "", "", "ohai 8" },
		{ "ohai<table nowrap class= \"a>b\">9", "", "", "ohai 9" },
		{ "ohai<table now rap class=\"a>b\">10", "", "", "ohai 10" },
		{ "ohai<table class = \"smth><tr><td>6</td><td class=\"test\">11</td></tr></table> gimme more", "", "", "ohai 11 gimme more" },
		{ "<P ALIGN=\"LEFT STYLE=\"MARGIN:0IN 0IN .0001PT;TEXT-ALIGN:LEFT;\"><B><FONT SIZE=\"2\" FACE=\"TIMES NEW ROMAN\" STYLE=\"FONT-SIZE:10.0PT;FONT-WEIGHT:BOLD;\">Commission File Number: 333-155507", "", "", " Commission File Number: 333-155507" },
		{ "<TD NOWRAP ALIGN=RIGHT STYLE=\"BORDER-BOTTOM: #000000 1PX SOLID; BORDER-TOP: #000000 1PX SOLID;\"\"><B>SGX", "", "", " SGX" },
		{ "tango & cash", "", "", "tango & cash" },
		{ "<font CLASS=\"MSONORMALTABLE\" STYLE=\"BORDER-COLLAPSE: COLLAPSE; MARGIN-LEFT: ID=\"TABLE76\">ahoy\"mate", "font=zzz", "", "ahoy\"mate" },
		{ "ahoy<font class =>2", "font=zzz", "", "ahoy2" },
		{ "ahoy<font class =\"smth><b>3</b></font>there", "font=zzz", "", "ahoy3there" },
		{ "ahoy<font nowrap class=\"a>b\">4", "font=zzz", "", "ahoy4" },
		{ "ahoy<font now rap class=\"a>b\">5", "font=zzz", "", "ahoy5" },
		{ "ahoy<font class = \"smth><b><i>6</i><b class=\"test\">seven</b></i></font>eight", "font=zzz", "", "ahoyseveneight" }
	};

	int nTests = (int)(sizeof(sTests)/sizeof(sTests[0]));
	for ( int iTest=0; iTest<nTests; iTest++ )
	{
		printf ( "testing HTML stripper, test %d/%d... ", 1+iTest, nTests );

		CSphString sError;
		CSphHTMLStripper tStripper ( true );
		assert ( tStripper.SetIndexedAttrs ( sTests[iTest][1], sError ) );
		assert ( tStripper.SetRemovedElements ( sTests[iTest][2], sError ) );

		CSphString sBuf ( sTests[iTest][0] );
		tStripper.Strip ( (BYTE*)sBuf.cstr() );
		assert ( strcmp ( sBuf.cstr(), sTests[iTest][3] )==0 );

		printf ( "ok\n" );
	}
}

void BenchStripper ()
{
	printf ( "benchmarking HTML stripper\n" );

	FILE * fp = fopen ( "doc/sphinx.html", "rb" );
	if ( !fp )
	{
		printf ( "benchmark failed: unable to read doc/sphinx.html\n" );
		return;
	}

	const int MAX_SIZE = 1048576;
	char * sBuf = new char [ MAX_SIZE ];
	int iLen = fread ( sBuf, 1, MAX_SIZE-1, fp );
	fclose ( fp );

	char * sRef = new char [ MAX_SIZE ];
	sBuf[iLen] = '\0';
	memcpy ( sRef, sBuf, iLen+1 );

	for ( int iRun=0; iRun<2; iRun++ )
	{
		CSphString sError;
		CSphHTMLStripper tStripper ( true );
		if ( iRun==1 )
			tStripper.SetRemovedElements ( "style, script", sError );

		const int iPasses = 50;
		int64_t tmTime = -sphMicroTimer();
		for ( int iPass=0; iPass<iPasses; iPass++ )
		{
			tStripper.Strip ( (BYTE*)sBuf );
			memcpy ( sBuf, sRef, iLen+1 );
		}
		tmTime += sphMicroTimer();

		tmTime /= iPasses;
		printf ( "run %d: %d bytes, %d.%03d ms, %.3f MB/sec\n", iRun, iLen, (int)(tmTime/1000), (int)(tmTime%1000), float(iLen)/tmTime );
	}

	SafeDeleteArray ( sBuf );
	SafeDeleteArray ( sRef );
}

//////////////////////////////////////////////////////////////////////////

void TestExpr ()
{
	CSphColumnInfo tCol;
	tCol.m_eAttrType = SPH_ATTR_INTEGER;

	CSphSchema tSchema;
	tCol.m_sName = "aaa"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "bbb"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "ccc"; tSchema.AddAttr ( tCol, false );

	CSphRowitem * pRow = new CSphRowitem [ tSchema.GetRowSize() ];
	for ( int i=0; i<tSchema.GetRowSize(); i++ )
		pRow[i] = 1+i;

	CSphMatch tMatch;
	tMatch.m_iDocID = 123;
	tMatch.m_iWeight = 456;
	tMatch.m_pStatic = pRow;

	struct ExprTest_t
	{
		const char *	m_sExpr;
		float			m_fValue;
	};
	ExprTest_t dTests[] =
	{
		{ "ccc/2",							1.5f },
		{ "1*2*3*4*5*6*7*8*9*10",			3628800.0f },
		{ "aaa+bbb*sin(0)*ccc",				1.0f },
		{ "if(pow(sqrt(2),2)=2,123,456)",	123.0f },
		{ "if(2<2,3,4)",					4.0f },
		{ "if(2>=2,3,4)",					3.0f },
		{ "pow(7,5)",						16807.f },
		{ "sqrt(3)",						1.7320508f },
		{ "log2((2+2)*(2+2))",				4.0f },
		{ "min(3,15)",						3.0f },
		{ "max(3,15)",						15.0f },
		{ "if(3<15,bbb,ccc)",				2.0f },
		{ "@id+@weight",					579.0f },
		{ "abs(-3-ccc)",					6.0f },
		{ "(aaa+bbb)*(ccc-aaa)",			6.0f },
		{ "(((aaa)))",						1.0f },
		{ "aaa-bbb*ccc",					-5.0f },
		{ " aaa    -\tbbb *\t\t\tccc ",		-5.0f },
		{ "bbb+123*aaa",					125.0f },
		{ "2.000*2e+1+2",					42.0f },
		{ "3<5",							1.0f },
		{ "1 + 2*3 > 4*4",					0.0f },
		{ "aaa/-bbb",						-0.5f, },
		{ "-10*-10",						100.0f },
		{ "aaa+-bbb*-5",					11.0f },
		{ "-aaa>-bbb",						1.0f },
		{ "1-aaa+2-3+4",					3.0f },
		{ "bbb/1*2/6*3",					2.0f },
		{ "(aaa+bbb)/sqrt(3)/sqrt(3)",		1.0f },
		{ "aaa-bbb-2",						-3.0f },
		{ "ccc/2*4/bbb",					3.0f },
		{ "(2+(aaa*bbb))+3",				7.0f }
	};

	const int nTests = sizeof(dTests)/sizeof(dTests[0]);
	for ( int iTest=0; iTest<nTests; iTest++ )
	{
		printf ( "testing expression evaluation, test %d/%d... ", 1+iTest, nTests );

		CSphString sError;
		CSphScopedPtr<ISphExpr> pExpr ( sphExprParse ( dTests[iTest].m_sExpr, tSchema, NULL, NULL, sError ) );
		if ( !pExpr.Ptr() )
		{
			printf ( "FAILED; %s\n", sError.cstr() );
			assert ( 0 );
		}

		float fValue = pExpr->Eval(tMatch);
		if ( fabs ( fValue - dTests[iTest].m_fValue )>=0.0001f )
		{
			printf ( "FAILED; expected %.3f, got %.3f\n", dTests[iTest].m_fValue, fValue );
			assert ( 0 );
		}

		printf ( "ok\n" );
	}

	SafeDeleteArray ( pRow );
}


#if USE_WINDOWS
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE
#endif

#define AAA float(tMatch.m_pStatic[0])
#define BBB float(tMatch.m_pStatic[1])
#define CCC float(tMatch.m_pStatic[2])

NOINLINE float ExprNative1 ( const CSphMatch & tMatch )	{ return AAA+BBB*CCC-1.0f;}
NOINLINE float ExprNative2 ( const CSphMatch & tMatch )	{ return AAA+BBB*CCC*2.0f-3.0f/4.0f*5.0f/6.0f*BBB; }
NOINLINE float ExprNative3 ( const CSphMatch & )		{ return (float)sqrt ( 2.0f ); }


void BenchExpr ()
{
	printf ( "benchmarking expressions\n" );

	CSphColumnInfo tCol;
	tCol.m_eAttrType = SPH_ATTR_INTEGER;

	CSphSchema tSchema;
	tCol.m_sName = "aaa"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "bbb"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "ccc"; tSchema.AddAttr ( tCol, false );

	CSphRowitem * pRow = new CSphRowitem [ tSchema.GetRowSize() ];
	for ( int i=0; i<tSchema.GetRowSize(); i++ )
		pRow[i] = 1+i;

	CSphMatch tMatch;
	tMatch.m_iDocID = 123;
	tMatch.m_iWeight = 456;
	tMatch.m_pStatic = pRow;

	struct ExprBench_t
	{
		const char *	m_sExpr;
		float			( *m_pFunc )( const CSphMatch & );
	};
	ExprBench_t dBench[] =
	{
		{ "aaa+bbb*(ccc)-1",				ExprNative1 },
		{ "aaa+bbb*ccc*2-3/4*5/6*bbb",		ExprNative2 },
		{ "sqrt(2)",						ExprNative3 }
	};

	for ( int iRun=0; iRun<int(sizeof(dBench)/sizeof(dBench[0])); iRun++ )
	{
		printf ( "run %d: ", iRun+1 );

		ESphAttr uType;
		CSphString sError;
		CSphScopedPtr<ISphExpr> pExpr ( sphExprParse ( dBench[iRun].m_sExpr, tSchema, &uType, NULL, sError ) );
		if ( !pExpr.Ptr() )
		{
			printf ( "FAILED; %s\n", sError.cstr() );
			return;
		}

		const int NRUNS = 1000000;

		volatile float fValue = 0.0f;
		int64_t tmTime = sphMicroTimer();
		for ( int i=0; i<NRUNS; i++ ) fValue += pExpr->Eval(tMatch);
		tmTime = sphMicroTimer() - tmTime;

		int64_t tmTimeInt = sphMicroTimer();
		if ( uType==SPH_ATTR_INTEGER )
		{
			int uValue = 0;
			for ( int i=0; i<NRUNS; i++ ) uValue += pExpr->IntEval(tMatch);
		}
		tmTimeInt = sphMicroTimer() - tmTimeInt;

		int64_t tmTimeNative = sphMicroTimer();
		for ( int i=0; i<NRUNS; i++ ) fValue += dBench[iRun].m_pFunc ( tMatch );
		tmTimeNative = sphMicroTimer() - tmTimeNative;

		if ( uType==SPH_ATTR_INTEGER )
			printf ( "int-eval %.1fM/sec, ", float(NRUNS)/tmTimeInt );

		printf ( "flt-eval %.1fM/sec, native %.1fM/sec\n",
			float(NRUNS)/tmTime,
			float(NRUNS)/tmTimeNative );
	}

	SafeDeleteArray ( pRow );
}

//////////////////////////////////////////////////////////////////////////

CSphString ReconstructNode ( const XQNode_t * pNode, const CSphSchema & tSchema )
{
	CSphString sRes ( "" );

	if ( !pNode )
		return sRes;

	if ( pNode->m_dWords.GetLength() )
	{
		// say just words to me
		const CSphVector<XQKeyword_t> & dWords = pNode->m_dWords;
		ARRAY_FOREACH ( i, dWords )
			sRes.SetSprintf ( "%s %s", sRes.cstr(), dWords[i].m_sWord.cstr() );
		sRes.Chop ();

		switch ( pNode->GetOp() )
		{
			case SPH_QUERY_AND:			break;
			case SPH_QUERY_PHRASE:		sRes.SetSprintf ( "\"%s\"", sRes.cstr() ); break;
			case SPH_QUERY_PROXIMITY:	sRes.SetSprintf ( "\"%s\"~%d", sRes.cstr(), pNode->m_iOpArg ); break;
			case SPH_QUERY_QUORUM:		sRes.SetSprintf ( "\"%s\"/%d", sRes.cstr(), pNode->m_iOpArg ); break;
			case SPH_QUERY_NEAR:		sRes.SetSprintf ( "\"%s\"NEAR/%d", sRes.cstr(), pNode->m_iOpArg ); break;
			default:					assert ( 0 && "unexpected op in ReconstructNode()" ); break;
		}

		if ( !pNode->m_dFieldMask.TestAll(true) )
		{
			CSphString sFields ( "" );
			for ( int i=0; i<CSphSmallBitvec::iTOTALBITS; i++ )
				if ( pNode->m_dFieldMask.Test(i) )
					sFields.SetSprintf ( "%s,%s", sFields.cstr(), tSchema.m_dFields[i].m_sName.cstr() );

			sRes.SetSprintf ( "( @%s: %s )", sFields.cstr()+1, sRes.cstr() );
		} else
		{
			if ( pNode->GetOp()==SPH_QUERY_AND && dWords.GetLength()>1 )
				sRes.SetSprintf ( "( %s )", sRes.cstr() ); // wrap bag of words
		}

	} else
	{
		ARRAY_FOREACH ( i, pNode->m_dChildren )
		{
			if ( !i )
				sRes = ReconstructNode ( pNode->m_dChildren[i], tSchema );
			else
			{
				const char * sOp = "(unknown-op)";
				switch ( pNode->GetOp() )
				{
					case SPH_QUERY_AND:		sOp = "AND"; break;
					case SPH_QUERY_OR:		sOp = "OR"; break;
					case SPH_QUERY_NOT:		sOp = "NOT"; break;
					case SPH_QUERY_ANDNOT:	sOp = "AND NOT"; break;
					case SPH_QUERY_BEFORE:	sOp = "BEFORE"; break;
					case SPH_QUERY_NEAR:	sOp = "NEAR"; break;
					default:				assert ( 0 && "unexpected op in ReconstructNode()" ); break;
				}
				sRes.SetSprintf ( "%s %s %s", sRes.cstr(), sOp, ReconstructNode ( pNode->m_dChildren[i], tSchema ).cstr() );
			}
		}

		if ( pNode->m_dChildren.GetLength()>1 )
			sRes.SetSprintf ( "( %s )", sRes.cstr() );
	}

	return sRes;
}


void TestQueryParser ()
{
	CSphString sTmp;

	CSphSchema tSchema;
	CSphColumnInfo tCol;
	tCol.m_sName = "title"; tSchema.m_dFields.Add ( tCol );
	tCol.m_sName = "body"; tSchema.m_dFields.Add ( tCol );

	CSphDictSettings tDictSettings;
	CSphScopedPtr<ISphTokenizer> pTokenizer ( sphCreateSBCSTokenizer () );
	CSphScopedPtr<CSphDict> pDict ( sphCreateDictionaryCRC ( tDictSettings, pTokenizer.Ptr(), sTmp, "query" ) );
	assert ( pTokenizer.Ptr() );
	assert ( pDict.Ptr() );

	CSphTokenizerSettings tTokenizerSetup;
	tTokenizerSetup.m_iMinWordLen = 2;
	tTokenizerSetup.m_sSynonymsFile = g_sTmpfile;
	pTokenizer->Setup ( tTokenizerSetup );

	CSphString sError;
	assert ( CreateSynonymsFile ( NULL ) );
	assert ( pTokenizer->LoadSynonyms ( g_sTmpfile, sError ) );

	struct QueryTest_t
	{
		const char *	m_sQuery;
		const char *	m_sReconst;
	};
	const QueryTest_t dTest[] =
	{
		{ "aaa bbb ccc",					"( aaa AND bbb AND ccc )" },
		{ "aaa|bbb ccc",					"( ( aaa OR bbb ) AND ccc )" },
		{ "aaa bbb|ccc",					"( aaa AND ( bbb OR ccc ) )" },
		{ "aaa (bbb ccc)|ddd",				"( aaa AND ( ( bbb AND ccc ) OR ddd ) )" },
		{ "aaa bbb|(ccc ddd)",				"( aaa AND ( bbb OR ( ccc AND ddd ) ) )" },
		{ "aaa bbb|(ccc ddd)|eee|(fff)",	"( aaa AND ( bbb OR ( ccc AND ddd ) OR eee OR fff ) )" },
		{ "aaa bbb|(ccc ddd) eee|(fff)",	"( aaa AND ( bbb OR ( ccc AND ddd ) ) AND ( eee OR fff ) )" },
		{ "aaa (ccc ddd)|bbb|eee|(fff)",	"( aaa AND ( ( ccc AND ddd ) OR bbb OR eee OR fff ) )" },
		{ "aaa (ccc ddd)|bbb eee|(fff)",	"( aaa AND ( ( ccc AND ddd ) OR bbb ) AND ( eee OR fff ) )" },
		{ "aaa \"bbb ccc\"~5|ddd",			"( aaa AND ( \"bbb ccc\"~5 OR ddd ) )" },
		{ "aaa bbb|\"ccc ddd\"~5",			"( aaa AND ( bbb OR \"ccc ddd\"~5 ) )" },
		{ "aaa ( ( \"bbb ccc\"~3|ddd ) eee | ( fff -ggg ) )",	"( aaa AND ( ( \"bbb ccc\"~3 OR ddd ) AND ( eee OR ( fff AND NOT ggg ) ) ) )" },
		{ "@title aaa @body ccc|(@title ddd eee)|fff ggg",		"( ( @title: aaa ) AND ( ( @body: ccc ) OR ( ( @title: ddd ) AND ( @title: eee ) ) OR ( @body: fff ) ) AND ( @body: ggg ) )" },
		{ "@title hello world | @body sample program",			"( ( @title: hello ) AND ( ( @title: world ) OR ( @body: sample ) ) AND ( @body: program ) )" },
		{ "@title one two three four",							"( ( @title: one ) AND ( @title: two ) AND ( @title: three ) AND ( @title: four ) )" },
		{ "@title one (@body two three) four",					"( ( @title: one ) AND ( ( @body: two ) AND ( @body: three ) ) AND ( @title: four ) )" },
		{ "windows 7 2000",										"( windows AND 2000 )" },
		{ "aaa a|bbb",											"( aaa AND bbb )" },
		{ "aaa bbb|x y z|ccc",									"( aaa AND bbb AND ccc )" },
		{ "a",													"" },
		{ "hello -world",										"( hello AND NOT world )" },
		{ "-hello world",										"( world AND NOT hello )" },
		{ "\"phrase (query)/3 ~on steroids\"",					"\"phrase query on steroids\"" },
		{ "hello a world",										"( hello AND world )" },
		{ "-one",												"" },
		{ "-one -two",											"" },
		{ "\"\"",												"" },
		{ "\"()\"",												"" },
		{ "\"]\"",												"" },
		{ "@title hello @body -world",							"( ( @title: hello ) AND NOT ( @body: world ) )" },
		{ "Ms-Dos",							"MS-DOS" }
	};


	int nTests = sizeof(dTest)/sizeof(dTest[0]);
	for ( int i=0; i<nTests; i++ )
	{
		printf ( "testing query parser, test %d/%d... ", i+1, nTests );

		XQQuery_t tQuery;
		sphParseExtendedQuery ( tQuery, dTest[i].m_sQuery, pTokenizer.Ptr(), &tSchema, pDict.Ptr(), 1 );

		CSphString sReconst = ReconstructNode ( tQuery.m_pRoot, tSchema );
		assert ( sReconst==dTest[i].m_sReconst );

		printf ( "ok\n" );
	}
}

//////////////////////////////////////////////////////////////////////////

#ifndef NDEBUG
void TestMisc ()
{
	BYTE dBuffer [ 128 ];
	int dValues[] = { 16383, 0, 1, 127, 128, 129, 256, 4095, 4096, 4097, 8192, 16383, 16384, 16385, 123456, 4194303, -1 };

	printf ( "testing string attr length packer/unpacker... " );

	BYTE * pRow = dBuffer;
	for ( int i=0; dValues[i]>=0; i++ )
		pRow += sphPackStrlen ( pRow, dValues[i] );

	const BYTE * pUnp = dBuffer;
	for ( int i=0; dValues[i]>=0; i++ )
	{
		int iUnp = sphUnpackStr ( pUnp, &pUnp );
		assert ( iUnp==dValues[i] );
	}

	printf ( "ok\n" );
}
#endif

//////////////////////////////////////////////////////////////////////////

void BenchLocators ()
{
	const int MAX_ITEMS = 10;
	const int NUM_MATCHES = 1000;
	const int NUM_RUNS = 100000;

	CSphRowitem dStatic[MAX_ITEMS];
	CSphRowitem dDynamic[MAX_ITEMS];
	CSphAttrLocator tLoc[NUM_MATCHES];
	CSphMatch tMatch[NUM_MATCHES];

	for ( int i=0; i<MAX_ITEMS; i++ )
		dStatic[i] = dDynamic[i] = i;

	srand ( 0 );
	for ( int i=0; i<NUM_MATCHES; i++ )
	{
		tLoc[i].m_iBitCount = 32;
		tLoc[i].m_iBitOffset = 32*( rand() % MAX_ITEMS ); // NOLINT
		tLoc[i].m_bDynamic = ( rand() % 2 )==1; // NOLINT
		tMatch[i].m_pStatic = dStatic;
		tMatch[i].m_pDynamic = dDynamic;
	}

	printf ( "benchmarking locators\n" );
	for ( int iRun=1; iRun<=3; iRun++ )
	{
		uint64_t tmLoc = sphMicroTimer();
		int iSum = 0;
		for ( int i=0; i<NUM_RUNS; i++ )
			for ( int j=0; j<NUM_MATCHES; j++ )
				iSum += (int)tMatch[j].GetAttr ( tLoc[j] );
		tmLoc = sphMicroTimer() - tmLoc;
		printf ( "run %d: sum=%d time=%d.%d msec\n", iRun, iSum, (int)(tmLoc/1000), (int)((tmLoc%1000)/100) );
	}

	// manually cleanup to avoid automatic delete
	for ( int i=0; i<NUM_MATCHES; i++ )
		tMatch[i].m_pDynamic = NULL;
}

//////////////////////////////////////////////////////////////////////////

#ifndef NDEBUG
int g_iRwlock;
CSphRwlock g_tRwlock;

void RwlockReader ( void * pArg )
{
	assert ( g_tRwlock.ReadLock() );
	sphSleepMsec ( 10 );
	*(int*)pArg = g_iRwlock;
	assert ( g_tRwlock.Unlock() );
}

void RwlockWriter ( void * pArg )
{
	assert ( g_tRwlock.WriteLock() );
	g_iRwlock += size_t(pArg);
	sphSleepMsec ( 3 );
	assert ( g_tRwlock.Unlock() );
}

void TestRwlock ()
{
	printf ( "testing rwlock... " );
	assert ( g_tRwlock.Init() );

	const int NPAIRS = 10;
	SphThread_t dReaders[NPAIRS];
	SphThread_t dWriters[NPAIRS];
	int iRead[NPAIRS];

	g_iRwlock = 0;
	for ( int i=0; i<NPAIRS; i++ )
	{
		assert ( sphThreadCreate ( &dReaders[i], RwlockReader, (void*)&iRead[i] ) );
		assert ( sphThreadCreate ( &dWriters[i], RwlockWriter, reinterpret_cast<void*>(1+i) ) );
	}

	for ( int i=0; i<NPAIRS; i++ )
	{
		assert ( sphThreadJoin ( &dReaders[i] ) );
		assert ( sphThreadJoin ( &dWriters[i] ) );
	}

	assert ( g_iRwlock==NPAIRS*(1+NPAIRS)/2 );
	assert ( g_tRwlock.Done() );

	int iReadSum = 0;
	for ( int i=0; i<NPAIRS; i++ )
		iReadSum += iRead[i];

	printf ( "ok (read_sum=%d)\n", iReadSum );
}
#endif // NDEBUG

//////////////////////////////////////////////////////////////////////////

#ifndef NDEBUG
void CleanupCallback ( void * pArg )
{
	*(bool*)pArg = true;
}

void CleanupThread ( void * pArg )
{
	sphThreadOnExit ( CleanupCallback, pArg );
}

void TestCleanup ()
{
	printf ( "testing thread cleanup... " );

	const int CLEANUP_COUNT = 10;

	bool bCleanup[CLEANUP_COUNT];
	for ( int i=0; i<CLEANUP_COUNT; i++ )
		bCleanup[i] = false;

	SphThread_t thd[CLEANUP_COUNT];
	for ( int i=0; i<CLEANUP_COUNT; i++ )
		assert ( sphThreadCreate ( &thd[i], CleanupThread, &bCleanup[i] ) );

	for ( int i=0; i<CLEANUP_COUNT; i++ )
		assert ( sphThreadJoin ( &thd[i] ) );

	for ( int i=0; i<CLEANUP_COUNT; i++ )
		assert ( bCleanup[i]==true );

	printf ( "ok\n" );
}
#endif // NDEBUG

//////////////////////////////////////////////////////////////////////////

volatile int g_iMutexBench = 0;

void DummyThread ( void * )
{}

void MutexBenchThread ( void * pArg )
{
	CSphMutex * pMutex = (CSphMutex *) pArg;
	pMutex->Lock();
	for ( int i=0; i<100; i++ )
		g_iMutexBench++;
	g_iMutexBench -= 99;
	pMutex->Unlock();
}

void BenchThreads ()
{
	printf ( "benchmarking threads\n" );

	const int BATCHES = 100;
	const int BATCH_THREADS = 100;
	const int TOTAL_THREADS = BATCHES*BATCH_THREADS;

	SphThread_t * pThd = new SphThread_t [ BATCH_THREADS ];

	CSphMutex tMutex;
	if ( !tMutex.Init() )
		sphDie ( "failed to init mutex" );

	for ( int iRun=1; iRun<=2; iRun++ )
	{
		int64_t tmThd = sphMicroTimer();
		for ( int iBatch=0; iBatch<BATCHES; iBatch++ )
		{
			for ( int i=0; i<BATCH_THREADS; i++ )
				if ( !sphThreadCreate ( pThd+i, ( iRun==1 ) ? DummyThread : MutexBenchThread, &tMutex) )
					sphDie ( "failed to create thread (batch %d thd %d)", iBatch, i );

			for ( int i=0; i<BATCH_THREADS; i++ )
				if ( !sphThreadJoin ( pThd+i ) )
					sphDie ( "failed to join to thread (batch %d thd %d)", iBatch, i );
		}
		tmThd = sphMicroTimer()-tmThd;

		if ( iRun==2 && g_iMutexBench!=TOTAL_THREADS )
			sphDie ( "failed mutex benchmark (expected=%d, got=%d)", TOTAL_THREADS, g_iMutexBench );

		int iThdSec10 = (int)( int64_t(TOTAL_THREADS)*1000000*10/tmThd );
		const char * sDesc = ( iRun==1 ) ? "dummy" : "mutex";

		printf ( "run %d: %d %s threads in %d.%d msec; %d.%d thd/sec\n",
			iRun, TOTAL_THREADS, sDesc,
			(int)(tmThd/1000), (int)((tmThd%1000)/100),
			iThdSec10/10, iThdSec10%10 );
	}

	tMutex.Done ();
	SafeDeleteArray ( pThd );
}

//////////////////////////////////////////////////////////////////////////

void SortDataRepeat1245 ( DWORD * pData, int iCount )
{
	const int dFill[4] = { 1, 2, 4, 5 };
	for ( int i=0; i<iCount; i++ )
		pData[i] = dFill[i%4];
}

void SortDataEnd0 ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = i+1;
	pData[iCount-1] = 0;
}

void SortDataIdentical ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = 123;
}

void SortDataMed3Killer ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount/2; i++ )
		pData[i] = 1 + i + ( i & 1 )*(iCount/2-1);
	for ( int i=iCount/2; i<iCount; i++ )
		pData[i] = 2*(i-iCount/2+1);
}

void SortDataMidKiller ( DWORD * pData, int iCount )
{
	for ( int i=0; i<2; i++ )
		for ( int j=0; j<iCount/2; j++ )
			*pData++ = j*2+i;
}

void SortDataRandDupes ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = sphRand() % ( iCount/10 );
}

void SortDataRandUniq ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = i;
	for ( int i=0; i<iCount; i++ )
		Swap ( pData[i], pData[sphRand()%iCount] );
}

void SortDataRandSteps ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i+=100 )
	{
		int a = i;
		int b = Min ( i+100, iCount );
		for ( int j=a; j<b; j++ )
			pData[j] = j;
		for ( int j=a; j<b; j++ )
			Swap ( pData[j], pData [ a + sphRand()%(b-a) ] );
	}
}

void SortDataRevEnds ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = i;
	Swap ( pData[0], pData[iCount-1] );
}

void SortDataRevPartial ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = iCount-i;
	for ( int i=0; i<iCount/10; i++ )
		Swap ( pData[sphRand()%iCount], pData[sphRand()%iCount] );
}

void SortDataRevSaw ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i+=100 )
	{
		int a = i;
		int b = Min ( i+100, iCount );
		for ( int j=a; j<b; j++ )
			pData[j] = b-j;
	}
}

void SortDataReverse ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = iCount-i;
}

void SortDataStart1000 ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = 1+i;
	pData[0] = 1000;
}

void SortDataSeqPartial ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = 1+i;
	for ( int i=0; i<iCount/10; i++ )
		Swap ( pData[sphRand()%iCount], pData[sphRand()%iCount] );
}

void SortDataSeqSaw ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i+=100 )
	{
		int a = i;
		int b = Min ( i+100, iCount );
		for ( int j=a; j<b; j++ )
			pData[j] = j-a+1;
	}
}

void SortDataSeq ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = 1+i;
}

void SortDataAscDesc ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount/2; i++ )
		pData[i] = 1+i;
	for ( int i=iCount/2; i<iCount; i++ )
		pData[i] = iCount-i;
}

void SortDataDescAsc ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount/2; i++ )
		pData[i] = iCount/2-i;
	for ( int i=iCount/2; i<iCount; i++ )
		pData[i] = i-iCount/2+1;
}

void SortDataRand01 ( DWORD * pData, int iCount )
{
	for ( int i=0; i<iCount; i++ )
		pData[i] = ( sphRand()>>3 ) & 1;
}

typedef void (*SortDataGen_fn)( DWORD *, int );

struct SortDataGenDesc_t
{
	SortDataGen_fn		m_fnGen;
	const char *		m_sName;
};

SortDataGenDesc_t g_dSortDataGens[] =
{
	{ SortDataRepeat1245,	"repeat1245" },
	{ SortDataEnd0,			"end0" },
	{ SortDataIdentical,	"identical" },
	{ SortDataMed3Killer,	"med3killer" },
	{ SortDataMidKiller,	"midkiller" },
	{ SortDataRandDupes,	"randdupes" },
	{ SortDataRandUniq,		"randuniq" },
	{ SortDataRandSteps,	"randsteps" },
	{ SortDataRevEnds,		"revends" },
	{ SortDataRevPartial,	"revpartial" },
	{ SortDataRevSaw,		"revsaw" },
	{ SortDataReverse,		"reverse" },
	{ SortDataStart1000,	"start1000" },
	{ SortDataSeqPartial,	"seqpartial" },
	{ SortDataSeqSaw,		"seqsaw" },
	{ SortDataSeq,			"sequential" },
	{ SortDataAscDesc,		"ascdesc" },
	{ SortDataDescAsc,		"descasc" },
	{ SortDataRand01,		"rand01" },
};

struct SortPayload_t
{
	DWORD m_uKey;
	DWORD m_uPayload[3];

	bool operator < ( const SortPayload_t & rhs ) const
	{
		return m_uKey < rhs.m_uKey;
	}
};

inline bool operator < ( const CSphWordHit & a, const CSphWordHit & b )
{
	return
		( a.m_iWordID<b.m_iWordID || \
		( a.m_iWordID==b.m_iWordID && a.m_iDocID<b.m_iDocID ) || \
		( a.m_iWordID==b.m_iWordID && a.m_iDocID==b.m_iDocID && a.m_iWordPos<b.m_iWordPos ) );
}

template < typename T >
int64_t BenchSort ( T * pData, int iCount, bool bCheck )
{
	int64_t tmSort = sphMicroTimer();
	sphSort ( pData, iCount );
	tmSort = sphMicroTimer() - tmSort;

	if ( bCheck )
	{
		for ( int i=0; i<iCount-1 && bCheck; i++ )
			if ( pData[i+1] < pData[i] )
				bCheck = false;
		if ( !bCheck )
			sphDie ( "sorting results check failed!" );
	}

	return tmSort;
}

void BenchSort ()
{
	const int MINSIZE = 100;
	const int MAXSIZE = 100000;
	const int RUNS = 100;

	FILE * fpRes = fopen ( "benchsort/res.csv", "w+" );
	if ( !fpRes )
		sphDie ( "failed to create benchsort/res.csv" );
	fprintf ( fpRes, "test-name;runs-by-size;total-time\n" );

	// bench synthetic payloads
	DWORD * pKeys = new DWORD [ MAXSIZE ];
	SortPayload_t * pValues = new SortPayload_t [ MAXSIZE ];

	for ( int iGen=0; iGen<(int)(sizeof(g_dSortDataGens)/sizeof(g_dSortDataGens[0])); iGen++ )
		for ( int iLen=MINSIZE; iLen<=MAXSIZE; iLen*=10 )
	{
		int64_t tmSort = 0;
		for ( int iRun=0; iRun<RUNS; iRun++ )
		{
			g_dSortDataGens[iGen].m_fnGen ( pKeys, iLen );
			for ( int i=0; i<iLen; i++ )
				pValues[i].m_uKey = pKeys[i];
			tmSort += BenchSort ( pValues, iLen, iRun==0 );
		}

		printf ( "%-12s 100x%-8d %d.%03d msec\n", g_dSortDataGens[iGen].m_sName, iLen, (int)(tmSort/1000), (int)(tmSort%1000) );
		fprintf ( fpRes, "%s;100x%d;%d,%03d\n", g_dSortDataGens[iGen].m_sName, iLen, (int)(tmSort/1000), (int)(tmSort%1000) );

		CSphString sFile;
		sFile.SetSprintf ( "benchsort/%s.%d.txt", g_dSortDataGens[iGen].m_sName, iLen );

#if 0
		FILE * fp = fopen ( sFile.cstr(), "w+" );
		if ( fp )
		{
			g_dSortDataGens[iGen].m_fnGen ( pKeys, iLen );
			for ( int i=0; i<iLen; i++ )
				fprintf ( fp, "%d\n", pKeys[i] );
			fclose ( fp );
		}
#endif
	}

	SafeDeleteArray ( pKeys );
	SafeDeleteArray ( pValues );

	// bench real hits
	const int MAXHITS = 10000000;
	const char * sHits = "benchsort/hits.bin";

	CSphWordHit * pHits = new CSphWordHit [ MAXHITS ];
	int HITS = MAXHITS;

	FILE * fp = fopen ( sHits, "rb+" );
	if ( !fp )
		sphDie ( "failed to open %s", sHits );
	if ( (int)fread ( pHits, sizeof(CSphWordHit), HITS, fp )!=HITS )
		sphDie ( "failed to read %s", sHits );
	fclose ( fp );

	int64_t tmSort = BenchSort ( pHits, HITS, true );

	printf ( "%-12s 100x%-8d %d.%03d msec\n", "hits", HITS, (int)(tmSort/1000), (int)(tmSort%1000) );
	fprintf ( fpRes, "%s;100x%d;%d,%03d\n", "hits", HITS, (int)(tmSort/1000), (int)(tmSort%1000) );

	SafeDeleteArray ( pHits );

	// owl down
	fclose ( fpRes );
	exit ( 0 );
}

//////////////////////////////////////////////////////////////////////////

extern DWORD sphCRC32 ( const BYTE * pString, int iLen );

struct TestAccCmp_fn
{
	typedef DWORD MEDIAN_TYPE;
	typedef DWORD * PTR_TYPE;

	int m_iStride;

	explicit TestAccCmp_fn ( int iStride )
		: m_iStride ( iStride )
	{}

	DWORD Key ( DWORD * pData ) const
	{
		return *pData;
	}

	void CopyKey ( DWORD * pMed, DWORD * pVal ) const
	{
		*pMed = Key ( pVal );
	}

	bool IsLess ( DWORD a, DWORD b ) const
	{
		return a < b;
	}

	void Swap ( DWORD * a, DWORD * b ) const
	{
		for ( int i=0; i<m_iStride; i++ )
			::Swap ( a[i], b[i] );
	}

	DWORD * Add ( DWORD * p, int i ) const
	{
		return p+i*m_iStride;
	}

	int Sub ( DWORD * b, DWORD * a ) const
	{
		return (int)((b-a)/m_iStride);
	}

	bool IsKeyDataSynced ( const DWORD * pData ) const
	{
		DWORD uKey = *pData;
		DWORD uHash = GenerateKey ( pData );
		return uKey==uHash;
	}

	DWORD GenerateKey ( const DWORD * pData ) const
	{
		return m_iStride > 1 ? sphCRC32 ( ( ( const BYTE * ) ( pData + 1 ) ), ( m_iStride - 1 ) * 4 ) : ( *pData );
	}
};


#ifndef NDEBUG
static bool IsSorted ( DWORD * pData, int iCount, const TestAccCmp_fn & fn )
{
	if ( iCount<1 )
		return true;

	const DWORD * pPrev = pData;
	if ( !fn.IsKeyDataSynced ( pPrev ) )
		return false;

	if ( iCount<2 )
		return true;

	for ( int i = 1; i < iCount; ++i )
	{
		const DWORD * pCurr = fn.Add ( pData, i );

		if ( fn.IsLess ( *pCurr , *pPrev ) || !fn.IsKeyDataSynced ( pCurr ) )
			return false;

		pPrev = pCurr;
	}

	return true;
}
#endif


void RandomFill ( DWORD * pData, int iCount, const TestAccCmp_fn & fn, bool bChainsaw )
{
	for ( int i = 0; i < iCount; ++i )
	{
		DWORD * pCurr = fn.Add ( pData, i );
		const DWORD * pNext = fn.Add ( pData, i + 1 );

		DWORD * pElem = pCurr;
		DWORD * pChainHill = bChainsaw && ( i % 2 ) ? fn.Add ( pData, i -1 ) : NULL;
		do
		{
			*pElem = pChainHill ? *pChainHill / 2 : sphRand();
			++pElem;
			pChainHill = pChainHill ? pChainHill + 1 : pChainHill;
		} while ( pElem!=pNext );

		*pCurr = fn.GenerateKey ( pCurr );
	}
}

void TestStridedSortPass ( int iStride, int iCount )
{
	printf ( "testing strided sort, stride=%d, count=%d... ", iStride, iCount );

	assert ( iStride && iCount );

	DWORD * pData = new DWORD [ iCount * iStride ];
	assert ( pData );

	// checked elements are random
	memset ( pData, 0, sizeof ( DWORD ) * iCount * iStride );
	TestAccCmp_fn fnSort ( iStride );
	RandomFill ( pData, iCount, fnSort, false );

	// crash on sort of mini-arrays
	TestAccCmp_fn fnSortDummy ( 1 );
	DWORD dMini[1] = { 1 };
	sphSort ( dMini, 1, fnSortDummy, fnSortDummy );
	sphSort ( dMini, 0, fnSortDummy, fnSortDummy );
	assert ( IsSorted ( dMini, 1, fnSortDummy ) );

	// random sort
	sphSort ( pData, iCount, fnSort, fnSort );
	assert ( IsSorted ( pData, iCount, fnSort ) );

	// already sorted sort
	sphSort ( pData, iCount, fnSort, fnSort );
	assert ( IsSorted ( pData, iCount, fnSort ) );

	// reverse order sort
	for ( int i = 0; i < iCount; ++i )
	{
		::Swap ( pData[i], pData [ iCount - i - 1 ] );
	}
	sphSort ( pData, iCount, fnSort, fnSort );
	assert ( IsSorted ( pData, iCount, fnSort ) );

	// random chainsaw sort
	RandomFill ( pData, iCount, fnSort, true );
	sphSort ( pData, iCount, fnSort, fnSort );
	assert ( IsSorted ( pData, iCount, fnSort ) );

	printf ( "ok\n" );
	SafeDeleteArray ( pData );
}

void TestStridedSort ()
{
	TestStridedSortPass ( 1, 2 );
	TestStridedSortPass ( 3, 2 );
	TestStridedSortPass ( 37, 2 );

	// SMALL_THRESH case
	TestStridedSortPass ( 1, 30 );
	TestStridedSortPass ( 7, 13 );
	TestStridedSortPass ( 113, 5 );

	TestStridedSortPass ( 1, 1000 );
	TestStridedSortPass ( 5, 1000 );
	TestStridedSortPass ( 17, 50 );
	TestStridedSortPass ( 31, 1367 );

	// rand cases
	for ( int i = 0; i < 10; ++i )
	{
		const int iRndStride = sphRand() % 64;
		const int iNrmStride = Max ( iRndStride, 1 );
		const int iRndCount = sphRand() % 1000;
		const int iNrmCount = Max ( iRndCount, 1 );
		TestStridedSortPass ( iNrmStride, iNrmCount );
	}
}

//////////////////////////////////////////////////////////////////////////

const char * g_sFieldsData[] = { "33", "1033", "If I were a cat...", "We are the greatest cat" };

class SphTestDoc_c : public CSphSource_Document
{
public:
	explicit SphTestDoc_c ( const CSphSchema & tSchema ) : CSphSource_Document ( "test_doc" )
	{
		m_tSchema = tSchema;
	}

	virtual BYTE ** NextDocument ( CSphString & )
	{
		if ( m_tDocInfo.m_iDocID )
		{
			m_tDocInfo.m_iDocID = 0;
			return NULL;
		}

		m_tDocInfo.m_iDocID++;
		return (BYTE **) &g_sFieldsData[2];
	}

	bool Connect ( CSphString & ) { return true; }
	void Disconnect () {}
	bool HasAttrsConfigured () { return true; }
	bool IterateStart ( CSphString & ) { m_tDocInfo.Reset ( m_tSchema.GetRowSize() ); return true; }
	bool IterateMultivaluedStart ( int, CSphString & ) { return false; }
	bool IterateMultivaluedNext () { return false; }
	bool IterateFieldMVAStart ( int, CSphString & ) { return false; }
	bool IterateFieldMVANext () { return false; }
	bool IterateKillListStart ( CSphString & ) { return false; }
	bool IterateKillListNext ( SphDocID_t & ) { return false; }
};


#ifndef NDEBUG

static void CheckRT ( int iVal, int iRef, const char * sMsg )
{
#if 1
	assert ( iRef==iVal && sMsg );
#else
	if ( iRef!=iVal )
		printf ( "\t%s=%d ( %d )\n", sMsg, iVal, iRef );
#endif
}

static void DeleteIndexFiles ( const char * sIndex )
{
	if ( !sIndex )
		return;

	CSphString sName;
	sName.SetSprintf ( "%s.kill", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.lock", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.meta", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.ram", sIndex );
	unlink ( sName.cstr() );

	sName.SetSprintf ( "%s.0.spa", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spd", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.sph", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spi", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spk", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spm", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spp", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.sps", sIndex );
	unlink ( sName.cstr() );
}


void TestRTInit ()
{
	CSphConfigSection tRTConfig;

	sphRTInit();
	sphRTConfigure ( tRTConfig, true );

	SmallStringHash_T<CSphIndex*> hIndexes;
	sphReplayBinlog ( hIndexes );
}


#define RT_INDEX_FILE_NAME "test_temp"
#define RT_PASS_COUNT 5
static const int g_iWeights[RT_PASS_COUNT] = { 1500, 1500, 1500, 1500, 1500 }; // { 1500, 1302, 1252, 1230, 1219 };

void TestRTWeightBoundary ()
{
	DeleteIndexFiles ( RT_INDEX_FILE_NAME );
	for ( int iPass = 0; iPass < RT_PASS_COUNT; ++iPass )
	{
		printf ( "testing rt indexing, test %d/%d... ", 1+iPass, RT_PASS_COUNT );
		TestRTInit ();

		CSphString sError;
		CSphDictSettings tDictSettings;

		ISphTokenizer * pTok = sphCreateUTF8Tokenizer();
		CSphDict * pDict = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "weight" );

		CSphColumnInfo tCol;
		CSphSchema tSrcSchema;

		CSphSourceSettings tParams;
		tSrcSchema.Reset();

		tCol.m_sName = "channel_id";
		tCol.m_eAttrType = SPH_ATTR_INTEGER;
		tSrcSchema.AddAttr ( tCol, true );

		tCol.m_sName = "title";
		tSrcSchema.m_dFields.Add ( tCol );

		tCol.m_sName = "content";
		tSrcSchema.m_dFields.Add ( tCol );

		SphTestDoc_c * pSrc = new SphTestDoc_c ( tSrcSchema );

		pSrc->SetTokenizer ( pTok );
		pSrc->SetDict ( pDict );

		pSrc->Setup ( tParams );
		Verify ( pSrc->Connect ( sError ) );
		Verify ( pSrc->IterateStart ( sError ) );

		Verify ( pSrc->UpdateSchema ( &tSrcSchema, sError ) );

		CSphSchema tSchema; // source schema must be all dynamic attrs; but index ones must be static
		tSchema.m_dFields = tSrcSchema.m_dFields;
		for ( int i=0; i<tSrcSchema.GetAttrsCount(); i++ )
			tSchema.AddAttr ( tSrcSchema.GetAttr(i), false );

		ISphRtIndex * pIndex = sphCreateIndexRT ( tSchema, "testrt", 32*1024*1024, RT_INDEX_FILE_NAME );

		pIndex->SetTokenizer ( pTok ); // index will own this pair from now on
		pIndex->SetDictionary ( pDict );
		Verify ( pIndex->Prealloc ( false, false, sError ) );

		ISphHits * pHits;
		CSphVector<DWORD> dMvas;
		for ( ;; )
		{
			Verify ( pSrc->IterateDocument ( sError ) );
			if ( !pSrc->m_tDocInfo.m_iDocID )
				break;

			pHits = pSrc->IterateHits ( sError );
			if ( !pHits )
				break;

			pIndex->AddDocument ( pHits, pSrc->m_tDocInfo, NULL, dMvas, sError );
			pIndex->Commit ();
		}

		pSrc->Disconnect();

		CheckRT ( pSrc->GetStats().m_iTotalDocuments, 1, "docs committed" );

		CSphQuery tQuery;
		CSphQueryResult tResult;
		tQuery.m_sQuery = "@title cat";

		ISphMatchSorter * pSorter = sphCreateQueue ( &tQuery, pIndex->GetMatchSchema(), tResult.m_sError, false );
		assert ( pSorter );
		Verify ( pIndex->MultiQuery ( &tQuery, &tResult, 1, &pSorter, NULL ) );
		sphFlattenQueue ( pSorter, &tResult, 0 );
		CheckRT ( tResult.m_dMatches.GetLength(), 1, "results found" );
		CheckRT ( (int)tResult.m_dMatches[0].m_iDocID, 1, "docID" );
		CheckRT ( tResult.m_dMatches[0].m_iWeight, g_iWeights[iPass], "weight" );
		SafeDelete ( pSorter );
		SafeDelete ( pIndex );

		sphRTDone ();

		printf ( "ok\n" );
	}
	DeleteIndexFiles ( RT_INDEX_FILE_NAME );
}

void TestWriter()
{
	printf ( "testing CSphWriter... " );
	const CSphString sTmpWriteout = "__writeout.tmp";
	CSphString sErr;

#define WRITE_OUT_DATA_SIZE 0x40000
	BYTE * pData = new BYTE[WRITE_OUT_DATA_SIZE];
	memset ( pData, 0xfe, WRITE_OUT_DATA_SIZE );

	{
		CSphWriter tWrDef;
		tWrDef.OpenFile ( sTmpWriteout, sErr );
		tWrDef.PutBytes ( pData, WRITE_OUT_DATA_SIZE );
		tWrDef.PutByte ( 0xff );
	}
	{
		CSphWriter tWr;
		tWr.SetBufferSize ( WRITE_OUT_DATA_SIZE );
		tWr.OpenFile ( sTmpWriteout, sErr );
		tWr.PutBytes ( pData, WRITE_OUT_DATA_SIZE );
		tWr.PutByte ( 0xff );
	}
	unlink ( sTmpWriteout.cstr() );
	printf ( "ok\n" );
}

class SphDocRandomizer_c : public CSphSource_Document
{
	static const int m_iMaxFields = 2;
	static const int m_iMaxFieldLen = 512;
	char m_dFields[m_iMaxFields][m_iMaxFieldLen];
	BYTE * m_ppFields[m_iMaxFields];
public:
	explicit SphDocRandomizer_c ( const CSphSchema & tSchema ) : CSphSource_Document ( "test_doc" )
	{
		m_tSchema = tSchema;
		for ( int i=0; i<m_iMaxFields; i++ )
			m_ppFields[i] = (BYTE *)&m_dFields[i];
	}

	virtual BYTE ** NextDocument ( CSphString & )
	{
		if ( m_tDocInfo.m_iDocID>800 )
		{
			m_tDocInfo.m_iDocID = 0;
			return NULL;
		}

		m_tDocInfo.m_iDocID++;

		m_tDocInfo.SetAttr ( m_tSchema.GetAttr(0).m_tLocator, m_tDocInfo.m_iDocID+1000 );
		m_tDocInfo.SetAttr ( m_tSchema.GetAttr(1).m_tLocator, 1313 );

		snprintf ( m_dFields[0], m_iMaxFieldLen, "cat title%d title%d title%d title%d title%d"
			, sphRand(), sphRand(), sphRand(), sphRand(), sphRand() );

		snprintf ( m_dFields[1], m_iMaxFieldLen, "dog contentwashere%d contentwashere%d contentwashere%d contentwashere%d contentwashere%d"
			, sphRand(), sphRand(), sphRand(), sphRand(), sphRand() );

		return &m_ppFields[0];
	}

	bool Connect ( CSphString & ) { return true; }
	void Disconnect () {}
	bool HasAttrsConfigured () { return true; }
	bool IterateStart ( CSphString & ) { m_tDocInfo.Reset ( m_tSchema.GetRowSize() ); return true; }
	bool IterateMultivaluedStart ( int, CSphString & ) { return false; }
	bool IterateMultivaluedNext () { return false; }
	bool IterateFieldMVAStart ( int, CSphString & ) { return false; }
	bool IterateFieldMVANext () { return false; }
	bool IterateKillListStart ( CSphString & ) { return false; }
	bool IterateKillListNext ( SphDocID_t & ) { return false; }
};

void TestRTSendVsMerge ()
{
	DeleteIndexFiles ( RT_INDEX_FILE_NAME );
	printf ( "testing rt send result during merge... " );

	TestRTInit ();

	CSphString sError;
	CSphDictSettings tDictSettings;

	ISphTokenizer * pTok = sphCreateUTF8Tokenizer();
	CSphDict * pDict = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "rt" );

	CSphColumnInfo tCol;
	CSphSchema tSrcSchema;

	CSphSourceSettings tParams;
	tSrcSchema.Reset();

	tCol.m_sName = "title";
	tSrcSchema.m_dFields.Add ( tCol );

	tCol.m_sName = "content";
	tSrcSchema.m_dFields.Add ( tCol );

	tCol.m_sName = "tag1";
	tCol.m_eAttrType = SPH_ATTR_INTEGER;
	tSrcSchema.AddAttr ( tCol, true );

	tCol.m_sName = "tag2";
	tCol.m_eAttrType = SPH_ATTR_INTEGER;
	tSrcSchema.AddAttr ( tCol, true );

	SphDocRandomizer_c * pSrc = new SphDocRandomizer_c ( tSrcSchema );

	pSrc->SetTokenizer ( pTok );
	pSrc->SetDict ( pDict );

	pSrc->Setup ( tParams );
	Verify ( pSrc->Connect ( sError ) );
	Verify ( pSrc->IterateStart ( sError ) );

	Verify ( pSrc->UpdateSchema ( &tSrcSchema, sError ) );

	CSphSchema tSchema; // source schema must be all dynamic attrs; but index ones must be static
	tSchema.m_dFields = tSrcSchema.m_dFields;
	for ( int i=0; i<tSrcSchema.GetAttrsCount(); i++ )
		tSchema.AddAttr ( tSrcSchema.GetAttr(i), false );

	ISphRtIndex * pIndex = sphCreateIndexRT ( tSchema, "testrt", 128*1024, RT_INDEX_FILE_NAME );

	pIndex->SetTokenizer ( pTok ); // index will own this pair from now on
	pIndex->SetDictionary ( pDict );
	Verify ( pIndex->Prealloc ( false, false, sError ) );

	CSphQuery tQuery;
	CSphQueryResult tResult;
	tQuery.m_sQuery = "@title cat";

	ISphMatchSorter * pSorter = sphCreateQueue ( &tQuery, pIndex->GetMatchSchema(), tResult.m_sError, false );
	assert ( pSorter );

	CSphVector<DWORD> dMvas;
	for ( ;; )
	{
		Verify ( pSrc->IterateDocument ( sError ) );
		if ( !pSrc->m_tDocInfo.m_iDocID )
			break;

		ISphHits * pHits = pSrc->IterateHits ( sError );
		if ( !pHits )
			break;

		pIndex->AddDocument ( pHits, pSrc->m_tDocInfo, NULL, dMvas, sError );
		if ( pSrc->m_tDocInfo.m_iDocID==350 )
		{
			pIndex->Commit ();
			Verify ( pIndex->MultiQuery ( &tQuery, &tResult, 1, &pSorter, NULL ) );
			sphFlattenQueue ( pSorter, &tResult, 0 );
		}
	}
	pIndex->Commit ();

	pSrc->Disconnect();

	for ( int i=0; i<tResult.m_dMatches.GetLength(); i++ )
	{
		const SphDocID_t tID = tResult.m_dMatches[i].m_iDocID;
		const SphAttr_t tTag1 = tResult.m_dMatches[i].GetAttr ( tResult.m_tSchema.GetAttr ( 0 ).m_tLocator );
		const SphAttr_t tTag2 = tResult.m_dMatches[i].GetAttr ( tResult.m_tSchema.GetAttr ( 1 ).m_tLocator );
		assert ( (SphDocID_t)tTag1==tID+1000 );
		assert ( tTag2==1313 );
	}
	SafeDelete ( pSorter );
	SafeDelete ( pIndex );

	sphRTDone ();

	printf ( "ok\n" );

	DeleteIndexFiles ( RT_INDEX_FILE_NAME );
}

#endif

//////////////////////////////////////////////////////////////////////////

void TestSentenceTokenizer()
{
	printf ( "testing sentence detection in tokenizer... " );

	CSphTokenizerSettings tSettings;
	tSettings.m_iType = TOKENIZER_SBCS;
	tSettings.m_iMinWordLen = 1;

	CSphString sError;
	ISphTokenizer * pTok = ISphTokenizer::Create ( tSettings, sError );

	assert ( pTok->SetCaseFolding ( "-, 0..9, A..Z->a..z, _, a..z, U+80..U+FF", sError ) );
//	assert ( pTok->SetBlendChars ( "., &", sError ) ); // NOLINT
	assert ( pTok->EnableSentenceIndexing ( sError ) );

	const char * SENTENCE = "\2"; // MUST be in sync with sphinx.cpp
	const char * sTest[] =
	{
		"Bill Gates Jr. attended", "bill", "gates", "jr", "attended", NULL,
		"Very good, Dr. Watson", "very", "good", "dr", "watson", NULL,
		"VERY GOOD, DR. WATSON", "very", "good", "dr", "watson", NULL,
		"He left US. Went abroad", "he", "left", "us", SENTENCE, "went", "abroad", NULL,
		"Known as Mr. Doe", "known", "as", "mr", "doe", NULL,
		"Survived by Mrs. Doe", "survived", "by", "mrs", "doe", NULL,
		"J. R. R. Tolkien", "j", "r", "r", "tolkien", NULL,
		"That is it. A boundary", "that", "is", "it", SENTENCE, "a", "boundary", NULL,
		"Just a sentence. And then some.", "just", "a", "sentence", SENTENCE, "and", "then", "some", SENTENCE, NULL,
		"Right, guy number two? Yes, guy number one!", "right", "guy", "number", "two", SENTENCE, "yes", "guy", "number", "one", SENTENCE, NULL,
		"S.T.A.L.K.E.R. sold well in the U.K and elsewhere. Including Russia.", "s", "t", "a", "l", "k", "e", "r", "sold", "well", "in", "the", "u", "k", "and", "elsewhere", SENTENCE, "including", "russia", SENTENCE, NULL,
		"Yoyodine Inc. exists since 1800", "yoyodine", "inc", "exists", "since", "1800", NULL,
		"John D. Doe, our CEO", "john", "d", "doe", "our", "ceo", NULL,
		"Yoyodine Inc. (the Company)", "yoyodine", "inc", "the", "company", NULL,
		NULL
	};

	int i = 0;
	while ( sTest[i] )
	{
		pTok->SetBuffer ( (BYTE*)sTest[i], strlen ( sTest[i] ) );
		i++;

		BYTE * sTok;
		while ( ( sTok = pTok->GetToken() )!=NULL )
		{
			assert ( !strcmp ( (char*)sTok, sTest[i] ) );
			i++;
		}

		assert ( sTest[i]==NULL );
		i++;
	}

	printf ( "ok\n" );
}

//////////////////////////////////////////////////////////////////////////

void TestSpanSearch()
{
	printf ( "testing span search... " );

	CSphVector<int> dVec;
	dVec.Add ( 1 );
	dVec.Add ( 3 );
	dVec.Add ( 4 );

	assert ( FindSpan ( dVec, 1, 5 )==0 );
	assert ( FindSpan ( dVec, 3, 5 )==1 );
	assert ( FindSpan ( dVec, 4, 5 )==2 );

	dVec.Add ( 15 );
	dVec.Add ( 17 );
	dVec.Add ( 22 );
	dVec.Add ( 23 );

	assert ( FindSpan ( dVec, 1, 5 )==0 );
	assert ( FindSpan ( dVec, 18, 5 )==4 );
	assert ( FindSpan ( dVec, 23, 5 )==6 );

	printf ( "ok\n" );
}

//////////////////////////////////////////////////////////////////////////

const char * CORPUS = "corpus.txt";
const int POOLSIZE = 80*1048576;
const int GAP = 4;

void BenchStemmer ()
{
	CSphString sError;

#if SNOWBALL
	SN_env * pSnow = english_ISO_8859_1_create_env();
#if 1
	char test[] = "this";
	SN_set_current ( pSnow, strlen(test), (const symbol *)test );
	pSnow->p [ pSnow->l ] = 0;
	english_ISO_8859_1_stem ( pSnow );
	stem_en ( (BYTE*)test, strlen(test) );
#endif
#endif

#if PORTER1
	struct stemmer * z = create_stemmer();
#endif

	BYTE * pRaw = new BYTE [ POOLSIZE ];
	FILE * fp = fopen ( CORPUS, "rb" );
	if ( !fp )
		sphDie ( "fopen %s failed", CORPUS );
	int iLen = fread ( pRaw, 1, POOLSIZE, fp );
	printf ( "read %d bytes\n", iLen );
	fclose ( fp );

	ISphTokenizer * pTok = sphCreateSBCSTokenizer();
	if ( !pTok->SetCaseFolding ( "A..Z->a..z, a..z", sError ) )
		sphDie ( "oops: %s", sError.cstr() );


	pTok->SetBuffer ( pRaw, iLen );

	BYTE * pTokens = new BYTE [ POOLSIZE ];
	BYTE * p = pTokens;
	BYTE * sTok;
	int iToks = 0;
	int iBytes = 0;
	int iStemmed = 0;
	while ( ( sTok = pTok->GetToken() )!=NULL )
	{
		BYTE * pStart = p++; // 1 byte for length
		while ( *sTok )
			*p++ = *sTok++;
		*pStart = (BYTE)( p-pStart-1 ); // store length
		for ( int i=0; i<GAP; i++ )
			*p++ = '\0'; // trailing zero and a safety gap
		if ( p>=pTokens+POOLSIZE )
			sphDie ( "out of buffer at tok %d", iToks );
		iToks++;
	}
	*p++ = '\0';
	iBytes = (int)( p - pTokens );
	printf ( "tokenized %d tokens\n", iToks );

#if 0
	int dCharStats[256];
	memset ( dCharStats, 0, sizeof(dCharStats) );
	for ( BYTE * t = pTokens; t<pTokens+iBytes; t++ )
		dCharStats[*t]++;

	const char * sDump = "aeiouywxY";
	for ( const char * s = sDump; *s; s++ )
		printf ( "%c: %d\n", *s, dCharStats[*s] );

#endif
	int64_t tmStem = sphMicroTimer();
	p = pTokens;
	iToks = 0;
	int iDiff = 0;
	while ( *p )
	{
#if !SNOWBALL && !PORTER1
		stem_en ( p+1, *p );
#endif

#if SNOWBALL
		int iLen = *p;
		SN_set_current ( pSnow, iLen, p+1 );
		english_ISO_8859_1_stem ( pSnow );

#if !CROSSCHECK
		// benchmark
		memcpy ( p+1, pSnow->p, pSnow->l );
		p[pSnow->l+1] = 0;
#else
		// crosscheck
		char buf[256];
		memcpy ( buf, p+1, *p+1 );

		stem_en ( p+1, *p );
		int ll = strlen ( (char*)p+1 );
		if ( ll!=pSnow->l || memcmp ( p+1, pSnow->p, ll ) )
		{
			pSnow->p[pSnow->l] = 0;
			printf ( "%s[%d] vs %s[%d] for orig %s\n", p+1, ll, pSnow->p, pSnow->l, buf );
			iDiff++;
		}
#endif
#endif

#if PORTER1
		p [ stem ( z, (char*)p+1, *p-1 )+2 ] = 0;
#endif

		p += *p + GAP + 1;
		iToks++;
	}
	tmStem = sphMicroTimer() - tmStem;

	if ( iDiff )
		printf ( "%d tokens are different\n", iDiff );

	if ( iStemmed )
		printf ( "%d data bytes stemmed\n", iStemmed );

#if SNOWBALL
	english_ISO_8859_1_close_env ( pSnow );
#endif

	uint64_t uHash = sphFNV64 ( pTokens, iBytes );
	printf ( "stemmed %d tokens (%d bytes) in %d msec, hash %08x %08x\n",
		iToks, iBytes, (int)(tmStem/1000),
		(DWORD)( uHash>>32 ), (DWORD)( uHash & 0xffffffffUL ) );
	if ( uHash!=U64C ( 0x54ef4f21994b67db ) )
		printf ( "ERROR, HASH MISMATCH\n" );

	SafeDelete ( pTok );
	SafeDeleteArray ( pRaw );
}

int main ()
{
	printf ( "RUNNING INTERNAL LIBSPHINX TESTS\n\n" );

#if 0
	BenchSort ();
#endif

#ifdef NDEBUG
	BenchStripper ();
	BenchTokenizer ( false );
	BenchTokenizer ( true );
	BenchExpr ();
	BenchLocators ();
	BenchThreads ();
#else
	TestQueryParser ();
	TestStripper ();
	TestTokenizer ( false );
	TestTokenizer ( true );
	TestExpr ();
	TestMisc ();
	TestRwlock ();
	TestCleanup ();
	TestStridedSort ();
	TestRTWeightBoundary ();
	TestWriter();
	TestRTSendVsMerge ();
	TestSentenceTokenizer ();
	TestSpanSearch ();
#endif

	unlink ( g_sTmpfile );
	printf ( "\nSUCCESS\n" );
	return 0;
}

//
// $Id$
//