//
// $Id$
//

//
// Copyright (c) 2001-2011, Andrew Aksyonoff
// Copyright (c) 2008-2011, Sphinx Technologies Inc
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//

#include "sphinx.h"
#include "sphinxexpr.h"
#include "sphinxutils.h"
#include "sphinxquery.h"
#include "sphinxrt.h"
#include "sphinxint.h"
#include "sphinxstem.h"
#include <math.h>

#define SNOWBALL 0
#define CROSSCHECK 0
#define PORTER1 0

#if SNOWBALL
#include "header.h"
#include "api.c"
#include "utilities.c"
#include "stem.c"
#endif

#if PORTER1
#include "porter1.c"
#endif

//////////////////////////////////////////////////////////////////////////

const char * g_sTmpfile = "__libsphinxtest.tmp";

//////////////////////////////////////////////////////////////////////////

bool CreateSynonymsFile ( const char * sMagic )
{
	FILE * fp = fopen ( g_sTmpfile, "w+" );
	if ( !fp )
		return false;

	fprintf ( fp,
		"AT&T => AT&T\n"
		" AT & T => AT & T \n"
		"standarten fuehrer => Standartenfuehrer\n"
		"standarten fuhrer => Standartenfuehrer\n"
		"OS/2 => OS/2\n"
		"Ms-Dos => MS-DOS\n"
		"MS DOS => MS-DOS\n"
		"feat. => featuring\n"
		"U.S. => US\n"
		"U.S.A. => USA\n"
		"U.S.B. => USB\n"
		"U.S.D. => USD\n"
		"U.S.P. => USP\n"
		"U.S.A.F. => USAF\n"
		);
	if ( sMagic )
		fprintf ( fp, "%s => test\n", sMagic );
	fclose ( fp );
	return true;
}

ISphTokenizer * CreateTestTokenizer ( bool bUTF8, bool bSynonyms, bool bEscaped = false )
{
	CSphString sError;
	CSphTokenizerSettings tSettings;
	tSettings.m_iType = bUTF8 ? TOKENIZER_UTF8 : TOKENIZER_SBCS;
	tSettings.m_iMinWordLen = 2;

	ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tSettings, sError );
	assert ( pTokenizer->SetCaseFolding ( "-, 0..9, A..Z->a..z, _, a..z, U+80..U+FF", sError ) );
	pTokenizer->AddSpecials ( "!-" );
	pTokenizer->EnableQueryParserMode ( true );
	if ( bSynonyms )
		assert ( pTokenizer->LoadSynonyms ( g_sTmpfile, sError ) );

	if ( bEscaped )
	{
		ISphTokenizer * pOldTokenizer = pTokenizer;
		pTokenizer = pTokenizer->Clone ( true );
		SafeDelete ( pOldTokenizer );
	}

	return pTokenizer;
}

void TestTokenizer ( bool bUTF8 )
{
	const char * sPrefix = bUTF8 ? "testing UTF8 tokenizer" : "testing SBCS tokenizer";
	for ( int iRun=1; iRun<=3; iRun++ )
	{
		// simple "one-line" tests
		const char * sMagic = bUTF8 ?
"\xD1\x82\xD0\xB5\xD1\x81\xD1\x82\xD1\x82\xD1\x82" // valid UTF-8 : "\xC0\xC1\xF5\xF6"; // valid SBCS but invalid UTF-8 assert ( CreateSynonymsFile ( sMagic ) ); bool bExceptions = ( iRun>=2 ); bool bEscaped = ( iRun==3 ); ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, bExceptions, bEscaped ); const char * dTests[] = { "1", "", NULL, // test that empty strings work "1", "this is my rifle", "this", "is", "my", "rifle", NULL, // test that tokenizing works "1", "This is MY rifle", "this", "is", "my", "rifle", NULL, // test that folding works "1", "i-phone", "i-phone", NULL, // test that duals (specials in the middle of the word) work ok "1", "i phone", "phone", NULL, // test that short words are skipped "1", "this is m", "this", "is", NULL, // test that short words at the end are skipped "1", "the -phone", "the", "-", "phone", NULL, // test that specials work "1", "the!phone", "the", "!", "phone", NULL, // test that specials work "1", "i!phone", "!", "phone", NULL, // test that short words preceding specials are skipped "1", "/-hi", "-", "hi", NULL, // test that synonym-dual but folded-special chars work ok "2", "AT&T", "AT&T", NULL, // test that synonyms work "2", "AT & T", "AT & T", NULL, // test that synonyms with spaces work "2", "AT & T", "AT & T", NULL, // test that synonyms with continuous spaces work "2", "-AT&T", "-", "AT&T", NULL, // test that synonyms with specials work "2", "AT&", "at", NULL, // test that synonyms prefixes are not lost on eof "2", "AT&tee.yo", "at", "tee", "yo", NULL, // test that non-synonyms with partially matching prefixes work "2", "standarten fuehrer", "Standartenfuehrer", NULL, "2", "standarten fuhrer", "Standartenfuehrer", NULL, "2", "standarten fuehrerr", "standarten", "fuehrerr", NULL, "2", "standarten fuehrer Stirlitz", "Standartenfuehrer", "stirlitz", NULL, "2", "OS/2 vs OS/360 vs Ms-Dos", "OS/2", "vs", "os", "360", "vs", "MS-DOS", NULL, "2", "AT ", "at", NULL, // test that prefix-whitespace-eof combo does not hang "2", "AT&T&TT", "AT&T", "tt", NULL, "2", "http://OS/2", "http", "OS/2", NULL, "2", "AT*&*T", "at", NULL, "2", "# OS/2's system install", "OS/2", "system", "install", NULL, "2", "IBM-s/OS/2/Merlin", "ibm-s", "OS/2", "merlin", NULL, "2", "U.S.A", "US", NULL, "2", "AT&T!", "AT&T", "!", NULL, // exceptions vs specials "2", "AT&T!!!", "AT&T", "!", "!", "!", NULL, // exceptions vs specials "2", "U.S.A.!", "USA", "!", NULL, // exceptions vs specials "2", "MS DOSS feat.Deskview.MS DOS", "ms", "doss", "featuring", "deskview", "MS-DOS", NULL, "2", sMagic, "test", NULL, "2", "U.S. U.S.A. U.S.A.F.", "US", "USA", "USAF", NULL, "2", "U.S.AB U.S.A. 
U.S.B.U.S.D.U.S.U.S.A.F.", "US", "ab", "USA", "USB", "USD", "US", "USAF", NULL, "3", "phon\\e", "phone", NULL, "3", "\\thephone", "thephone", NULL, "3", "the\\!phone", "the", "phone", NULL, "3", "\\!phone", "phone", NULL, "3", "\\\\phone", "phone", NULL, // the correct behavior if '\' is not in charset "3", "pho\\\\ne", "pho", "ne", NULL, "3", "phon\\\\e", "phon", NULL, "3", "trailing\\", "trailing", NULL, NULL }; for ( int iCur=0; dTests[iCur] && atoi ( dTests[iCur++] )<=iRun; ) { printf ( "%s, run=%d, line=%s\n", sPrefix, iRun, dTests[iCur] ); pTokenizer->SetBuffer ( (BYTE*)dTests[iCur], strlen ( dTests[iCur] ) ); iCur++; for ( BYTE * pToken=pTokenizer->GetToken(); pToken; pToken=pTokenizer->GetToken() ) { assert ( dTests[iCur] && strcmp ( (const char*)pToken, dTests[iCur] )==0 ); iCur++; } assert ( dTests[iCur]==NULL ); iCur++; } // test misc SBCS-only and UTF8-only one-liners const char * dTests2[] = { "0", "\x80\x81\x82", "\x80\x81\x82", NULL, "1", "\xC2\x80\xC2\x81\xC2\x82", "\xC2\x80\xC2\x81\xC2\x82", NULL, NULL }; for ( int iCur=0; dTests2[iCur] && atoi ( dTests2[iCur++] )==int(bUTF8); ) { printf ( "%s, run=%d, line=%s\n", sPrefix, iRun, dTests2[iCur] ); pTokenizer->SetBuffer ( (BYTE*)dTests2[iCur], strlen ( dTests2[iCur] ) ); iCur++; for ( BYTE * pToken=pTokenizer->GetToken(); pToken; pToken=pTokenizer->GetToken() ) { assert ( dTests2[iCur] && strcmp ( (const char*)pToken, dTests2[iCur] )==0 ); iCur++; } assert ( dTests2[iCur]==NULL ); iCur++; } // test that decoder does not go over the buffer boundary on errors in UTF-8 if ( bUTF8 ) { printf ( "%s for proper UTF-8 error handling\n", sPrefix ); const char * sLine3 = "hi\xd0\xffh"; pTokenizer->SetBuffer ( (BYTE*)sLine3, 4 ); assert ( !strcmp ( (char*)pTokenizer->GetToken(), "hi" ) ); } // test uberlong tokens printf ( "%s for uberlong token handling\n", sPrefix ); const int UBERLONG = 4096; char * sLine4 = new char [ UBERLONG+1 ]; memset ( sLine4, 'a', UBERLONG ); sLine4[UBERLONG] = '\0'; char sTok4[SPH_MAX_WORD_LEN+1]; memset ( sTok4, 'a', SPH_MAX_WORD_LEN ); sTok4[SPH_MAX_WORD_LEN] = '\0'; pTokenizer->SetBuffer ( (BYTE*)sLine4, strlen(sLine4) ); assert ( !strcmp ( (char*)pTokenizer->GetToken(), sTok4 ) ); assert ( pTokenizer->GetToken()==NULL ); // test short word callbacks printf ( "%s for short token handling\n", sPrefix ); ISphTokenizer * pShortTokenizer = pTokenizer->Clone ( bEscaped ); CSphRemapRange tStar ( '*', '*', '*' ); pShortTokenizer->AddCaseFolding ( tStar ); CSphTokenizerSettings tSettings = pShortTokenizer->GetSettings(); tSettings.m_iMinWordLen = 5; pShortTokenizer->Setup ( tSettings ); pShortTokenizer->EnableQueryParserMode ( true ); const char * dTestsShort[] = { "ab*", "ab*", NULL, "*ab", "*ab", NULL, "abcdef", "abcdef", NULL, "ab *ab* abc", "*ab*", NULL, NULL }; for ( int iCur=0; dTestsShort[iCur]; ) { pShortTokenizer->SetBuffer ( (BYTE*)(dTestsShort [iCur]), strlen ( (const char*)dTestsShort [iCur] ) ); iCur++; for ( BYTE * pToken=pShortTokenizer->GetToken(); pToken; pToken=pShortTokenizer->GetToken() ) { assert ( dTestsShort[iCur] && strcmp ( (const char*)pToken, dTestsShort[iCur] )==0 ); iCur++; } assert ( !dTestsShort [iCur] ); iCur++; } SafeDelete ( pShortTokenizer ); // test uberlong synonym-only tokens if ( iRun==2 ) { printf ( "%s for uberlong synonym-only char token handling\n", sPrefix ); memset ( sLine4, '/', UBERLONG ); sLine4[UBERLONG] = '\0'; pTokenizer->SetBuffer ( (BYTE*)sLine4, strlen(sLine4) ); assert ( pTokenizer->GetToken()==NULL ); printf ( "%s for uberlong synonym token handling\n", 
sPrefix );

			for ( int i=0; i<UBERLONG/3; i++ )
				strncpy ( sLine4+i*3, "aa ", 3 );
			sLine4[UBERLONG] = '\0';

			pTokenizer->SetBuffer ( (BYTE*)sLine4, strlen(sLine4) );
			for ( int i=0; i<UBERLONG/3; i++ )
				assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "aa" ) );
			assert ( pTokenizer->GetToken()==NULL );
		}
		SafeDeleteArray ( sLine4 );

		// test boundaries
		printf ( "%s for boundaries handling, run=%d\n", sPrefix, iRun );

		CSphString sError;
		assert ( pTokenizer->SetBoundary ( "?", sError ) );

		char sLine5[] = "hello world? testing boundaries?";
		pTokenizer->SetBuffer ( (BYTE*)sLine5, strlen(sLine5) );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "hello" ) ); assert ( !pTokenizer->GetBoundary() );
		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "world" ) ); assert ( !pTokenizer->GetBoundary() );
		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "testing" ) ); assert ( pTokenizer->GetBoundary() );
		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "boundaries" ) ); assert ( !pTokenizer->GetBoundary() );

		// test specials vs token start/end ptrs
		printf ( "%s vs specials vs token start/end ptrs\n", sPrefix );

		char sLine6[] = "abc!def";
		pTokenizer->SetBuffer ( (BYTE*)sLine6, strlen(sLine6) );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "abc" ) );
		assert ( *pTokenizer->GetTokenStart()=='a' );
		assert ( *pTokenizer->GetTokenEnd()=='!' );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "!" ) );
		assert ( *pTokenizer->GetTokenStart()=='!' );
		assert ( *pTokenizer->GetTokenEnd()=='d' );

		assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "def" ) );
		assert ( *pTokenizer->GetTokenStart()=='d' );
		assert ( *pTokenizer->GetTokenEnd()=='\0' );

		// done
		SafeDelete ( pTokenizer );
	}
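	// A note on the blend_chars cases below: a blended character (here, the dot)
	// is indexed both as a valid word character and as a separator. Judging by
	// the asserts that follow (this is inferred from the test expectations, not
	// from any tokenizer documentation), the tokenizer first returns the whole
	// blended token ("lock.up", flagged by TokenIsBlended()), and then its
	// individual parts ("lock", "up", flagged by TokenIsBlendedPart()), unless
	// SkipBlended() is called to skip the parts.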
	// test blended
	printf ( "%s vs escaping vs blend_chars edge cases\n", sPrefix );

	CSphString sError;
	ISphTokenizer * pTokenizer = CreateTestTokenizer ( bUTF8, false, true ); // no exceptions, but escaped
	pTokenizer->AddSpecials ( "()!-\"" );
	assert ( pTokenizer->SetBlendChars ( ".", sError ) );

	char sTest1[] = "(texas.\\\")";
	pTokenizer->SetBuffer ( (BYTE*)sTest1, strlen(sTest1) );

	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "(" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "texas." ) );
	assert ( pTokenizer->TokenIsBlended() );
	pTokenizer->SkipBlended ();
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), ")" ) );
	assert ( pTokenizer->GetToken()==NULL );

	char sTest2[] = "\"series 2003\\-\\\"\"";
	printf ( "test %s\n", sTest2 );
	pTokenizer->SetBuffer ( (BYTE*)sTest2, strlen(sTest2) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "\"" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "series" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "2003-" ) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "\"" ) );
	assert ( pTokenizer->GetToken()==NULL );

	char sTest3[] = "aa lock.up bb";
	printf ( "test %s\n", sTest3 );
	pTokenizer->SetBuffer ( (BYTE*)sTest3, strlen(sTest3) );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "aa" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( !pTokenizer->TokenIsBlendedPart() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "lock.up" ) );
	assert ( pTokenizer->TokenIsBlended() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "lock" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( pTokenizer->TokenIsBlendedPart() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "up" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( pTokenizer->TokenIsBlendedPart() );
	assert ( !strcmp ( (const char*)pTokenizer->GetToken(), "bb" ) );
	assert ( !pTokenizer->TokenIsBlended() );
	assert ( !pTokenizer->TokenIsBlendedPart() );

	SafeDelete ( pTokenizer );
}

void BenchTokenizer ( bool bUTF8 )
{
	printf ( "benchmarking %s tokenizer\n", bUTF8 ? "UTF8" : "SBCS" );
	if ( !CreateSynonymsFile ( NULL ) )
	{
		printf ( "benchmark failed: error writing temp synonyms file\n" );
		return;
	}

	const char * sTestfile = "./configure";
	for ( int iRun=1; iRun<=2; iRun++ )
	{
		FILE * fp = fopen ( sTestfile, "rb" );
		if ( !fp )
		{
			printf ( "benchmark failed: error opening %s\n", sTestfile );
			return;
		}
		const int MAX_DATA = 10485760;
		char * sData = new char [ MAX_DATA ];
		int iData = fread ( sData, 1, MAX_DATA, fp );
		fclose ( fp );
		if ( iData<=0 )
		{
			printf ( "benchmark failed: error reading %s\n", sTestfile );
			SafeDeleteArray ( sData );
			return;
		}

		CSphString sError;
		ISphTokenizer * pTokenizer = bUTF8 ? sphCreateUTF8Tokenizer () : sphCreateSBCSTokenizer ();
		pTokenizer->SetCaseFolding ( "-, 0..9, A..Z->a..z, _, a..z", sError );
		if ( iRun==2 )
			pTokenizer->LoadSynonyms ( g_sTmpfile, sError );
		pTokenizer->AddSpecials ( "!-" );

		const int iPasses = 10;
		int iTokens = 0;

		int64_t tmTime = -sphMicroTimer();
		for ( int iPass=0; iPass<iPasses; iPass++ )
		{
			pTokenizer->SetBuffer ( (BYTE*)sData, iData );
			while ( pTokenizer->GetToken() )
				iTokens++;
		}
		tmTime += sphMicroTimer();

		iTokens /= iPasses;
		tmTime /= iPasses;

		printf ( "run %d: %d bytes, %d tokens, %d.%03d ms, %.3f MB/sec\n", iRun, iData, iTokens,
			(int)(tmTime/1000), (int)(tmTime%1000), float(iData)/tmTime );
		SafeDeleteArray ( sData );
	}
}

//////////////////////////////////////////////////////////////////////////

void TestStripper ()
{
	const char * sTests[][4] =
	{
		// source-data, index-attrs, remove-elements, expected-results
		{ "trivial test", "", "", " trivial test " },
		{ "lets \"niceindex attrs", "img=alt", "", " lets nice picture index attrs " },
		{ " lets alsoremove scripts", "", "script, style", " lets also remove scripts" },
		{ "testing inline elements", "", "", "testing inline elements" },
		{ "testing non

inlineelements", "", "", "testing non inline elements" }, { "testing entities&stuff", "", "", "testing entities&stuff" }, { "testing АБВ utf encoding", "", "", "testing \xD0\x90\xD0\x91\xD0\x92 utf encoding" }, { "testing <1 <\" <\x80 <\xe0 <\xff ents", "", "", "testing comments" }, { "< > ϑ &somethingverylong; &the", "", "", "< > \xCF\x91 &somethingverylong; &the" }, { "testing \"niceinline tags vs attr indexing", "img=alt,rel", "", "testing nice picture anotherattr inline tags vs attr indexing" }, { "this away", "", "", "this away" }, { "content1", "a=title", "", "content1" }, { "content2", "a=title", "", "my test title content2" }, { "testing \"niceinline tags vs attr indexing", "img=alt,rel", "", "testing nice picture anotherattr inline tags vs attr indexing" }, { "test", "", "", " test " }, { "cantest", "", "", " test " }, { "
ohai
", "", "", " ohai " }, { "ohai23", "", "", "ohai 3" }, { "ohai
4", "", "", "ohai 4" }, { "ohai
5", "", "", "ohai 5" }, { "ohai
6
some more content", "", "", "ohai 6 some more content" }, { "ohaib\">7", "", "", "ohai 7" }, { "ohai
b\">8", "", "", "ohai 8" }, { "ohai
b\">9", "", "", "ohai 9" }, { "ohai
b\">10", "", "", "ohai 10" }, { "ohai
611
gimme more", "", "", "ohai 11 gimme more" }, { "

Commission File Number: 333-155507", "", "", " Commission File Number: 333-155507" },
		{ "SGX", "", "", " SGX" },
		{ "tango & cash", "", "", "tango & cash" },
		{ "ahoy\"mate", "font=zzz", "", "ahoy\"mate" },
		{ "ahoy2", "font=zzz", "", "ahoy2" },
		{ "ahoy3there", "font=zzz", "", "ahoy3there" },
		{ "ahoyb\">4", "font=zzz", "", "ahoy4" },
		{ "ahoyb\">5", "font=zzz", "", "ahoy5" },
		{ "ahoy6seveneight", "font=zzz", "", "ahoyseveneight" }
	};

	int nTests = (int)(sizeof(sTests)/sizeof(sTests[0]));
	for ( int iTest=0; iTest<nTests; iTest++ )
	{
		printf ( "testing HTML stripper, test %d/%d... ", 1+iTest, nTests );

		CSphString sError;
		CSphHTMLStripper tStripper;
		Verify ( tStripper.SetIndexedAttrs ( sTests[iTest][1], sError ) );
		Verify ( tStripper.SetRemovedElements ( sTests[iTest][2], sError ) );

		CSphString sBuf ( sTests[iTest][0] );
		tStripper.Strip ( (BYTE*)sBuf.cstr() );
		assert ( strcmp ( sBuf.cstr(), sTests[iTest][3] )==0 );

		printf ( "ok\n" );
	}
}

void BenchStripper ()
{
	printf ( "benchmarking HTML stripper\n" );

	FILE * fp = fopen ( "doc/sphinx.html", "rb" );
	if ( !fp )
	{
		printf ( "benchmark failed: unable to read doc/sphinx.html\n" );
		return;
	}

	const int MAX_SIZE = 10485760;
	char * sBuf = new char [ MAX_SIZE ];
	int iLen = fread ( sBuf, 1, MAX_SIZE-1, fp );
	sBuf[iLen] = '\0';
	fclose ( fp );

	char * sRef = new char [ MAX_SIZE ];
	memcpy ( sRef, sBuf, iLen+1 );

	for ( int iRun=0; iRun<2; iRun++ )
	{
		CSphString sError;
		CSphHTMLStripper tStripper;
		if ( iRun==1 )
			tStripper.SetRemovedElements ( "style, script", sError );

		const int iPasses = 50;
		int64_t tmTime = -sphMicroTimer();
		for ( int iPass=0; iPass<iPasses; iPass++ )
		{
			memcpy ( sBuf, sRef, iLen+1 );
			tStripper.Strip ( (BYTE*)sBuf );
		}
		tmTime += sphMicroTimer();

		tmTime /= iPasses;
		printf ( "run %d: %d bytes, %d.%03d ms, %.3f MB/sec\n", iRun, iLen,
			(int)(tmTime/1000), (int)(tmTime%1000), float(iLen)/tmTime );
	}

	SafeDeleteArray ( sBuf );
	SafeDeleteArray ( sRef );
}

//////////////////////////////////////////////////////////////////////////

void TestExpr ()
{
	CSphColumnInfo tCol;
	tCol.m_eAttrType = SPH_ATTR_INTEGER;

	CSphSchema tSchema;
	tCol.m_sName = "aaa"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "bbb"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "ccc"; tSchema.AddAttr ( tCol, false );

	CSphRowitem * pRow = new CSphRowitem [ tSchema.GetRowSize() ];
	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
		sphSetRowAttr ( pRow, tSchema.GetAttr(i).m_tLocator, 1+i );

	CSphMatch tMatch;
	tMatch.m_iDocID = 123;
	tMatch.m_iWeight = 456;
	tMatch.m_pStatic = pRow;

	struct ExprTest_t
	{
		const char *	m_sExpr;
		float			m_fValue;
	};
	ExprTest_t dTests[] =
	{
		{ "if(2>=2,3,4)", 3.0f },
		{ "pow(7,5)", 16807.f },
		{ "sqrt(3)", 1.7320508f },
		{ "log2((2+2)*(2+2))", 4.0f },
		{ "min(3,15)", 3.0f },
		{ "max(3,15)", 15.0f },
		{ "if(3<15,bbb,ccc)", 2.0f },
		{ "@id+@weight", 579.0f },
		{ "abs(-3-ccc)", 6.0f },
		{ "(aaa+bbb)*(ccc-aaa)", 6.0f },
		{ "(((aaa)))", 1.0f },
		{ "aaa-bbb*ccc", -5.0f },
		{ " aaa -\tbbb *\t\t\tccc ", -5.0f },
		{ "bbb+123*aaa", 125.0f },
		{ "2.000*2e+1+2", 42.0f },
		{ "3<5", 1.0f },
		{ "1 + 2*3 > 4*4", 0.0f },
		{ "aaa/-bbb", -0.5f },
		{ "-10*-10", 100.0f },
		{ "aaa+-bbb*-5", 11.0f },
		{ "-aaa>-bbb", 1.0f },
		{ "1-aaa+2-3+4", 3.0f },
		{ "bbb/1*2/6*3", 2.0f },
		{ "(aaa+bbb)/sqrt(3)/sqrt(3)", 1.0f },
		{ "aaa-bbb-2", -3.0f },
		{ "ccc/2*4/bbb", 3.0f },
		{ "(2+(aaa*bbb))+3", 7.0f }
	};

	const int nTests = sizeof(dTests)/sizeof(dTests[0]);
	for ( int iTest=0; iTest<nTests; iTest++ )
	{
		printf ( "testing expression evaluation, test %d/%d... ", 1+iTest, nTests );

		CSphString sError;
		CSphScopedPtr<ISphExpr> pExpr ( sphExprParse ( dTests[iTest].m_sExpr, tSchema, NULL, NULL, sError ) );
		if ( !pExpr.Ptr() )
		{
			printf ( "FAILED; %s\n", sError.cstr() );
			assert ( 0 );
		}

		float fValue = pExpr->Eval(tMatch);
		if ( fabs ( fValue - dTests[iTest].m_fValue )>=0.0001f )
		{
			printf ( "FAILED; expected %.3f, got %.3f\n", dTests[iTest].m_fValue, fValue );
			assert ( 0 );
		}

		printf ( "ok\n" );
	}

	SafeDeleteArray ( pRow );
}

#if USE_WINDOWS
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE
#endif

#define AAA float(tMatch.m_pStatic[0])
#define BBB float(tMatch.m_pStatic[1])
#define CCC float(tMatch.m_pStatic[2])

NOINLINE float ExprNative1 ( const CSphMatch & tMatch )	{ return AAA+BBB*CCC-1.0f; }
NOINLINE float ExprNative2 ( const CSphMatch & tMatch )	{ return AAA+BBB*CCC*2.0f-3.0f/4.0f*5.0f/6.0f*BBB; }
NOINLINE float ExprNative3 ( const CSphMatch & )		{ return (float)sqrt ( 2.0f ); }

void BenchExpr ()
{
	printf ( "benchmarking expressions\n" );

	CSphColumnInfo tCol;
	tCol.m_eAttrType = SPH_ATTR_INTEGER;

	CSphSchema tSchema;
	tCol.m_sName = "aaa"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "bbb"; tSchema.AddAttr ( tCol, false );
	tCol.m_sName = "ccc"; tSchema.AddAttr ( tCol, false );

	CSphRowitem * pRow = new CSphRowitem [ tSchema.GetRowSize() ];
	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
		sphSetRowAttr ( pRow, tSchema.GetAttr(i).m_tLocator, 1+i );

	CSphMatch tMatch;
	tMatch.m_iDocID = 123;
	tMatch.m_iWeight = 456;
	tMatch.m_pStatic = pRow;

	struct ExprBench_t
	{
		const char *	m_sExpr;
		float			( *m_pFunc )( const CSphMatch & );
	};
	ExprBench_t dBench[] =
	{
		{ "aaa+bbb*(ccc)-1",			ExprNative1 },
		{ "aaa+bbb*ccc*2-3/4*5/6*bbb",	ExprNative2 },
		{ "sqrt(2)",					ExprNative3 }
	};

	for ( int iRun=0; iRun<(int)(sizeof(dBench)/sizeof(dBench[0])); iRun++ )
	{
		printf ( "benchmarking: %s\n", dBench[iRun].m_sExpr );

		ESphAttr uType;
		CSphString sError;
		CSphScopedPtr<ISphExpr> pExpr ( sphExprParse ( dBench[iRun].m_sExpr, tSchema, &uType, NULL, sError ) );
		if ( !pExpr.Ptr() )
		{
			printf ( "FAILED; %s\n", sError.cstr() );
			return;
		}

		const int NRUNS = 1000000;

		volatile float fValue = 0.0f;
		int64_t tmTime = sphMicroTimer();
		for ( int i=0; i<NRUNS; i++ )
			fValue += pExpr->Eval(tMatch);
		tmTime = sphMicroTimer() - tmTime;

		int64_t tmTimeInt = sphMicroTimer();
		if ( uType==SPH_ATTR_INTEGER )
		{
			int uValue = 0;
			for ( int i=0; i<NRUNS; i++ )
				uValue += pExpr->IntEval(tMatch);
		}
		tmTimeInt = sphMicroTimer() - tmTimeInt;

		int64_t tmTimeNative = sphMicroTimer();
		for ( int i=0; i<NRUNS; i++ )
			fValue += dBench[iRun].m_pFunc ( tMatch );
		tmTimeNative = sphMicroTimer() - tmTimeNative;

		printf ( "eval=%d.%03d msec, inteval=%d.%03d msec, native=%d.%03d msec\n",
			(int)(tmTime/1000), (int)(tmTime%1000),
			(int)(tmTimeInt/1000), (int)(tmTimeInt%1000),
			(int)(tmTimeNative/1000), (int)(tmTimeNative%1000) );
	}

	SafeDeleteArray ( pRow );
}
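// Benchmark methodology note (inferred from the code above, which carries no
// explanation): fValue is volatile and the native baselines are marked
// NOINLINE so the compiler can neither fold the expressions at compile time
// nor optimize the timing loops away entirely, which keeps the
// parsed-expression vs native-code comparison meaningful.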
//////////////////////////////////////////////////////////////////////////

static CSphString ReconstructNode ( const XQNode_t * pNode, const CSphSchema & tSchema )
{
	CSphString sRes ( "" );
	if ( !pNode )
		return sRes;

	if ( pNode->m_dWords.GetLength() )
	{
		// say just words to me
		const CSphVector<XQKeyword_t> & dWords = pNode->m_dWords;
		ARRAY_FOREACH ( i, dWords )
			sRes.SetSprintf ( "%s %s", sRes.cstr(), dWords[i].m_sWord.cstr() );
		sRes.Chop ();

		switch ( pNode->GetOp() )
		{
			case SPH_QUERY_AND: break;
			case SPH_QUERY_PHRASE: sRes.SetSprintf ( "\"%s\"", sRes.cstr() ); break;
			case SPH_QUERY_PROXIMITY: sRes.SetSprintf ( "\"%s\"~%d", sRes.cstr(), pNode->m_iOpArg ); break;
			case SPH_QUERY_QUORUM: sRes.SetSprintf ( "\"%s\"/%d", sRes.cstr(), pNode->m_iOpArg ); break;
			case SPH_QUERY_NEAR: sRes.SetSprintf ( "\"%s\"NEAR/%d", sRes.cstr(), pNode->m_iOpArg ); break;
			default: assert ( 0 && "unexpected op in ReconstructNode()" ); break;
		}

		if ( !pNode->m_dFieldMask.TestAll(true) )
		{
			CSphString sFields ( "" );
			for ( int i=0; i<tSchema.m_dFields.GetLength(); i++ )
				if ( pNode->m_dFieldMask.Test(i) )
					sFields.SetSprintf ( "%s,%s", sFields.cstr(), tSchema.m_dFields[i].m_sName.cstr() );

			sRes.SetSprintf ( "( @%s: %s )", sFields.cstr()+1, sRes.cstr() );
		} else
		{
			if ( pNode->GetOp()==SPH_QUERY_AND && dWords.GetLength()>1 )
				sRes.SetSprintf ( "( %s )", sRes.cstr() ); // wrap bag of words
		}
	} else
	{
		ARRAY_FOREACH ( i, pNode->m_dChildren )
		{
			if ( !i )
				sRes = ReconstructNode ( pNode->m_dChildren[i], tSchema );
			else
			{
				const char * sOp = "(unknown-op)";
				switch ( pNode->GetOp() )
				{
					case SPH_QUERY_AND: sOp = "AND"; break;
					case SPH_QUERY_OR: sOp = "OR"; break;
					case SPH_QUERY_NOT: sOp = "NOT"; break;
					case SPH_QUERY_ANDNOT: sOp = "AND NOT"; break;
					case SPH_QUERY_BEFORE: sOp = "BEFORE"; break;
					case SPH_QUERY_NEAR: sOp = "NEAR"; break;
					default: assert ( 0 && "unexpected op in ReconstructNode()" ); break;
				}
				sRes.SetSprintf ( "%s %s %s", sRes.cstr(), sOp, ReconstructNode ( pNode->m_dChildren[i], tSchema ).cstr() );
			}
		}

		if ( pNode->m_dChildren.GetLength()>1 )
			sRes.SetSprintf ( "( %s )", sRes.cstr() );
	}

	return sRes;
}

void TestQueryParser ()
{
	CSphString sTmp;

	CSphSchema tSchema;
	CSphColumnInfo tCol;
	tCol.m_sName = "title"; tSchema.m_dFields.Add ( tCol );
	tCol.m_sName = "body"; tSchema.m_dFields.Add ( tCol );

	CSphDictSettings tDictSettings;
	CSphScopedPtr<ISphTokenizer> pTokenizer ( sphCreateSBCSTokenizer () );
	CSphScopedPtr<CSphDict> pDict ( sphCreateDictionaryCRC ( tDictSettings, pTokenizer.Ptr(), sTmp, "query" ) );
	assert ( pTokenizer.Ptr() );
	assert ( pDict.Ptr() );

	CSphTokenizerSettings tTokenizerSetup;
	tTokenizerSetup.m_iMinWordLen = 2;
	tTokenizerSetup.m_sSynonymsFile = g_sTmpfile;
	pTokenizer->Setup ( tTokenizerSetup );

	CSphString sError;
	assert ( CreateSynonymsFile ( NULL ) );
	assert ( pTokenizer->LoadSynonyms ( g_sTmpfile, sError ) );

	struct QueryTest_t
	{
		const char *	m_sQuery;
		const char *	m_sReconst;
	};
	const QueryTest_t dTest[] =
	{
		{ "aaa bbb ccc", "( aaa AND bbb AND ccc )" },
		{ "aaa|bbb ccc", "( ( aaa OR bbb ) AND ccc )" },
		{ "aaa bbb|ccc", "( aaa AND ( bbb OR ccc ) )" },
		{ "aaa (bbb ccc)|ddd", "( aaa AND ( ( bbb AND ccc ) OR ddd ) )" },
		{ "aaa bbb|(ccc ddd)", "( aaa AND ( bbb OR ( ccc AND ddd ) ) )" },
		{ "aaa bbb|(ccc ddd)|eee|(fff)", "( aaa AND ( bbb OR ( ccc AND ddd ) OR eee OR fff ) )" },
		{ "aaa bbb|(ccc ddd) eee|(fff)", "( aaa AND ( bbb OR ( ccc AND ddd ) ) AND ( eee OR fff ) )" },
		{ "aaa (ccc ddd)|bbb|eee|(fff)", "( aaa AND ( ( ccc AND ddd ) OR bbb OR eee OR fff ) )" },
		{ "aaa (ccc ddd)|bbb eee|(fff)", "( aaa AND ( ( ccc AND ddd ) OR bbb ) AND ( eee OR fff ) )" },
		{ "aaa \"bbb ccc\"~5|ddd", "( aaa AND ( \"bbb ccc\"~5 OR ddd ) )" },
		{ "aaa bbb|\"ccc ddd\"~5", "( aaa AND ( bbb OR \"ccc ddd\"~5 ) )" },
		{ "aaa ( ( \"bbb ccc\"~3|ddd ) eee | ( fff -ggg ) )", "( aaa AND ( ( \"bbb ccc\"~3 OR ddd ) AND ( eee OR ( fff AND NOT ggg ) ) ) )" },
		{ "@title aaa @body ccc|(@title ddd eee)|fff ggg", "( ( @title: aaa ) AND ( ( @body: ccc ) OR ( ( @title: ddd ) AND ( @title: eee ) ) OR ( @body: fff ) ) AND ( @body: ggg ) )" },
		{ "@title hello world | @body sample program", "( ( @title: hello ) AND ( ( @title: world ) OR ( @body: sample ) ) AND ( @body: program ) )" },
		{ "@title one two three four", "( ( @title: one ) AND ( @title: two ) AND ( @title: three ) AND ( @title: four ) )" },
		{ "@title
one (@body two three) four", "( ( @title: one ) AND ( ( @body: two ) AND ( @body: three ) ) AND ( @title: four ) )" }, { "windows 7 2000", "( windows AND 2000 )" }, { "aaa a|bbb", "( aaa AND bbb )" }, { "aaa bbb|x y z|ccc", "( aaa AND bbb AND ccc )" }, { "a", "" }, { "hello -world", "( hello AND NOT world )" }, { "-hello world", "( world AND NOT hello )" }, { "\"phrase (query)/3 ~on steroids\"", "\"phrase query on steroids\"" }, { "hello a world", "( hello AND world )" }, { "-one", "" }, { "-one -two", "" }, { "\"\"", "" }, { "\"()\"", "" }, { "\"]\"", "" }, { "@title hello @body -world", "( ( @title: hello ) AND NOT ( @body: world ) )" }, { "Ms-Dos", "MS-DOS" } }; int nTests = sizeof(dTest)/sizeof(dTest[0]); for ( int i=0; i=0; i++ ) pRow += sphPackStrlen ( pRow, dValues[i] ); const BYTE * pUnp = dBuffer; for ( int i=0; dValues[i]>=0; i++ ) { int iUnp = sphUnpackStr ( pUnp, &pUnp ); assert ( iUnp==dValues[i] ); } printf ( "ok\n" ); } #endif ////////////////////////////////////////////////////////////////////////// void BenchLocators () { const int MAX_ITEMS = 10; const int NUM_MATCHES = 1000; const int NUM_RUNS = 100000; CSphRowitem dStatic[MAX_ITEMS]; CSphRowitem dDynamic[MAX_ITEMS]; CSphAttrLocator tLoc[NUM_MATCHES]; CSphMatch tMatch[NUM_MATCHES]; for ( int i=0; i(1+i) ) ); } for ( int i=0; iLock(); for ( int i=0; i<100; i++ ) g_iMutexBench++; g_iMutexBench -= 99; pMutex->Unlock(); } void BenchThreads () { printf ( "benchmarking threads\n" ); const int BATCHES = 100; const int BATCH_THREADS = 100; const int TOTAL_THREADS = BATCHES*BATCH_THREADS; SphThread_t * pThd = new SphThread_t [ BATCH_THREADS ]; CSphMutex tMutex; if ( !tMutex.Init() ) sphDie ( "failed to init mutex" ); for ( int iRun=1; iRun<=2; iRun++ ) { int64_t tmThd = sphMicroTimer(); for ( int iBatch=0; iBatch>3 ) & 1; } typedef void (*SortDataGen_fn)( DWORD *, int ); struct SortDataGenDesc_t { SortDataGen_fn m_fnGen; const char * m_sName; }; SortDataGenDesc_t g_dSortDataGens[] = { { SortDataRepeat1245, "repeat1245" }, { SortDataEnd0, "end0" }, { SortDataIdentical, "identical" }, { SortDataMed3Killer, "med3killer" }, { SortDataMidKiller, "midkiller" }, { SortDataRandDupes, "randdupes" }, { SortDataRandUniq, "randuniq" }, { SortDataRandSteps, "randsteps" }, { SortDataRevEnds, "revends" }, { SortDataRevPartial, "revpartial" }, { SortDataRevSaw, "revsaw" }, { SortDataReverse, "reverse" }, { SortDataStart1000, "start1000" }, { SortDataSeqPartial, "seqpartial" }, { SortDataSeqSaw, "seqsaw" }, { SortDataSeq, "sequential" }, { SortDataAscDesc, "ascdesc" }, { SortDataDescAsc, "descasc" }, { SortDataRand01, "rand01" }, }; struct SortPayload_t { DWORD m_uKey; DWORD m_uPayload[3]; bool operator < ( const SortPayload_t & rhs ) const { return m_uKey < rhs.m_uKey; } }; inline bool operator < ( const CSphWordHit & a, const CSphWordHit & b ) { return ( a.m_iWordID int64_t BenchSort ( T * pData, int iCount, bool bCheck ) { int64_t tmSort = sphMicroTimer(); sphSort ( pData, iCount ); tmSort = sphMicroTimer() - tmSort; if ( bCheck ) { for ( int i=0; i 1 ? 
sphCRC32 ( ( ( const BYTE * ) ( pData + 1 ) ), ( m_iStride - 1 ) * 4 ) : ( *pData ); } }; #ifndef NDEBUG static bool IsSorted ( DWORD * pData, int iCount, const TestAccCmp_fn & fn ) { if ( iCount<1 ) return true; const DWORD * pPrev = pData; if ( !fn.IsKeyDataSynced ( pPrev ) ) return false; if ( iCount<2 ) return true; for ( int i = 1; i < iCount; ++i ) { const DWORD * pCurr = fn.Add ( pData, i ); if ( fn.IsLess ( *pCurr , *pPrev ) || !fn.IsKeyDataSynced ( pCurr ) ) return false; pPrev = pCurr; } return true; } #endif void RandomFill ( DWORD * pData, int iCount, const TestAccCmp_fn & fn, bool bChainsaw ) { for ( int i = 0; i < iCount; ++i ) { DWORD * pCurr = fn.Add ( pData, i ); const DWORD * pNext = fn.Add ( pData, i + 1 ); DWORD * pElem = pCurr; DWORD * pChainHill = bChainsaw && ( i % 2 ) ? fn.Add ( pData, i -1 ) : NULL; do { *pElem = pChainHill ? *pChainHill / 2 : sphRand(); ++pElem; pChainHill = pChainHill ? pChainHill + 1 : pChainHill; } while ( pElem!=pNext ); *pCurr = fn.GenerateKey ( pCurr ); } } void TestStridedSortPass ( int iStride, int iCount ) { printf ( "testing strided sort, stride=%d, count=%d... ", iStride, iCount ); assert ( iStride && iCount ); DWORD * pData = new DWORD [ iCount * iStride ]; assert ( pData ); // checked elements are random memset ( pData, 0, sizeof ( DWORD ) * iCount * iStride ); TestAccCmp_fn fnSort ( iStride ); RandomFill ( pData, iCount, fnSort, false ); // crash on sort of mini-arrays TestAccCmp_fn fnSortDummy ( 1 ); DWORD dMini[1] = { 1 }; sphSort ( dMini, 1, fnSortDummy, fnSortDummy ); sphSort ( dMini, 0, fnSortDummy, fnSortDummy ); assert ( IsSorted ( dMini, 1, fnSortDummy ) ); // random sort sphSort ( pData, iCount, fnSort, fnSort ); assert ( IsSorted ( pData, iCount, fnSort ) ); // already sorted sort sphSort ( pData, iCount, fnSort, fnSort ); assert ( IsSorted ( pData, iCount, fnSort ) ); // reverse order sort for ( int i = 0; i < iCount; ++i ) { ::Swap ( pData[i], pData [ iCount - i - 1 ] ); } sphSort ( pData, iCount, fnSort, fnSort ); assert ( IsSorted ( pData, iCount, fnSort ) ); // random chainsaw sort RandomFill ( pData, iCount, fnSort, true ); sphSort ( pData, iCount, fnSort, fnSort ); assert ( IsSorted ( pData, iCount, fnSort ) ); printf ( "ok\n" ); SafeDeleteArray ( pData ); } void TestStridedSort () { TestStridedSortPass ( 1, 2 ); TestStridedSortPass ( 3, 2 ); TestStridedSortPass ( 37, 2 ); // SMALL_THRESH case TestStridedSortPass ( 1, 30 ); TestStridedSortPass ( 7, 13 ); TestStridedSortPass ( 113, 5 ); TestStridedSortPass ( 1, 1000 ); TestStridedSortPass ( 5, 1000 ); TestStridedSortPass ( 17, 50 ); TestStridedSortPass ( 31, 1367 ); // rand cases for ( int i = 0; i < 10; ++i ) { const int iRndStride = sphRand() % 64; const int iNrmStride = Max ( iRndStride, 1 ); const int iRndCount = sphRand() % 1000; const int iNrmCount = Max ( iRndCount, 1 ); TestStridedSortPass ( iNrmStride, iNrmCount ); } } ////////////////////////////////////////////////////////////////////////// const char * g_sFieldsData[] = { "33", "1033", "If I were a cat...", "We are the greatest cat" }; class SphTestDoc_c : public CSphSource_Document { public: explicit SphTestDoc_c ( const CSphSchema & tSchema ) : CSphSource_Document ( "test_doc" ) { m_tSchema = tSchema; } virtual BYTE ** NextDocument ( CSphString & ) { if ( m_tDocInfo.m_iDocID ) { m_tDocInfo.m_iDocID = 0; return NULL; } m_tDocInfo.m_iDocID++; return (BYTE **) &g_sFieldsData[2]; } bool Connect ( CSphString & ) { return true; } void Disconnect () {} bool HasAttrsConfigured () { return true; } bool 
IterateStart ( CSphString & ) { m_tDocInfo.Reset ( m_tSchema.GetRowSize() ); return true; }
	bool IterateMultivaluedStart ( int, CSphString & ) { return false; }
	bool IterateMultivaluedNext () { return false; }
	bool IterateFieldMVAStart ( int, CSphString & ) { return false; }
	bool IterateFieldMVANext () { return false; }
	bool IterateKillListStart ( CSphString & ) { return false; }
	bool IterateKillListNext ( SphDocID_t & ) { return false; }
};

#ifndef NDEBUG

static void CheckRT ( int iVal, int iRef, const char * sMsg )
{
#if 1
	assert ( iRef==iVal && sMsg );
#else
	if ( iRef!=iVal )
		printf ( "\t%s=%d ( %d )\n", sMsg, iVal, iRef );
#endif
}

static void DeleteIndexFiles ( const char * sIndex )
{
	if ( !sIndex )
		return;

	CSphString sName;
	sName.SetSprintf ( "%s.kill", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.lock", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.meta", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.ram", sIndex );
	unlink ( sName.cstr() );

	sName.SetSprintf ( "%s.0.spa", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spd", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.sph", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spi", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spk", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spm", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.spp", sIndex );
	unlink ( sName.cstr() );
	sName.SetSprintf ( "%s.0.sps", sIndex );
	unlink ( sName.cstr() );
}

void TestRTInit ()
{
	CSphConfigSection tRTConfig;

	sphRTInit();
	sphRTConfigure ( tRTConfig, true );

	SmallStringHash_T<CSphIndex*> hIndexes;
	sphReplayBinlog ( hIndexes );
}

#define RT_INDEX_FILE_NAME "test_temp"
#define RT_PASS_COUNT 5
static const int g_iWeights[RT_PASS_COUNT] = { 1500, 1500, 1500, 1500, 1500 }; // { 1500, 1302, 1252, 1230, 1219 };
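// The weights table above lists the match weight expected on every indexing
// pass; the commented-out alternative looks like an older ranker's output.
// TestRTWeightBoundary() below re-runs the same single-document RT indexing
// cycle RT_PASS_COUNT times and checks, via CheckRT(), that the reported
// weight does not drift between passes. (Purpose inferred from the code.)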
", 1+iPass, RT_PASS_COUNT ); TestRTInit (); CSphString sError; CSphDictSettings tDictSettings; ISphTokenizer * pTok = sphCreateUTF8Tokenizer(); CSphDict * pDict = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "weight" ); CSphColumnInfo tCol; CSphSchema tSrcSchema; CSphSourceSettings tParams; tSrcSchema.Reset(); tCol.m_sName = "channel_id"; tCol.m_eAttrType = SPH_ATTR_INTEGER; tSrcSchema.AddAttr ( tCol, true ); tCol.m_sName = "title"; tSrcSchema.m_dFields.Add ( tCol ); tCol.m_sName = "content"; tSrcSchema.m_dFields.Add ( tCol ); SphTestDoc_c * pSrc = new SphTestDoc_c ( tSrcSchema ); pSrc->SetTokenizer ( pTok ); pSrc->SetDict ( pDict ); pSrc->Setup ( tParams ); Verify ( pSrc->Connect ( sError ) ); Verify ( pSrc->IterateStart ( sError ) ); Verify ( pSrc->UpdateSchema ( &tSrcSchema, sError ) ); CSphSchema tSchema; // source schema must be all dynamic attrs; but index ones must be static tSchema.m_dFields = tSrcSchema.m_dFields; for ( int i=0; iSetTokenizer ( pTok ); // index will own this pair from now on pIndex->SetDictionary ( pDict ); Verify ( pIndex->Prealloc ( false, false, sError ) ); ISphHits * pHits; CSphVector dMvas; for ( ;; ) { Verify ( pSrc->IterateDocument ( sError ) ); if ( !pSrc->m_tDocInfo.m_iDocID ) break; pHits = pSrc->IterateHits ( sError ); if ( !pHits ) break; pIndex->AddDocument ( pHits, pSrc->m_tDocInfo, NULL, dMvas, sError ); pIndex->Commit (); } pSrc->Disconnect(); CheckRT ( pSrc->GetStats().m_iTotalDocuments, 1, "docs committed" ); CSphQuery tQuery; CSphQueryResult tResult; tQuery.m_sQuery = "@title cat"; ISphMatchSorter * pSorter = sphCreateQueue ( &tQuery, pIndex->GetMatchSchema(), tResult.m_sError, false ); assert ( pSorter ); Verify ( pIndex->MultiQuery ( &tQuery, &tResult, 1, &pSorter, NULL ) ); sphFlattenQueue ( pSorter, &tResult, 0 ); CheckRT ( tResult.m_dMatches.GetLength(), 1, "results found" ); CheckRT ( (int)tResult.m_dMatches[0].m_iDocID, 1, "docID" ); CheckRT ( tResult.m_dMatches[0].m_iWeight, g_iWeights[iPass], "weight" ); SafeDelete ( pSorter ); SafeDelete ( pIndex ); sphRTDone (); printf ( "ok\n" ); } DeleteIndexFiles ( RT_INDEX_FILE_NAME ); } void TestWriter() { printf ( "testing CSphWriter... 
" ); const CSphString sTmpWriteout = "__writeout.tmp"; CSphString sErr; #define WRITE_OUT_DATA_SIZE 0x40000 BYTE * pData = new BYTE[WRITE_OUT_DATA_SIZE]; memset ( pData, 0xfe, WRITE_OUT_DATA_SIZE ); { CSphWriter tWrDef; tWrDef.OpenFile ( sTmpWriteout, sErr ); tWrDef.PutBytes ( pData, WRITE_OUT_DATA_SIZE ); tWrDef.PutByte ( 0xff ); } { CSphWriter tWr; tWr.SetBufferSize ( WRITE_OUT_DATA_SIZE ); tWr.OpenFile ( sTmpWriteout, sErr ); tWr.PutBytes ( pData, WRITE_OUT_DATA_SIZE ); tWr.PutByte ( 0xff ); } unlink ( sTmpWriteout.cstr() ); printf ( "ok\n" ); } class SphDocRandomizer_c : public CSphSource_Document { static const int m_iMaxFields = 2; static const int m_iMaxFieldLen = 512; char m_dFields[m_iMaxFields][m_iMaxFieldLen]; BYTE * m_ppFields[m_iMaxFields]; public: explicit SphDocRandomizer_c ( const CSphSchema & tSchema ) : CSphSource_Document ( "test_doc" ) { m_tSchema = tSchema; for ( int i=0; i800 ) { m_tDocInfo.m_iDocID = 0; return NULL; } m_tDocInfo.m_iDocID++; m_tDocInfo.SetAttr ( m_tSchema.GetAttr(0).m_tLocator, m_tDocInfo.m_iDocID+1000 ); m_tDocInfo.SetAttr ( m_tSchema.GetAttr(1).m_tLocator, 1313 ); snprintf ( m_dFields[0], m_iMaxFieldLen, "cat title%d title%d title%d title%d title%d" , sphRand(), sphRand(), sphRand(), sphRand(), sphRand() ); snprintf ( m_dFields[1], m_iMaxFieldLen, "dog contentwashere%d contentwashere%d contentwashere%d contentwashere%d contentwashere%d" , sphRand(), sphRand(), sphRand(), sphRand(), sphRand() ); return &m_ppFields[0]; } bool Connect ( CSphString & ) { return true; } void Disconnect () {} bool HasAttrsConfigured () { return true; } bool IterateStart ( CSphString & ) { m_tDocInfo.Reset ( m_tSchema.GetRowSize() ); return true; } bool IterateMultivaluedStart ( int, CSphString & ) { return false; } bool IterateMultivaluedNext () { return false; } bool IterateFieldMVAStart ( int, CSphString & ) { return false; } bool IterateFieldMVANext () { return false; } bool IterateKillListStart ( CSphString & ) { return false; } bool IterateKillListNext ( SphDocID_t & ) { return false; } }; void TestRTSendVsMerge () { DeleteIndexFiles ( RT_INDEX_FILE_NAME ); printf ( "testing rt send result during merge... 
" ); TestRTInit (); CSphString sError; CSphDictSettings tDictSettings; ISphTokenizer * pTok = sphCreateUTF8Tokenizer(); CSphDict * pDict = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "rt" ); CSphColumnInfo tCol; CSphSchema tSrcSchema; CSphSourceSettings tParams; tSrcSchema.Reset(); tCol.m_sName = "title"; tSrcSchema.m_dFields.Add ( tCol ); tCol.m_sName = "content"; tSrcSchema.m_dFields.Add ( tCol ); tCol.m_sName = "tag1"; tCol.m_eAttrType = SPH_ATTR_INTEGER; tSrcSchema.AddAttr ( tCol, true ); tCol.m_sName = "tag2"; tCol.m_eAttrType = SPH_ATTR_INTEGER; tSrcSchema.AddAttr ( tCol, true ); SphDocRandomizer_c * pSrc = new SphDocRandomizer_c ( tSrcSchema ); pSrc->SetTokenizer ( pTok ); pSrc->SetDict ( pDict ); pSrc->Setup ( tParams ); Verify ( pSrc->Connect ( sError ) ); Verify ( pSrc->IterateStart ( sError ) ); Verify ( pSrc->UpdateSchema ( &tSrcSchema, sError ) ); CSphSchema tSchema; // source schema must be all dynamic attrs; but index ones must be static tSchema.m_dFields = tSrcSchema.m_dFields; for ( int i=0; iSetTokenizer ( pTok ); // index will own this pair from now on pIndex->SetDictionary ( pDict ); Verify ( pIndex->Prealloc ( false, false, sError ) ); CSphQuery tQuery; CSphQueryResult tResult; tQuery.m_sQuery = "@title cat"; ISphMatchSorter * pSorter = sphCreateQueue ( &tQuery, pIndex->GetMatchSchema(), tResult.m_sError, false ); assert ( pSorter ); CSphVector dMvas; for ( ;; ) { Verify ( pSrc->IterateDocument ( sError ) ); if ( !pSrc->m_tDocInfo.m_iDocID ) break; ISphHits * pHits = pSrc->IterateHits ( sError ); if ( !pHits ) break; pIndex->AddDocument ( pHits, pSrc->m_tDocInfo, NULL, dMvas, sError ); if ( pSrc->m_tDocInfo.m_iDocID==350 ) { pIndex->Commit (); Verify ( pIndex->MultiQuery ( &tQuery, &tResult, 1, &pSorter, NULL ) ); sphFlattenQueue ( pSorter, &tResult, 0 ); } } pIndex->Commit (); pSrc->Disconnect(); for ( int i=0; iSetCaseFolding ( "-, 0..9, A..Z->a..z, _, a..z, U+80..U+FF", sError ) ); // assert ( pTok->SetBlendChars ( "., &", sError ) ); // NOLINT assert ( pTok->EnableSentenceIndexing ( sError ) ); const char * SENTENCE = "\2"; // MUST be in sync with sphinx.cpp const char * sTest[] = { "Bill Gates Jr. attended", "bill", "gates", "jr", "attended", NULL, "Very good, Dr. Watson", "very", "good", "dr", "watson", NULL, "VERY GOOD, DR. WATSON", "very", "good", "dr", "watson", NULL, "He left US. Went abroad", "he", "left", "us", SENTENCE, "went", "abroad", NULL, "Known as Mr. Doe", "known", "as", "mr", "doe", NULL, "Survived by Mrs. Doe", "survived", "by", "mrs", "doe", NULL, "J. R. R. Tolkien", "j", "r", "r", "tolkien", NULL, "That is it. A boundary", "that", "is", "it", SENTENCE, "a", "boundary", NULL, "Just a sentence. And then some.", "just", "a", "sentence", SENTENCE, "and", "then", "some", SENTENCE, NULL, "Right, guy number two? Yes, guy number one!", "right", "guy", "number", "two", SENTENCE, "yes", "guy", "number", "one", SENTENCE, NULL, "S.T.A.L.K.E.R. sold well in the U.K and elsewhere. Including Russia.", "s", "t", "a", "l", "k", "e", "r", "sold", "well", "in", "the", "u", "k", "and", "elsewhere", SENTENCE, "including", "russia", SENTENCE, NULL, "Yoyodine Inc. exists since 1800", "yoyodine", "inc", "exists", "since", "1800", NULL, "John D. Doe, our CEO", "john", "d", "doe", "our", "ceo", NULL, "Yoyodine Inc. 
(the Company)", "yoyodine", "inc", "the", "company", NULL, NULL }; int i = 0; while ( sTest[i] ) { pTok->SetBuffer ( (BYTE*)sTest[i], strlen ( sTest[i] ) ); i++; BYTE * sTok; while ( ( sTok = pTok->GetToken() )!=NULL ) { assert ( !strcmp ( (char*)sTok, sTest[i] ) ); i++; } assert ( sTest[i]==NULL ); i++; } printf ( "ok\n" ); } ////////////////////////////////////////////////////////////////////////// void TestSpanSearch() { printf ( "testing span search... " ); CSphVector dVec; dVec.Add ( 1 ); dVec.Add ( 3 ); dVec.Add ( 4 ); assert ( FindSpan ( dVec, 1, 5 )==0 ); assert ( FindSpan ( dVec, 3, 5 )==1 ); assert ( FindSpan ( dVec, 4, 5 )==2 ); dVec.Add ( 15 ); dVec.Add ( 17 ); dVec.Add ( 22 ); dVec.Add ( 23 ); assert ( FindSpan ( dVec, 1, 5 )==0 ); assert ( FindSpan ( dVec, 18, 5 )==4 ); assert ( FindSpan ( dVec, 23, 5 )==6 ); printf ( "ok\n" ); } ////////////////////////////////////////////////////////////////////////// const char * CORPUS = "corpus.txt"; const int POOLSIZE = 80*1048576; const int GAP = 4; void BenchStemmer () { CSphString sError; #if SNOWBALL SN_env * pSnow = english_ISO_8859_1_create_env(); #if 1 char test[] = "this"; SN_set_current ( pSnow, strlen(test), (const symbol *)test ); pSnow->p [ pSnow->l ] = 0; english_ISO_8859_1_stem ( pSnow ); stem_en ( (BYTE*)test, strlen(test) ); #endif #endif #if PORTER1 struct stemmer * z = create_stemmer(); #endif BYTE * pRaw = new BYTE [ POOLSIZE ]; FILE * fp = fopen ( CORPUS, "rb" ); if ( !fp ) sphDie ( "fopen %s failed", CORPUS ); int iLen = fread ( pRaw, 1, POOLSIZE, fp ); printf ( "read %d bytes\n", iLen ); fclose ( fp ); ISphTokenizer * pTok = sphCreateSBCSTokenizer(); if ( !pTok->SetCaseFolding ( "A..Z->a..z, a..z", sError ) ) sphDie ( "oops: %s", sError.cstr() ); pTok->SetBuffer ( pRaw, iLen ); BYTE * pTokens = new BYTE [ POOLSIZE ]; BYTE * p = pTokens; BYTE * sTok; int iToks = 0; int iBytes = 0; int iStemmed = 0; while ( ( sTok = pTok->GetToken() )!=NULL ) { BYTE * pStart = p++; // 1 byte for length while ( *sTok ) *p++ = *sTok++; *pStart = (BYTE)( p-pStart-1 ); // store length for ( int i=0; i=pTokens+POOLSIZE ) sphDie ( "out of buffer at tok %d", iToks ); iToks++; } *p++ = '\0'; iBytes = (int)( p - pTokens ); printf ( "tokenized %d tokens\n", iToks ); #if 0 int dCharStats[256]; memset ( dCharStats, 0, sizeof(dCharStats) ); for ( BYTE * t = pTokens; tp, pSnow->l ); p[pSnow->l+1] = 0; #else // crosscheck char buf[256]; memcpy ( buf, p+1, *p+1 ); stem_en ( p+1, *p ); int ll = strlen ( (char*)p+1 ); if ( ll!=pSnow->l || memcmp ( p+1, pSnow->p, ll ) ) { pSnow->p[pSnow->l] = 0; printf ( "%s[%d] vs %s[%d] for orig %s\n", p+1, ll, pSnow->p, pSnow->l, buf ); iDiff++; } #endif #endif #if PORTER1 p [ stem ( z, (char*)p+1, *p-1 )+2 ] = 0; #endif p += *p + GAP + 1; iToks++; } tmStem = sphMicroTimer() - tmStem; if ( iDiff ) printf ( "%d tokens are different\n", iDiff ); if ( iStemmed ) printf ( "%d data bytes stemmed\n", iStemmed ); #if SNOWBALL english_ISO_8859_1_close_env ( pSnow ); #endif uint64_t uHash = sphFNV64 ( pTokens, iBytes ); printf ( "stemmed %d tokens (%d bytes) in %d msec, hash %08x %08x\n", iToks, iBytes, (int)(tmStem/1000), (DWORD)( uHash>>32 ), (DWORD)( uHash & 0xffffffffUL ) ); if ( uHash!=U64C ( 0x54ef4f21994b67db ) ) printf ( "ERROR, HASH MISMATCH\n" ); SafeDelete ( pTok ); SafeDeleteArray ( pRaw ); } int main () { printf ( "RUNNING INTERNAL LIBSPHINX TESTS\n\n" ); #if 0 BenchSort (); #endif #ifdef NDEBUG BenchStripper (); BenchTokenizer ( false ); BenchTokenizer ( true ); BenchExpr (); BenchLocators (); BenchThreads (); 
#else TestQueryParser (); TestStripper (); TestTokenizer ( false ); TestTokenizer ( true ); TestExpr (); TestMisc (); TestRwlock (); TestCleanup (); TestStridedSort (); TestRTWeightBoundary (); TestWriter(); TestRTSendVsMerge (); TestSentenceTokenizer (); TestSpanSearch (); #endif unlink ( g_sTmpfile ); printf ( "\nSUCCESS\n" ); return 0; } // // $Id$ //