sqlite3通过使用fts3虚表支持全文搜索,默认支持simple和porter两种分词器,并提供了接口来自定义分词器。这里我们利用mmseg来构造自定义的中文分词器。
虽然sqlite在fts3_tokenizer.h中提供了各种接口供用户自定义分词器,但其并未提供c函数供用户来注册自定义的分词器,分词器的注册必须使用sql语句来完成。
SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
其中tokenizer-name是分词器的名称,sqlite3_tokenizer_module ptr是一个指向sqlite3_tokenizer_module结构的指针,并且编码为SQL blob。下面是官方给出的注册函数:
/*
** Register a tokenizer module with a database connection by running
** "SELECT fts3_tokenizer(?,?)".
**
**   db     - open database connection to register the tokenizer on.
**   zName  - NUL-terminated name the tokenizer is registered under.
**   p      - tokenizer module; the pointer value itself is bound as a blob.
**
** Returns the sqlite3_prepare_v2() error code if the statement cannot be
** prepared, otherwise the result of sqlite3_finalize().
*/
int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  int rc;
  sqlite3_stmt *pStmt;
  const char *zSql = "SELECT fts3_tokenizer(?,?)";

  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* -1: let SQLite compute the length of zName up to its NUL terminator. */
  sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  /* Bind the bytes of the pointer (not the pointed-to struct). */
  sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  sqlite3_step(pStmt);

  return sqlite3_finalize(pStmt);
}
要想实现自定义的分词器,最关键的是得到一个指向sqlite3_tokenizer_module结构的指针,sqlite3_tokenizer_module结构体定义如下:
/*
** Table of callbacks that make up a tokenizer implementation.
*/
struct sqlite3_tokenizer_module {
  int iVersion;          /* Version number; must be set to 0 */

  /* Called automatically when the virtual table is created; creates the
  ** tokenizer. */
  int (*xCreate)(
    int argc,
    const char *const *argv,
    sqlite3_tokenizer **ppTokenizer
  );

  /* Called automatically when the database connection is closed; releases
  ** the tokenizer's resources. */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /* Called automatically on insert or search to start tokenizing a string. */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,
    const char *pInput, int nBytes,
    sqlite3_tokenizer_cursor **ppCursor
  );

  /* Called automatically once all tokens have been retrieved. */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /* Retrieves the tokens one at a time. */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,
    const char **ppToken, int *pnBytes,
    int *piStartOffset,
    int *piEndOffset,
    int *piPosition
  );
};
有几点需要注意的是:
1分词引擎使用sql语句注册意味着每建立一个sqlite连接都必须注册一次分词器,对于需要使用词库的中文分词器来说也意味着巨大的内存消耗。
2在检索时分词结果的提取和语义的解析是交替进行的。例如我们搜索"kanif OR sqlite"的时候,引擎先将全部传入到分词器,在调用一次next获取到词kanif后,再将词sqlite传入到分词器,直到全部解析完毕。
3由于中文分词本身的特殊性,例如"北京市"很有可能视为一个完整的词,这样在搜索"北京"的时候就无法获取到结果。如果分词器支持将"北京市"切分为"北京市"和"北京"或者将十一月切分为"11月"和"十一",那么需注意(*xNext)函数中的piStartOffset和piEndOffset参数。经测试在插入数据的时候这两个参数无实际用途,但在查询的时候这两个参数决定了下一次的输入串。
附:
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include"fts3_tokenizer.h"
#include "mmseg/mmseg.cpp"
/* Dictionary-loading flag: initmmseg() returns immediately when this is
** false, and otherwise loads the dictionaries and then clears it. */
static bool loadDic = true;
/*
** Custom tokenizer object. It carries no state of its own beyond the
** sqlite3_tokenizer base, which is the first member so that a pointer to
** the base can be cast back to cus_tokenizer.
*/
struct cus_tokenizer {
  sqlite3_tokenizer base;
};
typedef struct cus_tokenizer cus_tokenizer;
/*
** Cursor used while tokenizing one input string. The sqlite3_tokenizer_cursor
** base is the first member so the cursor can be cast to and from the base.
*/
struct cus_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  char *pInput;                /* Set to NULL by cusOpen; freed by cusClose */
  int nBytes;                  /* Length in bytes of the input text */
  int iToken;                  /* Position counter handed out by cusNext */
  char *pToken;                /* Buffer holding the most recent token */
  rmmseg::Algorithm *pAlgor;   /* mmseg segmenter created for this input */
};
typedef struct cus_tokenizer_cursor cus_tokenizer_cursor;
/*
** Load the mmseg dictionaries ("chars.dic" and "words.dic") the first time
** this is called. Later calls return immediately because loadDic has been
** cleared.
*/
void initmmseg(void){
  if( !loadDic ){
    return;
  }
  mmseg_load_words("chars.dic");
  mmseg_load_words("words.dic");
  loadDic = false;
}
static int cusCreate(
int argc,const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
cus_tokenizer *t;
t = (cus_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return sqlITE_NOMEM;
memset(t,sizeof(*t));
initmmseg();
*ppTokenizer = &t->base;
return sqlITE_OK;
}
/*
** xDestroy callback: free a tokenizer previously allocated by cusCreate.
** Always returns SQLITE_OK.
*/
static int cusDestroy(sqlite3_tokenizer *pTokenizer){
  sqlite3_free(pTokenizer);
  return SQLITE_OK;
}
static int cusOpen(
sqlite3_tokenizer*pTokenizer,
const char *pInput,
sqlite3_tokenizer_cursor**ppCursor
){
cus_tokenizer_cursor *c;
if(pInput == 0){
nBytes =0;
}else if(nBytes < 0)
nBytes = (int)strlen(pInput);
c = (cus_tokenizer_cursor *)sqlite3_malloc(sizeof(*c));
if(c == NULL)
return sqlITE_NOMEM;
c->iToken =c->nBytes = 0;
c->pInput = c->pToken =NULL;
c->pAlgor = mmseg_algor_create(pInput,nBytes);
c->nBytes = nBytes;
*ppCursor = &c->base;
return sqlITE_OK;
}
/*
** xClose callback: release everything owned by a cursor created by cusOpen
** (input copy, last token buffer, mmseg algorithm object) and then the
** cursor itself. Always returns SQLITE_OK.
*/
static int cusClose(sqlite3_tokenizer_cursor *pCursor){
  cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

  /* sqlite3_free() is a harmless no-op on NULL, so no guards are needed. */
  sqlite3_free(c->pInput);
  sqlite3_free(c->pToken);
  c->pInput = c->pToken = NULL;

  if( c->pAlgor!=NULL ){
    mmseg_algor_destroy(c->pAlgor);
    c->pAlgor = NULL;
  }

  sqlite3_free(c);
  return SQLITE_OK;
}
/*
** xNext callback: fetch the next token from the mmseg algorithm attached to
** the cursor.
**
**   pCursor        - cursor created by cusOpen.
**   ppToken        - receives a pointer to a NUL-terminated copy of the token.
**                    The buffer is owned by the cursor and is freed on the
**                    next call to cusNext or by cusClose.
**   pnBytes        - receives the token length in bytes.
**   piStartOffset  - receives the token's offset as reported by mmseg.
**   piEndOffset    - receives start offset + token length.
**   piPosition     - receives the running token index (0, 1, 2, ...).
**
** Returns SQLITE_OK when a token was produced, SQLITE_DONE when mmseg returns
** a zero-length token, or SQLITE_NOMEM if the token copy cannot be allocated.
*/
static int cusNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken,
  int *pnBytes,
  int *piStartOffset,
  int *piEndOffset,
  int *piPosition
){
  cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

  /* Drop the buffer handed out by the previous call. */
  sqlite3_free(c->pToken);
  c->pToken = NULL;

  struct Token token = mmseg_next_token(c->pAlgor);
  if( token.length!=0 ){
    int l = token.length;

    c->pToken = (char *)sqlite3_malloc(l+1);
    if( c->pToken==NULL ){
      return SQLITE_NOMEM;
    }
    memcpy(c->pToken, token.text, l);
    c->pToken[l] = 0;

    *ppToken = c->pToken;
    *pnBytes = l;
    *piStartOffset = token.offset;
    *piEndOffset = token.offset + token.length;
    *piPosition = c->iToken++;
    return SQLITE_OK;
  }

  /* Author's note: in general this point is only reached when inserting data. */
  return SQLITE_DONE;
}
/*
** Callback table for the custom tokenizer; a pointer to this object is what
** main() passes to registerTokenizer().
*/
static const sqlite3_tokenizer_module cusTokenizerModule = {
  0,            /* iVersion */
  cusCreate,    /* xCreate  */
  cusDestroy,   /* xDestroy */
  cusOpen,      /* xOpen    */
  cusClose,     /* xClose   */
  cusNext       /* xNext    */
};
int registerTokenizer(
sqlite3 *db,sqlITE_STATIC);
sqlite3_step(pStmt);
returnsqlite3_finalize(pStmt);
}
int main(){
constsqlite3_tokenizer_module *ptr =&cusTokenizerModule;
sqlite3*pDB;
sqlite3_stmt* stmt;
char *errMsg = NULL;
const char*zTail;
int rc =sqlite3_open("test.sqlite3",&pDB);
if(rc){
printf("create error. %s\n",sqlite3_errmsg(pDB));
return rc;
}
chartoken_name[] = "custoken";
registerTokenizer(pDB,token_name,ptr);
rc =sqlite3_exec(pDB,"CREATE VIRTUAL TABLE foo USINGfts3(tokenize=custoken)",&errMsg); if(rc !=sqlITE_OK){ printf("create virtual error,%s\n",errMsg); if(rc !=sqlITE_OK){ printf("create virtual error,errMsg); return rc; } rc =sqlite3_exec(pDB,"INSERT INTO fooVALUES('\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82')",&errMsg); if(rc !=sqlITE_OK){ printf("insert value error,errMsg); return rc; } int nrow =0,ncolumn = 0; char**azResult; //二维数组存放结果 sqlite3_get_table(pDB,"SELECT * FROM foo WHERE content MATCH'\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82'",&azResult,&nrow,&ncolumn,&errMsg ); int i = 0; printf("row:%d column=%d \n",nrow,ncolumn ); printf("\nThe result of querying is : \n" ); for( i=0 ;i<( nrow + 1 ) * ncolumn ; i++ ) printf( "azResult[%d] = %s\n",i,azResult[i] ); sqlite3_free_table( azResult ); sqlite3_close(pDB); return0; }