// 前端之家收集整理的这篇文章主要介绍了
// 使用ICU库中的正则表达式匹配关键字之间含有特殊字符的关键字示例,
// 前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
#include "unicode/regex.h"
#include "unicode/ucnv.h"
#ifdef _DEBUG
#ifdef _WIN64
#pragma comment(lib,"icuin64d.lib")
#pragma comment(lib,"icuuc64d.lib")
#else
#pragma comment(lib,"icuin32d.lib")
#pragma comment(lib,"icuuc32d.lib")
#endif
#else
#ifdef _WIN64
#pragma comment(lib,"icuin64.lib")
#pragma comment(lib,"icuuc64.lib")
#else
#pragma comment(lib,"icuin32.lib")
#pragma comment(lib,"icuuc32.lib")
#endif
#endif
//每次匹配的内容大小为1M
#define CONTENT_SPLIT_MAX_LEN 1024*1024 //
int FindSubNum(UnicodeString USrcStr,UnicodeString USubStr,int index)
{
int32_t num = 0;
int pos = USrcStr.indexOf(USubStr);
while(pos != -1)
{
num++;
pos += index;
pos = USrcStr.indexOf(USubStr,pos);
}
return num;
}
extern "C"SP_DLP_DLLEXPORT int findKeyPhraseReg(char* buf,char *pat_str,UnicodeString keyphrase)
{
if(NULL == buf || NULL == pat_str)
{
return 0;
}
UConverter *cv = NULL;
UErrorCode status = U_ZERO_ERROR;
int32_t buf_len = strlen(buf);
int32_t pat_str_len = strlen(pat_str);
RegexPattern *reg_pattern = NULL;///正则表达式
cv = ucnv_open("utf-8"/*detectCode*/,&status);
if (U_FAILURE(status))
{
ucnv_close(cv);
return 0;
}
//转换模式串为UnicodeString
UChar* subStr = new UChar[pat_str_len + 1];
memset(subStr,(pat_str_len + 1)*2);
ucnv_toUChars(cv,subStr,(pat_str_len+1)*2,pat_str,pat_str_len,&status);
if (U_FAILURE(status))
{
delete[]subStr;
subStr = NULL;
ucnv_close(cv);
return 0;
}
ucnv_close(cv);
UnicodeString patString(subStr);
//释放空间
if (subStr)
{
delete[]subStr;
subStr = NULL;
}
//Unicode正则表达式组装,这些函数经常代替构造函数来创建RegexPattern对象
reg_pattern = RegexPattern::compile(patString,status);
if (U_FAILURE(status))
{
return 0;
}
//把母串转换为Unicode
UChar* result = new UChar[CONTENT_SPLIT_MAX_LEN + 1];
memset(result,(CONTENT_SPLIT_MAX_LEN+1)*2);
//UChar result[CONTENT_SPLIT_MAX_LEN + 1] = {0};
cv = ucnv_open(/*detected*/"UTF-8",&status);
RegexMatcher *reg_matcher = NULL;//匹配器
//此处说明:
//1、优点:分批用icu正则匹配待检测内容,每批内容长度为1M。否则当来一个超大文件时,会导致系统不稳定
//2、缺点:这里分批匹配有缺陷,会导致部分数据被分割后,正则表达式匹配不上。漏掉的匹配次数极限值为分割次数。
int index = 0;
int matchNum = 0;
int src_len = 0;
while(index < buf_len)
{
//将buf中的内容每次1M分批拷入result
if (buf_len - index > CONTENT_SPLIT_MAX_LEN)
{
src_len = CONTENT_SPLIT_MAX_LEN;
}
else
{
src_len = buf_len - index;
}
ucnv_toUChars(cv,result,(CONTENT_SPLIT_MAX_LEN+1)*2,buf + index,src_len,&status);
index += src_len;
UnicodeString inputString(result);
//创建一个正则表达式匹配器
reg_matcher = reg_pattern->matcher(inputString,status);
if (U_FAILURE(status))
{
delete reg_matcher;
reg_matcher = NULL;
continue;
}
//virtual UnicodeString replaceAll(const UnicodeString &replacement,UErrorCode &status)
UnicodeString ustr = reg_matcher->replaceAll(UnicodeString(""),status);
if (U_FAILURE(status))
{
delete reg_matcher;
reg_matcher = NULL;
continue;
}
int len = keyphrase.length();
matchNum += FindSubNum(ustr,keyphrase,len);
//使用完匹配器后要释放
delete reg_matcher;
reg_matcher = NULL;
}
ucnv_close(cv);
if(result)
{
delete []result;
result = NULL;
}
if (reg_pattern)
{
delete reg_pattern;
reg_pattern = NULL;
}
return matchNum;
}