使用ICU库中的正则表达式匹配关键字之间含有特殊字符的关键字示例

#include "unicode/regex.h"
#include "unicode/ucnv.h"
#ifdef _DEBUG
	#ifdef _WIN64
		#pragma comment(lib,"icuin64d.lib")
		#pragma comment(lib,"icuuc64d.lib")
	#else
		#pragma comment(lib,"icuin32d.lib")
		#pragma comment(lib,"icuuc32d.lib")
	#endif
#else
	#ifdef _WIN64
		#pragma comment(lib,"icuin64.lib")
		#pragma comment(lib,"icuuc64.lib")
	#else
		#pragma comment(lib,"icuin32.lib")
		#pragma comment(lib,"icuuc32.lib")
	#endif
#endif

// Maximum number of input bytes handed to the regex matcher per pass (1 MiB).
// The expansion is parenthesized so the macro is safe inside any expression
// (e.g. `len / CONTENT_SPLIT_MAX_LEN`).
#define CONTENT_SPLIT_MAX_LEN (1024*1024)
/**
 * Counts how many times USubStr is found in USrcStr.
 *
 * @param USrcStr  Text to search.
 * @param USubStr  Substring to look for.
 * @param index    Number of code units to skip forward from the start of a hit
 *                 before searching again. Passing USubStr.length() counts
 *                 non-overlapping occurrences. Values <= 0 are treated as 1,
 *                 because a zero or negative step would find the same hit
 *                 again and never terminate.
 * @return Number of hits found.
 */
int FindSubNum(UnicodeString USrcStr,UnicodeString USubStr,int index)
{
	const int32_t step = (index > 0) ? index : 1;
	const int32_t srcLen = USrcStr.length();

	int32_t num = 0;
	int32_t pos = USrcStr.indexOf(USubStr);
	while (pos != -1)
	{
		++num;
		// Stop once the next search would start at or past the end of the text;
		// comparing against the remaining length also keeps `pos + step` from
		// overflowing for very large steps.
		if (step >= srcLen - pos)
		{
			break;
		}
		pos = USrcStr.indexOf(USubStr, pos + step);
	}
	return num;
}


extern "C"SP_DLP_DLLEXPORT int findKeyPhraseReg(char* buf,char *pat_str,UnicodeString keyphrase)
{
	if(NULL == buf || NULL == pat_str)
	{
		return 0;
	}

	UConverter *cv = NULL;
	UErrorCode status = U_ZERO_ERROR;
	int32_t buf_len = strlen(buf);
	int32_t pat_str_len = strlen(pat_str);
	RegexPattern *reg_pattern = NULL;///正则表达式

	cv = ucnv_open("utf-8"/*detectCode*/,&status);
	if (U_FAILURE(status)) 
	{
		ucnv_close(cv);
		return 0;
	}

	//转换模式串为UnicodeString
	UChar* subStr = new UChar[pat_str_len + 1];
	memset(subStr,(pat_str_len + 1)*2);

	ucnv_toUChars(cv,subStr,(pat_str_len+1)*2,pat_str,pat_str_len,&status);
	if (U_FAILURE(status)) 
	{
		delete[]subStr;
		subStr = NULL;
		ucnv_close(cv);
		return 0;
	}
	ucnv_close(cv);
	UnicodeString patString(subStr);
	//释放空间
	if (subStr)
	{
		delete[]subStr;
		subStr = NULL;
	}

	//Unicode正则表达式组装,这些函数经常代替构造函数来创建RegexPattern对象
	reg_pattern = RegexPattern::compile(patString,status);

	if (U_FAILURE(status)) 
	{
		return 0;
	}
	//把母串转换为Unicode
	UChar* result  = new UChar[CONTENT_SPLIT_MAX_LEN + 1];
	memset(result,(CONTENT_SPLIT_MAX_LEN+1)*2);
	//UChar result[CONTENT_SPLIT_MAX_LEN + 1] = {0};
	cv = ucnv_open(/*detected*/"UTF-8",&status);
	RegexMatcher *reg_matcher = NULL;//匹配器

	//此处说明：
	//1、优点：分批用icu正则匹配待检测内容，每批内容长度为1M。否则当来一个超大文件时，会导致系统不稳定
	//2、缺点：这里分批匹配有缺陷，会导致部分数据被分割后，正则表达式匹配不上。漏掉的匹配次数极限值为分割次数。
	int index = 0;
	int matchNum = 0;
	int src_len = 0;
	while(index < buf_len)
	{
		//将buf中的内容每次1M分批拷入result
		if (buf_len - index > CONTENT_SPLIT_MAX_LEN)
		{
			src_len = CONTENT_SPLIT_MAX_LEN;
		}
		else
		{
			src_len = buf_len - index;
		}
		ucnv_toUChars(cv,result,(CONTENT_SPLIT_MAX_LEN+1)*2,buf + index,src_len,&status);
		index += src_len;
		UnicodeString inputString(result);
		//创建一个正则表达式匹配器
		reg_matcher = reg_pattern->matcher(inputString,status);
		if (U_FAILURE(status))
		{
			delete reg_matcher;
			reg_matcher = NULL;
			continue;
		}
		//virtual UnicodeString replaceAll(const UnicodeString &replacement,UErrorCode &status)
		UnicodeString ustr = reg_matcher->replaceAll(UnicodeString(""),status);
		if (U_FAILURE(status))
		{
			delete reg_matcher;
			reg_matcher = NULL;
			continue;
		}
		int len = keyphrase.length();
		matchNum += FindSubNum(ustr,keyphrase,len);
		//使用完匹配器后要释放
		delete reg_matcher;
		reg_matcher = NULL;
	}
	ucnv_close(cv);
	if(result)
	{
		delete []result;
		result = NULL;
	}
	if (reg_pattern)
	{
		delete reg_pattern;
		reg_pattern = NULL;
	}
	return matchNum;
}
使用ICU库中的正则表达式匹配关键字之间含有特殊字符的关键字示例

猜你在找的正则表达式相关文章