使用ICU库中的正则表达式匹配关键字之间含有特殊字符的关键字示例

前端之家收集整理的这篇文章主要介绍了使用ICU库中的正则表达式匹配关键字之间含有特殊字符的关键字示例,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
#include "unicode/regex.h"
#include "unicode/ucnv.h"
#ifdef _DEBUG
	#ifdef _WIN64
		#pragma comment(lib,"icuin64d.lib")
		#pragma comment(lib,"icuuc64d.lib")
	#else
		#pragma comment(lib,"icuin32d.lib")
		#pragma comment(lib,"icuuc32d.lib")
	#endif
#else
	#ifdef _WIN64
		#pragma comment(lib,"icuin64.lib")
		#pragma comment(lib,"icuuc64.lib")
	#else
		#pragma comment(lib,"icuin32.lib")
		#pragma comment(lib,"icuuc32.lib")
	#endif
#endif

// Size of the content matched in one pass: 1M (1024*1024).
// The value is parenthesized so the macro expands safely inside larger
// expressions such as `x / CONTENT_SPLIT_MAX_LEN`.
#define CONTENT_SPLIT_MAX_LEN (1024*1024)
// Counts how many times USubStr occurs in USrcStr.
//
// USrcStr : string to search in.
// USubStr : string to search for.
// index   : step, i.e. how far past the start of a hit the next search
//           begins. The caller in this file passes the length of USubStr,
//           which counts non-overlapping occurrences.
//
// Returns the number of occurrences found (0 when there is none).
//
// A non-positive step would make the search find the same hit forever, so it
// is treated as a step of 1.
int FindSubNum(UnicodeString USrcStr,UnicodeString USubStr,int index)
{
	const int32_t step = (index > 0) ? index : 1;
	const int32_t srcLen = USrcStr.length();

	int32_t num = 0;
	int32_t pos = USrcStr.indexOf(USubStr);
	while (pos != -1)
	{
		++num;
		// The next start position would be at or past the end of the source:
		// nothing more can match. Checking this way also keeps `pos + step`
		// from overflowing when the step is very large.
		if (step >= srcLen - pos)
		{
			break;
		}
		pos = USrcStr.indexOf(USubStr,pos + step);
	}
	return num;
}


extern "C"SP_DLP_DLLEXPORT int findKeyPhraseReg(char* buf,char *pat_str,UnicodeString keyphrase)
{
	if(NULL == buf || NULL == pat_str)
	{
		return 0;
	}

	UConverter *cv = NULL;
	UErrorCode status = U_ZERO_ERROR;
	int32_t buf_len = strlen(buf);
	int32_t pat_str_len = strlen(pat_str);
	RegexPattern *reg_pattern = NULL;///正则表达式

	cv = ucnv_open("utf-8"/*detectCode*/,&status);
	if (U_FAILURE(status)) 
	{
		ucnv_close(cv);
		return 0;
	}

	//转换模式串为UnicodeString
	UChar* subStr = new UChar[pat_str_len + 1];
	memset(subStr,(pat_str_len + 1)*2);

	ucnv_toUChars(cv,subStr,(pat_str_len+1)*2,pat_str,pat_str_len,&status);
	if (U_FAILURE(status)) 
	{
		delete[]subStr;
		subStr = NULL;
		ucnv_close(cv);
		return 0;
	}
	ucnv_close(cv);
	UnicodeString patString(subStr);
	//释放空间
	if (subStr)
	{
		delete[]subStr;
		subStr = NULL;
	}

	//Unicode正则表达式组装,这些函数经常代替构造函数来创建RegexPattern对象
	reg_pattern = RegexPattern::compile(patString,status);

	if (U_FAILURE(status)) 
	{
		return 0;
	}
	//把母串转换为Unicode
	UChar* result  = new UChar[CONTENT_SPLIT_MAX_LEN + 1];
	memset(result,(CONTENT_SPLIT_MAX_LEN+1)*2);
	//UChar result[CONTENT_SPLIT_MAX_LEN + 1] = {0};
	cv = ucnv_open(/*detected*/"UTF-8",&status);
	RegexMatcher *reg_matcher = NULL;//匹配器

	//此处说明:
	//1、优点:分批用icu正则匹配待检测内容,每批内容长度为1M。否则当来一个超大文件时,会导致系统不稳定
	//2、缺点:这里分批匹配有缺陷,会导致部分数据被分割后,正则表达式匹配不上。漏掉的匹配次数极限值为分割次数。
	int index = 0;
	int matchNum = 0;
	int src_len = 0;
	while(index < buf_len)
	{
		//将buf中的内容每次1M分批拷入result
		if (buf_len - index > CONTENT_SPLIT_MAX_LEN)
		{
			src_len = CONTENT_SPLIT_MAX_LEN;
		}
		else
		{
			src_len = buf_len - index;
		}
		ucnv_toUChars(cv,result,(CONTENT_SPLIT_MAX_LEN+1)*2,buf + index,src_len,&status);
		index += src_len;
		UnicodeString inputString(result);
		//创建一个正则表达式匹配器
		reg_matcher = reg_pattern->matcher(inputString,status);
		if (U_FAILURE(status))
		{
			delete reg_matcher;
			reg_matcher = NULL;
			continue;
		}
		//virtual UnicodeString replaceAll(const UnicodeString &replacement,UErrorCode &status)
		UnicodeString ustr = reg_matcher->replaceAll(UnicodeString(""),status);
		if (U_FAILURE(status))
		{
			delete reg_matcher;
			reg_matcher = NULL;
			continue;
		}
		int len = keyphrase.length();
		matchNum += FindSubNum(ustr,keyphrase,len);
		//使用完匹配器后要释放
		delete reg_matcher;
		reg_matcher = NULL;
	}
	ucnv_close(cv);
	if(result)
	{
		delete []result;
		result = NULL;
	}
	if (reg_pattern)
	{
		delete reg_pattern;
		reg_pattern = NULL;
	}
	return matchNum;
}

猜你在找的正则表达式相关文章