前端之家收集整理的这篇文章主要介绍了
ICU正则表达式运用方法,
前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
最近由于项目需求,学习了一下ICU下正则表达式的用法。在运用之前,先在官网上下载ICU库并编译,然后将include下的头文件路径与lib下的库文件路径都添加到编译器的选项(option)中。开发环境是VS2008,编码如下:
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <list>
#include <vector>
//#include <unicode/uregex.h>
#include "unicode/utypes.h"
//#include "unicode/parseerr.h"
#include "testConvAnyCodeUtf8.h"
#include "unicode/ucnv.h"
#include "apr_file_io.h"
#include "unicode/regex.h"
using namespace std;
// Reads the file at GBpath line by line and adds, for each pattern in
// listring, the number of matches found on that line to g_num.
void MatchFromFile(char* GBpath,list<UnicodeString> listring);
// Returns the number of matches of regular expression `pat` in `sour`.
int reg_exp_match(const char*pat,const char* sour);
// Prints each pattern together with its accumulated count from g_num.
void print(list<UnicodeString> listString);
// MSVC auto-link directives for the conversion helper, APR and ICU libraries.
#pragma comment(lib,"conv_anycode_utf8.lib")
#pragma comment(lib,"libapr-1.lib")
#pragma comment(lib,"icuind.lib")
#pragma comment(lib,"icuucd.lib")
// Number of slots in g_num; also the upper bound on patterns reported by print().
#define SUM 3
// Size (excluding the trailing NUL) of the line buffer and conversion buffers.
#define BUFFSIZE 1024
// g_num[i] accumulates the match count of the i-th pattern across all lines.
int g_num[SUM] = {0};
/// Program entry point: counts how often each pattern occurs in a fixed
/// input file and prints the totals.
///
/// @return always 0.
int main()
{
    list<UnicodeString> listString;

    // MatchFromFile() takes a non-const char*, so keep the path in a writable
    // array instead of pointing a char* at a string literal (ill-formed since
    // C++11).
    char pathname[] = "F:\\3_27的\\1.txt";

    /*listString.push_back(UnicodeString("\\s123\\s"));*/
    listString.push_back(UnicodeString("123"));
    /*listString.push_back(UnicodeString("[\\u4e00-\\u9fa5$]+好好学习"));*/
    listString.push_back(UnicodeString("好好学习"));
    listString.push_back(UnicodeString("d[o]m"));

    MatchFromFile(pathname,listString);
    print(listString);

    // Keeps the console window open when launched from the IDE (Windows only).
    system("pause");
    return 0;
}
void MatchFromFile(char* GBpath,list<UnicodeString> listring)
{
char* path = conv_anycode_utf8(GBpath,"UTF-8","GB18030");
list<UnicodeString>::const_iterator _regter = listring.begin();
apr_pool_t *pool = NULL;
apr_file_t *file = NULL;
apr_status_t res = APR_SUCCESS;
char* inbuf = new char[BUFFSIZE + 1];
char *result;
int len = 0;
UConverter *cv = NULL;
UErrorCode status = U_ZERO_ERROR;
memset(inbuf,BUFFSIZE + 1);
apr_initialize();
res = apr_pool_create(&pool,NULL);
if(res != APR_SUCCESS)
{
printf("create pool Failed!\n");
return;
}
res = apr_file_open(&file,path,APR_READ,APR_OS_DEFAULT,pool);
if(res != APR_SUCCESS)
{
printf("open file fail!\n");
return ;
}
int i = 0;
while(!apr_file_gets(inbuf,BUFFSIZE+1,file))
{
while(_regter != listring.end())
{
len = (*_regter).length();
result = new char[BUFFSIZE+1];
memset(result,BUFFSIZE*sizeof(char)+1);
const UChar* psrc = (*_regter).getBuffer();
cv = ucnv_open("GB18030",&status);
int32_t num = ucnv_fromUChars(cv,result,psrc,len,&status);
if(status != U_ZERO_ERROR)
{
printf("Convert fail!\n");
break;
}
int n = reg_exp_match(result,inbuf);
g_num[i] += n;
delete [] result;
ucnv_close(cv);
i++;
_regter++;
}
_regter = listring.begin();
i = 0;
}
delete []inbuf;
}
/// Prints every pattern in listring next to its accumulated match count.
///
/// Each pattern is converted to GB18030 bytes for output. At most SUM
/// patterns are printed because g_num has SUM slots.
///
/// @param listring the patterns, in the same order they were counted.
void print(list<UnicodeString> listring)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter* cv = ucnv_open("GB18030",&status);
    if (U_FAILURE(status))
    {
        printf("Convert fail!\n");
        return;
    }

    char result[BUFFSIZE + 1];
    list<UnicodeString>::const_iterator iter = listring.begin();
    for (int j = 0; (j < SUM) && (iter != listring.end()); ++j, ++iter)
    {
        // A failure left in status would make the next conversion a no-op.
        status = U_ZERO_ERROR;
        // Zero-filling BUFFSIZE+1 bytes while converting into at most
        // BUFFSIZE guarantees the result is NUL-terminated for printf.
        memset(result,0,sizeof(result));
        ucnv_fromUChars(cv,result,BUFFSIZE,
                        iter->getBuffer(),iter->length(),&status);
        if (U_FAILURE(status))
        {
            printf("Convert fail!\n");
            break;
        }
        printf("%s出现:%d次\n",result,g_num[j]);
    }

    ucnv_close(cv);
}
int reg_exp_match(const char*pat,const char* sour)
{
int num = 0;
int LEN = strlen(sour);
int len = strlen(pat);
UConverter *cv = NULL;
RegexPattern *REPattern = NULL;///正则表达式
RegexMatcher *REMatcher = NULL;//匹配器
UErrorCode status = U_ZERO_ERROR;
cv = ucnv_open("GB18030",&status);
UChar* patStr = new UChar[len + 1];
memset(patStr,len + 1);
ucnv_toUChars(cv,patStr,len+1,pat,&status);
ucnv_close(cv);
UnicodeString patString(patStr);
//Unicode正则表达式组装,这些函数经常代替构造函数来创建RegexPattern对象
REPattern = RegexPattern::compile(patString,status);
if (U_FAILURE(status))
{
return 0;
}
//把母串转换为Unicode
UChar* result = new UChar[LEN + 1];
memset(result,LEN+1);
cv = ucnv_open("utf-8",&status);
ucnv_toUChars(cv,LEN+1,sour,LEN,&status);
UnicodeString inputString(result);
//剔除字符串中的一些序列
UnicodeString unEscapedInput = inputString.unescape();
//创建一个正则表达式匹配器
REMatcher = REPattern->matcher(unEscapedInput,status);
if (U_FAILURE(status))
{
return 0;
}
if (U_FAILURE(status))
{
return 0;
}
int64_t pos = 0;
UnicodeString str = REMatcher->group(status);
while(pos < LEN)
{
if(REMatcher->find(pos,status))
{
num++;
pos += REMatcher->end64(status);
}
else
{
break;
}
}
return num;
}
原文链接:https://www.f2er.com/regex/362452.html