假设我将 UTF-8 内容存储在内存中,如何使用指针逐个读取字符?我知道需要检查字节的最高位(第 8 位)来判断是否为多字节字符,但究竟该如何把这个字节序列转换成有效的 Unicode 字符?另外,wchar_t 是否是存储单个 Unicode 字符的正确类型?
这就是我的想法:
// Asker's incomplete sketch: reads one byte through p and tests its high bit.
// The "..." lines are placeholders for code the asker has not written yet.
wchar_t readNextChar (char*& p)
{
    wchar_t unicodeChar;
    char ch = *p++;

    if ((ch & 128) != 0)
    {
        // This is a multi-byte character, what do I do now?
        // char chNext = *p++;
        // ... but how do I assemble the Unicode character?
        ...
    }
    ...
    return unicodeChar;
}
解决方法
您必须将 UTF-8 位模式解码为其未编码的 UTF-32 表示。如果需要实际的 Unicode 码点,则必须使用 32 位数据类型。
在 Windows 上,wchar_t 不够大,因为它只有 16 位。您必须使用 unsigned int 或 unsigned long。仅在处理 UTF-16 代码单元时才使用 wchar_t。
在其他平台上,wchar_t 通常为 32 位。但是在编写可移植代码时,您应该避免使用 wchar_t,除非绝对需要(例如 std::wstring)。
尝试更像这样的东西:
// True when c lies in the inclusive range [f, l]. Note: c is evaluated twice.
#define IS_IN_RANGE(c, f, l)    (((c) >= (f)) && ((c) <= (l)))

/**
 * Decodes one UTF-8 encoded code point starting at p.
 *
 * @param p  Pointer to the first byte of the sequence. On success it is
 *           advanced past the decoded sequence (1 to 4 bytes). On failure
 *           it is left unchanged.
 * @return   The decoded code point, or (u_long) -1 if the bytes at p do not
 *           form a well-formed UTF-8 sequence.
 *
 * TODO: since UTF-8 is a variable-length encoding, the caller should also
 * pass in the number of bytes available at p so the function can reject a
 * sequence that would run past the end of the buffer, i.e. fail when
 * datalen < 1 before reading the lead byte, and when seqlen > datalen before
 * reading the continuation bytes. Without that length this function relies
 * on a non-continuation byte (such as a NUL terminator) stopping the scan.
 */
u_long readNextChar (char* &p)
{
    u_char c1, c2, *ptr = (u_char*) p;
    u_long uc = 0;
    int seqlen;

    // Classify the lead byte: it fixes the sequence length and supplies the
    // high-order payload bits of the code point.
    c1 = ptr[0];

    if( (c1 & 0x80) == 0 )
    {
        // 0xxxxxxx: single-byte (ASCII)
        uc = (u_long) (c1 & 0x7F);
        seqlen = 1;
    }
    else if( (c1 & 0xE0) == 0xC0 )
    {
        // 110xxxxx: two-byte sequence
        uc = (u_long) (c1 & 0x1F);
        seqlen = 2;
    }
    else if( (c1 & 0xF0) == 0xE0 )
    {
        // 1110xxxx: three-byte sequence
        uc = (u_long) (c1 & 0x0F);
        seqlen = 3;
    }
    else if( (c1 & 0xF8) == 0xF0 )
    {
        // 11110xxx: four-byte sequence
        uc = (u_long) (c1 & 0x07);
        seqlen = 4;
    }
    else
    {
        // malformed data: stray continuation byte or invalid lead byte
        return (u_long) -1;
    }

    // Every trailing byte must have the form 10xxxxxx.
    for(int i = 1; i < seqlen; ++i)
    {
        c1 = ptr[i];

        if( (c1 & 0xC0) != 0x80 )
        {
            // malformed data: missing or invalid continuation byte
            return (u_long) -1;
        }
    }

    // Reject overlong encodings, UTF-16 surrogates and values above U+10FFFF
    // by restricting the lead byte and, where needed, the second byte.
    switch( seqlen )
    {
        case 2:
        {
            c1 = ptr[0];

            // 0xC0 and 0xC1 would only encode values that fit in one byte.
            if( !IS_IN_RANGE(c1, 0xC2, 0xDF) )
            {
                return (u_long) -1;
            }

            break;
        }

        case 3:
        {
            c1 = ptr[0];
            c2 = ptr[1];

            switch (c1)
            {
                case 0xE0:
                    // second byte below 0xA0 would be an overlong encoding
                    if (!IS_IN_RANGE(c2, 0xA0, 0xBF))
                    {
                        return (u_long) -1;
                    }
                    break;

                case 0xED:
                    // second byte above 0x9F would encode U+D800..U+DFFF
                    if (!IS_IN_RANGE(c2, 0x80, 0x9F))
                    {
                        return (u_long) -1;
                    }
                    break;

                default:
                    if (!IS_IN_RANGE(c1, 0xE1, 0xEC) && !IS_IN_RANGE(c1, 0xEE, 0xEF))
                    {
                        return (u_long) -1;
                    }
                    break;
            }

            break;
        }

        case 4:
        {
            c1 = ptr[0];
            c2 = ptr[1];

            switch (c1)
            {
                case 0xF0:
                    // second byte below 0x90 would be an overlong encoding
                    if (!IS_IN_RANGE(c2, 0x90, 0xBF))
                    {
                        return (u_long) -1;
                    }
                    break;

                case 0xF4:
                    // second byte above 0x8F would exceed U+10FFFF
                    if (!IS_IN_RANGE(c2, 0x80, 0x8F))
                    {
                        return (u_long) -1;
                    }
                    break;

                default:
                    // 0xF5..0xF7 can only encode values above U+10FFFF
                    if (!IS_IN_RANGE(c1, 0xF1, 0xF3))
                    {
                        return (u_long) -1;
                    }
                    break;
            }

            break;
        }
    }

    // Append the low six payload bits of each continuation byte.
    for(int i = 1; i < seqlen; ++i)
    {
        uc = ((uc << 6) | (u_long)(ptr[i] & 0x3F));
    }

    p += seqlen;
    return uc;
}