标签:
/*
 * UTF-8 byte patterns accepted by this filter:
 *   0xxxxxxx
 *   110xxxxx 10xxxxxx
 *   1110xxxx 10xxxxxx 10xxxxxx
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Only the bit patterns above are checked. The legacy 5- and 6-byte forms
 * are accepted, and overlong encodings or surrogate code points are NOT
 * rejected.
 */

/*
 * Classify a byte as the first byte of a sequence.
 *
 * Returns the number of continuation bytes that must follow it
 * (0 for ASCII, 1..5 for a multi-byte lead byte), or -1 when the byte
 * cannot start a sequence (a stray continuation byte, 0xFE or 0xFF).
 */
static int utf8_trailing_bytes(unsigned char lead)
{
    if (lead < 0x80)
    {
        return 0;
    }
    if ((lead & 0xE0) == 0xC0) /* 110xxxxx */
    {
        return 1;
    }
    if ((lead & 0xF0) == 0xE0) /* 1110xxxx */
    {
        return 2;
    }
    if ((lead & 0xF8) == 0xF0) /* 11110xxx */
    {
        return 3;
    }
    if ((lead & 0xFC) == 0xF8) /* 111110xx */
    {
        return 4;
    }
    if ((lead & 0xFE) == 0xFC) /* 1111110x */
    {
        return 5;
    }
    return -1;
}

/*
 * Return the total length in bytes of the well-formed sequence starting at
 * p (p < end), or 0 when the bytes at p do not form one: bad lead byte,
 * sequence truncated by the end of the buffer, or a bad continuation byte.
 */
static int utf8_valid_seq_len(const unsigned char *p, const unsigned char *end)
{
    int trailing = utf8_trailing_bytes(*p);
    int i;

    if (trailing < 0)
    {
        return 0;
    }

    /* Compare lengths rather than forming a pointer beyond the buffer. */
    if ((end - p) - 1 < trailing)
    {
        return 0;
    }

    for (i = 1; i <= trailing; i++)
    {
        if ((p[i] & 0xC0) != 0x80)
        {
            return 0;
        }
    }
    return trailing + 1;
}

/*
 * Remove, in place, every byte of src[0 .. *len) that is not part of a
 * well-formed sequence from the list above.
 *
 * When a sequence is malformed only its first byte is dropped; scanning
 * resumes at the very next byte, so an ASCII byte following a broken lead
 * byte is kept.
 *
 * src: buffer to filter; modified in place. May be NULL (nothing is done).
 * len: in  - number of bytes in src to examine;
 *      out - number of bytes kept. Left untouched when nothing was removed.
 *
 * Returns src in every case. The function can therefore also be used as a
 * validity check: the input was entirely valid iff *len is unchanged.
 *
 * A terminating '\0' is written at src[*len] only when at least one byte
 * was removed (that position is then inside the original buffer). When the
 * input is already valid the buffer is not written to at all.
 */
char *filter_none_utf8_chars(char *src, int *len)
{
    unsigned char *in;
    unsigned char *out;
    unsigned char *end;
    int seq_len;

    if (src == NULL || len == NULL || *len <= 0)
    {
        return src;
    }

    in = (unsigned char *)src;
    out = in;
    end = in + *len;

    /*
     * out never overtakes in, so the kept bytes can be compacted inside the
     * same buffer; no temporary allocation is needed.
     */
    while (in < end)
    {
        seq_len = utf8_valid_seq_len(in, end);
        if (seq_len == 0)
        {
            in++; /* skip this invalid byte */
            continue;
        }

        if (out != in)
        {
            /* Source and destination may overlap, hence memmove. */
            memmove(out, in, (size_t)seq_len);
        }
        out += seq_len;
        in += seq_len;
    }

    if (out == end) /* all chars are valid */
    {
        return src;
    }

    *len = (int)(out - (unsigned char *)src);
    *out = '\0';
    return src;
}
http://bbs.chinaunix.net/forum.php?mod=viewthread&tid=1230313
一个高效过滤非UTF8字符的C函数(也可用来判断是否utf8)
标签:
原文地址:http://www.cnblogs.com/findumars/p/5068059.html