一个高效过滤非 UTF-8 字符的 C 函数（也可用来判断字符串是否为 UTF-8）

时间：2015-12-22 21:12:20 阅读：196 评论：0 收藏：0 [点我收藏+]

标签：

/*
 * Byte patterns accepted as one UTF-8 encoded character:
 *   0xxxxxxx
 *   110xxxxx 10xxxxxx
 *   1110xxxx 10xxxxxx 10xxxxxx
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * The check is purely structural (lead byte followed by the right number of
 * continuation bytes). Overlong encodings, surrogate code points and values
 * above U+10FFFF are NOT rejected, and the legacy 5- and 6-byte forms are
 * still accepted.
 */

/*
 * Returns the total length in bytes (1..6) of the well-formed sequence that
 * starts at p, or 0 when the byte at p does not start one (bad lead byte,
 * sequence cut off by end, or a non-continuation byte inside the sequence).
 * Requires p < end.
 */
static int utf8_sequence_length(const unsigned char *p, const unsigned char *end)
{
        unsigned char lead = *p;
        int trailing;
        int i;

        if (lead < 0x80)
        {
                return 1;
        }

        if ((lead & 0xE0) == 0xC0)        /* 110xxxxx */
        {
                trailing = 1;
        }
        else if ((lead & 0xF0) == 0xE0)   /* 1110xxxx */
        {
                trailing = 2;
        }
        else if ((lead & 0xF8) == 0xF0)   /* 11110xxx */
        {
                trailing = 3;
        }
        else if ((lead & 0xFC) == 0xF8)   /* 111110xx */
        {
                trailing = 4;
        }
        else if ((lead & 0xFE) == 0xFC)   /* 1111110x */
        {
                trailing = 5;
        }
        else                              /* stray continuation byte, 0xFE or 0xFF */
        {
                return 0;
        }

        /* Compare distances instead of forming p + trailing + 1, which could
           point past the end of the buffer. */
        if (end - p <= trailing)
        {
                return 0;
        }

        for (i = 1; i <= trailing; i++)
        {
                if ((p[i] & 0xC0) != 0x80)
                {
                        return 0;
                }
        }

        return trailing + 1;
}

/*
 * Removes, in place, every byte of src[0 .. *len) that is not part of a
 * well-formed sequence as listed above.
 *
 * src  buffer to filter; it is modified only when at least one byte is removed.
 * len  in: number of bytes to examine; out: number of bytes kept.
 *
 * Returns src. If *len is unchanged on return, the whole input was valid, so
 * the function doubles as a validity check. When bytes were removed, a '\0'
 * is stored at src[*len]; that index is always inside the original
 * [0, old *len) range because the result is strictly shorter.
 *
 * A NULL src, NULL len or non-positive *len is returned untouched.
 * No memory is allocated, so the function cannot fail.
 */
char *filter_none_utf8_chars(char *src, int *len)
{
        unsigned char *rd;
        unsigned char *wr;
        unsigned char *end;
        int n;

        if (!src || !len || *len <= 0)
        {
                return src;
        }

        rd = (unsigned char *)src;
        wr = rd;
        end = rd + *len;

        while (rd < end)
        {
                n = utf8_sequence_length(rd, end);
                if (n == 0)
                {
                        rd++;   /* drop one byte and re-synchronise on the next */
                        continue;
                }

                if (wr == rd)
                {
                        /* Nothing removed so far: the bytes are already in place. */
                        wr += n;
                        rd += n;
                }
                else
                {
                        /* wr is always behind rd here, so a forward byte copy is safe. */
                        while (n-- > 0)
                        {
                                *wr++ = *rd++;
                        }
                }
        }

        if (wr != end)  /* at least one byte was removed */
        {
                *len = (int)(wr - (unsigned char *)src);
                *wr = '\0';
        }

        return src;
}

http://bbs.chinaunix.net/forum.php?mod=viewthread&tid=1230313

一个高效过滤非UTF8字符的C函数（也可用来判断是否utf8）

标签：

原文地址：http://www.cnblogs.com/findumars/p/5068059.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行