欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

判断字符串是否为utf8格式,cpp实现

程序员文章站 2022-03-31 11:09:19
...

本文参考网上其他编程语言的实现,将其改写为 C/C++ 版本。

/**
 * Checks whether a NUL-terminated byte string is well-formed UTF-8 AND
 * contains at least one multi-byte (non-ASCII) sequence.
 *
 * Validation follows the well-formed byte sequence table of RFC 3629 /
 * the Unicode Standard: lead bytes 0xC0, 0xC1 and 0xF5-0xFF are rejected,
 * as are overlong encodings, UTF-16 surrogates (U+D800-U+DFFF) and code
 * points above U+10FFFF.
 *
 * @param str  NUL-terminated input; a null pointer is treated as "not UTF-8".
 * @return true only if every byte sequence is valid UTF-8 and at least one
 *         of them is multi-byte. Pure ASCII input (including the empty
 *         string) deliberately returns false: such text is regarded as
 *         being in the default encoding rather than as UTF-8.
 */
bool isUtf8Str(const char *str)
{
    if (!str)
    {
        return false;
    }

    // Inspect the data as unsigned bytes: plain char may be signed, which
    // makes shifts and range comparisons on bytes >= 0x80 non-portable.
    const unsigned char *cursor = reinterpret_cast<const unsigned char *>(str);
    bool sawMultiByteSequence = false;

    while (*cursor != 0)
    {
        const unsigned char lead = *cursor++;

        if (lead < 0x80)
        {
            // ASCII chars, from 0x00-0x7F
            continue;
        }

        // Number of continuation bytes that must follow the lead byte, and
        // the allowed range of the FIRST continuation byte. That range is
        // narrower than 0x80-0xBF for a few lead bytes, which is how
        // overlong forms, surrogates and values > U+10FFFF are excluded.
        int continuationBytes = 0;
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            continuationBytes = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            continuationBytes = 2;
            if (lead == 0xE0)
            {
                lowest = 0xA0; // below this would be an overlong 3-byte form
            }
            else if (lead == 0xED)
            {
                highest = 0x9F; // above this would encode a UTF-16 surrogate
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            continuationBytes = 3;
            if (lead == 0xF0)
            {
                lowest = 0x90; // below this would be an overlong 4-byte form
            }
            else if (lead == 0xF4)
            {
                highest = 0x8F; // above this would exceed U+10FFFF
            }
        }
        else
        {
            // Stray continuation byte (0x80-0xBF) or a byte that can never
            // start a UTF-8 sequence (0xC0, 0xC1, 0xF5-0xFF).
            return false;
        }

        for (int k = 0; k < continuationBytes; ++k)
        {
            const unsigned char next = *cursor;

            // The terminating NUL (0x00) also fails this range check, so a
            // truncated sequence is rejected without reading past the end.
            if (next < lowest || next > highest)
            {
                return false;
            }

            ++cursor;
            // Only the first continuation byte has a restricted range.
            lowest = 0x80;
            highest = 0xBF;
        }

        sawMultiByteSequence = true;
    }

    // Although UTF-8 is a superset of ASCII, all-ASCII input is reported as
    // "not UTF-8" (default encoding) to preserve the original contract.
    return sawMultiByteSequence;
}

// Convenience overload: reports whether a std::string holds UTF-8 text.
// It hands the string's c_str() buffer to the C-string overload, so only
// the bytes before the first embedded NUL (if any) are examined.
inline bool isUtf8Str(const std::string &str)
{
    const char *rawBytes = str.c_str();
    return isUtf8Str(rawBytes);
}
相关标签: cpp utf8