欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

UTF8转Unicode

程序员文章站 2022-03-31 12:28:37
...

// Returns the total number of bytes in the UTF-8 sequence that starts with
// firstCh, determined from the leading bit pattern of that byte:
//
//   0xxxxxxx -> 1   (ASCII, control characters and NUL included)
//   110xxxxx -> 2
//   1110xxxx -> 3
//   11110xxx -> 4
//
// Returns 0 when firstCh cannot start a sequence, i.e. it is a continuation
// byte (10xxxxxx) or one of the patterns 11111xxx.
//
// Only the lead byte is inspected; the caller is responsible for checking
// that the following bytes exist and are continuation bytes.
int GetUtf8ByteNumForWord(char firstCh)
{
    // Compare on the unsigned value so that bytes >= 0x80 are not affected by
    // sign extension when plain char is signed.
    const unsigned char lead = static_cast<unsigned char>(firstCh);

    if ((lead & 0x80) == 0x00)
    {
        return 1;
    }
    if ((lead & 0xE0) == 0xC0)
    {
        return 2;
    }
    if ((lead & 0xF0) == 0xE0)
    {
        return 3;
    }
    if ((lead & 0xF8) == 0xF0)
    {
        return 4;
    }
    return 0;
}
// Converts UTF-8 text into a NUL-terminated wide-character string.
//
//   utf8    - input bytes; does not need to be NUL-terminated.
//   len     - number of bytes available at utf8.
//   unicode - output buffer. It must have room for at least len + 1
//             elements: every input byte produces at most one output
//             element, plus the terminator.
//
// Each decoded code point is stored as one wchar_t. When wchar_t is 16 bits
// wide, code points above U+FFFF are stored as a UTF-16 surrogate pair.
//
// Conversion stops at the first sequence that cannot be decoded (invalid lead
// byte, sequence cut off by the end of the input, bad continuation byte, or a
// value above U+10FFFF). Whatever was converted up to that point is kept, and
// the output is always NUL-terminated, also on that early exit.
void Utf8ToUnicode(const char* utf8, int len, wchar_t *unicode)
{
    if (!unicode)
    {
        return;
    }

    int in = 0;   // index of the next unread input byte
    int out = 0;  // index of the next free output element

    while (utf8 && in < len)
    {
        const int seqLen = GetUtf8ByteNumForWord(utf8[in]);

        // Reject unusable lengths and sequences that would run past the end
        // of the input, so the continuation bytes are never read out of bounds.
        if (seqLen < 1 || seqLen > 4 || seqLen > len - in)
        {
            break;
        }

        // Strip the length marker from the lead byte; what remains are the
        // most significant payload bits of the code point.
        const unsigned char lead = static_cast<unsigned char>(utf8[in]);
        unsigned long codePoint = 0;
        switch (seqLen)
        {
        case 1:
            codePoint = lead & 0x7F;
            break;
        case 2:
            codePoint = lead & 0x1F;
            break;
        case 3:
            codePoint = lead & 0x0F;
            break;
        default:
            codePoint = lead & 0x07;
            break;
        }

        // Every following byte must look like 10xxxxxx and adds six bits.
        bool wellFormed = true;
        for (int k = 1; k < seqLen; ++k)
        {
            const unsigned char next = static_cast<unsigned char>(utf8[in + k]);
            if ((next & 0xC0) != 0x80)
            {
                wellFormed = false;
                break;
            }
            codePoint = (codePoint << 6) | (next & 0x3F);
        }
        if (!wellFormed || codePoint > 0x10FFFF)
        {
            break;
        }

        if (codePoint > 0xFFFF && sizeof(wchar_t) == 2)
        {
            // Does not fit in one 16-bit unit: split into a surrogate pair.
            const unsigned long offset = codePoint - 0x10000;
            unicode[out++] = static_cast<wchar_t>(0xD800 + (offset >> 10));
            unicode[out++] = static_cast<wchar_t>(0xDC00 + (offset & 0x3FF));
        }
        else
        {
            unicode[out++] = static_cast<wchar_t>(codePoint);
        }

        in += seqLen;
    }

    unicode[out] = 0;
}

测试代码如下:

std::ifstream fin("debug\\Test.txt");
const unsigned int L_MAX_LINE=1024;
char utf8[L_MAX_LINE];
wchar_t unicode[L_MAX_LINE];
while(fin.getline(utf8,L_MAX_LINE))
{
    Utf8ToUnicode(utf8,strlen(utf8),unicode);
    MessageBoxW(0,unicode,0,0);
}
fin.close();

相关标签: UTF8 UNICODE