欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

UTF-8字符串在lua中的字长问题

程序员文章站 2024-03-18 08:19:16
...

在 Lua 中,UTF-8 字符串里每个字符所占的字节数并不相同:中文、英文以及其他字符各有不同的字节长度,因此 UTF-8 是一种变长编码。规则如下:

1. utf8字符是变长字符

2. 字符长度有规律

UTF-8字符规律

每个字符的首个 byte 表示了该 utf8 字符的长度(字节数)

0xxxxxxx - 1 byte

110yxxxx - 192, 2 byte

1110yyyy - 224, 3 byte

11110zzz - 240, 4 byte



所以,要想满足自己对字符串按字符数量截取的话,就要进行一下包装了,直接贴代码,如下:

local UTF8 = {}

--- Return the byte length of a UTF-8 sequence, given its first byte.
-- The length is encoded in the high bits of the lead byte:
--   0xxxxxxx (0-127)   -> 1 byte
--   110xxxxx (192-223) -> 2 bytes
--   1110xxxx (224-239) -> 3 bytes
--   11110xxx (240-247) -> 4 bytes
-- @param char numeric value of the lead byte (e.g. from string.byte), or nil
-- @return 0 when char is nil/false, otherwise 1 to 4. Continuation bytes
--         (128-191) report 1 so that a scanner stepping by this size always
--         moves forward; values above 247 are not valid lead bytes and are
--         reported as 4.
function UTF8.chSize(char)
    if not char then
        return 0
    elseif char >= 240 then
        -- 240 (0xF0) itself is a lead byte, so the bound must be inclusive.
        return 4
    elseif char >= 224 then
        -- The three-byte range starts at 224 (0xE0), not 225.
        return 3
    elseif char >= 192 then
        return 2
    else
        return 1
    end
end


--- Extract a run of UTF-8 characters from a string.
-- Positions and lengths are counted in characters, not bytes; the byte width
-- of each character is taken from UTF8.chSize applied to its first byte.
-- @param str       source string; nil yields ""
-- @param startChar 1-based index of the first character to keep (default 1)
-- @param numChars  maximum number of characters to keep (default 15)
-- @return the selected characters, followed by "..." when text remains in
--         str after the selection
function UTF8.sub(str, startChar, numChars)
    if str == nil then
        return ""
    end
    if startChar == nil then
        startChar = 1
    end
    if numChars == nil then
        numChars = 15
    end

    local byteLen = #str

    -- Walk over the characters that precede startChar to find its byte offset.
    local startIndex = 1
    while startChar > 1 and startIndex <= byteLen do
        startIndex = startIndex + UTF8.chSize(string.byte(str, startIndex))
        startChar = startChar - 1
    end

    -- Consume up to numChars whole characters. The budget is counted in
    -- characters; comparing the byte index against numChars would cut
    -- multi-byte text short.
    local currentIndex = startIndex
    local remaining = numChars
    while remaining > 0 and currentIndex <= byteLen do
        currentIndex = currentIndex + UTF8.chSize(string.byte(str, currentIndex))
        remaining = remaining - 1
    end

    local piece = str:sub(startIndex, currentIndex - 1)
    -- Only flag truncation when bytes are actually left after the selection.
    if currentIndex <= byteLen then
        return piece .. "..."
    end
    return piece
end

--- Count the UTF-8 characters in a string.
-- Hops from one character's first byte to the next, using UTF8.chSize to
-- decide how many bytes each character occupies.
-- @param str string to measure
-- @return number of characters visited before running past the last byte
function UTF8.length(str)
    local byteCount = #str
    local pos, count = 1, 0
    while pos <= byteCount do
        pos = pos + UTF8.chSize(string.byte(str, pos))
        count = count + 1
    end
    return count
end

--- Coerce a value to a string.
-- A string is returned as-is. A table has its values concatenated until a
-- "\0" value is met. Any other type produces "".
-- Values are visited with pairs, whose traversal order Lua does not specify.
-- @param str string, table of string pieces, or any other value
-- @return the resulting string
function UTF8.toString(str)
    local kind = type(str)
    if kind == "string" then
        return str
    elseif kind ~= "table" then
        return ""
    end

    local result = ""
    for _, piece in pairs(str) do
        -- A NUL entry marks the end of the text; anything after it is ignored.
        if piece == "\0" then
            break
        end
        result = result .. piece
    end
    return result
end
return UTF8;


相关标签: string lua utf-8