Java按字节数截取字符串，一个中文长度为2

程序员文章站 2022-05-24 23:34:48

...

按字节截取字符串时，可能会截到汉字的中间，此时不能截出乱码，也就是不能把一个汉字截成两半。例如字符串"我ABC汉字d"（按GBK编码，每个汉字占2个字节）：截取5个字节时，结果应该是"我ABC"；截取8个字节时，结果应该是"我ABC汉"，而不应该是"我ABC汉？"，其中"？"代表半个汉字。也就是说，遇到半个汉字时将其舍弃，可理解为向前截取。

/**
 * Returns the longest prefix of {@code str} whose GBK encoding occupies at
 * most {@code subSLength} bytes, without ever splitting a character.
 *
 * <p>If the byte budget ends in the middle of a multi-byte character, that
 * character is dropped entirely (the cut moves backwards to the previous
 * character boundary).
 *
 * @param str        the text to truncate; {@code null} yields {@code ""}
 * @param subSLength the maximum number of GBK bytes to keep; zero or a
 *                   negative value yields {@code ""}
 * @return the truncated prefix, never {@code null}
 * @throws UnsupportedEncodingException if the runtime has no GBK charset
 */
public static String subStr(String str, int subSLength)
        throws UnsupportedEncodingException {
    if (str == null || subSLength <= 0) {
        return "";
    }
    int usedBytes = 0;
    int cut = 0;
    while (cut < str.length()) {
        // Step by code point so a surrogate pair is kept or dropped as a whole.
        int charCount = Character.charCount(str.codePointAt(cut));
        int width = str.substring(cut, cut + charCount).getBytes("GBK").length;
        // Written as a subtraction so the comparison cannot overflow.
        if (width > subSLength - usedBytes) {
            break;
        }
        usedBytes += width;
        cut += charCount;
    }
    return str.substring(0, cut);
}

备注：将字符编码GBK改为UTF-8，则每个中文长度按3个字节计算

以下方法是向后截取字符串

/**
 * Extracts a slice of the UTF-8 encoding of {@code str} and, when the slice
 * would stop in the middle of a multi-byte character, extends it forward so
 * that the whole character is included.
 *
 * <p>Despite its name, {@code end} is a byte count, not an end index: it is
 * used as the length argument when the byte slice is turned back into a
 * string.
 *
 * <p>Only the tail of the slice is adjusted; {@code start} is expected to
 * fall on a character boundary.
 *
 * @param str   the text to slice; {@code null} yields {@code null}
 * @param start offset of the first byte within the UTF-8 encoding
 * @param end   number of bytes requested, starting at {@code start}
 * @return the decoded slice, extended to the next character boundary
 * @throws UnsupportedEncodingException if the runtime has no UTF-8 charset
 * @throws IndexOutOfBoundsException if {@code start} and {@code end} do not
 *         describe a range inside the encoded bytes
 */
public static String subStr_1(String str, int start, int end)
        throws UnsupportedEncodingException {
    if (str == null) {
        return null;
    }
    byte[] b = str.getBytes("UTF-8");
    int length = end;
    if (start >= 0 && length > 0) {
        // UTF-8 continuation bytes look like 10xxxxxx; while the byte just
        // past the slice is one, the last character is still incomplete.
        while (length < b.length - start && (b[start + length] & 0xC0) == 0x80) {
            length++;
        }
    }
    // Decode with the same charset used for encoding; invalid ranges are
    // rejected by the String constructor.
    return new String(b, start, length, "UTF-8");
}


/**
 * Returns the string decoded from a raw slice of the UTF-8 encoding of
 * {@code str}.
 *
 * <p>No character-boundary adjustment is made: a slice that starts or stops
 * inside a multi-byte character decodes to replacement characters. Despite
 * its name, {@code end} is a byte count, not an end index.
 *
 * @param str   the text to slice; {@code null} yields {@code null}
 * @param start offset of the first byte within the UTF-8 encoding
 * @param end   number of bytes to take, starting at {@code start}
 * @return the decoded slice
 * @throws UnsupportedEncodingException if the runtime has no UTF-8 charset
 * @throws IndexOutOfBoundsException if {@code start} and {@code end} do not
 *         describe a range inside the encoded bytes
 */
public static String getByteStr(String str, int start, int end) throws UnsupportedEncodingException{
		if (str == null) {
			return null;
		}
		byte[] b = str.getBytes("UTF-8");

		// Decode with the same charset used for encoding rather than the
		// platform default.
		return new String(b, start, end, "UTF-8");
	}

以下代码会出现半个汉字问题

/**
 * ReadFileByteBuffer.java
 * cn.com.songjy.test.io
 * Function： TODO 
 *
 *   version    date      author
 * ──────────────────────────────────
 *   	1.0	 2013-8-31    songjy
 *
 * Copyright (c) 2013, TNT All Rights Reserved.
*/

package cn.com.songjy.test.io;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Reads {@code pom.xml} through a {@link FileChannel} in 64-byte chunks,
 * decodes every chunk as UTF-8 on its own, and logs the decoded text.
 *
 * <p>This class is a demonstration of the "half character" problem: each
 * chunk is decoded independently, so a multi-byte character that straddles a
 * chunk boundary is seen as malformed input and the decode fails with
 * {@code java.nio.charset.MalformedInputException}. That behavior is kept on
 * purpose.
 *
 * @author   songjy
 * @version  1.0
 * @since    v1.0
 * @Date	 2013-8-31	12:15:21 PM
 */

public class ReadFileByteBuffer {

	private static final Log log = LogFactory.getLog(ReadFileByteBuffer.class);

	/**
	 * Runs the demonstration. Failures to open, read, decode, or close the
	 * file are logged rather than propagated.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		try {
			FileInputStream fis = new FileInputStream("pom.xml");/* open the input file */
			try {
				FileChannel fc = fis.getChannel();
				ByteBuffer bb = ByteBuffer.allocate(64);// read 64 bytes at a time
				/* Created once: decode(ByteBuffer) resets the decoder on every call. */
				Charset charset = Charset.forName("UTF-8");
				CharsetDecoder decoder = charset.newDecoder();
				/* Pull data from the channel into the buffer until end of file. */
				while (-1 != fc.read(bb)) {
					bb.flip();/* switch the buffer from filling to draining */
					/*
					 * Decodes this chunk in isolation; a character split across
					 * two chunks raises MalformedInputException here, which is
					 * caught below as an IOException.
					 */
					CharBuffer cb = decoder.decode(bb);
					log.info(cb);
					bb.clear();
				}
			} finally {
				/* Release the file handle (and its channel) on every path. */
				fis.close();
			}
		} catch (FileNotFoundException e) {
			log.error(e.getMessage(), e);
		} catch (IOException e) {
			log.error(e.getMessage(), e);
		}
	}

}

/*会出现java.nio.charset.MalformedInputException错误，原因是“半个中文问题”*/

上一篇：如何友好地展示findbugs分析报告

下一篇：用字节数截取字符串