欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Java按字节数截取字符串,一个中文长度为2

程序员文章站 2022-05-24 23:34:48
...
按字节数截取字符串时,可能会截到汉字中间,此时当然不能截取出乱码,也就是不能把一个完整的汉字截成两半。例如对"我ABC汉字d"这个字符串(GBK编码下一个汉字占2个字节),截取5个字节时结果应该是"我ABC";截取8个字节时结果应该是"我ABC汉",而不应该是"我ABC汉?"(其中"?"为半个汉字),即遇到半个汉字时向前截取(舍弃这半个汉字)。

/**
 * Returns the longest prefix of {@code str} whose GBK encoding occupies at most
 * {@code subSLength} bytes, never cutting a character in half.
 *
 * <p>In GBK an ASCII character takes 1 byte and a Chinese character takes 2 bytes, so
 * truncating {@code "\u6211ABC\u6C49\u5B57d"} to 8 bytes yields {@code "\u6211ABC\u6C49"}
 * (7 bytes): the next character would need 2 more bytes and is dropped entirely.
 *
 * @param str        the text to truncate; {@code null} is treated as empty
 * @param subSLength the byte budget; zero or negative values yield an empty string
 * @return the truncated prefix, or {@code ""} when {@code str} is {@code null} or the
 *         budget is not positive
 * @throws UnsupportedEncodingException if the runtime does not provide the GBK charset
 */
public static String subStr(String str, int subSLength)
        throws UnsupportedEncodingException {
    if (str == null || subSLength <= 0) {
        return "";
    }

    int usedBytes = 0; // GBK bytes consumed by the prefix accepted so far
    int end = 0;       // exclusive char index of the accepted prefix
    while (end < str.length()) {
        // Walk by code point so that a surrogate pair is kept or dropped as a whole.
        int charCount = Character.charCount(str.codePointAt(end));
        int width = str.substring(end, end + charCount).getBytes("GBK").length;
        // Written as a subtraction so the comparison cannot overflow for huge budgets.
        if (width > subSLength - usedBytes) {
            break;
        }
        usedBytes += width;
        end += charCount;
    }
    return str.substring(0, end);
}


备注:将字符编码GBK改为UTF-8,则每个中文长度按3个字节计算


以下方法是向后截取字符串
/**
 * Extracts a slice of the UTF-8 encoding of {@code str} and, if the slice would end in
 * the middle of a multi-byte character, extends it forward so that the character is
 * included in full.
 *
 * <p>Despite its name, {@code end} is a byte <em>count</em> (it is passed as the length
 * argument when the bytes are decoded), not an end offset.
 *
 * <p>Only the trailing edge is adjusted. If {@code start} itself falls inside a multi-byte
 * character, the orphaned leading bytes are decoded as replacement characters.
 *
 * @param str   the source text; {@code null} yields {@code null}
 * @param start byte offset into the UTF-8 encoding at which the slice begins
 * @param end   requested number of bytes; may be extended by up to 3 bytes to complete a
 *              trailing character
 * @return the decoded slice, or {@code null} when {@code str} is {@code null}
 * @throws UnsupportedEncodingException if the runtime does not provide UTF-8
 * @throws StringIndexOutOfBoundsException if {@code start} or {@code end} is negative or
 *         the requested range exceeds the encoded length
 */
public static String subStr_1(String str, int start, int end)
        throws UnsupportedEncodingException {
    if (str == null) {
        return null;
    }
    byte[] b = str.getBytes("UTF-8");
    if (start < 0 || end < 0 || end > b.length - start) {
        throw new StringIndexOutOfBoundsException(
                "start=" + start + ", end=" + end + ", byteLength=" + b.length);
    }

    // UTF-8 continuation bytes have the bit pattern 10xxxxxx. While the first byte past
    // the slice is one of those, the slice stops mid-character, so keep extending.
    int stop = start + end;
    while (stop < b.length && (b[stop] & 0xC0) == 0x80) {
        stop++;
    }
    return new String(b, start, stop - start, "UTF-8");
}


/**
 * Decodes a raw slice of the UTF-8 encoding of {@code str}.
 *
 * <p>No character-boundary adjustment is performed: if the slice begins or ends inside a
 * multi-byte character, the incomplete bytes are decoded as replacement characters.
 * Despite its name, {@code end} is a byte <em>count</em> (it is passed as the length
 * argument when the bytes are decoded), not an end offset.
 *
 * @param str   the source text; {@code null} yields {@code null}
 * @param start byte offset into the UTF-8 encoding at which the slice begins
 * @param end   number of bytes to decode
 * @return the decoded slice, or {@code null} when {@code str} is {@code null}
 * @throws UnsupportedEncodingException if the runtime does not provide UTF-8
 * @throws IndexOutOfBoundsException if the requested range lies outside the encoded bytes
 */
public static String getByteStr(String str, int start, int end) throws UnsupportedEncodingException{
		if (str == null) {
			return null;
		}
		byte[] b = str.getBytes("UTF-8");

		// Decode with the same charset that produced the bytes; the three-argument
		// constructor would fall back to the platform default charset.
		return new String(b, start, end, "UTF-8");
	}



以下代码会出现半个汉字问题
/**
 * ReadFileByteBuffer.java
 * cn.com.songjy.test.io
 * Function: TODO 
 *
 *   version    date      author
 * ──────────────────────────────────
 *   	1.0	 2013-8-31    songjy
 *
 * Copyright (c) 2013, TNT All Rights Reserved.
*/

package cn.com.songjy.test.io;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Reads {@code pom.xml} through a {@link FileChannel} in fixed 64-byte chunks, decodes
 * each chunk as UTF-8 and logs the decoded text.
 *
 * <p>Known limitation, kept on purpose because this class exists to demonstrate it: every
 * chunk is decoded independently, so a multi-byte character that straddles a chunk
 * boundary makes the decoder throw {@code java.nio.charset.MalformedInputException}
 * (the "half a Chinese character" problem).
 *
 * @author   songjy
 * @version  1.0
 * @since    v1.0
 * @Date	 2013-8-31	12:15:21 PM
 */

public class ReadFileByteBuffer {

	private static final Log log = LogFactory.getLog(ReadFileByteBuffer.class);

	/**
	 * Reads the file chunk by chunk and logs each decoded chunk. I/O and decoding
	 * failures are logged rather than propagated.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		FileInputStream fis = null;
		try {
			fis = new FileInputStream("pom.xml");/* open the input file */
			FileChannel fc = fis.getChannel();
			ByteBuffer bb = ByteBuffer.allocate(64);// 64 bytes are read per pass
			/* decode(ByteBuffer) resets the decoder on every call, so one instance can be reused */
			Charset charset = Charset.forName("UTF-8");
			CharsetDecoder decoder = charset.newDecoder();
			/* fill the buffer from the channel until end of file */
			while(-1 != fc.read(bb)){
				bb.flip();/* switch the buffer from being filled to being drained */
				CharBuffer cb = decoder.decode(bb);/* throws if the chunk ends mid-character */
				log.info(cb);
				bb.clear();/* make the whole buffer writable again for the next read */
			}
		} catch (FileNotFoundException e) {
			log.error(e.getMessage(), e);
		} catch (IOException e) {
			log.error(e.getMessage(), e);
		} finally {
			closeQuietly(fis);
		}
	}

	/** Closes the stream (and with it the channel obtained from it), logging any failure. */
	private static void closeQuietly(FileInputStream stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException e) {
			log.error(e.getMessage(), e);
		}
	}

}

/*会出现java.nio.charset.MalformedInputException错误,原因是“半个中文问题”*/