我用apache httpclient 4.1.1抓取网页,用String的indexOf方法搜索其中是否含有感兴趣的关键字,搜索GBK、GB2312编码网页时正常,遇到UTF-8编码网页就无法搜索,抓取下来的中文内容打印出来也是无法辨认。肯定是编码问题了,不知该怎么解决。搜索了好长时间,试了各种转换编码方法,但都不能把抓取下来的中文内容正常打印出来,搜索结果也都是-1。
------解决方案--------------------
看楼主也很纠结的;当学习,下载了apache httpclient 4.1.3,给个示例(使用探测工具探测编码失败,就不写了):
- Java code
import java.io.BufferedReader;import java.io.InputStreamReader;import org.apache.http.HttpResponse;import org.apache.http.HttpStatus;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpPost;import org.apache.http.impl.client.DefaultHttpClient;public class EncodedPostTest { public static void main(String[] args) throws Exception { HttpClient httpclient = new DefaultHttpClient(); BufferedReader bufReader = null; String charset = ""; try { HttpPost httppost = new HttpPost( "http://localhost:8080/TestJEEProject/EncodingServlet"); HttpResponse response = httpclient.execute(httppost); if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { if (response.getEntity().getContentEncoding() != null) { charset = response.getEntity().getContentEncoding().getValue(); }else if(response.getEntity().getContentType() != null){ String contentType = response.getEntity().getContentType().getValue().toLowerCase().replaceAll("\\s*", ""); charset = contentType.substring(contentType.indexOf("charset=") + "charset=".length()); }else{// //TODO: 使用默认字符编码 charset = "gbk"; } System.out.println("Charset : " + charset); bufReader = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), charset)); String strValue = bufReader.readLine(); while(strValue != null){ if(strValue.indexOf("编码") != -1){ System.out.println(strValue); } strValue = bufReader.readLine(); } } else { System.out.println("Unexpected failure: " + response.getStatusLine().toString()); } } finally { httpclient.getConnectionManager().shutdown(); if(bufReader != null){ bufReader.close(); } } }}