当前位置: 代码迷 >> Java Web开发 >> 获取HTML源码文档中<div class="stat stat_area"></div>格式中的内容,该怎么解决
  详细解决方案

获取HTML源码文档中<div class="stat stat_area"></div>格式中的内容,该怎么解决

热度:718   发布时间:2016-04-17 11:12:45.0
获取HTML源码文档中<div class="stat stat_area"></div>格式中的内容
我要获取 http://v.youku.com/v_show/id_XMzI5NTIyNzMy.html 页面源码中 <div class="stat stat_area">内容</div> 这个DIV里的内容,可是提取结果为空,正则表达式应该也没错,请高手进来帮个忙。




import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Downloads a page and extracts every region of it that matches a regular
 * expression.
 */
public class testtest {

    /**
     * Reads the whole resource behind {@code url} into a buffer.
     *
     * <p>I/O failures are printed to stderr and whatever was read before the
     * failure (possibly nothing) is returned; this method never returns
     * {@code null}.
     *
     * @param url location of the document to download
     * @return the decoded text of the document
     */
    public StringBuffer getContent(URL url)
    {
        StringBuffer contentBuffer = new StringBuffer();
        InputStreamReader reader = null;

        try {
            // NOTE(review): decodes as UTF-8 instead of the platform default so the
            // result does not depend on the machine's locale -- confirm the target
            // page is actually served as UTF-8.
            reader = new InputStreamReader(url.openStream(), "UTF-8");
            // Read in chunks; read() returns -1 once the end of the stream is reached.
            char[] chunk = new char[4096];
            int count;
            while ((count = reader.read(chunk)) != -1) {
                contentBuffer.append(chunk, 0, count);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the underlying connection/stream.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close() itself fails.
                }
            }
        }

        return contentBuffer;
    }

    /**
     * Finds every match of {@code patternString} in {@code contentString} and
     * returns the matched texts concatenated in order of appearance.
     *
     * <p>The pattern is compiled with {@link Pattern#DOTALL} so that {@code .}
     * also matches line terminators. Without it, an expression such as
     * {@code <div ...>(.*?)</div>} can never match an element whose opening and
     * closing tags are on different lines, and the result is empty.
     *
     * @param contentString text to search
     * @param patternString regular expression to look for
     * @return all complete matches (group 0) joined together, or an empty
     *         string when nothing matches
     */
    public String getVideoInf(StringBuffer contentString,String patternString)
    {
        Pattern pattern = Pattern.compile(patternString, Pattern.CANON_EQ | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(contentString);

        StringBuilder joined = new StringBuilder();
        while (matcher.find()) {
            joined.append(matcher.group());
        }
        return joined.toString();
    }

    /**
     * Fetches a fixed video page and prints the matched {@code <div>} block(s).
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        testtest ddTesttest = new testtest();
        String urlStr = "http://v.youku.com/v_show/id_XMzI5NTIyNzMy.html";
        try {
            StringBuffer stringBuffer = ddTesttest.getContent(new URL(urlStr));
            String temp = ddTesttest.getVideoInf(stringBuffer,"<div class=\"stat stat_area\">(.*?)</div>");
            System.out.println("test:\n"+temp);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }

    }

}


------解决方案--------------------
Java code
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a page over HTTP and extracts the first region of it that matches
 * a regular expression.
 */
public class Test {

    /**
     * Downloads the document behind {@code url} and returns it as one string
     * with every line trimmed and the line breaks removed.
     *
     * <p>Lines are joined without separators on purpose: the extraction pattern
     * is compiled without {@code DOTALL}, so {@code .} can only span content
     * that ends up on a single line.
     *
     * @param url HTTP location of the document
     * @return the flattened document text, or {@code null} if an I/O error occurs
     */
    public String getContent(URL url) {
        StringBuilder builder = new StringBuilder();
        BufferedReader br = null;
        try {
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            con.connect();
            InputStream iStream = con.getInputStream();
            // NOTE(review): decodes as UTF-8 instead of the platform default so the
            // result does not depend on the machine's locale -- confirm the target
            // page is actually served as UTF-8.
            br = new BufferedReader(new InputStreamReader(iStream, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() > 0) {
                    builder.append(line.trim());
                }
            }
        } catch (IOException e) {
            return null;
        } finally {
            // Always release the stream, including on the early return above.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close() itself fails.
                }
            }
        }
        return builder.toString();
    }

    /**
     * Returns the text captured by the first match of {@code regex} in
     * {@code content}.
     *
     * @param content text to search; may be {@code null} (for example when
     *                {@link #getContent(URL)} failed)
     * @param regex   regular expression; its first capturing group is returned,
     *                or the whole match when it declares no group
     * @return the captured text, or {@code null} when {@code content} is
     *         {@code null} or nothing matches
     */
    public String getVideoInf(String content, String regex) {
        // getContent() signals failure with null; treat that as "no match"
        // instead of letting Pattern.matcher(null) throw.
        if (content == null) {
            return null;
        }

        Pattern pattern = Pattern.compile(regex);
        Matcher m = pattern.matcher(content);
        if (!m.find()) {
            return null;
        }

        // group(1) would throw for a pattern without capturing groups.
        return m.groupCount() >= 1 ? m.group(1) : m.group();
    }

    /**
     * Fetches a fixed video page and prints the content of its
     * {@code stat stat_area} div.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Test test = new Test();
        String urlStr = "http://v.youku.com/v_show/id_XMzI5NTIyNzMy.html";
        try {
            String str = test.getContent(new URL(urlStr));
            String regexp = "<div\\s*class=\"stat stat_area\"\\s*[^>]*>(.+?)</div>";
            String tmp = test.getVideoInf(str, regexp);
            System.out.println(tmp);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
}
  相关解决方案