package com.jadyer.httpclient; import java.io.FileInputStream; import java.io.FileOutputStream; import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.tags.Div; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.MetaTag; import org.htmlparser.util.NodeList; /** * HTMLParser入门_02_网络爬虫的雏形_解析文章的主题和作者及关键字等信息 * @see --------------------------------------------------------------------- * @see 所有jar如下 * @see commons-io-2.3.jar * @see commons-codec-1.6.jar(以下7个jar取自HttpClient官网下载的httpcomponents-client-4.2.1-bin.zip) * @see commons-logging-1.1.1.jar * @see fluent-hc-4.2.1.jar * @see httpclient-4.2.1.jar * @see httpclient-cache-4.2.1.jar * @see httpcore-4.2.1.jar * @see httpmime-4.2.1.jar * @see filterbuilder.jar(以下5个jar取自HTMLParser官网下载的HTMLParser-2.0-SNAPSHOT-bin.zip) * @see htmllexer.jar * @see htmlparser.jar * @see sitecapturer.jar * @see thumbelina.jar * @see --------------------------------------------------------------------- * @see 本文所用的HTMLParser工具类,详见我的下面的这一篇文章 * @see http://blog.csdn.net/jadyer/article/details/8656479 * @see --------------------------------------------------------------------- * @create Mar 10, 2013 5:05:55 PM * @author 玄玉<http://blog.csdn/net/jadyer> */ public class SpiderDemo { private static final String articleURI = "http://www.ibm.com/developerworks/cn/java/j-javaroundtable/index.html"; private static final String localHTML = "D:/Download/localHTML.html"; /** * 下载文章 */ private static void downloadArticle() throws Exception { HttpClient httpClient = new DefaultHttpClient(); HttpGet httpGet = new HttpGet(articleURI); try { HttpResponse response = 
httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); if(null != entity){ String responseContent = EntityUtils.toString(entity, "UTF-8"); EntityUtils.consume(entity); //文章内容写到本地(IOUtils干完活儿会自动关闭IO流) IOUtils.write(responseContent, new FileOutputStream(localHTML), "UTF-8"); } }finally{ httpClient.getConnectionManager().shutdown(); } } /** * 解析具有某种特征的标签 */ @SuppressWarnings("serial") private static void parseSpecifiedTag() throws Exception{ String inputHTML = IOUtils.toString(new FileInputStream(localHTML), "UTF-8"); Parser parser = new Parser(); parser.setInputHTML(inputHTML); //提取name="title"的<meta>标签,符合条件的应该只有一个<meta>标签,所以这里用单数 //这里我们自己写一个Filter,并且用内部类的方式 NodeList metaTag = parser.parse( new NodeFilter(){ @Override public boolean accept(Node node) { //找到<meta>标签 if(node instanceof MetaTag){ MetaTag mt = (MetaTag)node; //找到<meta name="title">的标签 if(null!=mt.getMetaTagName() && "title".equals(mt.getMetaTagName())){ return true; } } return false; } } ); //提取<meta name="title" content="2010 年春 Java 平台圆桌会议"/>标签中的content属性值 System.out.println("name=title,content=" + ((MetaTag)metaTag.elementAt(0)).getMetaContent()); } /** * 解析文章的简介,关键字,作者姓名等信息 */ private static void parseAbstractAndKeywords() throws Exception{ String html = IOUtils.toString(new FileInputStream(localHTML), "UTF-8"); List<MetaTag> metaTags = HTMLParseUtil.parseTags(html, MetaTag.class, "name", "Abstract"); for(MetaTag mt : metaTags){ System.out.println("文章的简介:" + mt.getMetaContent()); } MetaTag mt = HTMLParseUtil.parseTag(html, MetaTag.class, "name", "Keywords"); System.out.println("文章关键字:" + mt.getMetaContent()); List<Div> divTags = HTMLParseUtil.parseTags(html, Div.class, "class", "author"); for(Div div : divTags){ //div.getStringText()可以得到<div></div>所嵌套的内容 LinkTag aTag = HTMLParseUtil.parseTag(div.getStringText(), LinkTag.class, "class", "dwauthor"); System.out.println("作者姓名:" + aTag.getStringText()); } } }
详细解决方案
HTMLParser入门_02_网络爬虫的雏形_解析文章的主题和作者及关键字等信息
热度:255 发布时间:2013-03-13 10:56:58.0
相关解决方案
- python模块介绍- HTMLParser 简单的HTML和XHTML解析器
- 利用python脚本抓取AC的代码[爬虫+HTMLParser+handle_entityref+正则表达式+模拟登陆+资料操作]
- [转][htmlparser]htmlparser应用例子(全)
- 应用 HttpClient 和 HtmlParser 实现简易爬虫
- htmlparser 除了html标签体(获取body,title纯文本)
- HtmlParser 解析搜寻页面
- 运用 HttpClient 和 HtmlParser 实现简易爬虫
- [转]org.htmlparser.util.ParserException: Error in opening a connection to *
- Python抓取页面中超链接(URL)的三种方法比较(HTMLParser、pyquery、正则表达式)
- 应用 HttpClient 和 HtmlParser 实现简易爬虫
- HTMLParser
- htmlparser 获取页面匹配链接
- org.htmlparser.util.EncodingChangeException: character
- htmlparser 抓不到网页的内容。该如何解决
- org.htmlparser.util.ParserException: reset stream failed
- 哪位高手知道org.htmlparser.StringNode在哪个版本的jar包
- 【HtmlParser】提取网页的meta信息解决方法
- 使用ASIHTTPRequest 编译提示找不到libxml/HTMLparser.h的解决方法
- org.htmlparser.util.ParserException: Error in opening a connection to ***
- coursera-dl 报错 AttributeError (‘HTMLParser’ object has no attribute ‘unescape’)