//import java.io.File; import java.io.IOException; import java.net.URL; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.PrettyXmlSerializer; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; //import com.sun.xml.internal.txw2.output.XmlSerializer; /** * 数据抓取 */ public class HtmlClean { @SuppressWarnings("deprecation") /** * 数据抓取 */ public void cleanHtml(String htmlurl, String xmlurl, String xpath) throws XPatherException { try { //将目标网址内容抓取下来存到本地的XML文件中(格式化) //long start = System.currentTimeMillis(); HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setUseCdataForScriptAndStyle(true); props.setRecognizeUnicodeChars(true); props.setUseEmptyElementTags(true); props.setAdvancedXmlEscape(true); props.setTranslateSpecialEntities(true); props.setBooleanAttributeValues("empty"); TagNode node = cleaner.clean(new URL(htmlurl)); // System.out.println(in); //System.out.println(((TagNode) ns[0]).getText()); //System.out.println("vreme:" + (System.currentTimeMillis() - start)); new PrettyXmlSerializer(props).writeXmlToFile(node, xmlurl);//格式化保存 String result = new PrettyXmlSerializer(props).getXmlAsString(node); //System.out.println("vreme:" + (System.currentTimeMillis() - start)); System.out.println("*********************************************************"); // // //TagNode Xmlnode = cleaner.clean(new URL(xmlurl));//从已经格式化的XML文件中取出所要的数据 TagNode Xmlnode = cleaner.clean(new String(result));//从已格式化的String中取出所要的数据 Object[] ns = Xmlnode.getElementsByName("title", true); // 标题 if (ns.length > 0) { System.out.println("title=" + ((TagNode) ns[0]).getText()); } ns = Xmlnode.evaluateXPath(xpath); // 选取class为指定dixian1的所有td标签 // for (int i = 0; i < ns.length; i++) { // String in = cleaner.getInnerHtml((TagNode) ns[i]); // System.out.println("<span>" + in + "</span>"); // } System.out.println("*********************************************************"); String in = 
cleaner.getInnerHtml((TagNode) ns[0]); for(int i=0 ;i<ns.length ;i++){ in = cleaner.getInnerHtml((TagNode) ns[i]); System.out.println(in); if((i+1)%8==0){ System.out.println("*********************************************************"); } } } catch (IOException e) { e.printStackTrace(); } } public static void main(String[] args) throws XPatherException { HtmlClean cleaner = new HtmlClean(); cleaner.cleanHtml("http://app.sipo.gov.cn:8080/sipo2008/searchfee/searchfee_action.jsp?sqh=01351345.1", "E://text/test.xml","//td[@class='dixian1']"); } }?
详细解决方案
初试htmlCleaner组合Xpath
热度:109 发布时间:2012-09-06 10:37:01.0
相关解决方案
- xpath 语法!
- XPATH 怎么返回不包含关键字的属性的Html标签
- htmlcleaner 运用示例
- htmlcleaner 应用示例
- xpath
- php xPath 解析xml文件 有关问题
- XPath 示范
- (2)XPath 语法
- (3)XPath Axes(坐标轴)
- XPath 术语(1)
- javascript XPath 实现
- javascript XPath 实现【补充】
- 请教:xpath,xquery这些技术现在用的多吗
- php xPath 解析xml文件 有关问题
- 从另一个 XPath 结果获取 XPath 结果
- 无法使用 xpath 获取 youtube 视频的持续时间
- 语法错误:XPath 不是合法表达式 一般如何查找和修复 XPath 语法问题
- XPath,XML命名空间和Java
- 如何使用不同的搜索( cssSelector / tag / ClassName )创建元素的 Xpath
- java 软件中main方法无法运行 import com.sun.org.apache.xpath.internal.operations.String;
- Python + Selenium(2.5)- 使用 Xpath 定位元素
- 爬虫知识点(xpath)
- 爬虫数据-Xpath(豆瓣读书小案例)
- 爬虫数据提取-xpath
- python爬虫学习笔记day1 -requests模块,数据解析(正则,bs4,xpath)
- python爬虫之数据解析(XPath)
- xpath-helper的使用
- XPATH,代理IP,JSON数据格式,session
- 网络爬虫,xpath
- xpath 解析html代码