当前位置: 代码迷 >> 综合 >> HtmlUnit 模拟浏览器请求 Java可以调用的内置浏览器
  详细解决方案

HtmlUnit 模拟浏览器请求 Java可以调用的内置浏览器

热度:37   发布时间:2024-01-31 09:10:35.0

引入包:

<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.42.0</version>
</dependency>

请求类:

package com.xxx;import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ImmediateRefreshHandler;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.TextPage;
import com.gargoylesoftware.htmlunit.UnexpectedPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.LogFactory;public class Gaher {private final WebClient webclient;private String referer;public Gaher() {//LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); // 关闭注释webclient = new WebClient(BrowserVersion.FIREFOX_68); // 设置 user-agentwebclient.getOptions().setJavaScriptEnabled(true); // 启用javascriptwebclient.getOptions().setThrowExceptionOnScriptError(false); // 关闭js的异常抛出webclient.getOptions().setCssEnabled(false); // 不加载CSS文件//webclient.getCookieManager().clearCookies();//webclient.getCache().clear();webclient.setRefreshHandler(new ImmediateRefreshHandler());webclient.getOptions().setTimeout(600 * 1000);webclient.setJavaScriptTimeout(600 * 1000);webclient.setAjaxController(new NicelyResynchronizingAjaxController());webclient.setJavaScriptTimeout(600 * 1000);webclient.getOptions().setRedirectEnabled(true);webclient.waitForBackgroundJavaScript(60 * 1000);webclient.getOptions().setThrowExceptionOnFailingStatusCode(false);webclient.getOptions().setUseInsecureSSL(true);}/*** 请求html页并返回html内容* @param url* @return* @throws IOException */public String getHtml(String url) throws IOException {WebRequest request = new WebRequest(new URL(url));if (referer != null) {request.setAdditionalHeader("Referer", referer);}HtmlPage page = webclient.getPage(request);WebResponse response = page.getWebResponse();referer = url;return response.getContentAsString();}/*** 请求非HTML页并返回内容,如请求类型为:text/plain; charset=gb2312* @param url* @return* @throws IOException */public String getText(String url) throws IOException {WebRequest request = new WebRequest(new URL(url));if (referer != null) {request.setAdditionalHeader("Referer", referer);}TextPage page = webclient.getPage(request);WebResponse response = page.getWebResponse();return response.getContentAsString();}/*** 下载资源文件,如:图片,视频* @param url* @param filename* @throws IOException */public void download(String url, String filename) throws IOException 
{WebRequest request = new WebRequest(new URL(url));if (referer != null) {request.setAdditionalHeader("Referer", referer);}UnexpectedPage page = webclient.getPage(request);InputStream is = page.getWebResponse().getContentAsStream();FileOutputStream output = new FileOutputStream(filename);IOUtils.copy(is, output);output.close();}public void close() {webclient.close();}
}

测试:

/**
 * Demo entry point: fetches an HTML page, a text resource and two binary files.
 *
 * @param args unused
 * @throws IOException if any request or file write fails
 */
public static void main(String[] args) throws IOException {
    Gaher gaher = new Gaher();
    try {
        // getHtml is for HTML pages; pointing it at a txt resource raises an error.
        String html = gaher.getHtml("https://www.baidu.com/");
        // getText is for non-HTML text; pointing it at an HTML page raises an error.
        String text = gaher.getText("https://xxx.txt");
        gaher.download("https://xxx.jpg", "E:\\temp\\tmp\\logs\\a.jpg");
        gaher.download("http://xxx.mp4", "E:\\temp\\tmp\\logs\\b.mp4");
    } finally {
        // Always release the underlying web client, even when a request fails.
        gaher.close();
    }
}

 

  相关解决方案