Building a Crawler with HtmlUnit Test Units
Front-end work sometimes runs into projects that suddenly need data collected from the web. What approach is simple to understand and usable over the long term? Using browser-based test units as the crawler is the most convenient option: lightly modify the test programs you already use day to day, pair them with a crawler proxy, and you can start collecting data right away.
HtmlUnit is a headless-browser solution for Java. Its API drives HTTP and models the returned HTML, so it can request pages, submit forms, follow links, and so on, fully emulating a user's browser. It supports complex JavaScript and AJAX libraries and can emulate several browsers, including Chrome, Firefox, and IE.
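As a minimal sketch of those interactions (the URL, form name, and field names below are hypothetical placeholders, not from any real site), filling in and submitting a form looks like this:
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
public class FormDemo {
    public static void main(String[] args) throws Exception {
        // try-with-resources closes the WebClient automatically
        try (WebClient webClient = new WebClient()) {
            // Load the page containing the form (hypothetical URL)
            HtmlPage page = webClient.getPage("https://example.com/search");
            // Locate the form and its fields by name (hypothetical names)
            HtmlForm form = page.getFormByName("search");
            HtmlTextInput query = form.getInputByName("q");
            query.type("htmlunit");
            // Submitting returns the next page, just like clicking in a browser
            HtmlSubmitInput submit = form.getInputByName("go");
            HtmlPage result = submit.click();
            System.out.println(result.getTitleText());
        }
    }
}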
Below is a simple demo that goes through a crawler proxy to query an IP-lookup site. Point the target URL at the page whose data you need and you will get the corresponding content back; add a parsing module on top and it is basically ready to use. Example:
package htmlunit;

import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.impl.client.BasicCredentialsProvider;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlunitDemo {
    // Proxy server (product site: www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    final static Integer proxyPort = 31111;

    // Proxy credentials
    final static String proxyUser = "USERNAME";
    final static String proxyPass = "PASSWORD";

    public static void main(String[] args) {
        // Register the proxy username/password for authentication
        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(
                new AuthScope(proxyHost, proxyPort),
                new UsernamePasswordCredentials(proxyUser, proxyPass));

        // Emulate Chrome and route all traffic through the proxy
        WebClient webClient = new WebClient(BrowserVersion.CHROME, proxyHost, proxyPort);
        webClient.setCredentialsProvider(credsProvider);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setCssEnabled(false);

        try {
            HtmlPage page = webClient.getPage("http://httpbin.org/ip");
            // Wait up to 30s for background JavaScript before reading the page
            webClient.waitForBackgroundJavaScript(30000);
            System.out.println(page.asXml());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the client only after the page has been read
            webClient.close();
        }
    }
}
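Once the page is fetched, a parsing step turns it into usable data. Here is a minimal sketch using HtmlUnit's built-in XPath support; the XPath expression and node structure are assumptions to adapt to whatever page you actually target:
import java.util.List;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class ParseDemo {
    // Print the text of every node matched by a (hypothetical) XPath
    static void printItems(HtmlPage page) {
        List<DomNode> nodes = page.getByXPath("//div[@class='item']/a");
        for (DomNode node : nodes) {
            System.out.println(node.getTextContent().trim());
        }
    }
}
getByXPath returns whatever DOM nodes match, so the same pattern covers tables, lists, and attribute lookups.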