附上phantomjs示例代码
package cn.wang.utils;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.CookieManager;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import java.util.Random;
public class htmlUnitUtils {
static WebClient webClient = null
static Random random = new Random()
static{
//1.创建对象
webClient = new WebClient(BrowserVersion.CHROME)
//2.设置参数
//启动js
webClient.getOptions().setJavaScriptEnabled(true)
//关闭css渲染
webClient.getOptions().setCssEnabled(false)
//启动重定向
webClient.getOptions().setRedirectEnabled(true)
//设置连接超时时间 ,这里是10S。如果为0,则无限期等待
webClient.getOptions().setTimeout(1000 * 15)
//启动cookie管理
webClient.setCookieManager(new CookieManager())
//启动ajax代理
webClient.setAjaxController(new NicelyResynchronizingAjaxController())
//js运行时错误,是否抛出异常
webClient.getOptions().setThrowExceptionOnScriptError(false)
//设置浏览器请求信息
webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xmlq=0.9,*/*q=0.8")
webClient.addRequestHeader("Accept-Encoding", "gzip, deflate")
webClient.addRequestHeader("Accept-Language", "zh-CN,zhq=0.8,zh-TWq=0.7,zh-HKq=0.5,en-USq=0.3,enq=0.2")
webClient.addRequestHeader("Connection", "keep-alive")
webClient.addRequestHeader("Upgrade-Insecure-Requests", "1")
}
public static void runJs(String url){
try {
webClient.addRequestHeader("User-Agent", Constant.useragents[random.nextInt(Constant.useragents.length)])
//等待js渲染执行 waitime等待时间(ms)
webClient.waitForBackgroundJavaScript(1000 * 10)
//3.获取页面
webClient.getPage(url)
} catch (Exception e) {
e.printStackTrace()
} finally {
if(webClient != null){
webClient.close()
}
}
}
public static void main(String[] args) {
runJs("http://www.gou.hk/")
System.setProperty("phantomjs.binary.path", "D:\\works\\tool\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe")
}
}
推荐大家使用神箭手云爬虫写爬虫,完全在云上编写和执行爬虫,不需要配置任何开发环境,快速开发快速实现。
简单几行 JavaScript 就可以实现复杂的爬虫,同时提供很多功能函数:反反爬虫、JS 渲染、数据发布、图表分析、反防盗链等,这些在开发爬虫过程中经常会遇到的问题都由神箭手帮你解决。
神箭手上有开发者文档,详细说明了如何编写爬虫脚本,还有很多网站的源码分享哦。