WebDriver 登录 + Jsoup 抓取内容
2016-04-14 00:00
435 查看
摘要:使用WebDriver登录西祠胡同并用Jsoup抓取页面内容的demo
package test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import mx4j.log.Log;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openqa.jetty.http.SSORealm;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Platform;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
publicclassXiciLogin2{
publicLoggerlog=Logger.getLogger(Main.class);
publicstaticSet<Cookie>cookies=newHashSet<Cookie>();
publicstaticChromeDriverdriver=DriverFactory.create();
/***抓取到每一个分页上所有详细页链接**@paramurl*/
publicList<String>crawlSource(Stringurl){
inttime=1;
System.out.println("开始抓:"+url);
log.info("开始抓:"+url);
List<String>sourceUrls=newArrayList<String>();
StringbaseUrl="http://www.xici.net";
driver.get(url);
Documentdocument=Jsoup.parse(driver.getPageSource());
WebElementwebElement=driver.findElement(By.xpath(".//*[@id='board_t']/tbody/tr/td[2]/a"));
Elementselements=document.select("table#board_ttbodytr");
System.out.println(elements);
if(elements!=null){
for(Elementelement:elements){
if(element.select("td").isEmpty()){
continue;
}
Stringtargets=element.select("tda[onclick=this.parentNode.className='visited';]").attr("href");
if(targets==""||targets==null){
continue;
}
targets=baseUrl+targets;//System.out.println(targets);
sourceUrls.add(targets);
}
}else
{
System.out.println(url+"中没有详细页链接~~");
}
System.out.println(sourceUrls.size());
if(sourceUrls.size()==0&&time<=5){
System.out.println("抓不到啦~重新抓一下");
crawlSource(url);
time++;
}
returnsourceUrls;
}
/***解析详细页出东西*/
publicvoidcrawlTarget(Stringurl){
driver.get(url);
Documentdocument=Jsoup.parse(driver.getPageSource());
System.out.println("抓"+url+"的标题");//取标题Elementelement=
document.select("div#doc_tith1").first();
if(element!=null){
System.out.println("标题:"+element.text());
}else{
System.out.println("");
}
}
publicstaticvoidmain(String[]args){
PropertyConfigurator.configure("log4j.properties");
XiciLogin2xc=newXiciLogin2();
Stringsite="http://www.xici.net/b1513005/";
try{
xc.xiciLogin();//
xc.crawlTarget("http://www.xici.net/d191739198.htm");
xc.getMaxPageNum("http://www.xici.net/b1468535/");
intpage=1;//
intmaxPageNum=xc.getMaxPageNum(site);
do{
StringsourceUrl=site+page;
System.out.println("分页:"+sourceUrl);
List<String>targetsList=xc.crawlSource(sourceUrl);
if(targetsList.isEmpty()){
System.out.println("没抓到详细页!!");
}else{
for(Stringtarget:targetsList){
try{
xc.crawlTarget(target);
Thread.sleep(3000);
}catch(Exceptione){
e.printStackTrace();
}
}
}
page++;
Thread.sleep(3500);
}while(page<=15);
}catch(
Exceptione){
e.printStackTrace();
}
}
/***获取当前入口site的最大分页数**/
publicintgetMaxPageNum(Stringsite){
Documentdocument=null;
intmaxPageNum=0;
try{
document=Jsoup.connect(site).get();
Elementelement=document.select("div#page").first();
Strings=element.text();
if(s.contains("共")){
s=s.split("共")[1];
s=s.split("页")[0];
}
System.out.println(s);
maxPageNum=Integer.parseInt(s);
}catch(IOExceptione){
e.printStackTrace();
}
returnmaxPageNum;
}
publicvoidxiciLogin()throwsException{
System.setProperty("webdriver.chrome.driver",
"C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe");
driver.get("http://account.xici.net/login");
WebElementuser=driver.findElement(By.name("username"));
WebElementpwa=driver.findElement(By.name("password"));//分别将用户名和密码文本框清空
//user.clear();
pwa.clear();//输入用户名和密码user.sendKeys("*******");
pwa.sendKeys("*********");//找到登陆按钮点击//
driver.findElement(By.name("TANGRAM__PSP_3__submit")).click();
driver.findElement(By.xpath("html/body/div[3]/div[2]/div[2]/form/div[4]/button")).click();
//输出titleSystem.out.println(driver.getTitle());cookies=
driver.manage().getCookies();
System.out.println(cookies);
for(Cookiecookie2:cookies){
driver.manage().addCookie(cookie2);
System.out.println(cookie2);
}//能打开15页说明登陆成功//
driver.get("http://www.xici.net/b1402132/15");
}
}
1.环境
pom:<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>mybatis</groupId> <artifactId>test</artifactId> <version>0.0.1-SNAPSHOT</version> <packaging>jar</packaging> <name>test</name> <url>http://maven.apache.org</url> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> <!--添加mybatis-generator插件--> <!--——>在Goals框中输入:mybatis-generator:generate运行mybatis插件--> <build> <plugins> <plugin> <groupId>org.mybatis.generator</groupId> <artifactId>mybatis-generator-maven-plugin</artifactId> <version>1.3.2</version> <configuration> <verbose>true</verbose> <overwrite>true</overwrite> </configuration> </plugin> </plugins> </build> <dependencies> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.38</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>3.8.1</version> <scope>test</scope> </dependency> <dependency> <groupId>org.mybatis</groupId> <artifactId>mybatis</artifactId> <version>3.3.1</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.12</version> </dependency> <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId> <version>1.2</version> </dependency> <dependency> <groupId>net.sourceforge.jexcelapi</groupId> <artifactId>jxl</artifactId> <version>2.6.12</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.5.2</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.8.3</version> </dependency> <dependency> 
<groupId>org.seleniumhq.selenium</groupId> <artifactId>selenium-server</artifactId> <version>2.53.0</version> </dependency> <dependency> <groupId>log4j</groupId> <artifactId>log4j</artifactId> <version>1.2.17</version> </dependency> </dependencies> </project>
2.初始化WebDriver的类DriverFactory.java
packagetest; importjava.util.Arrays; importorg.openqa.selenium.WebDriver; importorg.openqa.selenium.chrome.ChromeDriver; importorg.openqa.selenium.chrome.ChromeOptions; importorg.openqa.selenium.remote.DesiredCapabilities; publicclassDriverFactory{ publicstaticChromeDrivercreate(){ //TODOAuto-generatedmethodstub Stringchromdriver="C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe"; System.setProperty("webdriver.chrome.driver",chromdriver); ChromeOptionsoptions=newChromeOptions(); DesiredCapabilitiescapabilities=DesiredCapabilities.chrome(); capabilities.setCapability("chrome.switches",Arrays.asList("--start-maximized")); options.addArguments("--test-type","--start-maximized"); ChromeDriverdriver=newChromeDriver(options); returndriver; } }
3.西祠胡同的登录抓取类XiciLogin2.java(完整代码见本文开头)
相关文章推荐
- JavaScript运行机制
- JavaScript事件对象与事件处理程序
- JavaScript总结
- js数组与字符串的相互转换方法
- JavaScript简介
- 笔记练习:《Javascript入门经典(第5版)》page212_17.7Practice
- JavaScript-js-创建表格-创建多行多列表
- JavaScript九宫格数独生成算法
- 【精心推荐】几款极好的 JavaScript 文件上传插件
- js对json的操作
- Phantomjs 文件读写及文件夹操作
- 调试Javascript代码
- js获取浏览器基本信息:document.body.clientWidth/clientHeight/scrollWidth/scrollTop。
- Seajs 简易文档 提供简单、极致的模块化开发体验
- 使用getJSON()异步请求服务器返回json格式数据
- JavaScript设计模式返璞归真
- Phantomjs 调试方法
- js跨域数据传输
- Jsp自定义标签
- jsonp 跨站的理解