您的位置:首页 > Web前端 > JavaScript

WebDriver 登录 Jsoup 抓取内容

2016-04-14 00:00 435 查看
摘要:使用WebDriver登录西祠胡同并抓取内容的demo

1.环境

pom:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>mybatis</groupId>
  <artifactId>test</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>test</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <!-- Adds the mybatis-generator plugin. -->
  <!-- To run it, enter "mybatis-generator:generate" in the Goals field. -->
  <build>
    <plugins>
      <plugin>
        <groupId>org.mybatis.generator</groupId>
        <artifactId>mybatis-generator-maven-plugin</artifactId>
        <version>1.3.2</version>
        <configuration>
          <verbose>true</verbose>
          <overwrite>true</overwrite>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.38</version>
    </dependency>

    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>3.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.12</version>
    </dependency>

    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>
    <dependency>
      <groupId>net.sourceforge.jexcelapi</groupId>
      <artifactId>jxl</artifactId>
      <version>2.6.12</version>
    </dependency>

    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>

    <!-- junit is declared once only: the sources import org.junit.* (JUnit 4), -->
    <!-- and Maven requires each groupId:artifactId pair to be unique. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-server</artifactId>
      <version>2.53.0</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>

  </dependencies>

</project>


2.初始化WebDriver的类DriverFactory.java

package test;

import java.util.Arrays;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Creates the Chrome WebDriver instances used by the crawler.
 */
public class DriverFactory {

    /** Name of the system property that holds the chromedriver executable path. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";

    /** Fallback chromedriver location, used only when the property above is not set. */
    private static final String DEFAULT_CHROME_DRIVER_PATH =
            "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";

    /**
     * Starts a new Chrome browser with the {@code --test-type} and
     * {@code --start-maximized} arguments.
     *
     * <p>If the {@code webdriver.chrome.driver} system property is already set (for
     * example with {@code -Dwebdriver.chrome.driver=...}) it is left untouched;
     * otherwise it is set to the built-in default path.
     *
     * @return a newly started {@link ChromeDriver}; the caller is responsible for
     *         calling {@code quit()} on it
     */
    public static ChromeDriver create() {
        if (System.getProperty(CHROME_DRIVER_PROPERTY) == null) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }

        ChromeOptions options = new ChromeOptions();
        options.addArguments("--test-type", "--start-maximized");
        return new ChromeDriver(options);
    }

}


3.西祠胡同的登录抓取类

package test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openqa.jetty.http.SSORealm;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Platform;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.remote.DesiredCapabilities;

import mx4j.log.Log;

/**
 * Logs in to xici.net through a Chrome WebDriver session and then walks the
 * listing pages of a board, printing the title of every detail page found.
 */
public class XiciLogin2 {

    /** Prefix prepended to the relative detail-page links found on a listing page. */
    private static final String BASE_URL = "http://www.xici.net";

    /** Number of additional attempts made when a listing page yields no links. */
    private static final int MAX_RETRIES = 5;

    /** Last listing page number visited by {@link #main(String[])}. */
    private static final int LAST_PAGE = 15;

    public Logger log = Logger.getLogger(XiciLogin2.class);

    /** Cookies captured from the browser session by {@link #xiciLogin()}. */
    public static Set<Cookie> cookies = new HashSet<Cookie>();

    /** Shared browser session; created when this class is loaded. */
    public static ChromeDriver driver = DriverFactory.create();

    /**
     * Collects the detail-page links listed on one listing page.
     *
     * <p>When the page yields no links it is loaded again, up to
     * {@link #MAX_RETRIES} more times, and the result of the last attempt is
     * returned.
     *
     * @param url absolute URL of the listing page
     * @return absolute URLs of the detail pages; empty (never {@code null}) when
     *         none were found
     */
    public List<String> crawlSource(String url) {
        List<String> sourceUrls = extractDetailLinks(url);
        for (int retry = 1; sourceUrls.isEmpty() && retry <= MAX_RETRIES; retry++) {
            System.out.println("抓不到啦~重新抓一下");
            sourceUrls = extractDetailLinks(url);
        }
        return sourceUrls;
    }

    /** Loads {@code url} in the browser once and extracts the detail-page links from it. */
    private List<String> extractDetailLinks(String url) {
        System.out.println("开始抓:" + url);
        log.info("开始抓:" + url);

        List<String> sourceUrls = new ArrayList<String>();
        driver.get(url);
        Document document = Jsoup.parse(driver.getPageSource());

        Elements elements = document.select("table#board_t tbody tr");
        System.out.println(elements);
        if (elements.isEmpty()) {
            System.out.println(url + "中没有详细页链接~~");
        } else {
            for (Element element : elements) {
                if (element.select("td").isEmpty()) {
                    continue;
                }
                String targets = element
                        .select("td a[onclick=this.parentNode.className='visited';]")
                        .attr("href");
                if (targets == null || targets.isEmpty()) {
                    continue;
                }
                sourceUrls.add(BASE_URL + targets);
            }
        }
        System.out.println(sourceUrls.size());
        return sourceUrls;
    }

    /**
     * Opens a detail page in the browser and prints its title, or an empty line
     * when the title element is not present.
     *
     * @param url absolute URL of the detail page
     */
    public void crawlTarget(String url) {
        driver.get(url);
        Document document = Jsoup.parse(driver.getPageSource());
        System.out.println("抓" + url + "的标题");

        // Extract the title.
        Element element = document.select("div#doc_tit h1").first();
        if (element != null) {
            System.out.println("标题:" + element.text());
        } else {
            System.out.println("");
        }
    }

    /**
     * Entry point: configures log4j, logs in, then crawls listing pages 1 to
     * {@link #LAST_PAGE} of a fixed board, pausing between requests. The browser
     * is closed when the run ends, whether it succeeded or not.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PropertyConfigurator.configure("log4j.properties");
        XiciLogin2 xc = new XiciLogin2();
        String site = "http://www.xici.net/b1513005/";
        try {
            xc.xiciLogin();
            // Prints the page count of another board; the returned value is not used.
            xc.getMaxPageNum("http://www.xici.net/b1468535/");

            for (int page = 1; page <= LAST_PAGE; page++) {
                String sourceUrl = site + page;
                System.out.println("分页:" + sourceUrl);
                List<String> targetsList = xc.crawlSource(sourceUrl);
                if (targetsList.isEmpty()) {
                    System.out.println("没抓到详细页!!");
                } else {
                    for (String target : targetsList) {
                        try {
                            xc.crawlTarget(target);
                        } catch (RuntimeException e) {
                            // One failing detail page must not stop the rest of the crawl.
                            xc.log.error("Failed to crawl " + target, e);
                        }
                        Thread.sleep(3000);
                    }
                }
                Thread.sleep(3500);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the caller can observe the interruption.
            Thread.currentThread().interrupt();
            xc.log.error("Crawl interrupted", e);
        } catch (Exception e) {
            xc.log.error("Crawl aborted", e);
        } finally {
            driver.quit();
        }
    }

    /**
     * Fetches {@code site} with jsoup (not through the browser session) and reads
     * the page count from the text of the {@code div#page} element.
     *
     * @param site absolute URL of a board's first listing page
     * @return the page count, or {@code 0} when the page cannot be fetched, the
     *         element is missing, or its text holds no parseable number
     */
    public int getMaxPageNum(String site) {
        int maxPageNum = 0;
        try {
            Document document = Jsoup.connect(site).get();
            Element element = document.select("div#page").first();
            if (element == null) {
                log.warn("No div#page element found on " + site);
                return maxPageNum;
            }
            String s = extractPageCountText(element.text());
            System.out.println(s);
            maxPageNum = Integer.parseInt(s.trim());
        } catch (IOException e) {
            log.error("Failed to fetch " + site, e);
        } catch (NumberFormatException e) {
            log.error("Unparseable page count on " + site, e);
        }
        return maxPageNum;
    }

    /**
     * Returns the text between the first "共" and the following "页", or the
     * input unchanged when it contains no "共".
     */
    private static String extractPageCountText(String text) {
        int start = text.indexOf("共");
        if (start < 0) {
            return text;
        }
        String afterMarker = text.substring(start + 1);
        int end = afterMarker.indexOf("页");
        return end < 0 ? afterMarker : afterMarker.substring(0, end);
    }

    /**
     * Opens the login page, submits the credentials and stores the resulting
     * session cookies in {@link #cookies}.
     *
     * <p>The user name and password literals below are masked placeholders and
     * must be replaced with real credentials.
     *
     * @throws Exception declared for callers; WebDriver lookups throw unchecked
     *         exceptions when an expected element is missing
     */
    public void xiciLogin() throws Exception {
        driver.get("http://account.xici.net/login");
        WebElement user = driver.findElement(By.name("username"));
        WebElement pwa = driver.findElement(By.name("password"));

        // Clear the password box before typing into it.
        pwa.clear();

        // Enter the user name and the password.
        user.sendKeys("*******");
        pwa.sendKeys("*********");

        // Locate the login button and click it.
        driver.findElement(By.xpath("html/body/div[3]/div[2]/div[2]/form/div[4]/button")).click();

        // Print the page title after submitting.
        System.out.println(driver.getTitle());

        // The cookies are read from this same driver session, so they are only
        // recorded and printed, not added back to the driver.
        cookies = driver.manage().getCookies();
        System.out.println(cookies);
        for (Cookie cookie2 : cookies) {
            System.out.println(cookie2);
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: