java扫描免费代理服务器
2017-09-22 13:51
281 查看
免费代理服务器的收集与使用(java版)
一、前言:
>概念:
代理(英语:Proxy),也称网络代理,是一种特殊的网络服务,允许一个网络终端(一般为客户端)
通过这个服务与另一个网络终端(一般为服务器)进行非直接的连接。一些网关、路由器等网络设备
具备网络代理功能。一般认为代理服务有利于保障网络终端的隐私或安全,防止攻击。
>功能:
1 突破自身IP访问限制。
2 网络用户可以通过代理访问外国网站。
3 访问一些单位或团体内部资源。
4 突破中国电信的IP封锁。
5 提高访问速度
6 隐藏真实IP,兼具防火墙的功能。
二、介绍:
>>在学习中,由于需要频繁抓取一些网站的数据,而且保证数据的实时性,有效性,就需要多次访问服务器。
这样的话就会增加服务器的负荷。所以网站管理员会采取技术手段,对一定时间内频繁访问的IP地址进行封锁处理。所
以此时就需要大量的代理服务器交替使用IP地址,来帮助你抓取数据。
>>文中所用到的代理服务器并不是从网络中扫描得到,而是在http://www.xicidaili.com/nn/这个网站中提取而来,
因为如果要是自己扫描的话,得需要很多高性能的服务器和别的技术手段.之前在无忧代理那个网站试过,但是提取
出来的IP地址是对的,端口号是错误的,我估计是在请求的文档加载完成以后采用异步js或者Ajax更新了端口,所以
我放弃了,此处只为学习,故提取现成的,事半功倍。
>>由于只是为了验证与学习,文章中代码比较乱,下文中的程序属于半成品,但是为了更好理解,我在此说明
我的编程思路。如果我实在没说明白,就当玩玩而已^_^
文中有两个主方法,其实就是两个小程序,一个是StartIPSet,这个主要是把提取出来的数据放置在数据库中,
以便用到的时候直接在数据库中进行查取,里面有个方法update(),用来更新数据库中的数据,不是自动更新。第二
个主方法是TestProxy,主要功能是测试和验证有用的代理IP。还有一些辅助的类,比如DownloadHtml(用于从西刺代
理这个网站下载网页文档),还有GetCookie(这个类可有可无,主要是在12306网站中要想获取数据的话就必须用到
cookies,在此贴出来,只为学习)。
三、代码:
StartIPSet:
package pitd;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import dao.DBUtil;

/**
 * Loads the proxy list page returned by {@link DownloadHtml#getHtml()} into the
 * {@code proxyip} table.
 *
 * <p>Every {@code <td>} element of the page is collected; each table row is
 * treated as {@value #CELLS_PER_ROW} consecutive cells, of which cells 1..5 are
 * stored as the first five columns of {@code proxyip}. The sixth column is filled
 * with {@code sysdate} by the database.
 * NOTE(review): the 10-cells-per-row layout is inherited from the original loop
 * arithmetic; confirm it against the current markup of the page.
 */
public class StartIPSet {

    /** Number of {@code <td>} cells that make up one row of the source table. */
    private static final int CELLS_PER_ROW = 10;

    /** Index (within a row) of the first cell that is stored; cell 0 is skipped. */
    private static final int FIRST_DATA_CELL = 1;

    /** Number of consecutive cells stored per row. */
    private static final int DATA_CELLS = 5;

    private static final String INSERT_SQL = "insert into proxyip values(?,?,?,?,?,sysdate)";

    private static final String TRUNCATE_SQL = "truncate table proxyip";

    public static void main(String[] args) {
        new StartIPSet().update();
    }

    /**
     * Downloads the proxy page and appends its rows to {@code proxyip}.
     *
     * <p>Nothing is written when the page cannot be downloaded or parsed. The
     * insert runs as one batch in one transaction and is rolled back on failure.
     */
    public void put() {
        List<String[]> rows = fetchRows();
        if (rows.isEmpty()) {
            System.err.println("No proxy rows were downloaded; nothing inserted.");
            return;
        }
        insertRows(rows);
    }

    /**
     * Replaces the content of {@code proxyip} with a freshly downloaded list.
     *
     * <p>The page is downloaded and parsed <em>before</em> the table is truncated,
     * so a failed download no longer wipes the existing data. If the truncate
     * itself fails, no rows are inserted (inserting on top of the old rows would
     * duplicate them).
     */
    public void update() {
        List<String[]> rows = fetchRows();
        if (rows.isEmpty()) {
            System.err.println("No proxy rows were downloaded; keeping the existing proxyip data.");
            return;
        }
        if (truncateTable()) {
            insertRows(rows);
        }
    }

    /**
     * Downloads and parses the proxy page.
     *
     * @return one {@code String[DATA_CELLS]} per table row, trimmed; an empty list
     *         when the download or the parsing failed
     */
    private List<String[]> fetchRows() {
        List<String[]> rows = new ArrayList<String[]>();
        try {
            String html = DownloadHtml.getHtml();
            if (html == null) {
                return rows;
            }
            TagNode root = new HtmlCleaner().clean(html);
            Object[] cells = root.getElementsByName("td", true);
            System.out.println(cells.length);

            // Same bounds as the original loop: start at cell 1 of each row and
            // stop once fewer than a (nearly) full row of cells remains.
            for (int first = FIRST_DATA_CELL; first < cells.length - 6; first += CELLS_PER_ROW) {
                String[] row = new String[DATA_CELLS];
                for (int k = 0; k < DATA_CELLS; k++) {
                    TagNode cell = (TagNode) cells[first + k];
                    // Cells may carry layout whitespace around the value.
                    row[k] = cell.getText().toString().trim();
                }
                rows.add(row);
            }
        } catch (Exception e) {
            // Broad catch kept on purpose: depending on the HtmlCleaner version,
            // clean(String) may or may not declare a checked exception.
            e.printStackTrace();
            // A half-parsed page must not be stored.
            return new ArrayList<String[]>();
        }
        return rows;
    }

    /** Truncates {@code proxyip}; returns {@code true} when the statement succeeded. */
    private boolean truncateTable() {
        Connection con = null;
        Statement st = null;
        try {
            con = DBUtil.getConnection();
            st = con.createStatement();
            st.execute(TRUNCATE_SQL);
            return true;
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            DBUtil.closeStatement(st);
            DBUtil.closeConnection(con);
        }
    }

    /** Inserts all rows as a single batch inside one transaction. */
    private void insertRows(List<String[]> rows) {
        Connection con = null;
        PreparedStatement ps = null;
        boolean inTransaction = false;
        try {
            con = DBUtil.getConnection();
            con.setAutoCommit(false);
            inTransaction = true;
            ps = con.prepareStatement(INSERT_SQL);
            for (String[] row : rows) {
                for (int col = 0; col < row.length; col++) {
                    ps.setString(col + 1, row[col]);
                }
                ps.addBatch();
            }
            ps.executeBatch();
            con.commit();
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
            if (inTransaction) {
                rollbackQuietly(con);
            }
        } finally {
            if (inTransaction) {
                // Restore the default mode on every path, not only on success.
                restoreAutoCommit(con);
            }
            DBUtil.closePreparedStatement(ps);
            DBUtil.closeConnection(con);
        }
    }

    /** Rolls the current transaction back, reporting (not propagating) a failure. */
    private static void rollbackQuietly(Connection con) {
        try {
            con.rollback();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Switches the connection back to auto-commit, reporting (not propagating) a failure. */
    private static void restoreAutoCommit(Connection con) {
        try {
            con.setAutoCommit(true);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
GetCookie:
package pitd;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Obtains the cookies a site hands out so that they can be replayed in a later request.
 */
public class GetCookie {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36";

    private static final String ACCEPT =
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";

    /** Connect/read timeout so an unresponsive site cannot block the caller forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Requests {@code url} twice and returns the cookies the server set.
     *
     * <p>The first request is sent without cookies; the second one replays the
     * cookies of the first response so that servers which issue a further cookie
     * only to "known" clients can do so. Redirects are not followed.
     *
     * <p>Unlike the previous implementation, every {@code Set-Cookie} header is
     * honoured (not only the first), cookie attributes such as {@code path} or
     * {@code expires} are stripped, and an empty string (instead of the literal
     * text {@code "null;null"}) is returned when no cookie was received.
     *
     * @param url the address to request
     * @return a value suitable for a {@code Cookie} request header
     *         ({@code name=value; name2=value2}), or {@code ""} when no cookie was
     *         received or the requests failed
     */
    public static String getCookie(String url) {
        // Keyed by cookie name so a cookie re-issued by the second response
        // replaces the earlier value instead of being sent twice.
        Map<String, String> cookies = new LinkedHashMap<String, String>();
        requestCookies(url, cookies);
        requestCookies(url, cookies);

        String header = toCookieHeader(cookies);
        System.out.println(header);
        return header;
    }

    /**
     * Performs one GET request, sending {@code cookies} (if any) and merging the
     * cookies of the response into the same map. I/O failures are reported and
     * leave the map untouched.
     */
    private static void requestCookies(String url, Map<String, String> cookies) {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            conn.setRequestMethod("GET");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setRequestProperty("Accept", ACCEPT);
            conn.setInstanceFollowRedirects(false);
            if (!cookies.isEmpty()) {
                conn.setRequestProperty("Cookie", toCookieHeader(cookies));
            }
            System.out.println(conn.getResponseCode());
            collectCookies(conn, cookies);
        } catch (IOException e) {
            // Covers MalformedURLException as well.
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /** Copies the {@code name=value} part of every {@code Set-Cookie} response header into {@code cookies}. */
    private static void collectCookies(HttpURLConnection conn, Map<String, String> cookies) {
        for (int i = 0;; i++) {
            String key = conn.getHeaderFieldKey(i);
            String value = conn.getHeaderField(i);
            if (key == null && value == null) {
                // Past the last header. (Index 0 may have a null key but a
                // non-null value: the status line.)
                break;
            }
            if (value == null || !"Set-Cookie".equalsIgnoreCase(key)) {
                continue;
            }
            int attributesStart = value.indexOf(';');
            String pair = (attributesStart < 0 ? value : value.substring(0, attributesStart)).trim();
            if (pair.isEmpty()) {
                continue;
            }
            int equals = pair.indexOf('=');
            String name = equals < 0 ? pair : pair.substring(0, equals);
            cookies.put(name, pair);
        }
    }

    /** Joins the collected {@code name=value} pairs with {@code "; "}. */
    private static String toCookieHeader(Map<String, String> cookies) {
        StringBuilder header = new StringBuilder();
        for (String pair : cookies.values()) {
            if (header.length() > 0) {
                header.append("; ");
            }
            header.append(pair);
        }
        return header.toString();
    }
}
DownloadHtml:
package pitd;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads the proxy list page.
 */
public class DownloadHtml {

    private static final String URL_PATH = "http://www.xicidaili.com/nn/";

    /** Location of the debug copy of the downloaded page. */
    private static final String DUMP_PATH = "C:\\Users\\Administrator\\Desktop\\crawler.txt";

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36";

    private static final String ACCEPT =
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";

    /** Connect/read timeout so an unresponsive site cannot block the caller forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Fetches the proxy list page, prints it, and stores a debug copy on disk.
     *
     * <p>The debug copy is best effort: when the dump file cannot be written
     * (for example because the hard-coded path does not exist on this machine)
     * a warning is printed and the downloaded page is still returned. Previously
     * the file was opened before the request, so a missing path aborted the
     * whole download.
     *
     * @return the page with its lines concatenated (line terminators removed),
     *         or {@code null} when the request failed or did not answer with
     *         HTTP 200
     */
    public static String getHtml() {
        String html = null;
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(URL_PATH).openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setRequestProperty("Accept", ACCEPT);
            conn.setInstanceFollowRedirects(true);
            conn.setRequestProperty("Connection", "keep-alive");
            conn.setRequestProperty("Upgrade-Insecure-Requests", "1");
            conn.setRequestProperty("Cookie", GetCookie.getCookie(URL_PATH));
            conn.connect();

            if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
                html = readBody(conn);
                System.out.println(html);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }

        if (html != null) {
            dumpToFile(html);
        }
        return html;
    }

    /** Reads the response body as UTF-8, concatenating its lines without separators. */
    private static String readBody(HttpURLConnection conn) throws IOException {
        StringBuilder content = new StringBuilder();
        try (InputStream input = conn.getInputStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(input, "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        }
        return content.toString();
    }

    /** Writes the debug copy; a failure is reported but never propagated. */
    private static void dumpToFile(String html) {
        try (FileWriter writer = new FileWriter(DUMP_PATH)) {
            writer.write(html);
        } catch (IOException e) {
            System.err.println("Could not write debug copy to " + DUMP_PATH + ": " + e.getMessage());
        }
    }
}
TestProxy:
package proxyip;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import pitd.GetCookie;
import dao.DBUtil;

/**
 * Walks through the proxies stored in the {@code proxyip} table and reports the
 * first one through which an "what is my IP" page shows an address different
 * from the local one.
 */
public class TestProxy {

    /** Dotted-quad matcher used to pull an IPv4 address out of a response body. */
    private static final Pattern IPV4 =
            Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}");

    /** Page that echoes the caller's public IP address. */
    private static final String IP_ECHO_URL = "http://ip.chinaz.com/getip.aspx";

    /** Upper bound on the number of table rows tried by {@link #checkProxy()}. */
    private static final int MAX_ATTEMPTS = 100;

    private static final int CONNECT_TIMEOUT_MILLIS = 5000;

    /** Without a read timeout a proxy that accepts the connection but never answers blocks forever. */
    private static final int READ_TIMEOUT_MILLIS = 5000;

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36";

    /** Proxy currently under test; read by {@link #getHtml(String)}. */
    static Proxy proxy = null;

    Connection con = null;
    Statement st = null;
    ResultSet rs = null;

    /** Local public IP, looked up once and reused for every candidate; empty until known. */
    private String localIp = "";

    public static void main(String[] args) throws IOException {
        new TestProxy().checkProxy();
    }

    /**
     * Tests the rows of {@code proxyip} one by one (at most {@value #MAX_ATTEMPTS})
     * and prints the details of the first working proxy.
     *
     * <p>Rows whose address is missing or whose port is not a valid port number
     * are skipped. When the table is exhausted or the attempt limit is reached
     * without success, a failure message is printed.
     */
    public void checkProxy() {
        boolean found = false;
        try {
            String sql = "select * from proxyip";
            con = DBUtil.getConnection();
            st = con.createStatement();
            rs = st.executeQuery(sql);

            int remaining = MAX_ATTEMPTS;
            // rs.next() must be consulted: reading past the last row throws.
            while (remaining-- > 0 && rs.next()) {
                String address = rs.getString(1);
                String portText = rs.getString(2);
                int port = parsePort(portText);
                if (address == null || address.trim().isEmpty() || port < 0) {
                    System.out.println("Skipping invalid proxy row: " + address + "\t" + portText);
                    continue;
                }

                System.out.print("数据库中取出的数据为:");
                System.out.println(address + "\t" + port);
                proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(address.trim(), port));

                if (statuOk()) {
                    System.out.println("<<<<----代理成功————>>>>\n代理信息:");
                    System.out.println("Address:" + address + "\nPort:"
                            + portText + "\nLocaltion:"
                            + rs.getString("LOCALTION") + "\nAnony:"
                            + rs.getString(4) + "\nProtocal:"
                            + rs.getString(5));
                    found = true;
                    break;
                }
            }
            if (!found) {
                System.out.println("代理失败,ip资源不足!");
            }
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        } finally {
            DBUtil.closeResultSet(rs);
            DBUtil.closeStatement(st);
            DBUtil.closeConnection(con);
        }
    }

    /**
     * Checks whether the current {@link #proxy} works.
     *
     * @return {@code true} when the IP echo page, fetched through the proxy,
     *         reports an address different from the local one; {@code false}
     *         when it reports the local address, reports no address, or the
     *         request fails
     */
    public boolean statuOk() {
        if (localIp.isEmpty()) {
            // Looked up once; repeated only while the lookup keeps failing.
            localIp = getV4IP();
        }
        System.out.println("start...");
        try {
            String ipInfo = getHtml(IP_ECHO_URL);
            Matcher m = IPV4.matcher(ipInfo);
            if (m.find()) {
                String proxyIP = m.group(0);
                System.out.println("正在检测的代理ip:" + proxyIP);
                if (!localIp.equals(proxyIP)) {
                    System.out.println("本机ip:" + localIp);
                    return true;
                }
            }
        } catch (Exception e) {
            // Any failure (timeout, refused connection, no proxy set, ...) simply
            // means this proxy is unusable.
            System.out.println(e.getMessage());
        }
        return false;
    }

    /**
     * Fetches {@code address} through the current {@link #proxy}.
     *
     * @return the body decoded as gb2312, trimmed and lower-cased
     * @throws IOException when the proxy or the target cannot be reached in time
     */
    private static String getHtml(String address) throws IOException {
        HttpURLConnection connection =
                (HttpURLConnection) new URL(address).openConnection(proxy);
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
        connection.setDoInput(true);
        connection.setRequestMethod("GET");
        try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] buf = new byte[4096];
            int read;
            while ((read = in.read(buf)) != -1) {
                body.write(buf, 0, read);
            }
            return new String(body.toByteArray(), "gb2312").trim().toLowerCase(Locale.ROOT);
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Determines the public IPv4 address of this machine by asking the IP echo
     * page directly (without a proxy).
     *
     * @return the first dotted-quad found in the response, or {@code ""} when the
     *         request failed or the response contained none
     */
    public String getV4IP() {
        StringBuilder body = new StringBuilder();
        HttpURLConnection urlConnection = null;
        try {
            urlConnection = (HttpURLConnection) new URL(IP_ECHO_URL).openConnection();
            urlConnection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            urlConnection.setReadTimeout(READ_TIMEOUT_MILLIS);
            urlConnection.setRequestProperty("User-Agent", USER_AGENT);
            urlConnection.setRequestProperty("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            urlConnection.setInstanceFollowRedirects(true);
            urlConnection.setRequestProperty("Connection", "keep-alive");
            urlConnection.setRequestProperty("Upgrade-Insecure-Requests", "1");
            urlConnection.connect();
            if (urlConnection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                try (BufferedReader in = new BufferedReader(
                        new InputStreamReader(urlConnection.getInputStream(), "UTF-8"))) {
                    String line;
                    while ((line = in.readLine()) != null) {
                        body.append(line);
                    }
                }
            }
        } catch (IOException e) {
            // Covers MalformedURLException as well.
            e.printStackTrace();
        } finally {
            if (urlConnection != null) {
                urlConnection.disconnect();
            }
        }

        Matcher m = IPV4.matcher(body);
        return m.find() ? m.group(0) : "";
    }

    /**
     * Parses a port number read from the database.
     *
     * @return the port, or {@code -1} when {@code text} is {@code null}, not a
     *         number, or outside 0..65535
     */
    private static int parsePort(String text) {
        if (text == null) {
            return -1;
        }
        try {
            int port = Integer.parseInt(text.trim());
            return (port >= 0 && port <= 65535) ? port : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }
}
四、截图:
访问并下载网页,用HtmlCleaner进行解析后将数据放置数据库中,共1000个节点,大概100条记录。
提取的数据和原网页的数据:
采用循环检测代理IP,当一个不能使用的时候,自动提取下一条再进行测试,代理成功以后跳出循环,或者IP资源不足的时候跳出循环。
联系邮箱:xhsgg12302@outlook.com
2017_09_22
相关文章推荐
- [置顶] 【Java资源免费分享,网盘自己拿】
- 静态代码扫描之阿里java代码规范IDEA插件
- 免费Java反编译工具decompiler
- JAVA实现的支付宝扫描二维码支付
- 最好的Java程序员免费学习材料
- 【JAVA资料免费下载】158个JAVA免豆精品资料汇总
- I学霸官方免费教程十七:Java面向对象之接口
- 两种免费的Java Obfuscator比较
- 用Java开发代理服务器
- 从头认识java-11.5 扫描输入(1)-简介
- 中国学生可申请免费Java培训
- 经典Java入门视频教程免费下载
- 免费视频教程分享 :java经典面试题深度解析
- 智渔课堂官方免费教程三十:Java基础教程之泛型
- I学霸官方免费教程三十三:Java集合框架之Map集合
- JAVA整合HTML5实现扫描二维码功能项目源码
- Eclipse 做Web项目需要的工具,java提倡免费编程
- I学霸官方免费教程四十五 :Java算法之递归算法
- 【免费Java教学之】Java语言的特性
- Java免费开源数据库、Java嵌入式数据库、Java内存数据库