
Java web crawler: downloading files with multiple threads

2017-04-20 19:33
I wrote this about a year ago; coming back to it today, quite a few of the details had gone fuzzy, so this is a refresher.

The project consists of six class files plus one jar. Demo.java is the entry point; DownloadFile downloads a file from a network URL (a pre-packaged utility borrowed from elsewhere); DownloadThread fetches files on multiple threads, which is what makes it fast; HttpUtils turns the page at a URL into a Document that can be queried (also borrowed); MD5 needs no introduction; Task is the class that represents one file to be processed.

1 Demo.java

import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Demo {

    public static ArrayList<Task> arr = new ArrayList<Task>();

    public static void main(String[] args) {

        getAllImgUrl("http://www.csdn.net"); // collect every image URL on the target page

        int maxindex = 2; // number of worker threads; raise or lower it as you like

        DownloadThread[] d = new DownloadThread[maxindex];

        for (int i = 0; i < maxindex; i++) {
            d[i] = new DownloadThread(i);
            d[i].start();
        }
    }

    public static void getAllImgUrl(String url) {
        try {
            String result = HttpUtils.doGet(url);

            Document doc = Jsoup.parse(result);

            Elements links = doc.select("img");

            for (Element imgs : links) {
                // each img on the current page; note the src may be relative,
                // in which case HttpUtils.getURL can turn it into an absolute URL
                System.out.println(imgs.attr("src"));

                arr.add(new Task(imgs.attr("src"))); // queue it; the workers pick tasks up later
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // synchronized, so two worker threads can never claim the same task
    public static synchronized Task getTask() {
        for (Task s : arr) {
            if (!s.hasDownloaded) {
                s.hasDownloaded = true;
                return s;
            }
        }
        return null;
    }
}
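A side note on getTask: the synchronized linear scan works, but every claim walks the whole list under a lock. If you want a lock-free hand-off instead, a ConcurrentLinkedQueue is a natural fit. A minimal sketch, assuming Demo stored its tasks in a queue rather than the ArrayList (TaskQueue is a hypothetical name, not part of the project):

import java.util.concurrent.ConcurrentLinkedQueue;

// Minimal sketch: poll() removes and returns one task atomically, so no
// synchronized scan and no hasDownloaded flag are needed to claim work.
public class TaskQueue {
    public static ConcurrentLinkedQueue<Task> queue = new ConcurrentLinkedQueue<Task>();

    // same contract as Demo.getTask(): returns null when no work is left
    public static Task getTask() {
        return queue.poll();
    }
}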

2 Task.java

public class Task {

    // image URL
    public String imageUrl = "";

    // has this image been claimed for download yet?
    public boolean hasDownloaded = false;

    // file name to save the image under
    public String filename;

    // constructor; the image URL is all it needs
    public Task(String url) {

        imageUrl = url;

        filename = MD5.string2MD5(url); // hash the URL so the file name is unique and filesystem-safe

        int last = imageUrl.lastIndexOf(".");

        String ext = imageUrl.substring(last + 1);

        filename = filename + "." + ext;

        System.out.println("filename: " + filename);
    }

}
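One caveat with the extension logic above: lastIndexOf(".") assumes every src ends in a clean extension, so a URL like .../img.php?id=5, or one with no dot at all, produces a bad file name. A more defensive version, as a sketch (the "jpg" fallback is my own assumption, not from the original):

// A sketch of a sturdier replacement for the extension logic in Task's
// constructor. The "jpg" fallback is an arbitrary assumption.
public static String extractExtension(String imageUrl) {
    String clean = imageUrl;
    int q = clean.indexOf('?');          // drop any query string: img.php?id=5 -> img.php
    if (q != -1) {
        clean = clean.substring(0, q);
    }
    int last = clean.lastIndexOf('.');
    return (last == -1) ? "jpg" : clean.substring(last + 1);
}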

3 DownloadThread.java

import java.io.IOException;

public class DownloadThread extends Thread {

    // this worker's ID
    public int ID;

    public boolean exit = false;

    public DownloadThread(int id) {
        ID = id;
    }

    @Override
    public void run() {

        DownloadFile download = new DownloadFile();

        while (!exit) {

            // claim one task from the list that nobody has downloaded yet
            Task target = Demo.getTask();

            if (target != null) {

                // download it
                System.out.println(ID);
                try {
                    download.downLoadFromUrl(target.imageUrl, target.filename, "c:\\images");
                } catch (IOException e) {
                    e.printStackTrace();
                }

            } else {

                System.out.println("Thread " + ID + ": nothing to do right now");

                // no task available; sleep briefly before polling again
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
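Note that nothing in the demo ever sets exit, so the workers keep polling forever once the list is drained. One way to stop them cleanly, as a minimal sketch of a helper Demo.main could call after starting the workers (shutdownWhenDone is a hypothetical name; it relies on all tasks being queued before the workers start, which is the case here):

// Sketch of a clean shutdown: wait until every task has been claimed, then
// ask each worker to exit and join it. join() still waits for any download
// that is in flight when the flag flips.
static void shutdownWhenDone(DownloadThread[] workers) throws InterruptedException {
    boolean pending = true;
    while (pending) {
        pending = false;
        for (Task t : Demo.arr) {
            if (!t.hasDownloaded) { pending = true; break; }
        }
        if (pending) Thread.sleep(500); // poll twice a second while work remains
    }
    for (DownloadThread worker : workers) {
        worker.exit = true; // the while (!exit) loop sees this and returns
        worker.join();
    }
}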

4 DownloadFile.java

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class DownloadFile {

    /**
     * Download a file from a network URL.
     * @param urlStr   source URL
     * @param fileName name to save the file under
     * @param savePath directory to save into
     * @throws IOException
     */
    public void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
        // create the target directory if it does not exist yet
        File saveDir = new File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs(); // mkdirs, so nested paths work too
        }

        File file = new File(saveDir + File.separator + fileName);
        if (file.exists()) {
            System.out.println("File already exists, no need to download again");
            return; // checked before connecting, so no bandwidth is wasted
        }

        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // 3-second connect timeout
        conn.setConnectTimeout(3 * 1000);
        // some sites return 403 to unknown clients, so identify as a browser
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

        // read the whole response into a byte array, then write it to disk;
        // try-with-resources closes both streams even if an exception is thrown
        try (InputStream inputStream = conn.getInputStream();
             FileOutputStream fos = new FileOutputStream(file)) {
            byte[] getData = readInputStream(inputStream);
            fos.write(getData);
        }

        System.out.println("info: " + url + " download success");
    }

    /**
     * Read an input stream fully into a byte array.
     * @param inputStream
     * @return
     * @throws IOException
     */
    public byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len = 0;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        bos.close();
        return bos.toByteArray();
    }

}
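On Java 7 and later, the manual buffer loop and stream bookkeeping can be collapsed into a single NIO call. A sketch of the same download via java.nio.file.Files (NioDownload is a hypothetical name):

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

// Sketch: Files.copy streams the response body straight to disk,
// so no manual buffer loop is needed.
public class NioDownload {
    public static void download(String urlStr, String fileName, String saveDir) throws IOException {
        Path target = Paths.get(saveDir, fileName);
        Files.createDirectories(target.getParent()); // equivalent of mkdirs()
        HttpURLConnection conn = (HttpURLConnection) new URL(urlStr).openConnection();
        conn.setConnectTimeout(3000);
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        try (InputStream in = conn.getInputStream()) {
            Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}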

5 HttpUtils.java

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;

public class HttpUtils {

    // GET the page at the given url and return the response body as text
    public static String doGet(String url) throws Exception
    {
        URL localURL = new URL(url);

        // Optional: route traffic through a local debugging proxy such as Fiddler.
        // Leave these commented out unless a proxy is actually listening on
        // 127.0.0.1:8888, otherwise every request will fail.
        //System.setProperty("http.proxyHost", "127.0.0.1");
        //System.setProperty("http.proxyPort", "8888");

        URLConnection connection = localURL.openConnection();
        HttpURLConnection httpURLConnection = (HttpURLConnection) connection;

        // set request headers
        httpURLConnection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)");

        // stream and reader handles
        InputStream inputStream = null;
        InputStreamReader inputStreamReader = null;
        BufferedReader reader = null;
        StringBuffer resultBuffer = new StringBuffer();
        String tempLine = null;

        // 200 means OK; 3xx redirects and anything above are treated as failure here
        if (httpURLConnection.getResponseCode() >= 300) {
            throw new Exception("HTTP Request is not success, Response code is " + httpURLConnection.getResponseCode());
        }

        try {
            inputStream = httpURLConnection.getInputStream();

            // check whether the server compressed the response body
            String encoding = httpURLConnection.getHeaderField("Content-Encoding");

            if (encoding != null && encoding.equals("gzip"))
            {
                System.out.println("This is a gzip-compressed page\n");
                GZIPInputStream gzin = new GZIPInputStream(inputStream);
                // decode the page as GBK so Chinese text does not come out garbled
                inputStreamReader = new InputStreamReader(gzin, "gbk");
            }
            else
            {
                inputStreamReader = new InputStreamReader(inputStream, "gbk");
            }
            reader = new BufferedReader(inputStreamReader);

            while ((tempLine = reader.readLine()) != null) {
                resultBuffer.append(tempLine + "\n");
            }

        } finally {

            if (reader != null) {
                reader.close();
            }

            if (inputStreamReader != null) {
                inputStreamReader.close();
            }

            if (inputStream != null) {
                inputStream.close();
            }

        }

        return resultBuffer.toString();
    }

    /*
     * currentUrl: URL of the page currently being crawled
     * targetUrl:  URL extracted from a tag on that page (e.g. an href or src)
     */
    public static String getURL(String currentUrl, String targetUrl)
    {
        String temp = targetUrl;
        // directory of the current page;
        // e.g. for http://www.gdmec.cn/cs/csnew/index.html
        // the base is  http://www.gdmec.cn/cs/csnew/
        String currentBase = "";

        String resultURL = "";

        if (currentUrl.endsWith("/"))
        {
            currentBase = currentUrl;
        }
        else
        {
            int lastPos = currentUrl.lastIndexOf("/");
            currentBase = currentUrl.substring(0, lastPos + 1);
        }

        System.out.println("currentBase:" + currentBase);

        if (temp.startsWith("http"))
        {
            // already an absolute URL; return it unchanged
            return temp;
        }
        else if (temp.startsWith("../"))
        {
            // go up one directory level from the current base
            // (only a single "../" is handled; nested "../../" is not)
            String parent = currentBase.substring(0, currentBase.length() - 1);
            parent = parent.substring(0, parent.lastIndexOf("/") + 1);
            resultURL = parent + temp.substring(3);
        }
        else if (temp.startsWith("./"))
        {
            resultURL = currentBase + temp.substring(2);
        }
        else if (temp.startsWith("//"))
        {
            // protocol-relative URL
            resultURL = "http:" + temp;
        }
        else if (temp.startsWith("/"))
        {
            // root-relative path: resolve against the host root, not the current directory
            int schemeEnd = currentUrl.indexOf("://") + 3;
            int rootEnd = currentUrl.indexOf("/", schemeEnd);
            String root = (rootEnd == -1) ? currentUrl : currentUrl.substring(0, rootEnd);
            resultURL = root + temp;
        }
        else
        {
            resultURL = currentBase + temp;
        }
        return resultURL;

    }
}
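Both halves of this class have off-the-shelf equivalents worth knowing. Since jsoup is already on the classpath, Jsoup.connect can fetch and parse in one call, handling gzip and charset detection itself, and java.net.URI implements the full relative-URL resolution that getURL approximates. A sketch (class and method names are my own):

import java.net.URI;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Sketch of the same two jobs using what is already available:
// jsoup fetches and parses in one call, and java.net.URI resolves
// relative links ("../", "./", "//", "/") per the URL spec.
public class HttpUtilsAlternative {
    public static Document fetch(String url) throws Exception {
        return Jsoup.connect(url)
                    .userAgent("Mozilla/5.0")
                    .timeout(3000)
                    .get();
    }

    public static String resolve(String currentUrl, String targetUrl) throws Exception {
        return new URI(currentUrl).resolve(targetUrl).toString();
    }
}

When the document is loaded via Jsoup.connect, Element.absUrl("src") performs this resolution directly on each parsed tag.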

6 MD5.java

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class MD5 {

    /***
     * MD5 hash: produces a 32-character hex digest
     */
    public static String string2MD5(String inStr) {
        MessageDigest md5 = null;
        try {
            md5 = MessageDigest.getInstance("MD5");
        } catch (Exception e) {
            System.out.println(e.toString());
            e.printStackTrace();
            return "";
        }
        // encode as UTF-8 so non-ASCII characters in the URL hash correctly
        byte[] byteArray = inStr.getBytes(StandardCharsets.UTF_8);
        byte[] md5Bytes = md5.digest(byteArray);

        // render each byte as two hex digits, zero-padded
        StringBuffer hexValue = new StringBuffer();
        for (int i = 0; i < md5Bytes.length; i++) {
            int val = ((int) md5Bytes[i]) & 0xff;
            if (val < 16)
                hexValue.append("0");
            hexValue.append(Integer.toHexString(val));
        }
        return hexValue.toString();
    }

}
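A quick sanity check: "abc" is one of the MD5 test vectors published in RFC 1321, so the method can be verified in a couple of lines:

// Sanity check against a standard MD5 test vector (RFC 1321).
public class MD5Test {
    public static void main(String[] args) {
        System.out.println(MD5.string2MD5("abc"));
        // expected: 900150983cd24fb0d6963f7d28e17f72
    }
}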

Jar dependency: jsoup-1.9.2.jar

This demo crawls the images at a given URL, but the same approach works for other targets such as job listings or weather data. Of course, crawl too aggressively and you can be blocked at any time. Some pages also require specific GET or POST requests, in which case the fetching code has to be adapted accordingly, and some pages load their content asynchronously; I'll leave those for you to experiment with.