Python爬虫
2016-03-13 16:01
573 查看
之前用来搜职位的Python爬虫
# -*- coding:utf-8 -*-
# Crawl 51job search results and store each job description in SQLite.
#
# Python 2 script (uses urllib2).  For every result page it extracts the
# job-detail links, downloads each detail page, prints the description
# blocks and inserts them into the ``jobs`` table of ``jobs.db``.  When the
# crawl is finished the whole table is dumped to stdout.
import contextlib
import math
import os
import re
import socket
import sqlite3
import urllib
import urllib2

# Applies to every socket opened below, including the ones urllib2 creates.
socket.setdefaulttimeout(25)

# The pages are decoded from GB18030 (as the original script did).
PAGE_ENCODING = 'GB18030'

HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

# Search query; ``keyword`` is a UTF-8 percent-encoded search term.
# ``{page}`` is filled in per request -- the original hard-coded
# ``curr_page=2`` and therefore fetched the same page on every iteration.
# ``&degreefrom`` had been mangled into a degree sign by HTML-entity
# decoding and is restored here.
SEARCH_URL = (
    'http://search.51job.com/jobsearch/search_result.php?fromJs=1'
    '&jobarea=190200%2C00&district=000000&funtype=0000&industrytype=00'
    '&issuedate=9&providesalary=99'
    '&keyword=%E5%B5%8C%E5%85%A5%E5%BC%8F%E8%BD%AF%E4%BB%B6'
    '&keywordtype=1&curr_page={page}&lang=c&stype=1&postchannel=0000'
    '&workyear=99&cotype=99&degreefrom=99&jobterm=01&companysize=99'
    '&lonlat=0%2C0&radius=-1&ord_field=0&list_type=1&fromType=14'
    '&dibiaoid=-1'
)

# Literal parentheses must be escaped as \( \) inside a regex.
# Compiled once here instead of on every loop iteration.
JOB_LINK_RE = re.compile(
    r'<a adid="" onmousedown="return AdsClick\(\)" href="(.*?)" '
    r'onclick="zzSearch.acStatRecJob\( 1 \)',
    re.S)
JOB_DETAIL_RE = re.compile(
    r'<div style="padding-bottom:30px;">(.*?)</div>', re.S)


def _fetch(url):
    """Download *url* and return the body decoded to unicode.

    The response is always closed, even when reading or decoding fails.
    Raises urllib2.URLError / socket.timeout on network failure.
    """
    request = urllib2.Request(url, headers=HEADERS)
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return response.read().decode(PAGE_ENCODING)


# NOTE: ids restart at 1 on every run, so re-running against an existing
# jobs.db makes the inserts fail on the primary key (the error is printed
# and the row is skipped).
x = 1
conn = sqlite3.connect("jobs.db")
try:
    try:
        # "if not exists" lets the script be re-run without a spurious error.
        conn.execute("create table if not exists "
                     "jobs(id int primary key,name text,wage text,comment text)")
    except sqlite3.Error as e:
        print("create table %s" % e)

    for i in range(1, 5):
        try:
            listing = _fetch(SEARCH_URL.format(page=i))
            for item in JOB_LINK_RE.findall(listing):
                print("%d %s" % (i, item))
                detail = _fetch(item)
                for newitem in JOB_DETAIL_RE.findall(detail):
                    comment = newitem.replace('<br>', '\r\n')
                    # Encode only for display; the database gets unicode.
                    print(comment.encode('utf8'))
                    try:
                        # Placeholders instead of string formatting: scraped
                        # text routinely contains quotes, which used to break
                        # the statement (and allowed SQL injection).
                        # The wage column is a fixed placeholder value.
                        conn.execute(
                            "insert into jobs(id,name,wage,comment) "
                            "values(?,?,?,?)",
                            (x, item, '2', comment))
                        conn.commit()
                    except sqlite3.Error as e:
                        print(e)
                    x += 1
        except urllib2.URLError as e:
            # A network error abandons the rest of this result page and
            # moves on to the next one.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
                if isinstance(e.reason, socket.timeout):
                    print(e)
        except socket.timeout as e:
            print(e)

    print("\n")
    for val in conn.execute("select * from jobs"):
        print(val[0])
        print(val[1])
        print(val[2])
        print(val[3])
finally:
    # Release the database handle even if the crawl aborts unexpectedly.
    conn.close()
相关文章推荐
- Python爬虫
- Python爬虫
- Python爬虫
- Python爬虫
- Python爬虫
- Python爬虫
- Python爬虫
- Python爬虫
- Python爬虫
- Python爬虫
- python实现统计你一共写了多少行代码
- Python中的列表生成式与生成器学习教程
- Java调用python,出现“无法解析的编译问题,PythonInterpreter无法解析为类型”
- Python基础(四)之条件判断和循环
- Python Adaboost 实现MNIST 分类
- Python连接MySQL并使用fetchall()方法过滤特殊字符
- Python分布式学习(2)
- 深入讲解Python函数中参数的使用及默认参数的陷阱
- Python time sleep()方法
- python中的时间和时间格式转换