您的位置:首页 > 理论基础 > 计算机网络

Python网络爬虫对知乎首页进行爬取

2015-07-15 11:57 603 查看
# -*- coding: UTF-8 -*-

import urllib, urllib2, cookielib, re, time, os

import requests

# Ask for the Zhihu account credentials on stdin (Python 2 raw_input).
print('zhihu_QA')
print('Please input your email:')
MyEmail = raw_input()

print('Please input your password:')
MyPassWord = raw_input()

print('---------------------------------------------')

# Basic request settings.
Url = 'http://www.zhihu.com/login'
# The User-Agent differs from machine to machine. The published listing had the
# value masked out, which left an unterminated string literal; the value below
# is only a placeholder -- replace it with your own browser's User-Agent.
User_Agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36')
MyValues = {'email': MyEmail, 'password': MyPassWord}
MyHeaders = {'User-Agent': User_Agent}

# POST the credentials through a session and keep the response body as
# UTF-8 encoded bytes.
MyRequests = requests.session()
MyCont = MyRequests.post(Url, data=MyValues, headers=MyHeaders)
MyCont2 = MyCont.text.encode('UTF-8')
print(MyCont2)

def GetNowTime():
    """Return the current local time formatted as ``YYYY-MM-DD_HH-MM-SS``.

    The format contains only digits, ``-`` and ``_``.
    """
    # time.strftime() formats the current local time when no tuple is given.
    return time.strftime("%Y-%m-%d_%H-%M-%S")

# Output .docx path; the timestamp from GetNowTime() is embedded in the name.
# A plain literal is used here: the original r'D:\\MyZhiHu' was a raw string,
# so both backslashes ended up in the path.
WordPathName = 'D:\\MyZhiHu' + GetNowTime() + '.docx'

String1 = 'h2'
String2 = 'div'
from bs4 import BeautifulSoup

# Parse the HTML held in MyCont2.
MySoup1 = BeautifulSoup(MyCont2)

def has_need(tag):
    """Return True when *tag* has all three attributes this script looks for.

    The required attributes are ``class``, ``feed-item-a`` and ``data-type``.
    The function is passed to ``find_all`` as a tag filter, so *tag* only
    needs to provide ``has_attr(name)``.
    """
    return (tag.has_attr('class')
            and tag.has_attr('feed-item-a')
            and tag.has_attr('data-type'))
def has_need_photo(tag):
    """Return True when *tag* has both a ``class`` and a ``src`` attribute.

    The function is passed to ``find_all`` as a tag filter, so *tag* only
    needs to provide ``has_attr(name)``.
    """
    return tag.has_attr('class') and tag.has_attr('src')

# Running question counter, starting at 1.
QuestionNum = 1

from docx import Document
from docx.shared import Inches

# Scratch file that downloaded pictures are written to.
MyPhotoPath = 'D:\\MyZhiHuTempPhoto.jpg'

# Open the output document if it is already on disk, otherwise start a new one.
if os.path.exists(WordPathName):
    print('文件存在')
    MyDocument = Document(WordPathName)
else:
    print('文件不存在')
    MyDocument = Document()

# Level-0 heading for the whole document.
MyDocument.add_heading(u'我的知乎', 0)

# Walk every tag accepted by has_need and write its question, details, author,
# vote count and answer (text plus pictures) into the Word document.
for MyTag in MySoup1.find_all(has_need):
    print('****************************************************************')

    # Question title
    MyQuestion = MyTag.find(name='a', class_='question_link')
    # Question details
    MyMoreQuestion = MyTag.find(name='div', class_='question-description zm-editable-content')
    # Answer body
    MyAnswer = MyTag.find(name='textarea', class_='content hidden')
    # Answer author and the author's self-description
    MyFindPerson = MyTag.find(name='h3', class_='zm-item-answer-author-wrap')
    # Upvote count
    MyFindLabel = MyTag.find(name='span', class_='count')
    MyDocument.add_page_break()

    QN = '这是第' + str(QuestionNum) + '个问题'
    print(QN)
    # Remember this question's number before advancing the counter, so file
    # names and error messages further down refer to the right question.
    ThisQuestionNum = QuestionNum
    QuestionNum += 1

    MyDocument.add_paragraph(QN.decode('UTF-8'))

    if MyQuestion is not None:
        print('问题:')
        print('')
        MyDocument.add_heading(u'问题:', level=1)
        print('')

        MyTempSoup = BeautifulSoup(str(MyQuestion))

        MyTST = MyTempSoup.text
        print(MyTST)
        MyDocument.add_paragraph(MyTST)
    else:
        print('该问题不存在!')
        MyDocument.add_heading(u'该问题不存在!', level=1)

    if MyMoreQuestion is not None:
        print('问题详情:')
        MyDocument.add_heading(u'问题详情:', level=1)

        MyTempSoup = BeautifulSoup(str(MyMoreQuestion))

        MyTST = MyTempSoup.text
        print(MyTST)
        MyDocument.add_paragraph(MyTST)
    else:
        print('该问题没有详情!')
        MyDocument.add_heading(u'该问题没有详情!', level=1)
        print('')

    if MyFindPerson is not None:
        print('回答作者和个人说明:')
        MyDocument.add_heading(u'回答作者和个人说明:', level=1)
        MyTempSoup = BeautifulSoup(str(MyFindPerson))
        MyTST = MyTempSoup.text
        print(MyTST)
        MyDocument.add_paragraph(MyTST)
    else:
        print('该问题没有作者!')
        MyDocument.add_heading(u'该问题没有作者!', level=1)

    if MyFindLabel is not None:
        print('回答赞同数:')
        MyDocument.add_heading(u'回答赞同数:', level=1)

        MyTempSoup = BeautifulSoup(str(MyFindLabel))

        MyTST = MyTempSoup.text
        print(MyTST)
        MyDocument.add_paragraph(MyTST)
    else:
        print('该问题没有人赞同!')
        MyDocument.add_heading(u'该问题没有人赞同!', level=1)

    if MyAnswer is not None:
        print('问题回答:')
        MyDocument.add_heading(u'问题回答:', level=1)

        MyTempSoup = BeautifulSoup(str(MyAnswer))

        # The text of the <textarea> is parsed a second time so that the tags
        # inside the answer (pictures in particular) can be searched.
        MyTST = MyTempSoup.text

        MyTempAnswer = BeautifulSoup(str(MyTST.encode('UTF-8')))

        print('以下是回答正文(含图片链接)')
        print(MyTempAnswer)
        print('以上是回答正文(含图片链接)')

        MyDocument.add_paragraph(u'以下是回答正文(含图片链接)')
        MyDocument.add_paragraph(MyTST)

        MyDocument.add_paragraph(u'以上是回答正文(含图片链接)')

        PhotoNum = 1

        for MyPhotos in MyTempAnswer.find_all(has_need_photo):
            # Read the URL from the src attribute itself. The original sliced
            # the tag's text from 'src="' up to the first 'jpg', which yields
            # an empty URL for non-jpg pictures and can also match other
            # attributes whose name ends in "src".
            MyUrlP = MyPhotos.get('src')
            if not MyUrlP:
                continue

            # Every picture is downloaded to the same scratch file; each
            # download overwrites the previous one, so only one file has to
            # be deleted at the end.
            urllib.urlretrieve(MyUrlP, MyPhotoPath)
            # Extra copy kept for manual verification, named
            # <question number>-<picture number>.jpg.
            urllib.urlretrieve(MyUrlP, 'd:\\11\\' + str(ThisQuestionNum) + '-' + str(PhotoNum) + '.jpg')

            try:
                if os.path.exists(MyPhotoPath):
                    MyDocument.add_picture(MyPhotoPath, width=Inches(3))
                else:
                    print('没有下载的图片(错误)')
                    MyDocument.add_paragraph(u'没有下载的图片(错误)')
            except Exception as e:
                # Best effort: record the failure in the document and carry on
                # with the next picture. (The original referenced the undefined
                # names `hahatest` and `a` here and raised NameError instead.)
                print('图片写入失败')
                print(e)
                ErrorPhoto = '第' + str(ThisQuestionNum) + '个问题的第' + str(PhotoNum) + '张图片写入word失败'
                print(ErrorPhoto)
                MyDocument.add_paragraph(ErrorPhoto.decode('UTF-8'))

            PhotoNum += 1
    else:
        print('该问题没有回答!')
        MyDocument.add_heading(u'该问题没有回答!', level=1)

    print('================================================================')
    MyDocument.add_heading(u'=========================问题分割线=========================', level=1)

MyDocument.save(WordPathName)

if os.path.exists(MyPhotoPath):
    # A scratch picture is still on disk; delete it.
    os.remove(MyPhotoPath)
    print('最后一张缓存图片已删除')
else:
    print('最后不存在缓存图片(错误)')






内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: