您的位置：首页 > Web前端 > HTML

urllib2抓取HTML存入Excel

2016-09-17 16:17 183 查看

通过urllib2抓取HTML网页，然后过滤出包含特定字符的行，并写入Excel文件：

# -*- coding: utf-8 -*-

import sys
#import urllib
import urllib2

from xlwt import Workbook

def getdata(keywords, line):
    """Extract the text between the first '>' and the next '</' of a line.

    Args:
        keywords: Substring that must occur in ``line`` for the line to be
            considered at all.
        line: One line of HTML text.

    Returns:
        ``False`` when ``keywords`` does not occur in ``line``.  Otherwise
        the text after the first '>' up to the following '</'.  If no '</'
        follows, the remainder of the line is returned with its trailing
        line terminator removed.
    """
    if keywords not in line:
        return False

    # find() yields -1 when the line has no '>'; start + 1 is then 0, so
    # extraction simply begins at the start of the line.
    start = line.find('>')
    end = line.find('</', start + 1)
    if end == -1:
        # Using -1 as a slice bound would silently drop the last character,
        # so take the rest of the line and strip only the line terminator.
        return line[start + 1:].rstrip('\r\n')
    return line[start + 1:end]

def FetchDataByUrllib(checkUrl):
    """Fetch a page and save the tag text of lines mentioning '体育' to Excel.

    The page at ``checkUrl`` is read line by line.  For every line that
    contains the keyword, the text extracted by ``getdata`` is written to
    column 1 of sheet 'mySheet', starting at row 1, and the workbook is
    saved as 'simple.xls' in the current directory.

    Args:
        checkUrl: URL of the page to download.

    Returns:
        None.  If the URL cannot be opened the error is printed and the
        function returns without writing a file.
    """
    book = Workbook(encoding='gbk')
    # add_sheet creates a new sheet; cells cannot be overwritten by default,
    # so overwriting has to be enabled explicitly.
    sheet = book.add_sheet('mySheet', cell_overwrite_ok=True)

    try:
        checkFile = urllib2.urlopen(checkUrl)
    except Exception as e:
        print(e)
        return

    row = 1
    try:
        for line in checkFile:
            # Decode according to the page's encoding and keep the text as
            # unicode from here on.  Re-encoding to the filesystem encoding
            # made the keyword comparison and the workbook's 'gbk' decoding
            # depend on the platform.  Undecodable bytes are replaced so one
            # bad line does not abort the whole run.
            line = line.decode("UTF-8", "replace")

            # Look for the wanted data (lines containing the keyword) and
            # write it to the Excel sheet.
            targetStr = getdata(u'体育', line)
            if targetStr is not False:
                sheet.write(row, 1, targetStr)
                row += 1
    finally:
        # Release the HTTP connection even if processing fails.
        checkFile.close()

    book.save('simple.xls')
    print('finish!')

# Script body: announce the start, then scrape the Sina home page.
myUrl = 'http://www.sina.com.cn'
print('开始...')
FetchDataByUrllib(myUrl)

输出结果：

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航