
Scraping the web page information from this Beijing page

2017-08-10 14:06
#coding=utf-8
import urllib2
import zlib
from pybloomfilter import BloomFilter
from lxml import html
import pandas as pd
from bs4 import BeautifulSoup

# Pretend to be mobile Chrome so the site serves the touch version of the page.
request_headers = {
    'Accept': "image/webp,image/apng,image/*,*/*;q=0.8",
    'Accept-Encoding': "gzip, deflate",
    'Accept-Language': "zh-CN,zh;q=0.8",
    'Connection': "keep-alive",
    'Referer': "http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard",
    'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Mobile Safari/537.36"
}

# Create the Bloom filter used to deduplicate URLs: a capacity of
# 16M entries with a 1% false-positive rate.
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)
url = 'http://3g.163.com/touch/local?dataversion=A&uversion=A&version=v_standard'

req = urllib2.Request(url, headers=request_headers)
response = urllib2.urlopen(req)
htmlcontent = response.read()
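# (The original has no error handling here: urllib2.urlopen can raise
# urllib2.HTTPError or urllib2.URLError, and passing a timeout, e.g.
# urllib2.urlopen(req, timeout=10), keeps a dead connection from hanging
# the script.)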

# If the response is gzip-compressed, decompress it first; otherwise the
# printed HTML would come out as garbled bytes.
gzipped = response.headers.get('Content-Encoding')
if gzipped:
    htmlcontent = zlib.decompress(htmlcontent, 16 + zlib.MAX_WBITS)
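# Note: wbits=16+zlib.MAX_WBITS tells zlib to expect a gzip wrapper. Since the
# request also advertises "deflate", a more tolerant variant is
# wbits=32+zlib.MAX_WBITS, which auto-detects a gzip or zlib header, e.g.:
#   htmlcontent = zlib.decompress(htmlcontent, 32 + zlib.MAX_WBITS)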

print htmlcontent

soup = BeautifulSoup(htmlcontent, 'lxml')
urls = []
news_content = []

# Earlier selector experiments, kept for reference:
# a = soup.select('div.cm_news_main > ul.cm_ul_round > li > a')
# print a
# ul_contents = soup.select('ul[class="cm_ul_round ul_page1"] > li > a')
# print ul_contents

# Collect the href and title text of every news link in the carousel blocks.
for link in soup.select('div.aslide > a'):
    urls.append(link.get('href'))
    news_content.append(link.text)
print urls
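# For comparison, the same links via lxml (imported above but otherwise
# unused in this script). This is an illustrative sketch, assuming the
# same 'div.aslide > a' markup as the BeautifulSoup selector:
tree = html.fromstring(htmlcontent)
lxml_urls = [a.get('href') for a in tree.xpath('//div[contains(@class, "aslide")]/a')]
print lxml_urls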

for i in news_content:
    print i
print len(news_content)
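# pandas is imported above but never used in the original; tabulating the
# harvest gives a quick sanity check (an illustrative addition, not part
# of the original flow):
df = pd.DataFrame({'url': urls, 'title': news_content})
print df.head()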

# Alternative selector tried against a different page layout:
# for link in soup.select('div.ndi_main > h3 > a'):
#     urls.append(link.get('href'))
#     news_content.append(link.text)
# print urls
# print len(news_content)
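# The Bloom filter created at the top is never consulted in this snippet.
# In a fuller crawler it would typically screen the harvested links before
# any follow-up requests; a hedged sketch of that step:
fresh_urls = []
for u in urls:
    if u and u not in download_bf:
        download_bf.add(u)
        fresh_urls.append(u)
print '%d new urls to fetch' % len(fresh_urls)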