
A First Look at Python Web Scraping

2016-12-23 17:51
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup  # HTML parsing library
from selenium import webdriver  # drives a browser to execute JS; used here with PhantomJS
import time
import json
import pymysql
import re

try:
    # html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
    # html = urlopen("http://www.pythonscraping.com/pages/page3.html")
    html = urlopen("http://fangjia.fang.com/zz/")
except HTTPError as e:
    print(e)
# line = html.read().decode('utf-8')
# print(line)
# print(line.decode('gb18030'))
# print(html.read().decode('utf-8'))

# bsObj = BeautifulSoup(line)
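
# Completing the commented-out flow above (a sketch): decode explicitly --
# the comments above try both utf-8 and gb18030, and fang.com pages are
# usually served in a GBK-family encoding -- and always pass BeautifulSoup
# a parser so it does not have to guess:
# line = html.read().decode('gb18030', errors='replace')
# bsObj = BeautifulSoup(line, 'html.parser')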

# <find and findAll>

# nameList = bsObj.findAll("span", {"class": "green"})
# for name in nameList:
#     print(name.get_text())

# <children and descendants>
# for child in bsObj.find("div", {"id": "_container"}).descendants:
#     print(child)

# for container in bsObj.find("div", {"id": "_container"}):
#     add=container.get_text()
#     print(add)
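
# .children yields only direct children, while .descendants (used above)
# walks the entire subtree, tags and text nodes alike. A sketch to see
# the difference on the same div:
# tag = bsObj.find("div", {"id": "_container"})
# print(len(list(tag.children)), "direct children vs",
#       len(list(tag.descendants)), "descendants")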

# <selenium>
url = "http://fangjia.fang.com"
driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs')
driver.get(url + "/zz/")
time.sleep(3)
iframe = driver.find_element_by_tag_name("iframe")
map_url = iframe.get_attribute("src")  # the price map lives in an iframe; grab its URL
driver.get(map_url)
time.sleep(3)
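
# Alternative sketch: instead of loading the iframe's src as a separate
# page, switch the driver into the frame in place; lookups then resolve
# inside the frame until switching back out:
# driver.switch_to.frame(iframe)
# containers = driver.find_elements_by_id("_container")
# driver.switch_to.default_content()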

# Alternatively, grab the rendered page source and parse it with BeautifulSoup:
# pageSource = driver.page_source
# bsObj = BeautifulSoup(pageSource, "html.parser")

for container in driver.find_elements_by_id("_container"):
    text = re.sub("\\n", " ", container.text)  # collapse newlines into spaces
    if text != "":
        print(text)
        print("***************************")

driver.close()
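
# Note on the fixed time.sleep(3) calls above: an explicit wait is more
# robust, blocking only until the element actually appears. A sketch:
# from selenium.webdriver.support.ui import WebDriverWait
# WebDriverWait(driver, 10).until(
#     lambda d: d.find_element_by_tag_name("iframe"))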

def mysqltest():
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='mysql')
    cur = conn.cursor()
    cur.execute("use mysql")
    cur.execute("select * from user")
    print(cur.fetchall())
    cur.close()
    conn.close()

# mysqltest()
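
# Sketch: persisting scraped rows instead of printing them. The database
# name `scraping` and table `fangjia(info TEXT)` are assumptions -- create
# them first. A parameterized query keeps scraped text out of the SQL string.
def save_rows(rows):
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                           db='scraping', charset='utf8mb4')
    cur = conn.cursor()
    cur.executemany("INSERT INTO fangjia (info) VALUES (%s)",
                    [(r,) for r in rows])
    conn.commit()
    cur.close()
    conn.close()

# save_rows(["example row"])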