python 截取网页内容
2011-02-25 21:08
281 查看
程序备份:
用python截取网页的内容。以下代码times部分存在一些问题,需要增加多选结构,以匹配复杂的情况。
#This file is for obtaining data from website "http://bj.58.com/diannao/"
# -*- coding:utf-8 -*-
# file: re_html.py
#
import Tkinter #界面库
import urllib #url解析库
import re # 正则表达式库
#coding=utf-8
from pyExcelerator import * # excel文件读写库
class Window:
    """Tk GUI that scrapes listing rows (price / product / place / time)
    from a 58.com-style page and dumps them into a text widget.

    Note: Python 2 code (``Tkinter``, ``urllib.urlopen``).
    """

    def __init__(self, root):
        # Build the widgets: a URL entry, a fetch button, and a text area
        # that receives the scraped output.
        self.root = root
        self.label = Tkinter.Label(root, text='输入URL:')
        self.label.place(x=5, y=15)
        self.entryUrl = Tkinter.Entry(root, width=30)
        self.entryUrl.place(x=65, y=15)
        self.get = Tkinter.Button(root, text='获取数据', command=self.Get)
        self.get.place(x=280, y=15)
        self.edit = Tkinter.Text(root, width=470, height=600)
        self.edit.place(y=50)

    def Get(self):
        """Fetch the URL typed in the entry box, extract the four fields
        with lookbehind/lookahead regexes, and print the match counts
        followed by one aligned row per listing."""
        url = self.entryUrl.get()
        page = urllib.urlopen(url)
        try:
            data = page.read()
        finally:
            # BUG FIX: the handle leaked if read() raised; close it always.
            page.close()
        # Raw strings for every pattern so backslashes can never be
        # interpreted by the Python string literal.
        p_prices = re.compile(r'(?<=<td width="60" class="price">)[^<]*(?=</td>)')
        p_products = re.compile(r'(?<=" target="_blank" class="t">)[^<]*(?=</a>)')
        p_places = re.compile(r"(?<=/diannao/' class='u'>)[^<]*(?=</a>)")
        p_times = re.compile(r'(?<=<td class="pd" width="70">)[^<]*(?=</td>)')
        prices = p_prices.findall(data)
        products = p_products.findall(data)
        places = p_places.findall(data)
        times = p_times.findall(data)
        # BUG FIX: '/n' is a literal slash + n, not a line break — use '\n'.
        # Text.insert expects strings, so wrap the counts in str().
        for count in (len(prices), len(products), len(places), len(times)):
            self.edit.insert(Tkinter.END, str(count))
            self.edit.insert(Tkinter.END, '\n')
        # BUG FIX: the original loop started at index 1 (silently dropping
        # the first scraped row) and indexed every list with a bound taken
        # from len(places), which raises IndexError whenever the four
        # regexes match different numbers of rows (exactly the "times"
        # mismatch the author mentions). zip() stops at the shortest list.
        for price, product, place, when in zip(prices, products, places, times):
            self.edit.insert(Tkinter.END, price + ' ')
            self.edit.insert(Tkinter.END, product + ' ')
            self.edit.insert(Tkinter.END, place + ' ')
            self.edit.insert(Tkinter.END, when + '\n')
# Bootstrap: create the main window, attach the scraper UI, size it, run.
main_window = Tkinter.Tk()
app = Window(main_window)
main_window.minsize(600, 480)
main_window.mainloop()
用python截取网页的内容。以下代码times部分存在一些问题,需要增加多选结构,以匹配复杂的情况。
#This file is for obtaining data from website "http://bj.58.com/diannao/"
# -*- coding:utf-8 -*-
# file: re_html.py
#
import Tkinter #界面库
import urllib #url解析库
import re # 正则表达式库
#coding=utf-8
from pyExcelerator import * # excel文件读写库
class Window:
    """Tk GUI that scrapes listing rows (price / product / place / time)
    from a 58.com-style page and dumps them into a text widget.

    Note: Python 2 code (``Tkinter``, ``urllib.urlopen``).
    """

    def __init__(self, root):
        # Build the widgets: a URL entry, a fetch button, and a text area
        # that receives the scraped output.
        self.root = root
        self.label = Tkinter.Label(root, text='输入URL:')
        self.label.place(x=5, y=15)
        self.entryUrl = Tkinter.Entry(root, width=30)
        self.entryUrl.place(x=65, y=15)
        self.get = Tkinter.Button(root, text='获取数据', command=self.Get)
        self.get.place(x=280, y=15)
        self.edit = Tkinter.Text(root, width=470, height=600)
        self.edit.place(y=50)

    def Get(self):
        """Fetch the URL typed in the entry box, extract the four fields
        with lookbehind/lookahead regexes, and print the match counts
        followed by one aligned row per listing."""
        url = self.entryUrl.get()
        page = urllib.urlopen(url)
        try:
            data = page.read()
        finally:
            # BUG FIX: the handle leaked if read() raised; close it always.
            page.close()
        # Raw strings for every pattern so backslashes can never be
        # interpreted by the Python string literal.
        p_prices = re.compile(r'(?<=<td width="60" class="price">)[^<]*(?=</td>)')
        p_products = re.compile(r'(?<=" target="_blank" class="t">)[^<]*(?=</a>)')
        p_places = re.compile(r"(?<=/diannao/' class='u'>)[^<]*(?=</a>)")
        p_times = re.compile(r'(?<=<td class="pd" width="70">)[^<]*(?=</td>)')
        prices = p_prices.findall(data)
        products = p_products.findall(data)
        places = p_places.findall(data)
        times = p_times.findall(data)
        # BUG FIX: '/n' is a literal slash + n, not a line break — use '\n'.
        # Text.insert expects strings, so wrap the counts in str().
        for count in (len(prices), len(products), len(places), len(times)):
            self.edit.insert(Tkinter.END, str(count))
            self.edit.insert(Tkinter.END, '\n')
        # BUG FIX: the original loop started at index 1 (silently dropping
        # the first scraped row) and indexed every list with a bound taken
        # from len(places), which raises IndexError whenever the four
        # regexes match different numbers of rows (exactly the "times"
        # mismatch the author mentions). zip() stops at the shortest list.
        for price, product, place, when in zip(prices, products, places, times):
            self.edit.insert(Tkinter.END, price + ' ')
            self.edit.insert(Tkinter.END, product + ' ')
            self.edit.insert(Tkinter.END, place + ' ')
            self.edit.insert(Tkinter.END, when + '\n')
# Bootstrap: create the main window, attach the scraper UI, size it, run.
main_window = Tkinter.Tk()
app = Window(main_window)
main_window.minsize(600, 480)
main_window.mainloop()
相关文章推荐
- Python使用HTMLParser抓取网页内容
- Python3伪装浏览器爬虫读取网页内容
- [Python3.x]网络爬虫(一):利用urllib通过指定的URL抓取网页内容
- 分享:Python3伪装浏览器爬虫读取网页内容
- Python下使用Scrapy爬取网页内容的实例
- python基于BeautifulSoup实现抓取网页指定内容的方法
- 零基础写python爬虫之使用urllib2组件抓取网页内容
- python处理经过gzip压缩的网页内容
- Python---定向爬取网页的内容
- Python_BeautifulSoup 抓取网页内容入门
- python抓取网页内容示例分享
- 网页内容截取部分打印
- 【Python编程】读取网页内容并存储过滤
- python解析网页中javascript动态添加的内容(一)
- python抓取网页内容
- Python3访问并下载网页内容的方法
- python3 获取网页内容保存到文件
- [Python]网络爬虫(二):利用urllib2通过指定的URL抓取网页内容
- python 抓取网页内容教程
- python-爬虫-自带库抓取网页内容