您的位置:首页 > 编程语言 > Python开发

Python简单抓取新浪某网页新闻链接及标题

2012-06-14 01:24 676 查看
进行了一些拓展(还可以再扩展,把tele中间路径从主页中获取到,然后用map给用户选择):

# Possible extension: let the user type in the date instead of hard-coding it
# (the daily pages appear to share one layout, so the same regexes still apply).
# Example of a resulting URL:
#   http://roll.tech.sina.com.cn/tele/2012-05-01.shtml
# NOTE: the misspelled name `newsMouth` is kept so that any existing reference
# to it keeps working.
newsYear = raw_input("Please input the year like 2012: ").strip()
# Month and day are zero-padded because the URL uses the YYYY-MM-DD form.
newsMouth = raw_input("Please input the month like 03: ").strip().zfill(2)
newsDay = raw_input("Please input the day like 02: ").strip().zfill(2)
doc = urlopen(
    "http://roll.tech.sina.com.cn/tele/%s-%s-%s.shtml"
    % (newsYear, newsMouth, newsDay)
).read()


没怎么接触过网页及网络相关知识,Python也还没上手,下面这段程序写得一波三折,bug不少,但勉强算是实现了对 http://roll.tech.sina.com.cn/tele/2012-05-01.shtml 网页新闻的抓取。Windows系统服务还没加上,还有一堆问题,待续……

# -*- coding: cp936 -*-
import win32serviceutil
import win32service
import win32event
from urllib import urlretrieve
from urllib import urlopen
import smtplib
from email.mime.text import MIMEText
from email.MIMEMultipart import MIMEMultipart
from email.Header import Header
#这个正则库感觉很棒
import re
import os
import xlrd

# Download the raw content of the 2012-05-01 page.
response = urlopen("http://roll.tech.sina.com.cn/tele/2012-05-01.shtml")
try:
    doc = response.read()
finally:
    # Release the connection even if reading fails.
    response.close()

#分别寻找链接和新闻标题
def extract_url(info):
    """Return the href of every news link found in *info*.

    A link is recognised by the literal markup
    ``<li><a href="..." target=_blank>``.

    Args:
        info: HTML text of the page to scan.

    Returns:
        A list of the captured href strings in document order; an empty list
        when nothing matches.
    """
    # Non-greedy capture: a greedy ``.*`` runs to the LAST
    # ``" target=_blank>`` on a line and merges several links into one match.
    rege = r'<li><a href="(.*?)" target=_blank>'
    return re.findall(rege, info)

def extract_title(info):
    """Return the link text of every news link found in *info*.

    A title is the text between ``" target=_blank>`` and the following
    ``</a><span class=``.

    Args:
        info: HTML text of the page to scan.

    Returns:
        A list of the captured title strings in document order; an empty list
        when nothing matches.
    """
    # Non-greedy capture: a greedy ``.*`` runs to the LAST
    # ``</a><span class=`` on a line and swallows the markup in between.
    pat = r'" target=_blank>(.*?)</a><span class='
    return re.findall(pat, info)

# Pull the link targets and the link texts out of the downloaded page; the two
# lists are later combined by position.
url, title = extract_url(doc), extract_title(doc)

# NOTE: using xlrd here is still problematic -- opening the path fails with
#   XLRDError('Unsupported format, or corrupt file: ' + msg)
def get_email_list():
    """Return the values of the first column of the "mail" sheet.

    The workbook ``email_list.xls`` is looked up in the current working
    directory and opened with xlrd.

    Returns:
        The list produced by ``sheet.col_values(0)``.
    """
    # os.path.join instead of a hard-coded "\\" keeps the path valid on
    # platforms that do not use a backslash separator.
    workbook_path = os.path.join(os.getcwd(), "email_list.xls")
    wb = xlrd.open_workbook(workbook_path, encoding_override="cp1252")
    sheet = wb.sheet_by_name("mail")
    return sheet.col_values(0)

#emaildest = get_email_list()

n = len(url)
# Build the HTML body of the mail: one link per headline.
# The table-based layout still had problems, so every link is wrapped in its
# own <div> to get a simple line break, e.g.
#   <div><a href="http://www.baidu.com" target="_blank">fsddfs</a></div>
# zip() pairs links with titles by position and stops at the shorter list, so
# a page where the two regexes match a different number of items no longer
# raises IndexError.
mail_context = "".join(
    '<div><a href="%s" target="_blank">%s</a></div>' % (link, headline)
    for link, headline in zip(url, title)
)

def sendsimplemail(text, dest):
    """Send *text* as a gb2312-encoded HTML mail to *dest*.

    Sending is best-effort: any exception raised while talking to the SMTP
    server is printed and not propagated.

    Args:
        text: HTML body of the message.
        dest: A single recipient address, or a list/tuple of addresses.
    """
    msg = MIMEText(text, 'html', 'gb2312')
    msg['Subject'] = Header('title', 'gb2312')
    msg['From'] = 'sourcedest'
    # str() on a list would put its repr (e.g. "['a@b.c']") in the header, so
    # sequences are joined into a comma-separated address list instead.
    if isinstance(dest, (list, tuple)):
        msg['To'] = ", ".join(dest)
    else:
        msg['To'] = str(dest)
    try:
        smtp = smtplib.SMTP()
        try:
            # Connect to the mail server and authenticate.
            smtp.connect(r'smtp.Serve.com')
            smtp.login('user', 'password')
            # The server verifies the sender, so the envelope sender should be
            # the same as the login account.
            smtp.sendmail('user', dest, msg.as_string())
        finally:
            # Always release the connection, including after a failure.
            smtp.close()
    except Exception as e:
        print(e)

# Recipient list; 'targetEmailAddress' is presumably a placeholder to be
# replaced with a real address.
dest = ['targetEmailAddress']
# Mail the generated HTML link list to the recipients.
sendsimplemail(mail_context, dest)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: