您的位置:首页 > 运维架构 > 网站架构

统计某个网站内的一些内容,很不完善。大家不要看了,浪费时间

2014-01-17 10:09 295 查看
#coding = gb2312

#from HTMLParser import HTMLParser
from HTMLParser import *
#import HTMLParser
import urllib
import sys
import time
import MySQLdb

#html_parser = HTMLParser.HTMLParser()

def db_insert_77169(id,sub_id,data,link):
    """Insert one link record into the ``77169_download_info`` table.

    A new MySQL connection is opened for the call, the row is inserted and
    committed, and the cursor and connection are always closed afterwards,
    including when the insert or the commit raises.

    Args:
        id: Page identifier, stored as the first column.
        sub_id: Position of the link within the page; stored as ``str(sub_id)``.
        data: Link text, stored as the third column.
        link: Link target, stored as the fourth column.

    Raises:
        Whatever ``MySQLdb`` raises while connecting, executing or committing;
        nothing is caught here so the caller can decide on a fallback.
    """
    # NOTE(review): connection settings (including the password) are
    # hard-coded; consider moving them to configuration.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='111111',
                           db='sample', port=3306, charset='gb2312')
    try:
        cur = conn.cursor()
        try:
            values = [id, str(sub_id), data, link]
            # Parameterized query: the driver escapes the values.
            cur.execute('insert into 77169_download_info values(%s,%s,%s,%s)', values)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Without this, a failed insert leaked the connection.
        conn.close()

def db_insert_77169_temp(id,sub_id):
    """Insert an ``(id, sub_id)`` pair into ``77169_download_info_temp``.

    A new MySQL connection is opened for the call, the row is inserted and
    committed, and the cursor and connection are always closed afterwards,
    including when the insert or the commit raises.

    Args:
        id: Page identifier, stored as the first column.
        sub_id: Position of the link within the page; stored as ``str(sub_id)``.

    Raises:
        Whatever ``MySQLdb`` raises while connecting, executing or committing.
    """
    # NOTE(review): connection settings (including the password) are
    # hard-coded; consider moving them to configuration.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='111111',
                           db='sample', port=3306, charset='gb2312')
    try:
        cur = conn.cursor()
        try:
            # Parameterized query: the driver escapes the values.
            cur.execute('insert into 77169_download_info_temp values(%s,%s)',
                        [id, str(sub_id)])
            conn.commit()
        finally:
            cur.close()
    finally:
        # Without this, a failed insert leaked the connection.
        conn.close()

class parselinks(HTMLParser):
    """Collect the text and ``href`` of selected ``<a>`` elements.

    An anchor is recorded only when its start tag carries exactly two
    attributes, one of which is ``href``, and the text between the start and
    end tag is non-empty once all whitespace has been removed.

    After feeding a page, ``data`` and ``link`` are parallel lists:
    ``data[i]`` is the whitespace-free link text and ``link[i]`` is the
    ``href`` value of the same anchor.
    """

    def __init__(self):
        self.data = []            # link texts, in document order
        self.link = []            # href values, parallel to self.data
        self.href = 0             # non-zero while inside a recorded anchor
        self.linkname = ''        # text accumulated for the current anchor
        self.linkname_temp = ''   # href of the current anchor
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start capturing when an ``<a>`` has exactly two attributes incl. ``href``."""
        if tag != 'a' or len(attrs) != 2:
            return
        for name, value in attrs:
            if name == 'href':
                self.linkname_temp = value
                self.href = 1

    def handle_data(self, data):
        """Append text to the current link name while inside a recorded anchor."""
        if self.href:
            self.linkname += data

    def handle_endtag(self, tag):
        """On ``</a>``, store the captured text/href pair and reset the state."""
        if tag != 'a':
            return
        # split()/join removes every whitespace character, including leading
        # and trailing ones, so no additional strip() is required.
        text = ''.join(self.linkname.split())
        if text:
            self.data.append(text)
            self.link.append(self.linkname_temp)
            self.linkname_temp = ''
        self.linkname = ''
        self.href = 0

    def getresult(self):
        """Write every collected link to the database.

        The page identifier is read from the module-level name ``total_id``,
        which the script's main loop assigns; it must be set before this
        method is called. Each link is inserted with ``db_insert_77169``
        using its position in the page as ``sub_id``. If that insert fails,
        only the ``(total_id, sub_id)`` pair is recorded through
        ``db_insert_77169_temp``; a failure of that fallback propagates.

        Nothing is written when ``data`` and ``link`` differ in length.
        """
        if len(self.data) != len(self.link):
            return
        for sub_id, (text, href) in enumerate(zip(self.data, self.link)):
            try:
                db_insert_77169(total_id, sub_id, text, href)
            except Exception:
                # Only ordinary errors trigger the fallback; KeyboardInterrupt
                # and SystemExit are allowed to stop the run.
                db_insert_77169_temp(total_id, sub_id)
if __name__=="__main__":
    # Crawl pages 0..69999 of the site and store the links found on each one.
    # total_id is deliberately a module-level name: parselinks.getresult()
    # reads it as a global to know which page the links belong to.
    for total_id in range(0,70000):
        url_link = "http://soft.aaaaa.com/HTML/" + str(total_id) + ".html"
        try:
            response = urllib.urlopen(url_link)
            try:
                data_dl = response.read()
            finally:
                # Release the socket even when the read fails.
                response.close()
            IParser = parselinks()
            IParser.feed(data_dl)
            IParser.getresult()
            IParser.close()
        except Exception as exc:
            # Best effort: a failing page must not stop the crawl, but the
            # failure is reported instead of being silently dropped.
            # Exception (not a bare except) keeps Ctrl-C working.
            sys.stderr.write("page %s skipped: %r\n" % (total_id, exc))
            continue
        # Progress indicator; the parenthesized form prints the same under
        # Python 2 and Python 3.
        print(total_id)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: