用Python导出QQ空间的日志到WordPress
2010-01-05 04:31
561 查看
用Python导出QQ空间的日志到WordPress
文章来源:http://www.keakon.cn/bbs/thread-964-1-1.html 。方法很简单,找出日志的地址,再遍历列出日志的内容。
因为单纯导出没用,还得转换成其他格式,所以我保存到一个列表里,每篇日志都对应其中的一个字典元素,字典的属性都用unicode编码。
然后dump出来,可以方便以后用Python进行再处理(默认为blogs.txt文件)。
并转换成了WordPress用的格式(默认为qzone.xml文件)。
本想用多线程来下载,但似乎没必要,因为只花了80秒,我的149篇日志就全部下载下来了。
如果空间有设置访问权限的话,可以用Client这个模块来处理,把注释改下就行了。
此外,这个也可以盗取别人的日志,但愿不要滥用…
最后,评论我没下载,因为WordPress好像不能导入评论。
代码如下:
复制内容到剪贴板
代码:
# -*- coding: gbk -*- from __future__ import with_statement import codecs from datetime import datetime from datetime import timedelta from os import linesep import cPickle #import Client from urllib2 import urlopen mainUrl = 'http://%s.qzone.qq.com/' listUrl = 'http://b.qzone.qq.com/cgi-bin/blognew/blog_output_toppage?uin=%(qq)s&vuin=0&property=GoRE&getall=1&imgdm=imgcache.qq.com&bdm=b.qzone.qq.com&cate=&numperpage=100&sorttype=0&arch=0&pos=%(pos)d&direct=1' blogUrl = 'http://qzone.qq.com/blog/%(qq)s-%(blogid)s' GMT_FORMAT = '%a, %d %b %Y %H:%M:%S +0800' HEADER = u'''<?xml version="1.0" encoding="UTF-8"?> <rss version="2.0" xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:wfw="http://wellformedweb.org/CommentAPI/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:wp="http://wordpress.org/export/1.0/" > <channel> <title>%(author)s的QQ空间</title> <deion>%(deion)s</deion> <pubDate>%(time)s</pubDate> <generator>keakon的QQ空间导出程序</generator> <language>zh-CN</language> <wp:wxr_version>1.0</wp:wxr_version> '''.replace('\n', linesep) FOOTER = '''</channel> </rss>'''.replace('\n', linesep) #cj = Client.MSIEJar(delayload=True) #cj.load_from_registry() #opener = Client.build_opener(Client.HTTPProcessor(cj)) #Client.install_opener(opener) def getBasicInfo(qq): AUTHOR = '< name="author" content="' AUTHOR_LEN = len(AUTHOR) DESC = '< name="Deion" content="' DESC_LEN = len(DESC) #res = Client.urlopen(mainUrl % qq) res = urlopen(mainUrl % qq) html = res.read() begin = html.find(AUTHOR) if begin == -1: raise URLError, 'HTML not complete.' begin += AUTHOR_LEN end = html.find('"', begin) author = unicode(html[begin:end], 'utf8', 'replace') begin = html.find(DESC) if begin == -1: raise URLError, 'HTML not complete.' 
begin += DESC_LEN end = html.find('"', begin) deion = unicode(html[begin:end], 'utf8', 'replace') return author, deion def getBlogList(qq): global listUrl CATEGORY = "selectCategory('" CAT_LEN = len(CATEGORY) BLOG = 'selectBlog(' BLOG_LEN = len(BLOG) pos = 0 round = 0 blogs = [] while pos == len(blogs): #res = Client.urlopen(listUrl % {'qq': qq, 'pos': pos}) res = urlopen(listUrl % {'qq': qq, 'pos': pos}) html = res.read() res.close() begin = 0 while True: begin = html.find(CATEGORY, begin) if begin == -1: break else: begin += CAT_LEN end = html.find("')", begin) blog = {} blog['category'] = unicode(html[begin:end], 'gb18030', 'replace') begin = html.find(BLOG, end) if begin == -1: raise URLError, 'HTML not complete.' else: begin += BLOG_LEN end = html.find(')', begin) blog['id'] = html[begin:end] blogs.append(blog) begin = end pos += 100 print '已找到%d篇' % len(blogs) return blogs def getBlogContent(qq, author, blogs, outFile): global blogUrl TITLE = u'<h4 class="c_tx">' TIT_LEN = len(TITLE) TITLE_END = u'</h4>' TIME = u'发表时间:' TIME_LEN = len(TIME) DETAIL = u'<div id="blogDetailDiv"' DETAIL_END = u'<img id="paperPicArea1"' DETAIL_END_DIV = u'</div>' TIME_FORMAT = '%Y年%m月%d日 %H:%M' DATE_FORMAT = '%Y-%m-%d %H:%M:%S' ITEM = ''' <item> <title>%(title)s</title> <pubDate>%(pubDate)s</pubDate> <dc:creator><![CDATA[%(author)s]]></dc:creator> <content:encoded><![CDATA[%(content)s]]></content:encoded> <wp:post_date>%(time)s</wp:post_date> <wp:post_date_gmt>%(gmtTime)s</wp:post_date_gmt> <wp:comment_status>open</wp:comment_status> <wp:ping_status>open</wp:ping_status> <wp:post_name>%(title)s</wp:post_name> <wp:status>publish</wp:status> <wp:post_parent>0</wp:post_parent> <wp:menu_order>0</wp:menu_order> <wp:post_type>post</wp:post_type> <wp:post_password></wp:post_password> </item> '''.replace('\n', linesep) for index, blog in enumerate(blogs): url = blogUrl % {'qq': qq, 'blogid': blog['id']} print '正在下载第%(index)d篇日志: %(url)s' % {'index': index + 1, 'url': url} #res = 
Client.urlopen(url) res = urlopen(url) html = res.read() res.close() content = unicode(html, 'gbk', 'replace') begin = content.find(TITLE) if begin == -1: print 'HTML not complete. ID: ' + blog['id'] continue begin += TIT_LEN end = content.find(TITLE_END, begin) blog['title'] = content[begin:end] begin = content.find(TIME, end) if begin == -1: print 'HTML not complete. ID: ' + blog['id'] continue begin += TIME_LEN end = content.find('\r\n', begin) blog['time'] = datetime.strptime(content[begin:end].encode('gbk'), TIME_FORMAT) begin = content.find(DETAIL, end) if begin == -1: print 'HTML not complete. ID: ' + blog['id'] continue begin = content.find('>', begin) + 1 if begin == 0: print 'HTML not complete. ID: ' + blog['id'] continue end = content.find(DETAIL_END, begin) if end == -1: print 'HTML not complete. ID: ' + blog['id'] continue # 去掉最后2个div关闭标签 end2 = content.rfind(DETAIL_END_DIV, begin, end) if end2 != -1: end3 = content.rfind(DETAIL_END_DIV, begin, end2) end = end3 != -1 and end3 or end2 blog['content'] = content[begin:end].strip() outFile.write(ITEM % {'title': blog['title'], 'author': author, 'content': blog['content'], 'time': blog['time'].strftime(DATE_FORMAT), 'gmtTime': (blog['time'] -timedelta(hours=8)).strftime(DATE_FORMAT), 'pubDate': blog['time'].strftime(GMT_FORMAT)}) def main(qq, filename='qzone.xml', filename2='blogs.txt'): author, deion = getBasicInfo(qq) blogs = getBlogList(qq) if not blogs: print '没有找到日志。若您设置了QQ空间权限,请用IE登录QQ空间,并启用。' exit(1) categories = set([blog['category'] for blog in blogs]) with codecs.open(filename, 'w', 'utf8') as out # write header outFile.write(HEADER % {'author': author, 'deion': deion, 'time': datetime.now().strftime(GMT_FORMAT)}) for category in set([blog['category'] for blog in blogs]): outFile.write(u' <wp:category><wp:category_nicename>%(category)s</wp:category_nicename><wp:category_parent></wp:category_parent><wp:cat_name><![CDATA[%(category)s]]></wp:cat_name></wp:category>%(linesep)s' % {'category': 
category, 'linesep': linesep}) # write item getBlogContent(qq, author, blogs, outFile) # write footer outFile.write(FOOTER) with open(filename2, 'w') as outFile2: cPickle.dump(blogs, outFile2) print '全部导出完毕' if __name__ == "__main__": main('123456789') # 这里填你的QQ号
相关文章推荐
- Python导出QQ空间的日志
- 使用python解析Wordpress导出的xml文件
- python日志模块的封装
- Python日志模块logging
- python 的日志logging模块学习
- Python logging浅尝(将log同时输出到Console和日志文件)
- Python脚本---把MySQL数据库表中的数据导出生成csv格式文件
- python3 实现 virtual judge 日志(二):爬取HDU的页面 (2016.11.4更新)
- python实现apahce网站日志分析示例
- python日志
- Python日志8/1
- python logging 日志轮转文件不删除问题
- Python--Nginx+uWSGI+Flask (三)日志找错误
- python 简单日志文件
- [Python]日志模块logging的应用
- Python sftp到远程服务器读取日志文件
- python调用autoit组件自动使用plsqldev导出每日需要的业务报表
- python logging 日志
- 本文以python实现了一个日志文件中ip提取与统计程序
- linux下python导出sybase 数据库 表记录的方式