python2 调用库MySQLdb及爬取百度知道问答对程序
2017-07-20 09:42
246 查看
参考博文和链接均在下面,有些细节没有写,自行百度
感谢各位前辈大佬
平台Ubuntu16 Python2.7一:安装MySQL及可视化工具Workbench
安装MySQL:
http://blog.csdn.net/lina_acm/article/details/51810898
查验MySQL:
sudo netstat -tap | grep mysql
安装MySQL:
sudo apt-get install mysql-server
sudo apt-get install mysql-client
sudo apt-get install libmysqlclient-dev
安装mysql可视化工具Workbench:
http://blog.csdn.net/jgirl_333/article/details/48575281
sudo apt-get install mysql-workbench
安装mysqldb(驱动):
sudo apt-get install python-mysqldb
http://blog.csdn.net/boycycyzero/article/details/42787797
pip install mysql-python
# import MySQLdb
Python爬取数据并写入MySQL数据库
http://blog.csdn.net/Oscer2016/article/details/70257956?locationNum=9&fps=1
二:插入中文长字符串可能遇到的问题
库内容的存储类型都需要是utf8
关于解决mysql workbench无法插入中文的问题
问题描述:
在向表格中插入中文时出现了这样的报错
解决方法
在建立数据库的时候collation做如下选择 点击Schema右边的双向下的箭头,就会显示出被收起的Collation选项,我这里默认是latin1 - default
collation 需要做如下图所示的修改
修改为utf8 - default collation 可能根据存储的不同需要选择不同的保存格式,这点尤其重要
存储长字符串,都改为utf8 - default collation 同时采用LONGTEXT
mysql中[Err] 1366 - Incorrect string value: '\xE5\x8D\x8E' 问题
问题描述:向数据库插入数据时出现:
[Err] 1366 -Incorrect string value: '\xE5\x8D\x8E\xE5\xB8\x88' for column 'uaddr' at row 1
解决办法:将该表中的每一列的字符集都改为utf-8
三:Python调用MySQL
主要参考:
python下的MySQLdb使用
http://drizzlewalk.blog.51cto.com/2203401/448874
python操作mysql数据库
http://www.runoob.com/python/python-mysql.html
#!/usr/bin/env python2 # -*- coding: utf-8 -*- """ Created on Wed Jul 31 16:21:97 2017 @author: azurewit """ #073101 删除重复链接 #073103 获取可接受答案 #080202 添加表名全局变量 import time import urllib import re import requests import MySQLdb from bs4 import BeautifulSoup def get_page(Q_No, url, data = None): global page_question_No, Attempts #获取URL的requests wb_data = requests.get(url) wb_data.encoding = ('gbk') soup = BeautifulSoup(wb_data.text, 'lxml') #定义爬取的数据 webdata = soup.select('a.ti') answer_sum = list(soup.find_all('dd', class_ = "dd answer")) if data==None: for title_pre, url, answer_pre in zip(webdata, webdata, answer_sum): data = [ title_pre.get_text(), url.get('href'), answer_pre.get_text() ] #进入问答页面 url_sub = data[1] if url_sub in new_urls : continue if url_sub not in new_urls : print '\n正在第 %d 次尝试获取问答对……' % (Attempts) Attempts += 1 new_urls.add(url_sub) wb_data_sub = requests.get(url_sub) wb_data_sub.encoding = ('gbk') soup_sub = BeautifulSoup(wb_data_sub.text, 'lxml') title = soup_sub.find('span', class_ = "ask-title ") best_answer = soup_sub.find('pre', class_ = "best-text mb-10") img_answer = soup_sub.find('img', class_ = "word-replace") if title != None: if best_answer != None : question_now = title.get_text() if img_answer != None: best = data[2] type_st = 'point 1-1' else: best = best_answer.get_text() type_st = 'point 1-2' elif best_answer == None: question_now = title.get_text() best_answer 4000 = soup_sub.find('div', class_ = "best-text mb-10") if best_answer != None : if img_answer != None: best = data[2] type_st = 'point 2-1' else: best = best_answer.get_text() type_st = 'point 2-2' else: better_answer = soup_sub.find('div', class_ = "answer-text line") if better_answer != None: if img_answer != None: better = data[2] best = better type_st = 'point 2-3' else: best = better_answer.get_text() type_st = 'point 2-4' else: better_answer = soup_sub.find('div', class_ = "answer-text mb-10") if img_answer != None: better = data[2] best = better type_st = 'point 2-5' 
elif better_answer != None: best = better_answer.get_text() type_st = 'point 2-6' else : best = data[2] type_st = 'point 2-7' else: question_now = data[0] best = data[2] type_st = 'point 3-1' haskeyword = re.search(rekey_word, question_now.encode("UTF-8")) has3points = re.search(re_3points, best) if haskeyword == None or has3points != None: continue else: page_now = page_question_No page_question_No += 1 print '\n===================\n爬取的第 %d 问答对为\ :\n===================\n' % (page_question_No) print question_now print best cursor = db.cursor() question_j = question_now.encode("UTF-8") best_j = best.encode("UTF-8") type_j = type_st.encode("UTF-8") keyword_j = key_word sql = "INSERT INTO test_table (ID, KEYWORD, LINK, TYPE, QUESTION, ANSWER) \ VALUES ('%d', '%s', '%s', '%s', '%s', '%s')" % \ ( page_now, keyword_j, url_sub, type_j, question_j , best_j) try: cursor.execute(sql) db.commit() except: db.rollback() time.sleep(1) #迭代页数 def get_more_page(start, end): for one in range(start, end, 10): get_page(one,url+str(one)) time.sleep(1) #主体 global new_urls, re_3points, Attempts, key_word, rekey_word, table_name new_urls = set() page_question_No = 0 Attempts = 1 re_3points = '\.{3}$' table_name = 'test_table' print('连接到mysql服务器...') db = MySQLdb.connect("localhost","root","你的mySQL密码","azure_schema" ,charset='utf8') print('连接上了!') cursor_pre = db.cursor() sql = "DELETE FROM test_table " try: # 执行SQL语句 cursor_pre.execute(sql) # 提交MySQL db.commit() except: # 发生错误时回滚 db.rollback() db.close() db = MySQLdb.connect("localhost", "root", "你的mySQL密码", "azure_schema", charset='utf8') print('清空并连接上了!') #定义爬取关键词、页数 #key_word = raw_input('请输入关键词\n') key_words =( '深度学习','自动驾驶','ImageNet','机器视觉','图像识别',\ '机器学习','正则化','卷积神经网络','数据稀疏','稀疏编码',\ '循环神经网络','递归神经网络','无人驾驶','逻辑回归','前向信号计算',\ '自学习聚类','遗传算法','朴素贝叶斯算法','智能算法','人脸识别',\ 'PageRank算法','最近邻分类算法','Kmeans算法','AdaBoost算法','SVM 支持向量机',\ 'CART分类','回归树','自编码器','图像检测','OCR',\ 'TensorFlow','AdaBoot','caffe','torch','MXNet',\ 
'theano','python','DeepMind','聚类算法','贝叶斯方法',\ '人脸标注','逻辑感知','数据分析','数据挖掘','贝叶斯统计',\ '博弈论','指纹识别','聊天机器人','AlphaGo','大数据',\ '云计算','物联网','人工智能','智能机器人','语言识别',\ '自然语言处理','专家系统','自然语言理解', 'OpenCV', '图像处理',\ '颜色空间','多分辨处理','形态学处理','图像拼接','并行计算',\ 'GPU加速','数据结构','声学特征提取','声纹识别','线性预测',\ '模板匹配','语言模型','声纹注册','HMM模型','模式识别',\ '特征提取','数据预处理','模型评估','回归算法','分类算法',\ '图形API','虚拟现实','降维算法','人机交互','受限波尔兹曼模型',\ '数据管理平台','知识图谱','随机森林','关联规则学习','计算机视觉',\ '生物特征识别','搜索引擎','凸分析','算法复杂度','Boosting',\ '逻辑语句','语义网络','决策树','信息过滤系统','数据库' ) #pages = input('每个关键字的总页面数: \n') pages = 10 for key_word in key_words: rekey_word = '\w*%s\w*' % (key_word) #定义将要爬取的URL url = "https://zhidao.baidu.com/search?word=" + urllib.quote(key_word)+"&pn=" Page_star = 0 Page_end = int(pages)*10 get_more_page(Page_star, Page_end) print '********************\n完成获取关键字为: %s \ \n目前共获取问答对: %d \n********************' \ % (key_word, page_question_No) print '********************\n完成获取: %d \ \n********************' % (page_question_No) db.close()
有错的话,(*^__^*) 嘻嘻……
反弹,biubiubiu~~~
相关文章推荐
- Python调用(运行)外部程序
- python调用(运行)外部程序
- 发布vc调用Python的独立运行程序
- python外部调用程序
- python写的调用ms sqlserver数据并发送邮件的小程序
- Python3 被 PHP 程序调用执行返回乱码问题
- go和python调用其它程序并得到程序输出
- Python海量数据处理之_Hadoop(三)程序调用
- Windows C++程序调用包含TensorFlow代码的Python脚本出错问题的解决办法
- python : html 调用本地python程序
- Python调用(运行)外部程序
- PY++ 自动将你的C++程序接口封装供python调用
- idea中通过java程序直接调用python文件
- c++调用Python的第二个小程序
- python 调用C程序的结构体和函数
- c程序多次调用python脚本的正确打开模式
- Python调用(运行)外部程序
- python调用exe程序
- Python是这样调用matlab程序的!