python基础(xml,html,bs4)
2015-12-01 16:40
429 查看
http://python-data.dr-chuck.net/comments_42.xml
<commentinfo><note>This file contains the sample data for testing</note><comments><comment><name>Leven</name><count>100</count></comment><comment><name>Mahdiya</name><count>97</count></comment><comment><name>Ajayraj</name><count>87</count></comment><comment><name>Lillian</name><count>86</count></comment><comment><name>Aon</name><count>86</count></comment><comment><name>Ruaraidh</name><count>78</count></comment><comment><name>Gursees</name><count>75</count></comment><comment><name>Emmanuel</name><count>74</count></comment><comment><name>Christy</name><count>72</count></comment><comment><name>Annoushka</name><count>72</count></comment><comment><name>Inara</name><count>72</count></comment><comment><name>Caite</name><count>70</count></comment><comment><name>Rosangel</name><count>70</count></comment><comment><name>Iana</name><count>66</count></comment><comment><name>Anise</name><count>66</count></comment><comment><name>Jaosha</name><count>65</count></comment><comment><name>Cadyn</name><count>65</count></comment><comment><name>Edward</name><count>63</count></comment><comment><name>Charlotte</name><count>61</count></comment><comment><name>Sammy</name><count>60</count></comment><comment><name>Zarran</name><count>60</count></comment><comment><name>Rowen</name><count>59</count></comment><comment><name>Stanislaw</name><count>59</count></comment><comment><name>Maighdlin</name><count>57</count></comment><comment><name>Connan</name><count>56</count></comment><comment><name>Warrick</name><count>54</count></comment><comment><name>Diya</name><count>52</count></comment><comment><name>Lawson</name><count>52</count></comment><comment><name>Wu</name><count>51</count></comment><comment><name>Irmak</name><count>47</count></comment><comment><name>Emilija</name><count>47</count></comment><comment><name>Kayda</name><count>41</count></comment><comment><name>Ellenor</name><count>41</count></comment><comment><name>Kyra</name><count>41</count></comment><comment><name>Nikita</name><count>38</coun
t></comment><comment><name>Kaelah</name><count>35</count></comment><comment><name>Meko</name><count>32</count></comment><comment><name>Marissa</name><count>31</count></comment><comment><name>Ayat</name><count>24</count></comment><comment><name>Sali</name><count>19</count></comment><comment><name>Hashem</name><count>19</count></comment><comment><name>Tygan</name><count>18</count></comment><comment><name>Rioden</name><count>17</count></comment><comment><name>Cruiz</name><count>16</count></comment><comment><name>Caoilfinn</name><count>13</count></comment><comment><name>Ewen</name><count>8</count></comment><comment><name>Baighley</name><count>7</count></comment><comment><name>Ramone</name><count>1</count></comment><comment><name>Kyran</name><count>1</count></comment><comment><name>Noelani</name><count>1</count></comment></comments></commentinfo>
# Sum the <count> values of every comment in the sample comments XML feed.
import xml.etree.ElementTree as ET

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, which this snippet was originally written for
    from urllib2 import urlopen

COMMENTS_URL = 'http://python-data.dr-chuck.net/comments_42.xml'


def sum_comment_counts(xml_text):
    """Return the sum of all ``comments/comment/count`` values in *xml_text*.

    xml_text: the XML document as text or bytes; its root element must
        contain a ``comments`` element holding ``comment`` children.

    Returns 0 when there are no ``comment`` elements.
    Raises AttributeError if a ``comment`` has no ``count`` child, and
    ValueError if a ``count`` is not an integer.
    """
    root = ET.fromstring(xml_text)
    return sum(int(comment.find('count').text)
               for comment in root.findall('comments/comment'))


def main():
    """Download the sample feed and print the total of its counts."""
    response = urlopen(COMMENTS_URL)
    try:
        payload = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    print(sum_comment_counts(payload))  # the original post notes 2553 here


if __name__ == '__main__':
    main()
# Write a PASCAL VOC style annotation XML file for every *.jpg image in a
# folder, taking the bounding boxes from the image's companion *.txs file.
from xml.dom import minidom
import os
import glob
import cv2

# Folder holding the *.jpg images and their *.txs box files.
SOURCE_DIR = "E:\\shared\\Format_Trans_20160328\\src_txs"


def _append_text_element(doc, parent, tag, text):
    """Create ``<tag>text</tag>``, append it to *parent* and return it."""
    node = doc.createElement(tag)
    node.appendChild(doc.createTextNode(text))
    parent.appendChild(node)
    return node


def InitVoc(vocfile):
    """Build the annotation document for image *vocfile* and write it out.

    The XML is written to the name of *vocfile* up to its first '.', plus
    ``.xml``.
    """
    doc = minidom.Document()
    annotation = doc.createElement("annotation")
    doc.appendChild(annotation)
    addObject(doc, annotation, vocfile)
    # ``with`` closes the file even if serialisation fails.
    with open(vocfile.split('.')[0] + ".xml", "w") as xml_out:
        doc.writexml(xml_out)


def addObject(doc, annotation, vocfile):
    """Fill *annotation* with the metadata and objects of image *vocfile*.

    doc: the minidom Document used to create nodes.
    annotation: the root ``<annotation>`` element that receives the children.
    vocfile: image file name; the boxes are read from the file named like
        *vocfile* up to its first '.', plus ``.txs``.

    Raises IOError if OpenCV cannot read the image.
    """
    # <folder>VOC2007</folder> and <filename>XXXXXX</filename>
    _append_text_element(doc, annotation, "folder", "VOC2007")
    _append_text_element(doc, annotation, "filename", vocfile)

    # <source> with database / annotation / image / flickrid children
    source = doc.createElement("source")
    annotation.appendChild(source)
    _append_text_element(doc, source, "database", "The VOC2007 Database")
    _append_text_element(doc, source, "annotation", "PASCAL VOC2007")
    _append_text_element(doc, source, "image", "flickr")
    _append_text_element(doc, source, "flickrid", "wang")

    # <owner> with flickrid / name children.  The second child used to be
    # created with the tag "image"; the VOC template calls it <name>.
    owner = doc.createElement("owner")
    annotation.appendChild(owner)
    _append_text_element(doc, owner, "flickrid", "wang")
    _append_text_element(doc, owner, "name", "wang")

    # <size> with width / height / depth, measured from the image itself.
    # Read the image named by the parameter, not a module-level global.
    img = cv2.imread(vocfile)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("cannot read image: %s" % vocfile)
    img_height, img_width = img.shape[:2]
    img_depth = img.shape[2] if img.ndim == 3 else 1
    size = doc.createElement("size")
    annotation.appendChild(size)
    _append_text_element(doc, size, "width", str(img_width))
    _append_text_element(doc, size, "height", str(img_height))
    _append_text_element(doc, size, "depth", str(img_depth))

    # <segmented>0</segmented>
    _append_text_element(doc, annotation, "segmented", "0")

    # One <object> per usable box entry in the .txs file.
    with open(vocfile.split('.')[0] + '.txs') as box_file:
        raw = box_file.read()
    # Drop every character whose repr() needs a \x escape (non-printable
    # bytes under Python 2) before parsing the entries.
    text = "".join(ch for ch in raw if '\\x' not in repr(ch))
    for entry in text.split(' '):
        parts = entry.split('{')
        if len(parts) < 2:
            continue
        nums = parts[1].split(',')
        if len(nums) != 4:
            continue
        # Only single-character labels whose box passes the size check are
        # kept.
        # NOTE(review): nums[0]/nums[2] are written out as xmin/xmax but are
        # compared against the image height, and nums[1]/nums[3] (ymin/ymax)
        # against the width.  Kept exactly as the original did it -- confirm
        # the coordinate order used by the .txs files.
        if not (len(parts[0]) == 1
                and img_height > int(nums[0])
                and img_height >= int(nums[2])
                and img_width >= int(nums[3])
                and img_width > int(nums[1])):
            continue
        obj = doc.createElement("object")
        annotation.appendChild(obj)
        _append_text_element(doc, obj, "name", parts[0])
        _append_text_element(doc, obj, "pose", "Left")
        _append_text_element(doc, obj, "truncated", "1")
        _append_text_element(doc, obj, "difficult", "0")
        bndbox = doc.createElement("bndbox")
        _append_text_element(doc, bndbox, "xmin", nums[0])
        _append_text_element(doc, bndbox, "ymin", nums[1])
        _append_text_element(doc, bndbox, "xmax", nums[2])
        _append_text_element(doc, bndbox, "ymax", nums[3])
        obj.appendChild(bndbox)


def main():
    """Annotate every *.jpg found in SOURCE_DIR."""
    os.chdir(SOURCE_DIR)
    for image_name in glob.glob("*.jpg"):
        InitVoc(image_name)


if __name__ == "__main__":
    main()
# Convert one handwriting sample (InkML strokes plus its UPX label) into a
# binary .plt file: a fixed-size header followed by the stroke coordinates.
import struct
import xml.dom.minidom  # explicit import; the original appears to have reached it only via a star import

INKML_PATH = "E:/ADAB_set/ADAB_set/set_2/inkml/1233225548643.inkml"
UPX_PATH = "E:/ADAB_set/ADAB_set/set_2/upx/1233225548643.upx"
PLT_PATH = "C:/Users/samsung/Desktop/1.plt"

MAX_CHARS = 256              # character slots reserved in the header
HEADER_BYTES = 520           # size of the packed 'HH258H' header
STROKE_END = (65535, 0)      # pair written after each stroke
SAMPLE_END = (65535, 65535)  # pair written after the last stroke


class PltHeader(object):
    """Plain record holding the header fields of a .plt sample."""

    def __init__(self, uSize, uCharNum, uTest, uStrokeNum, reserve):
        self.uSize = uSize            # size value stored in the header
        self.uCharNum = uCharNum      # number of label characters
        self.uText = uTest            # label code points, later padded
        self.uStrokeNum = uStrokeNum  # number of strokes (traces)
        self.reserve = reserve        # last header word


def text_to_code_points(value):
    """Return the code point of every character of *value* except spaces.

    Uses ord() directly instead of parsing repr() output, so plain ASCII
    and Latin-1 characters are handled as well.
    """
    return [ord(ch) for ch in value if ch != ' ']


def flatten_traces(trace_texts):
    """Flatten trace strings into the 16-bit value list of the .plt body.

    trace_texts: iterable of strings of comma-separated points, each point
        being whitespace-separated numbers of which the first two are used.

    Returns ``(values, point_count)``.  ``values`` holds the two numbers of
    every point, STROKE_END after each trace and SAMPLE_END at the very end.
    """
    values = []
    point_count = 0
    for trace in trace_texts:
        for point in trace.split(','):
            fields = point.split()  # tolerant of ", " separators and newlines
            if not fields:
                continue  # blank fragment, e.g. after a trailing comma
            values.append(int(fields[0]))
            values.append(int(fields[1]))
            point_count += 1
        values.extend(STROKE_END)
    values.extend(SAMPLE_END)
    return values, point_count


def build_header(label, stroke_count, point_count, reserve=0):
    """Return a PltHeader filled in for one sample.

    Raises ValueError if *label* has more than MAX_CHARS characters.
    """
    header = PltHeader(0, 0, [], stroke_count, reserve)
    header.uText.extend(text_to_code_points(label))
    header.uCharNum = len(header.uText)
    if header.uCharNum > MAX_CHARS:
        raise ValueError("label has %d characters, at most %d fit"
                         % (header.uCharNum, MAX_CHARS))
    header.uText.extend([0] * (MAX_CHARS - header.uCharNum))
    # The packed header needs 258 words after the two leading ones: the 256
    # character slots, the stroke count and the reserved word.  The stroke
    # count used to be left out, so packing always failed.
    header.uText.append(header.uStrokeNum)
    header.uText.append(header.reserve)
    header.uSize = HEADER_BYTES + (stroke_count + 1 + point_count) * 2 * 2
    return header


def pack_header(header):
    """Return *header* packed as 260 native unsigned shorts."""
    return struct.pack('HH258H', header.uSize, header.uCharNum, *header.uText)


def main():
    """Convert the sample at INKML_PATH / UPX_PATH into PLT_PATH."""
    ink_root = xml.dom.minidom.parse(INKML_PATH).documentElement
    upx_root = xml.dom.minidom.parse(UPX_PATH).documentElement
    traces = ink_root.getElementsByTagName('trace')
    label = upx_root.getElementsByTagName("alternate")[0].getAttribute("value")

    nums, count = flatten_traces(trace.firstChild.data for trace in traces)
    my_plt = build_header(label, len(traces), count)

    with open(PLT_PATH, "w+b") as binfile:
        binfile.write(pack_header(my_plt))
        binfile.write(struct.pack('%dH' % len(nums), *nums))

    print("%s %s %s %s" % (my_plt.uStrokeNum, my_plt.uSize,
                           my_plt.uText, my_plt.uCharNum))
    print(len(nums))


if __name__ == "__main__":
    main()
# Batch version: for every sample-set folder under DATASET_ROOT, append all
# of its samples (InkML strokes plus UPX label) to one .plt file per set.
import os
import struct
from xml.dom.minidom import parse

DATASET_ROOT = "E:/ADAB_set/ADAB_set"
OUTPUT_PATTERN = "C:/Users/samsung/Desktop/%s.plt"

MAX_CHARS = 256              # character slots reserved in each header
HEADER_BYTES = 520           # size of the packed 'HH258H' header
STROKE_END = (65535, 0)      # pair written after each stroke
SAMPLE_END = (65535, 65535)  # pair written after the last stroke of a sample


def text_to_code_points(value):
    """Return the code point of every character of *value* except spaces.

    Uses ord() directly instead of parsing repr() output, so plain ASCII
    and Latin-1 characters are handled as well.
    """
    return [ord(ch) for ch in value if ch != ' ']


def flatten_traces(trace_texts):
    """Flatten trace strings into the 16-bit value list of a sample body.

    Returns ``(values, point_count)``.  ``values`` holds the first two
    numbers of every comma-separated point, STROKE_END after each trace and
    SAMPLE_END at the very end.
    """
    values = []
    point_count = 0
    for trace in trace_texts:
        for point in trace.split(','):
            fields = point.split()  # tolerant of ", " separators and newlines
            if not fields:
                continue  # blank fragment, e.g. after a trailing comma
            values.append(int(fields[0]))
            values.append(int(fields[1]))
            point_count += 1
        values.extend(STROKE_END)
    values.extend(SAMPLE_END)
    return values, point_count


def pack_sample(label, trace_texts, reserve=0):
    """Return the bytes of one sample record: header followed by body.

    label: the sample's text; its code points fill the header's slots.
    trace_texts: one coordinate string per stroke.
    reserve: value of the last header word.

    Raises ValueError if *label* has more than MAX_CHARS characters.
    """
    code_points = text_to_code_points(label)
    if len(code_points) > MAX_CHARS:
        raise ValueError("label has %d characters, at most %d fit"
                         % (len(code_points), MAX_CHARS))
    trace_texts = list(trace_texts)
    stroke_count = len(trace_texts)
    values, point_count = flatten_traces(trace_texts)
    # 256 character slots, then the stroke count and the reserved word.
    slots = code_points + [0] * (MAX_CHARS - len(code_points))
    slots.append(stroke_count)
    slots.append(reserve)
    size = HEADER_BYTES + (stroke_count + 1 + point_count) * 2 * 2
    header = struct.pack('HH258H', size, len(code_points), *slots)
    return header + struct.pack('%dH' % len(values), *values)


def list_files(folder):
    """Return the names of the files directly inside *folder*.

    Returns an empty list when *folder* does not exist.
    """
    for _root, _dirs, files in os.walk(folder):
        return files
    return []


def convert_set(set_dir, out_path):
    """Append every sample of *set_dir* to the .plt file at *out_path*.

    Strokes come from ``<set_dir>/inkml`` and labels from ``<set_dir>/upx``.
    Each stroke file is matched to the label file with the same base name
    rather than by list position, so a missing or extra file cannot shift
    the labels of the samples that follow it.  Stroke files without a
    matching label file are reported and skipped.
    """
    inkml_dir = os.path.join(set_dir, "inkml")
    upx_dir = os.path.join(set_dir, "upx")
    upx_by_stem = dict((os.path.splitext(name)[0], name)
                       for name in list_files(upx_dir))
    # "ab" is kept from the original: running the script again appends to
    # an existing output file instead of replacing it.
    with open(out_path, "ab") as binfile:
        for index, inkml_name in enumerate(list_files(inkml_dir)):
            upx_name = upx_by_stem.get(os.path.splitext(inkml_name)[0])
            if upx_name is None:
                print("skipping %s: no matching label file in %s"
                      % (inkml_name, upx_dir))
                continue
            ink_root = parse(os.path.join(inkml_dir, inkml_name)).documentElement
            upx_root = parse(os.path.join(upx_dir, upx_name)).documentElement
            traces = ink_root.getElementsByTagName('trace')
            alternates = upx_root.getElementsByTagName("alternate")
            label = alternates[0].getAttribute("value")
            # Progress output: sample index and decoded label.
            print(index)
            print(text_to_code_points(label))
            binfile.write(pack_sample(
                label, [trace.firstChild.data for trace in traces]))


def main():
    """Convert every direct sub-folder of DATASET_ROOT."""
    for name in os.listdir(DATASET_ROOT):
        set_dir = os.path.join(DATASET_ROOT, name)
        if os.path.isdir(set_dir):
            convert_set(set_dir, OUTPUT_PATTERN % name)


if __name__ == "__main__":
    main()
<html> <head> <title>People that Avah knows</title> <style> .overlay{ opacity:0.99; background-color:#eee; position:fixed; width:100%; height:100%; top:0px; left:0px; z-index:1000; } </style> </head> <body> <h1>People that Avah knows</h1> <div class="overlay" id="overlay" style="display:none" > <center> <h2> This screen randomly changes the height between list items and vanishes after a while to make sure that you retrieve and process the data in a Python program rather than simply counting down pressing links, and and doing the assignment without writing a Python program :). The names are in the same order in the HTML even though they shift around on the screen visually. Your Python program can look at the page as long as it likes. </h2> </center> </div> <ul> <li style="margin-top: 7px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Daniyal.html">Daniyal</a></li> <li style="margin-top: 4px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fares.html">Fares</a></li> <li style="margin-top: 15px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Kynan.html">Kynan</a></li> <li style="margin-top: 10px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Erika.html">Erika</a></li> <li style="margin-top: 29px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Milly.html">Milly</a></li> <li style="margin-top: 12px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Ceara.html">Ceara</a></li> <li style="margin-top: 1px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Rennie.html">Rennie</a></li> <li style="margin-top: 31px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Franco.html">Franco</a></li> <li style="margin-top: 19px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Roxie.html">Roxie</a></li> <li style="margin-top: 4px;"><a 
href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Giyia.html">Giyia</a></li> <li style="margin-top: 32px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Zuzanna.html">Zuzanna</a></li> <li style="margin-top: 15px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Calean.html">Calean</a></li> <li style="margin-top: 26px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Robyn.html">Robyn</a></li> <li style="margin-top: 22px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Sainabou.html">Sainabou</a></li> <li style="margin-top: 21px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Maximilian.html">Maximilian</a></li> <li style="margin-top: 36px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Katso.html">Katso</a></li> <li style="margin-top: 9px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Armaan.html">Armaan</a></li> <li style="margin-top: 23px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Keiva.html">Keiva</a></li> <li style="margin-top: 8px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Eiko.html">Eiko</a></li> <li style="margin-top: 28px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Salahudin.html">Salahudin</a></li> <li style="margin-top: 15px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Marwa.html">Marwa</a></li> <li style="margin-top: 13px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Nodoka.html">Nodoka</a></li> <li style="margin-top: 46px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Dhyia.html">Dhyia</a></li> <li style="margin-top: 15px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lianne.html">Lianne</a></li> <li style="margin-top: 5px;"><a 
href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Tyree.html">Tyree</a></li> <li style="margin-top: 24px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Diona.html">Diona</a></li> <li style="margin-top: 35px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lliam.html">Lliam</a></li> <li style="margin-top: 15px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Crystyn.html">Crystyn</a></li> <li style="margin-top: 34px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Maca.html">Maca</a></li> <li style="margin-top: 1px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Marina.html">Marina</a></li> <li style="margin-top: 22px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Leah.html">Leah</a></li> <li style="margin-top: 21px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Avril.html">Avril</a></li> <li style="margin-top: 47px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Riagan.html">Riagan</a></li> <li style="margin-top: 13px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Jaying.html">Jaying</a></li> <li style="margin-top: 59px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Emaly.html">Emaly</a></li> <li style="margin-top: 26px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Rheanne.html">Rheanne</a></li> <li style="margin-top: 46px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Owais.html">Owais</a></li> <li style="margin-top: 31px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Aria.html">Aria</a></li> <li style="margin-top: 8px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Kyie.html">Kyie</a></li> <li style="margin-top: 48px;"><a 
href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Keryis.html">Keryis</a></li> <li style="margin-top: 32px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Marcous.html">Marcous</a></li> <li style="margin-top: 44px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Catrin.html">Catrin</a></li> <li style="margin-top: 54px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Marcelina.html">Marcelina</a></li> <li style="margin-top: 52px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Madeline.html">Madeline</a></li> <li style="margin-top: 21px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Patrikas.html">Patrikas</a></li> <li style="margin-top: 66px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lacey.html">Lacey</a></li> <li style="margin-top: 57px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Eason.html">Eason</a></li> <li style="margin-top: 9px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Kyrillos.html">Kyrillos</a></li> <li style="margin-top: 16px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Benjamin.html">Benjamin</a></li> <li style="margin-top: 34px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Ege.html">Ege</a></li> <li style="margin-top: 39px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Marwan.html">Marwan</a></li> <li style="margin-top: 6px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Celik.html">Celik</a></li> <li style="margin-top: 42px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Kendal.html">Kendal</a></li> <li style="margin-top: 18px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Kasja.html">Kasja</a></li> <li style="margin-top: 22px;"><a 
href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Asena.html">Asena</a></li> <li style="margin-top: 12px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Harris.html">Harris</a></li> <li style="margin-top: 79px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Si.html">Si</a></li> <li style="margin-top: 56px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lumi.html">Lumi</a></li> <li style="margin-top: 33px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Chevy.html">Chevy</a></li> <li style="margin-top: 24px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Famara.html">Famara</a></li> <li style="margin-top: 47px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Cara.html">Cara</a></li> <li style="margin-top: 80px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Elisa.html">Elisa</a></li> <li style="margin-top: 23px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Nihaal.html">Nihaal</a></li> <li style="margin-top: 76px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Vivian.html">Vivian</a></li> <li style="margin-top: 73px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Farrah.html">Farrah</a></li> <li style="margin-top: 69px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Shonagh.html">Shonagh</a></li> <li style="margin-top: 30px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Monty.html">Monty</a></li> <li style="margin-top: 80px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Turner.html">Turner</a></li> <li style="margin-top: 6px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Oliver.html">Oliver</a></li> <li style="margin-top: 24px;"><a 
href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Aayan.html">Aayan</a></li> <li style="margin-top: 3px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Atom.html">Atom</a></li> <li style="margin-top: 3px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Abby.html">Abby</a></li> <li style="margin-top: 68px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Makala.html">Makala</a></li> <li style="margin-top: 79px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Rupert.html">Rupert</a></li> <li style="margin-top: 30px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Aine.html">Aine</a></li> <li style="margin-top: 50px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Feden.html">Feden</a></li> <li style="margin-top: 44px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Annick.html">Annick</a></li> <li style="margin-top: 44px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Avah.html">Avah</a></li> <li style="margin-top: 72px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Emilie.html">Emilie</a></li> <li style="margin-top: 47px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Autumn.html">Autumn</a></li> <li style="margin-top: 25px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Edyn.html">Edyn</a></li> <li style="margin-top: 43px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Efe.html">Efe</a></li> <li style="margin-top: 29px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Kie.html">Kie</a></li> <li style="margin-top: 105px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Dougal.html">Dougal</a></li> <li style="margin-top: 58px;"><a 
href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Carolyn.html">Carolyn</a></li> <li style="margin-top: 30px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lana.html">Lana</a></li> <li style="margin-top: 54px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Miryn.html">Miryn</a></li> <li style="margin-top: 32px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Pearce.html">Pearce</a></li> <li style="margin-top: 73px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Ash.html">Ash</a></li> <li style="margin-top: 49px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Zahide.html">Zahide</a></li> <li style="margin-top: 76px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Mathias.html">Mathias</a></li> <li style="margin-top: 53px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Vaila.html">Vaila</a></li> <li style="margin-top: 104px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Abbey.html">Abbey</a></li> <li style="margin-top: 64px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Ayooluwa.html">Ayooluwa</a></li> <li style="margin-top: 117px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Miriam.html">Miriam</a></li> <li style="margin-top: 83px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Levon.html">Levon</a></li> <li style="margin-top: 3px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Colin.html">Colin</a></li> <li style="margin-top: 65px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Noah.html">Noah</a></li> <li style="margin-top: 70px;"><a href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Taegan.html">Taegan</a></li> <li style="margin-top: 122px;"><a 
href="https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Zennon.html">Zennon</a></li> </ul> <script> // http://stackoverflow.com/questions/20423322/simple-setting-off-display-none-block-with-javascript function showHide(id) { var el = document.getElementById(id); if( el && el.style.display == 'none') el.style.display = 'block'; else el.style.display = 'none'; } setTimeout('showHide("overlay");', 2500); </script> </body> </html>
# Starting from a URL read from standard input, repeatedly follow the link
# at a fixed position on each page and print the text of every link taken.
try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, which this snippet was originally written for
    from urllib2 import urlopen

from bs4 import BeautifulSoup

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

LINK_INDEX = 17  # zero-based position of the anchor to follow on each page
HOPS = 7         # number of links to follow


def follow_links(url, hops=HOPS, link_index=LINK_INDEX):
    """Follow the anchor at *link_index* on each page, *hops* times.

    Prints the text of every anchor that is followed and returns the last
    URL reached.

    Raises IndexError if a page has too few anchors, and ValueError if the
    chosen anchor has no ``href``.
    """
    for _ in range(hops):
        response = urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        anchors = BeautifulSoup(html, 'html.parser')('a')
        if len(anchors) <= link_index:
            raise IndexError("%s has only %d links, need at least %d"
                             % (url, len(anchors), link_index + 1))
        link = anchors[link_index]
        print(link.string)
        next_url = link.get('href', None)
        if next_url is None:
            raise ValueError("link %d on %s has no href" % (link_index, url))
        url = next_url
    return url


def main():
    """Read the start URL from standard input and follow the links."""
    follow_links(read_line())


if __name__ == '__main__':
    main()
<html> <head> <title>Welcome to the comments assignment from www.pythonlearn.com</title> </head> <body> <h1>This file contains the sample data for testing</h1> <table border="2"> <tr> <td>Name</td><td>Comments</td> </tr> <tr><td>Leven</td><td><span class="comments">100</span></td></tr> <tr><td>Mahdiya</td><td><span class="comments">97</span></td></tr> <tr><td>Ajayraj</td><td><span class="comments">87</span></td></tr> <tr><td>Lillian</td><td><span class="comments">86</span></td></tr> <tr><td>Aon</td><td><span class="comments">86</span></td></tr> <tr><td>Ruaraidh</td><td><span class="comments">78</span></td></tr> <tr><td>Gursees</td><td><span class="comments">75</span></td></tr> <tr><td>Emmanuel</td><td><span class="comments">74</span></td></tr> <tr><td>Christy</td><td><span class="comments">72</span></td></tr> <tr><td>Annoushka</td><td><span class="comments">72</span></td></tr> <tr><td>Inara</td><td><span class="comments">72</span></td></tr> <tr><td>Caite</td><td><span class="comments">70</span></td></tr> <tr><td>Rosangel</td><td><span class="comments">70</span></td></tr> <tr><td>Iana</td><td><span class="comments">66</span></td></tr> <tr><td>Anise</td><td><span class="comments">66</span></td></tr> <tr><td>Jaosha</td><td><span class="comments">65</span></td></tr> <tr><td>Cadyn</td><td><span class="comments">65</span></td></tr> <tr><td>Edward</td><td><span class="comments">63</span></td></tr> <tr><td>Charlotte</td><td><span class="comments">61</span></td></tr> <tr><td>Sammy</td><td><span class="comments">60</span></td></tr> <tr><td>Zarran</td><td><span class="comments">60</span></td></tr> <tr><td>Rowen</td><td><span class="comments">59</span></td></tr> <tr><td>Stanislaw</td><td><span class="comments">59</span></td></tr> <tr><td>Maighdlin</td><td><span class="comments">57</span></td></tr> <tr><td>Connan</td><td><span class="comments">56</span></td></tr> <tr><td>Warrick</td><td><span class="comments">54</span></td></tr> <tr><td>Diya</td><td><span 
class="comments">52</span></td></tr> <tr><td>Lawson</td><td><span class="comments">52</span></td></tr> <tr><td>Wu</td><td><span class="comments">51</span></td></tr> <tr><td>Irmak</td><td><span class="comments">47</span></td></tr> <tr><td>Emilija</td><td><span class="comments">47</span></td></tr> <tr><td>Kayda</td><td><span class="comments">41</span></td></tr> <tr><td>Ellenor</td><td><span class="comments">41</span></td></tr> <tr><td>Kyra</td><td><span class="comments">41</span></td></tr> <tr><td>Nikita</td><td><span class="comments">38</span></td></tr> <tr><td>Kaelah</td><td><span class="comments">35</span></td></tr> <tr><td>Meko</td><td><span class="comments">32</span></td></tr> <tr><td>Marissa</td><td><span class="comments">31</span></td></tr> <tr><td>Ayat</td><td><span class="comments">24</span></td></tr> <tr><td>Sali</td><td><span class="comments">19</span></td></tr> <tr><td>Hashem</td><td><span class="comments">19</span></td></tr> <tr><td>Tygan</td><td><span class="comments">18</span></td></tr> <tr><td>Rioden</td><td><span class="comments">17</span></td></tr> <tr><td>Cruiz</td><td><span class="comments">16</span></td></tr> <tr><td>Caoilfinn</td><td><span class="comments">13</span></td></tr> <tr><td>Ewen</td><td><span class="comments">8</span></td></tr> <tr><td>Baighley</td><td><span class="comments">7</span></td></tr> <tr><td>Ramone</td><td><span class="comments">1</span></td></tr> <tr><td>Kyran</td><td><span class="comments">1</span></td></tr> <tr><td>Noelani</td><td><span class="comments">1</span></td></tr> </table> </body> </html>
# Read a URL from standard input, then print how many <span> tags the page
# has and the sum of the integers they contain.
try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, which this snippet was originally written for
    from urllib2 import urlopen

from bs4 import BeautifulSoup

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3


def sum_span_numbers(html):
    """Return ``(count, total)`` for the ``<span>`` tags found in *html*.

    ``count`` is the number of ``<span>`` tags and ``total`` the sum of
    their contents converted with int().
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    spans = BeautifulSoup(html, 'html.parser')('span')
    return len(spans), sum(int(span.string) for span in spans)


def main():
    """Fetch the page named on standard input and print count and sum."""
    response = urlopen(read_line())
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    count, total = sum_span_numbers(html)
    print("%d %d" % (count, total))


if __name__ == '__main__':
    main()
相关文章推荐
- Python 多线程学习04
- python模块结构和布局
- 关于Python35爬虫的一些个人想法(我是菜鸟)
- python numpy数组的几种排序方式---by香蕉麦乐迪
- python 常用方法总结
- python 基础(函数,参数)
- python基础(json,socket)
- 检测输入的是几位数
- Python+Selenium自动化
- python 基本操作 多维数组 循环
- python根据百度地图api将地址转成经纬度
- The Zen of Python
- 决定配置完整的python ubuntu14.04 python-dev distribute
- Python 多线程学习03
- Python基础——使用with结构打开多个文件
- leetcode之Palindrome Linked List
- Python的几个开
- python [3.2] urllib的使用
- Python 多线程学习02
- python入门--基本数据类型