您的位置:首页 > 其它

给爬到的网址链接加入“朋友值”

2016-11-18 16:34 337 查看
def compute_ranks(graph, d=0.8, numloops=10):
    """Compute an iterative link-based rank for every page in ``graph``.

    ``graph`` maps each page URL to the list of URLs that page links to.
    Every page starts with rank ``1 / npages``.  On each of the ``numloops``
    passes a page's new rank is ``(1 - d) / npages`` plus, for every node
    whose outlink list contains the page, ``d * rank(node) / len(outlinks)``.

    Parameters:
        graph: dict of ``url -> list of outgoing urls``.
        d: damping factor (defaults to the original hard-coded 0.8).
        numloops: number of relaxation passes (defaults to the original 10).

    Returns:
        dict of ``url -> rank`` for exactly the keys of ``graph``; an empty
        graph yields an empty dict.
    """
    # The original script dumps the graph for inspection; the call form works
    # on both Python 2 and Python 3.
    print(graph)

    npages = len(graph)
    ranks = {}
    for page in graph:
        ranks[page] = 1.0 / npages

    for _ in range(numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                outlinks = graph[node]
                # A node with no outlinks never contains ``page``, so the
                # division below cannot be reached with len(outlinks) == 0.
                if page in outlinks:
                    newrank = newrank + d * (ranks[node] / len(outlinks))
            newranks[page] = newrank
        # Swap only after the full pass so every page sees the previous
        # iteration's ranks.
        ranks = newranks
    return ranks

# Offline stand-in for the web: maps each page URL to its raw HTML text.
# get_page() reads from this dict instead of performing a network request.
# The HTML strings are runtime data and are kept exactly as published,
# including their spelling.
cache = {
'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>

For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>

""",
'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.

</body>
</html>

""",
'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>

</body>
</html>

""",
'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>

<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>

</body>
</html>

""",
'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>

<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>

</body>
</html>

""",
'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>

<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>

</body>
</html>

""",
}

def crawl_web(seed):
    """Crawl every page reachable from ``seed`` and return ``(index, graph)``.

    Parameters:
        seed: URL of the first page to fetch.

    Returns:
        A tuple ``(index, graph)`` where ``index`` maps each whitespace-
        separated word to the list of URLs it was seen on, and ``graph`` maps
        each crawled URL to the list of URLs that page links to.

    Pages for which ``get_page`` returns ``None`` are marked as visited but
    contribute nothing to ``index`` or ``graph``.
    """
    tocrawl = [seed]
    crawled = set()  # membership test only, so a set avoids the O(n) list scan
    graph = {}  # <url>, [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page in crawled:
            continue
        crawled.add(page)

        content = get_page(page)
        if content is None:
            # Unknown page: nothing to index and no links to follow.
            continue

        add_page_to_index(index, page, content)
        outlinks = get_all_links(content)
        graph[page] = outlinks
        union(tocrawl, outlinks)
    return index, graph

def get_page(url):
    """Return the cached HTML text for ``url``, or ``None`` if it is unknown.

    The lookup is served from the module-level ``cache`` dict; no network
    access happens here.
    """
    # The published listing had lost the ``[url]`` subscript, which would
    # have returned the whole cache dict instead of one page.
    return cache.get(url)

def get_next_target(page):
    """Find the first ``<a href="...">`` target in ``page``.

    Parameters:
        page: HTML text to scan.

    Returns:
        ``(url, end_quote)`` where ``url`` is the text between the first pair
        of double quotes at or after the first ``<a href=`` and ``end_quote``
        is the index of the closing quote in ``page``.  Returns ``(None, 0)``
        when there is no ``<a href=`` or when either quote is missing.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0

    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # Unquoted href with no quote after it: no well-formed link remains.
        return None, 0

    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated attribute value; slicing with -1 would silently
        # produce a truncated URL and a negative end position.
        return None, 0

    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    """Return every non-empty link target found in ``page``, in order.

    Parameters:
        page: HTML text to scan.

    Returns:
        List of URL strings, as extracted by repeated calls to
        ``get_next_target`` on the remaining text.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        # Stop when no target is found, or when the reported end position
        # would not move the scan forward (guards against looping forever).
        if url is None or endpos <= 0:
            break
        # An empty href="" is skipped rather than ending the scan, so links
        # that follow it are still collected.
        if url:
            links.append(url)
        page = page[endpos:]
    return links

def union(a, b):
    """Append to ``a``, in place, each element of ``b`` not already in ``a``.

    Parameters:
        a: list that is extended in place.
        b: iterable of candidate elements.

    Returns:
        None; the result is the mutated ``a``.  Duplicates inside ``b`` are
        added only once because membership is re-checked after each append.
    """
    for element in b:
        if element not in a:
            a.append(element)

def add_page_to_index(index, url, content):
    """Register every whitespace-separated word of ``content`` under ``url``.

    Parameters:
        index: dict of ``word -> list of urls``; updated in place.
        url: URL of the page the content came from.
        content: page text, or ``None`` when the page could not be fetched.
    """
    if content is None:
        # get_page() returns None for unknown pages; nothing to index then.
        return
    for word in content.split():
        add_to_index(index, word, url)

def add_to_index(index, keyword, url):
    """Record that ``keyword`` occurs on ``url``.

    Parameters:
        index: dict of ``keyword -> list of urls``; updated in place.
        keyword: word to file the URL under.
        url: URL to append; repeated calls append repeated entries.
    """
    # setdefault creates the list on first sight of the keyword and returns
    # the existing list otherwise.
    index.setdefault(keyword, []).append(url)

def lookup(index, keyword):
    """Return the list of URLs stored for ``keyword``, or ``None`` if absent.

    Parameters:
        index: dict of ``keyword -> list of urls``.
        keyword: word to look up.
    """
    return index.get(keyword)

# Crawl the cached mini-web from its index page, rank the crawled pages and
# print the resulting url -> rank mapping.
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)
print(ranks)
最后这个视频加入了一个算法,给搜索到的每个网址计算一个值,代表它的“朋友值”。看了之后还不是很理解。

但是本人感觉就是:谁在别人的页面里出现得越多,朋友值就越大,也就越容易被搜索到。

全部代码如上,等以后想翻看再来看看。。。

再贴一次视频地址:[url=https://cn.udacity.com/course/intro-to-computer-science--cs101]计算机科学导论[/url]


有些没有中文字幕,比较坑爹。

正好又看到一个人的简书笔记:[url=http://www.jianshu.com/p/229936a65a35]简书[/url]

可以参考一下。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: