您的位置：首页 > 编程语言 > Python开发

大数据学习系列----python写MapReduce

2017-05-25 00:00 525 查看

摘要：MapReduce 程序通常用 Java 编写，也可以借助 Hadoop Streaming 用 Python 编写。

功能实现

统计文本文件中所有单词出现的频率。

文本文件：

foo foo quux labs foo bar quux abc bar see you by test welcome test
abc labs foo me python hadoop ab ac bc bec python

Map代码

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import sys

def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    Args:
        lines: iterable of text lines (for example ``sys.stdin``).

    Yields:
        ``(word, 1)`` tuples, in the order the words appear in the input.
    """
    for line in lines:
        # split() without arguments ignores leading/trailing whitespace and
        # collapses runs of blanks, so empty lines simply produce no words.
        for word in line.split():
            yield word, 1


def main():
    """Read text from stdin and print one ``word<TAB>1`` record per word."""
    for word, count in map_words(sys.stdin):
        # Tab-separated "word<TAB>1" records are the input format the reducer
        # parses. The parenthesised print works on Python 2 and Python 3.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    main()

Reduce代码

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from operator import itemgetter
import sys

def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    The input must already be sorted by word (e.g. ``sort -k1,1`` locally),
    because only adjacent records are merged.

    Args:
        lines: iterable of ``word<TAB>count`` text lines, i.e. the mapper's
            output (for example ``sys.stdin``).

    Yields:
        ``(word, total)`` tuples, one per run of identical words.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # Drop surrounding whitespace, including the trailing newline.
        line = line.strip()

        # Split on the first tab only. partition() never raises, so a line
        # without a tab (blank or malformed) is skipped instead of crashing.
        word, sep, count = line.partition('\t')
        if not sep:
            continue

        # Convert the count from text to an integer; ignore the record when
        # the count is not a number.
        try:
            count = int(count)
        except ValueError:
            continue

        if word == current_word:
            current_count += count
        else:
            # A new word starts: flush the finished run first.
            if current_word is not None:
                yield current_word, current_count
            current_word = word
            current_count = count

    # Flush the last run. Testing against None (rather than against the last
    # parsed word) avoids emitting "None 0" for empty input and avoids losing
    # the final word when the last input line was skipped.
    if current_word is not None:
        yield current_word, current_count


def main():
    """Read mapper output from stdin and print ``word<TAB>total`` records."""
    for word, total in reduce_counts(sys.stdin):
        # The parenthesised print works on Python 2 and Python 3.
        print('%s\t%s' % (word, total))


if __name__ == '__main__':
    main()

本地测试

[root@wx ~]# cd /root/hadooptest/
[root@wx hadooptest]# cat input.txt | ./mapper.py
foo 1
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
abc 1
bar 1
see 1
you 1
by 1
test 1
welcome 1
test 1
abc 1
labs 1
foo 1
me 1
python 1
hadoop 1
ab 1
ac 1
bc 1
bec 1
python 1

[root@wx hadooptest]# cat input.txt | ./mapper.py | sort -k1,1 | ./reducer.py
ab 1
abc 2
ac 1
bar 2
bc 1
bec 1
by 1
foo 4
hadoop 1
labs 2
me 1
python 2
quux 2
see 1
test 2
welcome 1
you 1

Hadoop平台运行

HDFS上创建存储目录

/usr/local/hadoop-2.6.4/bin/hadoop fs -mkdir -p /user/root/word

文件上传到HDFS中

/usr/local/hadoop-2.6.4/bin/hadoop fs -put /root/hadooptest/input.txt /user/root/word

查看目录下文件

/usr/local/hadoop-2.6.4/bin/hadoop fs -ls /user/root/word
#结果：
Found 1 items
-rw-r--r-- 2 root supergroup 118 2016-03-22 13:36 /user/root/word/input.txt

执行MapReduce任务，输出目录定为/output/word（该目录在任务运行前不能已存在）

# Run the Hadoop Streaming job: mapper.py and reducer.py (passed via -files)
# are the map and reduce commands; input is /user/root/word and results go
# to /output/word.
/usr/local/hadoop-2.6.4/bin/hadoop jar \
  /usr/local/hadoop-2.6.4/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar \
  -files 'mapper.py,reducer.py' \
  -input /user/root/word \
  -output /output/word \
  -mapper ./mapper.py \
  -reducer ./reducer.py

查看生成结果文件

[root@wx hadooptest]# /usr/local/hadoop-2.6.4/bin/hadoop fs -ls /output/word
Found 2 items
-rw-r--r-- 2 root supergroup 0 2016-03-22 13:47 /output/word/_SUCCESS
-rw-r--r-- 2 root supergroup 110 2016-03-22 13:47 /output/word/part-00000

查看结果数据

[root@wx hadooptest]# /usr/local/hadoop-2.6.4/bin/hadoop fs -cat /output/word/part-00000
ab 1
abc 2
ac 1
bar 2
bc 1
bec 1
by 1
foo 4
hadoop 1
labs 2
me 1
python 2
quux 2
see 1
test 2
welcome 1
you 1

或者使用开源框架：MRJob

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 大数据 Hadoop

相关文章推荐

新的分享

章节导航