Python Map Reduce Program

Mapper.py

#!/usr/bin/env python

import sys

def map_words(lines):
    """Yield one ``word<TAB>1`` record per whitespace-separated word.

    Args:
        lines: Iterable of text lines (for example ``sys.stdin``).

    Yields:
        A string made of the word, a tab character and the literal count 1,
        one per word, in the order the words appear in the input.
    """
    for line in lines:
        # The emit loop has to live inside the per-line loop; when it sits
        # after the loop only the words of the final line are emitted (and
        # empty input fails because no word list was ever built).
        # split() with no arguments already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        for word in line.split():
            yield '%s\t%s' % (word, 1)


def main():
    """Read text from STDIN and write the mapper records to STDOUT."""
    for record in map_words(sys.stdin):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()

Reducer.py

#!/usr/bin/env python

from operator import itemgetter

import sys

def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    Each input line is expected to look like ``word<TAB>count``. Only
    adjacent records with an identical word are merged, so the input must
    already be sorted by word (e.g. piped through ``sort -k1,1``).

    Records whose count is not an integer, and lines that contain no tab
    separator at all, are skipped.

    Args:
        lines: Iterable of text lines (for example ``sys.stdin``).

    Yields:
        A string made of the word, a tab character and the summed count,
        one per run of equal words, in input order.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # partition() never raises: a line without a tab leaves ``count``
        # empty, which int() rejects below, so the line is skipped instead
        # of crashing on tuple unpacking.
        word, _, count = line.strip().partition('\t')

        # convert count (currently a string) to int
        try:
            count = int(count)
        except ValueError:
            # Malformed record: ignore it and keep the running total intact.
            continue

        if word == current_word:
            current_count += count
            continue

        # A new word starts: flush the finished group first.
        if current_word is not None:
            yield '%s\t%s' % (current_word, current_count)
        current_word = word
        current_count = count

    # Flush the last group. Testing against None (rather than comparing with
    # the last word read) avoids emitting a bogus record on empty input and
    # avoids losing the final group when the last line was malformed.
    if current_word is not None:
        yield '%s\t%s' % (current_word, current_count)


def main():
    """Read sorted mapper records from STDIN and write totals to STDOUT."""
    for record in reduce_counts(sys.stdin):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()

 

 

 

How to run –

  • Unix
    • echo "foo foo quux labs foo bar quux" | ~/Mapper.py
    • echo "foo foo quux labs foo bar quux" | ~/Mapper.py | sort -k1,1 | ~/Reducer.py
  • Hadoop
    • hadoop dfs -mkdir -p /tmp/synopsys/input
    • hadoop dfs -copyFromLocal ~/example/WordCount1/file* /tmp/synopsys/input
    • hadoop dfs -ls  /tmp/synopsys/input/
    • /home/woir/hadoop-2.6.0/bin/hadoop jar /home/woir/hadoop-2.6.0/share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar \
      -file /home/woir/Mapper.py -mapper /home/woir/Mapper.py \
      -file /home/woir/Reducer.py -reducer /home/woir/Reducer.py \
      -input /tmp/synopsys/input/* -output /user/amar/gutenberg-output1