欢迎您访问程序员文章站！本站旨在为大家提供和分享程序员计算机编程知识！
您现在的位置是: 首页

hadoop streaming - example

程序员文章站 2024-03-16 09:36:28
...
#!/bin/sh
#
# Submit a Hadoop Streaming job that reads SequenceFile input from
# /online/visit_urls/<arg1>*/<arg2>* and writes SequenceFile output to
# /user/rendonghui/itg/time_online/<arg1>_<arg2>.
#
# Usage: sh <this-script> <arg1> <arg2>
#   arg1  prefix of the first-level directory under /online/visit_urls
#   arg2  prefix of the second-level directory below it
#   NOTE(review): the two arguments look like date/hour style partitions --
#   confirm against the actual HDFS layout.
#
# map.py, red.py, host_list and transtime.py are taken from the CURRENT
# WORKING DIRECTORY (not from the directory containing this script), so run
# the script from the directory that holds them.
#
# Exit status: 2 on bad usage, 1 when a required local file is missing,
# otherwise the exit status of the `hadoop jar` command.

if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  # Without both arguments the paths below would degrade to
  # /online/visit_urls/*/* and .../time_online/_ , and the latter would be
  # deleted; refuse to continue instead.
  printf 'Usage: %s <arg1> <arg2>\n' "${0##*/}" >&2
  exit 2
fi

EXECPATH=$(pwd)
HPHOME=/home/hadoop/hadoop-0.20.2-cdh3u0/bin/
JAR_PACKAGE=/home/hadoop/hadoop-0.20.2-cdh3u0/contrib/streaming/hadoop-streaming-0.20.2-cdh3u0.jar

# The '*' wildcards are meant for Hadoop, not for the local shell: every use
# of IN_PATH below is double-quoted so the pattern is passed through verbatim.
IN_PATH="/online/visit_urls/${1}*/${2}*"
OUT_PATH="/user/rendonghui/itg/time_online/${1}_${2}"
printf '%s, %s\n' "$IN_PATH" "$OUT_PATH"

MAP_FILE=$EXECPATH/map.py
RED_FILE=$EXECPATH/red.py
FILE1=$EXECPATH/host_list
FILE2=$EXECPATH/transtime.py
printf '%s, %s\n' "$MAP_FILE" "$RED_FILE"

# Fail early, before the previous output is removed, if anything that has to
# be shipped with the job is missing locally.
for required in "$MAP_FILE" "$RED_FILE" "$FILE1" "$FILE2"; do
  if [ ! -f "$required" ]; then
    printf 'error: required file not found: %s\n' "$required" >&2
    exit 1
  fi
done

# Remove any previous output directory. A failure here (typically because the
# directory does not exist yet) is intentionally not fatal.
"${HPHOME}hadoop" fs -rmr "$OUT_PATH" ||
  printf 'note: could not remove %s (it may not exist yet)\n' "$OUT_PATH" >&2

# Generic -D options must come before the streaming-specific options.
# The map output key is made of the first 2 fields; records are partitioned
# on the first field and sorted with "-k1,1n -k2" (field 1 numerically, then
# from field 2 onward).
# Queue name and job name are passed with -D rather than the deprecated
# -jobconf switch.
"${HPHOME}hadoop" jar "$JAR_PACKAGE" \
  -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
  -D stream.num.map.output.key.fields=2 \
  -D num.key.fields.for.partition=1 \
  -D mapred.text.key.comparator.options="-k1,1n -k2" \
  -D mapred.job.queue.name=bi \
  -D mapred.job.name="${1}_${2}_time_online" \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
  -inputformat org.apache.hadoop.mapred.SequenceFileAsTextInputFormat \
  -outputformat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  -numReduceTasks 100 \
  -input "$IN_PATH" \
  -output "$OUT_PATH" \
  -file "$FILE1" \
  -file "$FILE2" \
  -file "$MAP_FILE" \
  -mapper "$MAP_FILE" \
  -file "$RED_FILE" \
  -reducer "$RED_FILE"

# Disabled follow-up steps kept from the original script for reference:
#sh job_stat.sh $2
#${HPHOME}hadoop fs -text $OUT_PATH/p*|python ana.py >output/time_$2.csv



转载于:https://my.oschina.net/u/1778317/blog/269657

上一篇: Gson的使用-4

下一篇: