Cleaning Eleme (饿了么) Data According to Business Requirements (Part 3)
1. Clarify the business requirement
Use the average rating of all of a merchant's items as that shop's score, and take the 30 merchants with the highest scores. (Valid data excludes items with a monthly sales count below 10 or a rating of 0.)
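Stated in plain Java, the rule is: drop invalid items, average the remaining ratings per merchant, sort, and keep the top 30. The sketch below only illustrates that logic on hypothetical in-memory data (field names follow the schema of the Hive table created later); it is not part of the actual pipeline.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class TopRestaurantsSketch {

    // One cleaned record: which shop it belongs to, the item's rating, and its monthly sales.
    record Item(String restaurantID, String restaurantName, double rating, int saleCount) {}

    public static void main(String[] args) {
        // Hypothetical sample data, purely for illustration.
        List<Item> items = List.of(
                new Item("1001", "Shop A", 4.5, 120),
                new Item("1001", "Shop A", 4.0, 80),
                new Item("1002", "Shop B", 0.0, 50),   // rating 0 -> excluded
                new Item("1003", "Shop C", 4.8, 5));   // monthly sales < 10 -> excluded

        // Shop score = average rating of its valid items.
        Map<String, Double> scoreByShop = items.stream()
                .filter(i -> i.saleCount() >= 10 && i.rating() != 0.0)
                .collect(Collectors.groupingBy(Item::restaurantID,
                        Collectors.averagingDouble(Item::rating)));

        // Highest-scoring shops first, keep the top 30.
        scoreByShop.entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(30)
                .forEach(e -> System.out.println(e.getKey() + "\t" + e.getValue()));
    }
}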
2. Clean the data with MapReduce
package com.yc.elm.utils;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RatingAnalysisJob {

    public static class RatingAnalysisJobMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

        private Text text = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (value == null) {
                return;
            }
            String[] strs = value.toString().split("\\t");
            // Keep only well-formed rows (7 tab-separated fields) whose rating is not 0,
            // whose restaurantID is not the unwanted 152161001 record,
            // and whose monthly sales count is at least 10.
            if (strs.length == 7 && !strs[5].equals("0.0") && !strs[2].equals("152161001")
                    && Integer.parseInt(strs[6]) >= 10) {
                text.set(strs[0] + "\t" + strs[1] + "\t" + strs[2] + "\t" + strs[3] + "\t" + strs[4] + "\t"
                        + strs[5] + "\t" + strs[6]);
                context.write(NullWritable.get(), text);
            }
        }
    }

    public static class RatingAnalysisJobReducer extends Reducer<NullWritable, Text, NullWritable, Text> {

        @Override
        protected void reduce(NullWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Identity reduce: pass every cleaned record through unchanged.
            for (Text value : values) {
                context.write(key, value);
            }
        }
    }

    public static void main(String[] args) {
        try {
            Configuration config = new Configuration();
            Job job = Job.getInstance(config, "RatingAnalysisJob");
            job.setJarByClass(RatingAnalysisJob.class);
            job.setMapperClass(RatingAnalysisJobMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(RatingAnalysisJobReducer.class);
            // Declare the job's final output types explicitly.
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
            // Raw data in, cleaned data out.
            FileInputFormat.setInputPaths(job, new Path("hdfs://master:9000/elm/elm_data.txt"));
            FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/elm/result02/"));
            System.out.println(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Result:
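After the job finishes, the cleaned records land under hdfs://master:9000/elm/result02/. A quick way to spot-check them from Java is to read the output back through the HDFS FileSystem API. This is only a verification sketch; the part file name (part-r-00000) assumes the default single-reducer output naming.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ResultSpotCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumes the same NameNode address used by the job above.
        try (FileSystem fs = FileSystem.get(URI.create("hdfs://master:9000"), conf);
             BufferedReader reader = new BufferedReader(new InputStreamReader(
                     fs.open(new Path("/elm/result02/part-r-00000")), StandardCharsets.UTF_8))) {
            String line;
            int shown = 0;
            // Each line holds the 7 tab-separated fields kept by the Mapper; print the first 10.
            while ((line = reader.readLine()) != null && shown < 10) {
                System.out.println(line);
                shown++;
            }
        }
    }
}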
3. Create the elm_rating table in Hive and load the data
create external table elm_rating(
foodID string,
foodName string,
restaurantID string,
restaurantName string,
price double,
rating float,
saleCount int
)
row format delimited
fields terminated by '\t'
location '/elm/result02/';
Result:
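Since the table is external and points at /elm/result02/, the cleaned records should be queryable immediately. As a sanity check, the row count can be read from Java over Hive JDBC. This sketch assumes HiveServer2 is running on master at the default port 10000 with no authentication, and that hive-jdbc is on the classpath.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveTableCheck {
    public static void main(String[] args) throws Exception {
        // Assumed HiveServer2 endpoint; adjust host/port/database as needed.
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection("jdbc:hive2://master:10000/default", "", "");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("select count(*) from elm_rating")) {
            if (rs.next()) {
                // Number of cleaned records picked up by the external table.
                System.out.println("rows in elm_rating: " + rs.getLong(1));
            }
        }
    }
}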
4. Query the top 30 qualifying merchants and store them in the rating table
create table rating(
restaurantID string,
restaurantName string,
rating double
)
row format delimited
fields terminated by ','
location '/elm/rating/';

insert into rating
select a.restaurantID id, a.restaurantName name, a.rating rating
from (
    select restaurantID, restaurantName, avg(rating) rating
    from elm_rating
    group by restaurantID, restaurantName
) a
order by rating desc
limit 30;
Result:
Note: when creating the rating table, the fields must be delimited by ','; otherwise the later data migration with Sqoop will run into problems.