大数据——项目流程
程序员文章站
2022-03-05 10:10:05
...
1.数据的预处理阶段
2.数据的入库操作阶段
3.数据的分析阶段
4.数据保存到数据库阶段
5.数据的查询显示阶段
一、数据的预处理阶段
Mapper:
Reduce:
Driver:
二、数据的入库操作阶段(把预处理之后的数据进行入库到hive中)
- 1、创建hive表
(textfile)格式
-- TextFile-format staging table for the raw video data.
-- Table/column names follow the original notes; 表名 is a placeholder.
-- Fix: the original was missing the closing ")" after the column list,
-- so the CREATE TABLE could not parse.
create table 表名(
    videoId  string,
    uploader string,
    age      int)
row format delimited
fields terminated by ":"
collection items terminated by ","
stored as textfile;
(orc)格式
-- ORC-format user table; columnar storage for the analysis stage.
-- Same columns, delimiters and storage format as before — formatting only.
create table video_user_orc(
    uploader string,
    videos   int,
    friends  int)
row format delimited
fields terminated by ","
stored as orc;
- 2、分别导入预处理之后的视频数据到原始表video_ori和导入原始用户表的数据到video_user_ori中
load data local inpath '数据路径' into table 表名;
- 3、从原始表查询数据并插入对应的ORC表中
insert into table orc表 select * from 原始表;
三、 数据的分析阶段(对入库之后的数据进行hivesql查询操作)
- 1、 将查到的数据保存到指定文件中
hive -e 'select * from 库名.表名 where 条件' > 要存储的路径
替换引号为空 :%s/"//g
替换[ 为空 :%s/\[//g （“[”在正则中有特殊含义，必须转义）
替换] 为空 :%s/\]//g
四、数据保存到数据库阶段(把hive分析出的数据保存到hbase中)
- 1、创建hive对应的数据库外部表
-- External table over the tab-separated analysis output (step three).
-- EXTERNAL so that dropping the table leaves the result files in place.
create external table rate(
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by ","
stored as textfile;
- 2、加载第3步的结果数据到外部表中
load data local inpath '数据路径' overwrite into table 外部表名;
- 3、创建hive管理表与HBase进行映射
-- Hive-managed table mapped onto HBase through the HBase storage handler.
-- The declared columns must match "hbase.columns.mapping" one-to-one:
-- ":key" maps to videoId, followed by nine qualifiers in the "data" family.
-- Fixes vs. the original: (1) the closing ")" of the column list was
-- missing; (2) only 3 columns were declared against a 10-entry mapping,
-- which Hive rejects — the full rate-table schema is declared here.
create table hbase_rate(
    videoId   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedId array<string>)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties("hbase.columns.mapping" = ":key,data:uploader,data:age,data:category,data:length,data:views,data:rate,data:ratings,data:comments,data:relatedId")
tblproperties("hbase.table.name" = "表名");
- 4、请写出通过insert overwrite select,插入表的语句
insert into table 表名 select * from 外部表名;
五、数据的查询显示阶段(通过hbaseapi进行查询操作)
- 1、请使用hbaseapi 对hbase_rate表,按照通过startRowKey=1和endRowKey=100进行扫描查询出结果。
/**
 * Scans the HBase table "hbase_rate" over the row-key range
 * [startRowKey=1, endRowKey=100) and prints every cell
 * (row key, column family, qualifier, value).
 *
 * Fixes vs. the original:
 *  - the Scan had no start/stop row, so it scanned the whole table
 *    even though the task asks for the 1..100 range;
 *  - Connection, Table and ResultScanner were never closed (resource
 *    leak) — try-with-resources closes them now;
 *  - the unused (and also leaked) Admin object was removed.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
    try (Connection connection = ConnectionFactory.createConnection(conf);
         Table table = connection.getTable(TableName.valueOf("hbase_rate"))) {
        Scan scan = new Scan();
        // Row keys are stored as strings, so the bounds are string-encoded.
        // withStartRow/withStopRow are the HBase 1.4+/2.x API; on older
        // clients use setStartRow/setStopRow instead.
        scan.withStartRow(Bytes.toBytes("1"));
        scan.withStopRow(Bytes.toBytes("100"));
        try (ResultScanner scanner = table.getScanner(scan)) {
            for (Result result : scanner) {
                for (Cell cell : result.rawCells()) {
                    System.out.println(
                        "RowKey= " + Bytes.toString(CellUtil.cloneRow(cell)) +
                        "\tcloneFamily= " + Bytes.toString(CellUtil.cloneFamily(cell)) +
                        "\tcloneQualifier= " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
                        "\tcloneValue= " + Bytes.toString(CellUtil.cloneValue(cell))
                    );
                }
            }
        }
    }
}
- 2、请使用hbaseapi对hbase_comments表,只查询comments列的值。
/**
 * Scans the HBase table "hbase_comments", fetching only the
 * "comments" qualifier (family "info" in the original notes —
 * NOTE(review): the hbase_rate mapping above uses family "data";
 * confirm the actual family name for this table), and prints every
 * returned cell.
 *
 * Fixes vs. the original:
 *  - Connection, Table and ResultScanner were never closed (resource
 *    leak) — try-with-resources closes them now;
 *  - the unused (and also leaked) Admin object was removed.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");
    try (Connection connection = ConnectionFactory.createConnection(conf);
         Table table = connection.getTable(TableName.valueOf("hbase_comments"))) {
        Scan scan = new Scan();
        // Restrict the scan to the single column we need.
        scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("comments"));
        try (ResultScanner scanner = table.getScanner(scan)) {
            for (Result result : scanner) {
                for (Cell cell : result.rawCells()) {
                    System.out.println(
                        "RowKey= " + Bytes.toString(CellUtil.cloneRow(cell)) +
                        "\tcloneFamily= " + Bytes.toString(CellUtil.cloneFamily(cell)) +
                        "\tcloneQualifier= " + Bytes.toString(CellUtil.cloneQualifier(cell)) +
                        "\tcloneValue= " + Bytes.toString(CellUtil.cloneValue(cell))
                    );
                }
            }
        }
    }
}