
Processing a complex log file with Spark

1593136280858|{"cm":{"ln":"-55.0","sv":"V2.9.6","os":"8.0.4","g":"aaa@qq.com","mid":"489","nw":"3G","l":"es","vc":"4","hw":"640*960","ar":"MX","uid":"489","t":"1593123253541","la":"5.2","md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"},
"ap":"app",
"et":[{"ett":"1593050051366","en":"loading","kv":{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},{"ett":"1593108791764","en":"ad","kv":{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{"ett":"1593111271266","en":"notification","kv":{"ap_time":"1593097087883","action":"1","type":"1","content":""}},{"ett":"1593066033562","en":"active_background","kv":{"active_source":"3"}},{"ett":"1593135644347","en":"comment","kv":{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}]}
1593136280858|{"cm":{"ln":"-114.9","sv":"V2.7.8","os":"8.0.4","g":"aaa@qq.com","mid":"490","nw":"3G","l":"pt","vc":"8","hw":"640*1136","ar":"MX","uid":"490","t":"1593121224789","la":"-44.4","md":"Huawei-8","vn":"1.0.1","ba":"Huawei","sr":"O"},
"ap":"app",
"et":[{"ett":"1593063223807","en":"loading","kv":{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"}},{"ett":"1593095105466","en":"ad","kv":{"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"}},{"ett":"1593051718208","en":"notification","kv":{"ap_time":"1593095336265","action":"2","type":"3","content":""}},{"ett":"1593100021275","en":"comment","kv":{"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}},{"ett":"1593105344120","en":"praise","kv":{"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8}}]}

Two sample records from the log file are shown above (partial fields). Each record is a timestamp, a '|' separator, and a JSON body.

All of the following steps are run in spark-shell.

1. Load the log file and convert it to a DataFrame:

scala> val fileRDD = sc.textFile("hdfs://hadoop:9000/app/data/op.log")
scala> // each record is "timestamp|json", so keep only the JSON body after the first '|'
scala> val opJsonStr = fileRDD.map(line => line.substring(line.indexOf('|') + 1))
scala> val opJsonDF = opJsonStr.toDF
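Calling toDF on an RDD[String] names its single column value, which is why the later steps reference $"value". A quick schema check should show:

scala> opJsonDF.printSchema
root
 |-- value: string (nullable = true)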

2. ap is already a standalone field, so split each record's JSON into three parts (cm, ap, et):

scala> val opJsonObj = opJsonDF.select(get_json_object($"value","$.cm").alias("cm"),get_json_object($"value","$.ap").alias("ap"),get_json_object($"value","$.et").alias("et"))

show:
(screenshot of opJsonObj.show() output omitted)
3. First, flatten the cm field into individual columns with get_json_object:

scala> val opJsonObj1 = opJsonObj.select($"ap",get_json_object($"cm","$.ln").alias("ln"),get_json_object($"cm","$.sv").alias("sv"),get_json_object($"cm","$.os").alias("os"),get_json_object($"cm","$.g").alias("g")
,get_json_object($"cm","$.mid").alias("mid"),get_json_object($"cm","$.nw").alias("nw"),get_json_object($"cm","$.l").alias("l"),get_json_object($"cm","$.vc").alias("vc"),get_json_object($"cm","$.hw").alias("hw")
,get_json_object($"cm","$.ar").alias("ar"),get_json_object($"cm","$.uid").alias("uid"),get_json_object($"cm","$.t").alias("t"),get_json_object($"cm","$.la").alias("la")
,get_json_object($"cm","$.md").alias("md"),get_json_object($"cm","$.vn").alias("vn"),get_json_object($"cm","$.ba").alias("ba"),get_json_object($"cm","$.sr").alias("sr"),$"et")

(screenshot of opJsonObj1.show() output omitted)
4. Import the required packages (note: get_json_object and from_json come from org.apache.spark.sql.functions, so in practice these imports must be run before the steps above):

scala>  import spark.implicits._
import spark.implicits._

scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._

scala>  import org.apache.spark.sql._
import org.apache.spark.sql._

5. Query: parse the et JSON string into an array of structs with from_json:

scala> val opJsonObj2  =  opJsonObj1.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",from_json($"et", 
ArrayType(StructType(StructField("ett",StringType)::StructField("en",StringType)::StructField("kv",StringType)::Nil))).alias("events"))
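Note that kv is left as StringType rather than a nested struct: its keys differ per event type (loading, ad, notification, comment, ...), so it is easier to parse per type later. The parsed column's schema should look like this:

scala> opJsonObj2.select($"events").printSchema
root
 |-- events: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ett: string (nullable = true)
 |    |    |-- en: string (nullable = true)
 |    |    |-- kv: string (nullable = true)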

(screenshot of opJsonObj2.show() output omitted)
6. Use explode to expand the events array into one row per event:

scala> val opJsonObj3 = opJsonObj2.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr"
,explode($"events").alias("events"))

(screenshot of opJsonObj3.show() output omitted)
7. After exploding, split the event struct into separate columns (the original screenshot is omitted; a sketch of this step follows)
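A minimal sketch of what this step presumably does, flattening the exploded events struct into the top-level ett/en/kv columns that the opJsonObj4 DataFrame below relies on:

scala> // assumption: opJsonObj4 pulls the struct fields out as plain columns
scala> val opJsonObj4 = opJsonObj3.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"events.ett".alias("ett"),$"events.en".alias("en"),$"events.kv".alias("kv"))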

Extra: cast the vc column from String to Integer

scala> // withColumn returns a new DataFrame, so assign the result
scala> val opJsonObj5 = opJsonObj4.withColumn("vc",$"vc".cast(DataTypes.IntegerType))
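Since kv is still a JSON string, each event type can be parsed further with the same get_json_object approach. An illustrative example (the column choice is arbitrary), extracting loading_time from loading events:

scala> val loadingDF = opJsonObj4.filter($"en" === "loading").select($"mid",$"ett",get_json_object($"kv","$.loading_time").cast(DataTypes.IntegerType).alias("loading_time"))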