Spark实战(5) DataFrame基础之GroupBy和Aggregate
程序员文章站
2022-06-13 22:06:05
...
groupBy()
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session for this example; the app is named 'aggs'.
spark = SparkSession.builder.appName('aggs').getOrCreate()

# Load the sales CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv('sales_info.csv', header=True, inferSchema=True)
df.printSchema()
df.show()

# groupby() on its own only builds a grouped object; no result is produced
# until an aggregation such as mean() or sum() is applied to it.
df.groupby("Company")

# Aggregate per company, then display the resulting DataFrames.
df.groupby("Company").mean().show()
df.groupby("Company").sum().show()
Aggregation
# General form: agg() takes a {column name: aggregate function name} mapping.
# Called directly on the DataFrame, it aggregates over all rows.
df.agg({'Sales': 'sum'}).show()
df.agg({'Sales': 'max'}).show()  # reference line: max of Sales over the whole DataFrame

# The same mapping can be applied to a grouped object instead.
group_data = df.groupBy("Company")
# Same aggregation as the reference line, but evaluated per Company group
# rather than over the whole DataFrame.
group_data.agg({'Sales': 'max'}).show()
Aggregation Function
from pyspark.sql.functions import countDistinct, avg, stddev
# Aggregate functions from pyspark.sql.functions are column expressions,
# so they can be passed straight to select().
# select() only builds a new DataFrame; show() is needed to display it.
df.select(countDistinct('Sales')).show()
# alias() is a method of the column expression returned by countDistinct(),
# so it must be chained with a dot (a comma would pass an undefined bare
# name `alias` as a second argument and raise NameError). It renames the
# output column.
df.select(countDistinct('Sales').alias('Count of Distinct Sales')).show()
# Formatting an aggregated value for display.
from pyspark.sql.functions import format_number

# Give the standard deviation of Sales a short column name first ...
sales_std = df.select(stddev('Sales').alias('std'))
# ... then format that column to two decimal places.
sales_std.select(format_number('std', 2)).show()

# Sorting rows by the Sales column.
df.orderBy("Sales").show()  # ascending order (the default)
df.orderBy(df.Sales.desc()).show()  # descending order