Entire Column Aggregations:
sql:
select sum(sal) from emp;
scala> val emp = sc.textFile("/user/cloudera/spLab/emp")
emp: org.apache.spark.rdd.RDD[String] = /user/cloudera/spLab/emp MapPartitionsRDD[1] at textFile at <console>:27
scala> val sals = emp.map(x => x.split(",")(2).toInt)
sals: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[2] at map at <console>:29
scala> sals.collect
res0: Array[Int] = Array(40000, 50000, 50000, 90000, 10000, 40000, 80000, 50000)
scala> sals.sum
res1: Double = 410000.0
scala> sals.reduce((a,b) => a + b)
res2: Int = 410000
scala>
---> reduce is computed on the cluster and returns the element type (Int here).
---> sum is also executed as a distributed job, but it always returns the result as a Double.
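---> the same total can also be written with fold, which is like reduce but takes an explicit zero value. A quick sketch against the same emp data as above (the val name is illustrative):

scala> val totViaFold = sals.fold(0)(_ + _)
totViaFold: Int = 410000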
sql:
select sum(sal), count(*), avg(sal), max(sal), min(sal) from emp;
scala> val tot = sals.sum
tot: Double = 410000.0
scala> val cnt = sals.count
cnt: Long = 8
scala> val avg = sals.mean
avg: Double = 51250.0
scala> val max = sals.max
max: Int = 90000
scala> val min = sals.min
min: Int = 10000
scala> val m = sals.reduce(Math.max(_,_))
m: Int = 90000
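---> min can be written the same way with reduce (a quick sketch):

scala> val n = sals.reduce(Math.min(_,_))
n: Int = 10000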
scala>
scala> val res = (tot,cnt,avg,max,min)
res: (Double, Long, Double, Int, Int) = (410000.0,8,51250.0,90000,10000)
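---> each of the five actions above launches a separate job over sals. A single-pass alternative is sketched below using aggregate; the variable names are illustrative, not from the original session:

val zero = (0L, 0L, Int.MinValue, Int.MaxValue)   // (sum, count, max, min)
val (s, c, mx, mn) = sals.aggregate(zero)(
  (acc, x) => (acc._1 + x, acc._2 + 1, Math.max(acc._3, x), Math.min(acc._4, x)),    // fold one element into a partition's accumulator
  (a, b)   => (a._1 + b._1, a._2 + b._2, Math.max(a._3, b._3), Math.min(a._4, b._4)) // merge accumulators across partitions
)
val avgAll = s.toDouble / c                       // 51250.0 for this data

Spark also ships a built-in single-pass helper: sals.stats() returns a StatCounter carrying count, mean, sum, max, and min.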
scala> tot
res3: Double = 410000.0
scala>
----------------------------------------