Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Tuesday 2 May 2017

Demo of Pig Grouping Aggregations

Demo of grouping aggregations on structured

data.
-----------------------------------


[cloudera@quickstart ~]$ hadoop fs -mkdir piglab
[cloudera@quickstart ~]$ hadoop fs -

copyFromLocal emp piglab
[cloudera@quickstart ~]$ hadoop fs -ls piglab
Found 1 items
-rw-r--r--   1 cloudera cloudera        158

2017-05-02 18:55 piglab/emp
[cloudera@quickstart ~]$

[cloudera@quickstart ~]$ hadoop fs -cat

piglab/emp
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
[cloudera@quickstart ~]$

[cloudera@quickstart ~]$ pig
grunt> emp = load 'piglab/emp' using PigStorage

(',')
>>      as (id:int, name:chararray, sal:int,
>>   sex:chararray, dno:int);
grunt>dump emp
(101,aaaa,40000,m,11)
(102,bbbbbb,50000,f,12)
(103,cccc,50000,m,12)
(104,dd,90000,f,13)
(105,ee,10000,m,12)
(106,dkd,40000,m,12)
(107,sdkfj,80000,f,13)
(108,iiii,50000,m,11)

required task: sex based sum aggr on sal.
sql--> select sex,sum(sal) from emp
          group by sex;

grunt> e = foreach emp generate sex, sal;
grunt> describe e;
e: {sex: chararray,sal: int}
grunt> grp = group e by sex;
grunt> describe grp
grp: {group: chararray,e: {(sex: chararray,sal:

int)}}
grunt> dump grp

(f,{(f,80000),(f,90000),(f,50000)})
(m,{(m,50000),(m,40000),(m,10000),(m,50000),

(m,40000)})

grunt> res1 = foreach grp generate
>>           group as sex, SUM(e.sal) as tot;
grunt> dump res1

(f,220000)
(m,190000)

sql--> select sex, avg(sal) from emp group by sex;
grunt> res2 = foreach grp generate
     group as sex, AVG(e.sal) as avg;


sql--> select sex, max(sal) from emp group by sex;

grunt> res3 = foreach grp generate
    group as sex, MAX(e.sal) as max;

sql--> select sex, min(sal) from emp group by sex;

grunt> res4 = foreach grp generate
     group as sex, MIN(e.sal) as min;

sql--> select sex, count(*) from emp group by sex;

grunt> res5 = foreach grp generate
    group as sex, COUNT(e) as cnt;

sql--> select sex, sum(sal) as tot,
       count(*) as cnt, avg(sal) as avg,
        max(sal) as max, min(sal) as min
    from emp group by sex;

grunt> res6 = foreach grp generate
   group as sex, SUM(e.sal) as tot,
       COUNT(e) as cnt, AVG(e.sal) as avg,
        MAX(e.sal) as max,
          MIN(e.sal) as min;
grunt> dump res6
(f,220000,3,73333.33333333333,90000,50000)
(m,190000,5,38000.0,50000,10000)

grunt> store res6 into 'piglab/results1';

grunt> ls piglab
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/emp<r 1> 158
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1 <dir>
grunt> ls piglab/results1
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1/_SUCCESS<r 1> 0
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1/part-r-00000<r 1> 72
grunt> cat piglab/results1/part-r-00000
f 220000 3 73333.33333333333 90000 50000
m 190000 5 38000.0 50000 10000
grunt>

grunt> store res6 into 'piglab/results2'
>>    using PigStorage(',');

grunt> cat piglab/results2/part-r-00000
f,220000,3,73333.33333333333,90000,50000
m,190000,5,38000.0,50000,10000
grunt>























1 comment: