Demo of grouping aggregations on structured
data.
-----------------------------------
[cloudera@quickstart ~]$ hadoop fs -mkdir piglab
[cloudera@quickstart ~]$ hadoop fs -
copyFromLocal emp piglab
[cloudera@quickstart ~]$ hadoop fs -ls piglab
Found 1 items
-rw-r--r-- 1 cloudera cloudera 158
2017-05-02 18:55 piglab/emp
[cloudera@quickstart ~]$
[cloudera@quickstart ~]$ hadoop fs -cat
piglab/emp
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
[cloudera@quickstart ~]$
[cloudera@quickstart ~]$ pig
grunt> emp = load 'piglab/emp' using PigStorage
(',')
>> as (id:int, name:chararray, sal:int,
>> sex:chararray, dno:int);
grunt>dump emp
(101,aaaa,40000,m,11)
(102,bbbbbb,50000,f,12)
(103,cccc,50000,m,12)
(104,dd,90000,f,13)
(105,ee,10000,m,12)
(106,dkd,40000,m,12)
(107,sdkfj,80000,f,13)
(108,iiii,50000,m,11)
required task: sex based sum aggr on sal.
sql--> select sex,sum(sal) from emp
group by sex;
grunt> e = foreach emp generate sex, sal;
grunt> describe e;
e: {sex: chararray,sal: int}
grunt> grp = group e by sex;
grunt> describe grp
grp: {group: chararray,e: {(sex: chararray,sal:
int)}}
grunt> dump grp
(f,{(f,80000),(f,90000),(f,50000)})
(m,{(m,50000),(m,40000),(m,10000),(m,50000),
(m,40000)})
grunt> res1 = foreach grp generate
>> group as sex, SUM(e.sal) as tot;
grunt> dump res1
(f,220000)
(m,190000)
sql--> select sex, avg(sal) from emp group by sex;
grunt> res2 = foreach grp generate
group as sex, AVG(e.sal) as avg;
sql--> select sex, max(sal) from emp group by sex;
grunt> res3 = foreach grp generate
group as sex, MAX(e.sal) as max;
sql--> select sex, min(sal) from emp group by sex;
grunt> res4 = foreach grp generate
group as sex, MIN(e.sal) as min;
sql--> select sex, count(*) from emp group by sex;
grunt> res5 = foreach grp generate
group as sex, COUNT(e) as cnt;
sql--> select sex, sum(sal) as tot,
count(*) as cnt, avg(sal) as avg,
max(sal) as max, min(sal) as min
from emp group by sex;
grunt> res6 = foreach grp generate
group as sex, SUM(e.sal) as tot,
COUNT(e) as cnt, AVG(e.sal) as avg,
MAX(e.sal) as max,
MIN(e.sal) as min;
grunt> dump res6
(f,220000,3,73333.33333333333,90000,50000)
(m,190000,5,38000.0,50000,10000)
grunt> store res6 into 'piglab/results1';
grunt> ls piglab
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/emp<r 1> 158
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1 <dir>
grunt> ls piglab/results1
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1/_SUCCESS<r 1> 0
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1/part-r-00000<r 1> 72
grunt> cat piglab/results1/part-r-00000
f 220000 3 73333.33333333333 90000 50000
m 190000 5 38000.0 50000 10000
grunt>
grunt> store res6 into 'piglab/results2'
>> using PigStorage(',');
grunt> cat piglab/results2/part-r-00000
f,220000,3,73333.33333333333,90000,50000
m,190000,5,38000.0,50000,10000
grunt>
data.
-----------------------------------
[cloudera@quickstart ~]$ hadoop fs -mkdir piglab
[cloudera@quickstart ~]$ hadoop fs -
copyFromLocal emp piglab
[cloudera@quickstart ~]$ hadoop fs -ls piglab
Found 1 items
-rw-r--r-- 1 cloudera cloudera 158
2017-05-02 18:55 piglab/emp
[cloudera@quickstart ~]$
[cloudera@quickstart ~]$ hadoop fs -cat
piglab/emp
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
[cloudera@quickstart ~]$
[cloudera@quickstart ~]$ pig
grunt> emp = load 'piglab/emp' using PigStorage
(',')
>> as (id:int, name:chararray, sal:int,
>> sex:chararray, dno:int);
grunt>dump emp
(101,aaaa,40000,m,11)
(102,bbbbbb,50000,f,12)
(103,cccc,50000,m,12)
(104,dd,90000,f,13)
(105,ee,10000,m,12)
(106,dkd,40000,m,12)
(107,sdkfj,80000,f,13)
(108,iiii,50000,m,11)
required task: sex based sum aggr on sal.
sql--> select sex,sum(sal) from emp
group by sex;
grunt> e = foreach emp generate sex, sal;
grunt> describe e;
e: {sex: chararray,sal: int}
grunt> grp = group e by sex;
grunt> describe grp
grp: {group: chararray,e: {(sex: chararray,sal:
int)}}
grunt> dump grp
(f,{(f,80000),(f,90000),(f,50000)})
(m,{(m,50000),(m,40000),(m,10000),(m,50000),
(m,40000)})
grunt> res1 = foreach grp generate
>> group as sex, SUM(e.sal) as tot;
grunt> dump res1
(f,220000)
(m,190000)
sql--> select sex, avg(sal) from emp group by sex;
grunt> res2 = foreach grp generate
group as sex, AVG(e.sal) as avg;
sql--> select sex, max(sal) from emp group by sex;
grunt> res3 = foreach grp generate
group as sex, MAX(e.sal) as max;
sql--> select sex, min(sal) from emp group by sex;
grunt> res4 = foreach grp generate
group as sex, MIN(e.sal) as min;
sql--> select sex, count(*) from emp group by sex;
grunt> res5 = foreach grp generate
group as sex, COUNT(e) as cnt;
sql--> select sex, sum(sal) as tot,
count(*) as cnt, avg(sal) as avg,
max(sal) as max, min(sal) as min
from emp group by sex;
grunt> res6 = foreach grp generate
group as sex, SUM(e.sal) as tot,
COUNT(e) as cnt, AVG(e.sal) as avg,
MAX(e.sal) as max,
MIN(e.sal) as min;
grunt> dump res6
(f,220000,3,73333.33333333333,90000,50000)
(m,190000,5,38000.0,50000,10000)
grunt> store res6 into 'piglab/results1';
grunt> ls piglab
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/emp<r 1> 158
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1 <dir>
grunt> ls piglab/results1
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1/_SUCCESS<r 1> 0
hdfs://quickstart.cloudera:8020/user/cloudera/piglab/results1/part-r-00000<r 1> 72
grunt> cat piglab/results1/part-r-00000
f 220000 3 73333.33333333333 90000 50000
m 190000 5 38000.0 50000 10000
grunt>
grunt> store res6 into 'piglab/results2'
>> using PigStorage(',');
grunt> cat piglab/results2/part-r-00000
f,220000,3,73333.33333333333,90000,50000
m,190000,5,38000.0,50000,10000
grunt>
Thank you sir
ReplyDelete