Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Thursday, 4 May 2017

Pig : CoGroup examples Vs Union Examples


-- co grouping
grunt> cat piglab/emp
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
grunt>
[cloudera@quickstart ~]$ cat > emp2
201,Ravi,80000,m,12
203,Varuna,100000,f,13
204,Vanila,50000,f,12
205,Mani,30000,m,14
206,Manisha,30000,f,14
[cloudera@quickstart ~]$ hadoop fs -copyFromLocal emp2 piglab
[cloudera@quickstart ~]$
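 -- note: the outputs below imply that emp2 also contains a dno 11 record
    with sal 90000 (presumably id 202), which is not shown in the listing above.
 sql: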
   select dno, sum(sal) from (
      select dno, sal from emp1
              union all
      select dno, sal from emp2
       ) e group by dno;
grunt> emp1 = load 'piglab/emp' using
PigStorage(',')
    as (id:int, name:chararray, sal:int,    
sex:chararray,
      dno:int);
grunt> emp2 = load 'piglab/emp2' using
PigStorage(',')
    as (id:int, name:chararray, sal:int,    
sex:chararray,
     dno:int);
grunt> describe emp1
emp1: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> describe emp2
emp2: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> e1 = foreach emp1 generate dno, sal;
grunt> e2 = foreach emp2 generate dno, sal;
grunt> e = union e1, e2;
grunt> grp = group e by dno;
grunt> res = foreach grp generate group as dno,
               SUM(e.sal) as tot;
grunt> dump res
(11,180000)
(12,280000)
(13,270000)
(14,60000)
 -- in the above output,
    we lose clarity:
    it does not show the total salary for branch1 and branch2 separately;
    it is the aggregation across all branches.
 -- we want
   a separate total salary for branch1 and
   a separate one for branch2.
 -- for this, cogroup is used.
  -- cogroup will construct separate
     inner bags for each relation (dataset),
   so that we can apply separate aggregations.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> describe cg
cg: {group: int,e1: {(dno: int,sal: int)},e2:
{(dno: int,sal: int)}}
grunt> dump cg
 -- when cogroup is applied, it returns
  n+1 fields,
  where n is the number of input relations
   (datasets).
   the first field is the group key;
   the second field onwards are bags, one per relation.
(11,{(11,50000),(11,40000)},{(11,90000)})
(12,{(12,40000),(12,10000),(12,50000),
(12,50000)},{(12,50000),(12,80000)})
(13,{(13,80000),(13,90000)},{(13,100000)})
(14,{},{(14,30000),(14,30000)})
grunt> res = foreach cg generate
        group as dno ,
         SUM(e1.sal) as tot1,
         SUM(e2.sal) as tot2;
grunt> describe res
res: {dno: int,tot1: long,tot2: long}
grunt> dump res
(11,90000,90000)
(12,150000,130000)
(13,170000,100000)
(14,,60000)
---------------------------------
  how to perform
    separate aggregations on each dataset
  without cogrouping.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> ee1 = foreach e1 generate *, 'branch1' as
branch;
grunt> ee2 = foreach e2 generate *, 'branch2' as
branch;
grunt> ee = union ee1 , ee2;
grunt> grp = group ee by (dno, branch);
grunt> res = foreach grp generate
>>         group.dno as dno, group.branch as
branch,
>>              SUM(ee.sal) as tot;
grunt> describe res
res: {dno: int,branch: chararray,tot: long}
grunt> dump res
(11,branch1,90000)
(11,branch2,90000)
(12,branch1,150000)
(12,branch2,130000)
(13,branch1,170000)
(13,branch2,100000)
(14,branch2,60000)
---------------------------------
 using cogroup: multiple aggregations,
  separately for each dataset.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2;
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> res = foreach cg generate
>>     group as dno,
>>     SUM(e1.sal) as tot1,
>>     SUM(e2.sal) as tot2,
>>     COUNT(e1) as cnt1,
>>     COUNT(e2) as cnt2,
>>     MAX(e1.sal) as max1,
>>     MAX(e2.sal) as max2;
grunt> describe res;
res: {dno: int,tot1: long,tot2: long,cnt1:
long,cnt2: long,max1: int,max2: int}
grunt> dump res
(11,90000,90000,2,1,50000,90000)
(12,150000,130000,4,2,50000,80000)
(13,170000,100000,2,1,90000,100000)
(14,,60000,0,2,,30000)


------------------------------
   Entire Column aggregations using CoGroup.

 s1 = foreach emp1 generate sal;
 s2 = foreach emp2 generate sal;
 g = cogroup s1  all, s2 all ;
 r = foreach g generate
      SUM(s1.sal) as tot1,
       SUM(s2.sal) as tot2;
 r = foreach r generate * , tot1+tot2 as tot;
dump r
(410000,380000,790000)
     








































   
















3 comments: