Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Thursday, 4 May 2017

Pig : CoGroup examples Vs Union Examples


-- co grouping
grunt> cat piglab/emp
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
grunt>
[cloudera@quickstart ~]$ cat > emp2
201,Ravi,80000,m,12
203,Varuna,100000,f,13
204,Vanila,50000,f,12
205,Mani,30000,m,14
206,Manisha,30000,f,14
[cloudera@quickstart ~]$ hadoop fs -
copyFromLocal emp2 piglab
[cloudera@quickstart ~]$
 sql:
   select dno, sum(sal) from (
      select dno, sal from emp1
              union all
      select dno, sal from emp2
       ) e group by dno;
runt> emp1 = load 'piglab/emp' using      
PigStorage(',')
    as (id:int, name:chararray, sal:int,    
sex:chararray,
      dno:int);
grunt> emp2 = load 'piglab/emp2' using
PigStorage(',')
    as (id:int, name:chararray, sal:int,    
sex:chararray,
     dno:int);
grunt> describe emp1
emp1: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> describe emp2
emp2: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> e1 = foreach emp1 generate dno, sal;
grunt> e2 = foreach emp2 generate dno, sal;
grunt> e = union e1, e2;
grunt> grp = group e by dno;
grunt> res = foreach grp generate group as dno,
               SUM(e.sal) as tot;
grunt> dump res
(11,180000)
(12,280000)
(13,270000)
(14,60000)
 -- in above output,
    we missed clarity,
    total salary for branch1 and brach2
    above is the aggregation of all branches.
 -- we want,
   seperate total salary for branch 1 and
   seperate for branch2.
 -- for this cogroup is used.
  -- cogroup  will construct seperate
     innerbags for each relation(dataset).
   so that seperate aggregations, we can apply.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> describe cg
cg: {group: int,e1: {(dno: int,sal: int)},e2:
{(dno: int,sal: int)}}
grunt> dump cg
 -- when cogroup is applied it returns
  n+1 fields .
  where n is number of input relations
   (datasets).
   first one is group
   2nd onwards bags.
(11,{(11,50000),(11,40000)},{(11,90000)})
(12,{(12,40000),(12,10000),(12,50000),
(12,50000)},{(12,50000),(12,80000)})
(13,{(13,80000),(13,90000)},{(13,100000)})
(14,{},{(14,30000),(14,30000)})
grunt> res = foreach cg generate
        group as dno ,
         SUM(e1.sal) as tot1,
         SUM(e2.sal) as tot2;
grunt> describe res
res: {dno: int,tot1: long,tot2: long}
(11,90000,90000)
(12,150000,130000)
(13,170000,100000)
(14,,60000)
---------------------------------
  how to perform,
    seperate aggregations on each dataset
  with out cogrouping.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> ee1 = foreach e1 generate *, 'branch1' as
branch;
grunt> ee2 = foreach e2 generate *, 'branch2' as
branch;
grunt> ee = union ee1 , ee2;
grunt> grp = group ee by (dno, branch);
grunt> res = foreach grp generate
>>         group.dno as dno, group.branch as
branch,
>>              SUM(ee.sal) as tot;
grunt> describe res
res: {dno: int,branch: chararray,tot: long}
grunt> dump res
(11,branch1,90000)
(11,branch2,90000)
(12,branch1,150000)
(12,branch2,130000)
(13,branch1,170000)
(13,branch2,100000)
(14,branch2,60000)
---------------------------------
 using Cogroup , multiple aggregations
  seperately for each dataset.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2;
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> res = foreach cg generate
>>     group as dno,
>>     SUM(e1.sal) as tot1,
>>     SUM(e2.sal) as tot2,
>>     COUNT(e1) as cnt1,
>>     COUNT(e2) as cnt2,
>>     MAX(e1.sal) as max1,
>>     MAX(e2.sal) as max2;
grunt> describe res;
res: {dno: int,tot1: long,tot2: long,cnt1:
long,cnt2: long,max1: int,max2: int}
grunt> dump res
(11,90000,90000,2,1,50000,90000)
(12,150000,130000,4,2,50000,80000)
(13,170000,100000,2,1,90000,100000)
(14,,60000,0,2,,30000)


------------------------------
   Entire Column aggregations using CoGroup.

 s1 = foreach emp1 generate sal;
 s2 = foreach emp2 generate sal;
 g = cogroup s1  all, s2 all ;
 r = foreach g generate
      SUM(s1.sal) as tot1,
       SUM(s2.sal) as tot2;
 r = foreach r generate * , tot1+tot2 as tot;
dump r
(410000,380000,790000)
     








































   
















5 comments:

  1. Good Post! Thank you so much for sharing this pretty post, it was so good to read and useful to improve my knowledge as updated one, keep blogging.

    https://www.emexotechnologies.com/online-courses/big-data-hadoop-training-in-electronic-city/

    ReplyDelete
  2. Good Post! Thank you so much for sharing this pretty post, it was so good to read and useful to improve my knowledge as updated one, keep blogging.
    Big Data Hadoop Training in electronic city

    ReplyDelete