Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Tuesday 16 May 2017

Pig : Joins

[cloudera@quickstart ~]$ hadoop fs -cat spLab/e
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
109,jj,10000,m,14
110,kkk,20000,f,15
111,dddd,30000,m,15
[cloudera@quickstart ~]$ hadoop fs -cat spLab/d
11,marketing,hyd
12,hr,del
13,fin,del
21,admin,hyd
22,production,del
[cloudera@quickstart ~]$
$ cat > joins.pig

 emp = load 'spLab/e' using PigStorage(',')
    as (id:int, name:chararray, sal:int,
       sex:chararray, dno:int);

 dept = load 'spLab/d' using PigStorage(',')
   as (dno:int, dname:chararray, dloc:chararray);

 ij = join  emp by dno, dept by dno;

 lj = join emp by dno left outer, dept by dno;

 rj = join emp by dno right outer, dept by dno;

 fj = join emp by dno full outer, dept by dno;


cntr+d [to save]

grunt> run joins.pig
runt> describe ij
ij: {emp::id: int,emp::name: chararray,emp::sal: int,emp::sex: chararray,emp::dno: int,dept::dno: int,dept::dname: chararray,dept::dloc: chararray}
grunt> describe fj
fj: {emp::id: int,emp::name: chararray,emp::sal: int,emp::sex: chararray,emp::dno: int,dept::dno: int,dept::dname: chararray,dept::dloc: chararray}
grunt> ed = foreach fj generate
>>    emp::id as id, emp::name as name, emp::sal as sal,
>>     emp::sex as sex, dept::dname as dname, dept::dloc as city;
-- neutralizing schema. omitting reference operators.

grunt> dump ed
(108,iiii,50000,m,marketing,hyd)
(101,aaaa,40000,m,marketing,hyd)
(106,dkd,40000,m,hr,del)
(105,ee,10000,m,hr,del)
(103,cccc,50000,m,hr,del)
(102,bbbbbb,50000,f,hr,del)
(107,sdkfj,80000,f,fin,del)
(104,dd,90000,f,fin,del)
(109,jj,10000,m,,)
(111,dddd,30000,m,,)
(110,kkk,20000,f,,)
(,,,,admin,hyd)
(,,,,production,del)

-----------------------------

grunt> ed1 = foreach fj generate
>>       emp::dno as dno1, dept::dno as dno2, emp::sal as sal;
grunt> describe ed1
ed1: {dno1: int,dno2: int,sal: int}
grunt> stats = foreach ed1 generate
>>   (dno1 is not null and dno2 is not null ? 'Working' :
>>     (dno2 is null ? 'BenchTeam' : 'BenchProject')) as stat, sal;
grunt> grp = group stats by stat;
grunt> res = foreach grp generate
>>     group as stat, SUM(stats.sal) as tot;
grunt> r = foreach res generate stat,
>>             (tot is null ? 0:tot) as tot;

grunt> dump r;
(Working,410000)
(BenchTeam,60000)
(BenchProject,0)


-----------------------------------------

to get top 3 salaries.


 sals = foreach emp generate sal;
 sals2 = distinct sals;
 sals3 = order sals2 by sal desc;
 sals4 = limit sals3 3;

 e = join emp by sal, sals4 by sal;
 etop3 = foreach e generate
     emp::id as id, emp::name as name,
        emp::sal as sal;
 dump etop3;

-------------------------------

 emp ---> id name sal sex dno
 dept --> dno dname dloc mid
 mngrs ---> mid  mname  phone


  what is total salary budjet for each manager.


[cloudera@quickstart ~]$ hadoop fs -copyFromLocal dept spLab/dd
[cloudera@quickstart ~]$ hadoop fs -copyFromLocal mngrs spLab
[cloudera@quickstart ~]$

dep = load 'spLab/dd' using PigStorage(',')
   as (dno:int, dname:chararray, dloc:chararray,
  mid:chararray);

mgr = load 'spLab/mngrs' using PigStorage(',')
   as (mid:chararray, mname:chararray,
       ph:chararray);

edep = join emp by dno, dep by dno;

edep2 = foreach edep generate
         emp::sal as sal, dep::mid as mid;

edm = join edep2 by mid, mgr by mid;
edm2 = foreach edm generate
        mgr::mname as mname ,
          edep2::sal as sal;

grpx = group edm2 by mname;
resx = foreach grpx generate
       group as mname,
          SUM(edm2.sal) as tot;
dump resx
------------------------------------






 








 




















2 comments: