Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Tuesday, 16 May 2017

Pig : Joins

[cloudera@quickstart ~]$ hadoop fs -cat spLab/e
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
109,jj,10000,m,14
110,kkk,20000,f,15
111,dddd,30000,m,15
[cloudera@quickstart ~]$ hadoop fs -cat spLab/d
11,marketing,hyd
12,hr,del
13,fin,del
21,admin,hyd
22,production,del
[cloudera@quickstart ~]$
$ cat > joins.pig

 -- joins.pig : demonstrates the four Pig join types on the emp/dept relations.

 -- Employee tuples: id,name,sal,sex,dno (comma-delimited HDFS file spLab/e).
 emp = load 'spLab/e' using PigStorage(',')
    as (id:int, name:chararray, sal:int,
       sex:chararray, dno:int);

 -- Department tuples: dno,dname,dloc (comma-delimited HDFS file spLab/d).
 dept = load 'spLab/d' using PigStorage(',')
   as (dno:int, dname:chararray, dloc:chararray);

 -- Inner join: only employees whose dno has a matching dept row.
 ij = join  emp by dno, dept by dno;

 -- Left outer: every emp row kept; dept fields are null when dno is unmatched.
 lj = join emp by dno left outer, dept by dno;

 -- Right outer: every dept row kept; emp fields are null when dno is unmatched.
 rj = join emp by dno right outer, dept by dno;

 -- Full outer: rows from both sides kept; nulls fill the unmatched side.
 fj = join emp by dno full outer, dept by dno;


ctrl+d  [to save and exit]

grunt> run joins.pig
grunt> describe ij
ij: {emp::id: int,emp::name: chararray,emp::sal: int,emp::sex: chararray,emp::dno: int,dept::dno: int,dept::dname: chararray,dept::dloc: chararray}
grunt> describe fj
fj: {emp::id: int,emp::name: chararray,emp::sal: int,emp::sex: chararray,emp::dno: int,dept::dno: int,dept::dname: chararray,dept::dloc: chararray}
grunt> ed = foreach fj generate
>>    emp::id as id, emp::name as name, emp::sal as sal,
>>     emp::sex as sex, dept::dname as dname, dept::dloc as city;
-- Flattening the joined schema: the "as" aliases drop the emp::/dept:: disambiguation prefixes.

grunt> dump ed
(108,iiii,50000,m,marketing,hyd)
(101,aaaa,40000,m,marketing,hyd)
(106,dkd,40000,m,hr,del)
(105,ee,10000,m,hr,del)
(103,cccc,50000,m,hr,del)
(102,bbbbbb,50000,f,hr,del)
(107,sdkfj,80000,f,fin,del)
(104,dd,90000,f,fin,del)
(109,jj,10000,m,,)
(111,dddd,30000,m,,)
(110,kkk,20000,f,,)
(,,,,admin,hyd)
(,,,,production,del)

-----------------------------

grunt> ed1 = foreach fj generate
>>       emp::dno as dno1, dept::dno as dno2, emp::sal as sal;
grunt> describe ed1
ed1: {dno1: int,dno2: int,sal: int}
grunt> stats = foreach ed1 generate
>>   (dno1 is not null and dno2 is not null ? 'Working' :
>>     (dno2 is null ? 'BenchTeam' : 'BenchProject')) as stat, sal;
grunt> grp = group stats by stat;
grunt> res = foreach grp generate
>>     group as stat, SUM(stats.sal) as tot;
grunt> r = foreach res generate stat,
>>             (tot is null ? 0:tot) as tot;

grunt> dump r;
(Working,410000)
(BenchTeam,60000)
(BenchProject,0)


-----------------------------------------

To get the employees with the top 3 salaries:


 sals = foreach emp generate sal;
 sals2 = distinct sals;
 sals3 = order sals2 by sal desc;
 sals4 = limit sals3 3;

 e = join emp by sal, sals4 by sal;
 etop3 = foreach e generate
     emp::id as id, emp::name as name,
        emp::sal as sal;
 dump etop3;

-------------------------------

 emp ---> id name sal sex dno
 dept --> dno dname dloc mid
 mngrs ---> mid  mname  phone


  What is the total salary budget for each manager?


[cloudera@quickstart ~]$ hadoop fs -copyFromLocal dept spLab/dd
[cloudera@quickstart ~]$ hadoop fs -copyFromLocal mngrs spLab
[cloudera@quickstart ~]$

-- Total salary budget per manager:
-- emp --(dno)--> dep --(mid)--> mgr, then group salaries by manager name.

-- Departments with a manager id: dno,dname,dloc,mid (HDFS file spLab/dd).
dep = load 'spLab/dd' using PigStorage(',')
   as (dno:int, dname:chararray, dloc:chararray,
  mid:chararray);

-- Managers: mid,mname,ph (HDFS file spLab/mngrs).
mgr = load 'spLab/mngrs' using PigStorage(',')
   as (mid:chararray, mname:chararray,
       ph:chararray);

-- Attach each employee's department row (inner join on dno).
edep = join emp by dno, dep by dno;

-- Keep only what the final aggregate needs: salary and manager id.
edep2 = foreach edep generate
         emp::sal as sal, dep::mid as mid;

-- Attach manager details (inner join on mid).
edm = join edep2 by mid, mgr by mid;
edm2 = foreach edm generate
        mgr::mname as mname ,
          edep2::sal as sal;

-- Sum the salaries under each manager.
grpx = group edm2 by mname;
resx = foreach grpx generate
       group as mname,
          SUM(edm2.sal) as tot;
dump resx
------------------------------------






 








 




















6 comments:

  1. Good Post! Thank you so much for sharing this pretty post, it was so good to read and useful to improve my knowledge as updated one, keep blogging.

    https://www.emexotechnologies.com/online-courses/big-data-hadoop-training-in-electronic-city/

    ReplyDelete
  2. Good Post! Thank you so much for sharing this pretty post, it was so good to read and useful to improve my knowledge as updated one, keep blogging.
    Big Data Hadoop Training in electronic city

    ReplyDelete