grunt> ls pdemo/e1
hdfs://localhost/user/training/pdemo/e1<r 1> 39
grunt> cat pdemo/e1
ravi,30000,m
rani,40000,f
giri,50000,m
grunt> cat pdemo/e2
hari,60000,11
hara,70000,12
grunt> e1 = load 'pdemo/e1' using PigStorage(',')
>> as (name:chararray, sal:int, sex:chararray);
grunt> cat pdemo/e2
hari,60000,11
hara,70000,12
grunt> e2 = load 'pdemo/e2' using PigStorage(',')
>> as (name:chararray, sal:int ,
>> dno:int);
grunt> ee1 = foreach e1 generate *,0 as dno;
grunt> describe ee1;
ee1: {name: chararray,sal: int,sex: chararray,dno: int}
grunt>
grunt> ee2 = foreach e2 generate name,sal,
>> '*' as sex, dno;
grunt> describe ee2
ee2: {name: chararray,sal: int,sex: chararray,dno: int}
grunt> e = union ee1, ee2;
grunt> dump e;
(ravi,30000,m,0)
(rani,40000,f,0)
(giri,50000,m,0)
(hari,60000,*,11)
(hara,70000,*,12)
________________________________________
sql-->
select dno , sum(sal) as tot from emp
group by dno
order by tot desc
limit 3;
______________________________
emp = load 'pdemo/emp' using PigStorage(',')
as (id:int, name:chararray, sal:int,
sex:chararray, dno:int);
e = foreach emp generate dno, sal;
grp = group e by dno;
res = foreach grp generate
group as dno, SUM(e.sal) as tot;
ores = order res by tot desc;
top3 = limit ores 3;
above model is correct , if each dno total salary is unique.
if total sal is taken by multiple dno s.
you get wrong results.
solution: joins/ udfs.
ex:
15 9L
18 9L
25 9L
30 8L
40 7L
top3;
15 9L
18 9L
25 9L
2nd top, 3rd top were missed.
_________________________________________
Joins examples;
[training@localhost ~]$ cat emp
101,vino,26000,m,11
102,Sri,25000,f,11
103,mohan,13000,m,13
104,lokitha,8000,f,12
105,naga,6000,m,13
101,janaki,10000,f,12
201,aaa,30000,m,12
202,bbbb,50000,f,13
203,ccc,10000,f,13
204,ddddd,50000,m,13
304,xxx,70000,m,14
305,yyy,80000,f,15
[training@localhost ~]$
[training@localhost ~]$ cat > dept
11,marketing,hyd
12,hr,del
13,finance,hyd
20,prod,hyd
21,admin,chennai
[training@localhost ~]$
[training@localhost ~]$ hadoop fs -copyFromLocal emp pdemo/empl
[training@localhost ~]$ hadoop fs -copyFromLocal dept pdemo
[training@localhost ~]$
emp = load ' ... ' ----
dept = load ' ... ' ------
ij = join emp by dno, dept by dno;
lj = join emp by dno left outer, dept by dno;
rj = join emp by dno right outer, dept by dno;
fj = join emp by dno full outer, dept by dno;
dump fj;
(101,vino,26000,m,11,11,marketing,hyd)
(102,Sri,25000,f,11,11,marketing,hyd)
(104,lokitha,8000,f,12,12,hr,del)
(101,janaki,10000,f,12,12,hr,del)
(201,aaa,30000,m,12,12,hr,del)
(103,mohan,13000,m,13,13,finance,hyd)
(105,naga,6000,m,13,13,finance,hyd)
(202,bbbb,50000,f,13,13,finance,hyd)
(203,ccc,10000,f,13,13,finance,hyd)
(204,ddddd,50000,m,13,13,finance,hyd)
(304,xxx,70000,m,14,,,)
(305,yyy,80000,f,15,,,)
(,,,,,20,prod,hyd)
(,,,,,21,admin,chennai)
grunt> describe fj
fj: {emp::id: int,emp::name: chararray,emp::sal: int,emp::sex: chararray,emp::dno: int,dept::dno: int,dept::dname: chararray,dept::loc: chararray}
grunt> fj2 = foreach fj generate
>> emp::dno as dno1 ,
>> dept::dno as dno2,
>> emp::sal as sal;
grunt> describe fj2;
fj2: {dno1: int,dno2: int,sal: int}
grunt>
(11,11,26000)
(11,11,25000)
(12,12,8000)
(12,12,10000)
(12,12,30000)
(13,13,13000)
(13,13,6000)
(13,13,50000)
(13,13,10000)
(13,13,50000)
(14,,70000)
(15,,80000)
(,20,)
(,21,)
grunt> fj3 = foreach fj2 generate
>> (dno1 is not null and dno2 is not null ? 'Working':(dno1 is not null and dno2 is null ? 'BenchTeam':'BenchProj')) as stat, sal;
grunt> describe fj3;
fj3: {stat: chararray,sal: int}
grunt> dump fj3
(Working,26000)
(Working,25000)
(Working,8000)
(Working,10000)
(Working,30000)
(Working,13000)
(Working,6000)
(Working,50000)
(Working,10000)
(Working,50000)
(BenchTeam,70000)
(BenchTeam,80000)
(BenchProj,)
(BenchProj,)
grunt> describe fj3
fj3: {stat: chararray,sal: int}
grunt> grp = group fj3 by stat;
grunt> res = foreach grp generate
>> group as stat , SUM(fj3.sal) as tot;
grunt> dump res
(Working,228000)
(BenchProj,)
(BenchTeam,150000)
____________________________________________
hdfs://localhost/user/training/pdemo/e1<r 1> 39
grunt> cat pdemo/e1
ravi,30000,m
rani,40000,f
giri,50000,m
grunt> cat pdemo/e2
hari,60000,11
hara,70000,12
grunt> e1 = load 'pdemo/e1' using PigStorage(',')
>> as (name:chararray, sal:int, sex:chararray);
grunt> cat pdemo/e2
hari,60000,11
hara,70000,12
grunt> e2 = load 'pdemo/e2' using PigStorage(',')
>> as (name:chararray, sal:int ,
>> dno:int);
grunt> ee1 = foreach e1 generate *,0 as dno;
grunt> describe ee1;
ee1: {name: chararray,sal: int,sex: chararray,dno: int}
grunt>
grunt> ee2 = foreach e2 generate name,sal,
>> '*' as sex, dno;
grunt> describe ee2
ee2: {name: chararray,sal: int,sex: chararray,dno: int}
grunt> e = union ee1, ee2;
grunt> dump e;
(ravi,30000,m,0)
(rani,40000,f,0)
(giri,50000,m,0)
(hari,60000,*,11)
(hara,70000,*,12)
________________________________________
sql-->
select dno , sum(sal) as tot from emp
group by dno
order by tot desc
limit 3;
______________________________
emp = load 'pdemo/emp' using PigStorage(',')
as (id:int, name:chararray, sal:int,
sex:chararray, dno:int);
e = foreach emp generate dno, sal;
grp = group e by dno;
res = foreach grp generate
group as dno, SUM(e.sal) as tot;
ores = order res by tot desc;
top3 = limit ores 3;
above model is correct , if each dno total salary is unique.
if total sal is taken by multiple dno s.
you get wrong results.
solution: joins/ udfs.
ex:
15 9L
18 9L
25 9L
30 8L
40 7L
top3;
15 9L
18 9L
25 9L
2nd top, 3rd top were missed.
_________________________________________
Joins examples;
[training@localhost ~]$ cat emp
101,vino,26000,m,11
102,Sri,25000,f,11
103,mohan,13000,m,13
104,lokitha,8000,f,12
105,naga,6000,m,13
101,janaki,10000,f,12
201,aaa,30000,m,12
202,bbbb,50000,f,13
203,ccc,10000,f,13
204,ddddd,50000,m,13
304,xxx,70000,m,14
305,yyy,80000,f,15
[training@localhost ~]$
[training@localhost ~]$ cat > dept
11,marketing,hyd
12,hr,del
13,finance,hyd
20,prod,hyd
21,admin,chennai
[training@localhost ~]$
[training@localhost ~]$ hadoop fs -copyFromLocal emp pdemo/empl
[training@localhost ~]$ hadoop fs -copyFromLocal dept pdemo
[training@localhost ~]$
emp = load ' ... ' ----
dept = load ' ... ' ------
ij = join emp by dno, dept by dno;
lj = join emp by dno left outer, dept by dno;
rj = join emp by dno right outer, dept by dno;
fj = join emp by dno full outer, dept by dno;
dump fj;
(101,vino,26000,m,11,11,marketing,hyd)
(102,Sri,25000,f,11,11,marketing,hyd)
(104,lokitha,8000,f,12,12,hr,del)
(101,janaki,10000,f,12,12,hr,del)
(201,aaa,30000,m,12,12,hr,del)
(103,mohan,13000,m,13,13,finance,hyd)
(105,naga,6000,m,13,13,finance,hyd)
(202,bbbb,50000,f,13,13,finance,hyd)
(203,ccc,10000,f,13,13,finance,hyd)
(204,ddddd,50000,m,13,13,finance,hyd)
(304,xxx,70000,m,14,,,)
(305,yyy,80000,f,15,,,)
(,,,,,20,prod,hyd)
(,,,,,21,admin,chennai)
grunt> describe fj
fj: {emp::id: int,emp::name: chararray,emp::sal: int,emp::sex: chararray,emp::dno: int,dept::dno: int,dept::dname: chararray,dept::loc: chararray}
grunt> fj2 = foreach fj generate
>> emp::dno as dno1 ,
>> dept::dno as dno2,
>> emp::sal as sal;
grunt> describe fj2;
fj2: {dno1: int,dno2: int,sal: int}
grunt>
(11,11,26000)
(11,11,25000)
(12,12,8000)
(12,12,10000)
(12,12,30000)
(13,13,13000)
(13,13,6000)
(13,13,50000)
(13,13,10000)
(13,13,50000)
(14,,70000)
(15,,80000)
(,20,)
(,21,)
grunt> fj3 = foreach fj2 generate
>> (dno1 is not null and dno2 is not null ? 'Working':(dno1 is not null and dno2 is null ? 'BenchTeam':'BenchProj')) as stat, sal;
grunt> describe fj3;
fj3: {stat: chararray,sal: int}
grunt> dump fj3
(Working,26000)
(Working,25000)
(Working,8000)
(Working,10000)
(Working,30000)
(Working,13000)
(Working,6000)
(Working,50000)
(Working,10000)
(Working,50000)
(BenchTeam,70000)
(BenchTeam,80000)
(BenchProj,)
(BenchProj,)
grunt> describe fj3
fj3: {stat: chararray,sal: int}
grunt> grp = group fj3 by stat;
grunt> res = foreach grp generate
>> group as stat , SUM(fj3.sal) as tot;
grunt> dump res
(Working,228000)
(BenchProj,)
(BenchTeam,150000)
____________________________________________
No comments:
Post a Comment