Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Saturday 9 July 2016

Hive Lab 14: Joins and Cartisian Product


 Joins 
______________

   used to collect data from two or more tables.

    joins are two types.

 i) Inner Joins
 ii) External Joins.

Inner Joins:
   matching rows with given join condition.

   ex:
  tab1 ---> name,city
___________
  A,hyd
  B,del
  c,hyd
  d,del
  e,pune
  f,mumbai   

tab2 ---> city, dept
_________
  hyd,production
  del,hr
  pune,accounts
  chennai,admin
_______________

 inner join --> based on city.


 a,hyd,hyd,production
 B,del,del,hr
 c,hyd,hyd,production
 d,del,del,hr
 e,pune,pune,accounts

outer join -->
   matching records and non-matching records.


 outer joins are 3 types.
  i) left outer join.
 ii) right outer join
 iii) full outer join.

left outer join:

   matchings + non matchings of left side.
   ( complete presence from left side table).
   
 tabx:--> id, dno
____________
 101,11
 102,14
___________

 taby ---> dno,loc
 _____________
 11,hyd
 13,del
 _____________--


 tabx  lefter outer join taby --> based on dno.


   101,11,11,hyd
   102,14,null,null

 right outer join -->
   matchings and non matchings of right side table.

   (complete presence from right side table ).

 tabx right outer join taby -->based on dno.

   101,11,11,hyd
   null,null,13,del

full outer join --->
  matchings, non matchings of left side,
 and  non matchings of right side table.

 tabx full outer join taby ---> based on dno.

   101,11,11,hyd
   102,14,null,null
   null,null,13,del

___________________________________


[training@localhost ~]$ cat emp
101,vino,26000,m,11
102,Sri,25000,f,11
103,mohan,13000,m,13
104,lokitha,8000,f,12
105,naga,6000,m,13
101,janaki,10000,f,12
201,aaa,30000,m,12
202,bbbb,50000,f,13
203,ccc,10000,f,13
204,ddddd,50000,m,13
304,xxx,70000,m,14
305,yyy,80000,f,15
[training@localhost ~]$ cat dept
11,marketing,hyd
12,hr,del
13,finance,hyd
20,prod,hyd
21,admin,chennai
[training@localhost ~]$ 


hive> create database joinsdb;
OK
Time taken: 1.761 seconds
hive> use joinsdb;
OK
Time taken: 0.027 seconds
hive> create table emp(id int, name string, 
    >   sal int, sex string, dno int)
    > row format delimited 
    >   fields terminated by ',';
OK
Time taken: 0.297 seconds
hive> load data local inpath 'emp' into table emp;
Copying data from file:/home/training/emp
Copying file: file:/home/training/emp
Loading data to table joinsdb.emp
OK
Time taken: 0.207 seconds
hive> select * from emp;
OK
101     vino    26000   m       11
102     Sri     25000   f       11
103     mohan   13000   m       13
104     lokitha 8000    f       12
105     naga    6000    m       13
101     janaki  10000   f       12
201     aaa     30000   m       12
202     bbbb    50000   f       13
203     ccc     10000   f       13
204     ddddd   50000   m       13
304     xxx     70000   m       14
305     yyy     80000   f       15
Time taken: 0.194 seconds
hive> 

hive> load data local inpath 'dept'
    >  into table dept;
Copying data from file:/home/training/dept
Copying file: file:/home/training/dept
Loading data to table joinsdb.dept
OK
Time taken: 0.12 seconds
hive> select * from dept;
OK
11      marketing       hyd
12      hr      del
13      finance hyd
20      prod    hyd
21      admin   chennai
Time taken: 0.068 seconds
hive> 

  Inner Join:

hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc
    >  from  emp l join dept r
    >  on (l.dno=r.dno);

101     vino    26000   m       11      11      marketing hyd
102     Sri     25000   f       11      11      marketing hyd
104     lokitha 8000    f       12      12      hrdel
101     janaki  10000   f       12      12      hrdel
201     aaa     30000   m       12      12      hrdel
103     mohan   13000   m       13      13      finance   hyd
105     naga    6000    m       13      13      finance   hyd
202     bbbb    50000   f       13      13      finance   hyd
203     ccc     10000   f       13      13      finance   hyd
204     ddddd   50000   m       13      13      finance   hyd
Time taken: 14.11 seconds
hive> 


Left outer Join :


hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc                       
    >  from  emp l left outer join dept r
    >  on (l.dno=r.dno);                 



101     vino    26000   m       11      11      marketing hyd
102     Sri     25000   f       11      11      marketing hyd
104     lokitha 8000    f       12      12      hrdel
101     janaki  10000   f       12      12      hrdel
201     aaa     30000   m       12      12      hrdel
103     mohan   13000   m       13      13      finance   hyd
105     naga    6000    m       13      13      finance   hyd
202     bbbb    50000   f       13      13      finance   hyd
203     ccc     10000   f       13      13      finance   hyd
204     ddddd   50000   m       13      13      finance   hyd
304     xxx     70000   m       14      NULL    NULL      NULL
305     yyy     80000   f       15      NULL    NULL      NULL
Time taken: 12.786 seconds
hive> 


Right outer Join:

hive> select id,name,sal,sex,l.dno,r.dno,
    >    dname,loc                       
    >  from  emp l right outer join dept r
    >  on (l.dno=r.dno);  

101     vino    26000   m       11      11      marketing hyd
102     Sri     25000   f       11      11      marketing hyd
104     lokitha 8000    f       12      12      hrdel
101     janaki  10000   f       12      12      hrdel
201     aaa     30000   m       12      12      hrdel
103     mohan   13000   m       13      13      finance   hyd
105     naga    6000    m       13      13      finance   hyd
202     bbbb    50000   f       13      13      finance   hyd
203     ccc     10000   f       13      13      finance   hyd
204     ddddd   50000   m       13      13      finance   hyd
NULL    NULL    NULL    NULL    NULL    20      prod      hyd
NULL    NULL    NULL    NULL    NULL    21      admin     chennai
Time taken: 12.429 seconds
hive> 

Full outer Join;

hive> select id,name,sal,sex,l.dno,r.dno, 
    >    dname,loc                        
    >  from  emp l full outer join dept r 
    >  on (l.dno=r.dno);   

   

Denormalizing:

hive> create table info(id int, name string, 
    >   sal int, sex string , dname string, 
    >   loc string);
OK
Time taken: 0.04 seconds
hive> insert overwrite table info
    >   select id, name, sal, sex, dname, loc
    >  from emp l full outer join dept r
    >  on (l.dno=r.dno);

           

hive> select * from info;
OK
101     vino    26000   m       marketing       hyd
102     Sri     25000   f       marketing       hyd
104     lokitha 8000    f       hr      del
101     janaki  10000   f       hr      del
201     aaa     30000   m       hr      del
103     mohan   13000   m       finance hyd
105     naga    6000    m       finance hyd
202     bbbb    50000   f       finance hyd
203     ccc     10000   f       finance hyd
204     ddddd   50000   m       finance hyd
304     xxx     70000   m       NULL    NULL
305     yyy     80000   f       NULL    NULL
NULL    NULL    NULL    NULL    prod    hyd
NULL    NULL    NULL    NULL    admin   chennai
Time taken: 0.071 seconds
hive> 

_________________

Task:

hive> create table projects(dno1 int, dno2 int,
    >   sal int);
OK
Time taken: 0.072 seconds
hive> insert overwrite table projects
    >   select l.dno, r.dno, sal from 
    >   emp l full outer join dept r 
    >   on (l.dno=r.dno);

hive> create table proj(stat string, sal int);
hive> insert overwrite table proj
    >  select 
    > if(dno1 is not null and dno2 is not null,
    > 'Working',
    >  if(dno2 is null,'BenchTeam','BenchProj')),
    >  sal from projects;

hive> select * from proj;
OK
Working 26000
Working 25000
Working 8000
Working 10000
Working 30000
Working 13000
Working 6000
Working 50000
Working 10000
Working 50000
BenchTeam       70000
BenchTeam       80000
BenchProj       NULL
BenchProj       NULL
Time taken: 0.087 seconds
hive> 

hive> insert overwrite table proj
    >  select stat, if(sal is null,0,sal)
    >  from proj;


hive> create table summary(stat string, 
    >    totsal int, cnt int);
OK
Time taken: 0.042 seconds
hive> insert overwrite table summary
    >   select stat, sum(sal), count(*)
    >  from proj
    >    group by stat;

hive> select * from summary;
OK
BenchProj       0       2
BenchTeam       150000  2
Working 228000  10


___________________________

[training@localhost ~]$ cat trans
01/01/2010,30000
01/07/2010,40000
01/08/2010,30000
01/26/2010,40000
02/01/2010,30000
02/07/2010,40000
02/09/2010,30000
02/28/2010,40000
03/01/2010,80000
03/07/2010,40000
04/01/2010,30000
04/17/2010,70000
04/18/2010,80000
04/26/2010,90000
05/01/2010,10000
05/07/2010,20000
05/08/2010,30000
05/26/2010,60000
06/01/2010,30000
06/17/2010,50000
07/01/2010,30000
07/07/2010,40000
08/08/2010,10000
08/26/2010,20000
09/01/2010,90000
09/07/2010,40000
09/18/2010,30000
09/30/2010,40000
10/01/2010,30000
10/07/2010,40000
10/08/2010,30000
11/26/2010,40000
11/01/2010,30000
12/07/2010,40000
12/08/2010,30000
12/26/2010,40000
[training@localhost ~]$ 

hive> create table rawsales(dt string, 
    >  amt int)
    row format delimited
     fields terminated by ',';
OK
Time taken: 0.057 seconds
hive> load data local inpath 'trans'
    >  into table rawsales;
Copying data from file:/home/training/trans
Copying file: file:/home/training/trans
Loading data to table joinsdb.rawsales
OK
Time taken: 0.514 seconds
hive> 


create table raw2(dt array<string> , amt int);

insert overwrite table raw2
  select split(dt,'/') , amt from rawsales;


hive> select * from raw2 limit 3;
OK
["01","01","2010"]      30000
["01","07","2010"]      40000
["01","08","2010"]      30000
Time taken: 0.061 seconds
hive> insert overwrite table sales
    >   select 
    > concat(dt[2],'-',dt[0],'-',dt[1]), amt
    >  from raw2;

hive> create table sales2(mon int, amt int);
OK
Time taken: 0.044 seconds
hive> insert overwrite table sales2
    >   select month(dt), amt from sales;


hive> create table rep1(mon int, tot int);
OK
Time taken: 0.034 seconds
hive> insert overwrite table rep1
    >   select mon, sum(amt) from sales2
    >    group by mon;



hive> create table carts(m1 int, m2 int, 
    >   tot1 int, tot2 int);             
OK
Time taken: 0.044 seconds
hive> insert overwrite table carts
    >   select l.mon , r.mon , l.tot, r.tot
    >  from rep1 l join rep1 r;


hive> insert overwrite table carts
    >  select * from carts
    >    where (m1-m2)=1;

hive> select * from carts;
OK
12      11      110000  70000
7       6       70000   80000
2       1       140000  140000
8       7       30000   70000
4       3       270000  120000
9       8       200000  30000
10      9       100000  200000
5       4       120000  270000
11      10      70000   100000
3       2       120000  140000
6       5       80000   120000


hive> alter table carts add columns(per int);
OK
Time taken: 0.067 seconds
hive> insert overwrite table carts
    >   select m1,m2,tot1,tot2,
    >     ((tot1-tot2)*100)/tot2 from carts;

hive> select * from carts;
OK
12      11      110000  70000   57
7       6       70000   80000   -12
2       1       140000  140000  0
8       7       30000   70000   -57
4       3       270000  120000  125
9       8       200000  30000   566
10      9       100000  200000  -50
5       4       120000  270000  -55
11      10      70000   100000  -30
3       2       120000  140000  -14
6       5       80000   120000  -33
Time taken: 0.061 seconds
hive> 



























   
  

5 comments:

  1. Material is of too helpful thanks a lot for the efforts

    ReplyDelete
  2. This comment has been removed by the author.

    ReplyDelete
  3. Hi sir..Please upload UDF Lab material..

    ReplyDelete
  4. thank you.it is great content very nice blog easy to understand.this is one of the recommanded blog.we recommended to our students with nice exolanation

    Hadoop online training in usa|uk|india|australia|south africa|

    ReplyDelete