Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Tuesday 28 June 2016

Hive Lab8 : Xml data processing 3

Hive xml3
______________________

xml tags(nodes) with parameters.
__________________________

[training@localhost ~]$ cat xml6
<tr><cid>101</cid><pr pid="p1">1000</pr><pr pid="p3">3000</pr></tr>
<tr><cid>102</cid><pr pid="p1">1200</pr><pr pid="p2">2000</pr></tr>
<tr><cid>103</cid><pr pid="p2">5000</pr><pr pid="p3">3000</pr></tr>
[training@localhost ~]$ 

[training@localhost ~]$ cat customers
<rec><cid>101</cid><name>Mani</name><city>Hyd</city></rec>
<rec><cid>102</cid><name>Vani</name><city>Pune</city></rec>
<rec><cid>103</cid><name>Veni</name><city>Hyd</city></rec>
[training@localhost ~]$ 

[training@localhost ~]$ cat products
<rec><pid>p1</pid><brand>Samsung</brand><pname>Mobile</pname><mrp>5000</mrp></rec>
<rec><pid>p2</pid><brand>Lg</brand><pname>Mobile</pname><mrp>7000</mrp></rec>
<rec><pid>p3</pid><brand>Lg</brand><pname>Laptop</pname><mrp>6000</mrp></rec>
[training@localhost ~]$ 

hive> create database xml10;
OK
Time taken: 0.034 seconds
hive> use xml10;
OK
Time taken: 0.027 seconds
hive> create table transraw(line string);
OK
Time taken: 0.057 seconds
hive> create table custraw(line string);
OK
Time taken: 0.034 seconds
hive> create table prodraw(line string);
OK
Time taken: 0.038 seconds
hive> 

hive> load data local inpath 'customers' into table custraw;
Copying data from file:/home/training/customers
Copying file: file:/home/training/customers
Loading data to table xml10.custraw
OK
Time taken: 0.155 seconds
hive> select * from custraw;
OK
<rec><cid>101</cid><name>Mani</name><city>Hyd</city></rec>
<rec><cid>102</cid><name>Vani</name><city>Pune</city></rec>
<rec><cid>103</cid><name>Veni</name><city>Hyd</city></rec>
Time taken: 0.065 seconds
hive> 

hive> create table customers(cid string, 
    >   name string, city string);
OK
Time taken: 0.043 seconds
hive> insert overwrite table customers 
    >   select xpath_string(line,'rec/cid'),
    >    xpath_string(line,'rec/name'),     
    >    xpath_string(line,'rec/city') 
    > from custraw;

hive> select * from customers;
OK
101     Mani    Hyd
102     Vani    Pune
103     Veni    Hyd
Time taken: 0.059 seconds
hive> 

hive> load data local inpath 'products'
    >  into table prodraw;
Copying data from file:/home/training/products
Copying file: file:/home/training/products
Loading data to table xml10.prodraw
OK
Time taken: 0.109 seconds
hive> select * from prodraw;
OK
<rec><pid>p1</pid><brand>Samsung</brand><pname>Mobile</pname><mrp>5000</mrp></rec>
<rec><pid>p2</pid><brand>Lg</brand><pname>Mobile</pname><mrp>7000</mrp></rec>
<rec><pid>p3</pid><brand>Lg</brand><pname>Laptop</pname><mrp>6000</mrp></rec>
Time taken: 0.057 seconds
hive> 
hive> create table products(pid string, 
    >  pname string, brand string, 
    >   mrp int);
OK
Time taken: 0.036 seconds
hive> insert overwrite table products
    >   select xpath_string(line,'rec/pid'),
    >    xpath_string(line,'rec/pname'),    
    >    xpath_string(line,'rec/brand'),
    >    xpath_int(line,'rec/mrp')          
    > from prodraw;


hive> select * from products;
OK
p1      Mobile  Samsung 5000
p2      Mobile  Lg      7000
p3      Laptop  Lg      6000
Time taken: 0.06 seconds
hive> 

hive> select * from transraw;
OK
Time taken: 0.052 seconds
hive> load data local inpath 'xml6'
    >  into table transraw;
Copying data from file:/home/training/xml6
Copying file: file:/home/training/xml6
Loading data to table xml10.transraw
OK
Time taken: 0.094 seconds
hive> select * from transraw;
OK
<tr><cid>101</cid><pr pid="p1">1000</pr><pr pid="p3">3000</pr></tr>
<tr><cid>102</cid><pr pid="p1">1200</pr><pr pid="p2">2000</pr></tr>
<tr><cid>103</cid><pr pid="p2">5000</pr><pr pid="p3">3000</pr></tr>
Time taken: 0.056 seconds
hive> 

hive> create table transraw2(cid string, 
    >   pid array<string>,
    >  pr array<string>);
OK
Time taken: 0.034 seconds
hive> insert overwrite table transraw2
    >   select xpath_string(line,'tr/cid'),
    >  xpath(line,'tr/pr/@pid'), 
    >  xpath(line,'tr/pr/text()') from transraw;

hive> select * from transraw2;
OK
101     ["p1","p3"]     ["1000","3000"]
102     ["p1","p2"]     ["1200","2000"]
103     ["p2","p3"]     ["5000","3000"]
Time taken: 0.058 seconds
hive> create table cidpid(cid string, pid string);
OK
Time taken: 0.041 seconds
hive> insert overwrite table cidpid
    >   select cid, mypid from transraw2
    >    lateral view explode(pid) p as mypid;

hive> select * from cidpid;
OK
101     p1
101     p3
102     p1
102     p2
103     p2
103     p3
Time taken: 0.054 seconds
hive> create table cidpr(cid string, pr int);
OK
Time taken: 0.072 seconds
hive> insert overwrite table cidpr
    >   select cid, mypy from transraw2
    >     lateral view explode(pr) p as  mypy;


hive> select * from cidpr;
OK
101     1000
101     3000
102     1200
102     2000
103     5000
103     3000
Time taken: 0.069 seconds
hive> alter table cidpid add columns(n int);
OK
Time taken: 0.056 seconds
hive>        

 add jar Desktop/hivejars.jar;
 create temporary function auto 
   as 'hive.analytics.SeqNumber';

hive> insert overwrite table cidpid
    >   select cid, pid, auto() from cidpid;

hive> select * from cidpid;
OK
101     p1      1
101     p3      2
102     p1      3
102     p2      4
103     p2      5
103     p3      6
Time taken: 0.066 seconds
hive> 

hive> alter table cidpr add columns(n int);
OK
Time taken: 0.057 seconds
hive> insert overwrite table cidpr
    >   select cid, pr, auto() from cidpr;

hive> select * from cidpr;
OK
101     1000    1
101     3000    2
102     1200    3
102     2000    4
103     5000    5
103     3000    6
Time taken: 0.061 seconds
hive> 

hive> create table trans(cid string, pid string,
    >  pr int);
OK
Time taken: 0.031 seconds
hive> insert overwrite table trans
    >   select l.cid, pid, pr 
    >   from 
    >  cidpid l join cidpr r 
    >  on (l.n=r.n);

hive> select * from trans;
OK
101     p1      1000
101     p3      3000
102     p1      1200
102     p2      2000
103     p2      5000
103     p3      3000
Time taken: 0.063 seconds
hive> 

hive> select * from products;
OK
p1      Mobile  Samsung 5000
p2      Mobile  Lg      7000
p3      Laptop  Lg      6000
Time taken: 0.052 seconds
hive> create table  info1(cid string, 
    >   pid string, pname string, 
    >   brand string, mrp int, pr int);
OK
Time taken: 0.075 seconds
hive> insert overwrite table info1
    >    select cid, l.pid, pname, brand,
    >    mrp , pr  
    >   from trans l join products r
    >  on (l.pid = r.pid);
hive> select * from info1;
OK
101     p1      Mobile  Samsung 5000    1000
102     p1      Mobile  Samsung 5000    1200
102     p2      Mobile  Lg      7000    2000
103     p2      Mobile  Lg      7000    5000
101     p3      Laptop  Lg      6000    3000
103     p3      Laptop  Lg      6000    3000
Time taken: 0.057 seconds
hive> select * from customers;
OK
101     Mani    Hyd
102     Vani    Pune
103     Veni    Hyd
Time taken: 0.059 seconds
hive> 
hive> create table info(cid string, 
    >  name string, pid string, 
    >  pname string, brand string, 
    >  mrp int, pr int, city string)
    > row format delimited 
    >   fields terminated by ',';
OK
Time taken: 0.082 seconds
hive>  

hive> insert overwrite table info        
    >   select l.cid, name, pid, pname, brand,
    >  mrp , pr , city                        
    > from info1 l join customers r
    >  on (l.cid = r.cid);

hive> select * from info;
OK
101     Mani    p1      Mobile  Samsung 5000    1000      Hyd
101     Mani    p3      Laptop  Lg      6000    3000      Hyd
102     Vani    p1      Mobile  Samsung 5000    1200      Pune
102     Vani    p2      Mobile  Lg      7000    2000      Pune
103     Veni    p2      Mobile  Lg      7000    5000      Hyd
103     Veni    p3      Laptop  Lg      6000    3000      Hyd
Time taken: 0.053 seconds
hive> 

___________________________________



































Hive Lab7 : Hive Partitioned Tables (A simple Case)


[training@localhost ~]$ cat trans
01/01/2010,30000
01/07/2010,40000
01/08/2010,30000
01/26/2010,40000
02/01/2010,30000
02/07/2010,40000
02/09/2010,30000
02/28/2010,40000
03/01/2010,80000
03/07/2010,40000
04/01/2010,30000
04/17/2010,70000
04/18/2010,80000
04/26/2010,90000
05/01/2010,10000
05/07/2010,20000
05/08/2010,30000
05/26/2010,60000
06/01/2010,30000
06/17/2010,50000
07/01/2010,30000
07/07/2010,40000
08/08/2010,10000
08/26/2010,20000
09/01/2010,90000
09/07/2010,40000
09/18/2010,30000
09/30/2010,40000
10/01/2010,30000
10/07/2010,40000
10/08/2010,30000
11/26/2010,40000
11/01/2010,30000
12/07/2010,40000
12/08/2010,30000
12/26/2010,40000
[training@localhost ~]$ 

hive> create table sales(dt string, 

amt int)
    >  row format delimited
    >   fields terminated by ',';
OK
Time taken: 0.046 seconds
hive> load data local inpath 'trans'
    >  into table sales;
Copying data from 

file:/home/training/trans
Copying file: 

file:/home/training/trans
Loading data to table cases.sales
OK
Time taken: 0.194 seconds
hive>

 insert overwrite table sales2
     select * from (
        select dt, amt from sales
          union all
   select concat(substr(dt,1,9),"1") 

as dt,
    amt+1000 as amt  from sales 
    union all
  select concat(substr(dt,1,9),"2") as 

dt,
  amt+3000  as amt from sales         

union all
  select concat(substr(dt,1,9),"3") as 

dt,
  amt+5000 as amt from sales          

union all
  select concat(substr(dt,1,9),"4") as 

dt,
   amt+10000 as amt from sales ) x;    
hive> create table sales02(dt 

array<string>,
    >     amt int);
OK
Time taken: 0.042 seconds
hive> insert overwrite table sales02
    >   select split(dt,'/'), amt from 

sales2;

hive> select * from sales02 limit 5;
OK
["01","01","2010"]      30000
["01","01","2013"]      35000
["01","01","2011"]      31000
["01","01","2014"]      40000
["01","01","2012"]      33000
Time taken: 0.083 seconds
hive> 
hive> create table sales3 like sales;
hive> insert overwrite table sales3
    >   select concat(dt[2],'-',dt

[0],'-',dt[1]),
    >  amt from sales02;

hive> create table salespart(dt 

string,
    >    amt int)      
    > partitioned by (y int, m int, d 

int) 
    > row format delimited           
    >   fields terminated by ',';
OK
Time taken: 0.034 seconds
hive> set 

hive.exec.dynamic.partition=true;
hive> set 

hive.exec.dynamic.partition.mode=nonst

rict;
hive> set 

hive.exec.max.dynamic.partitions.perno

de=10000;
hive> set 

hive.exec.max.dynamic.partitions=10000

0;


hive> insert overwrite table salespart
      partition(y,m,d)
      select dt, amt, year(dt), month

(dt),
       day(dt) from sales3;

-- to access a perticular day

hive> select * from salespart
   where y=2010 and m=1 and d=8; --> 1 

part

-- to access a perticular month.
hive> select * from salespart
   where y=2010 and m=1; ---> 31 parts

-- to access a year.

hive> select * from salespart
   where y=2014; ---> 366 part

---> range of dates.
Hive> select * from salespart
   where (y=2002 and m>=5)
             or
         (y>2002 and y<2009)
              or
         (y=2009 and m<=6);

______________________________________

___














________________________________




    




Hive Lab6 : xml data processing 2

Hive xml2
______________________

hive> create database xxmls;
OK
Time taken: 0.182 seconds
hive> use xxmls;
OK
Time taken: 0.017 seconds

[training@localhost ~]$ cat xml5
<tr><cid>101</cid><pr>3000</pr><qnt>3</qnt><pr>5000</pr><qnt>2</qnt><pr>1000</pr><qnt>5</qnt></tr>
<tr><cid>102</cid><pr>2000</pr><qnt>5</qnt><pr>5000</pr><qnt>2</qnt></tr>
<tr><cid>101</cid><pr>6000</pr><qnt>5</qnt></tr>
[training@localhost ~]$ 


hive> create table raw(line string);
OK
Time taken: 0.252 seconds
hive> load data local inpath 'xml5' 
    >  into table raw;
Copying data from file:/home/training/xml5
Copying file: file:/home/training/xml5
Loading data to table xxmls.raw
OK
Time taken: 0.178 seconds
hive> select * from raw;
OK
<tr><cid>101</cid><pr>3000</pr><qnt>3</qnt><pr>5000</pr><qnt>2</qnt><pr>1000</pr><qnt>5</qnt></tr>
<tr><cid>102</cid><pr>2000</pr><qnt>5</qnt><pr>5000</pr><qnt>2</qnt></tr>
<tr><cid>101</cid><pr>6000</pr><qnt>5</qnt></tr>
Time taken: 0.155 seconds
hive> 


hive> create table raw2(cid int, 
    > pr array<string>, qnt array<string>);
OK
Time taken: 0.069 seconds
hive> insert overwrite table raw2
    >   select xpath_int(line,'tr/cid'),
    >     xpath(line,'tr/pr/text()'),
    >     xpath(line,'tr/qnt/text()') 
    > from raw;

hive> create table cidpr(cid int , pr int);
hive> insert overwrite table cidpr
    >   select cid, mypr from raw2
    >    lateral view explode(pr) p as mypr;

hive> create table cidqnt(cid int, qnt int);

hive> insert overwrite table cidqnt
    >    select cid, myqnt from raw2
    >   lateral view explode(qnt) q as myqnt;


hive> select * from cidpr;
OK
101     3000
101     5000
101     1000
102     2000
102     5000
101     6000
Time taken: 0.06 seconds
hive> select * from cidqnt;
OK
101     3
101     2
101     5
102     5
102     2
101     5
Time taken: 0.046 seconds
hive> 


hive> add jar Desktop/hivejars.jar;  
hive> create temporary function auto 
    >  as 'hive.analytics.SeqNumber';


 hive> alter table cidpr add columns(n int);
 hive> insert overwrite table cidpr
    select cid, pr, auto() from cidpr;

 hive> alter table cidqnt add columns(n int);
 hive> insert overwrite table cidqnt
    select cid, qnt, auto() from cidqnt;

 hive> create table trans(cid int, pr int, 
             qnt int, bill int);
 hive> insert overwrite table trans
       select l.cid, pr, qnt, pr*qnt
    from cidpr l join cidqnt r
           on (l.n=r.n);

 hive> create table report(cid int, totbill int);
 hive> insert overwrite table report
    select cid, sum(bill) from trans
        group by cid;
 hive> select * from report;
________________________

hive.analytics.SeqNumber.java
______________________
package hive.analytics;

import java.io.IOException;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IntWritable;

public class SeqNumber extends UDF
{
int cnt = 0;
public IntWritable  evaluate()
throws IOException
{
cnt++;
return  new IntWritable(cnt);
}


}

______________________________

keep all above statements into a script file.
   
   hscript1.hql
 ________________

 create database urheroes;
 use urheroes;
 create table raw(line string);
 load data local inpath 'xml5' into table raw;
 create table raw2(cid int, pr array<string>, qnt array<string>);
 insert overwrite table raw2
  select xpath_int(line,'tr/cid'),
    xpath(line,'tr/pr/text()'),
    xpath(line,'tr/qnt/text()') from raw;
 create table cidpr(cid int, pr int);
 insert overwrite table cidpr
   select cid, mypr from raw2
    lateral view explode(pr) p as mypr;
 create table cidqnt(cid int, qnt int);
 insert overwrite table cidqnt
   select cid, myq from raw2
    lateral view explode(qnt) q as myq;
 alter table cidpr add columns(n int);
 alter table cidqnt add columns(n int);
 add jar Desktop/hivejars.jar;
 create temporary function auto 
   as 'hive.analytics.SeqNumber';
 insert overwrite table cidpr
   select cid, pr, auto() from cidpr;
 insert overwrite table cidqnt
   select cid, qnt, auto() from cidqnt;
 create table trans(cid int, pr int, qnt int, 
   bill int);
 insert overwrite table trans
   select l.cid, pr, qnt, pr*qnt
   from cidpr l join cidqnt r
   on (l.n=r.n);
 create table report(cid int, totbill int);
 insert overwrite table report
   select cid, sum(bill) from trans
    group by cid;
 select * from trans;
 select * from report;
 drop table raw;
 drop table raw2;
 drop table cidpr;
 drop table cidqnt;
________________________

 how to submit script:

 $ hive -f hscript1.hql
_______________________________________











































       

Friday 24 June 2016

Hive Lab5 : xml data processing

[training@localhost ~]$ cat sampxml1
<rec>
  <name>Ravi</name>
  <age>25</age>
  <sex>m</sex>
</rec>
<rec>
  <name>Rani</name>
  <sex>f</sex>
  <city>Hyd</city>
</rec>
[training@localhost ~]$ 

hive> create table samp(line string);
OK
Time taken: 0.309 seconds
hive> load data local inpath 'sampxml1'
    >  into table samp;
Copying data from file:/home/training/sampxml1
Copying file: file:/home/training/sampxml1
Loading data to table ourxml.samp
OK
Time taken: 0.202 seconds
hive> 

hive> select * from samp;
OK
<rec>
  <name>Ravi</name>
  <age>25</age>
  <sex>m</sex>
</rec>
<rec>
  <name>Rani</name>
  <sex>f</sex>
  <city>Hyd</city>
</rec>
Time taken: 0.184 seconds
hive> 

 above xml file has 2 records,
 when this file is loaded into hive table, hive treats, each line as a record,
  as per xml == two records.
  as per hive --> 10 records(rows.).

this data is not ready for hive,
hive expects total xml record in a single line.
so data preperation step is required.

Prepare a mapreduce Job , which can convert xml record into single row.

  (Later we see in Mapreduce session).,

[training@localhost ~]$ cat xml1
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
[training@localhost ~]$ 


hive> create table raw(line string);
OK
Time taken: 0.05 seconds
hive> load data local inpath 'xml1'
    >  into table raw;
Copying data from file:/home/training/xml1
Copying file: file:/home/training/xml1
Loading data to table ourxml.raw
OK
Time taken: 0.123 seconds
hive> 


hive> select * from raw;
OK
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
Time taken: 0.066 seconds
hive> select count(*) from raw;
3
-- 3 rows.

hive> select xpath_string(line,'rec/name') from raw;
Ravi
Rani
Sampath

hive> select 
    >  xpath_string(line,'rec/name'),
    >  xpath_int(line,'rec/age'),
    >  xpath_string(line,'rec/sex'),
    >  xpath_string(line,'rec/city')
    > from raw;

Ravi    25              hyd
Rani    24      f
Sampath 0       m       Del

if string fields is missed, it returns blank string, if numeric field is missed it returns 0.


_________________

xml with nested tags.

[training@localhost ~]$ cat xml2
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
[training@localhost ~]$ 

hive> create table xraw(line string);
OK
Time taken: 0.075 seconds
hive> load data local inpath 'xml2'
    >  into table xraw;

hive> select * from xraw;
OK
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
Time taken: 0.064 seconds
hive> 

hive> create table info(fname string, 
    >   lname string, 
    > age int, personal_email string,
    >  official_email string,
    >  mobile string, office string, 
    >  residence string);
OK
Time taken: 0.042 seconds
hive> 

hive> insert overwrite table info
    >   select 
    > xpath_string(line,'rec/name/fname'),
    > xpath_string(line,'rec/name/lname'),
    > xpath_int(line,'rec/age'),          
    > xpath_string(line,'rec/contact/email/personal'),
    > xpath_string(line,'rec/contact/email/official'),
    > xpath_string(line,'rec/contact/phone/mobile'),
    > xpath_string(line,'rec/contact/phone/office'),
    > xpath_string(line,'rec/contact/phone/residence') 
    >  from xraw;

hive> select * from info;
OK
Ravi    kumar   25      ravi@gmail.com  ravi@infy.com     12345   12346   12347
Time taken: 0.064 seconds
hive> 
_____________

xml with collections.



[training@localhost ~]$ cat xml3
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>
[training@localhost ~]$ 

hive> create table yraw(line string);
OK
Time taken: 0.043 seconds
hive> load data local inpath 'xml3'
    >  into table yraw;

hive> select * from yraw;
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>

hive> 
hive> create table raw2(name string, 
    >    qual  array<string>);
OK
Time taken: 0.039 seconds
hive> insert overwrite table raw2
    >   select xpath_string(line,'rec/name'),
    >  xpath(line,'rec/qual/text()')
    >   from yraw;

hive> select * from raw2;
OK
Amar    ["Btech","Mtech"]
Amala   ["Bsc","Msc","Mtech"]
Akash   ["Btech","Mba"]
Time taken: 0.061 seconds
hive> 

hive> select name, size(qual) from raw2;

Amar    2
Amala   3
Akash   2

how to access array elements,

 by using index numbers

 indexing starts from 0.

hive> select qual[0], qual[1],
          qual[2] from raw2;
Btech   Mtech   NULL
Bsc     Msc     Mtech
Btech   Mba     NULL

search for elements with in array.

hive> select * from raw2 
    >  where array_contains(qual,'Mtech');


Amar    ["Btech","Mtech"]
Amala   ["Bsc","Msc","Mtech"]

_______________
Flattening Array elements:

hive> select explode(qual) as q 
   from raw2;

Btech
Mtech
Bsc
Msc
Mtech
Btech
Mba


hive> select name, explode(qual) as q from raw2;
FAILED: Error in semantic analysis: UDTF's are not supported outside the SELECT clause, nor nested in expressions
hive> 

-- above statement is invalid,
   bcoz, udtf s can not be applied with other column expressions.

hive> create table  yinfo(name string, 
    >      qual string);
OK
Time taken: 0.035 seconds
hive> insert overwrite table yinfo 
    >   select name, myq from raw2
    >    lateral view explode(qual) q as myq;

hive> select * from yinfo;
OK
Amar    Btech
Amar    Mtech
Amala   Bsc
Amala   Msc
Amala   Mtech
Akash   Btech
Akash   Mba
Time taken: 0.055 seconds

hive> select * from yinfo   
    >   where qual in ('Msc','Mtech');


Amar    Mtech
Amala   Msc
Amala   Mtech

hive> create table yres(qual string, cnt int);
OK
Time taken: 0.036 seconds
hive> insert overwrite table yres
    >   select qual, count(*) from yinfo
    >    group by qual;

hive> select * from yres;
OK
Bsc     1
Btech   2
Mba     1
Msc     1
Mtech   2
Time taken: 0.051 seconds
hive> 

____________________

Assignment:

[training@localhost ~]$ cat xml4
<tr><cid>101</cid><pr>1000</pr><pr>3000</pr></tr>
<tr><cid>102</cid><pr>1200</pr><pr>2000</pr><pr>5000</pr></tr>
<tr><cid>101</cid><pr>4000</pr></tr>
[training@localhost ~]$ 

hive> create database sales;
hive> use sales;
hive> create table raw(line string);
hive> load data local inpath 'xml4'
   into table raw;
hive> create table raw2(cid string, 
    pr array<string>);

hive> insert overwrite table raw2
   select xpath_string(line,'tr/cid'),
   xpath(line,'tr/pr/text()')
    from raw;
hive> create table raw3(cid string, 
   pr int);
hive> insert overwrite table raw3
    select cid, mypr from  raw2
   lateral view explode(pr) p as mypr;

hive> create table results(cid string, 
   totbill int);
hive> insert overwrite table results
   select cid, sum(pr)
   from raw3
   group by cid;

hive> select * from results;

--- my students are really super and energitic -- 
_______________  __________


















____________________











__________________________________ 




______________________________--

















Hive Lab4 : Bucketing Tables

hive> create database bucks;
OK
Time taken: 1.775 seconds
hive> use bucks;
OK
Time taken: 0.022 seconds
hive> create table sales(pid string, pr int)
    > row format delimited  
    >   fields terminated by ',';
OK
Time taken: 0.364 seconds
hive> load data local inpath 'sales' 
    >   into table sales;
Copying data from file:/home/training/sales
Copying file: file:/home/training/sales
Loading data to table bucks.sales
OK
Time taken: 0.187 seconds
hive> 

hive> create table buckstab(pid string, 
    >   pr int)
    >  clustered by (pid)
    > into 3 buckets;
OK
Time taken: 0.051 seconds
hive> 

loading data into buckets.

hive> set hive.enforce.bucketing=true;
hive> insert overwrite table buckstab
    >   select * from sales;

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/bucks.db/sales
Found 1 items
-rw-r--r--   1 training supergroup        185 2016-06-21 07:35 /user/hive/warehouse/bucks.db/sales/sales
[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/bucks.db/buckstab
Found 3 items
-rw-r--r--   1 training supergroup         62 2016-06-21 07:40 /user/hive/warehouse/bucks.db/buckstab/000000_0
-rw-r--r--   1 training supergroup         35 2016-06-21 07:40 /user/hive/warehouse/bucks.db/buckstab/000001_0
-rw-r--r--   1 training supergroup         88 2016-06-21 07:40 /user/hive/warehouse/bucks.db/buckstab/000002_0
[training@localhost ~]$ 






Hive Lab3 : Partitioned Tables


Hi I will update this, soon.
Thank You.

Hive Lab2 : Temperature data Handling and Other operations

[training@localhost ~]$ gedit tmpr.txt
[training@localhost ~]$ cat tmpr.txt
xxxxx2006xxx34xxx
xxxxx2006xxx35xxx
xxxxx2006xxx32xxx
xxxxx2007xxx24xxx
xxxxx2007xxx21xxx
xxxxx2008xxx37xxx
xxxxx2008xxx39xxx
[training@localhost ~]$ 

hive> create database mytmpr;
OK
Time taken: 1.756 seconds
hive> use mytmpr;                   
OK
Time taken: 0.032 seconds
hive> create table raw(line string);
OK
Time taken: 0.345 seconds
hive> 

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/mytmpr.db
Found 1 items
drwxr-xr-x   - training supergroup          0 2016-06-23 20:31 /user/hive/warehouse/mytmpr.db/raw
[training@localhost ~]$ 

-- when database is created, with name database and .db extension one directory will be created in warehouse location.

 /user/hive/warehouse/mytmpr.db

hive> load data local inpath  'tmpr.txt' 
    >  into table raw; 

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/mytmpr.db/raw
Found 1 items
-rw-r--r--   1 training supergroup        126 2016-06-23 20:35 /user/hive/warehouse/mytmpr.db/raw/tmpr.txt
[training@localhost ~]$ 
hive> select * from raw;
OK
xxxxx2006xxx34xxx
xxxxx2006xxx35xxx
xxxxx2006xxx32xxx
xxxxx2007xxx24xxx
xxxxx2007xxx21xxx
xxxxx2008xxx37xxx
xxxxx2008xxx39xxx
Time taken: 0.186 seconds
hive> 

hive> create table tmpr(y int, t int);
OK
Time taken: 0.058 seconds
hive> insert overwrite table tmpr
    >   select substr(line,6,4),
    >     substr(line,13,2) from raw;


hive> select * from tmpr;
OK
2006    34
2006    35
2006    32
2007    24
2007    21
2008    37
2008    39
Time taken: 0.057 seconds
hive> describe tmpr;
OK
y       int
t       int
Time taken: 0.065 seconds
hive> 
hive> create table results(y int, 
 max int, min int);
hive> insert overwrite table results
   select y, max(t), min(t)
   from tmpr group by y;

hive> select * from results;
OK
2006    35      32
2007    24      21
2008    39      37
Time taken: 0.058 seconds
hive> 

[training@localhost ~]$ hadoop fs -cat /user/hive/warehouse/mytmpr.db/tmpr/000000_0
2006 34
2006 35
2006 32
2007 24
2007 21
2008 37
2008 39
[training@localhost ~]$ 

   default delimiter for hive table is  \001.

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/mytmpr.db/results
Found 1 items
-rw-r--r--   1 training supergroup         33 2016-06-23 20:48 /user/hive/warehouse/mytmpr.db/results/000000_0
[training@localhost ~]$ hadoop fs -cat /user/hive/warehouse/mytmpr.db/results/000000_0
2006 35 32
2007 24 21
2008 39 37
[training@localhost ~]$ 
______________________________

if temperature contains both positives and negatives..

[training@localhost ~]$ cat tmpr2.txt
xxxxx2006xxx34xxx
xxxxx2006xxx-35xxx
xxxxx2006xxx-14xxx
xxxxx2006xxx32xxx
xxxxx2007xxx-24xxx
xxxxx2007xxx-21xxx
xxxxx2007xxx35xxx
xxxxx2008xxx-37xxx
xxxxx2008xxx39xxx
[training@localhost ~]$ 
hive> create database urtmpr;
OK
Time taken: 0.022 seconds
hive> use urtmpr;
OK
Time taken: 0.011 seconds
hive> create table raw(line string);
OK
Time taken: 0.032 seconds
hive> load data local inpath 'tmpr2.txt'
    >  into table raw;

hive> create table tmpr(y int, t int);
OK
Time taken: 0.035 seconds
hive> insert overwrite table tmpr
    >  select * from (
    >    select substr(line,6,4),
    >      substr(line,13,2) from raw
    >   where substr(line,13,1)!='-'
    >      union all
    >    select substr(line,6,4),    
    >      substr(line,13,3) from raw
    >   where substr(line,13,1)='-') x;


-- in hive only "union all" available., which allows duplicates.
-- hive union should be placed as subquery.
-- subquery should have aliase.

hive> select * from tmpr;
OK
2006    34
2006    -35
2006    -14
2006    32
2007    -24
2007    -21
2007    35
2008    -37
2008    39
Time taken: 0.056 seconds
hive> 

hive> create table results(y int, 
    >   max int, min int);
OK
Time taken: 0.047 seconds
hive> insert overwrite table results
    >   select y, max(t) , min(t)
    >   from tmpr group by y;


hive> select * from results;
OK
2006    34      -35
2007    35      -24
2008    39      -37
Time taken: 0.054 seconds
hive> 



_____________________________


Both inner and external tables can use custom locations.


hive> create table hvtab1(line string)
  location '/user/ursloc';
hive> load data local inpath 'file1'
   into table hvtab1;

[training@localhost ~]$ hadoop fs -ls /user/ursloc
Found 1 items
-rw-r--r--   1 training supergroup         61 2016-06-23 21:15 /user/ursloc/file1
[training@localhost ~]$ 

A Single Location can be applied for mutliple tables.

hive> create table hvtab2(line string)
   location '/user/ursloc';

now tables use same data and same location.

_____________

how to reuse:

hive> create external table 
    >   xtab(line string);
OK
Time taken: 0.055 seconds
hive> load data local inpath 'file1'
    >  into table xtab;

hive> select * from xtab;
OK
aaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaa
Time taken: 0.059 seconds
hive> drop table xtab;
OK
Time taken: 0.062 seconds
hive> show tables;
OK
raw
results
tmpr
Time taken: 0.042 seconds
hive> 

hive> create table xtab(line string);
OK
Time taken: 0.031 seconds
hive> select * from xtab;
OK
aaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaa
Time taken: 0.06 seconds
hive> 

-- create behavior:
  if directory is already existed, it will use it, if not existed, it will create a new directory.


_________________

what if, backend directory is deleted.
 what will happen to table?

 table will not be deleted.

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/urtmpr.db/raw
Found 1 items
-rw-r--r--   1 training supergroup        167 2016-06-23 21:00 /user/hive/warehouse/urtmpr.db/raw/tmpr2.txt
[training@localhost ~]$ hadoop fs -rmr /user/hive/warehouse/urtmpr.db/raw
Deleted hdfs://localhost/user/hive/warehouse/urtmpr.db/raw



hive> show tables;
OK
raw
results
tmpr
xtab
Time taken: 0.045 seconds
hive> select * from raw;
OK
Time taken: 0.036 seconds
hive> 
________________













___