SreeRam Hadoop Notes: June 2016

Tuesday, 28 June 2016

Hive Lab8 : Xml data processing 3

Hive xml3
______________________

xml tags(nodes) with parameters.
__________________________

[training@localhost ~]$ cat xml6
<tr><cid>101</cid><pr pid="p1">1000</pr><pr pid="p3">3000</pr></tr>
<tr><cid>102</cid><pr pid="p1">1200</pr><pr pid="p2">2000</pr></tr>
<tr><cid>103</cid><pr pid="p2">5000</pr><pr pid="p3">3000</pr></tr>
[training@localhost ~]$

[training@localhost ~]$ cat customers
<rec><cid>101</cid><name>Mani</name><city>Hyd</city></rec>
<rec><cid>102</cid><name>Vani</name><city>Pune</city></rec>
<rec><cid>103</cid><name>Veni</name><city>Hyd</city></rec>
[training@localhost ~]$

[training@localhost ~]$ cat products
<rec><pid>p1</pid><brand>Samsung</brand><pname>Mobile</pname><mrp>5000</mrp></rec>
<rec><pid>p2</pid><brand>Lg</brand><pname>Mobile</pname><mrp>7000</mrp></rec>
<rec><pid>p3</pid><brand>Lg</brand><pname>Laptop</pname><mrp>6000</mrp></rec>
[training@localhost ~]$

hive> create database xml10;
OK
Time taken: 0.034 seconds
hive> use xml10;
OK
Time taken: 0.027 seconds
hive> create table transraw(line string);
OK
Time taken: 0.057 seconds
hive> create table custraw(line string);
OK
Time taken: 0.034 seconds
hive> create table prodraw(line string);
OK
Time taken: 0.038 seconds
hive>

hive> load data local inpath 'customers' into table custraw;
Copying data from file:/home/training/customers
Copying file: file:/home/training/customers
Loading data to table xml10.custraw
OK
Time taken: 0.155 seconds
hive> select * from custraw;
OK
<rec><cid>101</cid><name>Mani</name><city>Hyd</city></rec>
<rec><cid>102</cid><name>Vani</name><city>Pune</city></rec>
<rec><cid>103</cid><name>Veni</name><city>Hyd</city></rec>
Time taken: 0.065 seconds
hive>

hive> create table customers(cid string,
> name string, city string);
OK
Time taken: 0.043 seconds
hive> insert overwrite table customers
> select xpath_string(line,'rec/cid'),
> xpath_string(line,'rec/name'),
> xpath_string(line,'rec/city')
> from custraw;

hive> select * from customers;
OK
101 Mani Hyd
102 Vani Pune
103 Veni Hyd
Time taken: 0.059 seconds
hive>

hive> load data local inpath 'products'
> into table prodraw;
Copying data from file:/home/training/products
Copying file: file:/home/training/products
Loading data to table xml10.prodraw
OK
Time taken: 0.109 seconds
hive> select * from prodraw;
OK
<rec><pid>p1</pid><brand>Samsung</brand><pname>Mobile</pname><mrp>5000</mrp></rec>
<rec><pid>p2</pid><brand>Lg</brand><pname>Mobile</pname><mrp>7000</mrp></rec>
<rec><pid>p3</pid><brand>Lg</brand><pname>Laptop</pname><mrp>6000</mrp></rec>
Time taken: 0.057 seconds
hive>
hive> create table products(pid string,
> pname string, brand string,
> mrp int);
OK
Time taken: 0.036 seconds
hive> insert overwrite table products
> select xpath_string(line,'rec/pid'),
> xpath_string(line,'rec/pname'),
> xpath_string(line,'rec/brand'),
> xpath_int(line,'rec/mrp')
> from prodraw;

hive> select * from products;
OK
p1 Mobile Samsung 5000
p2 Mobile Lg 7000
p3 Laptop Lg 6000
Time taken: 0.06 seconds
hive>

hive> select * from transraw;
OK
Time taken: 0.052 seconds
hive> load data local inpath 'xml6'
> into table transraw;
Copying data from file:/home/training/xml6
Copying file: file:/home/training/xml6
Loading data to table xml10.transraw
OK
Time taken: 0.094 seconds
hive> select * from transraw;
OK
<tr><cid>101</cid><pr pid="p1">1000</pr><pr pid="p3">3000</pr></tr>
<tr><cid>102</cid><pr pid="p1">1200</pr><pr pid="p2">2000</pr></tr>
<tr><cid>103</cid><pr pid="p2">5000</pr><pr pid="p3">3000</pr></tr>
Time taken: 0.056 seconds
hive>

hive> create table transraw2(cid string,
> pid array<string>,
> pr array<string>);
OK
Time taken: 0.034 seconds
hive> insert overwrite table transraw2
> select xpath_string(line,'tr/cid'),
> xpath(line,'tr/pr/@pid'),
> xpath(line,'tr/pr/text()') from transraw;

hive> select * from transraw2;
OK
101 ["p1","p3"] ["1000","3000"]
102 ["p1","p2"] ["1200","2000"]
103 ["p2","p3"] ["5000","3000"]
Time taken: 0.058 seconds
hive> create table cidpid(cid string, pid string);
OK
Time taken: 0.041 seconds
hive> insert overwrite table cidpid
> select cid, mypid from transraw2
> lateral view explode(pid) p as mypid;

hive> select * from cidpid;
OK
101 p1
101 p3
102 p1
102 p2
103 p2
103 p3
Time taken: 0.054 seconds
hive> create table cidpr(cid string, pr int);
OK
Time taken: 0.072 seconds
hive> insert overwrite table cidpr
> select cid, mypy from transraw2
> lateral view explode(pr) p as mypy;

hive> select * from cidpr;
OK
101 1000
101 3000
102 1200
102 2000
103 5000
103 3000
Time taken: 0.069 seconds
hive> alter table cidpid add columns(n int);
OK
Time taken: 0.056 seconds
hive>

add jar Desktop/hivejars.jar;
create temporary function auto
as 'hive.analytics.SeqNumber';

hive> insert overwrite table cidpid
> select cid, pid, auto() from cidpid;

hive> select * from cidpid;
OK
101 p1 1
101 p3 2
102 p1 3
102 p2 4
103 p2 5
103 p3 6
Time taken: 0.066 seconds
hive>

hive> alter table cidpr add columns(n int);
OK
Time taken: 0.057 seconds
hive> insert overwrite table cidpr
> select cid, pr, auto() from cidpr;

hive> select * from cidpr;
OK
101 1000 1
101 3000 2
102 1200 3
102 2000 4
103 5000 5
103 3000 6
Time taken: 0.061 seconds
hive>

hive> create table trans(cid string, pid string,
> pr int);
OK
Time taken: 0.031 seconds
hive> insert overwrite table trans
> select l.cid, pid, pr
> from
> cidpid l join cidpr r
> on (l.n=r.n);

hive> select * from trans;
OK
101 p1 1000
101 p3 3000
102 p1 1200
102 p2 2000
103 p2 5000
103 p3 3000
Time taken: 0.063 seconds
hive>

hive> select * from products;
OK
p1 Mobile Samsung 5000
p2 Mobile Lg 7000
p3 Laptop Lg 6000
Time taken: 0.052 seconds
hive> create table info1(cid string,
> pid string, pname string,
> brand string, mrp int, pr int);
OK
Time taken: 0.075 seconds
hive> insert overwrite table info1
> select cid, l.pid, pname, brand,
> mrp , pr
> from trans l join products r
> on (l.pid = r.pid);
hive> select * from info1;
OK
101 p1 Mobile Samsung 5000 1000
102 p1 Mobile Samsung 5000 1200
102 p2 Mobile Lg 7000 2000
103 p2 Mobile Lg 7000 5000
101 p3 Laptop Lg 6000 3000
103 p3 Laptop Lg 6000 3000
Time taken: 0.057 seconds
hive> select * from customers;
OK
101 Mani Hyd
102 Vani Pune
103 Veni Hyd
Time taken: 0.059 seconds
hive>
hive> create table info(cid string,
> name string, pid string,
> pname string, brand string,
> mrp int, pr int, city string)
> row format delimited
> fields terminated by ',';
OK
Time taken: 0.082 seconds
hive>

hive> insert overwrite table info
> select l.cid, name, pid, pname, brand,
> mrp , pr , city
> from info1 l join customers r
> on (l.cid = r.cid);

hive> select * from info;
OK
101 Mani p1 Mobile Samsung 5000 1000 Hyd
101 Mani p3 Laptop Lg 6000 3000 Hyd
102 Vani p1 Mobile Samsung 5000 1200 Pune
102 Vani p2 Mobile Lg 7000 2000 Pune
103 Veni p2 Mobile Lg 7000 5000 Hyd
103 Veni p3 Laptop Lg 6000 3000 Hyd
Time taken: 0.053 seconds
hive>

___________________________________

Hive Lab7 : Hive Partitioned Tables (A simple Case)

[training@localhost ~]$ cat trans
01/01/2010,30000
01/07/2010,40000
01/08/2010,30000
01/26/2010,40000
02/01/2010,30000
02/07/2010,40000
02/09/2010,30000
02/28/2010,40000
03/01/2010,80000
03/07/2010,40000
04/01/2010,30000
04/17/2010,70000
04/18/2010,80000
04/26/2010,90000
05/01/2010,10000
05/07/2010,20000
05/08/2010,30000
05/26/2010,60000
06/01/2010,30000
06/17/2010,50000
07/01/2010,30000
07/07/2010,40000
08/08/2010,10000
08/26/2010,20000
09/01/2010,90000
09/07/2010,40000
09/18/2010,30000
09/30/2010,40000
10/01/2010,30000
10/07/2010,40000
10/08/2010,30000
11/26/2010,40000
11/01/2010,30000
12/07/2010,40000
12/08/2010,30000
12/26/2010,40000
[training@localhost ~]$

hive> create table sales(dt string,

amt int)
> row format delimited
> fields terminated by ',';
OK
Time taken: 0.046 seconds
hive> load data local inpath 'trans'
> into table sales;
Copying data from

file:/home/training/trans
Copying file:

file:/home/training/trans
Loading data to table cases.sales
OK
Time taken: 0.194 seconds
hive>

insert overwrite table sales2
select * from (
select dt, amt from sales
union all
select concat(substr(dt,1,9),"1")

as dt,
amt+1000 as amt from sales
union all
select concat(substr(dt,1,9),"2") as

dt,
amt+3000 as amt from sales

union all
select concat(substr(dt,1,9),"3") as

dt,
amt+5000 as amt from sales

union all
select concat(substr(dt,1,9),"4") as

dt,
amt+10000 as amt from sales ) x;
hive> create table sales02(dt

array<string>,
> amt int);
OK
Time taken: 0.042 seconds
hive> insert overwrite table sales02
> select split(dt,'/'), amt from

sales2;

hive> select * from sales02 limit 5;
OK
["01","01","2010"] 30000
["01","01","2013"] 35000
["01","01","2011"] 31000
["01","01","2014"] 40000
["01","01","2012"] 33000
Time taken: 0.083 seconds
hive>
hive> create table sales3 like sales;
hive> insert overwrite table sales3
> select concat(dt[2],'-',dt

[0],'-',dt[1]),
> amt from sales02;

hive> create table salespart(dt

string,
> amt int)
> partitioned by (y int, m int, d

int)
> row format delimited
> fields terminated by ',';
OK
Time taken: 0.034 seconds
hive> set

hive.exec.dynamic.partition=true;
hive> set

hive.exec.dynamic.partition.mode=nonst

rict;
hive> set

hive.exec.max.dynamic.partitions.perno

de=10000;
hive> set

hive.exec.max.dynamic.partitions=10000

0;

hive> insert overwrite table salespart
partition(y,m,d)
select dt, amt, year(dt), month

(dt),
day(dt) from sales3;

-- to access a perticular day

hive> select * from salespart
where y=2010 and m=1 and d=8; --> 1

part

-- to access a perticular month.
hive> select * from salespart
where y=2010 and m=1; ---> 31 parts

-- to access a year.

hive> select * from salespart
where y=2014; ---> 366 part

---> range of dates.
Hive> select * from salespart
where (y=2002 and m>=5)
or
(y>2002 and y<2009)
or
(y=2009 and m<=6);

______________________________________

___

________________________________

Hive Lab6 : xml data processing 2

Hive xml2
______________________

hive> create database xxmls;
OK
Time taken: 0.182 seconds
hive> use xxmls;
OK
Time taken: 0.017 seconds

[training@localhost ~]$ cat xml5
<tr><cid>101</cid><pr>3000</pr><qnt>3</qnt><pr>5000</pr><qnt>2</qnt><pr>1000</pr><qnt>5</qnt></tr>
<tr><cid>102</cid><pr>2000</pr><qnt>5</qnt><pr>5000</pr><qnt>2</qnt></tr>
<tr><cid>101</cid><pr>6000</pr><qnt>5</qnt></tr>
[training@localhost ~]$

hive> create table raw(line string);
OK
Time taken: 0.252 seconds
hive> load data local inpath 'xml5'
> into table raw;
Copying data from file:/home/training/xml5
Copying file: file:/home/training/xml5
Loading data to table xxmls.raw
OK
Time taken: 0.178 seconds
hive> select * from raw;
OK
<tr><cid>101</cid><pr>3000</pr><qnt>3</qnt><pr>5000</pr><qnt>2</qnt><pr>1000</pr><qnt>5</qnt></tr>
<tr><cid>102</cid><pr>2000</pr><qnt>5</qnt><pr>5000</pr><qnt>2</qnt></tr>
<tr><cid>101</cid><pr>6000</pr><qnt>5</qnt></tr>
Time taken: 0.155 seconds
hive>

hive> create table raw2(cid int,
> pr array<string>, qnt array<string>);
OK
Time taken: 0.069 seconds
hive> insert overwrite table raw2
> select xpath_int(line,'tr/cid'),
> xpath(line,'tr/pr/text()'),
> xpath(line,'tr/qnt/text()')
> from raw;

hive> create table cidpr(cid int , pr int);
hive> insert overwrite table cidpr
> select cid, mypr from raw2
> lateral view explode(pr) p as mypr;

hive> create table cidqnt(cid int, qnt int);

hive> insert overwrite table cidqnt
> select cid, myqnt from raw2
> lateral view explode(qnt) q as myqnt;

hive> select * from cidpr;
OK
101 3000
101 5000
101 1000
102 2000
102 5000
101 6000
Time taken: 0.06 seconds
hive> select * from cidqnt;
OK
101 3
101 2
101 5
102 5
102 2
101 5
Time taken: 0.046 seconds
hive>

hive> add jar Desktop/hivejars.jar;
hive> create temporary function auto
> as 'hive.analytics.SeqNumber';

hive> alter table cidpr add columns(n int);
hive> insert overwrite table cidpr
select cid, pr, auto() from cidpr;

hive> alter table cidqnt add columns(n int);
hive> insert overwrite table cidqnt
select cid, qnt, auto() from cidqnt;

hive> create table trans(cid int, pr int,
qnt int, bill int);
hive> insert overwrite table trans
select l.cid, pr, qnt, pr*qnt
from cidpr l join cidqnt r
on (l.n=r.n);

hive> create table report(cid int, totbill int);
hive> insert overwrite table report
select cid, sum(bill) from trans
group by cid;
hive> select * from report;
________________________

hive.analytics.SeqNumber.java
______________________
package hive.analytics;

import java.io.IOException;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IntWritable;

public class SeqNumber extends UDF
{
int cnt = 0;
public IntWritable evaluate()
throws IOException
{
cnt++;
return new IntWritable(cnt);
}

}

______________________________

keep all above statements into a script file.

hscript1.hql
________________

create database urheroes;
use urheroes;
create table raw(line string);
load data local inpath 'xml5' into table raw;
create table raw2(cid int, pr array<string>, qnt array<string>);
insert overwrite table raw2
select xpath_int(line,'tr/cid'),
xpath(line,'tr/pr/text()'),
xpath(line,'tr/qnt/text()') from raw;
create table cidpr(cid int, pr int);
insert overwrite table cidpr
select cid, mypr from raw2
lateral view explode(pr) p as mypr;
create table cidqnt(cid int, qnt int);
insert overwrite table cidqnt
select cid, myq from raw2
lateral view explode(qnt) q as myq;
alter table cidpr add columns(n int);
alter table cidqnt add columns(n int);
add jar Desktop/hivejars.jar;
create temporary function auto
as 'hive.analytics.SeqNumber';
insert overwrite table cidpr
select cid, pr, auto() from cidpr;
insert overwrite table cidqnt
select cid, qnt, auto() from cidqnt;
create table trans(cid int, pr int, qnt int,
bill int);
insert overwrite table trans
select l.cid, pr, qnt, pr*qnt
from cidpr l join cidqnt r
on (l.n=r.n);
create table report(cid int, totbill int);
insert overwrite table report
select cid, sum(bill) from trans
group by cid;
select * from trans;
select * from report;
drop table raw;
drop table raw2;
drop table cidpr;
drop table cidqnt;
________________________

how to submit script:

$ hive -f hscript1.hql
_______________________________________

Friday, 24 June 2016

Hive Lab5 : xml data processing

[training@localhost ~]$ cat sampxml1
<rec>
<name>Ravi</name>
<age>25</age>
<sex>m</sex>
</rec>
<rec>
<name>Rani</name>
<sex>f</sex>
<city>Hyd</city>
</rec>
[training@localhost ~]$

hive> create table samp(line string);
OK
Time taken: 0.309 seconds
hive> load data local inpath 'sampxml1'
> into table samp;
Copying data from file:/home/training/sampxml1
Copying file: file:/home/training/sampxml1
Loading data to table ourxml.samp
OK
Time taken: 0.202 seconds
hive>

hive> select * from samp;
OK
<rec>
<name>Ravi</name>
<age>25</age>
<sex>m</sex>
</rec>
<rec>
<name>Rani</name>
<sex>f</sex>
<city>Hyd</city>
</rec>
Time taken: 0.184 seconds
hive>

above xml file has 2 records,
when this file is loaded into hive table, hive treats, each line as a record,
as per xml == two records.
as per hive --> 10 records(rows.).

this data is not ready for hive,
hive expects total xml record in a single line.
so data preperation step is required.

Prepare a mapreduce Job , which can convert xml record into single row.

(Later we see in Mapreduce session).,

[training@localhost ~]$ cat xml1
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
[training@localhost ~]$

hive> create table raw(line string);
OK
Time taken: 0.05 seconds
hive> load data local inpath 'xml1'
> into table raw;
Copying data from file:/home/training/xml1
Copying file: file:/home/training/xml1
Loading data to table ourxml.raw
OK
Time taken: 0.123 seconds
hive>

hive> select * from raw;
OK
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
Time taken: 0.066 seconds
hive> select count(*) from raw;
3
-- 3 rows.

hive> select xpath_string(line,'rec/name') from raw;
Ravi
Rani
Sampath

hive> select
> xpath_string(line,'rec/name'),
> xpath_int(line,'rec/age'),
> xpath_string(line,'rec/sex'),
> xpath_string(line,'rec/city')
> from raw;

Ravi 25 hyd
Rani 24 f
Sampath 0 m Del

if string fields is missed, it returns blank string, if numeric field is missed it returns 0.

_________________

xml with nested tags.

[training@localhost ~]$ cat xml2
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
[training@localhost ~]$

hive> create table xraw(line string);
OK
Time taken: 0.075 seconds
hive> load data local inpath 'xml2'
> into table xraw;

hive> select * from xraw;
OK
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
Time taken: 0.064 seconds
hive>

hive> create table info(fname string,
> lname string,
> age int, personal_email string,
> official_email string,
> mobile string, office string,
> residence string);
OK
Time taken: 0.042 seconds
hive>

hive> insert overwrite table info
> select
> xpath_string(line,'rec/name/fname'),
> xpath_string(line,'rec/name/lname'),
> xpath_int(line,'rec/age'),
> xpath_string(line,'rec/contact/email/personal'),
> xpath_string(line,'rec/contact/email/official'),
> xpath_string(line,'rec/contact/phone/mobile'),
> xpath_string(line,'rec/contact/phone/office'),
> xpath_string(line,'rec/contact/phone/residence')
> from xraw;

hive> select * from info;
OK
Ravi kumar 25 ravi@gmail.com ravi@infy.com 12345 12346 12347
Time taken: 0.064 seconds
hive>
_____________

xml with collections.

[training@localhost ~]$ cat xml3
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>
[training@localhost ~]$

hive> create table yraw(line string);
OK
Time taken: 0.043 seconds
hive> load data local inpath 'xml3'
> into table yraw;

hive> select * from yraw;
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>

hive>
hive> create table raw2(name string,
> qual array<string>);
OK
Time taken: 0.039 seconds
hive> insert overwrite table raw2
> select xpath_string(line,'rec/name'),
> xpath(line,'rec/qual/text()')
> from yraw;

hive> select * from raw2;
OK
Amar ["Btech","Mtech"]
Amala ["Bsc","Msc","Mtech"]
Akash ["Btech","Mba"]
Time taken: 0.061 seconds
hive>

hive> select name, size(qual) from raw2;

Amar 2
Amala 3
Akash 2

how to access array elements,

by using index numbers

indexing starts from 0.

hive> select qual[0], qual[1],
qual[2] from raw2;
Btech Mtech NULL
Bsc Msc Mtech
Btech Mba NULL

search for elements with in array.

hive> select * from raw2
> where array_contains(qual,'Mtech');

Amar ["Btech","Mtech"]
Amala ["Bsc","Msc","Mtech"]

_______________
Flattening Array elements:

hive> select explode(qual) as q
from raw2;

Btech
Mtech
Bsc
Msc
Mtech
Btech
Mba

hive> select name, explode(qual) as q from raw2;
FAILED: Error in semantic analysis: UDTF's are not supported outside the SELECT clause, nor nested in expressions
hive>

-- above statement is invalid,
bcoz, udtf s can not be applied with other column expressions.

hive> create table yinfo(name string,
> qual string);
OK
Time taken: 0.035 seconds
hive> insert overwrite table yinfo
> select name, myq from raw2
> lateral view explode(qual) q as myq;

hive> select * from yinfo;
OK
Amar Btech
Amar Mtech
Amala Bsc
Amala Msc
Amala Mtech
Akash Btech
Akash Mba
Time taken: 0.055 seconds

hive> select * from yinfo
> where qual in ('Msc','Mtech');

Amar Mtech
Amala Msc
Amala Mtech

hive> create table yres(qual string, cnt int);
OK
Time taken: 0.036 seconds
hive> insert overwrite table yres
> select qual, count(*) from yinfo
> group by qual;

hive> select * from yres;
OK
Bsc 1
Btech 2
Mba 1
Msc 1
Mtech 2
Time taken: 0.051 seconds
hive>

____________________

Assignment:

[training@localhost ~]$ cat xml4
<tr><cid>101</cid><pr>1000</pr><pr>3000</pr></tr>
<tr><cid>102</cid><pr>1200</pr><pr>2000</pr><pr>5000</pr></tr>
<tr><cid>101</cid><pr>4000</pr></tr>
[training@localhost ~]$

hive> create database sales;
hive> use sales;
hive> create table raw(line string);
hive> load data local inpath 'xml4'
into table raw;
hive> create table raw2(cid string,
pr array<string>);

hive> insert overwrite table raw2
select xpath_string(line,'tr/cid'),
xpath(line,'tr/pr/text()')
from raw;
hive> create table raw3(cid string,
pr int);
hive> insert overwrite table raw3
select cid, mypr from raw2
lateral view explode(pr) p as mypr;

hive> create table results(cid string,
totbill int);
hive> insert overwrite table results
select cid, sum(pr)
from raw3
group by cid;

hive> select * from results;

--- my students are really super and energitic --
_______________ __________

____________________

__________________________________

______________________________--

Hive Lab4 : Bucketing Tables

hive> create database bucks;
OK
Time taken: 1.775 seconds
hive> use bucks;
OK
Time taken: 0.022 seconds
hive> create table sales(pid string, pr int)
> row format delimited
> fields terminated by ',';
OK
Time taken: 0.364 seconds
hive> load data local inpath 'sales'
> into table sales;
Copying data from file:/home/training/sales
Copying file: file:/home/training/sales
Loading data to table bucks.sales
OK
Time taken: 0.187 seconds
hive>

hive> create table buckstab(pid string,
> pr int)
> clustered by (pid)
> into 3 buckets;
OK
Time taken: 0.051 seconds
hive>

loading data into buckets.

hive> set hive.enforce.bucketing=true;
hive> insert overwrite table buckstab
> select * from sales;

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/bucks.db/sales
Found 1 items
-rw-r--r-- 1 training supergroup 185 2016-06-21 07:35 /user/hive/warehouse/bucks.db/sales/sales
[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/bucks.db/buckstab
Found 3 items
-rw-r--r-- 1 training supergroup 62 2016-06-21 07:40 /user/hive/warehouse/bucks.db/buckstab/000000_0
-rw-r--r-- 1 training supergroup 35 2016-06-21 07:40 /user/hive/warehouse/bucks.db/buckstab/000001_0
-rw-r--r-- 1 training supergroup 88 2016-06-21 07:40 /user/hive/warehouse/bucks.db/buckstab/000002_0
[training@localhost ~]$

Hive Lab3 : Partitioned Tables

Hi I will update this, soon.
Thank You.

Hive Lab2 : Temperature data Handling and Other operations

[training@localhost ~]$ gedit tmpr.txt
[training@localhost ~]$ cat tmpr.txt
xxxxx2006xxx34xxx
xxxxx2006xxx35xxx
xxxxx2006xxx32xxx
xxxxx2007xxx24xxx
xxxxx2007xxx21xxx
xxxxx2008xxx37xxx
xxxxx2008xxx39xxx
[training@localhost ~]$

hive> create database mytmpr;
OK
Time taken: 1.756 seconds
hive> use mytmpr;
OK
Time taken: 0.032 seconds
hive> create table raw(line string);
OK
Time taken: 0.345 seconds
hive>

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/mytmpr.db
Found 1 items
drwxr-xr-x - training supergroup 0 2016-06-23 20:31 /user/hive/warehouse/mytmpr.db/raw
[training@localhost ~]$

-- when database is created, with name database and .db extension one directory will be created in warehouse location.

/user/hive/warehouse/mytmpr.db

hive> load data local inpath 'tmpr.txt'
> into table raw;

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/mytmpr.db/raw
Found 1 items
-rw-r--r-- 1 training supergroup 126 2016-06-23 20:35 /user/hive/warehouse/mytmpr.db/raw/tmpr.txt
[training@localhost ~]$
hive> select * from raw;
OK
xxxxx2006xxx34xxx
xxxxx2006xxx35xxx
xxxxx2006xxx32xxx
xxxxx2007xxx24xxx
xxxxx2007xxx21xxx
xxxxx2008xxx37xxx
xxxxx2008xxx39xxx
Time taken: 0.186 seconds
hive>

hive> create table tmpr(y int, t int);
OK
Time taken: 0.058 seconds
hive> insert overwrite table tmpr
> select substr(line,6,4),
> substr(line,13,2) from raw;

hive> select * from tmpr;
OK
2006 34
2006 35
2006 32
2007 24
2007 21
2008 37
2008 39
Time taken: 0.057 seconds
hive> describe tmpr;
OK
y int
t int
Time taken: 0.065 seconds
hive>
hive> create table results(y int,
max int, min int);
hive> insert overwrite table results
select y, max(t), min(t)
from tmpr group by y;

hive> select * from results;
OK
2006 35 32
2007 24 21
2008 39 37
Time taken: 0.058 seconds
hive>

[training@localhost ~]$ hadoop fs -cat /user/hive/warehouse/mytmpr.db/tmpr/000000_0
2006 34
2006 35
2006 32
2007 24
2007 21
2008 37
2008 39
[training@localhost ~]$

default delimiter for hive table is \001.

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/mytmpr.db/results
Found 1 items
-rw-r--r-- 1 training supergroup 33 2016-06-23 20:48 /user/hive/warehouse/mytmpr.db/results/000000_0
[training@localhost ~]$ hadoop fs -cat /user/hive/warehouse/mytmpr.db/results/000000_0
2006 35 32
2007 24 21
2008 39 37
[training@localhost ~]$
______________________________

if temperature contains both positives and negatives..

[training@localhost ~]$ cat tmpr2.txt
xxxxx2006xxx34xxx
xxxxx2006xxx-35xxx
xxxxx2006xxx-14xxx
xxxxx2006xxx32xxx
xxxxx2007xxx-24xxx
xxxxx2007xxx-21xxx
xxxxx2007xxx35xxx
xxxxx2008xxx-37xxx
xxxxx2008xxx39xxx
[training@localhost ~]$
hive> create database urtmpr;
OK
Time taken: 0.022 seconds
hive> use urtmpr;
OK
Time taken: 0.011 seconds
hive> create table raw(line string);
OK
Time taken: 0.032 seconds
hive> load data local inpath 'tmpr2.txt'
> into table raw;

hive> create table tmpr(y int, t int);
OK
Time taken: 0.035 seconds
hive> insert overwrite table tmpr
> select * from (
> select substr(line,6,4),
> substr(line,13,2) from raw
> where substr(line,13,1)!='-'
> union all
> select substr(line,6,4),
> substr(line,13,3) from raw
> where substr(line,13,1)='-') x;

-- in hive only "union all" available., which allows duplicates.
-- hive union should be placed as subquery.
-- subquery should have aliase.

hive> select * from tmpr;
OK
2006 34
2006 -35
2006 -14
2006 32
2007 -24
2007 -21
2007 35
2008 -37
2008 39
Time taken: 0.056 seconds
hive>

hive> create table results(y int,
> max int, min int);
OK
Time taken: 0.047 seconds
hive> insert overwrite table results
> select y, max(t) , min(t)
> from tmpr group by y;

hive> select * from results;
OK
2006 34 -35
2007 35 -24
2008 39 -37
Time taken: 0.054 seconds
hive>

_____________________________

Both inner and external tables can use custom locations.

hive> create table hvtab1(line string)
location '/user/ursloc';
hive> load data local inpath 'file1'
into table hvtab1;

[training@localhost ~]$ hadoop fs -ls /user/ursloc
Found 1 items
-rw-r--r-- 1 training supergroup 61 2016-06-23 21:15 /user/ursloc/file1
[training@localhost ~]$

A Single Location can be applied for mutliple tables.

hive> create table hvtab2(line string)
location '/user/ursloc';

now tables use same data and same location.

_____________

how to reuse:

hive> create external table
> xtab(line string);
OK
Time taken: 0.055 seconds
hive> load data local inpath 'file1'
> into table xtab;

hive> select * from xtab;
OK
aaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaa
Time taken: 0.059 seconds
hive> drop table xtab;
OK
Time taken: 0.062 seconds
hive> show tables;
OK
raw
results
tmpr
Time taken: 0.042 seconds
hive>

hive> create table xtab(line string);
OK
Time taken: 0.031 seconds
hive> select * from xtab;
OK
aaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaa
Time taken: 0.06 seconds
hive>

-- create behavior:
if directory is already existed, it will use it, if not existed, it will create a new directory.

_________________

what if, backend directory is deleted.
what will happen to table?

table will not be deleted.

[training@localhost ~]$ hadoop fs -ls /user/hive/warehouse/urtmpr.db/raw
Found 1 items
-rw-r--r-- 1 training supergroup 167 2016-06-23 21:00 /user/hive/warehouse/urtmpr.db/raw/tmpr2.txt
[training@localhost ~]$ hadoop fs -rmr /user/hive/warehouse/urtmpr.db/raw
Deleted hdfs://localhost/user/hive/warehouse/urtmpr.db/raw

hive> show tables;
OK
raw
results
tmpr
xtab
Time taken: 0.045 seconds
hive> select * from raw;
OK
Time taken: 0.036 seconds
hive>
________________

___

Data science Software Course Training in Ameerpet Hyderabad

Tuesday, 28 June 2016

Friday, 24 June 2016