Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Friday, 24 June 2016

Hive Lab5 : xml data processing

[training@localhost ~]$ cat sampxml1
<rec>
  <name>Ravi</name>
  <age>25</age>
  <sex>m</sex>
</rec>
<rec>
  <name>Rani</name>
  <sex>f</sex>
  <city>Hyd</city>
</rec>
[training@localhost ~]$ 

hive> create table samp(line string);
OK
Time taken: 0.309 seconds
hive> load data local inpath 'sampxml1'
    >  into table samp;
Copying data from file:/home/training/sampxml1
Copying file: file:/home/training/sampxml1
Loading data to table ourxml.samp
OK
Time taken: 0.202 seconds
hive> 

hive> select * from samp;
OK
<rec>
  <name>Ravi</name>
  <age>25</age>
  <sex>m</sex>
</rec>
<rec>
  <name>Rani</name>
  <sex>f</sex>
  <city>Hyd</city>
</rec>
Time taken: 0.184 seconds
hive> 

 above xml file has 2 records,
 when this file is loaded into hive table, hive treats, each line as a record,
  as per xml == two records.
  as per hive --> 10 records(rows.).

this data is not ready for hive,
hive expects total xml record in a single line.
so data preperation step is required.

Prepare a mapreduce Job , which can convert xml record into single row.

  (Later we see in Mapreduce session).,

[training@localhost ~]$ cat xml1
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
[training@localhost ~]$ 


hive> create table raw(line string);
OK
Time taken: 0.05 seconds
hive> load data local inpath 'xml1'
    >  into table raw;
Copying data from file:/home/training/xml1
Copying file: file:/home/training/xml1
Loading data to table ourxml.raw
OK
Time taken: 0.123 seconds
hive> 


hive> select * from raw;
OK
<rec><name>Ravi</name><age>25</age><city>hyd</city></rec>
<rec><name>Rani</name><age>24</age><sex>f</sex></rec>
<rec><name>Sampath</name><sex>m</sex><city>Del</city></rec>
Time taken: 0.066 seconds
hive> select count(*) from raw;
3
-- 3 rows.

hive> select xpath_string(line,'rec/name') from raw;
Ravi
Rani
Sampath

hive> select 
    >  xpath_string(line,'rec/name'),
    >  xpath_int(line,'rec/age'),
    >  xpath_string(line,'rec/sex'),
    >  xpath_string(line,'rec/city')
    > from raw;

Ravi    25              hyd
Rani    24      f
Sampath 0       m       Del

if string fields is missed, it returns blank string, if numeric field is missed it returns 0.


_________________

xml with nested tags.

[training@localhost ~]$ cat xml2
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
[training@localhost ~]$ 

hive> create table xraw(line string);
OK
Time taken: 0.075 seconds
hive> load data local inpath 'xml2'
    >  into table xraw;

hive> select * from xraw;
OK
<rec><name><fname>Ravi</fname><lname>kumar</lname></name><age>25</age><contact><email><personal>ravi@gmail.com</personal><official>ravi@infy.com</official></email><phone><mobile>12345</mobile><office>12346</office><residence>12347</residence></phone></contact></rec>
Time taken: 0.064 seconds
hive> 

hive> create table info(fname string, 
    >   lname string, 
    > age int, personal_email string,
    >  official_email string,
    >  mobile string, office string, 
    >  residence string);
OK
Time taken: 0.042 seconds
hive> 

hive> insert overwrite table info
    >   select 
    > xpath_string(line,'rec/name/fname'),
    > xpath_string(line,'rec/name/lname'),
    > xpath_int(line,'rec/age'),          
    > xpath_string(line,'rec/contact/email/personal'),
    > xpath_string(line,'rec/contact/email/official'),
    > xpath_string(line,'rec/contact/phone/mobile'),
    > xpath_string(line,'rec/contact/phone/office'),
    > xpath_string(line,'rec/contact/phone/residence') 
    >  from xraw;

hive> select * from info;
OK
Ravi    kumar   25      ravi@gmail.com  ravi@infy.com     12345   12346   12347
Time taken: 0.064 seconds
hive> 
_____________

xml with collections.



[training@localhost ~]$ cat xml3
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>
[training@localhost ~]$ 

hive> create table yraw(line string);
OK
Time taken: 0.043 seconds
hive> load data local inpath 'xml3'
    >  into table yraw;

hive> select * from yraw;
<rec><name>Amar</name><qual>Btech</qual><qual>Mtech</qual></rec>
<rec><name>Amala</name><qual>Bsc</qual><qual>Msc</qual><qual>Mtech</qual></rec>
<rec><name>Akash</name><qual>Btech</qual><qual>Mba</qual></rec>

hive> 
hive> create table raw2(name string, 
    >    qual  array<string>);
OK
Time taken: 0.039 seconds
hive> insert overwrite table raw2
    >   select xpath_string(line,'rec/name'),
    >  xpath(line,'rec/qual/text()')
    >   from yraw;

hive> select * from raw2;
OK
Amar    ["Btech","Mtech"]
Amala   ["Bsc","Msc","Mtech"]
Akash   ["Btech","Mba"]
Time taken: 0.061 seconds
hive> 

hive> select name, size(qual) from raw2;

Amar    2
Amala   3
Akash   2

how to access array elements,

 by using index numbers

 indexing starts from 0.

hive> select qual[0], qual[1],
          qual[2] from raw2;
Btech   Mtech   NULL
Bsc     Msc     Mtech
Btech   Mba     NULL

search for elements with in array.

hive> select * from raw2 
    >  where array_contains(qual,'Mtech');


Amar    ["Btech","Mtech"]
Amala   ["Bsc","Msc","Mtech"]

_______________
Flattening Array elements:

hive> select explode(qual) as q 
   from raw2;

Btech
Mtech
Bsc
Msc
Mtech
Btech
Mba


hive> select name, explode(qual) as q from raw2;
FAILED: Error in semantic analysis: UDTF's are not supported outside the SELECT clause, nor nested in expressions
hive> 

-- above statement is invalid,
   bcoz, udtf s can not be applied with other column expressions.

hive> create table  yinfo(name string, 
    >      qual string);
OK
Time taken: 0.035 seconds
hive> insert overwrite table yinfo 
    >   select name, myq from raw2
    >    lateral view explode(qual) q as myq;

hive> select * from yinfo;
OK
Amar    Btech
Amar    Mtech
Amala   Bsc
Amala   Msc
Amala   Mtech
Akash   Btech
Akash   Mba
Time taken: 0.055 seconds

hive> select * from yinfo   
    >   where qual in ('Msc','Mtech');


Amar    Mtech
Amala   Msc
Amala   Mtech

hive> create table yres(qual string, cnt int);
OK
Time taken: 0.036 seconds
hive> insert overwrite table yres
    >   select qual, count(*) from yinfo
    >    group by qual;

hive> select * from yres;
OK
Bsc     1
Btech   2
Mba     1
Msc     1
Mtech   2
Time taken: 0.051 seconds
hive> 

____________________

Assignment:

[training@localhost ~]$ cat xml4
<tr><cid>101</cid><pr>1000</pr><pr>3000</pr></tr>
<tr><cid>102</cid><pr>1200</pr><pr>2000</pr><pr>5000</pr></tr>
<tr><cid>101</cid><pr>4000</pr></tr>
[training@localhost ~]$ 

hive> create database sales;
hive> use sales;
hive> create table raw(line string);
hive> load data local inpath 'xml4'
   into table raw;
hive> create table raw2(cid string, 
    pr array<string>);

hive> insert overwrite table raw2
   select xpath_string(line,'tr/cid'),
   xpath(line,'tr/pr/text()')
    from raw;
hive> create table raw3(cid string, 
   pr int);
hive> insert overwrite table raw3
    select cid, mypr from  raw2
   lateral view explode(pr) p as mypr;

hive> create table results(cid string, 
   totbill int);
hive> insert overwrite table results
   select cid, sum(pr)
   from raw3
   group by cid;

hive> select * from results;

--- my students are really super and energitic -- 
_______________  __________


















____________________











__________________________________ 




______________________________--

















5 comments:

  1. hi sir, can you please load Hbase class,morning class from 7to 8:30 am

    ReplyDelete
  2. Thank you sir for making so easy to track study notes for us all

    ReplyDelete
  3. Good Post! Thank you so much for sharing this pretty post, it was so good to read and useful to improve my knowledge as updated one, keep blogging.
    Big Data Hadoop Training in electronic city

    ReplyDelete