Data science Software Course Training in Ameerpet Hyderabad

Data science Software Course Training in Ameerpet Hyderabad

Saturday, 11 June 2016

Pig Lab3

load, foreach , 

3) dump :-
    
    to execute data flow.
   writes output into console.

    entire dataflow will be converted as single map reduce job.

   ex:

  grunt> dump res;

______________________________________________

4) store:-

    to execute data flow.
   writes output into disk (local/hdfs)

   grunt> store res into 'results1';
    -- tab is delimiter.
         
   grunt> store res into 'results2' 
          using PigStorage(',');

   load and store operators use Storage Methods.
_______________________________________
     
5) filter :-

      to create subsets , based on given criteria.

     (row filter, equivalant to 'where' clause of sql select statement )

 grunt> e1 = filter emp by (sex=='m');
grunt> dump e1

grunt> e1 = filter emp by (sex=='m' and dno==12)
>> ;
grunt> dump e1
   
___________________________________________

6) Limit :-

    to fetch top n number of tuples.

 grunt> top3 = limit emp 3;
grunt> dump top3

(101,vino,26000,m,11)
(102,Sri,25000,f,11)
(103,mohan,13000,m,13)

7) sample:-

     to create subsets, in random sample style.

(102,Sri,25000,f,11)
(104,lokitha,8000,f,12)
(203,ccc,10000,f,13)

grunt> rs = sample emp 0.5;
grunt> dump rs      

_________________________________________________

8) Aggregated functions in PigLatin.

   SUM(), AVG(), MAX(), MIN(), COUNT()

   
grunt> r = foreach emp generate SUM(sal) as tot;
grunt> dump r

   ABOVE statement will be failed during execution,.

    bcoz, AGGREGATED functions are applied only on inner bags.
   when you group data , inner bags will be produced.

_________________________________________

9) group: -
      to get inner bags foreach data group.
   based on grouping field.

grunt> describe emp;
emp: {id: int,name: chararray,sal: int,sex: chararray,dno: int}
grunt> -- select sex, sum(sal) from emp group by sex
grunt> e = foreach emp generate sex, sal;
grunt> bySex = group e by sex;
grunt> describe bySex
bySex: {group: chararray,e: {sex: chararray,sal: int}}
grunt> dump bySex

grunt> res = foreach bySex generate
>>         group as sex, SUM(e.sal) as tot;
grunt> describe res
res: {sex: chararray,tot: long}
grunt> store res into 'myhdfs1';

grunt> cat myhdfs1/part-r-00000
f       103000
m       125000
____________________________________


grunt> describe emp
emp: {id: int,name: chararray,sal: int,sex: chararray,dno: int}
grunt> ee = foreach emp generate dno, sal;
grunt> byDno = group ee by dno;
grunt> res = foreach byDno generate 
>>      group as dno, SUM(ee.sal) as tot;
grunt> store res into 'pdemo/res1';

grunt> ls pdemo/res1
hdfs://localhost/user/training/pdemo/res1/_logs <dir>
hdfs://localhost/user/training/pdemo/res1/part-r-00000<r 1>       28
grunt> cat pdemo/res1/part-r-00000
11      51000
12      48000
13      129000
grunt> 

grunt> -- single grouping and multiple aggregations
grunt> res1 = foreach bySex generate
>>    group as sex, 
>>     SUM(e.sal) as tot,
>>    AVG(e.sal) as avg,
>>    MAX(e.sal) as min,
>>    MIN(e.sal) as mn,
>>    COUNT(e) as cnt;
grunt> dump res1
(f,103000,20600.0,50000,8000,5)
(m,125000,25000.0,50000,6000,5)

_________________________________

grunt> -- multi grouping..
grunt> e = foreach emp generate dno, sex, sal;
grunt> grp = group e by dno, sex;
2016-06-09 19:28:35,893 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1000: Error during parsing. Unrecognized alias sex
Details at logfile: /home/training/pig_1465437040017.log
grunt> 
  above statement is invalid.
  pig does not allow groping by multiple fields.

  solution:
   make multiple fields as a tuple field, and group it by tuple.

grunt> grp = group e by (dno, sex);
grunt> describe grp
grp: {group: (dno: int,sex: chararray),e: {dno: int,sex: chararray,sal: int}}
grunt> res = foreach grp generate 
>>     group.dno, group.sex, SUM(e.sal) as tot; 
grunt> dump res
(11,f,25000)
(11,m,26000)
(12,f,18000)
(12,m,30000)
(13,f,60000)
(13,m,69000)
_________________________________
grunt> -- select sum(sal) from emp;
grunt> -- old one
grunt> e = foreach emp generate 'ibm' as org, sal;
grunt> dump e; 

(ibm,26000)
(ibm,25000)
(ibm,13000)
(ibm,8000)
(ibm,6000)
(ibm,10000)
(ibm,30000)
(ibm,50000)
(ibm,10000)
(ibm,50000)

grunt> grp = group e by org;
grunt> res = foreach grp generate 
>>         SUM(e.sal) as tot;
grunt> dump res
(228000)

_____________________________________

 2nd one --- for entire column aggregation.

grunt> describe emp;
emp: {id: int,name: chararray,sal: int,sex: chararray,dno: int}
grunt> e = foreach emp generate sal;
grunt> grp = group e all;
grunt> dump grp

(all,{(26000),(25000),(13000),(8000),(6000),(10000),(30000),(50000),(10000),(50000)})

grunt> res = foreach grp generate 
>>     SUM(e.sal) as tot, 
>>    AVG(e.sal) as avg, MAX(e.sal) as max,
>>    MIN(e.sal) as min, COUNT(e) as cnt;
grunt> dump res
(228000,22800.0,50000,6000,10)
_____________________________________________
Assignment:

  describe emp--> id,name,sal,sex,dno

________________________________
    Marketing A 30
    marketing   B       40
     :
     :
    Fin A 50
     :
    Fin D       30
_________________________

____________________________________________________

Cogroup:


































































































  









    











































































5 comments:

  1. root@ubuntu:/home# mkdir nrd
    root@ubuntu:/home# cd nrd/
    root@ubuntu:/home/nrd# nano emp
    root@ubuntu:/home/nrd# cat emp
    100,aa1,10000,m,10
    101,aa2,80000,f,10
    102,aa3,20000,m,10
    103,aa4,30000,f,10
    104,aa5,40000,m,10
    105,aa6,50000,f,10
    106,aa7,60000,m,10
    107,aa8,70000,f,10
    109,aa9,90000,m,10
    200,ba1,10000,m,11
    201,ba2,80000,f,11
    202,ba3,20000,m,11
    203,ba4,30000,f,11
    204,ba5,40000,m,11
    205,ba6,50000,f,11
    206,ba7,60000,m,11
    207,ba8,70000,f,11
    209,ba9,90000,m,11
    300,ca1,10000,m,12
    301,ca2,80000,f,12
    302,ca3,20000,m,12
    303,ca4,30000,f,12
    304,ca5,40000,m,12
    305,ca6,50000,f,12
    306,ca7,60000,m,12
    307,ca8,70000,f,12
    309,ca9,90000,m,12
    400,da1,10000,m,13
    401,da2,80000,f,13
    402,da3,20000,m,13
    403,da4,30000,f,13
    404,da5,40000,m,13
    405,da6,50000,f,13
    406,da7,60000,m,13
    407,da8,70000,f,13
    409,da9,90000,m,13

    root@ubuntu:/home/nrd# hadoop fs -mkdir /PIG1
    root@ubuntu:/home/nrd# hadoop fs -put emp /PIG1/


    root@ubuntu:/home/nrd# pig

    grunt> e = load '/PIG1/emp' using PigStorage(',') as (eid:int,ename:chararray,esal:int,gender:chararray,dno:int);

    grunt> dump e;

    (101,aa2,80000,f,10)
    (102,aa3,20000,m,10)
    (103,aa4,30000,f,10)
    (104,aa5,40000,m,10)
    (105,aa6,50000,f,10)
    (106,aa7,60000,m,10)
    (107,aa8,70000,f,10)
    (109,aa9,90000,m,10)
    (200,ba1,10000,m,11)
    (201,ba2,80000,f,11)
    (202,ba3,20000,m,11)
    (203,ba4,30000,f,11)
    (204,ba5,40000,m,11)
    (205,ba6,50000,f,11)
    (206,ba7,60000,m,11)
    (207,ba8,70000,f,11)
    (209,ba9,90000,m,11)
    (300,ca1,10000,m,12)
    (301,ca2,80000,f,12)
    (302,ca3,20000,m,12)
    (303,ca4,30000,f,12)
    (304,ca5,40000,m,12)
    (305,ca6,50000,f,12)
    (306,ca7,60000,m,12)
    (307,ca8,70000,f,12)
    (309,ca9,90000,m,12)
    (400,da1,10000,m,13)
    (401,da2,80000,f,13)
    (402,da3,20000,m,13)
    (403,da4,30000,f,13)
    (404,da5,40000,m,13)
    (405,da6,50000,f,13)
    (406,da7,60000,m,13)
    (407,da8,70000,f,13)
    (409,da9,90000,m,13)

    grunt> e1 = foreach e generate eid,ename,(esal >= 70000 ? 'A' :(esal >= 50000 ? 'B' : (esal >= 30000 ? 'C' : 'D' ))) as grade,gender,(dno==10 ? 'Marketing' : (dno==11 ? 'HR' : (dno==12 ? 'Sales' : 'Others'))) as dname;


    grunt> dump e1;


    (100,aa1,D,m,Marketing)
    (101,aa2,A,f,Marketing)
    (102,aa3,D,m,Marketing)
    (103,aa4,C,f,Marketing)
    (104,aa5,C,m,Marketing)
    (105,aa6,B,f,Marketing)
    (106,aa7,B,m,Marketing)
    (107,aa8,A,f,Marketing)
    (109,aa9,A,m,Marketing)
    (200,ba1,D,m,HR)
    (201,ba2,A,f,HR)
    (202,ba3,D,m,HR)
    (203,ba4,C,f,HR)
    (204,ba5,C,m,HR)
    (205,ba6,B,f,HR)
    (206,ba7,B,m,HR)
    (207,ba8,A,f,HR)
    (209,ba9,A,m,HR)
    (300,ca1,D,m,Sales)
    (301,ca2,A,f,Sales)
    (302,ca3,D,m,Sales)
    (303,ca4,C,f,Sales)
    (304,ca5,C,m,Sales)
    (305,ca6,B,f,Sales)
    (306,ca7,B,m,Sales)
    (307,ca8,A,f,Sales)
    (309,ca9,A,m,Sales)
    (400,da1,D,m,Others)
    (401,da2,A,f,Others)
    (402,da3,D,m,Others)
    (403,da4,C,f,Others)
    (404,da5,C,m,Others)
    (405,da6,B,f,Others)
    (406,da7,B,m,Others)
    (407,da8,A,f,Others)
    (409,da9,A,m,Others)


    grunt> e2 = group e1 by (dname,grade);

    grunt> describe e2;
    e2: {group: (dname: chararray,grade: chararray),e1: {eid: int,ename: chararray,grade: chararray,gender: chararray,dname: chararray}}

    ReplyDelete
  2. dump e2;

    ((HR,A),{(209,ba9,A,m,HR),(207,ba8,A,f,HR),(201,ba2,A,f,HR)})
    ((HR,B),{(206,ba7,B,m,HR),(205,ba6,B,f,HR)})
    ((HR,C),{(204,ba5,C,m,HR),(203,ba4,C,f,HR)})
    ((HR,D),{(200,ba1,D,m,HR),(202,ba3,D,m,HR)})
    ((Sales,A),{(307,ca8,A,f,Sales),(309,ca9,A,m,Sales),(301,ca2,A,f,Sales)})
    ((Sales,B),{(305,ca6,B,f,Sales),(306,ca7,B,m,Sales)})
    ((Sales,C),{(303,ca4,C,f,Sales),(304,ca5,C,m,Sales)})
    ((Sales,D),{(302,ca3,D,m,Sales),(300,ca1,D,m,Sales)})
    ((Others,A),{(409,da9,A,m,Others),(401,da2,A,f,Others),(407,da8,A,f,Others)})
    ((Others,B),{(406,da7,B,m,Others),(405,da6,B,f,Others)})
    ((Others,C),{(403,da4,C,f,Others),(404,da5,C,m,Others)})
    ((Others,D),{(400,da1,D,m,Others),(402,da3,D,m,Others)})
    ((Marketing,A),{(109,aa9,A,m,Marketing),(101,aa2,A,f,Marketing),(107,aa8,A,f,Marketing)})
    ((Marketing,B),{(106,aa7,B,m,Marketing),(105,aa6,B,f,Marketing)})
    ((Marketing,C),{(103,aa4,C,f,Marketing),(104,aa5,C,m,Marketing)})
    ((Marketing,D),{(102,aa3,D,m,Marketing),(100,aa1,D,m,Marketing)})


    grunt> e3 = foreach e2 generate flatten(group),COUNT(e1);

    grunt> describe e3;

    e3: {group::dname: chararray,group::grade: chararray,long}

    grunt> dump e3;

    (HR,A,3)
    (HR,B,2)
    (HR,C,2)
    (HR,D,2)
    (Sales,A,3)
    (Sales,B,2)
    (Sales,C,2)
    (Sales,D,2)
    (Others,A,3)
    (Others,B,2)
    (Others,C,2)
    (Others,D,2)
    (Marketing,A,3)
    (Marketing,B,2)
    (Marketing,C,2)
    (Marketing,D,2)


    grunt> store e3 into '/PIG1/PigAsgn1' using PigStorage('\t');
    root@ubuntu:/home/nrd# hadoop fs -cat /PIG1/PigAsgn1/part-r-00000
    HR A 3
    HR B 2
    HR C 2
    HR D 2
    Sales A 3
    Sales B 2
    Sales C 2
    Sales D 2
    Others A 3
    Others B 2
    Others C 2
    Others D 2
    Marketing A 3
    Marketing B 2
    Marketing C 2
    Marketing D 2

    ReplyDelete