Data science Software Course Training in Ameerpet Hyderabad

Tuesday 27 September 2016

MR Lab 8 : Entire Column Aggregations, Elimination of Duplicates

Performing an entire-column aggregation. The goal is the MapReduce equivalent of:

ex:
   select sum(sal) from emp;
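
The emp input is the comma-separated employee file used in the earlier labs; the mapper below reads the third field as the salary. A hypothetical row layout (illustrative only, not the actual lab data):

   101,amar,26000,11
   102,vani,32000,12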

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;

public class SalMap extends Mapper<LongWritable,Text,Text,IntWritable>
{
  public void map(LongWritable k, Text v, Context con)
   throws IOException, InterruptedException
   {
      String line = v.toString();
      // the salary is the third comma-separated field of the row
      int sal = Integer.parseInt(line.split(",")[2]);
      // emit every salary under one constant key, so the reducer
      // receives the whole column as a single group
      con.write( new Text("Ibm"), new IntWritable(sal));
   }
}
------------
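
The driver below reuses the RedForSum reducer from the earlier lab. For completeness, a minimal sketch of what that sum reducer looks like (an assumption based on the driver below, not the original listing):

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// sketch of the sum reducer the driver references; it totals all
// salaries that arrive under the single "Ibm" key
public class RedForSum extends Reducer<Text,IntWritable,Text,IntWritable>
{
    public void reduce(Text k, Iterable<IntWritable> vlist, Context con)
     throws IOException, InterruptedException
    {
        int tot = 0;
        for (IntWritable v : vlist)
            tot += v.get();          // accumulate the column total
        con.write(k, new IntWritable(tot));
    }
}
------------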


package mr.analytics;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driverx
{

 public static void main(String[] args) throws Exception {
 
  Configuration c = new Configuration();
  Job j = new Job(c, "colaggr");
  j.setJarByClass(Driverx.class);

  j.setMapperClass(SalMap.class);
  j.setReducerClass(RedForSum.class);   // sum reducer from the earlier lab
  j.setOutputKeyClass(Text.class);
  j.setOutputValueClass(IntWritable.class);

 
  FileInputFormat.addInputPath(j, new Path(args[0]));
  FileOutputFormat.setOutputPath(j, new Path(args[1]));
  System.exit(j.waitForCompletion(true) ? 0:1);
 
 }

}


[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/emp mrlab/today1

[training@localhost ~]$ hadoop fs -cat mrlab/today1/part-r-00000
Ibm     88000
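
Because every record is emitted under the single constant key, all salaries funnel into one reduce group. On larger inputs, the same RedForSum class can optionally be registered as a combiner so partial sums are computed map-side before the shuffle (an optional tweak, not part of the original lab run):

  j.setCombinerClass(RedForSum.class);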

--------------------------------------------------

Eliminating duplicate rows based on an entire-row match.

ex:
   select distinct * from profiles;

[training@localhost ~]$ cat > profiles
101,aaa
102,bbb
101,aaa
101,aaa
101,aaa
102,bbb
103,ccc
101,xxx
101,aaa
[training@localhost ~]$ hadoop fs -copyFromLocal profiles mrlab
[training@localhost ~]$


package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NoDupeRowsMap extends Mapper<LongWritable,Text,Text,NullWritable>
{
  public void map(LongWritable k, Text v, Context con)
   throws IOException, InterruptedException
   {
      // emit the whole row as the key; the shuffle groups identical
      // rows together, so each distinct row reaches the reducer once
      con.write( v , NullWritable.get());
   }
}

---------
package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RedForNoDupeRows  extends Reducer<Text,NullWritable,Text,NullWritable>
{
    public void reduce(Text k,Iterable<NullWritable> vlist, Context con)
    throws IOException, InterruptedException
    {
        // each distinct row arrives exactly once as a key; write it
        // out once, regardless of how many duplicates it had
        con.write(k, NullWritable.get());
    }

}

----
package mr.analytics;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driverx
{

 public static void main(String[] args) throws Exception {
 
  Configuration c = new Configuration();
  Job j = new Job(c, "nodupes");
  j.setJarByClass(Driverx.class);

  j.setMapperClass(NoDupeRowsMap.class);
  j.setReducerClass(RedForNoDupeRows.class);
  j.setOutputKeyClass(Text.class);
  j.setOutputValueClass(NullWritable.class);

 
  FileInputFormat.addInputPath(j, new Path(args[0]));
  FileOutputFormat.setOutputPath(j, new Path(args[1]));
  System.exit(j.waitForCompletion(true) ? 0:1);
 
 }

}
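
As with the sum job, the reducer here can optionally double as a combiner, because its input and output types are both (Text, NullWritable); duplicate rows are then dropped map-side before the shuffle. One extra line in the driver above (optional, not used in the run below):

  j.setCombinerClass(RedForNoDupeRows.class);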




[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles mrlab/today3



[training@localhost ~]$ hadoop fs -cat mrlab/today3/part-r-00000
101,aaa
101,xxx
102,bbb
103,ccc
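
Note that the output comes back sorted: since the entire row is the map output key and MapReduce sorts keys before the reduce phase, the distinct rows appear in key order.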

---------------------------

Eliminating duplicates based on a key-column match, i.e. keeping one row per id. A few extra rows were first appended to the local profiles file, which was then copied to HDFS as profiles2:



[training@localhost ~]$ hadoop fs -copyFromLocal profiles mrlab/profiles2
[training@localhost ~]$ hadoop fs -cat mrlab/profiles2
101,aaa
102,bbb
101,aaa
101,aaa
101,aaa
102,bbb
103,ccc
101,xxx
101,aaa
101,abc
101,bbc
102,def
[training@localhost ~]$



package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NoDupeKeysMap extends Mapper<LongWritable,Text,Text,Text>
{
  public void map(LongWritable k, Text v, Context con)
   throws IOException, InterruptedException
   {
      String line = v.toString();
      String[] w = line.split(",");
      String id = w[0];
      // key on the id column only, so rows sharing an id land in
      // the same reduce group; the full row travels as the value
      con.write( new Text(id) , v);
   }
}

------------

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RedForNoDupeKeys  extends Reducer<Text,Text,Text,NullWritable>
{
    public void reduce(Text k,Iterable<Text> vlist, Context con)
    throws IOException, InterruptedException
    {
      // write only the first row received for this id, then stop
      for(Text line : vlist)
      {
          con.write(line,NullWritable.get());
          break;
      }
    }

}

----------

package mr.analytics;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driverx
{

 public static void main(String[] args) throws Exception {
 
  Configuration c = new Configuration();
  Job j = new Job(c, "nodupekeys");
  j.setJarByClass(Driverx.class);

  j.setMapperClass(NoDupeKeysMap.class);
  j.setReducerClass(RedForNoDupeKeys.class);
  j.setMapOutputKeyClass(Text.class);
  j.setMapOutputValueClass(Text.class);
  j.setOutputKeyClass(Text.class);
  j.setOutputValueClass(NullWritable.class);

 
  FileInputFormat.addInputPath(j, new Path(args[0]));
  FileOutputFormat.setOutputPath(j, new Path(args[1]));
  System.exit(j.waitForCompletion(true) ? 0:1);
 
 }

}





[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles2  mrlab/today4


[training@localhost ~]$ hadoop fs -cat mrlab/today4/part-r-00000
101,aaa
102,bbb
103,ccc

------------
To get the last duplicate instead, make the following change in the reducer. (MapReduce does not guarantee the order of values within a reduce group, so "first" and "last" simply mean the first and last value the reducer happens to receive.)

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RedForNoDupeKeys  extends Reducer<Text,Text,Text,NullWritable>
{
    public void reduce(Text k,Iterable<Text> vlist, Context con)
    throws IOException, InterruptedException
    {
      String line = "";
      for(Text ln : vlist)
      {
          // toString() copies the contents; Hadoop reuses the same
          // Text object across iterations, so the last copy survives
          line = ln.toString();
      }
      // after the loop, line holds the last row seen for this id
      con.write(new Text(line), NullWritable.get());
    }

}
--------


[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles2  mrlab/today5


[training@localhost ~]$ hadoop fs -cat mrlab/today5/part-r-00000
101,bbc
102,def
103,ccc

-------------------------------