Data science Software Course Training in Ameerpet Hyderabad

Tuesday, 27 September 2016

MR Lab 8 : Entire Column Aggregations, Elimination of Duplicates

Performing an aggregation over an entire column. The trick is to emit one constant key from the mapper, so that every value in the column reaches a single reducer call, which then computes the column-wide total.

ex:
   select sum(sal) from emp;

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SalMap extends Mapper<LongWritable,Text,Text,IntWritable>
{
  public void map(LongWritable k, Text v, Context con)
   throws IOException, InterruptedException
   {
      String line = v.toString();
      // third comma-separated field of emp is the salary
      int sal = Integer.parseInt(line.split(",")[2]);
      // constant key "Ibm": all salaries are routed to one reducer call
      con.write(new Text("Ibm"), new IntWritable(sal));
   }
}
------------
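The driver below wires in RedForSum, a summing reducer carried over from an earlier lab and not listed in this post. A minimal sketch of what it needs to look like (same package, matching the mapper's Text/IntWritable output types):

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RedForSum extends Reducer<Text,IntWritable,Text,IntWritable>
{
    public void reduce(Text k, Iterable<IntWritable> vlist, Context con)
    throws IOException, InterruptedException
    {
        int tot = 0;
        // add up every salary routed to this key
        for (IntWritable v : vlist)
            tot += v.get();
        con.write(k, new IntWritable(tot));
    }
}
------------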


package mr.analytics;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driverx
{

 public static void main(String[] args) throws Exception {
 
  Configuration c = new Configuration();
  Job j = new Job(c, "colaggr");
  j.setJarByClass(Driverx.class);

  j.setMapperClass(SalMap.class);
  j.setReducerClass(RedForSum.class);
 //j.setSortComparatorClass(SortComparator.class);
  j.setOutputKeyClass(Text.class);
  j.setOutputValueClass(IntWritable.class);

 
  FileInputFormat.addInputPath(j, new Path(args[0]));
  FileOutputFormat.setOutputPath(j, new Path(args[1]));
  System.exit(j.waitForCompletion(true) ? 0:1);
 
 }

}
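(On newer Hadoop releases the Job constructor used above is deprecated in favour of Job.getInstance(c, "colaggr"); the CDH training VM used in this lab still accepts the old form.)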


[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/emp mrlab/today1

[training@localhost ~]$ hadoop fs -cat mrlab/today1/part-r-00000
Ibm     88000

--------------------------------------------------

Eliminating duplicate rows, based on an entire-row match. The whole row is emitted as the key, so the shuffle groups identical rows together and the reducer writes each distinct row exactly once.

[training@localhost ~]$ cat > profiles
101,aaa
102,bbb
101,aaa
101,aaa
101,aaa
102,bbb
103,ccc
101,xxx
101,aaa
[training@localhost ~]$ hadoop fs -copyFromLocal profiles mrlab
[training@localhost ~]$


package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NoDupeRowsMap extends Mapper<LongWritable,Text,Text,NullWritable>
{
  public void map(LongWritable k, Text v, Context con)
   throws IOException, InterruptedException
   {
      // the entire row becomes the key; identical rows collapse in the shuffle
      con.write(v, NullWritable.get());
   }
}
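NullWritable is a zero-byte placeholder; it is used here because the row itself is the key and there is no meaningful value to carry through the shuffle.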







---------
package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RedForNoDupeRows extends Reducer<Text,NullWritable,Text,NullWritable>
{
    public void reduce(Text k, Iterable<NullWritable> vlist, Context con)
    throws IOException, InterruptedException
    {
        // each distinct row arrives once as a key; write it straight through
        con.write(k, NullWritable.get());
    }
}

----
package mr.analytics;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driverx
{

 public static void main(String[] args) throws Exception {
 
  Configuration c = new Configuration();
  Job j = new Job(c, "colaggr");
  j.setJarByClass(Driverx.class);

  j.setMapperClass(NoDupeRowsMap.class);
  j.setReducerClass(RedForNoDupeRows.class);
 //j.setSortComparatorClass(SortComparator.class);
  j.setOutputKeyClass(Text.class);
  j.setOutputValueClass(NullWritable.class);

 
  FileInputFormat.addInputPath(j, new Path(args[0]));
  FileOutputFormat.setOutputPath(j, new Path(args[1]));
  System.exit(j.waitForCompletion(true) ? 0:1);
 
 }

}
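Because RedForNoDupeRows simply passes its key through, it can also serve as a combiner, collapsing duplicate rows on the map side before the shuffle. An optional one-line addition to the driver above (not part of the original lab):

  j.setCombinerClass(RedForNoDupeRows.class);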




[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles mrlab/today3



[training@localhost ~]$ hadoop fs -cat mrlab/today3/part-r-00000
101,aaa
101,xxx
102,bbb
103,ccc
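
In SQL terms, this job is the equivalent of:

ex:
   select distinct * from profiles;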

---------------------------

Eliminating duplicates based on a key-column match (here, the first column, id). The local profiles file was first extended with three extra rows (101,abc / 101,bbc / 102,def) before being copied to HDFS as profiles2:



[training@localhost ~]$ hadoop fs -copyFromLocal profiles mrlab/profiles2
[training@localhost ~]$ hadoop fs -cat mrlab/profiles2
101,aaa
102,bbb
101,aaa
101,aaa
101,aaa
102,bbb
103,ccc
101,xxx
101,aaa
101,abc
101,bbc
102,def
[training@localhost ~]$



package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NoDupeKeysMap extends Mapper<LongWritable,Text,Text,Text>
{
  public void map(LongWritable k, Text v, Context con)
   throws IOException, InterruptedException
   {
      String line = v.toString();
      String[] w = line.split(",");
      // first field (id) is the grouping key; the full row travels as the value
      String id = w[0];
      con.write(new Text(id), v);
   }
}

------------

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RedForNoDupeKeys extends Reducer<Text,Text,Text,NullWritable>
{
    public void reduce(Text k, Iterable<Text> vlist, Context con)
    throws IOException, InterruptedException
    {
      // write only the first row seen for this key, then stop iterating
      for (Text line : vlist)
      {
          con.write(line, NullWritable.get());
          break;
      }
    }
}
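Note that MapReduce guarantees grouping by key but not the order of values within a key, so "first" here means first in shuffle order. With a single map task over a small file, as in this lab, that matches file order; making the choice deterministic in general would require a secondary sort on a value field.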

----------

package mr.analytics;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driverx
{

 public static void main(String[] args) throws Exception {
 
  Configuration c = new Configuration();
  Job j = new Job(c, "colaggr");
  j.setJarByClass(Driverx.class);

  j.setMapperClass(NoDupeKeysMap.class);
  j.setReducerClass(RedForNoDupeKeys.class);
 //j.setSortComparatorClass(SortComparator.class);
  j.setOutputKeyClass(Text.class);
  j.setOutputValueClass(Text.class);

 
  FileInputFormat.addInputPath(j, new Path(args[0]));
  FileOutputFormat.setOutputPath(j, new Path(args[1]));
  System.exit(j.waitForCompletion(true) ? 0:1);
 
 }

}





[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles2  mrlab/today4


[training@localhost ~]$ hadoop fs -cat mrlab/today4/part-r-00000
101,aaa
102,bbb
103,ccc
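
In SQL terms this job keeps one row per key, roughly equivalent to a window-function query (the column name id is assumed for illustration):

   select * from (
     select p.*, row_number() over (partition by id) rn from profiles2 p
   ) t where rn = 1;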

------------
To keep the last occurrence of each key instead of the first, change the reducer to iterate through the whole value list and write only the final row:

package mr.analytics;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RedForNoDupeKeys extends Reducer<Text,Text,Text,NullWritable>
{
    public void reduce(Text k, Iterable<Text> vlist, Context con)
    throws IOException, InterruptedException
    {
        String line = "";
        for (Text ln : vlist)
        {
            // the framework reuses the Text object, so copy its contents
            line = ln.toString();
        }
        // after the loop, line holds the last value seen for this key
        con.write(new Text(line), NullWritable.get());
    }
}
--------


[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles2  mrlab/today5


[training@localhost ~]$ hadoop fs -cat mrlab/today5/part-r-00000
101,bbc
102,def
103,ccc

-------------------------------