Performing entire-column aggregation.
ex:
select sum(sal) from emp;
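The mapper below emits a constant key ("Ibm") with every salary, so all rows land in a single reduce group and the reducer can total the whole column. The emp file is assumed to be comma-separated with the salary in the third field, e.g. 101,aaa,25000.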
package mr.analytics;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
public class SalMap extends Mapper<LongWritable,Text,Text,IntWritable>
{
    public void map(LongWritable k, Text v, Context con)
            throws IOException, InterruptedException
    {
        String line = v.toString();
        // salary is the third field of a comma-separated row
        int sal = Integer.parseInt(line.split(",")[2]);
        // constant key: every row joins the same reduce group
        con.write(new Text("Ibm"), new IntWritable(sal));
    }
}
------------
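The driver below references a RedForSum reducer that these notes do not show. A minimal sketch, assuming it simply totals the IntWritable values for each key:
package mr.analytics;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class RedForSum extends Reducer<Text,IntWritable,Text,IntWritable>
{
    public void reduce(Text k, Iterable<IntWritable> vlist, Context con)
            throws IOException, InterruptedException
    {
        int tot = 0;
        for (IntWritable v : vlist)   // add up every salary in this group
            tot += v.get();
        con.write(k, new IntWritable(tot));
    }
}
------------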
package mr.analytics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driverx
{
    public static void main(String[] args) throws Exception {
        Configuration c = new Configuration();
        Job j = new Job(c, "colaggr");
        j.setJarByClass(Driverx.class);
        j.setMapperClass(SalMap.class);
        j.setReducerClass(RedForSum.class);
        j.setOutputKeyClass(Text.class);
        j.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(j, new Path(args[0]));    // args[0]: input path
        FileOutputFormat.setOutputPath(j, new Path(args[1]));  // args[1]: output dir (must not already exist)
        System.exit(j.waitForCompletion(true) ? 0 : 1);
    }
}
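Optional: because integer addition is associative, the same reducer could also be registered as a combiner to cut shuffle traffic, e.g. j.setCombinerClass(RedForSum.class); (assuming RedForSum is the summing reducer sketched above).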
[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/emp mrlab/today1
[training@localhost ~]$ hadoop fs -cat mrlab/today1/part-r-00000
Ibm 88000
--------------------------------------------------
Eliminating duplicate rows, based on an entire-row match.
ex:
select distinct * from profiles;
[training@localhost ~]$ cat > profiles
101,aaa
102,bbb
101,aaa
101,aaa
101,aaa
102,bbb
103,ccc
101,xxx
101,aaa
[training@localhost ~]$ hadoop fs -copyFromLocal profiles mrlab
[training@localhost ~]$
package mr.analytics;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class NoDupeRowsMap extends Mapper<LongWritable,Text,Text,NullWritable>
{
    public void map(LongWritable k, Text v, Context con)
            throws IOException, InterruptedException
    {
        // the whole row is the key, so identical rows collapse
        // into a single group during the shuffle
        con.write(v, NullWritable.get());
    }
}
---------
package mr.analytics;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class RedForNoDupeRows extends Reducer<Text,NullWritable,Text,NullWritable>
{
    public void reduce(Text k, Iterable<NullWritable> vlist, Context con)
            throws IOException, InterruptedException
    {
        // each distinct row arrives here exactly once as a key
        con.write(k, NullWritable.get());
    }
}
----
package mr.analytics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driverx
{
    public static void main(String[] args) throws Exception {
        Configuration c = new Configuration();
        Job j = new Job(c, "noduperows");
        j.setJarByClass(Driverx.class);
        j.setMapperClass(NoDupeRowsMap.class);
        j.setReducerClass(RedForNoDupeRows.class);
        j.setOutputKeyClass(Text.class);
        j.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(j, new Path(args[0]));
        FileOutputFormat.setOutputPath(j, new Path(args[1]));
        System.exit(j.waitForCompletion(true) ? 0 : 1);
    }
}
[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles mrlab/today3
[training@localhost ~]$ hadoop fs -cat mrlab/today3/part-r-00000
101,aaa
101,xxx
102,bbb
103,ccc
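Note that the distinct rows come back sorted: the shuffle sorts map output keys, and here the entire row is the key.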
---------------------------
Eliminating duplicates based on a key-column match (here, the first column).
[training@localhost ~]$ hadoop fs -copyFromLocal profiles mrlab/profiles2
[training@localhost ~]$ hadoop fs -cat mrlab/profiles2
101,aaa
102,bbb
101,aaa
101,aaa
101,aaa
102,bbb
103,ccc
101,xxx
101,aaa
101,abc
101,bbc
102,def
[training@localhost ~]$
package mr.analytics;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class NoDupeKeysMap extends Mapper<LongWritable,Text,Text,Text>
{
    public void map(LongWritable k, Text v, Context con)
            throws IOException, InterruptedException
    {
        String line = v.toString();
        String[] w = line.split(",");
        String id = w[0];              // the first column is the dedup key
        con.write(new Text(id), v);    // key = id, value = the whole row
    }
}
------------
package mr.analytics;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class RedForNoDupeKeys extends Reducer<Text,Text,Text,NullWritable>
{
    public void reduce(Text k, Iterable<Text> vlist, Context con)
            throws IOException, InterruptedException
    {
        // emit only the first row seen for this key, then stop
        for (Text line : vlist)
        {
            con.write(line, NullWritable.get());
            break;
        }
    }
}
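Note: plain MapReduce does not guarantee the order of values within a reduce group, so "first" here means first in iteration order, not necessarily first in the input file. A deterministic pick would need a secondary sort.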
----------
package mr.analytics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driverx
{
    public static void main(String[] args) throws Exception {
        Configuration c = new Configuration();
        Job j = new Job(c, "nodupekeys");
        j.setJarByClass(Driverx.class);
        j.setMapperClass(NoDupeKeysMap.class);
        j.setReducerClass(RedForNoDupeKeys.class);
        // the mapper emits <Text,Text> but the reducer emits <Text,NullWritable>,
        // so the map output types must be declared separately
        j.setMapOutputKeyClass(Text.class);
        j.setMapOutputValueClass(Text.class);
        j.setOutputKeyClass(Text.class);
        j.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(j, new Path(args[0]));
        FileOutputFormat.setOutputPath(j, new Path(args[1]));
        System.exit(j.waitForCompletion(true) ? 0 : 1);
    }
}
[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles2 mrlab/today4
[training@localhost ~]$ hadoop fs -cat mrlab/today4/part-r-00000
101,aaa
102,bbb
103,ccc
------------
To keep the last duplicate instead, make the following change in the reducer.
package mr.analytics;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class RedForNoDupeKeys extends Reducer<Text,Text,Text,NullWritable>
{
    public void reduce(Text k, Iterable<Text> vlist, Context con)
            throws IOException, InterruptedException
    {
        // walk the whole list, remembering only the last row;
        // toString() copies the bytes, which matters because Hadoop
        // reuses the same Text instance across iterations
        String line = "";
        for (Text ln : vlist)
        {
            line = ln.toString();
        }
        con.write(new Text(line), NullWritable.get());
    }
}
--------
[training@localhost ~]$ hadoop jar Desktop/myapp.jar mr.analytics.Driverx mrlab/profiles2 mrlab/today5
[training@localhost ~]$ hadoop fs -cat mrlab/today5/part-r-00000
101,bbc
102,def
103,ccc
-------------------------------