Developer’s template: MapReduce (Java)

The Developer’s template series aims to make life easier for Big Data developers by giving them a working starting point for application development, instead of starting from scratch. Here is a MapReduce Java program along with its pom file.

Prerequisites

  • Hadoop cluster
  • Eclipse
  • Maven
  • Java

MapReduce – Java

package com.abc.javahadoop;

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;

public class WordCount extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic Hadoop options (-D, -conf, ...) before calling run()
        int res = ToolRunner.run(new WordCount(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {

        // Input path, output path and queue name come in as command-line arguments,
        // e.g. hdfs://<hdfs>:8020/tmp/wcIn and hdfs://<hdfs>:8020/tmp/wcOut
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        String queueName = args[2];

        Configuration conf = getConf();
        conf.set("mapreduce.job.queuename", queueName);
        Job job = Job.getInstance(conf, this.getClass().toString());

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        job.setJobName("MR");
        job.setJarByClass(WordCount.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Tokenize the line on whitespace and emit (word, 1) per token
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the partial counts for this word (this class also runs as the combiner)
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

}
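To illustrate what the job produces, here is a small sample run (hypothetical data; paths match the placeholders used above). The mapper emits a (word, 1) pair per token, the combiner/reducer sums the counts, and TextOutputFormat writes tab-separated key-value lines, sorted by key, into part-r-* files under the output directory.

Input (/tmp/wcIn/sample.txt):

hello world
hello hadoop

Output (/tmp/wcOut/part-r-00000):

hadoop	1
hello	2
world	1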

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.abc</groupId>
  <artifactId>javahadoop</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>javahadoop</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.0</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.3</version>
        <configuration>
          <filters>
            <filter>
              <artifact>*:*</artifact>
              <excludes>
                <exclude>META-INF/*.SF</exclude>
                <exclude>META-INF/*.DSA</exclude>
                <exclude>META-INF/*.RSA</exclude>
              </excludes>
            </filter>
          </filters>
          <!-- Additional configuration: keep Hadoop's own artifacts out of the
               uber jar, since the cluster already provides them at runtime. -->
          <artifactSet>
            <excludes>
              <exclude>org.apache.hadoop:*</exclude>
            </excludes>
          </artifactSet>
        </configuration>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <manifestEntries>
                    <Main-Class>com.abc.javahadoop.WordCount</Main-Class>
                    <Build-Number>123</Build-Number>
                  </manifestEntries>
                </transformer>
              </transformers>
              <finalName>uber-${project.artifactId}-${project.version}</finalName>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
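Build the project with Maven. The shade plugin is bound to the package phase, so a single self-contained (uber) jar is produced under target/, named by the finalName setting above:

mvn clean package

This leaves the runnable jar at target/uber-javahadoop-0.0.1-SNAPSHOT.jar, with WordCount set as its main class via the manifest.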

Execute

yarn jar <jarname> <input path> <output path> <queue name>
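For example, assuming the input directory already exists in HDFS and the cluster has a queue named default (all three values are placeholders; adjust them for your cluster):

yarn jar target/uber-javahadoop-0.0.1-SNAPSHOT.jar /tmp/wcIn /tmp/wcOut default

Note that the output directory must not exist beforehand; FileOutputFormat fails the job if it does.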

Stay tuned..
