
Joins

Create a Java project named JoinMap and add the following classes:


package com.hp.join;

// == JobBuilder
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;

public class JobBuilder {

  private final Class<?> driverClass;
  private final Job job;
  private final int extraArgCount;
  private final String extraArgsUsage;
  private String[] extraArgs;

  public JobBuilder(Class<?> driverClass) throws IOException {
    this(driverClass, 0, "");
  }

  public JobBuilder(Class<?> driverClass, int extraArgCount, String extraArgsUsage)
      throws IOException {
    this.driverClass = driverClass;
    this.extraArgCount = extraArgCount;
    this.job = new Job();
    this.job.setJarByClass(driverClass);
    this.extraArgsUsage = extraArgsUsage;
  }

  // vv JobBuilder
  public static Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
      throws IOException {
    if (args.length != 2) {
      printUsage(tool, "<input> <output>");
      return null;
    }
    Job job = new Job(conf);
    job.setJarByClass(tool.getClass());
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job;
  }

  public static void printUsage(Tool tool, String extraArgsUsage) {
    System.err.printf("Usage: %s [genericOptions] %s\n\n",
        tool.getClass().getSimpleName(), extraArgsUsage);
    GenericOptionsParser.printGenericCommandUsage(System.err);
  }
  // ^^ JobBuilder

  public JobBuilder withCommandLineArgs(String... args) throws IOException {
    Configuration conf = job.getConfiguration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    // Reject argument lists that are too short or too long (note: this must
    // be ||, not &&, which could never be true)
    if (otherArgs.length < 2 || otherArgs.length > 3 + extraArgCount) {
      System.err.printf("Usage: %s [genericOptions] [-overwrite] <input path> <output path> %s\n\n",
          driverClass.getSimpleName(), extraArgsUsage);
      GenericOptionsParser.printGenericCommandUsage(System.err);
      System.exit(-1);
    }
    int index = 0;
    boolean overwrite = false;
    if (otherArgs[index].equals("-overwrite")) {
      overwrite = true;
      index++;
    }
    Path input = new Path(otherArgs[index++]);
    Path output = new Path(otherArgs[index++]);
    if (index < otherArgs.length) {
      extraArgs = new String[otherArgs.length - index];
      System.arraycopy(otherArgs, index, extraArgs, 0, otherArgs.length - index);
    }
    if (overwrite) {
      output.getFileSystem(conf).delete(output, true);
    }
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    return this;
  }

  public Job build() {
    return job;
  }

  public String[] getExtraArgs() {
    return extraArgs;
  }
}
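
For reference, a minimal usage sketch of JobBuilder (editorial, not one of the lab files; MyDriver stands for any hypothetical Tool implementation):

Job job = new JobBuilder(MyDriver.class)
    .withCommandLineArgs("-overwrite", "input", "output")
    .build();
// the -overwrite flag deletes the output path before the job runs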


package com.hp.join;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class JoinRecordMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, TextPair, Text> {

  private NcdcRecordParser parser = new NcdcRecordParser();

  public void map(LongWritable key, Text value,
      OutputCollector<TextPair, Text> output, Reporter reporter) throws IOException {
    parser.parse(value);
    // Tag weather records with "1" so they sort after the station record ("0")
    output.collect(new TextPair(parser.getStationId(), "1"), value);
  }
}


package com.hp.join;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.util.*;

@SuppressWarnings("deprecation")
public class JoinRecordWithStationName extends Configured implements Tool {

  public static class KeyPartitioner implements Partitioner<TextPair, Text> {
    @Override
    public void configure(JobConf job) {}

    @Override
    public int getPartition(TextPair key, Text value, int numPartitions) {
      // Partition on the station ID only, so tagged records for the same
      // station all reach the same reducer
      return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      JobBuilder.printUsage(this, "<ncdc input> <station input> <output>");
      return -1;
    }
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Join record with station name");

    Path ncdcInputPath = new Path(args[0]);
    Path stationInputPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);

    MultipleInputs.addInputPath(conf, ncdcInputPath,
        TextInputFormat.class, JoinRecordMapper.class);
    MultipleInputs.addInputPath(conf, stationInputPath,
        TextInputFormat.class, JoinStationMapper.class);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setPartitionerClass(KeyPartitioner.class);
    conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);
    conf.setMapOutputKeyClass(TextPair.class);
    conf.setReducerClass(JoinReducer.class);
    conf.setOutputKeyClass(Text.class);

    JobClient.runJob(conf);
    return 0;
  }

  public static void main(String[] args) throws Exception {
    // Hard-coded paths for running locally from the IDE; comment these out
    // when submitting the jar to the cluster (see the notes at the end)
    args = new String[3];
    args[0] = "inputncdc";
    args[1] = "inputstation";
    args[2] = "output" + System.currentTimeMillis();
    int exitCode = ToolRunner.run(new JoinRecordWithStationName(), args);
    System.exit(exitCode);
  }
}
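
The driver wires up a reduce-side join via a secondary sort. Informally (these notes are editorial, not lab code), each setting contributes one piece:

// setMapOutputKeyClass(TextPair.class)       - composite key: (station ID, tag)
// setPartitionerClass(KeyPartitioner.class)  - partition on station ID only,
//                                              so both inputs meet at one reducer
// setOutputValueGroupingComparator(
//     TextPair.FirstComparator.class)        - group reduce calls on station ID only
// (the default sort on the full TextPair then puts tag "0" before tag "1")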


package com.hp.join;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class JoinReducer extends MapReduceBase implements
    Reducer<TextPair, Text, Text, Text> {

  public void reduce(TextPair key, Iterator<Text> values,
      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
    // The station record is tagged "0", so it is always the first value;
    // copy it, since the iterator reuses the same Text object
    Text stationName = new Text(values.next());
    while (values.hasNext()) {
      Text record = values.next();
      Text outValue = new Text(stationName.toString() + "\t" + record.toString());
      output.collect(key.getFirst(), outValue);
    }
  }
}
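
Because station records carry tag "0", weather records carry tag "1", and grouping ignores the tag, the station name is guaranteed to be the first value of each group. A sketch of one reduce call, using the station ID from the book's sample data (the weather line is abbreviated):

key (grouped on station ID)    values (sorted by tag)
011990-99999                   SIHCCAJAVRI       <- JoinStationMapper, tag "0"
                               0067011990...     <- JoinRecordMapper,  tag "1"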


package com.hp.join;

import java.io.IOException;

import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

public class JoinStationMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, TextPair, Text> {

  private NcdcStationMetadataParser parser = new NcdcStationMetadataParser();

  public void map(LongWritable key, Text value,
      OutputCollector<TextPair, Text> output, Reporter reporter) throws IOException {
    if (parser.parse(value)) {
      // Tag station records with "0" so they arrive first in the reducer
      output.collect(new TextPair(parser.getStationId(), "0"),
          new Text(parser.getStationName()));
    }
  }
}


package com.hp.join;

import java.math.*;

import org.apache.hadoop.io.Text;

public class MetOfficeRecordParser {

  private String year;
  private String airTemperatureString;
  private int airTemperature;
  private boolean airTemperatureValid;

  public void parse(String record) {
    if (record.length() < 18) {
      return;
    }
    year = record.substring(3, 7);
    if (isValidRecord(year)) {
      airTemperatureString = record.substring(13, 18);
      if (!airTemperatureString.trim().equals("---")) {
        BigDecimal temp = new BigDecimal(airTemperatureString.trim());
        temp = temp.multiply(new BigDecimal(BigInteger.TEN));
        airTemperature = temp.intValueExact();
        airTemperatureValid = true;
      }
    }
  }

  private boolean isValidRecord(String year) {
    try {
      Integer.parseInt(year);
      return true;
    } catch (NumberFormatException e) {
      return false;
    }
  }

  public void parse(Text record) {
    parse(record.toString());
  }

  public String getYear() {
    return year;
  }

  public int getAirTemperature() {
    return airTemperature;
  }

  public String getAirTemperatureString() {
    return airTemperatureString;
  }

  public boolean isValidTemperature() {
    return airTemperatureValid;
  }
}
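
A quick illustration of the temperature scaling above (editorial, not lab code): Met Office values are decimal degrees, and the parser stores them as integer tenths of a degree:

BigDecimal temp = new BigDecimal("12.3");
temp = temp.multiply(new BigDecimal(BigInteger.TEN));
int tenths = temp.intValueExact(); // 123, i.e. 12.3 degrees as tenths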


package com.hp.join;

import java.text.*;
import java.util.Date;

import org.apache.hadoop.io.Text;

public class NcdcRecordParser {

  private static final int MISSING_TEMPERATURE = 9999;
  private static final DateFormat DATE_FORMAT =
      new SimpleDateFormat("yyyyMMddHHmm");

  private String stationId;
  private String observationDateString;
  private String year;
  private String airTemperatureString;
  private int airTemperature;
  private boolean airTemperatureMalformed;
  private String quality;

  public void parse(String record) {
    stationId = record.substring(4, 10) + "-" + record.substring(10, 15);
    observationDateString = record.substring(15, 27);
    year = record.substring(15, 19);
    airTemperatureMalformed = false;
    // Remove the leading plus sign, as parseInt doesn't like it
    if (record.charAt(87) == '+') {
      airTemperatureString = record.substring(88, 92);
      airTemperature = Integer.parseInt(airTemperatureString);
    } else if (record.charAt(87) == '-') {
      airTemperatureString = record.substring(87, 92);
      airTemperature = Integer.parseInt(airTemperatureString);
    } else {
      airTemperatureMalformed = true;
    }
    quality = record.substring(92, 93);
  }

  public void parse(Text record) {
    parse(record.toString());
  }

  public boolean isValidTemperature() {
    return !airTemperatureMalformed && airTemperature != MISSING_TEMPERATURE
        && quality.matches("[01459]");
  }

  public boolean isMalformedTemperature() {
    return airTemperatureMalformed;
  }

  public boolean isMissingTemperature() {
    return airTemperature == MISSING_TEMPERATURE;
  }

  public String getStationId() {
    return stationId;
  }

  public Date getObservationDate() {
    try {
      return DATE_FORMAT.parse(observationDateString);
    } catch (ParseException e) {
      throw new IllegalArgumentException(e);
    }
  }

  public String getYear() {
    return year;
  }

  public int getYearInt() {
    return Integer.parseInt(year);
  }

  public int getAirTemperature() {
    return airTemperature;
  }

  public String getAirTemperatureString() {
    return airTemperatureString;
  }

  public String getQuality() {
    return quality;
  }
}
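
One detail worth noting in parse(): the special-casing of the sign character. On the JDKs current when this code was written, Integer.parseInt rejected a leading '+' (Java 7 and later accept it), hence the two branches:

Integer.parseInt("0011");   // 11  - after stripping the '+' from "+0011"
Integer.parseInt("-0011");  // -11 - a leading minus parses directly
// Integer.parseInt("+0011") threw NumberFormatException before Java 7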


package com.hp.join;

import java.io.*;
import java.util.*;

import org.apache.hadoop.io.IOUtils;

public class NcdcStationMetadata {

  private Map<String, String> stationIdToName = new HashMap<String, String>();

  public void initialize(File file) throws IOException {
    BufferedReader in = null;
    try {
      in = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
      NcdcStationMetadataParser parser = new NcdcStationMetadataParser();
      String line;
      while ((line = in.readLine()) != null) {
        if (parser.parse(line)) {
          stationIdToName.put(parser.getStationId(), parser.getStationName());
        }
      }
    } finally {
      IOUtils.closeStream(in);
    }
  }

  public String getStationName(String stationId) {
    String stationName = stationIdToName.get(stationId);
    if (stationName == null || stationName.trim().length() == 0) {
      return stationId; // no match: fall back to the ID
    }
    return stationName;
  }

  public Map<String, String> getStationIdToNameMap() {
    return Collections.unmodifiableMap(stationIdToName);
  }
}
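
A hedged usage sketch (the file name is illustrative): load the station metadata from a local file and look names up in memory:

NcdcStationMetadata metadata = new NcdcStationMetadata();
metadata.initialize(new File("stations.txt")); // hypothetical local copy of the station file
String name = metadata.getStationName("011990-99999"); // falls back to the ID if unknown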


package com.hp.join;

import org.apache.hadoop.io.Text;

public class NcdcStationMetadataParser {

  private String stationId;
  private String stationName;

  public boolean parse(String record) {
    if (record.length() < 42) { // header line
      return false;
    }
    String usaf = record.substring(0, 6);
    String wban = record.substring(7, 12);
    stationId = usaf + "-" + wban;
    stationName = record.substring(13, 42);
    try {
      Integer.parseInt(usaf); // USAF identifiers are numeric
      return true;
    } catch (NumberFormatException e) {
      return false;
    }
  }

  public boolean parse(Text record) {
    return parse(record.toString());
  }

  public String getStationId() {
    return stationId;
  }

  public String getStationName() {
    return stationName;
  }
}
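
To see the fixed-width layout the parser expects, here is an illustrative line built with String.format (the station values are from the book's sample data; the padding to 42 characters is what matters):

String line = String.format("%-6s %-5s %-29s", "011990", "99999", "SIHCCAJAVRI");
NcdcStationMetadataParser parser = new NcdcStationMetadataParser();
boolean ok = parser.parse(line);              // true: 42 characters, numeric USAF id
String id = parser.getStationId();            // "011990-99999"
String name = parser.getStationName().trim(); // "SIHCCAJAVRI"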


package com.hp.join;

// cc TextPair A Writable implementation that stores a pair of Text objects
// cc TextPairComparator A RawComparator for comparing TextPair byte representations
// cc TextPairFirstComparator A custom RawComparator for comparing the first field of TextPair byte representations
// vv TextPair
import java.io.*;

import org.apache.hadoop.io.*;

public class TextPair implements WritableComparable<TextPair> {

  private Text first;
  private Text second;

  public TextPair() {
    set(new Text(), new Text());
  }

  public TextPair(String first, String second) {
    set(new Text(first), new Text(second));
  }

  public TextPair(Text first, Text second) {
    set(first, second);
  }

  public void set(Text first, Text second) {
    this.first = first;
    this.second = second;
  }

  public Text getFirst() {
    return first;
  }

  public Text getSecond() {
    return second;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    first.write(out);
    second.write(out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    first.readFields(in);
    second.readFields(in);
  }

  @Override
  public int hashCode() {
    return first.hashCode() * 163 + second.hashCode();
  }

  @Override
  public boolean equals(Object o) {
    if (o instanceof TextPair) {
      TextPair tp = (TextPair) o;
      return first.equals(tp.first) && second.equals(tp.second);
    }
    return false;
  }

  @Override
  public String toString() {
    return first + "\t" + second;
  }

  @Override
  public int compareTo(TextPair tp) {
    int cmp = first.compareTo(tp.first);
    if (cmp != 0) {
      return cmp;
    }
    return second.compareTo(tp.second);
  }
  // ^^ TextPair

  // vv TextPairComparator
  public static class Comparator extends WritableComparator {

    private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

    public Comparator() {
      super(TextPair.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      try {
        // Each Text is serialized as a VInt length followed by the bytes, so
        // the first field's total size is the VInt's size plus its value
        int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
        int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
        int cmp = TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
        if (cmp != 0) {
          return cmp;
        }
        return TEXT_COMPARATOR.compare(b1, s1 + firstL1, l1 - firstL1,
            b2, s2 + firstL2, l2 - firstL2);
      } catch (IOException e) {
        throw new IllegalArgumentException(e);
      }
    }
  }

  static {
    WritableComparator.define(TextPair.class, new Comparator());
  }
  // ^^ TextPairComparator

  // vv TextPairFirstComparator
  public static class FirstComparator extends WritableComparator {

    private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();

    public FirstComparator() {
      super(TextPair.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      try {
        int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
        int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
        return TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
      } catch (IOException e) {
        throw new IllegalArgumentException(e);
      }
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
      if (a instanceof TextPair && b instanceof TextPair) {
        return ((TextPair) a).first.compareTo(((TextPair) b).first);
      }
      return super.compare(a, b);
    }
  }
  // ^^ TextPairFirstComparator
  // vv TextPair
}
// ^^ TextPair
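
A small illustration of how the composite key drives the join (editorial sketch, not a lab file): the full sort order separates the tags, while FirstComparator treats both keys as equal so they land in one reduce group:

TextPair station = new TextPair("011990-99999", "0"); // station metadata record
TextPair weather = new TextPair("011990-99999", "1"); // weather record
// full sort: tag "0" comes first, so the reducer sees the station name first
assert station.compareTo(weather) < 0;
// grouping: FirstComparator compares only the station ID, so both share a group
assert new TextPair.FirstComparator().compare(station, weather) == 0;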


Create the input folders in the project (inputncdc and inputstation, matching the paths hard-coded in main()) and copy the data files into them:
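
(The original screenshots are not reproduced here. A minimal sketch of the expected project layout, with illustrative file names:)

JoinMap/
    src/com/hp/join/*.java
    inputncdc/sample.txt       <- NCDC weather records
    inputstation/stations.txt  <- fixed-width station metadata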


Run the application:
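
(Screenshot not reproduced.) Since main() hard-codes the argument array, the driver can be run directly from the IDE as a plain Java application. A hedged command-line equivalent, assuming the classes are compiled to bin/ and a local Hadoop installation is on the PATH:

java -cp bin:`hadoop classpath` com.hp.join.JoinRecordWithStationName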


Submit the jar to the cluster:

Export the jar, then submit it as follows.

First, comment out the hard-coded path initialization in main(), so the arguments come from the command line:

/*args = new String[3];
args[0] = "inputncdc";
args[1] = "inputstation";
args[2] = "output"+System.currentTimeMillis();*/

Create the necessary input folders and copy the data files in:

#hadoop fs -mkdir incdc/
#hadoop fs -mkdir instation/
#hadoop fs -copyFromLocal /hadoop/data/sample.txt incdc/
#hadoop fs -copyFromLocal /hadoop/data/stations*.txt instation/
#hadoop jar /hadoop/hadoop/myhadoopjoin.jar com.hp.join.JoinRecordWithStationName incdc instation outputs


You can view the data as follows:
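
(Screenshot not reproduced.) With the old mapred API the reducer writes part-00000 files, so a hedged equivalent of the screenshot is:

#hadoop fs -cat outputs/part-00000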
