Group corresponding keys and values

I have a use case where I need to write MapReduce code that groups the values corresponding to the same key:

Input:

A,B  
A,C  
B,A  
B,D  

Output:

A {B,C}  
B {A,D}

I have written this code:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class GroupKeyValues {

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable key, Text value, Context con)
                throws IOException, InterruptedException {

            Text myKey = new Text();
            Text myVal = new Text();
            String line = value.toString();
            StringTokenizer st = new StringTokenizer(line);

            while (st.hasMoreTokens()) {

                String thisH = st.nextToken();
                String[] splitData = thisH.split(",");
                myKey.set(splitData[0]);
                myVal.set(splitData[1]);
            }
            con.write(myKey, myVal);

        }

    }

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "GroupKeyValues");

        job.setJarByClass(GroupKeyValues.class);
        job.setMapperClass(Map.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        Path outputPath = new Path(args[1]);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        outputPath.getFileSystem(conf).delete(outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}      

Answers

You are missing a reducer that aggregates the values for each key into a single "row" value. For example, you can use ArrayWritable like this:

public static class AggregatingReducer extends Reducer<Text, Text, Text, ArrayWritable> {
  private ArrayWritable result = new ArrayWritable(Text.class);

  public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    List<Text> list = new ArrayList<>();
    for (Text value : values) {
      list.add(value);
    }
    result.set(list.toArray(new Text[list.size()]));
    context.write(key, result);
  }
}

In the job setup, make sure to add this:

job.setReducerClass(AggregatingReducer.class);
// The mapper still emits Text/Text. Declare that explicitly, because the map
// output classes otherwise default to the job's final output classes and the
// ArrayWritable setting below would cause a type mismatch on map output.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputValueClass(ArrayWritable.class);  //instead of Text.class

Alternatively (depending on what you need), you can concatenate the reducer's values in a StringBuilder and emit the result as Text, instead of accumulating them in a list and emitting an ArrayWritable.

UPDATE: Here is an example that uses StringBuilder with a comma delimiter:

public static class AggregatingReducer extends Reducer<Text, Text, Text, Text> {
  private Text result = new Text();

  public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    StringBuilder sb = new StringBuilder();
    for (Text value : values) {
      if (sb.length() != 0) {
        sb.append(',');
      }
      sb.append(value);
    }
    result.set(sb.toString());
    context.write(key, result);
  }
}

In the driver, the output value type needs to be changed back to Text:

// The StringBuilder-based reducer emits Text values, so declare Text as the job's output value class.
job.setOutputValueClass(Text.class);
Posted on by yurgis