hadoop java API（二）

KinglyJn 2017-08-08

HDFS java API Base

首先要拷贝hadoop配置文件和 log4j.properties 到maven项目的类路径(e.g. resources 目录)

<!-- hadoop客户端依赖包-->
<dependency>
  	<groupId>org.apache.hadoop</groupId>
  	<artifactId>hadoop-client</artifactId>
  	<version>2.5.0</version>
</dependency>

以下是一些API的应用示例：

/**
 * 获取文件系统
 */
public FileSystem getFileSystem() throws IOException {
	Configuration config = new Configuration();
	return  FileSystem.get(config);
}

/**
 * 读取文件
 * bin/hdfs dfs -text xxx/filename.xxx
 */
@Test
public void test01() throws IOException {
	FileSystem fs = getFileSystem();
	Path path = new Path("/user/ubuntu/wc/input/wc.txt");
	FSDataInputStream is = fs.open(path);
	IOUtils.copyBytes(is, System.out, 1024, true);
}

/**
 * 上传文件
 * bin/hdfs dfs -put xxx/srcfile xxx/distfile
 */
@Test
public void test02() throws IOException {
	FileSystem fs = getFileSystem();
	FileInputStream is = new FileInputStream(new File("src/main/resources/log4j.properties"));
	FSDataOutputStream fso = fs.create(new Path("/user/ubuntu/wc/input/log4j.properties"), true);
	IOUtils.copyBytes(is, fso, 1024, true);
}

/**
 * 创建文件夹
 * bin/hdfs dfs -mkdir -p xxx/dirpath
 */
@Test
public void test03() throws IOException {
	FileSystem fs = getFileSystem();
	boolean result = fs.mkdirs(new Path("/user/ubuntu/wc/input2/1"));
	System.out.println(result);
}

/**
 * 删除文件夹
 * bin/hdfs dfs -rm -r xxx/dirpath
 */
@Test
public void test04() throws IOException {
	FileSystem fs = getFileSystem();
	boolean result = fs.delete(new Path("/user/ubuntu/wc/input2"), true);
	System.out.println(result);
}

Mapreduce 编程模型

Mapreduce编程模型分为五步三大块，即：

五步：
input -> map() -> shuffle -> reduce() -> output

三大块：
Mapper块、Reducer块、Driver块

Mapreduce编程模型处理的数据的格式：

都是键值对 <key, value>
所以Mapreduce在设计的时候首先要考虑到就是怎样将数据的格式转化成五个处理过程的<key, value>的形式

下面以wordcount程序进行说明：（基本是八股文）

package com.hadoop.test01;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountMapReduce {
	/**
	 * 1. mapper
	 */
	public static class WordCountMapper 
      		extends Mapper<LongWritable, Text, Text, IntWritable> {
		private Text mapOutputKey = new Text();
		private IntWritable mapOutputValue = new IntWritable(1);
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			//lineValue
			String lineValue = value.toString();
			
			//split
			String[] strs = lineValue.split(" ");
			
			//iterator
			for (String str : strs) {
				mapOutputKey.set(str);
				context.write(mapOutputKey, mapOutputValue);
			}
		}
	}

	/**
	 * 2. reducer
	 */
	public static class WordCountReducer 
      			extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable outputValue = new IntWritable();
		
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context) 
				throws IOException, InterruptedException {
			//tmp sum
			int sum = 0;
			for (IntWritable value : values) {
				sum += value.get();
			}
			
			//set outputValue and output
			outputValue.set(sum);
			context.write(key, outputValue);
		}
	}

	/**
	 * 3. driver
	 */
	public int run(String[] args) throws Exception {
		//定义任务
		Configuration config = new Configuration();
		Job job = Job.getInstance(config, this.getClass().getSimpleName());
		
		//job jar
		job.setJarByClass(this.getClass());
		
		// set job input & output
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		// set mapper & reducer
		job.setMapperClass(WordCountMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setReducerClass(WordCountReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// submit job to yarn
		boolean success = job.waitForCompletion(true);
		return success ? 0 : 1;
	}
	
	
	/**
	 * 为了防止出现如下的权限异常，我事先设置了相关hdfs文件夹的权限为755
	 * Permission denied: 
	 * user=xxx, access=EXECUTE, inode="/tmp":
	 * ubuntu:supergroup:drwxrwx---
	 * $ bin/hdfs dfs -chmod -R 755 /tmp
	 */
	public static void main(String[] args) throws Exception {
		//init args
		args = new String[] {
				"hdfs://nimbusz:8020/user/ubuntu/wc/input",
				"hdfs://nimbusz:8020/user/ubuntu/wc/output"
		};
		
		//run job
		int status = new WordCountMapReduce().run(args);
		System.exit(status);
	}
}

对wordcount程序的一些优化：

package com.hadoop.test01;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class WordCountMapReduce extends Configured implements Tool {
	/**
	 * 1. mapper
	 */
	public static class WordCountMapper 
      		extends Mapper<LongWritable, Text, Text, IntWritable> {
		private Text mapOutputKey = new Text();
		private IntWritable mapOutputValue = new IntWritable(1);
		
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			//lineValue
			String lineValue = value.toString();
			
			//split & iterator
			StringTokenizer stringTokenizer = new StringTokenizer(lineValue);
			while (stringTokenizer.hasMoreTokens()) {
				mapOutputKey.set(stringTokenizer.nextToken());
				context.write(mapOutputKey, mapOutputValue);
			}
		}
	}

	/**
	 * 2. reducer
	 */
	public static class WordCountReducer 
      		extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable outputValue = new IntWritable();
		
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context) 
				throws IOException, InterruptedException {
			//tmp sum
			int sum = 0;
			for (IntWritable value : values) {
				sum += value.get();
			}
			
			//set outputValue and output
			outputValue.set(sum);
			context.write(key, outputValue);
		}
	}

	/**
	 * 3. driver
	 */
	public int run(String[] args) throws Exception {
		//定义任务
		Configuration config = this.getConf();
		Job job = Job.getInstance(config, this.getClass().getSimpleName());
		
		//job jar
		job.setJarByClass(this.getClass());
		
		// set job input & output
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		
		//============================shuffle==============================
		//partition
		//job.setPartitionerClass(cls);
		
		//sort
		//job.setSortComparatorClass(cls);
		
		//group
		//job.setGroupingComparatorClass(cls);
		//============================shuffle==============================
		
		
		// set mapper & reducer
		job.setMapperClass(WordCountMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setReducerClass(WordCountReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// submit job to yarn
		boolean success = job.waitForCompletion(true);
		return success ? 0 : 1;
	}
	
	
	public static void main(String[] args) throws Exception {
		Configuration config = new Configuration();
		int status = ToolRunner.run(config, new WordCountMapReduce(), args);
		System.exit(status);
	}
}

MapReduce的核心过程-shuffle

发生的阶段 map -> shuffle -> reduce

数据输入 <key, value>
		<0, hadoop, spark>
        	|
        	map
        	|
        <hadoop,1> <spark,1>
        	|
        	map output
       	-----------------------map端shuffle阶段------------------------
       	map的结果输出到内存当中，这个内存的结构称之为环形缓冲区，
       	环形缓冲区的大小默认为100M，		#可自定义
        环形缓冲区的大小达到80%大小时候，	  #可自定义
        就会发生【溢写过程】，将环形缓冲区的数据内容溢写到本地磁盘，如下：
		
		1. 分区(partition)，即将环形缓冲区80%那部分数据进行分区， #这里可以设置分区规则的比较器
		   决定了map输出的数据被哪个reduce任务进行处理。
		2. 排序(sort)，对分区阶段的数据在环形缓冲区进行排序。		 #这里可以设置排序规则的比较器
		3. 溢写(spill)，将分区、排序的结果溢写到磁盘的某个目录中，形成一个溢写文件。
		4. 合并(merge)，将溢写到本地磁盘的溢写小文件进行一次性合并。
		   合并完成之后也要进行排序，最后形成一个分完区、排完序的大文件。
		-----------------------map端shuffle阶段------------------------
			|
		-----------------------reduce端shuffle阶段---------------------
		每个reduce任务会启动一个线程，去对应的map上拷贝对应的数据（这时候将会涉及到
		reduce 通过 appMaster 与 map进行网络通信），将map合并生成的磁盘输出文件再
		一次加载到环形缓冲区中，也会发生类似于map任务阶段的【溢写过程】
		
		1. 如果reduce处理的数据量比较小(没有达到缓冲区的80%)，则数据直接放在内存当中。
		2. 如果reduce处理的数据量比较大，则数据也会被溢写到本地磁盘。
		3. 分组（group），最后reduce会将相同key的value放在一起。#这里可以设置判断key是否相同的比较器
		   <hadoop,1> <hadoop,1> <hadoop,1> -> <hadoop,list(1,1,1)>
		   形成reduce输入的一个key-values
		-----------------------reduce端shuffle阶段---------------------
			|
			reduce input
			|
		<hadoop,list(1,1,1)>
			|
			reduce
			|
数据输出  reduce result

[注]

环形缓冲区大小的设置：

设置mapred-site.xml 的 mapreduce.task.io.sort.mb 配置项，默认值为100.

环形缓冲区溢写条件设置：

设置mapred-site.xml 的 mapreduce.map.sort.spill.percent 配置项，默认值为 0.80.

环形缓冲区溢写到磁盘路径的设置：

设置mapred-site.xml 的 mapreduce.cluster.local.dir 配置项，默认值为 ${hadoop.tmp.dir}/mapred/local

MapReduce的过程的优化-Combiner（可选）

Combine（组合）的过程通常也被称为map端的reduce任务，在map端完成reduce任务的部分分组统计工作。这样可以减少网络的传输量，也可以减少本地磁盘的io读写，从而提高了程序的性能。一般Combiner和Reducer做的是相同的工作，所以一般也可以直接将Reducer拿来当Combiner来用：

job.setCombinerClass(WordCountReducer.class);

MapReduce设置计数器

context.getCounter("GROUP_NAME", "COUNTER_NAME").increment(1L);

//e.g.
context.getCounter("WEB_PV_MAPPER_COUNTERS", "URL_BLANK").increment(1L);
context.getCounter("WEB_PV_MAPPER_COUNTERS", "LENGTH_LT_30").increment(1L);
context.getCounter("WEB_PV_MAPPER_COUNTERS", "PROVINCE_ID_BLANK").increment(1L);
context.getCounter("WEB_PV_MAPPER_COUNTERS", "PROVINCE_ID_NOT_NUMBER").increment(1L); 

MapReduce的典型应用之日志分析-网站的基本指标（PUVI）分析统计

PV：即page view，页面浏览次数，衡量网站用户访问的网页数量，用户没打开一个页面就记录一次，多次打开统一页面则累计。
UV：即unique visitor，独立访客数，1天内访问某站点的人数（以cookie为依据技计数）。
VV：即visit view，方可的访问次数，记录所有访客1天内访问了多少次您的网站，当访客完成浏览并关闭掉网站的所有页面便完成了一次访问，统一访客一天内可能有多次访问行为。
IP：即独立IP数，指1天内使用不同ip地址的用户访问网站的数量，同一个ip不管访问了几个页面，独立ip数均为1。

MapReduce数据类型

当我们的程序间传递对象或者持久化对象的时候就需要我们序列化对象成字节流，反之就需要我们进行反序列化操作。Writable就是hadoop中序列化的格式。 MapReduce的数据类型都实现了Writable接口（MapReduce中所有的key都实现了 WritableComparable 接口，而所有的value都实现了 Writable 接口），以便于这些类型定义的数据可以被序列化进行网络传输和文件存储。基本的数据类型包括：

BooleanWritable
ByteWritable
IntWritable
LongWritable
FloatWritable
DoubleWritable
TextWritable
NullWritable

mapreduce自定义数据类型：

package com.hadoop.test01;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * 自定义的value
 */
public class UserWritable implements Writable {
	private int id;
	private String name;

	public UserWritable() {}
	public UserWritable(int id, String name) {
		this.id = id;
		this.name = name;
	}

	public int getId() {
		return id;
	}
	public void setId(int id) {
		this.id = id;
	}
	public String getName() {
		return name;
	}
	public void setName(String name) {
		this.name = name;
	}

	@Override
	public void write(DataOutput out) throws IOException { //注意顺序一致
		out.write(id);
		out.writeUTF(name);
	}

	@Override
	public void readFields(DataInput in) throws IOException { //注意顺序一致
		this.id = in.readInt();
		this.name = in.readUTF();
	}
	
	
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + id;
		result = prime * result + ((name == null) ? 0 : name.hashCode());
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		UserWritable other = (UserWritable) obj;
		if (id != other.id)
			return false;
		if (name == null) {
			if (other.name != null)
				return false;
		} else if (!name.equals(other.name))
			return false;
		return true;
	}
	@Override
	public String toString() {
		return "UserWritable [id=" + id + ", name=" + name + "]";
	}
}



package com.hadoop.test01;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class OrderWritable implements WritableComparable<OrderWritable> {
	private String id;
	private float price;
	
	public OrderWritable() {}
	public OrderWritable(String id, float price) {
		super();
		this.id = id;
		this.price = price;
	}

	public String getId() {
		return id;
	}
	public void setId(String id) {
		this.id = id;
	}
	public float getPrice() {
		return price;
	}
	public void setPrice(float price) {
		this.price = price;
	}

	
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(id);
		out.writeFloat(price);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.id = in.readUTF();
		this.price = in.readFloat();
	}

	@Override
	public int compareTo(OrderWritable o) {
		int compareTo = this.getId().compareTo(o.getId());
		if (compareTo==0) {
			return Float.valueOf(this.getPrice()).compareTo(o.getPrice());
		}
		return compareTo;
	}
	
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + ((id == null) ? 0 : id.hashCode());
		result = prime * result + Float.floatToIntBits(price);
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		OrderWritable other = (OrderWritable) obj;
		if (id == null) {
			if (other.id != null)
				return false;
		} else if (!id.equals(other.id))
			return false;
		if (Float.floatToIntBits(price) != Float.floatToIntBits(other.price))
			return false;
		return true;
	}
}

对mapreduce输出结果进行二次排序

MR框架无论是默认排序还是自定义排序，都只是对key值进行排序的。那么当key相同的时候，如何再对value进行排序呢？

第一点：使用组合key（优势是利用了shuffle过程的集群计算能力），key是原来key和value的组合字段，自定义数据类型，需要实现WrableComparable接口；
第二点：保证原来的分区不变，自定义分区规则，组要实现Partitioner接口。
第三点：保证原来的分组保持不变，自定义分组规则，需要实现RawComparable接口；
第四点：默认情况下，reduce的个数是1个，但是reduce任务也可以自定义reducer的数目，通常我们会将所有的任务规划到一个时间区域内，比如20个reduce和10个reduce运行的时间都差不多，那么肯定选择10个reduce才合理，只有通过不断的测试才能得出最优的reduce个数。

package com.hadoop.test01;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * SecondarySortMapReduce
 */
public class SecondarySortMapReduce extends Configured implements Tool {
	
	public static class SecondarySortMap 
      		extends Mapper<LongWritable, Text, PairWritable, IntWritable> {
		private PairWritable mapOutputKey = new PairWritable();
		private IntWritable mapOutputValue = new IntWritable();
		
		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, PairWritable, IntWritable>.Context context)
				throws IOException, InterruptedException {
			
			String lineValue = value.toString();
			String[] splits = lineValue.split("\t");
			if (splits.length != 2) {
				context.getCounter("SECONDARY_SORT_MAP", "LENGTH_NOT_2").increment(1L);
				return;
			}
			mapOutputKey.set(splits[0], Integer.valueOf(splits[1]));
			mapOutputValue.set(Integer.valueOf(splits[1]));
			context.write(mapOutputKey, mapOutputValue);
		}
	}
	
	
	public static class SecondarySortReduce 
      		extends Reducer<PairWritable, IntWritable, Text, IntWritable> {
		private Text outputKey = new Text();
		
		@Override
		protected void reduce(PairWritable key, Iterable<IntWritable> values,
				Reducer<PairWritable, IntWritable, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			
			for (IntWritable value : values) {
				outputKey.set(key.getFirst());
				context.write(outputKey, value);
			}
		}
	}
	
	
	@Override
	public int run(String[] args) throws Exception {
		// define job
		Configuration config = this.getConf();
		Job job = Job.getInstance(config, this.getClass().getSimpleName());
		
		// set jar
		job.setJarByClass(this.getClass());
		
		// set input and output
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		// set map and reduce
		job.setMapperClass(SecondarySortMap.class);
		job.setMapOutputKeyClass(PairWritable.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setReducerClass(SecondarySortReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		job.setNumReduceTasks(2); //////设置reduce的个数
		
		// shuffle
		//==========================================================
		//partition
		job.setPartitionerClass(FirstPatitioner.class);
		
		//sort
		//job.setSortComparatorClass(cls);
		
		//group
		job.setGroupingComparatorClass(FirstGroupingComparator.class);
		//==========================================================
		
		//submit to yarn
		boolean success = job.waitForCompletion(true);
		return success ? 0 : 1;
	}
	
	
	public static void main(String[] args) throws Exception {
		Configuration config = new Configuration();
		int status = ToolRunner.run(config, new SecondarySortMapReduce(), args);
		System.exit(status);
	}
}



package com.hadoop.test01;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * 组合key定义
 */
public class PairWritable implements WritableComparable<PairWritable> {
	private String first;
	private int second;

	public PairWritable() {}
	public PairWritable(String first, int second) {
		super();
		this.first = first;
		this.second = second;
	}

	public String getFirst() {
		return first;
	}
	public void setFirst(String first) {
		this.first = first;
	}
	public int getSecond() {
		return second;
	}
	public void setSecond(int second) {
		this.second = second;
	}
	public void set(String first, int second) {
		this.first = first;
		this.second = second;
	}
	

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(first);
		out.writeInt(second);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.first = in.readUTF();
		this.second = in.readInt();
	}

	@Override
	public int compareTo(PairWritable o) {
		int compareTo = this.getFirst().compareTo(o.getFirst());
		if (compareTo==0) {
			return Integer.valueOf(this.getSecond()).compareTo(o.getSecond());
		}
		return compareTo;
	}
	
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + ((first == null) ? 0 : first.hashCode());
		result = prime * result + second;
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		PairWritable other = (PairWritable) obj;
		if (first == null) {
			if (other.first != null)
				return false;
		} else if (!first.equals(other.first))
			return false;
		if (second != other.second)
			return false;
		return true;
	}
}




package com.hadoop.test01;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitioner 默认使用的分区是 HashPatitioner
 */
public class FirstPatitioner extends Partitioner<PairWritable, IntWritable> {

	@Override
	public int getPartition(PairWritable key, IntWritable value, int numPartitions) {
		return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
	}
}



package com.hadoop.test01;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
/**
 * HADOOP为序列化提供了优化，类型的比较对M/R尤为重要，key和key的比较也是在排序阶段完成的，
 * hadoop提供了原生的比较器接口 RawComparator，用于序列化字节间的比较，该接口允许其实现
 * 直接比较数据流中的记录，无需反序列化为对象，RawComparator是一个原生化的接口，它只是简单
 * 提供了用于数据流中简单的数据对比方法，从而提供优化。
 *
 */
public class FirstGroupingComparator implements RawComparator<PairWritable> {

	@Override
	public int compare(PairWritable o1, PairWritable o2) {
		return o1.getFirst().compareTo(o2.getFirst());
	}

	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
		return WritableComparator.compareBytes(b1, 0, l1-4, b2, 0, l2-4);
	}
}

hadoop shuffle过程详解

转自

Shuffle过程是MapReduce的核心，也被称为奇迹发生的地方。要想理解MapReduce， Shuffle是必须要了解的。我看过很多相关的资料，但每次看完都云里雾里的绕着，很难理清大致的逻辑，反而越搅越混。前段时间在做MapReduce job 性能调优的工作，需要深入代码研究MapReduce的运行机制，这才对Shuffle探了个究竟。考虑到之前我在看相关资料而看不懂时很恼火，所以在这里我尽最大的可能试着把Shuffle说清楚，让每一位想了解它原理的朋友都能有所收获。

Shuffle的正常意思是洗牌或弄乱，可能大家更熟悉的是Java API里的Collections.shuffle(List)方法，它会随机地打乱参数list里的元素顺序。如果你不知道MapReduce里Shuffle是什么，那么请看这张图：

这张是官方对Shuffle过程的描述。但我可以肯定的是，单从这张图你基本不可能明白Shuffle的过程，因为它与事实相差挺多，细节也是错乱的。后面我会具体描述Shuffle的事实情况，所以这里你只要清楚Shuffle的大致范围就成－怎样把map task的输出结果有效地传送到reduce端。也可以这样理解， Shuffle描述着数据从map task输出到reduce task输入的这段过程。

在Hadoop这样的集群环境中，大部分map task与reduce task的执行是在不同的节点上。当然很多情况下Reduce执行时需要跨节点去拉取其它节点上的map task结果。如果集群正在运行的job有很多，那么task的正常执行对集群内部的网络资源消耗会很严重。这种网络消耗是正常的，我们不能限制，能做的就是最大化地减少不必要的消耗。还有在节点内，相比于内存，磁盘IO对job完成时间的影响也是可观的。从最基本的要求来说，我们对Shuffle过程的期望可以有：

完整地从map task端拉取数据到reduce 端。
在跨节点拉取数据时，尽可能地减少对带宽的不必要消耗。
减少磁盘IO对task执行的影响。

OK，看到这里时，大家可以先停下来想想，如果是自己来设计这段Shuffle过程，那么你的设计目标是什么。我想能优化的地方主要在于减少拉取数据的量及尽量使用内存而不是磁盘。

我的分析是基于Hadoop0.21.0的源码，如果与你所认识的Shuffle过程有差别，不吝指出。我会以WordCount为例，并假设它有8个map task和3个reduce task。从上图看出，Shuffle过程横跨map与reduce两端，所以下面我也会分两部分来展开。

先看看map端的情况，如下图：

上图可能是某个map task的运行情况。拿它与官方图的左半边比较，会发现很多不一致。官方图没有清楚地说明partition， sort与combiner到底作用在哪个阶段。我画了这张图，希望让大家清晰地了解从map数据输入到map端所有数据准备好的全过程。

整个流程我分了四步。简单些可以这样说，每个map task都有一个内存缓冲区，存储着map的输出结果，当缓冲区快满的时候需要将缓冲区的数据以一个临时文件的方式存放到磁盘，当整个map task结束后再对磁盘中这个map task产生的所有临时文件做合并，生成最终的正式输出文件，然后等待reduce task来拉数据。

当然这里的每一步都可能包含着多个步骤与细节，下面我对细节来一一说明：

在map task执行时，它的输入数据来源于HDFS的block，当然在MapReduce概念中，map task只读取split。Split与block的对应关系可能是多对一，默认是一对一。在WordCount例子里，假设map的输入数据都是像“aaa”这样的字符串。
在经过mapper的运行后，我们得知mapper的输出是这样一个key/value对： key是“aaa”， value是数值1。因为当前map端只做加1的操作，在reduce task里才去合并结果集。前面我们知道这个job有3个reduce task，到底当前的“aaa”应该交由哪个reduce去做呢，是需要现在决定的。 MapReduce提供Partitioner接口，它的作用就是根据key或value及reduce的数量来决定当前的这对输出数据最终应该交由哪个reduce task处理。默认对key hash后再以reduce task数量取模。默认的取模方式只是为了平均reduce的处理能力，如果用户自己对Partitioner有需求，可以订制并设置到job上。在我们的例子中，“aaa”经过Partitioner后返回0，也就是这对值应当交由第一个reducer来处理。接下来，需要将数据写入内存缓冲区中，缓冲区的作用是批量收集map结果，减少磁盘IO的影响。我们的key/value对以及Partition的结果都会被写入缓冲区。当然写入之前，key与value值都会被序列化成字节数组。整个内存缓冲区就是一个字节数组，它的字节索引及key/value存储结构我没有研究过。如果有朋友对它有研究，那么请大致描述下它的细节吧。
这个内存缓冲区是有大小限制的，默认是100MB。当map task的输出结果很多时，就可能会撑爆内存，所以需要在一定条件下将缓冲区中的数据临时写入磁盘，然后重新利用这块缓冲区。这个从内存往磁盘写数据的过程被称为Spill，中文可译为溢写，字面意思很直观。这个溢写是由单独线程来完成，不影响往缓冲区写map结果的线程。溢写线程启动时不应该阻止map的结果输出，所以整个缓冲区有个溢写的比例spill.percent。这个比例默认是0.8，也就是当缓冲区的数据已经达到阈值（buffer size * spill percent = 100MB * 0.8 = 80MB），溢写线程启动，锁定这80MB的内存，执行溢写过程。Map task的输出结果还可以往剩下的20MB内存中写，互不影响。当溢写线程启动后，需要对这80MB空间内的key做排序(Sort)。排序是MapReduce模型默认的行为，这里的排序也是对序列化的字节做的排序。在这里我们可以想想，因为map task的输出是需要发送到不同的reduce端去，而内存缓冲区没有对将发送到相同reduce端的数据做合并，那么这种合并应该是体现是磁盘文件中的。从官方图上也可以看到写到磁盘中的溢写文件是对不同的reduce端的数值做过合并。所以溢写过程一个很重要的细节在于，如果有很多个key/value对需要发送到某个reduce端去，那么需要将这些key/value值拼接到一块，减少与partition相关的索引记录。在针对每个reduce端而合并数据时，有些数据可能像这样：“aaa”/1， “aaa”/1。对于WordCount例子，就是简单地统计单词出现的次数，如果在同一个map task的结果中有很多个像“aaa”一样出现多次的key，我们就应该把它们的值合并到一块，这个过程叫reduce也叫combine。但MapReduce的术语中，reduce只指reduce端执行从多个map task取数据做计算的过程。除reduce外，非正式地合并数据只能算做combine了。其实大家知道的，MapReduce中将Combiner等同于Reducer。如果client设置过Combiner，那么现在就是使用Combiner的时候了。将有相同key的key/value对的value加起来，减少溢写到磁盘的数据量。Combiner会优化MapReduce的中间结果，所以它在整个模型中会多次使用。那哪些场景才能使用Combiner呢？从这里分析，Combiner的输出是Reducer的输入，Combiner绝不能改变最终的计算结果。所以从我的想法来看，Combiner只应该用于那种Reduce的输入key/value与输出key/value类型完全一致，且不影响最终结果的场景。比如累加，最大值等。Combiner的使用一定得慎重，如果用好，它对job执行效率有帮助，反之会影响reduce的最终结果。
每次溢写会在磁盘上生成一个溢写文件，如果map的输出结果真的很大，有多次这样的溢写发生，磁盘上相应的就会有多个溢写文件存在。当map task真正完成时，内存缓冲区中的数据也全部溢写到磁盘中形成一个溢写文件。最终磁盘中会至少有一个这样的溢写文件存在(如果map的输出结果很少，当map执行完成时，只会产生一个溢写文件)，因为最终的文件只有一个，所以需要将这些溢写文件归并到一起，这个过程就叫做Merge。Merge是怎样的？如前面的例子，“aaa”从某个map task读取过来时值是5，从另外一个map 读取时值是8，因为它们有相同的key，所以得merge成group。什么是group。对于“aaa”就是像这样的：{“aaa”, [5, 8, 2, …]}，数组中的值就是从不同溢写文件中读取出来的，然后再把这些值加起来。请注意，因为merge是将多个溢写文件合并到一个文件，所以可能也有相同的key存在，在这个过程中如果client设置过Combiner，也会使用Combiner来合并相同的key。

至此，map端的所有工作都已结束，最终生成的这个文件也存放在TaskTracker够得着的某个本地目录内。每个reduce task不断地通过RPC从JobTracker那里获取map task是否完成的信息，如果reduce task得到通知，获知某台TaskTracker上的map task执行完成，Shuffle的后半段过程开始启动。

简单地说，reduce task在执行之前的工作就是不断地拉取当前job里每个map task的最终结果，然后对从不同地方拉取过来的数据不断地做merge，也最终形成一个文件作为reduce task的输入文件。见下图：

如map 端的细节图，Shuffle在reduce端的过程也能用图上标明的三点来概括。当前reduce copy数据的前提是它要从JobTracker获得有哪些map task已执行结束，这段过程不表，有兴趣的朋友可以关注下。Reducer真正运行之前，所有的时间都是在拉取数据，做merge，且不断重复地在做。如前面的方式一样，下面我也分段地描述reduce 端的Shuffle细节：

Copy过程，简单地拉取数据。Reduce进程启动一些数据copy线程(Fetcher)，通过HTTP方式请求map task所在的TaskTracker获取map task的输出文件。因为map task早已结束，这些文件就归TaskTracker管理在本地磁盘中。
Merge阶段。这里的merge如map端的merge动作，只是数组中存放的是不同map端copy来的数值。Copy过来的数据会先放入内存缓冲区中，这里的缓冲区大小要比map端的更为灵活，它基于JVM的heap size设置，因为Shuffle阶段Reducer不运行，所以应该把绝大部分的内存都给Shuffle用。这里需要强调的是，merge有三种形式：1)内存到内存 2)内存到磁盘 3)磁盘到磁盘。默认情况下第一种形式不启用，让人比较困惑，是吧。当内存中的数据量到达一定阈值，就启动内存到磁盘的merge。与map 端类似，这也是溢写的过程，这个过程中如果你设置有Combiner，也是会启用的，然后在磁盘中生成了众多的溢写文件。第二种merge方式一直在运行，直到没有map端的数据时才结束，然后启动第三种磁盘到磁盘的merge方式生成最终的那个文件。
Reducer的输入文件。不断地merge后，最后会生成一个“最终文件”。为什么加引号？因为这个文件可能存在于磁盘上，也可能存在于内存中。对我们来说，当然希望它存放于内存中，直接作为Reducer的输入，但默认情况下，这个文件是存放于磁盘中的。至于怎样才能让这个文件出现在内存中，之后的性能优化篇我再说。当Reducer的输入文件已定，整个Shuffle才最终结束。然后就是Reducer执行，把结果放到HDFS上。

上面就是整个Shuffle的过程。细节很多，我很多都略过了，只试着把要点说明白。当然，我可能也有理解或表述上的很多问题，不吝指点。