标签:
为了简化通过命令行执行作业的过程,Hadoop配备了一些辅助类。GenericOptionsParser就是其中之一,它用来解释常用的Hadoop命令行选项,并根据需要为Configuration对象设置相应的值。
通常不直接使用GenericOptionsParser,更方便的方式是:实现Tool接口,通过ToolRunner来执行应用程序,ToolRunner内部调用GenericOptionsParser。
A utility to help run Tools.
ToolRunner can be used to run classes implementing Tool interface. It works in conjunction with GenericOptionsParser to parse the generic hadoop command line arguments and modifies the Configuration of the Tool. The application-specific options are passed along without being modified.
public static int run(Configuration conf, Tool tool, String[] args) throws Exception
public static int run(Tool tool, String[] args) throws Exception
4、ToolRunner完成以下2个功能:
(1)为Tool创建一个Configuration对象。
(2)使得程序能够方便地读取参数配置。
ToolRunner完整源码如下:
package org.apache.hadoop.util;

import java.io.PrintStream;

import org.apache.hadoop.conf.Configuration;

/**
 * Launcher for classes that implement {@link Tool}.
 *
 * <p>The runner hands the command line to {@link GenericOptionsParser}, which
 * consumes the
 * <a href="{@docRoot}/org/apache/hadoop/util/GenericOptionsParser.html#GenericOptions">
 * generic hadoop command line arguments</a> and applies them to the
 * <code>Configuration</code> of the <code>Tool</code>. Whatever is left over
 * (the application-specific options) is forwarded to the tool untouched.
 * </p>
 *
 * @see Tool
 * @see GenericOptionsParser
 */
public class ToolRunner {

  /**
   * Parses the generic options out of <code>args</code> and then invokes
   * {@link Tool#run(String[])} with the remaining arguments.
   *
   * <p>When <code>conf</code> is <code>null</code> a fresh
   * <code>Configuration</code> is created. The configuration that was handed
   * to the parser is installed on the tool before it runs.</p>
   *
   * @param conf <code>Configuration</code> for the <code>Tool</code>, may be null.
   * @param tool <code>Tool</code> to run.
   * @param args command-line arguments to the tool.
   * @return exit code of the {@link Tool#run(String[])} method.
   */
  public static int run(Configuration conf, Tool tool, String[] args)
      throws Exception {
    final Configuration effectiveConf =
        (conf != null) ? conf : new Configuration();

    // The parser is constructed first so that the tool receives the
    // configuration only after the generic options have been processed.
    final GenericOptionsParser optionsParser =
        new GenericOptionsParser(effectiveConf, args);
    tool.setConf(effectiveConf);

    // Only the arguments that are not generic hadoop options reach the tool.
    return tool.run(optionsParser.getRemainingArgs());
  }

  /**
   * Runs the <code>Tool</code> using the <code>Configuration</code> it
   * already carries; shorthand for <code>run(tool.getConf(), tool, args)</code>.
   *
   * @param tool <code>Tool</code> to run.
   * @param args command-line arguments to the tool.
   * @return exit code of the {@link Tool#run(String[])} method.
   */
  public static int run(Tool tool, String[] args) throws Exception {
    final Configuration toolConf = tool.getConf();
    return run(toolConf, tool, args);
  }

  /**
   * Prints generic command-line arguments and usage information.
   *
   * @param out stream to write usage information to.
   */
  public static void printGenericCommandUsage(PrintStream out) {
    GenericOptionsParser.printGenericCommandUsage(out);
  }
}
Unless explicitly turned off, Hadoop by default specifies two resources, loaded in-order from the classpath:
static{ //print deprecation warning if hadoop-site.xml is found in classpath ClassLoader cL = Thread.currentThread().getContextClassLoader(); if (cL == null) { cL = Configuration.class.getClassLoader(); } if(cL.getResource("hadoop-site.xml")!=null) { LOG.warn("DEPRECATED: hadoop-site.xml found in the classpath. " + "Usage of hadoop-site.xml is deprecated. Instead use core-site.xml, " + "mapred-site.xml and hdfs-site.xml to override properties of " + "core-default.xml, mapred-default.xml and hdfs-default.xml " + "respectively"); } addDefaultResource("core-default.xml"); addDefaultResource("core-site.xml"); }Configuration.java的源码中包括了以上代码,即通过静态语句为程序载入core-default.xml以及core-site.xml中的參数。
for (Entry<String, String> entry : conf){ ..... }
package org.apache.hadoop.util;

import org.apache.hadoop.conf.Configurable;

/**
 * A command-line tool that can be configured through {@link Configurable}
 * and executed with a list of arguments.
 */
public interface Tool extends Configurable {

  /**
   * Executes the tool.
   *
   * @param args command-line arguments for the tool.
   * @return the exit code of the tool.
   * @throws Exception if the tool fails.
   */
  int run(String[] args) throws Exception;
}
由此可见,Tool自身只有一个方法run(String[]),同时它继承了Configurable的2个方法。
package org.apache.hadoop.conf;

/**
 * Contract for objects that can be given, and can hand back, a
 * {@link Configuration}.
 */
public interface Configurable {

  /**
   * Supplies the configuration this object should use.
   *
   * @param conf the configuration to install.
   */
  void setConf(Configuration conf);

  /**
   * Returns the configuration held by this object.
   *
   * @return the current configuration.
   */
  Configuration getConf();
}
package org.apache.hadoop.conf; public class Configured implements Configurable { private Configuration conf; public Configured() { this(null); } public Configured(Configuration conf) { setConf(conf); } public void setConf(Configuration conf) { this.conf = conf; } public Configuration getConf() { return conf; } }它有2个构造方法。各自是带Configuration參数的方法与不还參数的方法。
package org.jediael.hadoopdemo.toolrunnerdemo;

import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Demo {@link Tool} that prints every property of the configuration it was
 * given, one {@code key=value} pair per line.
 */
public class ToolRunnerDemo extends Configured implements Tool {

  static {
    // Intentionally empty: the additional default resources below are disabled.
    //Configuration.addDefaultResource("hdfs-default.xml");
    //Configuration.addDefaultResource("hdfs-site.xml");
    //Configuration.addDefaultResource("mapred-default.xml");
    //Configuration.addDefaultResource("mapred-site.xml");
  }

  /**
   * Dumps the tool's configuration to standard output.
   *
   * @param args remaining command-line arguments (not used).
   * @return always 0.
   */
  @Override
  public int run(String[] args) throws Exception {
    printProperties(getConf());
    return 0;
  }

  /** Writes each property of {@code conf} to standard output as key=value. */
  private static void printProperties(Configuration conf) {
    for (Entry<String, String> property : conf) {
      final String name = property.getKey();
      final String value = property.getValue();
      System.out.printf("%s=%s\n", name, value);
    }
  }

  /**
   * Entry point: runs the demo through {@link ToolRunner} and exits with the
   * code it returns.
   */
  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new ToolRunnerDemo(), args));
  }
}
package org.jediael.hadoopdemo.toolrunnerdemo;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Word-count MapReduce job that is launched through {@link ToolRunner}.
 *
 * <p>Expects two arguments after the generic Hadoop options: the input path
 * and the output path.</p>
 */
public class WordCount extends Configured implements Tool {

  /** Emits {@code (token, 1)} for every whitespace-separated token of a line. */
  public static class WordCountMap extends
      Mapper<LongWritable, Text, Text, IntWritable> {

    private final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer tokens = new StringTokenizer(value.toString());
      while (tokens.hasMoreTokens()) {
        word.set(tokens.nextToken());
        context.write(word, one);
      }
    }
  }

  /** Sums the counts emitted for each word. */
  public static class WordCountReduce extends
      Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values,
        Context context) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }

  /**
   * Configures and submits the word-count job, then waits for it to finish.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} the output
   *             path; any further arguments are ignored.
   * @return 0 if the job completed successfully, -1 if it failed or the
   *         required arguments are missing.
   */
  @Override
  public int run(String[] args) throws Exception {
    if (args == null || args.length < 2) {
      System.err.println("Usage: WordCount [generic options] <input path> <output path>");
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    // Use the configuration installed by ToolRunner so that generic options
    // given on the command line are honoured; creating a new Configuration
    // here would discard them.
    Configuration conf = getConf();
    if (conf == null) {
      // run() was called directly, without ToolRunner or setConf().
      conf = new Configuration();
    }

    // NOTE(review): this constructor is reportedly deprecated in newer Hadoop
    // releases in favour of Job.getInstance(conf); kept because the Hadoop
    // version targeted by this example is not visible here.
    Job job = new Job(conf);
    job.setJarByClass(WordCount.class);
    job.setJobName("wordcount");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCountMap.class);
    job.setReducerClass(WordCountReduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : -1;
  }

  /** Entry point: runs the job through {@link ToolRunner} and exits with its code. */
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new WordCount(), args);
    System.exit(exitCode);
  }
}
执行程序:
[root@jediael project]# hadoop fs -mkdir wcin2 [root@jediael project]# hadoop fs -copyFromLocal /opt/jediael/apache-nutch-2.2.1/CHANGES.txt wcin2 [root@jediael project]# hadoop jar wordcount2.jar org.jediael.hadoopdemo.toolrunnerdemo.WordCount wcin2 wcout2
版权声明:本文为博主原创文章,未经博主同意不得转载。
标签:
原文地址:http://www.cnblogs.com/yxwkf/p/4832759.html