标签:des style blog http color java os 使用 io
/**
 * Command-line entry point: runs a new SolrIndexerJob through ToolRunner using the
 * configuration returned by NutchConfiguration.create(), then terminates the JVM with
 * the status code that ToolRunner.run() returned.
 */
public static void main(String[] args) throws Exception { final int res = ToolRunner.run(NutchConfiguration.create(), new SolrIndexerJob(), args); System.exit(res); }使用了ToolRunner.run()来执行程序,可参考:使用ToolRunner运行Hadoop程序基本原理分析。
public int run(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: SolrIndexerJob <solr url> (<batchId> | -all | -reindex) [-crawlId <id>]"); return -1; } if (args.length == 4 && "-crawlId".equals(args[2])) { getConf().set(Nutch.CRAWL_ID_KEY, args[3]); } try { indexSolr(args[0], args[1]); return 0; } catch (final Exception e) { LOG.error("SolrIndexerJob: " + StringUtils.stringifyException(e)); return -1; } }先判断参数的合理性,然后执行indexSolr(String,String)方法。
public void indexSolr(String solrUrl, String batchId) throws Exception { LOG.info("SolrIndexerJob: starting"); run(ToolUtil.toArgMap( Nutch.ARG_SOLR, solrUrl, Nutch.ARG_BATCH, batchId)); // do the commits once and for all the reducers in one go getConf().set(SolrConstants.SERVER_URL,solrUrl); SolrServer solr = SolrUtils.getCommonsHttpSolrServer(getConf()); if (getConf().getBoolean(SolrConstants.COMMIT_INDEX, true)) { solr.commit(); } LOG.info("SolrIndexerJob: done."); }
/**
 * Configures and runs the "solr-index" job, blocking until it completes.
 *
 * @param args argument map; the values under Nutch.ARG_SOLR (Solr URL) and
 *             Nutch.ARG_BATCH (batch id) are read and cast to String
 * @return the {@code results} map after the job status has been recorded in it
 * @throws Exception if job creation or execution throws
 */
@Override
public Map<String,Object> run(Map<String,Object> args) throws Exception {
  final String serverUrl = (String) args.get(Nutch.ARG_SOLR);
  final String batch = (String) args.get(Nutch.ARG_BATCH);

  // Register SolrWriter as an index writer and tell it where the server lives.
  NutchIndexWriterFactory.addClassToConf(getConf(), SolrWriter.class);
  getConf().set(SolrConstants.SERVER_URL, serverUrl);

  currentJob = createIndexJob(getConf(), "solr-index", batch);
  // The boolean result of waitForCompletion is not inspected here.
  currentJob.waitForCompletion(true);

  ToolUtil.recordJobStatus(null, currentJob, results);
  return results;
}
protected Job createIndexJob(Configuration conf, String jobName, String batchId) throws IOException, ClassNotFoundException { conf.set(GeneratorJob.BATCH_ID, batchId); Job job = new NutchJob(conf, jobName); // TODO: Figure out why this needs to be here job.getConfiguration().setClass("mapred.output.key.comparator.class", StringComparator.class, RawComparator.class); Collection<WebPage.Field> fields = getFields(job); StorageUtils.initMapperJob(job, fields, String.class, NutchDocument.class, IndexerMapper.class); job.setNumReduceTasks(0); job.setOutputFormatClass(IndexerOutputFormat.class); return job; } }
public static class IndexerMapper extends GoraMapper<String, WebPage, String, NutchDocument> { public IndexUtil indexUtil; public DataStore<String, WebPage> store; protected Utf8 batchId; @Override public void setup(Context context) throws IOException { Configuration conf = context.getConfiguration(); batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR)); indexUtil = new IndexUtil(conf); try { store = StorageUtils.createWebStore(conf, String.class, WebPage.class); } catch (ClassNotFoundException e) { throw new IOException(e); } } protected void cleanup(Context context) throws IOException ,InterruptedException { store.close(); }; @Override public void map(String key, WebPage page, Context context) throws IOException, InterruptedException { ParseStatus pstatus = page.getParseStatus(); if (pstatus == null || !ParseStatusUtils.isSuccess(pstatus) || pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) { return; // filter urls not parsed } Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page); if (!batchId.equals(REINDEX)) { if (!NutchJob.shouldProcess(mark, batchId)) { if (LOG.isDebugEnabled()) { LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different batch id (" + mark + ")"); } return; } } NutchDocument doc = indexUtil.index(key, page); if (doc == null) { return; } if (mark != null) { Mark.INDEX_MARK.putMark(page, Mark.UPDATEDB_MARK.checkMark(page)); store.put(key, page); } context.write(key, doc); } }
public NutchDocument index(String key, WebPage page) { NutchDocument doc = new NutchDocument(); doc.add("id", key); doc.add("digest", StringUtil.toHexString(page.getSignature())); if (page.getBatchId() != null) { doc.add("batchId", page.getBatchId().toString()); } String url = TableUtil.unreverseUrl(key); if (LOG.isDebugEnabled()) { LOG.debug("Indexing URL: " + url); } try { doc = filters.filter(doc, url, page); } catch (IndexingException e) { LOG.warn("Error indexing "+key+": "+e); return null; } // skip documents discarded by indexing filters if (doc == null) return null; float boost = 1.0f; // run scoring filters try { boost = scoringFilters.indexerScore(url, doc, page, boost); } catch (final ScoringFilterException e) { LOG.warn("Error calculating score " + key + ": " + e); return null; } doc.setScore(boost); // store boost for use by explain and dedup doc.add("boost", Float.toString(boost)); return doc; }
标签:des style blog http color java os 使用 io
原文地址:http://blog.csdn.net/jediael_lu/article/details/38817983