标签:
/**
 * Estimates the total size, in bytes, of all rows in `rdd` as reported by `SizeEstimator`.
 *
 * If the RDD holds more than `sampleRows` rows, a random sample of `sampleRows` rows is
 * brought to the driver, measured, and the result is scaled by the total row count.
 * Otherwise every row is measured via [[getRDDSize]].
 *
 * Runs `rdd.count()` plus one further Spark job, so callers may want to cache `rdd` first.
 *
 * @param rdd        rows to measure
 * @param sampleRows number of rows to sample when the RDD is larger than this; must be positive
 * @return the estimated total size in bytes (0 for an empty RDD)
 * @throws IllegalArgumentException if `sampleRows` is not positive
 */
def getTotalSize(rdd: RDD[Row], sampleRows: Int = 10): Long = {
  require(sampleRows > 0, s"sampleRows must be positive, got $sampleRows")
  val totalRows = rdd.count()
  if (totalRows > sampleRows) {
    // RDD.sample takes a *fraction*, not a row count; takeSample returns an exact number of
    // rows as a local array, which is what the scaling below needs.
    val sampled = rdd.takeSample(withReplacement = false, num = sampleRows)
    if (sampled.isEmpty) 0L
    else sampled.map(estimateRowSize).sum * totalRows / sampled.length
  } else {
    // The RDD is no larger than the sample size, so measuring every row is cheap.
    getRDDSize(rdd)
  }
}

/**
 * Sums the `SizeEstimator` estimate of every row in `rdd`.
 *
 * The whole RDD is collected to the driver, so this should only be called on RDDs that are
 * known to be small.
 *
 * @param rdd rows to measure
 * @return the summed estimated size in bytes (0 for an empty RDD)
 */
def getRDDSize(rdd: RDD[Row]): Long =
  rdd.collect().map(estimateRowSize).sum

/** Estimates the size in bytes of one row's values, boxed as a `Seq[AnyRef]`. */
private def estimateRowSize(row: Row): Long =
  SizeEstimator.estimate(row.toSeq.map(_.asInstanceOf[AnyRef]))
标签:
原文地址:http://www.cnblogs.com/bigbigtree/p/5669927.html