码迷,mamicode.com
首页 > 编程语言 > 详细

spark中transformation操作的各种算子(java版)

时间:2016-05-07 08:13:42      阅读:277      评论:0      收藏:0      [点我收藏+]

标签:

package cn.spark.study.core;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
* transformation操作实战
* @author dd
*
*/
/**
 * Hands-on examples of common Spark RDD transformation operators
 * (map, filter, flatMap, groupByKey, reduceByKey, sortByKey, join).
 *
 * <p>Every demo builds its own {@link JavaSparkContext} with master
 * {@code "local"}, runs one transformation over a small in-memory collection,
 * prints the result with {@code foreach}, and closes the context in a
 * {@code finally} block so the context is released even when the job fails.
 */
public class TransformationOperation {

    /**
     * Entry point. Uncomment the demo(s) you want to run.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        //mapTest();
        //filterTest();
        //flatMapTest();
        //groupByKeyTest();
        //reduceByKeyTest();
        //sortByKeyTest();
        joinTest();
    }

    /**
     * Creates a Spark context that runs with master {@code "local"}.
     *
     * @param appName application name passed to {@link SparkConf#setAppName(String)}
     * @return a new context; the caller is responsible for closing it
     */
    private static JavaSparkContext newLocalContext(String appName) {
        SparkConf conf = new SparkConf()
                .setAppName(appName)
                .setMaster("local");
        return new JavaSparkContext(conf);
    }

    /**
     * map operator example: multiplies every element of the collection by 2.
     */
    private static void mapTest() {
        JavaSparkContext sc = newLocalContext("map");
        try {
            List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5);
            JavaRDD<Integer> numberRDD = sc.parallelize(numbers);

            // map applies the function to each element and yields exactly one output per input.
            JavaRDD<Integer> multipleNumberRDD = numberRDD.map(new Function<Integer, Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Integer call(Integer value) throws Exception {
                    return value * 2;
                }
            });

            multipleNumberRDD.foreach(new VoidFunction<Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(Integer value) throws Exception {
                    System.out.print(value + " ");
                }
            });
        } finally {
            sc.close();
        }
    }

    /**
     * filter operator example: keeps only the even numbers of the collection.
     */
    private static void filterTest() {
        JavaSparkContext sc = newLocalContext("filter");
        try {
            List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
            JavaRDD<Integer> numberRDD = sc.parallelize(numbers);

            // filter also takes a Function, but its call method returns a Boolean:
            // return true to keep the element in the new RDD, false to drop it.
            JavaRDD<Integer> evenNumberRDD = numberRDD.filter(new Function<Integer, Boolean>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Boolean call(Integer value) throws Exception {
                    return value % 2 == 0;
                }
            });

            evenNumberRDD.foreach(new VoidFunction<Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(Integer value) throws Exception {
                    System.out.println(value);
                }
            });
        } finally {
            sc.close();
        }
    }

    /**
     * flatMap operator example: splits each line of text into its words.
     */
    private static void flatMapTest() {
        JavaSparkContext sc = newLocalContext("faltMap");
        try {
            List<String> lineList = Arrays.asList("hello you", "hello me", "hello world");
            JavaRDD<String> lines = sc.parallelize(lineList);

            // flatMap receives each element of the source RDD and returns zero or more
            // elements wrapped in an Iterable; the new RDD holds all returned elements
            // flattened together. Its size therefore depends on what the function returns
            // (here every line yields two words, so the result is larger than the input).
            JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Iterable<String> call(String line) throws Exception {
                    return Arrays.asList(line.split(" "));
                }
            });

            words.foreach(new VoidFunction<String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(String word) throws Exception {
                    System.out.println(word);
                }
            });
        } finally {
            sc.close();
        }
    }

    /**
     * groupByKey operator example: groups scores by class.
     */
    private static void groupByKeyTest() {
        JavaSparkContext sc = newLocalContext("groupByKey");
        try {
            List<Tuple2<String, Integer>> scores = Arrays.asList(
                    new Tuple2<String, Integer>("class1", 80),
                    new Tuple2<String, Integer>("class2", 75),
                    new Tuple2<String, Integer>("class1", 90),
                    new Tuple2<String, Integer>("class2", 65));

            // Key/value collections are parallelized into a JavaPairRDD.
            JavaPairRDD<String, Integer> scoresRDD = sc.parallelizePairs(scores);

            // After grouping, each key maps to an Iterable of all its values.
            JavaPairRDD<String, Iterable<Integer>> groupScores = scoresRDD.groupByKey();

            groupScores.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<String, Iterable<Integer>> group) throws Exception {
                    System.out.println("class:" + group._1);
                    for (Integer score : group._2) {
                        System.out.println(score);
                    }
                    System.out.println("====================================");
                }
            });
        } finally {
            sc.close();
        }
    }

    /**
     * reduceByKey operator example: computes the total score of each class.
     */
    private static void reduceByKeyTest() {
        JavaSparkContext sc = newLocalContext("reduceByKey");
        try {
            List<Tuple2<String, Integer>> scores = Arrays.asList(
                    new Tuple2<String, Integer>("class1", 80),
                    new Tuple2<String, Integer>("class2", 75),
                    new Tuple2<String, Integer>("class1", 90),
                    new Tuple2<String, Integer>("class2", 65));

            JavaPairRDD<String, Integer> scoresRDD = sc.parallelizePairs(scores);

            // The function combines two values of the same key into one.
            JavaPairRDD<String, Integer> totalScores =
                    scoresRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Integer call(Integer left, Integer right) throws Exception {
                            return left + right;
                        }
                    });

            totalScores.foreach(new VoidFunction<Tuple2<String, Integer>>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<String, Integer> total) throws Exception {
                    System.out.println(total._1 + " : " + total._2);
                }
            });
        } finally {
            sc.close();
        }
    }

    /**
     * sortByKey operator example: sorts students by their score.
     */
    private static void sortByKeyTest() {
        JavaSparkContext sc = newLocalContext("sortByKey");
        try {
            List<Tuple2<Integer, String>> scores = Arrays.asList(
                    new Tuple2<Integer, String>(10, "leo"),
                    new Tuple2<Integer, String>(100, "ksc"),
                    new Tuple2<Integer, String>(99, "my"),
                    new Tuple2<Integer, String>(80, "jack"));

            JavaPairRDD<Integer, String> scoresRDD = sc.parallelizePairs(scores);

            // Without an argument the sort is ascending (same as sortByKey(true));
            // pass false for descending order.
            JavaPairRDD<Integer, String> sortedRDD = scoresRDD.sortByKey();

            sortedRDD.foreach(new VoidFunction<Tuple2<Integer, String>>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<Integer, String> entry) throws Exception {
                    System.out.println(entry._1 + ": " + entry._2);
                }
            });
        } finally {
            sc.close();
        }
    }

    /**
     * join operator example: prints each student together with their score.
     */
    private static void joinTest() {
        JavaSparkContext sc = newLocalContext("joinandCogroup");
        try {
            List<Tuple2<Integer, String>> studentsList = Arrays.asList(
                    new Tuple2<Integer, String>(1, "leo"),
                    new Tuple2<Integer, String>(2, "jack"),
                    new Tuple2<Integer, String>(3, "tom"));
            List<Tuple2<Integer, Integer>> scoresList = Arrays.asList(
                    new Tuple2<Integer, Integer>(1, 100),
                    new Tuple2<Integer, Integer>(2, 90),
                    new Tuple2<Integer, Integer>(3, 60));

            // Parallelize both collections.
            JavaPairRDD<Integer, String> studentsRDD = sc.parallelizePairs(studentsList);
            JavaPairRDD<Integer, Integer> scoresRDD = sc.parallelizePairs(scoresList);

            // join matches the two RDDs by key; each result value is a
            // (student name, score) tuple for the shared student id.
            JavaPairRDD<Integer, Tuple2<String, Integer>> studentscores =
                    studentsRDD.join(scoresRDD);

            studentscores.foreach(
                    new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public void call(Tuple2<Integer, Tuple2<String, Integer>> record)
                                throws Exception {
                            System.out.println("student id : " + record._1);
                            System.out.println("student name: " + record._2._1);
                            System.out.println("student score: " + record._2._2);
                            System.out.println("==========================================");
                        }
                    });
        } finally {
            // The original version of this demo never closed the context.
            sc.close();
        }
    }

}

spark中transformation操作的各种算子(java版)

标签:

原文地址:http://blog.csdn.net/kongshuchen/article/details/51334115

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!