本文共 2866 字,大约阅读时间需要 9 分钟。
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
在 IDEA 中为 Maven 项目添加 JDK 8 的编译插件配置(maven-compiler-plugin):
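<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-compiler-plugin</artifactId>
    <configuration>
        <source>1.8</source>
        <target>1.8</target>
    </configuration>
</plugin>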
public static void main(String[] args){ //conf SparkConf conf = new SparkConf(); conf.setMaster("local"); conf.setAppName("wc"); //context JavaSparkContext context = new JavaSparkContext(conf); JavaRDDrdd1 = context.textFile("/home/wang/txt/word.txt"); //lambda: 表达式 JavaRDD rdd2 = rdd1.flatMap(s -> Arrays.asList(s.split(" ")).iterator()); JavaPairRDD rdd3 = rdd2.mapToPair(s -> new Tuple2 (s, 1)); JavaPairRDD rdd4 = rdd3.reduceByKey((x, y) -> x + y); //按单词升序 List > list1 = rdd4.sortByKey(true).collect(); //按词频降序 JavaPairRDD rdd5 = rdd4.mapToPair(x -> new Tuple2 (x._2, x._1)) .sortByKey() .mapToPair(x -> new Tuple2 (x._2, x._1)); List > list2 = rdd5.collect(); }
public static void main(String[] args){ //conf, context........ 同上 //1, a b c ===> split( ) : string[] {a,b,c} JavaRDDrdd2 = rdd1.flatMap(new FlatMapFunction () { public Iterator call(String s) throws Exception { String[] arr = s.split(" "); return Arrays.asList(arr).iterator(); } }); //2, string[] {a,b,c}==>(a,1),(b,1) JavaPairRDD rdd3 = rdd2.mapToPair(new PairFunction () { public Tuple2 call(String s) throws Exception { return new Tuple2 (s, 1); } }).filter(new Function , Boolean>() {//过滤空字符 public Boolean call(Tuple2 v1) throws Exception { return v1._1.trim().length()>0 ; } }); //3, (a,1),(b,1) ==> reduceByKey: (a,4), (b,3) JavaPairRDD rdd4 = rdd3.reduceByKey(new Function2 () { public Integer call(Integer v1, Integer v2) throws Exception { return v1+v2; } }); //4.1 排序( 字母生序) JavaPairRDD rddRes = rdd4.sortByKey(); List > list1 = rddRes.collect(); //4.2排序( 词频降序) JavaPairRDD sortRdd1 = rdd4.mapToPair(new PairFunction , Integer, String>() { public Tuple2 call(Tuple2 tup) throws Exception { return new Tuple2 (tup._2, tup._1); } }); JavaPairRDD sortRdd2 = sortRdd1.sortByKey(false).mapToPair(new PairFunction , String, Integer>() { public Tuple2 call(Tuple2 tup) throws Exception { return new Tuple2 (tup._2, tup._1); } }); List > list2 = sortRdd2.collect(); }
转载地址:http://gpdef.baihongyu.com/