码迷,mamicode.com
首页 > 其他好文 > 详细

聚类-----KMeans

时间:2017-11-07 18:14:18      阅读:175      评论:0      收藏:0      [点我收藏+]

标签:res   1.4   each   creat   int   only   style   session   最大   

package Spark_MLlib

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}

/**
  * K均值
  */
case class features_schema(features:Vector)
object 聚类__KMeans {
       val spark=SparkSession.builder().master("local[2]").getOrCreate()
       import spark.implicits._
  def main(args: Array[String]): Unit = {

       val data=spark.sparkContext.textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt")
                  .map(_.split(",")).map(x=>features_schema(Vectors.dense(x(0).toDouble,x(1).toDouble,x(2).toDouble,x(3).toDouble))).toDF()
         data.show()
        val KMeansModel=new KMeans().setK(7).setFeaturesCol("features").setPredictionCol("prediction").fit(data)
        val results=KMeansModel.transform(data)
         results.show(150)
        //模型所有的聚类中心(指最后生成的聚类中心,K是几就有几组)的情况
         KMeansModel.clusterCenters.foreach(println)
        //集合内误差平方和(选取K的大小可以参照,使用场景+最大的集合内误差平方的值=较合适的K)
         val cost=KMeansModel.computeCost(data)
         println(cost)
  }
}

结果:

+-----------------+
|         features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|
|[4.3,3.0,1.1,0.1]|
|[5.8,4.0,1.2,0.2]|
|[5.7,4.4,1.5,0.4]|
|[5.4,3.9,1.3,0.4]|
|[5.1,3.5,1.4,0.3]|
|[5.7,3.8,1.7,0.3]|
|[5.1,3.8,1.5,0.3]|
+-----------------+
only showing top 20 rows

+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         0|
|[4.9,3.0,1.4,0.2]|         0|
|[4.7,3.2,1.3,0.2]|         0|
|[4.6,3.1,1.5,0.2]|         0|
|[5.0,3.6,1.4,0.2]|         0|
|[5.4,3.9,1.7,0.4]|         0|
|[4.6,3.4,1.4,0.3]|         0|
|[5.0,3.4,1.5,0.2]|         0|
|[4.4,2.9,1.4,0.2]|         0|
|[4.9,3.1,1.5,0.1]|         0|
|[5.4,3.7,1.5,0.2]|         0|
|[4.8,3.4,1.6,0.2]|         0|
|[4.8,3.0,1.4,0.1]|         0|
|[4.3,3.0,1.1,0.1]|         0|
|[5.8,4.0,1.2,0.2]|         0|
|[5.7,4.4,1.5,0.4]|         0|
|[5.4,3.9,1.3,0.4]|         0|
|[5.1,3.5,1.4,0.3]|         0|
|[5.7,3.8,1.7,0.3]|         0|
|[5.1,3.8,1.5,0.3]|         0|
+-----------------+----------+
only showing top 20 rows

[5.005999999999999,3.4180000000000006,1.4640000000000002,0.2439999999999999]
[6.8538461538461535,3.076923076923076,5.715384615384614,2.0538461538461537]
[5.883606557377049,2.740983606557377,4.388524590163936,1.4344262295081966]
78.94506582597859

聚类-----KMeans

标签:res   1.4   each   creat   int   only   style   session   最大   

原文地址:http://www.cnblogs.com/soyo/p/7799422.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!