val path = "/usr/data/lfw-a/*"
val rdd = sc.wholeTextFiles(path)
val first = rdd.first
println(first)
val files = rdd.map { case (fileName, content) => fileName.replace("file:", "") }
println(files.first)
println(files.count)
%pyspark
import matplotlib.pyplot as plt

path = "/usr/data/lfw-a/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
ae = plt.imread(path)
plt.imshow(ae)
plt.show()
import java.awt.image.BufferedImage
import javax.imageio.ImageIO
import java.io.File

def loadImageFromFile(path: String): BufferedImage = {
  ImageIO.read(new File(path))
}
val aePath = "/usr/data/lfw-a/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
val aeImage = loadImageFromFile(aePath)
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
  // Draw the source image into a grayscale buffer of the target size,
  // converting to grayscale and resizing in a single step.
  val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
  val g = bwImage.getGraphics()
  g.drawImage(image, 0, 0, width, height, null)
  g.dispose()
  bwImage
}
val grayImage = processImage(aeImage, 100, 100)
ImageIO.write(grayImage, "jpg", new File("/tmp/aeGray.jpg"))
%pyspark
import matplotlib.pyplot as plt

tmpPath = "/tmp/aeGray.jpg"
aeGray = plt.imread(tmpPath)
plt.imshow(aeGray, cmap=plt.cm.gray)
plt.show()
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
  val width = image.getWidth
  val height = image.getHeight
  val pixels = Array.ofDim[Double](width * height)
  image.getData.getPixels(0, 0, width, height, pixels)
}
def extractPixels(path: String, width: Int, height: Int): Array[Double] = {
  val raw = loadImageFromFile(path)
  val processed = processImage(raw, width, height)
  getPixelsFromImage(processed)
}
val pixels = files.map(f => extractPixels(f, 50, 50))
println(pixels.take(10).map(_.take(10).mkString("", ",", ", ...")).mkString("\n"))
import org.apache.spark.mllib.linalg.Vectors

val vectors = pixels.map(p => Vectors.dense(p))
vectors.setName("image-vectors")
vectors.cache
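As a quick sanity check (not in the original post), each vector should hold 50 × 50 = 2,500 elements, one per grayscale pixel:

// Each image should yield one 2500-element vector (50 x 50 pixels)
println(vectors.first.size)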
import org.apache.spark.mllib.feature.StandardScaler

// Center each pixel dimension (withMean = true) but leave the variance
// unscaled (withStd = false) -- pixels are already on a common scale.
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
val scaledVectors = vectors.map(v => scaler.transform(v))
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val matrix = new RowMatrix(scaledVectors)
val K = 10
val pc = matrix.computePrincipalComponents(K)
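The post stops after computing the components, but the usual next step in PCA is to project the centered image vectors into the K-dimensional subspace. A minimal sketch using MLlib's RowMatrix.multiply (the variable name projected is my own):

// Multiply the n x 2500 row matrix by the 2500 x 10 component matrix,
// giving each image a compact 10-dimensional representation.
val projected = matrix.multiply(pc)
println(projected.numRows, projected.numCols)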
If computePrincipalComponents fails with java.lang.OutOfMemoryError: Java heap space, the driver needs a larger heap; this thread covers the fix: http://stackoverflow.com/questions/21138751/spark-java-lang-outofmemoryerror-java-heap-space

In older Spark releases the heap size came from SPARK_MEM in the launch scripts:
SPARK_MEM=${SPARK_MEM:-512m}
export SPARK_MEM
# Set JAVA_OPTS to be able to load native libraries and to set heap size
JAVA_OPTS="$OUR_JAVA_OPTS"
JAVA_OPTS="$JAVA_OPTS -Djava.library.path=$SPARK_LIBRARY_PATH"
JAVA_OPTS="$JAVA_OPTS -Xms$SPARK_MEM -Xmx$SPARK_MEM"
In newer releases SPARK_MEM is deprecated; set the driver heap in conf/spark-defaults.conf instead:

spark.driver.memory 4000m
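If you launch jobs with spark-submit, the same setting can be passed on the command line instead of editing the config file; a sketch, where the class and jar names are placeholders:

# Equivalent to spark.driver.memory in spark-defaults.conf
spark-submit --driver-memory 4g --class MyApp my-app.jar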
val rows = pc.numRows
val cols = pc.numCols
println(rows, cols)
import breeze.linalg.DenseMatrix
import breeze.linalg.csvwrite

val pcBreeze = new DenseMatrix(rows, cols, pc.toArray)
csvwrite(new File("/tmp/pc.csv"), pcBreeze)
%pyspark
import numpy as np

pcs = np.loadtxt("/tmp/pc.csv", delimiter=",")
print(pcs.shape)
%pyspark
import numpy as np
import matplotlib.pyplot as plt

def plot_gallery(images, h, w, n_row=2, n_col=5):
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[:, i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title("Eigenface %d" % (i + 1), size=12)
        plt.xticks(())
        plt.yticks(())
%pyspark
plot_gallery(pcs, 50, 50)
plt.show()
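The choice of K = 10 is arbitrary here. One way to judge how many components are worth keeping (an addition, not part of the original post) is to inspect the singular values of the centered matrix via MLlib's computeSVD; larger singular values mean the corresponding component explains more of the variance:

val svd = matrix.computeSVD(10, computeU = false)
// Singular values arrive in descending order; a sharp drop-off suggests
// the earlier components capture most of the structure in the faces.
println(svd.s.toArray.mkString(", "))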
Original article: http://www.cnblogs.com/5211314jackrose/p/6078734.html