标签:lan linear ref color dataset count tail readlines 改进
书上没有给出逻辑回归的具体推导,直接就上了代码,这很不好!
可以参考 Andrew Ng(吴恩达)的课程,或者看这篇博文:http://blog.csdn.net/wlmnzf/article/details/72855610?utm_source=itdadao
过程还是比较浅显易懂的,所以就没怎么加注释了。
# -*- coding: utf-8 -*-
"""Logistic regression trained by batch and stochastic gradient ascent.

Contains loaders for the two tab/whitespace separated data sets used by the
script ('testSet.txt' and the horse-colic files), three trainers, a plotting
helper and a small evaluation harness.
"""

import numpy as np
# Kept so that every name the original star import exposed is still available
# to code that imports from this module; the functions below use ``np.``.
from numpy import *  # noqa: F401,F403


def loadDataSet(filename='testSet.txt'):
    """Read a two-feature data set.

    Each non-blank line holds ``x1 x2 label`` separated by whitespace.

    Args:
        filename: path of the file to read (defaults to 'testSet.txt').

    Returns:
        (dataMat, labelMat): ``dataMat`` is a list of ``[1.0, x1, x2]`` rows
        (the leading 1.0 is the constant term multiplied by the first
        weight), ``labelMat`` is the list of integer labels.
    """
    dataMat = []
    labelMat = []
    with open(filename) as fr:
        for line in fr:
            lineArr = line.split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^-inX), element-wise."""
    return 1.0 / (1 + np.exp(-inX))


def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Args:
        dataMatIn: m x n sequence of feature rows.
        classLabels: sequence of m labels (0 or 1).
        alpha: step size.
        maxCycles: number of full-batch updates.

    Returns:
        An n x 1 ``numpy.matrix`` of weights.
    """
    # asmatrix is used instead of the ``mat`` alias, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(dataMatIn)
    labelMat = np.asmatrix(classLabels).transpose()
    m, n = np.shape(dataMatrix)
    weights = np.asmatrix(np.ones((n, 1)))
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        # m x 1 column vector of residuals (label minus predicted probability).
        error = labelMat - h
        # X^T * error is the gradient of the log-likelihood w.r.t. the weights.
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def stocGradAscent0(dataMatrix, classLabels):
    """Fit weights with one pass of stochastic gradient ascent.

    Samples are visited once, in order, with a fixed step size of 0.01.

    Args:
        dataMatrix: m x n array-like of feature rows.
        classLabels: sequence of m labels (0 or 1).

    Returns:
        A 1-D ``numpy.ndarray`` of n weights.
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    m, n = np.shape(dataMatrix)
    alpha = 0.01
    weights = np.ones(n)
    for i in range(m):
        h = sigmoid(np.sum(dataMatrix[i] * weights))
        # Scalar residual for this single sample.
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights


def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit weights with the improved stochastic gradient ascent.

    Every pass visits each sample exactly once in a fresh random order, and
    the step size decays as ``4 / (1 + pass + step) + 0.01`` so that it
    shrinks over time but never reaches zero.

    Args:
        dataMatrix: m x n array-like of feature rows.
        classLabels: sequence of m labels (0 or 1).
        numIter: number of passes over the data.

    Returns:
        A 1-D ``numpy.ndarray`` of n weights.
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        # A permutation gives true sampling without replacement. The earlier
        # code drew a position in a shrinking index list but then used that
        # position (not the stored index) to pick the sample, and deleting
        # from ``range(m)`` fails on Python 3.
        for i, sampleIndex in enumerate(np.random.permutation(m)):
            alpha = 4 / (1.0 + j + i) + 0.01
            h = sigmoid(np.sum(dataMatrix[sampleIndex] * weights))
            # Scalar residual for this single sample.
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
    return weights


def plotBestFit(weights):
    """Scatter-plot the 'testSet.txt' samples and draw the decision boundary.

    Args:
        weights: three weights ``[w0, w1, w2]``; a flat array or the n x 1
            matrix returned by ``gradAscent`` are both accepted.
    """
    import matplotlib.pyplot as plt

    # Flatten so that matrix-valued weights behave like a plain vector below.
    weights = np.asarray(weights, dtype=float).ravel()
    dataMat, labelMat = loadDataSet()
    xcord1 = [row[1] for row, label in zip(dataMat, labelMat) if int(label) == 1]
    ycord1 = [row[2] for row, label in zip(dataMat, labelMat) if int(label) == 1]
    xcord2 = [row[1] for row, label in zip(dataMat, labelMat) if int(label) != 1]
    ycord2 = [row[2] for row, label in zip(dataMat, labelMat) if int(label) != 1]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, c='red', s=30, marker='s')  # 's' = square marker
    ax.scatter(xcord2, ycord2, c='green', s=30)
    x = np.arange(-3, 3, 0.1)
    # Boundary is where w0 + w1*x1 + w2*x2 == 0, solved for x2.
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def classifyVector(inX, weights):
    """Return 1.0 if the predicted probability exceeds 0.5, otherwise 0.0."""
    prob = sigmoid(np.sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0


def _loadColicFile(filename, numFeatures=21):
    """Read tab-separated rows of ``numFeatures`` floats followed by a label."""
    features = []
    labels = []
    with open(filename) as fr:
        for line in fr:
            if not line.strip():
                continue
            currLine = line.strip().split('\t')
            features.append([float(value) for value in currLine[:numFeatures]])
            # Parse via float so labels written as "1.000000" are accepted too.
            labels.append(float(currLine[numFeatures]))
    return features, labels


def colicTest(trainFile='horseColicTraining.txt',
              testFile='horseColicTest.txt', numIter=500):
    """Train on the horse-colic training file and score the test file.

    Args:
        trainFile: path of the training data (21 features + label per line).
        testFile: path of the test data, same layout.
        numIter: number of passes given to ``stocGradAscent1``.

    Returns:
        The fraction of misclassified test rows; it is also printed.
    """
    trainingSet, trainingLabels = _loadColicFile(trainFile)
    testSet, testLabels = _loadColicFile(testFile)
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, numIter)
    errorCount = 0
    for lineArr, label in zip(testSet, testLabels):
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(label):
            errorCount += 1
    errorRate = float(errorCount) / len(testSet)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


def multiTest(numTests=10):
    """Run ``colicTest`` ``numTests`` times and print the mean error rate."""
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / float(numTests)))


if __name__ == '__main__':
    multiTest()
标签:lan linear ref color dataset count tail readlines 改进
原文地址:http://www.cnblogs.com/DianeSoHungry/p/7083007.html