Tags: machine learning, python, theano
This post uses Python's Theano library to implement Logistic Regression for binary classification. The dataset used can be downloaded here.
Logistic Regression applies a nonlinear function on top of a multivariate linear function; the nonlinearity most commonly used is the sigmoid. The output after the sigmoid is interpreted as the probability that the example belongs to class 1, so the parameters to learn are the linear weights and the intercept (bias).
h(x) = wx + b
g(x) = 1 / ( 1 + exp(-h(x)) ) = 1 / ( 1 + exp( -wx-b ) )
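As a quick illustration of the two formulas above, here is a minimal numpy sketch (the feature values, weights, and bias below are made up for illustration and are not part of the post's code):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# made-up example: two features, arbitrary weights w and bias b
w = np.array([0.5, -1.2])
b = 0.3
x = np.array([1.0, 2.0])

h = np.dot(w, x) + b     # linear score h(x)
g = sigmoid(h)           # probability that the class is 1
print g                  # a value strictly between 0 and 1 (about 0.168 here)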
The probability that the class is 1 can then be written as:
p(y=1 | x; w, b) = g(x)
and the probability of a single labeled example (x, y) is:
p(y | x; w, b) = g(x)^y (1 - g(x))^(1-y)
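Assuming the training set consists of m independent examples (x_i, y_i) (standard notation; the original post does not write this product out), the likelihood of the whole training set is:

L(w, b) = Π_i g(x_i)^y_i (1 - g(x_i))^(1-y_i),  i = 1..m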
The training objective is therefore to maximize the likelihood of the observed data: multiplying the per-example probabilities above gives the likelihood of the training data. Because a long product is awkward to compute and numerically unstable, one usually works with the log of the likelihood. For a single example, taking the log gives:
log(p) = ylog(g(x)) + (1-y)log(1-g(x))
This looks a lot like the cross-entropy. Summing it over the training data gives the final log likelihood; negating it gives the negative log likelihood, and training amounts to finding the parameters that minimize this negative log likelihood. The usual method is gradient descent.
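For completeness, differentiating the single-example negative log likelihood above gives the standard logistic-regression gradients that gradient descent uses (these formulas are not in the original post, but follow from the chain rule and the fact that the sigmoid satisfies g' = g(1 - g)):

d/dw [ -log(p) ] = (g(x) - y) x
d/db [ -log(p) ] = g(x) - y

In the Theano code below these derivatives are not written by hand; T.grad derives them automatically from the symbolic cost.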
Below is a binary-classification Logistic Regression implemented with Python and Theano; it reports the error rate on the training data. The training data used in the code can be downloaded here.
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 16 21:37:43 2014
@author: BrightHush
Example for Logistic Regression
"""
import time

import numpy
import theano
import theano.tensor as T

rng = numpy.random


class LogisticRegression(object):
    def __init__(self, input, n_in):
        # weight vector and bias, stored as shared variables so the
        # gradient updates can modify them in place
        self.w = theano.shared(
            value=rng.randn(n_in),
            name='w',
            borrow=True)
        self.b = theano.shared(value=.10, name='b')

        # sigmoid of the linear score: probability that y = 1
        self.p_given_x = 1 / (1 + T.exp(-T.dot(input, self.w) - self.b))
        # predicted label: 1 if that probability is above 0.5
        self.y_given_x = self.p_given_x > 0.5

        self.params = [self.w, self.b]

    def negative_log_likelihood(self, y):
        # mean per-example negative log likelihood plus an L2 penalty on w
        ll = -y * T.log(self.p_given_x) - (1 - y) * T.log(1 - self.p_given_x)
        cost = ll.mean() + 0.01 * (self.w ** 2).sum()
        return cost

    def errors(self, y):
        # fraction of examples whose predicted label differs from y
        return T.mean(T.neq(self.y_given_x, y))


def generate_data():
    rng = numpy.random
    N = 1000
    feats = 5
    # random toy data; it is immediately replaced by the titanic data
    # loaded in read_data() below
    D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
    x = D[0]
    y = D[1]
    x, y = read_data()

    # put the data into shared variables so minibatches can be sliced via givens
    x_shared = theano.shared(numpy.asarray(x,
                                           dtype=theano.config.floatX),
                             borrow=True)
    y_shared = theano.shared(numpy.asarray(y,
                                           dtype=theano.config.floatX),
                             borrow=True)
    return x_shared, T.cast(y_shared, 'int32')


def sgd_optimization(learning_rate=0.13, n_epochs=1000, batch_size=100):
    train_x, train_y = generate_data()
    n_batches = train_x.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    lr = LogisticRegression(x, train_x.get_value().shape[1])
    cost = lr.negative_log_likelihood(y)

    print 'compile function test_model...'
    test_model = theano.function(inputs=[index],
                                 outputs=lr.errors(y),
                                 givens={
                                     x: train_x[index * batch_size: (index + 1) * batch_size],
                                     y: train_y[index * batch_size: (index + 1) * batch_size]
                                 })

    # symbolic gradients of the cost with respect to w and b
    g_w = T.grad(cost=cost, wrt=lr.w)
    g_b = T.grad(cost=cost, wrt=lr.b)
    updates = [(lr.w, lr.w - learning_rate * g_w),
               (lr.b, lr.b - learning_rate * g_b)]

    print 'compile function train_model...'
    train_model = theano.function(inputs=[index],
                                  outputs=cost,
                                  updates=updates,
                                  givens={
                                      x: train_x[index * batch_size: (index + 1) * batch_size],
                                      y: train_y[index * batch_size: (index + 1) * batch_size]
                                  })

    best_train_error = numpy.Inf
    start_time = time.clock()
    for epoch in xrange(n_epochs):
        for minibatch_index in xrange(n_batches):
            batch_cost = train_model(minibatch_index)

        # evaluate the error rate on the whole training set after each epoch
        train_errors = [test_model(i) for i in xrange(n_batches)]
        train_error = numpy.mean(train_errors)
        if best_train_error > train_error:
            best_train_error = train_error
            print 'epoch %d, best_train_error %lf, train_error %lf' % (epoch, best_train_error, train_error)
        #print 'iterator %d %lf' %(epoch*n_batches + minibatch_index+1, batch_cost)
    end_time = time.clock()
    print 'cost %d' % (end_time - start_time)


def read_data():
    print 'load data...'
    data = numpy.loadtxt('.\\titanic.dat', delimiter=',', skiprows=8)
    x = []
    y = []
    for i in xrange(data.shape[0]):
        x.append(data[i, : data.shape[1] - 1])
        # labels in the file are -1/1; map them to 0/1
        if data[i, -1] == -1.0:
            y.append(0)
        else:
            y.append(1)
    x = numpy.array(x)
    y = numpy.array(y)
    print '%d examples, %d columns every row' % (data.shape[0], data.shape[1])

    # min-max normalize the features to [0, 1]
    feature_min = x.min(0)
    feature_max = x.max(0)
    x = x - numpy.array(feature_min)
    x = x / numpy.array(feature_max - feature_min)
    print x.min(0), x.max(0)
    return numpy.array(x), numpy.array(y)


if __name__ == '__main__':
    sgd_optimization()
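The script above only reports the training error rate. If one also wanted the predicted 0/1 labels, a prediction function could be compiled from the same symbolic graph inside sgd_optimization after the training loop (a sketch, not part of the original code):

# sketch only: place inside sgd_optimization() after the training loop
predict_model = theano.function(inputs=[x], outputs=lr.y_given_x)
predictions = predict_model(train_x.get_value(borrow=True))
print predictions[:10]   # predicted 0/1 labels for the first ten examples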
Original post: http://blog.csdn.net/geniusluzh/article/details/41898929