关于最大熵模型的介绍请看:http://www.cnblogs.com/hexinuaa/p/3353479.html
下面是 GIS 训练算法的 Python 实现,代码不到 100 行。
from collections import defaultdict
import math
class MaxEnt(object):
    """Maximum-entropy classifier trained with the GIS algorithm.

    Training data is a whitespace-separated text file with one record per
    line: the first column is the label, the remaining columns are the
    features observed together with that label.  Each distinct
    ``(label, feature)`` pair seen in the data is one binary feature
    function with its own weight.

    Typical use::

        model = MaxEnt()
        model.load_data('data/gameLocation.dat')
        model.train()
        model.predict('Sunny')   # -> [(prob, label), ...] best first
    """

    def __init__(self):
        # (label, feature) -> number of training records containing the pair.
        # Once _initparams() has run, the values are feature ids (indexes
        # into self.w / self.ep_) instead of counts.
        self.feats = defaultdict(int)
        # Raw records: [label, feature, feature, ...]
        self.trainset = []
        # Every label seen in the training data.
        self.labels = set()

    def load_data(self, file):
        """Read training records from the text file at path *file*.

        Lines with fewer than two columns are skipped.  May be called
        several times to accumulate data; call ``train()`` afterwards.
        """
        # The context manager closes the file even if parsing fails.
        with open(file) as handle:
            for line in handle:
                fields = line.strip().split()
                # Need at least a label and one feature.
                if len(fields) < 2:
                    continue
                # The first column is the label.
                label = fields[0]
                self.labels.add(label)
                # A feature is counted once per record, hence the set().
                for f in set(fields[1:]):
                    self.feats[(label, f)] += 1
                self.trainset.append(fields)

    def _initparams(self):
        """Prepare the training state: M, empirical expectations, ids, weights.

        Raises:
            ValueError: if no training record has been loaded.
        """
        if not self.trainset:
            raise ValueError("no training data loaded; call load_data() first")
        self.size = len(self.trainset)
        # M param for the GIS training algorithm: the largest number of
        # feature columns in any record.
        self.M = max(len(record) - 1 for record in self.trainset)
        # Rebuild the counts from the records instead of reading them from
        # self.feats: after a previous training run self.feats holds ids,
        # not counts, and reusing them would corrupt the expectations.
        # NOTE: a feature is counted once per record here, while Ep() and
        # probwgt() walk the raw feature list, so a feature repeated inside
        # one record is weighted once per occurrence there.
        counts = defaultdict(int)
        for record in self.trainset:
            label = record[0]
            for f in set(record[1:]):
                counts[(label, f)] += 1
        self.ep_ = [0.0] * len(counts)
        self.feats.clear()
        for idx, (feat, count) in enumerate(counts.items()):
            # Feature expectation on the empirical distribution.
            self.ep_[idx] = float(count) / float(self.size)
            # Each feature function corresponds to one id.
            self.feats[feat] = idx
        # Initial weight for each feature.
        self.w = [0.0] * len(self.feats)
        # Independent copy so the two lists never alias each other.
        self.lastw = self.w[:]

    def probwgt(self, features, label):
        """Return the unnormalised score exp(sum of weights) for *label*.

        Only ``(label, feature)`` pairs known from training contribute.
        """
        wgt = 0.0
        for f in features:
            # .get() does not insert missing keys into the defaultdict.
            idx = self.feats.get((label, f))
            if idx is not None:
                wgt += self.w[idx]
        return math.exp(wgt)

    def Ep(self):
        """Calculate the feature expectation on the model distribution.

        Returns a list indexed by feature id.
        """
        ep = [0.0] * len(self.feats)
        for record in self.trainset:
            features = record[1:]
            # p(y|x) for every label y.
            prob = self.calprob(features)
            for f in features:
                for p, label in prob:
                    # Only features seen in the training data have an id.
                    idx = self.feats.get((label, f))
                    if idx is not None:
                        # sum(1/N * f(y,x) * p(y|x)), with p(x) = 1/N
                        ep[idx] += p * (1.0 / self.size)
        return ep

    def _convergence(self, lastw, w):
        """Return True when no weight moved by 0.01 or more."""
        return not any(abs(w1 - w2) >= 0.01 for w1, w2 in zip(lastw, w))

    def train(self, max_iter=1000, verbose=True):
        """Fit the weights with GIS.

        Args:
            max_iter: upper bound on the number of iterations.
            verbose: when true (the default), print the iteration number
                and the current weights on every iteration.

        Raises:
            ValueError: if no training data has been loaded.
        """
        self._initparams()
        for iteration in range(max_iter):
            if verbose:
                print('iter %d ...' % (iteration + 1))
            # Feature expectation on the model distribution.
            self.ep = self.Ep()
            self.lastw = self.w[:]
            for idx in range(len(self.w)):
                delta = 1.0 / self.M * math.log(self.ep_[idx] / self.ep[idx])
                # Update w.
                self.w[idx] += delta
            if verbose:
                print(self.w)
            # Stop once the weights have converged.
            if self._convergence(self.lastw, self.w):
                break

    def calprob(self, features):
        """Return ``[(p(label | features), label), ...]`` for every label."""
        wgts = [(self.probwgt(features, l), l) for l in self.labels]
        Z = sum(w for w, l in wgts)
        return [(w / Z, l) for w, l in wgts]

    def predict(self, input):
        """Classify a whitespace-separated feature string.

        Returns a list of ``(probability, label)`` tuples sorted in
        descending order, so the first entry is the most likely label.
        """
        features = input.strip().split()
        prob = self.calprob(features)
        prob.sort(reverse=True)
        return prob
运行:
prepare training data:
Outdoor Sunny Happy
Outdoor Sunny Happy Dry
Outdoor Sunny Happy Humid
Outdoor Sunny Sad Dry
Outdoor Sunny Sad Humid
Outdoor Cloudy Happy Humid
Outdoor Cloudy Happy Humid
Outdoor Cloudy Sad Humid
Outdoor Cloudy Sad Humid
Indoor Rainy Happy Humid
Indoor Rainy Happy Dry
Indoor Rainy Sad Dry
Indoor Rainy Sad Humid
Indoor Cloudy Sad Humid
Indoor Cloudy Sad Humid
open ipython to run the following commands:
In [11]: import maxent
In [12]: model = maxent.MaxEnt()
In [13]: model.load_data('data/gameLocation.dat')
In [14]: model.train()
In [11]: import maxent
In [12]: model = maxent.MaxEnt()
In [13]: model.load_data('data/gameLocation.dat')
In [14]: model.train()
iter 1 ...
iter 2 ...
iter 3 ...
iter 4 ...
iter 5 ...
iter 6 ...
iter 7 ...
iter 8 ...
iter 9 ...
iter 10 ...
iter 11 ...
iter 12 ...
iter 13 ...
iter 14 ...
iter 15 ...
iter 16 ...
iter 17 ...
iter 18 ...
iter 19 ...
iter 20 ...
iter 21 ...
iter 22 ...
iter 23 ...
iter 24 ...
iter 25 ...
iter 26 ...
iter 27 ...
iter 28 ...
iter 29 ...
iter 30 ...
iter 31 ...
iter 32 ...
iter 33 ...
iter 34 ...
iter 35 ...
iter 36 ...
iter 37 ...
iter 38 ...
iter 39 ...
iter 40 ...
iter 41 ...
iter 42 ...
iter 43 ...
iter 44 ...
iter 45 ...
iter 46 ...
iter 47 ...
iter 48 ...
iter 49 ...
iter 50 ...
iter 51 ...
iter 52 ...
iter 53 ...
iter 54 ...
iter 55 ...
iter 56 ...
iter 57 ...
iter 58 ...
iter 59 ...
iter 60 ...
iter 61 ...
iter 62 ...
iter 63 ...
iter 64 ...
iter 65 ...
iter 66 ...
iter 67 ...
iter 68 ...
iter 69 ...
iter 70 ...
iter 71 ...
iter 72 ...
iter 73 ...
iter 74 ...
iter 75 ...
iter 76 ...
iter 77 ...
iter 78 ...
iter 79 ...
iter 80 ...
iter 81 ...
iter 82 ...
iter 83 ...
iter 84 ...
iter 85 ...
iter 86 ...
iter 87 ...
iter 88 ...
iter 89 ...
iter 90 ...
iter 91 ...
iter 92 ...
iter 93 ...
iter 94 ...
iter 95 ...
iter 96 ...
iter 97 ...
iter 98 ...
iter 99 ...
iter 100 ...
iter 101 ...
iter 102 ...
iter 103 ...
iter 104 ...
iter 105 ...
iter 106 ...
iter 107 ...
iter 108 ...
iter 109 ...
iter 110 ...
iter 111 ...
iter 112 ...
iter 113 ...
iter 114 ...
iter 115 ...
iter 116 ...
iter 117 ...
iter 118 ...
iter 119 ...
iter 120 ...
iter 121 ...
iter 122 ...
iter 123 ...
iter 124 ...
iter 125 ...
iter 126 ...
iter 127 ...
iter 128 ...
iter 129 ...
iter 130 ...
iter 131 ...
iter 132 ...
iter 133 ...
iter 134 ...
iter 135 ...
iter 136 ...
iter 137 ...
iter 138 ...
iter 139 ...
iter 140 ...
iter 141 ...
iter 142 ...
iter 143 ...
iter 144 ...
In [16]: model.predict('Sunny')
Out[16]: [(0.9763203118841158, 'Outdoor'), (0.02367968811588421, 'Indoor')]
In [18]: model.predict('Cloudy')
Out[18]: [(0.7136730549489295, 'Outdoor'), (0.28632694505107054, 'Indoor')]
原文地址:http://blog.csdn.net/hexinuaa/article/details/24711675