标签:
user-knn
"""User-based k-nearest-neighbour collaborative filtering.

Layout of the user-item matrix (user and item ids start at 1):

    [user, item]  rating given by ``user`` to ``item`` (0 means "not rated")
    [user, 0]     mean rating of ``user``
    [0, item]     mean rating of ``item``
"""
import csv
import math

import numpy as np


def toInt(arr):
    """Convert a table of numeric strings (or numbers) to a 2-D integer array.

    The result has an integer dtype so that its values can be used directly
    as matrix indices. 1-D input is treated as a single row.
    """
    print('toInt() startting...')
    source = np.atleast_2d(np.asarray(arr))
    converted = np.zeros(source.shape, dtype=np.int64)
    for position, value in np.ndenumerate(source):
        converted[position] = int(value)
    print('toInt() ending...')
    return converted


def _loadRatings(path, label, delimiter):
    """Read every non-empty row of a delimited text file and convert it with toInt."""
    print(label + ' startting...')
    with open(path, 'r', newline='') as handle:
        rows = [row for row in csv.reader(handle, delimiter=delimiter) if row]
    print(label + ' ending...')
    return toInt(rows)


def loadTrainData(path, delimiter=','):
    """Load the training ratings from ``path`` as an integer array.

    Each row is expected to start with ``user, item, rating``.
    NOTE(review): the file names used below look like MovieLens splits, which
    are normally tab-separated; pass ``delimiter='\\t'`` if the files were not
    converted to CSV beforehand.
    """
    return _loadRatings(path, 'loadTrainData', delimiter)


def loadTestData(path, delimiter=','):
    """Load the test ratings from ``path`` as an integer array (see loadTrainData)."""
    return _loadRatings(path, 'loadTestData', delimiter)


def fillUIMatrix(uimatrix, train_data):
    """Write every ``(user, item, rating)`` row of ``train_data`` into ``uimatrix`` in place."""
    print('fillUIMatrix startting...')
    for row in np.atleast_2d(np.asarray(train_data)):
        # Cast explicitly: ids may arrive as floats, which numpy rejects as indices.
        uimatrix[int(row[0]), int(row[1])] = row[2]
    print('fillUIMatrix ending...')


def calAverageRating(uimatrix):
    """Store per-user means in column 0 and per-item means in row 0, in place.

    Only non-zero entries count as ratings. A user or an item without any
    rating gets a mean of 0 instead of a division by zero.
    """
    print('calAverageRating starting...')
    data = np.asarray(uimatrix)
    ratings = data[1:, 1:]
    rated = ratings != 0

    user_counts = rated.sum(axis=1)
    item_counts = rated.sum(axis=0)
    user_sums = ratings.sum(axis=1)
    item_sums = ratings.sum(axis=0)

    data[1:, 0] = np.divide(user_sums, user_counts,
                            out=np.zeros(len(user_counts), dtype=float),
                            where=user_counts > 0)
    data[0, 1:] = np.divide(item_sums, item_counts,
                            out=np.zeros(len(item_counts), dtype=float),
                            where=item_counts > 0)
    print('calAverageRating ending...')


def calPerson(l1, l2, rating1, rating2):
    """Return the absolute Pearson correlation of two equally long rating lists.

    ``rating1`` and ``rating2`` are the means subtracted from ``l1`` and ``l2``.
    When either list has no variance (or both are empty) the correlation is
    undefined and 0.0 is returned.
    """
    print('calPerson startting...')
    centred1 = np.asarray(l1, dtype=float) - rating1
    centred2 = np.asarray(l2, dtype=float) - rating2
    denominator = (math.sqrt(float(np.dot(centred1, centred1)))
                   * math.sqrt(float(np.dot(centred2, centred2))))
    if denominator == 0:
        result = 0.0
    else:
        result = abs(float(np.dot(centred1, centred2)) / denominator)
    print('calPerson ending...')
    return result


def rSort(r_list, index_list):
    """Sort ``r_list`` in descending order in place, permuting ``index_list`` alongside.

    The sort is stable, so entries with equal similarity keep their order.
    """
    print('rSort startting...')
    order = sorted(range(len(r_list)), key=r_list.__getitem__, reverse=True)
    r_list[:] = [r_list[k] for k in order]
    index_list[:] = [index_list[k] for k in order]
    for i in range(len(r_list)):
        print(i, ':', r_list[i])
    print('rSort ending...')


def calSim(uimatrix, index):
    """Compute the similarity between user ``index`` and every other user.

    Only items rated by both users are compared; users sharing no item with
    ``index`` are left out.

    Returns:
        ``(r_list, index_list)``: similarities in descending order and the
        matching user ids.
    """
    print('calSim startting...')
    data = np.asarray(uimatrix)
    target = data[index, 1:]
    target_rated = target != 0

    r_list = []      # similarities
    index_list = []  # user id belonging to each similarity
    for other in range(1, data.shape[0]):
        if other == index:
            continue
        candidate = data[other, 1:]
        common = target_rated & (candidate != 0)
        if not common.any():
            continue
        l1 = target[common]
        l2 = candidate[common]
        r = calPerson(l1, l2, l1.mean(), l2.mean())
        if math.isnan(r):
            r = 0.0
        r_list.append(r)
        index_list.append(other)
    rSort(r_list, index_list)
    print('calSim ending...')
    return r_list, index_list


def predictRatings(uimatrix, users, items, select_top):
    """Fill every unrated cell of ``uimatrix`` with a neighbourhood prediction.

    For each user the ``select_top`` most similar users are taken; the
    prediction is the user's mean plus the similarity-weighted average of the
    neighbours' deviations from their own means. When no selected neighbour
    rated the item, the user's mean is used.

    Similarities and neighbour ratings are read from a snapshot taken before
    any prediction is written, so estimates never feed into later estimates.
    ``uimatrix`` is modified in place and also returned.
    """
    known = np.array(uimatrix, dtype=float)
    for i in range(1, users + 1):
        r_list, index_list = calSim(known, i)
        # A user may have fewer than select_top comparable neighbours.
        top = min(select_top, len(r_list))
        weights = np.asarray(r_list[:top], dtype=float)
        neighbours = np.asarray(index_list[:top], dtype=int)

        neighbour_ratings = known[neighbours, 1:items + 1]
        rated = neighbour_ratings != 0
        deviations = np.where(
            rated, neighbour_ratings - known[neighbours, 0][:, np.newaxis], 0.0)
        weighted_deviation = weights.dot(deviations)
        weight_sum = weights.dot(rated.astype(float))

        for j in range(1, items + 1):
            if known[i, j] == 0:
                tmp1 = weighted_deviation[j - 1]
                tmp2 = weight_sum[j - 1]
                print(j, tmp1, tmp2)
                if tmp2 == 0:
                    uimatrix[i, j] = known[i, 0]
                else:
                    uimatrix[i, j] = known[i, 0] + tmp1 / tmp2
    return uimatrix


def calRMSE(uimatrix, test_data, users):
    """Return the RMSE of the rounded predictions in ``uimatrix`` on ``test_data``.

    Only test rows whose user id lies in ``1..users`` are scored, in ascending
    user order. As a side effect each scored cell of ``uimatrix`` is replaced
    by its rounded prediction (the user's mean when the cell is still 0).

    Raises:
        ValueError: if no test row could be scored.
    """
    print('calRMSE startting...')
    test_data = np.atleast_2d(np.asarray(test_data))
    if test_data.size == 0:
        order = []
    else:
        order = np.argsort(test_data[:, 0], kind='stable')

    squared_error = 0.0
    scored = 0
    for row in order:
        user = int(test_data[row, 0])
        if not 1 <= user <= users:
            continue
        item = int(test_data[row, 1])
        if uimatrix[user, item] == 0.0:
            uimatrix[user, item] = uimatrix[user, 0]
        uimatrix[user, item] = np.round(uimatrix[user, item])
        squared_error += (test_data[row, 2] - uimatrix[user, item]) ** 2
        scored += 1
        print(test_data[row, 1], ' real rating:', test_data[row, 2],
              ' predict:', uimatrix[user, item])
    print('calRMSE ending...')
    if scored == 0:
        raise ValueError('test_data contains no rating for users 1..%d' % users)
    return np.sqrt(squared_error / scored)


select_top = 30
users = 943
items = 1682
train_path = 'C:\\Users\\think\\Desktop\\data\\u2.base'
test_path = 'C:\\Users\\think\\Desktop\\data\\u2.test'


def main():
    """Train on ``train_path``, predict all missing ratings and print the test RMSE."""
    user_item_matrix = np.zeros((users + 1, items + 1))
    train_data = loadTrainData(train_path)
    test_data = loadTestData(test_path)

    fillUIMatrix(user_item_matrix, train_data)
    calAverageRating(user_item_matrix)
    predictRatings(user_item_matrix, users, items, select_top)

    RMSE = calRMSE(user_item_matrix, test_data, users)
    print(RMSE)


if __name__ == '__main__':
    main()
lfm1
"""Latent factor model (matrix factorization) trained by batch gradient descent."""
import csv
import time

import numpy as np


def RMSE(estimation, truth):
    """Return the root of the summed squared error divided by ``len(estimation) - 1``.

    NOTE: the divisor is ``n - 1`` rather than ``n``; this is kept from the
    original implementation so reported values stay comparable.
    """
    num = len(estimation)
    sse = np.sum(np.square(truth - estimation))
    return np.sqrt(np.divide(sse, num - 1.0))


class matrixFactorization():
    """Rating predictor of the form ``mean + user_features . item_features``.

    Ids in the data are used directly as row indices of the feature matrices,
    so ``num_user`` / ``num_item`` must be larger than the largest id.

    Keyword parameters (``**params``):
        batch_size: divisor applied to the learning rate (default 1000000).
        epsilon: learning rate numerator (default 100.0).
        lam: L2 regularisation weight (default 0.00001).
        max_rating / min_rating: optional bounds applied to predictions.
    """

    def __init__(self, num_user, num_item, num_feature, train_data, test_data, **params):
        self._num_user = num_user
        self._num_item = num_item
        self._num_feature = num_feature
        self._train_data = np.asarray(train_data)
        self._test_data = np.asarray(test_data)

        self.batch_size = int(params.get('batch_size', 1000000))

        self.epsilon = float(params.get('epsilon', 100.0))
        self.lam = float(params.get('lam', 0.00001))

        self.max_rating = params.get('max_rating')
        self.min_rating = params.get('min_rating')

        # Compare against None so that a bound of 0 is honoured as well.
        if self.max_rating is not None:
            self.max_rating = float(self.max_rating)
        if self.min_rating is not None:
            self.min_rating = float(self.min_rating)

        self._mean_rating = np.mean(self._train_data[:, 2])

        self._user_feature = 0.3 * np.random.rand(num_user, num_feature)
        self._item_feature = 0.3 * np.random.rand(num_item, num_feature)

        self.train_errors = []
        self.test_errors = []

    def estimate(self, iterations=50, converge=1e-4):
        """Run up to ``iterations`` gradient steps over the whole training set.

        After every step the train and test RMSE are appended to
        ``train_errors`` / ``test_errors`` and printed. Training stops early
        once the train RMSE changes by less than ``converge``.
        """
        data = self._train_data
        user_ids = data[:, 0].astype(int)
        item_ids = data[:, 1].astype(int)
        ratings = data[:, 2] - self._mean_rating
        step = self.epsilon / self.batch_size
        train_truth = self._train_data[:, 2].astype(float)
        test_truth = self._test_data[:, 2].astype(float)

        last_rmse = None
        for iteration in range(iterations):
            # compute gradient
            u_features = self._user_feature[user_ids, :]
            i_features = self._item_feature[item_ids, :]
            preds = np.sum(u_features * i_features, axis=1)
            errs = (preds - ratings)[:, np.newaxis]

            # d/dp of 0.5*(p.q - r)^2 + 0.5*lam*|p|^2 is err*q + lam*p
            # (and symmetrically for q).
            u_grads = errs * i_features + self.lam * u_features
            i_grads = errs * u_features + self.lam * i_features

            u_feature_grads = np.zeros((self._num_user, self._num_feature))
            i_feature_grads = np.zeros((self._num_item, self._num_feature))
            # add.at accumulates correctly when an id occurs in several rows.
            np.add.at(u_feature_grads, user_ids, u_grads)
            np.add.at(i_feature_grads, item_ids, i_grads)

            self._user_feature = self._user_feature - step * u_feature_grads
            self._item_feature = self._item_feature - step * i_feature_grads

            train_rmse = RMSE(self.predict(self._train_data), train_truth)
            test_rmse = RMSE(self.predict(self._test_data), test_truth)

            self.train_errors.append(train_rmse)
            self.test_errors.append(test_rmse)

            print('iterations: %3d, train RMSE: %.6f, test RMSE: %.6f'
                  % (iteration + 1, train_rmse, test_rmse))

            if last_rmse is not None and abs(train_rmse - last_rmse) < converge:
                break
            last_rmse = train_rmse

    def predict(self, data):
        """Return the predicted rating for every ``(user, item, ...)`` row of ``data``.

        Predictions are limited to ``[min_rating, max_rating]`` when those
        bounds were given.
        """
        data = np.asarray(data)
        u_features = self._user_feature[data[:, 0].astype(int), :]
        i_features = self._item_feature[data[:, 1].astype(int), :]
        preds = np.sum(u_features * i_features, axis=1) + self._mean_rating

        if self.max_rating is not None:
            preds[preds > self.max_rating] = self.max_rating
        if self.min_rating is not None:
            preds[preds < self.min_rating] = self.min_rating
        return preds


def toInt(arr):
    """Convert a table of numeric strings (or numbers) to a 2-D integer array.

    A 64-bit dtype is used: ids above 127 do not fit the 8-bit type used
    previously. 1-D input is treated as a single row.
    """
    print('toInt() startting...')
    source = np.atleast_2d(np.asarray(arr))
    converted = np.zeros(source.shape, dtype=np.int64)
    for position, value in np.ndenumerate(source):
        converted[position] = int(value)
    print('toInt() ending...')
    return converted


def _loadRatings(path, label, delimiter):
    """Read every non-empty row of a delimited text file and convert it with toInt."""
    print(label + ' startting...')
    with open(path, 'r', newline='') as handle:
        rows = [row for row in csv.reader(handle, delimiter=delimiter) if row]
    print(label + ' ending...')
    return toInt(rows)


def loadTrainData(path, delimiter=','):
    """Load the training ratings from ``path`` as an integer array.

    NOTE(review): the file names used below look like MovieLens splits, which
    are normally tab-separated; pass ``delimiter='\\t'`` if the files were not
    converted to CSV beforehand.
    """
    return _loadRatings(path, 'loadTrainData', delimiter)


def loadTestData(path, delimiter=','):
    """Load the test ratings from ``path`` as an integer array (see loadTrainData)."""
    return _loadRatings(path, 'loadTestData', delimiter)


train_path = 'C:\\Users\\think\\Desktop\\data\\u1.base'
test_path = 'C:\\Users\\think\\Desktop\\data\\u1.test'
num_feature = 15
max_iter = 20000
num_user = 943
num_item = 1682


def main():
    """Train the model on ``train_path`` and report the error on ``test_path``."""
    train_data = loadTrainData(train_path)
    test_data = loadTestData(test_path)
    # One spare row so that an id equal to num_user / num_item is a valid
    # index whether the ids in the files start at 0 or at 1.
    rec = matrixFactorization(num_user + 1, num_item + 1, num_feature,
                              train_data, test_data, max_rating=5, min_rating=1)
    rec.estimate(max_iter)


if __name__ == '__main__':
    main()
lfm2
"""Biased latent factor model (SVD-style) trained by stochastic gradient descent.

Written for Python 3: ``print`` is a function and ``/`` is true division.
"""
import csv

import numpy as np


class SVD_C:
    """Predict ratings as ``mean + item bias + user bias + qi . pu``."""

    def __init__(self, X, k=20):
        """Index the training rows ``X`` and initialise biases and factors.

        Args:
            X: rows whose first three columns are ``user, item, rating``.
            k: length of each latent factor vector.
        """
        self.X = np.array(X)
        self.k = k
        self.ave = np.mean(self.X[:, 2])
        print("the input data size is ", self.X.shape)
        self.bi = {}  # item bias
        self.bu = {}  # user bias
        self.qi = {}  # item factors, one (k, 1) array per item
        self.pu = {}  # user factors, one (k, 1) array per user
        self.movie_user = {}  # item -> {user: rating}
        self.user_movie = {}  # user -> {item: rating}
        scale = np.sqrt(self.k) / 10
        for uid, mid, rat in self.X[:, :3]:
            self.movie_user.setdefault(mid, {})[uid] = rat
            self.user_movie.setdefault(uid, {})[mid] = rat
            self.bi.setdefault(mid, 0)
            self.bu.setdefault(uid, 0)
            if mid not in self.qi:
                self.qi[mid] = np.random.random((self.k, 1)) * scale
            if uid not in self.pu:
                self.pu[uid] = np.random.random((self.k, 1)) * scale

    def pred(self, uid, mid):
        """Return the predicted rating of ``uid`` for ``mid``, limited to ``[1, 5]``.

        Unknown users and items are registered with zero bias and zero factors.
        """
        self.bi.setdefault(mid, 0)
        self.bu.setdefault(uid, 0)
        self.qi.setdefault(mid, np.zeros((self.k, 1)))
        self.pu.setdefault(uid, np.zeros((self.k, 1)))
        # Identity test: `array == None` compares elementwise and has no
        # single truth value.
        if self.qi[mid] is None:
            self.qi[mid] = np.zeros((self.k, 1))
        if self.pu[uid] is None:
            self.pu[uid] = np.zeros((self.k, 1))
        ans = self.ave + self.bi[mid] + self.bu[uid] + np.sum(self.qi[mid] * self.pu[uid])
        if ans > 5:
            return 5
        elif ans < 1:
            return 1
        return ans

    def train(self, steps=50, gamma=0.04, Lambda=0.15):
        """Run ``steps`` passes of SGD over the training rows in random order.

        Args:
            steps: number of passes over the data.
            gamma: learning rate; multiplied by 0.93 after every pass.
            Lambda: L2 regularisation weight.
        """
        rows = self.X.shape[0]
        for step in range(steps):
            print('the ', step, '-th step is running')
            rmse_sum = 0.0
            for i in np.random.permutation(rows):
                uid = self.X[i][0]
                mid = self.X[i][1]
                rat = self.X[i][2]
                eui = rat - self.pred(uid, mid)
                rmse_sum += eui ** 2
                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[mid] += gamma * (eui - Lambda * self.bi[mid])
                # Copy: the in-place update below would otherwise change the
                # item factors before the user update has used them.
                old_qi = self.qi[mid].copy()
                self.qi[mid] += gamma * (eui * self.pu[uid] - Lambda * self.qi[mid])
                self.pu[uid] += gamma * (eui * old_qi - Lambda * self.pu[uid])
            gamma = gamma * 0.93
            print("the rmse of this step on train data is ", np.sqrt(rmse_sum / rows))

    def test(self, test_X):
        """Predict every row of ``test_X``, print the RMSE and return the predictions.

        An empty ``test_X`` yields an empty list and prints nothing.
        """
        output = []
        sums = 0
        test_X = np.array(test_X)
        if test_X.shape[0] == 0:
            return output
        for row in test_X:
            pre = self.pred(row[0], row[1])
            output.append(pre)
            sums += (pre - row[2]) ** 2
        rmse = np.sqrt(sums / test_X.shape[0])
        print("the rmse on test data is ", rmse)
        return output


def toInt(arr):
    """Convert a table of numeric strings (or numbers) to a 2-D integer array.

    A 64-bit dtype is used: ids above 127 do not fit the 8-bit type used
    previously. 1-D input is treated as a single row.
    """
    print('toInt() startting...')
    source = np.atleast_2d(np.asarray(arr))
    converted = np.zeros(source.shape, dtype=np.int64)
    for position, value in np.ndenumerate(source):
        converted[position] = int(value)
    print('toInt() ending...')
    return converted


def _loadRatings(path, label, delimiter):
    """Read every non-empty row of a delimited text file and convert it with toInt."""
    print(label + ' startting...')
    with open(path, 'r', newline='') as handle:
        rows = [row for row in csv.reader(handle, delimiter=delimiter) if row]
    print(label + ' ending...')
    return toInt(rows)


def loadTrainData(path, delimiter=','):
    """Load the training ratings from ``path`` as an integer array.

    NOTE(review): the file names used below look like MovieLens splits, which
    are normally tab-separated; pass ``delimiter='\\t'`` if the files were not
    converted to CSV beforehand.
    """
    return _loadRatings(path, 'loadTrainData', delimiter)


def loadTestData(path, delimiter=','):
    """Load the test ratings from ``path`` as an integer array (see loadTrainData)."""
    return _loadRatings(path, 'loadTestData', delimiter)


train_path = 'C:\\Users\\think\\Desktop\\data\\u1.base'
test_path = 'C:\\Users\\think\\Desktop\\data\\u1.test'


def main():
    """Train on ``train_path`` and print the RMSE on ``test_path``."""
    train_data = loadTrainData(train_path)
    test_data = loadTestData(test_path)

    a = SVD_C(train_data, 30)
    a.train()
    a.test(test_data)


if __name__ == '__main__':
    main()
标签:
原文地址:http://www.cnblogs.com/JustForCS/p/5486974.html