标签:
#coding=utf-8 from math import sqrt from loadMovieLens import loadMovieLensTrain from loadMovieLens import loadMovieLensTest ### 计算pearson相关度 def sim_pearson(prefer, person1, person2): sim = {} #查找双方都评价过的项 for item in prefer[person1]: if item in prefer[person2]: sim[item] = 1 #将相同项添加到字典sim中 #元素个数 n = len(sim) if len(sim)==0: return -1 # 所有偏好之和 sum1 = sum([prefer[person1][item] for item in sim]) sum2 = sum([prefer[person2][item] for item in sim]) #求平方和 sum1Sq = sum( [pow(prefer[person1][item] ,2) for item in sim] ) sum2Sq = sum( [pow(prefer[person2][item] ,2) for item in sim] ) #求乘积之和 ∑XiYi sumMulti = sum([prefer[person1][item]*prefer[person2][item] for item in sim]) num1 = sumMulti - (sum1*sum2/n) num2 = sqrt( (sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n)) if num2==0: ### 如果分母为0,本处将返回0. return 0 result = num1/num2 return result ### 获取对item评分的K个最相似用户(K默认20) def topKMatches(prefer, person, itemId, k=20, sim = sim_pearson): userSet = [] scores = [] users = [] #找出所有prefer中评价过Item的用户,存入userSet for user in prefer: if itemId in prefer[user]: userSet.append(user) #计算相似性 scores = [(sim(prefer, person, other),other) for other in userSet if other!=person] #按相似度排序 scores.sort() scores.reverse() if len(scores)<=k: #如果小于k,只选择这些做推荐。 for item in scores: users.append(item[1]) #提取每项的userId return users else: #如果>k,截取k个用户 kscore = scores[0:k] for item in kscore: users.append(item[1]) #提取每项的userId return users #返回K个最相似用户的ID ### 计算用户的平均评分 def getAverage(prefer, userId): count = 0 sum = 0 for item in prefer[userId]: sum = sum + prefer[userId][item] count = count+1 return sum/count ### 平均加权策略,预测userId对itemId的评分 def getRating(prefer1, userId, itemId, knumber=20,similarity=sim_pearson): sim = 0.0 averageOther =0.0 jiaquanAverage = 0.0 simSums = 0.0 #获取K近邻用户(评过分的用户集) users = topKMatches(prefer1, userId, itemId, k=knumber, sim = sim_pearson) #获取userId 的平均值 averageOfUser = getAverage(prefer1, userId) #计算每个用户的加权,预测 for other in users: sim = similarity(prefer1, userId, other) #计算比较其他用户的相似度 averageOther = getAverage(prefer1, other) #其他用户的平均分 # 累加 simSums += abs(sim) #取绝对值 jiaquanAverage += (prefer1[other][itemId]-averageOther)*sim #累加,一些值为负 # simSums为0,即该项目尚未被其他用户评分,这里的处理方法:返回用户平均分 if simSums==0: return averageOfUser else: return (averageOfUser + jiaquanAverage/simSums) ##================================================================== ## getAllUserRating(): 获取所有用户的预测评分,存放到fileResult中 ## ## 参数:fileTrain,fileTest 是训练文件和对应的测试文件,fileResult为结果文件 ## similarity是相似度度量方法,默认是皮尔森。 ##================================================================== def getAllUserRating(fileTrain=‘u1.base‘, fileTest=‘u1.test‘, fileResult=‘result.txt‘, similarity=sim_pearson): prefer1 = loadMovieLensTrain(fileTrain) # 加载训练集 prefer2 = loadMovieLensTest(fileTest) # 加载测试集 inAllnum = 0 file = open(fileResult, ‘a‘) file.write("%s\n"%("------------------------------------------------------")) for userid in prefer2: #test集中每个用户 for item in prefer2[userid]: #对于test集合中每一个项目用base数据集,CF预测评分 rating = getRating(prefer1, userid, item, 20) #基于训练集预测用户评分(用户数目<=K) file.write(‘%s\t%s\t%s\n‘%(userid, item, rating)) inAllnum = inAllnum +1 file.close() print("-------------Completed!!-----------",inAllnum) ############ 主程序 ############## if __name__ == "__main__": print("\n--------------推荐系统 运行中... -----------\n") getAllUserRating(‘u1.base‘, ‘u1.test‘, ‘result.txt‘)
标签:
原文地址:http://www.cnblogs.com/blueyyc/p/5666463.html