"""User-based and item-based collaborative filtering helpers built on pandas.

The similarity functions (``sim_distance``, ``sim_pearson``) and
``topMatches`` expect a DataFrame whose *columns* are the entities being
compared and whose rows are the things they have in common; a missing rating
is NaN.  ``DataFrame(critics)`` has that shape for comparing critics.
"""

from math import sqrt
import operator

import numpy as np
import pandas as pd
from pandas import DataFrame
# ``scipy.stats.stats`` is a deprecated private namespace; ``pearsonr`` lives
# in the public ``scipy.stats`` module.  The ``scss`` alias is kept.
import scipy.stats as scss

# A dictionary of movie critics and their ratings of a small
# set of movies
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}


def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity of two columns of *prefs*.

    Args:
        prefs: DataFrame whose columns include ``person1`` and ``person2``;
            missing ratings are NaN.
        person1: Label of the first column.
        person2: Label of the second column.

    Returns:
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance over the rows
        both columns have a value for, or ``0`` when they share no rows.
    """
    # NaN marks rows where at least one of the two has no rating.
    diff = (prefs[person1] - prefs[person2]).dropna()
    if diff.empty:
        return 0
    return 1 / (1 + sqrt((diff ** 2).sum()))


def sim_pearson(prefs, person1, person2):
    """Return the Pearson correlation of two columns of *prefs*.

    Only rows where both columns have a value are used.

    Args:
        prefs: DataFrame whose columns include ``person1`` and ``person2``;
            missing ratings are NaN.
        person1: Label of the first column.
        person2: Label of the second column.

    Returns:
        The correlation coefficient, or ``0`` when the correlation is
        undefined: fewer than two shared rows, or one side is constant over
        the shared rows.
    """
    common = prefs[[person1, person2]].dropna(axis=0, how='any')
    # pearsonr needs at least two points; with fewer it raises.
    if len(common) < 2:
        return 0
    first = common[person1]
    second = common[person2]
    # A constant series has zero variance, so pearsonr would return NaN.
    if first.min() == first.max() or second.min() == second.max():
        return 0
    sim = scss.pearsonr(first, second)[0]
    # NaN would break the ordering done by topMatches; treat as "no signal".
    return 0 if np.isnan(sim) else sim


def topMatches(prefs, person, n=3, similarity=sim_distance):
    """Return the *n* columns of *prefs* most similar to *person*.

    Args:
        prefs: DataFrame whose columns are the entities to compare.
        person: Column label to find matches for.
        n: Maximum number of matches to return.
        similarity: Callable ``(prefs, a, b) -> number``.

    Returns:
        A list of ``(label, similarity)`` tuples, highest similarity first.
    """
    sims = {
        other: similarity(prefs, person, other)
        for other in prefs.columns
        if other != person
    }
    ranked = sorted(sims.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:n]


def getRecommendations(prefs, person, similarity=sim_pearson):
    """Recommend items for *person* from similarity-weighted ratings of others.

    Note the orientation: *prefs* is transposed on entry, so it must have one
    ROW per person and one column per item (the opposite of what the
    similarity functions take).

    Args:
        prefs: Ratings with persons as rows and items as columns; anything
            ``DataFrame(...)`` accepts in that orientation.
        person: Row label of the person to recommend for.
        similarity: Callable ``(prefs, a, b) -> number`` applied to the
            transposed frame.

    Returns:
        A list of ``(predicted_rating, item)`` tuples, highest rating first,
        covering items *person* has not rated (NaN or 0).
    """
    prefs = DataFrame(prefs).T  # columns are now persons
    own = prefs[person]
    unrated = own.isna() | (own == 0)

    totals = {}
    sim_sums = {}
    for other in prefs.columns:
        if other == person:
            continue
        sim = similarity(prefs, other, person)
        # Ignore non-positive (and undefined) similarities.
        if np.isnan(sim) or sim <= 0:
            continue
        # Only items the target has not rated but the other person has.
        for item, rating in prefs[other][unrated].dropna().items():
            totals[item] = totals.get(item, 0) + rating * sim
            sim_sums[item] = sim_sums.get(item, 0) + sim

    rankings = [(total / sim_sums[item], item) for item, total in totals.items()]
    rankings.sort(reverse=True)
    return rankings


def caculateSimilarItems(prefs, n=10):
    """Build a mapping from every column of *prefs* to its most similar columns.

    Args:
        prefs: DataFrame whose columns are the items to compare.
        n: Number of similar items to keep per item.

    Returns:
        A dict ``{item: [(other_item, similarity), ...]}`` computed with
        ``sim_distance``.
    """
    return {
        item: topMatches(prefs, item, n=n, similarity=sim_distance)
        for item in prefs.columns
    }


def getRecommandationItems(prefs, simularItems, person):
    """Recommend items for *person* using a precomputed item-similarity table.

    Args:
        prefs: DataFrame with persons as rows and items as columns; missing
            ratings are NaN.
        simularItems: Mapping ``{item: [(other_item, similarity), ...]}`` as
            returned by ``caculateSimilarItems``.
        person: Row label of the person to recommend for.

    Returns:
        A list of ``(predicted_rating, item)`` tuples, highest rating first.
        Candidates whose accumulated similarity is zero are omitted because
        no weighted average can be formed for them.
    """
    own = prefs.loc[person, :]
    watched = own[~own.isna()]

    totals = {}
    sim_sums = {}
    # Every movie the current user has already rated.
    for title, rating in watched.items():
        # Movies with no similarity entry (e.g. nobody else rated them).
        if title not in simularItems:
            continue
        for item, sim in simularItems[title]:
            if item in watched.index:
                continue
            totals[item] = totals.get(item, 0) + rating * sim
            sim_sums[item] = sim_sums.get(item, 0) + sim

    # A zero similarity total would make the division yield NaN/inf.
    ranking = [
        (score / sim_sums[item], item)
        for item, score in totals.items()
        if sim_sums[item]
    ]
    ranking.sort(reverse=True)
    return ranking


if __name__ == "__main__":
    # Raw string: the Windows path contains backslashes that are not escapes.
    homePath = r'P:\Department\Celer\個人資料夾\F3234506_麗娟\Extra\Python\资源\9780596529321-master\PCI_Code Folder\chapter2\DataSet\ml-latest-small'

    with open(homePath + '\\ratings.csv', 'rb') as ratings_f:
        ratings = pd.read_csv(ratings_f)
    with open(homePath + '\\movies.csv', 'rb') as movies_f:
        movies = pd.read_csv(movies_f)
    # NOTE(review): tags is loaded but not used anywhere below.
    with open(homePath + '\\tags.csv', 'rb') as tags_f:
        tags = pd.read_csv(tags_f)

    # Titles become the pivot columns, so they have to be unique.
    movies = movies.drop_duplicates(['title'])
    data = pd.merge(ratings, movies, on='movieId', how='outer')
    data = data.pivot(index='userId', columns='title', values='rating')

    # Item similarities are computed over the first 500 title columns only.
    similarity = caculateSimilarItems(data.iloc[:, 0:500])
    recommItems = getRecommandationItems(data, similarity, 6)
    print(recommItems)
# Original article: https://www.cnblogs.com/Colleen-Blog/p/10865201.html