這個例子中的"Sakila"數據是MySQL 提供的樣本,假設一間DVD租售店的營運,數據庫的結構參考: https://dev.mysql.com/doc/sakila/en/sakila-structure.html。這個教學例子是利用顧客的租借記錄,以及電影的類別、分級等資料,製做一個推薦系統,為顧客推薦他可能有興趣的電影。一個方法是計算利用顧客的借閱記錄計算顧客間的相似性,這個就是下面的例子。書中也提及另一個方法是利用被電影被借閱的記錄計算電影間的相似性。實際上,更好的算法應該還要考慮在新的顧客或電影時如何更新推薦。
######### # #Recommendation System # #########
setwd("(folder path)") dbUser="(user name)"; dbPw="(user pw)"
#原本使用的ODBC套件 #require(package=RODBC) #ch = odbcConnect(dsn=sakila) #customer = sqlQuery(channel=ch, query=sql) #odbcClose(channel=ch)
# 使用RJDBC套件
library(RJDBC) drv = JDBC("com.mysql.jdbc.Driver", "/Users/ryanli/Documents/WorkingDirectory/R/Tools/mysql-connector-java-5.1.39/mysql-connector-java-5.1.39-bin.jar", identifier.quote="`") conn = dbConnect(drv, "jdbc:mysql://localhost:3306/sakila", dbUser, dbPw) rm(dbUser, dbPw) sql = 'SELECT * FROM rental AS t1 JOIN (SELECT film.*, film_category.category_id FROM film,film_category WHERE film.film_id=film_category.film_id) AS t2 ON t2.film_id= (SELECT film_id FROM inventory AS t3 WHERE t3.inventory_id = t1.inventory_id);' customer = dbGetQuery(conn, sql) rm(conn) #資料整理 customer = customer[,-c(10, 13, 20)] dim(customer) customer$category_id = as.factor(customer$category_id) require(Hmisc) describe(customer) customer = customer[,-1] # 檢視 Category &
Rating
sql = 'SELECT category_id, name FROM category' category_name = dbGetQuery(conn, sql) customer$category_id = as.factor(customer$category_id) levels(customer$category_id) = category_name$name require(ggplot2) p = ggplot(data=customer, mapping=aes(x=category_id, fill=..count..)) p+geom_bar() p = ggplot(data=customer, mapping=aes(x=rating, fill=..count..)) p+geom_bar() # 計算顧客間的相似性 # by similarity in distribution of rating, category, title similarity = function(c1,c2,w1=0.3,w2=0.3,w3=1-w1-w2){ c1_rating = table(c1$rating)/dim(c1)[1] c1_category = table(c1$category_id)/dim(c1)[1] c1_title = unique(c1$title) c2_rating = table(c2$rating)/dim(c2)[1] c2_category = table(c2$category_id)/dim(c2)[1] c2_title = unique(c2$title) s1 = 1/(1+sqrt(sum((c1_rating-c2_rating)^2))) s2 = 1/(1+sqrt(sum((c1_category-c2_category)^2))) s3 = length(intersect(c1_title, c2_title))/ (max(length(c1_title),length(c2_title))) s = s1*w1 + s2*w2 + s3*w3 return(s) } cu = customer[,c("customer_id", "title", "rating", "category_id")] cu$customer_id = factor(cu$customer_id) cu$category_id = factor(cu$category_id) cu$rating = factor(cu$rating) cus = split(x=cu, f=cu$customer_id) result = 0; m = 1; len = length(cus) for(j in 2:len){ for(i in 1:(j-1)){ result[m] = similarity(cus[[i]], cus[[j]]) m=m+1 } } save(result, file="./similarity.Rdata") # 重整相似性矩陣 # load(file="./similarity.Rdata") ma = diag(x=rep(NA,len)) ma[upper.tri(ma)] = result ma[lower.tri(ma)] = t(ma)[lower.tri(t(ma))] ma[1:5,1:5] library(reshape2) p = ggplot(melt(ma[1:30,1:30], value.name="similarity"), aes(Var1, Var2, fill=similarity)) p + geom_raster()+labs(x="customers", y="customers")
# 為所有電影借閱頻率排序 film_rate = as.data.frame(table(customer$film_id)) names(film_rate)[1] = "film_id" head(film_rate)
# 進行推薦 MaxN = function(similarity, n=5){ re = apply(X=ma, MARGIN=1, FUN=function(x) order(x, decreasing=TRUE)[2:(n+1)]) dimnames(re) = list(paste0("Recommend", c(1:n)), dimnames(re)[1]) return(re) } re = MaxN(similarity=ma, n=3)
userID = 1
customerID = sort(unique(cu$customer_id)) movies = customer[customer$customer_id%in%re[, userID], c("customer_id", "title", "film_id")] movies = movies[!duplicated(movies$title),] sim = data.frame(ID=re[, userID], sim=ma[1,re[, userID]]) movies = merge(x=movies, y=sim, by.x="customer_id", by.y="ID") movies = merge(x=movies, y=film_rate, by="film_id") movies = transform(movies, Freq_adjust=Freq*sim) recommend_movies = movies[order(movies$Freq_adjust, decreasing=TRUE), "title"] setdiff(recommend_movies, customer[customer$customer_id==1, "title"])[1:10]
沒有留言:
張貼留言