2016年12月29日 星期四

[R] RJDBC的例子,透過顧客行為的相似度建立推薦系統

自從之前安裝好R上的 rJava 和 RJDBC 套件(這篇:R 上安裝 RJDBC 和 rJava 的除錯),就應該可以借回那本書繼續學習R上連結用MySQL的例子。加上最近又想繼續股票短線的技術分析模型,希望將計算後的指標輸出到 Excel可以接手做試驗的形式,於是這幾天都在對著RStudio。股票方面思考過匯出怎樣的資料和格式才方便Excel處理,整理過之前的幾段Scirpt, 但想要的結果仍在做回溯測試中。而SQL的例子就可以告一段落了。
這個例子中的"Sakila"數據是MySQL 提供的樣本,假設一間DVD租售店的營運,數據庫的結構參考: https://dev.mysql.com/doc/sakila/en/sakila-structure.html。這個教學例子是利用顧客的租借記錄,以及電影的類別、分級等資料,製做一個推薦系統,為顧客推薦他可能有興趣的電影。一個方法是計算利用顧客的借閱記錄計算顧客間的相似性,這個就是下面的例子。書中也提及另一個方法是利用被電影被借閱的記錄計算電影間的相似性。實際上,更好的算法應該還要考慮在新的顧客或電影時如何更新推薦。



#########
#
#Recommendation System
#
######### 

setwd("(folder path)")
dbUser="(user name)"; dbPw="(user pw)"

#原本使用的ODBC套件
#require(package=RODBC)
#ch = odbcConnect(dsn=sakila)
#customer = sqlQuery(channel=ch, query=sql)
#odbcClose(channel=ch)

# 使用RJDBC套件
library(RJDBC)
drv = JDBC("com.mysql.jdbc.Driver", "/Users/ryanli/Documents/WorkingDirectory/R/Tools/mysql-connector-java-5.1.39/mysql-connector-java-5.1.39-bin.jar", identifier.quote="`")
conn = dbConnect(drv, "jdbc:mysql://localhost:3306/sakila", dbUser, dbPw)
rm(dbUser, dbPw)
sql = 'SELECT * FROM rental AS t1 
JOIN 
(SELECT film.*, film_category.category_id FROM film,film_category 
WHERE film.film_id=film_category.film_id) AS t2 
ON t2.film_id= (SELECT film_id FROM inventory AS t3 
WHERE t3.inventory_id = t1.inventory_id);'
customer = dbGetQuery(conn, sql)
rm(conn)

#資料整理
customer = customer[,-c(10, 13, 20)]
dim(customer)

customer$category_id = as.factor(customer$category_id)
require(Hmisc)
describe(customer)
customer = customer[,-1]

# 檢視 Category & Rating
sql = 'SELECT category_id, name FROM category'
category_name = dbGetQuery(conn, sql)
customer$category_id = as.factor(customer$category_id)
levels(customer$category_id) = category_name$name

require(ggplot2)
p = ggplot(data=customer, mapping=aes(x=category_id, fill=..count..))
p+geom_bar()
p = ggplot(data=customer, mapping=aes(x=rating, fill=..count..))
p+geom_bar()


# 計算顧客間的相似性
# by similarity in distribution of rating, category, title
similarity = function(c1,c2,w1=0.3,w2=0.3,w3=1-w1-w2){
  c1_rating = table(c1$rating)/dim(c1)[1]
  c1_category = table(c1$category_id)/dim(c1)[1]
  c1_title = unique(c1$title)
  c2_rating = table(c2$rating)/dim(c2)[1]
  c2_category = table(c2$category_id)/dim(c2)[1]
  c2_title = unique(c2$title)
  s1 = 1/(1+sqrt(sum((c1_rating-c2_rating)^2)))
  s2 = 1/(1+sqrt(sum((c1_category-c2_category)^2)))
  s3 = length(intersect(c1_title, c2_title))/
    (max(length(c1_title),length(c2_title)))
  s = s1*w1 + s2*w2 + s3*w3
  return(s)
}

cu = customer[,c("customer_id", "title", "rating", "category_id")]
cu$customer_id = factor(cu$customer_id)
cu$category_id = factor(cu$category_id)
cu$rating = factor(cu$rating)
cus = split(x=cu, f=cu$customer_id)

result = 0; m = 1; len = length(cus)
for(j in 2:len){
 for(i in 1:(j-1)){
  result[m] = similarity(cus[[i]], cus[[j]])
  m=m+1
 }
}
save(result, file="./similarity.Rdata")


# 重整相似性矩陣
# load(file="./similarity.Rdata")
ma = diag(x=rep(NA,len))
ma[upper.tri(ma)] = result
ma[lower.tri(ma)] = t(ma)[lower.tri(t(ma))] 
ma[1:5,1:5]
library(reshape2)
p = ggplot(melt(ma[1:30,1:30], value.name="similarity"), aes(Var1, Var2, fill=similarity))
p + geom_raster()+labs(x="customers", y="customers")

# 為所有電影借閱頻率排序
film_rate = as.data.frame(table(customer$film_id))
names(film_rate)[1] = "film_id"
head(film_rate)

# 進行推薦
MaxN = function(similarity, n=5){
 re = apply(X=ma, MARGIN=1, FUN=function(x) order(x, decreasing=TRUE)[2:(n+1)])
 dimnames(re) = list(paste0("Recommend", c(1:n)), dimnames(re)[1])
 return(re)
}
re = MaxN(similarity=ma, n=3)
userID = 1
customerID = sort(unique(cu$customer_id))

movies = customer[customer$customer_id%in%re[, userID], c("customer_id", "title", "film_id")]
movies = movies[!duplicated(movies$title),]
sim = data.frame(ID=re[, userID], sim=ma[1,re[, userID]])
movies = merge(x=movies, y=sim, by.x="customer_id", by.y="ID")
movies = merge(x=movies, y=film_rate, by="film_id")
movies = transform(movies, Freq_adjust=Freq*sim)

recommend_movies = movies[order(movies$Freq_adjust, decreasing=TRUE), "title"]
setdiff(recommend_movies, customer[customer$customer_id==1, "title"])[1:10]





沒有留言:

張貼留言