Hledání modelu uživatele
require(arules)
## Loading required package: arules
## Warning: package 'arules' was built under R version 3.4.2
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
load('C:/Users/Marta/Desktop/tirpitz/project/exercises.RData')
movies=unique(ratings[,c("MOVIEID", "RATING","MAVG", "CNT", "IMDBRATING", "GENRE1" , "METASCORE")])
to_clust=data.frame(GENRE=unique(movies$GENRE1))
to_clust$RATING=tapply(movies$RATING,movies$GENRE1,mean)
#to_clust$CNT=tapply(movies$CNT,movies$GENRE1,mean)
#to_clust$MAVG=tapply(movies$MAVG,movies$GENRE1,mean)
#to_clust[,c(2:ncol(to_clust))]=scale(to_clust[,c(2:ncol(to_clust))])
set.seed(100)
hc=hclust(dist(to_clust[,2:ncol(to_clust)]),method='ave')
plot(hc,labels=to_clust$GENRE,xlab='')
to_clust[order(to_clust$RATING),]
## GENRE RATING
## 12 Romance 2.000000
## 13 Thriller 3.000000
## 18 Sci-Fi 3.039735
## 15 Musical 3.077419
## 10 Family 3.081081
## 1 Drama 3.087818
## 5 Adventure 3.108476
## 3 Documentary 3.122288
## 4 Action 3.128991
## 6 Horror 3.137464
## 8 Biography 3.139720
## 17 Fantasy 3.147368
## 2 Comedy 3.167890
## 20 History 3.170455
## 16 Film-Noir 3.200000
## 11 Western 3.250000
## 7 Crime 3.264840
## 14 Animation 3.285714
## 9 Mystery 3.307692
## 19 War 3.400000
gen_clust=tapply(ratings$RATING,list(ratings$USERID,ratings$GENRE1),length)
gen_clust[is.na(gen_clust)]=0
sums=apply(gen_clust,1,sum)
gen_clust=gen_clust/sums
hc=hclust(dist(gen_clust[,2:ncol(gen_clust)]),method='ave')
plot(as.dendrogram(hc),horiz=TRUE,nodePar=list(lab.cex=0.1))
K-means
set.seed(1)
optClust=6
km.o=kmeans(gen_clust,optClust,nstart=3)
plot(km.o$centers[1,],type='b',col=1,pch=2,ylim=c(0,max(km.o$centers)),xaxt='n', main=paste0('Clusters for Genres (Flix)'),xlab='',ylab="popularity")#
labs=colnames(km.o$centers)
axis(1,at=0:length(labs),labels=c(0,labs),las=2,cex=0.5)
grid(length(labs)/2+1)
for(g in 2:optClust){
points(km.o$centers[g,],type='b',col=g,pch=g+1)
}
#plot(2:nclust,sapply(2:nclust,FUN=function(x)km.o[[x]]$tot.withinss ))
pr.out=prcomp(km.o$centers, scale=TRUE)
name.index=apply(km.o$centers,1,which.max)
clust.names=sapply(name.index,
FUN=function(x){colnames(km.o$centers)[x]})
mm=predict(pr.out,newdata=gen_clust)
plot(jitter(mm[,1:2]), col=km.o$cluster,
pch='.',xlab="1.principal component",ylab="2. PC",
main="User Clustering acc. to Genre (f)",
xlim=c(-20,20),ylim=c(-10,10),cex=1.3)
text(pr.out$x[,1:2], pch='.',col=1:optClust, labels = clust.names,
cex=1.5)
rules <- apriori(ratings[,c('dRATING','UGENRE','GENREMATCH','DIRECTOR')],
appearance = list(rhs=c("dRATING=5"), default="lhs"),
parameter = list(minlen=1, supp=0.0005, conf=0.4),
# appearance = list(rhs=c("Rating=5"), default="lhs"),
control = list(verbose=F))
rules.sorted <- sort(rules, by="lift")
# inspect(rules.sorted)
length(rules)
## [1] 37
# find redundant rules
subset.matrix <- is.subset(rules.sorted, rules.sorted)
subset.matrix[lower.tri(subset.matrix, diag=T)] = FALSE
redundant <- colSums(subset.matrix, na.rm=T) >= 1
#which(redundant)
# remove redundant rules
rules.pruned <- rules.sorted[!redundant]
inspect(rules.pruned[1:3])
## lhs rhs support confidence lift count
## [1] {UGENRE=Comedy,
## DIRECTOR=Andrew Stanton, Lee Unkrich} => {dRATING=5} 0.0008216443 0.5950266 2.650103 335
## [2] {DIRECTOR=Andrew Stanton, Lee Unkrich} => {dRATING=5} 0.0013097256 0.5842451 2.602084 534
## [3] {GENREMATCH,
## DIRECTOR=Andy Wachowski, Lana Wachowski} => {dRATING=5} 0.0005714720 0.5796020 2.581405 233
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 3.4.2
## Loading required package: grid
plot(rules.pruned)
plot(rules.pruned[1:10], method="graph", control=list(type="items"))
## Warning: Unknown control parameters: type
## Available control parameters (with default values):
## main = Graph for 10 rules
## nodeColors = c("#66CC6680", "#9999CC80")
## nodeCol = c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF", "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF", "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol = c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF", "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF", "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha = 0.5
## cex = 1
## itemLabels = TRUE
## labelCol = #000000B3
## measureLabels = FALSE
## precision = 3
## layout = NULL
## layoutParams = list()
## arrowSize = 0.5
## engine = igraph
## plot = TRUE
## plot_options = list()
## max = 100
## verbose = FALSE
plot(rules.pruned[1:20], method="paracoord", control=list(reorder=TRUE))
Rozdělení dat na trénovací/testovací, naučení modelu, predikce
ratings=ratings[! is.element(ratings[,'GENRE1'],c('Film-Noir','War')) & !is.na(ratings$METASCORE) ,]
#rozdeleni dat na trenovaci a testovaci
nRec=nrow(ratings)
set.seed(1)
trainRatio=0.8
train=sample(nRec,nRec*trainRatio)
test=setdiff(1:nRec, train)
#tvorba linearniho regresniho modelu
model=lm(RATING~MAVG+IMDBRATING,data=ratings[train,])
model=lm(RATING~IMDBRATING+MAVG+GENRE1+USHIFT+METASCORE,data=ratings[train,])
library(splines)
library(gam)
## Warning: package 'gam' was built under R version 3.4.2
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.2
## Loaded gam 1.14-4
model=gam(RATING~MAVG+IMDBRATING,data=ratings[train,])
model.b=gam(as.factor(RATING)~MAVG+IMDBRATING,data=ratings[train,],family=binomial)
#prohlizeni modelu
summary(model)
##
## Call: gam(formula = RATING ~ MAVG + IMDBRATING, data = ratings[train,
## ])
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5166 -0.6125 0.1094 0.7027 3.1908
##
## (Dispersion Parameter for gaussian family taken to be 0.9944)
##
## Null Deviance: 303694.2 on 270230 degrees of freedom
## Residual Deviance: 268727.9 on 270228 degrees of freedom
## AIC: 765383.1
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## MAVG 1 34966 34966 35161.4383 <2e-16 ***
## IMDBRATING 1 0 0 0.0074 0.9317
## Residuals 270228 268728 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#predikce pro testovaci data
pred=predict(model, newdata=ratings[test,],subset=! is.element(ratings[test,'GENRE1'],c('Film-Noir','War')))
#cbind(decathlon[test,c(1:2,3)],dec.pred)[order(-dec.pred),]
#cor.test(decathlon[test,3],dec.pred,method='spearman')#pearson na rank
Chybová funkce
with(ratings,
cat(' Mean absolute error:',mean(abs(pred-RATING[test]),na.rm = TRUE),'\n',
'Mean square error:',mean((pred-RATING[test])^2,na.rm = TRUE),'\n',
'Relative mean absolute error:',mean(abs(pred-RATING[test]),na.rm = TRUE)/mean(RATING[test]))
)
## Mean absolute error: 0.8011851
## Mean square error: 0.9919521
## Relative mean absolute error: 0.2218625
Porovnání do sebe vnořených modelů
model=lm(RATING~MAVG+IMDBRATING+USHIFT+METASCORE+GENRE1,data=ratings[train,])
model0=lm(RATING~1,data=ratings[train,])
model1=lm(RATING~MAVG,data=ratings[train,])
model2=lm(RATING~MAVG+IMDBRATING,data=ratings[train,])
model3=lm(RATING~MAVG+IMDBRATING+USHIFT,data=ratings[train,])
model4=lm(RATING~MAVG+IMDBRATING+USHIFT+METASCORE,data=ratings[train,])
anova(model0,model1,model2,model3,model4,model)
## Analysis of Variance Table
##
## Model 1: RATING ~ 1
## Model 2: RATING ~ MAVG
## Model 3: RATING ~ MAVG + IMDBRATING
## Model 4: RATING ~ MAVG + IMDBRATING + USHIFT
## Model 5: RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE
## Model 6: RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE + GENRE1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 270230 303694
## 2 270229 268728 1 34966 41735.0023 < 2.2e-16 ***
## 3 270228 268728 1 0 0.0087 0.9256
## 4 270227 226470 1 42257 50437.6516 < 2.2e-16 ***
## 5 270226 226443 1 28 33.1072 8.731e-09 ***
## 6 270210 226386 16 56 4.2118 2.854e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model8=lm(RATING~MAVG+USHIFT+METASCORE+GENRE1,data=ratings[train,])
anova(model8,model)
## Analysis of Variance Table
##
## Model 1: RATING ~ MAVG + USHIFT + METASCORE + GENRE1
## Model 2: RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE + GENRE1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 270211 226428
## 2 270210 226386 1 41.96 50.082 1.478e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model
##
## Call:
## lm(formula = RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE +
## GENRE1, data = ratings[train, ])
##
## Coefficients:
## (Intercept) MAVG IMDBRATING
## 0.0893804 0.9077739 0.0284350
## USHIFT METASCORE GENRE1Adventure
## 1.0093581 0.0009192 0.0056177
## GENRE1Animation GENRE1Biography GENRE1Comedy
## -0.0145083 0.0417096 0.0141992
## GENRE1Crime GENRE1Documentary GENRE1Drama
## 0.0162586 0.0412721 0.0345824
## GENRE1Family GENRE1Fantasy GENRE1Horror
## -0.1395856 0.0603930 0.0185457
## GENRE1Musical GENRE1Mystery GENRE1Romance
## -0.0309363 0.0197252 0.0719617
## GENRE1Sci-Fi GENRE1Thriller GENRE1Western
## 0.0582193 0.0552423 -0.0205674
model=lm(RATING~MAVG+CNT+IMDBRATING+USHIFT,data=ratings[train,])
#predikce pro testovaci data
pred=predict(model, newdata=ratings[test,])
confusion_t=table(ratings$RATING[test], round(pred))
#contour(imdb_range,rating_range,tab_rat,nlevels=45,add=T)
#image(imdb_range,rating_range,tab_rat)
persp(1:5,1:6,confusion_t-confusion_t*diag(1,5,6),theta=40,phi=30)
#model=lm(RATING~MAVG+CNT+GENRE1,data=ratings[sel,])
#summary(model)
sel=TRUE
require(earth)
## Loading required package: earth
## Warning: package 'earth' was built under R version 3.4.2
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.4.2
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 3.4.2
earth.mod <- earth(as.matrix(ratings[sel,c('MAVG','IMDBRATING','CNT')]),as.vector(ratings$RATING[sel]))
#earth(MAVG~CNT,data=ratings[sel,])
plotmo(earth.mod)
## plotmo grid: MAVG IMDBRATING CNT
## 3.610329 7 438
summary(earth.mod)
## Call: earth(x=as.matrix(ratings[sel,c("MAVG","IMDBRATING","CNT")...),
## y=as.vector(ratings$RATING[sel]))
##
## coefficients
## (Intercept) 2.7793569
## h(2.77778-MAVG) -0.9334554
## h(MAVG-2.77778) 0.9987702
##
## Selected 3 of 3 terms, and 1 of 3 predictors
## Termination condition: RSq changed by less than 0.001 at 3 terms
## Importance: MAVG, IMDBRATING-unused, CNT-unused
## Number of terms at each degree of interaction: 1 2 (additive model)
## GCV 0.9939625 RSS 335739.6 GRSq 0.1152168 RSq 0.1152378
library(tree)
tree.m=tree(RATING~GENRE1+MAVG+IMDBRATING,ratings)
#summary(tree.m)
plot(tree.m)
text(tree.m,pretty=0)