R Exercises 2

Hledání modelu uživatele

require(arules)

## Loading required package: arules

## Warning: package 'arules' was built under R version 3.4.2

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

load('C:/Users/Marta/Desktop/tirpitz/project/exercises.RData')
movies=unique(ratings[,c("MOVIEID", "RATING","MAVG",       "CNT",  "IMDBRATING", "GENRE1"   ,    "METASCORE")])
to_clust=data.frame(GENRE=unique(movies$GENRE1))
to_clust$RATING=tapply(movies$RATING,movies$GENRE1,mean)
#to_clust$CNT=tapply(movies$CNT,movies$GENRE1,mean)
#to_clust$MAVG=tapply(movies$MAVG,movies$GENRE1,mean)
#to_clust[,c(2:ncol(to_clust))]=scale(to_clust[,c(2:ncol(to_clust))])
set.seed(100)
hc=hclust(dist(to_clust[,2:ncol(to_clust)]),method='ave')
plot(hc,labels=to_clust$GENRE,xlab='')

to_clust[order(to_clust$RATING),]

##          GENRE   RATING
## 12     Romance 2.000000
## 13    Thriller 3.000000
## 18      Sci-Fi 3.039735
## 15     Musical 3.077419
## 10      Family 3.081081
## 1        Drama 3.087818
## 5    Adventure 3.108476
## 3  Documentary 3.122288
## 4       Action 3.128991
## 6       Horror 3.137464
## 8    Biography 3.139720
## 17     Fantasy 3.147368
## 2       Comedy 3.167890
## 20     History 3.170455
## 16   Film-Noir 3.200000
## 11     Western 3.250000
## 7        Crime 3.264840
## 14   Animation 3.285714
## 9      Mystery 3.307692
## 19         War 3.400000

gen_clust=tapply(ratings$RATING,list(ratings$USERID,ratings$GENRE1),length)
gen_clust[is.na(gen_clust)]=0
sums=apply(gen_clust,1,sum)


gen_clust=gen_clust/sums
hc=hclust(dist(gen_clust[,2:ncol(gen_clust)]),method='ave')

plot(as.dendrogram(hc),horiz=TRUE,nodePar=list(lab.cex=0.1))

K-means

set.seed(1)
optClust=6
km.o=kmeans(gen_clust,optClust,nstart=3)
plot(km.o$centers[1,],type='b',col=1,pch=2,ylim=c(0,max(km.o$centers)),xaxt='n', main=paste0('Clusters for Genres (Flix)'),xlab='',ylab="popularity")#
labs=colnames(km.o$centers)
axis(1,at=0:length(labs),labels=c(0,labs),las=2,cex=0.5)
grid(length(labs)/2+1)
for(g in 2:optClust){
  points(km.o$centers[g,],type='b',col=g,pch=g+1)
}

#plot(2:nclust,sapply(2:nclust,FUN=function(x)km.o[[x]]$tot.withinss ))

pr.out=prcomp(km.o$centers, scale=TRUE)
name.index=apply(km.o$centers,1,which.max)
clust.names=sapply(name.index,        
                   FUN=function(x){colnames(km.o$centers)[x]})
mm=predict(pr.out,newdata=gen_clust)
plot(jitter(mm[,1:2]),  col=km.o$cluster,      
     pch='.',xlab="1.principal component",ylab="2. PC",
     main="User Clustering acc. to Genre (f)", 
     xlim=c(-20,20),ylim=c(-10,10),cex=1.3)
text(pr.out$x[,1:2],  pch='.',col=1:optClust, labels = clust.names, 
     cex=1.5)

 rules <- apriori(ratings[,c('dRATING','UGENRE','GENREMATCH','DIRECTOR')],
                  appearance = list(rhs=c("dRATING=5"), default="lhs"),
                  parameter = list(minlen=1, supp=0.0005, conf=0.4),
#  appearance = list(rhs=c("Rating=5"),  default="lhs"),
   control = list(verbose=F))
rules.sorted <- sort(rules, by="lift")
# inspect(rules.sorted)
length(rules)

## [1] 37

 # find redundant rules
 subset.matrix <- is.subset(rules.sorted, rules.sorted)
 subset.matrix[lower.tri(subset.matrix, diag=T)] = FALSE
 redundant <- colSums(subset.matrix, na.rm=T) >= 1
 #which(redundant)

 # remove redundant rules
 rules.pruned <- rules.sorted[!redundant]
 inspect(rules.pruned[1:3])

##     lhs                                          rhs              support confidence     lift count
## [1] {UGENRE=Comedy,                                                                                
##      DIRECTOR=Andrew Stanton, Lee Unkrich}    => {dRATING=5} 0.0008216443  0.5950266 2.650103   335
## [2] {DIRECTOR=Andrew Stanton, Lee Unkrich}    => {dRATING=5} 0.0013097256  0.5842451 2.602084   534
## [3] {GENREMATCH,                                                                                   
##      DIRECTOR=Andy Wachowski, Lana Wachowski} => {dRATING=5} 0.0005714720  0.5796020 2.581405   233

 library(arulesViz)

## Warning: package 'arulesViz' was built under R version 3.4.2

## Loading required package: grid

 plot(rules.pruned)

 plot(rules.pruned[1:10], method="graph", control=list(type="items"))

## Warning: Unknown control parameters: type

## Available control parameters (with default values):
## main  =  Graph for 10 rules
## nodeColors    =  c("#66CC6680", "#9999CC80")
## nodeCol   =  c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF",  "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF",  "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol   =  c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF",  "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF",  "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha     =  0.5
## cex   =  1
## itemLabels    =  TRUE
## labelCol  =  #000000B3
## measureLabels     =  FALSE
## precision     =  3
## layout    =  NULL
## layoutParams  =  list()
## arrowSize     =  0.5
## engine    =  igraph
## plot  =  TRUE
## plot_options  =  list()
## max   =  100
## verbose   =  FALSE

 plot(rules.pruned[1:20], method="paracoord", control=list(reorder=TRUE))

Rozdělení dat na trénovací/testovací, naučení modelu, predikce

ratings=ratings[! is.element(ratings[,'GENRE1'],c('Film-Noir','War')) & !is.na(ratings$METASCORE) ,]
#rozdeleni dat na trenovaci a testovaci
nRec=nrow(ratings)
set.seed(1)
trainRatio=0.8
train=sample(nRec,nRec*trainRatio)
test=setdiff(1:nRec, train)

#tvorba linearniho regresniho modelu
model=lm(RATING~MAVG+IMDBRATING,data=ratings[train,])
model=lm(RATING~IMDBRATING+MAVG+GENRE1+USHIFT+METASCORE,data=ratings[train,])
library(splines)
library(gam)

## Warning: package 'gam' was built under R version 3.4.2

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 3.4.2

## Loaded gam 1.14-4

model=gam(RATING~MAVG+IMDBRATING,data=ratings[train,])
model.b=gam(as.factor(RATING)~MAVG+IMDBRATING,data=ratings[train,],family=binomial)

#prohlizeni modelu
summary(model)

## 
## Call: gam(formula = RATING ~ MAVG + IMDBRATING, data = ratings[train, 
##     ])
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5166 -0.6125  0.1094  0.7027  3.1908 
## 
## (Dispersion Parameter for gaussian family taken to be 0.9944)
## 
##     Null Deviance: 303694.2 on 270230 degrees of freedom
## Residual Deviance: 268727.9 on 270228 degrees of freedom
## AIC: 765383.1 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                Df Sum Sq Mean Sq    F value Pr(>F)    
## MAVG            1  34966   34966 35161.4383 <2e-16 ***
## IMDBRATING      1      0       0     0.0074 0.9317    
## Residuals  270228 268728       1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

 #predikce pro testovaci data
pred=predict(model, newdata=ratings[test,],subset=! is.element(ratings[test,'GENRE1'],c('Film-Noir','War')))
#cbind(decathlon[test,c(1:2,3)],dec.pred)[order(-dec.pred),]
#cor.test(decathlon[test,3],dec.pred,method='spearman')#pearson na rank

Chybová funkce

with(ratings,
cat(' Mean absolute error:',mean(abs(pred-RATING[test]),na.rm = TRUE),'\n',
'Mean square error:',mean((pred-RATING[test])^2,na.rm = TRUE),'\n',
'Relative mean absolute error:',mean(abs(pred-RATING[test]),na.rm = TRUE)/mean(RATING[test]))
)

##  Mean absolute error: 0.8011851 
##  Mean square error: 0.9919521 
##  Relative mean absolute error: 0.2218625

Porovnání do sebe vnořených modelů

model=lm(RATING~MAVG+IMDBRATING+USHIFT+METASCORE+GENRE1,data=ratings[train,])
model0=lm(RATING~1,data=ratings[train,])
model1=lm(RATING~MAVG,data=ratings[train,])
model2=lm(RATING~MAVG+IMDBRATING,data=ratings[train,])
model3=lm(RATING~MAVG+IMDBRATING+USHIFT,data=ratings[train,])
model4=lm(RATING~MAVG+IMDBRATING+USHIFT+METASCORE,data=ratings[train,])

anova(model0,model1,model2,model3,model4,model)

## Analysis of Variance Table
## 
## Model 1: RATING ~ 1
## Model 2: RATING ~ MAVG
## Model 3: RATING ~ MAVG + IMDBRATING
## Model 4: RATING ~ MAVG + IMDBRATING + USHIFT
## Model 5: RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE
## Model 6: RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE + GENRE1
##   Res.Df    RSS Df Sum of Sq          F    Pr(>F)    
## 1 270230 303694                                      
## 2 270229 268728  1     34966 41735.0023 < 2.2e-16 ***
## 3 270228 268728  1         0     0.0087    0.9256    
## 4 270227 226470  1     42257 50437.6516 < 2.2e-16 ***
## 5 270226 226443  1        28    33.1072 8.731e-09 ***
## 6 270210 226386 16        56     4.2118 2.854e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

model8=lm(RATING~MAVG+USHIFT+METASCORE+GENRE1,data=ratings[train,])
anova(model8,model)

## Analysis of Variance Table
## 
## Model 1: RATING ~ MAVG + USHIFT + METASCORE + GENRE1
## Model 2: RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE + GENRE1
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1 270211 226428                                  
## 2 270210 226386  1     41.96 50.082 1.478e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

model

## 
## Call:
## lm(formula = RATING ~ MAVG + IMDBRATING + USHIFT + METASCORE + 
##     GENRE1, data = ratings[train, ])
## 
## Coefficients:
##       (Intercept)               MAVG         IMDBRATING  
##         0.0893804          0.9077739          0.0284350  
##            USHIFT          METASCORE    GENRE1Adventure  
##         1.0093581          0.0009192          0.0056177  
##   GENRE1Animation    GENRE1Biography       GENRE1Comedy  
##        -0.0145083          0.0417096          0.0141992  
##       GENRE1Crime  GENRE1Documentary        GENRE1Drama  
##         0.0162586          0.0412721          0.0345824  
##      GENRE1Family      GENRE1Fantasy       GENRE1Horror  
##        -0.1395856          0.0603930          0.0185457  
##     GENRE1Musical      GENRE1Mystery      GENRE1Romance  
##        -0.0309363          0.0197252          0.0719617  
##      GENRE1Sci-Fi     GENRE1Thriller      GENRE1Western  
##         0.0582193          0.0552423         -0.0205674

model=lm(RATING~MAVG+CNT+IMDBRATING+USHIFT,data=ratings[train,])

 #predikce pro testovaci data
pred=predict(model, newdata=ratings[test,])
confusion_t=table(ratings$RATING[test], round(pred))

#contour(imdb_range,rating_range,tab_rat,nlevels=45,add=T)
#image(imdb_range,rating_range,tab_rat)
persp(1:5,1:6,confusion_t-confusion_t*diag(1,5,6),theta=40,phi=30)

#model=lm(RATING~MAVG+CNT+GENRE1,data=ratings[sel,])
#summary(model)
sel=TRUE
require(earth)

## Loading required package: earth

## Warning: package 'earth' was built under R version 3.4.2

## Loading required package: plotmo

## Warning: package 'plotmo' was built under R version 3.4.2

## Loading required package: plotrix

## Loading required package: TeachingDemos

## Warning: package 'TeachingDemos' was built under R version 3.4.2

earth.mod <- earth(as.matrix(ratings[sel,c('MAVG','IMDBRATING','CNT')]),as.vector(ratings$RATING[sel]))
#earth(MAVG~CNT,data=ratings[sel,])
plotmo(earth.mod)

##  plotmo grid:    MAVG IMDBRATING CNT
##              3.610329          7 438

summary(earth.mod)

## Call: earth(x=as.matrix(ratings[sel,c("MAVG","IMDBRATING","CNT")...),
##             y=as.vector(ratings$RATING[sel]))
## 
##                 coefficients
## (Intercept)        2.7793569
## h(2.77778-MAVG)   -0.9334554
## h(MAVG-2.77778)    0.9987702
## 
## Selected 3 of 3 terms, and 1 of 3 predictors
## Termination condition: RSq changed by less than 0.001 at 3 terms
## Importance: MAVG, IMDBRATING-unused, CNT-unused
## Number of terms at each degree of interaction: 1 2 (additive model)
## GCV 0.9939625    RSS 335739.6    GRSq 0.1152168    RSq 0.1152378

library(tree)
tree.m=tree(RATING~GENRE1+MAVG+IMDBRATING,ratings)
#summary(tree.m)
plot(tree.m)
text(tree.m,pretty=0)