Tiny introduction to R: assignment and listing variables
# Bind the value of the arithmetic expression to the name `a`.
a <- 2 + 3
# A bare name evaluated at top level prints the value bound to it.
a
## [1] 5
# List the names of the objects defined so far.
ls()
## [1] "a"
Load a dataset, print summary
# Load the workspace image that supplies `ratings`, then print per-column
# summary statistics.
# The folder defaults to the original author's location; set the environment
# variable EXERCISES_DATA_DIR to run the script against a copy stored elsewhere.
data_dir <- Sys.getenv(
  "EXERCISES_DATA_DIR",
  unset = "C:/Users/Marta/Desktop/tirpitz/project"
)
data_file <- file.path(data_dir, "exercises.RData")
# Fail with an actionable message instead of load()'s generic connection error.
if (!file.exists(data_file)) {
  stop("Cannot find ", data_file,
       "; set EXERCISES_DATA_DIR to the folder holding exercises.RData",
       call. = FALSE)
}
load(data_file)
summary(ratings)
## TT USERID MOVIEID RATING
## Length:407719 Min. : 195 Min. : 6 Min. :1.000
## Class :character 1st Qu.: 663789 1st Qu.: 4736 1st Qu.:3.000
## Mode :character Median :1343182 Median : 9204 Median :4.000
## Mean :1330793 Mean : 9184 Mean :3.622
## 3rd Qu.:2000661 3rd Qu.:13827 3rd Qu.:4.000
## Max. :2648758 Max. :17770 Max. :5.000
##
## TS K MAVG CNT
## Min. :1999-12-30 Min. : 1.0 Min. :1.000 Min. : 1.0
## 1st Qu.:2004-07-03 1st Qu.: 43.0 1st Qu.:3.395 1st Qu.: 128.0
## Median :2005-02-16 Median :107.0 Median :3.625 Median : 341.0
## Mean :2004-11-19 Mean :144.5 Mean :3.621 Mean : 415.1
## 3rd Qu.:2005-07-24 3rd Qu.:214.0 3rd Qu.:3.867 3rd Qu.: 640.0
## Max. :2005-12-31 Max. :598.0 Max. :5.000 Max. :1398.0
##
## BAYESAVG IMDBRATING GENRE1 YEAR
## Min. :2.820 Min. :1.900 Comedy :126327 Length:407719
## 1st Qu.:3.460 1st Qu.:6.400 Action :101030 Class :character
## Median :3.627 Median :7.000 Drama : 70412 Mode :character
## Mean :3.646 Mean :6.956 Crime : 32369
## 3rd Qu.:3.824 3rd Qu.:7.600 Biography: 21005
## Max. :4.460 Max. :9.200 Adventure: 20485
## (Other) : 36091
## USHIFT USHIFTB UGENRE GENREMATCH
## Min. :-2.02088 Min. :-1.10638 Comedy :245614 Mode :logical
## 1st Qu.:-0.29887 1st Qu.:-0.23742 Action :134210 FALSE:258404
## Median :-0.03586 Median :-0.02675 Drama : 26010 TRUE :149315
## Mean :-0.02362 Mean :-0.02277 Crime : 689
## 3rd Qu.: 0.23167 3rd Qu.: 0.17260 Animation: 508
## Max. : 1.43803 Max. : 1.06262 Horror : 480
## (Other) : 208
## dRATING
## 1: 16362
## 2: 39230
## 3:118080
## 4:142502
## 5: 91545
##
##
## NAME
## Miss Congeniality : 1562
## Independence Day : 1402
## The Patriot : 1342
## The Day After Tomorrow : 1297
## Pirates of the Caribbean: The Curse of the Black Pearl: 1282
## Pretty Woman : 1275
## (Other) :399559
## RATED AWARDS METASCORE
## R :165688 : 23193 Min. : 1.00
## PG-13 :150684 1 nomination. : 12989 1st Qu.: 47.00
## PG : 63498 2 nominations. : 10325 Median : 61.00
## G : 8183 3 nominations. : 8316 Mean : 59.63
## APPROVED : 7660 1 win & 2 nominations.: 7656 3rd Qu.: 72.00
## NOT RATED: 4635 1 win & 3 nominations.: 5975 Max. :100.00
## (Other) : 7371 (Other) :339265 NA's :69930
## IMDBYEAR COUNTRY LANGUAGE
## Min. :1915 USA :279647 English :241914
## 1st Qu.:1992 USA, Germany : 24080 English, Spanish: 21559
## Median :1999 UK, USA : 11028 English, French : 15661
## Mean :1995 USA, Australia: 9951 English, Italian: 7239
## 3rd Qu.:2003 USA, UK : 9663 English, Russian: 6133
## Max. :2005 USA, Canada : 6934 English, German : 4966
## (Other) : 66416 (Other) :110247
## GENRES DIRECTOR
## Comedy, Drama, Romance: 25408 Steven Spielberg: 8697
## Comedy, Romance : 22620 Roland Emmerich : 4260
## Drama : 19494 Michael Bay : 4158
## Comedy : 18045 Tony Scott : 4129
## Comedy, Drama : 13769 Richard Donner : 3876
## Drama, Romance : 11756 Garry Marshall : 3653
## (Other) :296627 (Other) :378946
## ACTORS
## Sandra Bullock, Michael Caine, Benjamin Bratt, Candice Bergen: 1562
## Mel Gibson, Danny Glover, Joe Pesci, Rene Russo : 1450
## Will Smith, Bill Pullman, Jeff Goldblum, Mary McDonnell : 1402
## Mel Gibson, Heath Ledger, Joely Richardson, Jason Isaacs : 1311
## Dennis Quaid, Jake Gyllenhaal, Emmy Rossum, Dash Mihok : 1297
## Johnny Depp, Geoffrey Rush, Orlando Bloom, Keira Knightley : 1282
## (Other) :399415
Single statistic evaluation, attach command
# Disabled example of a single statistic computed with explicit `$` access.
#min(ratings$USERID)
# attach() puts the columns of `ratings` on the search path so they can be
# referenced by bare name (RATING instead of ratings$RATING). Several plotting
# and correlation calls further down in this file rely on that side effect.
attach(ratings)
# Median of the RATING column, found through the attached data.
median(RATING)
## [1] 4
Histograms
# Histograms of the RATING and IMDBRATING columns. with() evaluates each call
# inside `ratings`, so the default titles and axis labels still show the bare
# column names.
with(ratings, hist(RATING))
with(ratings, hist(IMDBRATING, main = 'FLIX - IMDBRATING'))
# Scatter plot of MAVG against CNT.
with(ratings, plot(CNT, MAVG))
# Pearson correlation: E[(X-mu_X)(Y-mu_Y)]/(sigma_x.sigma_y)
with(ratings, cor.test(RATING, MAVG, method = 'pearson'))
##
## Pearson's product-moment correlation
##
## data: RATING and MAVG
## t = 239.02, df = 407720, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3478772 0.3532617
## sample estimates:
## cor
## 0.3505724
# Rank-based alternatives, left disabled:
# with(ratings, cor.test(RATING, MAVG, method = 'kendall'))   # (cond-dis)/(n(n-1)/2)
# with(ratings, cor.test(RATING, MAVG, method = 'spearman'))  # Pearson on ranks
# Pearson correlation: E[(X-mu_X)(Y-mu_Y)]/(sigma_x.sigma_y)
with(ratings, cor.test(RATING, MAVG, method = 'pearson'))
##
## Pearson's product-moment correlation
##
## data: RATING and MAVG
## t = 239.02, df = 407720, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3478772 0.3532617
## sample estimates:
## cor
## 0.3505724
# Rank-based alternatives, left disabled:
#cor.test(RATING,MAVG,method='kendall') #(cond-dis)/(n(n-1)/2)
#cor.test(RATING,MAVG,method='spearman')# Pearson on ranks
# Scatter-plot matrix of three numeric columns.
pairs(~ RATING + IMDBRATING + CNT, data = ratings)
# One box per RATING value.
boxplot(IMDBRATING ~ RATING, data = ratings,
        xlab = 'RATING', ylab = 'IMDBRATING')
boxplot(MAVG ~ RATING, data = ratings,
        xlab = 'RATING', ylab = 'MAVG', main = "MAVG vs. RATING")
# Overlay the least-squares line of MAVG on RATING on the box plot just drawn
# and print its slope in the bottom margin.
linModel <- lm(MAVG ~ RATING, data = ratings)
abline(coef(linModel), col = 'blue')
mtext(coef(linModel)[2], side = 1, col = 'blue')
# `text.cex` is not a graphical parameter (it only produced warnings);
# `cex.axis` is the one that shrinks the rotated genre tick labels.
boxplot(RATING ~ GENRE1, data = ratings, ylab = 'RATING', xlab = '',
        main = 'RATING vs. GENRE', las = 2, cex.axis = 0.5)
Multiple figures, for loop
# Draw one IMDBRATING histogram per RATING value (1 to 5) on a 2 x 3 grid.
# par() returns the settings it replaces; they are restored after the loop so
# that later plots are not squeezed into the leftover cell of this grid.
old_par <- par(mfrow = c(2, 3))
for (i in 1:5) {
  # Rows are selected through ratings$RATING, so this works whether or not
  # `ratings` is attached.
  hist(ratings[ratings$RATING == i, 'IMDBRATING'],
       xlab = 'IMDBRATING', main = paste0('RATING==', i))
}
par(old_par)
3D plot (advanced: sorting, table to get the data)
# Sorted distinct values of the two columns form the x and y grids.
# The variable names are kept as they are: persp() builds its default axis
# labels from the argument expressions.
imdb_range <- sort(unique(ratings$IMDBRATING))
rating_range <- sort(unique(ratings$RATING))
# Cross-tabulation: number of rows for every (IMDBRATING, RATING) pair.
tab_rat <- with(ratings, table(IMDBRATING, RATING))
# Disabled 2-D alternatives:
# image(imdb_range, rating_range, tab_rat)
# contour(imdb_range, rating_range, tab_rat, nlevels = 45, add = TRUE)
# Perspective surface of the counts, viewed from azimuth 40 and colatitude 20.
persp(imdb_range, rating_range, tab_rat, theta = 40, phi = 20)
Aggregation with tapply command
# Group the RATING values by USERID and take the size of each group, i.e. the
# number of ratings recorded for every user.
ucounts <- with(ratings, tapply(RATING, USERID, FUN = length))
# Distribution of those per-user counts.
hist(ucounts,
     main = "# Rated Movies per User",
     xlab = '#ratings',
     ylab = '#users with the #ratings')
Dvacet nejsledovanějších režisérů (the twenty most-watched directors)
# Rows that have a director recorded.
not.null <- !is.na(ratings$DIRECTOR)
# Number of ratings per director, counted on those rows only.
ta <- with(ratings[not.null, ], tapply(dRATING, DIRECTOR, FUN = length))
# Drop the groups tapply() left as NA, order by count (largest first) and
# print the first twenty entries.
sort(ta[!is.na(ta)], decreasing = TRUE)[seq_len(20)]
## Steven Spielberg Roland Emmerich Michael Bay Tony Scott
## 8697 4260 4158 4129
## Richard Donner Garry Marshall Steven Soderbergh Robert Zemeckis
## 3876 3653 3412 3374
## Tom Shadyac Donald Petrie Rob Reiner Clint Eastwood
## 3148 3048 3045 2979
## Tim Burton Ridley Scott Wolfgang Petersen Jay Roach
## 2870 2861 2713 2694
## Simon West Ron Howard John Woo Peter Segal
## 2652 2649 2619 2508
# Per-genre view of one user's rows: box plots of MAVG for each GENRE1 level,
# with the individual points coloured by the RATING the user gave.
users <- unique(ratings[, 'USERID'])
# Position of the inspected user within `users`. Previously this assignment
# was hidden inside the subscript (`users[uid<-16]`).
uid <- 16
sel <- ratings$USERID == users[uid]
#sel=TRUE
# `text.cex` is not a graphical parameter (it only produced warnings);
# `cex.axis` is the one that shrinks the rotated genre tick labels.
plot(as.factor(ratings[sel, 'GENRE1']), ratings[sel, 'MAVG'],
     ylim = c(0, 5), las = 2, cex.axis = 0.5,
     main = paste0('User ', users[uid]))
points(as.factor(ratings[sel, 'GENRE1']), ratings[sel, 'MAVG'],
       col = ratings[sel, 'RATING'])
# Colour key: the legend text for rating k is drawn in colour k.
legend('topleft', legend = 1:5, text.col = 1:5, cex = 0.5)
# Cluster the rows of `to.clust` with k-means and display them in the plane of
# the first two principal components of the cluster centres.
load(file = 'toclustf.RData')
# Every column of `to.clust` except the first is used as a feature.
ngens <- ncol(to.clust) - 1
genre_cols <- 2:(ngens + 1)
genres <- colnames(to.clust)[genre_cols]
optClust <- 4
# Fixed seed so the random k-means starts (and the jitter below) repeat.
set.seed(2)
km.o <- kmeans(to.clust[, genre_cols], optClust, nstart = 2)
# The PCA is fitted on the cluster centres only; `scale.` is spelled out
# instead of relying on partial matching of `scale`.
pr.out <- prcomp(km.o$centers, scale. = TRUE)
# Label each cluster with the column that has the largest value in its centre.
name.index <- apply(km.o$centers, 1, which.max)
clust.names <- vapply(name.index,
                      FUN = function(x) colnames(km.o$centers)[x],
                      FUN.VALUE = character(1))
# Project at most the first 1000 rows. The cap is clamped to the number of
# rows available, and the colours are subset to the same rows instead of
# relying on plot() ignoring the surplus entries of a longer vector.
n_shown <- min(1000, nrow(to.clust))
mm <- predict(pr.out, newdata = to.clust[seq_len(n_shown), genre_cols])
plot(jitter(mm[, 1:2]), col = km.o$cluster[seq_len(n_shown)], pch = '.',
     xlab = "1. principal component", ylab = "2. PC",
     main = "User Clustering acc. to Genre (f)",
     xlim = c(-20, 20), ylim = c(-10, 10), cex = 1.3)
# Write each cluster's label at the position of its centre.
text(pr.out$x[, 1:2], col = 1:optClust, labels = clust.names, cex = 1.0)
Clustering
# Profile plot: one line per cluster centre across all columns of the centres
# matrix. The first cluster sets up the axes; the rest are added in the loop.
plot(km.o$centers[1, ], type = 'b', col = 1, pch = 2, ylim = c(0, 0.3),
     xaxt = 'n', main = 'Clusters for Genres (Flix)',
     xlab = 'genre', ylab = "popularity")
labs <- colnames(km.o$centers)
# `cex.axis` (not `cex`) is the parameter that scales axis tick labels.
axis(1, at = 0:length(labs), labels = c(0, labs), las = 2, cex.axis = 0.5)
grid(length(labs) / 2 + 1)
# seq_len(optClust)[-1] is empty for a single cluster, whereas 2:optClust
# would count down to 1 and index a centre that does not exist.
for (g in seq_len(optClust)[-1]) {
  points(km.o$centers[g, ], type = 'b', col = g, pch = g + 1)
}
# Disabled: total within-cluster sum of squares per cluster count.
# NOTE(review): refers to `nclust` and indexes `km.o` as a list of fits,
# neither of which is set up in the code above.
#plot(2:nclust,sapply(2:nclust,FUN=function(x)km.o[[x]]$tot.withinss ))