Tiny introduction to R: assignment, list of variables

# Bind the result of an arithmetic expression to the name `a`.
a <- 2 + 3
# A bare name at top level prints the value bound to it.
a
## [1] 5
# ls() returns the names of the objects currently defined in the workspace.
ls()
## [1] "a"

Load a dataset, print summary

# Restore the objects stored in the .RData file into the workspace.
# NOTE(review): the path is absolute and machine-specific -- point it at
# wherever exercises.RData lives before running this.
# Presumably the file is what provides the `ratings` data set summarised below.
load(file = "C:/Users/Marta/Desktop/tirpitz/project/exercises.RData")
# Column-by-column overview of `ratings` (output shown below).
summary(object = ratings)
##       TT                USERID           MOVIEID          RATING     
##  Length:407719      Min.   :    195   Min.   :    6   Min.   :1.000  
##  Class :character   1st Qu.: 663789   1st Qu.: 4736   1st Qu.:3.000  
##  Mode  :character   Median :1343182   Median : 9204   Median :4.000  
##                     Mean   :1330793   Mean   : 9184   Mean   :3.622  
##                     3rd Qu.:2000661   3rd Qu.:13827   3rd Qu.:4.000  
##                     Max.   :2648758   Max.   :17770   Max.   :5.000  
##                                                                      
##        TS                   K              MAVG            CNT        
##  Min.   :1999-12-30   Min.   :  1.0   Min.   :1.000   Min.   :   1.0  
##  1st Qu.:2004-07-03   1st Qu.: 43.0   1st Qu.:3.395   1st Qu.: 128.0  
##  Median :2005-02-16   Median :107.0   Median :3.625   Median : 341.0  
##  Mean   :2004-11-19   Mean   :144.5   Mean   :3.621   Mean   : 415.1  
##  3rd Qu.:2005-07-24   3rd Qu.:214.0   3rd Qu.:3.867   3rd Qu.: 640.0  
##  Max.   :2005-12-31   Max.   :598.0   Max.   :5.000   Max.   :1398.0  
##                                                                       
##     BAYESAVG       IMDBRATING          GENRE1           YEAR          
##  Min.   :2.820   Min.   :1.900   Comedy   :126327   Length:407719     
##  1st Qu.:3.460   1st Qu.:6.400   Action   :101030   Class :character  
##  Median :3.627   Median :7.000   Drama    : 70412   Mode  :character  
##  Mean   :3.646   Mean   :6.956   Crime    : 32369                     
##  3rd Qu.:3.824   3rd Qu.:7.600   Biography: 21005                     
##  Max.   :4.460   Max.   :9.200   Adventure: 20485                     
##                                  (Other)  : 36091                     
##      USHIFT            USHIFTB               UGENRE       GENREMATCH     
##  Min.   :-2.02088   Min.   :-1.10638   Comedy   :245614   Mode :logical  
##  1st Qu.:-0.29887   1st Qu.:-0.23742   Action   :134210   FALSE:258404   
##  Median :-0.03586   Median :-0.02675   Drama    : 26010   TRUE :149315   
##  Mean   :-0.02362   Mean   :-0.02277   Crime    :   689                  
##  3rd Qu.: 0.23167   3rd Qu.: 0.17260   Animation:   508                  
##  Max.   : 1.43803   Max.   : 1.06262   Horror   :   480                  
##                                        (Other)  :   208                  
##  dRATING   
##  1: 16362  
##  2: 39230  
##  3:118080  
##  4:142502  
##  5: 91545  
##            
##            
##                                                      NAME       
##  Miss Congeniality                                     :  1562  
##  Independence Day                                      :  1402  
##  The Patriot                                           :  1342  
##  The Day After Tomorrow                                :  1297  
##  Pirates of the Caribbean: The Curse of the Black Pearl:  1282  
##  Pretty Woman                                          :  1275  
##  (Other)                                               :399559  
##        RATED                           AWARDS         METASCORE     
##  R        :165688                         : 23193   Min.   :  1.00  
##  PG-13    :150684   1 nomination.         : 12989   1st Qu.: 47.00  
##  PG       : 63498   2 nominations.        : 10325   Median : 61.00  
##  G        :  8183   3 nominations.        :  8316   Mean   : 59.63  
##  APPROVED :  7660   1 win & 2 nominations.:  7656   3rd Qu.: 72.00  
##  NOT RATED:  4635   1 win & 3 nominations.:  5975   Max.   :100.00  
##  (Other)  :  7371   (Other)               :339265   NA's   :69930   
##     IMDBYEAR              COUNTRY                   LANGUAGE     
##  Min.   :1915   USA           :279647   English         :241914  
##  1st Qu.:1992   USA, Germany  : 24080   English, Spanish: 21559  
##  Median :1999   UK, USA       : 11028   English, French : 15661  
##  Mean   :1995   USA, Australia:  9951   English, Italian:  7239  
##  3rd Qu.:2003   USA, UK       :  9663   English, Russian:  6133  
##  Max.   :2005   USA, Canada   :  6934   English, German :  4966  
##                 (Other)       : 66416   (Other)         :110247  
##                     GENRES                   DIRECTOR     
##  Comedy, Drama, Romance: 25408   Steven Spielberg:  8697  
##  Comedy, Romance       : 22620   Roland Emmerich :  4260  
##  Drama                 : 19494   Michael Bay     :  4158  
##  Comedy                : 18045   Tony Scott      :  4129  
##  Comedy, Drama         : 13769   Richard Donner  :  3876  
##  Drama, Romance        : 11756   Garry Marshall  :  3653  
##  (Other)               :296627   (Other)         :378946  
##                                                            ACTORS      
##  Sandra Bullock, Michael Caine, Benjamin Bratt, Candice Bergen:  1562  
##  Mel Gibson, Danny Glover, Joe Pesci, Rene Russo              :  1450  
##  Will Smith, Bill Pullman, Jeff Goldblum, Mary McDonnell      :  1402  
##  Mel Gibson, Heath Ledger, Joely Richardson, Jason Isaacs     :  1311  
##  Dennis Quaid, Jake Gyllenhaal, Emmy Rossum, Dash Mihok       :  1297  
##  Johnny Depp, Geoffrey Rush, Orlando Bloom, Keira Knightley   :  1282  
##  (Other)                                                      :399415

Single statistic evaluation, attach command

# min(ratings$USERID)  # columns can always be reached through `ratings$...`
# attach() puts the columns of `ratings` on the search path, so that they can
# be referred to by bare name (RATING, MAVG, ...) in the statements that follow.
attach(what = ratings)
median(x = RATING)
## [1] 4

Histograms

# Histogram of the user ratings, with the default title.
hist(x = RATING)

# Histogram of the IMDb ratings, with an explicit title.
hist(x = IMDBRATING, main = "FLIX - IMDBRATING")

# Scatter plot of MAVG (vertical axis) against CNT (horizontal axis).
plot(x = CNT, y = MAVG)

# Pearson correlation: E[(X - mu_X)(Y - mu_Y)] / (sigma_X * sigma_Y)
cor.test(x = RATING, y = MAVG, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  RATING and MAVG
## t = 239.02, df = 407720, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3478772 0.3532617
## sample estimates:
##       cor 
## 0.3505724
# Kendall: (concordant - discordant) / (n (n - 1) / 2)
# cor.test(RATING, MAVG, method = "kendall")
# Spearman: Pearson correlation computed on ranks
# cor.test(RATING, MAVG, method = "spearman")

# Pearson correlation: E[(X - mu_X)(Y - mu_Y)] / (sigma_X * sigma_Y)
cor.test(x = RATING, y = MAVG, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  RATING and MAVG
## t = 239.02, df = 407720, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3478772 0.3532617
## sample estimates:
##       cor 
## 0.3505724
# Kendall: (concordant - discordant) / (n (n - 1) / 2)
# cor.test(RATING, MAVG, method = "kendall")
# Spearman: Pearson correlation computed on ranks
# cor.test(RATING, MAVG, method = "spearman")

# Scatter-plot matrix of the three variables named in the formula.
pairs(~ RATING + IMDBRATING + CNT, data = ratings)

# IMDBRATING distribution for each RATING value.
boxplot(
  IMDBRATING ~ RATING,
  data = ratings,
  xlab = "RATING",
  ylab = "IMDBRATING"
)

# The same view for MAVG, with a fitted straight line drawn on top.
boxplot(
  MAVG ~ RATING,
  data = ratings,
  main = "MAVG vs. RATING",
  xlab = "RATING",
  ylab = "MAVG"
)
linModel <- lm(MAVG ~ RATING, data = ratings)
# abline() reads a two-element coefficient vector as (intercept, slope).
abline(coef(linModel), col = "blue")
# Write the slope (second coefficient) into the bottom margin (side 1).
mtext(coef(linModel)[2], side = 1, col = "blue")

# RATING distribution per primary genre. The genre names are long, so the tick
# labels are drawn perpendicular to the axis (las = 2) and at half size.
# `cex.axis` is the graphical parameter that scales axis annotation; the
# previously used `text.cex` is not a graphical parameter, so the labels were
# never actually shrunk.
boxplot(
  RATING ~ GENRE1,
  data = ratings,
  ylab = "RATING",
  xlab = "",
  main = "RATING vs. GENRE",
  las = 2,
  cex.axis = 0.5
)

Multiple figures, for loop

# One IMDBRATING histogram per RATING value (1..5), laid out on a 2 x 3 grid.
# par() returns the settings it replaces; they are restored after the loop so
# that later plots are not squeezed into the leftover grid cells.
old_par <- par(mfrow = c(2, 3))
for (i in 1:5) {
  hist(
    # Reference the column through `ratings` so the row filter and the data
    # frame being subset are guaranteed to be the same object.
    ratings[ratings$RATING == i, "IMDBRATING"],
    xlab = "IMDBRATING",
    main = paste0("RATING==", i)
  )
}
par(old_par)

3D plot (advanced: sorting, table to get the data)

# Axis values: the distinct IMDBRATING and RATING values in ascending order.
imdb_range <- sort(unique(IMDBRATING))
rating_range <- sort(unique(RATING))
# Cross-tabulation: how many rows fall on each (IMDBRATING, RATING) pair.
tab_rat <- table(IMDBRATING, RATING)
# Alternative 2D views of the same table:
# image(imdb_range, rating_range, tab_rat)
# contour(imdb_range, rating_range, tab_rat, nlevels = 45, add = TRUE)
# Perspective surface of the counts; theta and phi set the viewing angles.
persp(x = imdb_range, y = rating_range, z = tab_rat, theta = 40, phi = 20)

Aggregation with tapply command

# Number of ratings per user: split RATING by USERID and take each group's
# length.
ucounts <- tapply(X = RATING, INDEX = USERID, FUN = length)
# Distribution of those per-user counts.
hist(
  x = ucounts,
  main = "# Rated Movies per User",
  xlab = "#ratings",
  ylab = "#users with the #ratings"
)

The twenty most-watched directors

# Rows that actually have a director recorded.
not.null <- !is.na(ratings$DIRECTOR)
# Number of ratings per director, counted over those rows only.
ta <- tapply(
  X = ratings$dRATING[not.null],
  INDEX = ratings$DIRECTOR[not.null],
  FUN = length
)
# Drop NA counts, order from largest to smallest, and show the first twenty.
sort(ta[!is.na(ta)], decreasing = TRUE)[seq_len(20)]
##  Steven Spielberg   Roland Emmerich       Michael Bay        Tony Scott 
##              8697              4260              4158              4129 
##    Richard Donner    Garry Marshall Steven Soderbergh   Robert Zemeckis 
##              3876              3653              3412              3374 
##       Tom Shadyac     Donald Petrie        Rob Reiner    Clint Eastwood 
##              3148              3048              3045              2979 
##        Tim Burton      Ridley Scott Wolfgang Petersen         Jay Roach 
##              2870              2861              2713              2694 
##        Simon West        Ron Howard          John Woo       Peter Segal 
##              2652              2649              2619              2508
# Per-genre view of a single user's rows: box plots of MAVG per GENRE1, with
# the individual points overlaid and coloured by the RATING the user gave.
users <- unique(ratings[, "USERID"])
# Position (within `users`) of the user to display.
uid <- 16
sel <- ratings$USERID == users[uid]
# sel <- TRUE  # uncomment to plot all rows instead of one user
# `cex.axis` shrinks the rotated genre labels; the previously used `text.cex`
# is not a graphical parameter and therefore had no effect.
plot(
  as.factor(ratings[sel, "GENRE1"]),
  ratings[sel, "MAVG"],
  ylim = c(0, 5),
  las = 2,
  cex.axis = 0.5,
  main = paste0("User ", users[uid])
)
points(
  as.factor(ratings[sel, "GENRE1"]),
  ratings[sel, "MAVG"],
  col = ratings[sel, "RATING"]
)
# Colour key: the legend text for rating k is drawn in colour k.
legend("topleft", legend = 1:5, text.col = 1:5, cex = 0.5)

# Load the clustering input from the working directory.
# Presumably this file is what provides `to.clust` -- TODO confirm.
load(file = "toclustf.RData")
# Every column except the first is used as a clustering feature; the feature
# column names are kept as the genre names.
ngens <- ncol(to.clust) - 1
genres <- colnames(to.clust)[2:(ngens + 1)]
# Number of clusters to fit.
optClust <- 4
# Fix the RNG seed so that the k-means result is reproducible.
set.seed(2)
km.o <- kmeans(
  x = to.clust[, 2:(ngens + 1)],
  centers = optClust,
  nstart = 2
)

# Principal components of the (scaled) cluster centres; the argument is spelled
# `scale.` in full rather than relying on partial matching of `scale`.
pr.out <- prcomp(km.o$centers, scale. = TRUE)
# Name each cluster after the column in which its centre is largest.
name.index <- apply(km.o$centers, 1, which.max)
clust.names <- vapply(
  name.index,
  function(x) colnames(km.o$centers)[x],
  character(1)
)
# Only the first rows (at most 1000) are projected and drawn. The same index is
# used for the colours so that every point is paired with its own cluster
# assignment, and so that data sets with fewer than 1000 rows still work.
n_shown <- seq_len(min(1000, nrow(to.clust)))
mm <- predict(pr.out, newdata = to.clust[n_shown, 2:ncol(to.clust)])
plot(
  jitter(mm[, 1:2]),
  col = km.o$cluster[n_shown],
  pch = ".",
  xlab = "1. principal component",
  ylab = "2. PC",
  main = "User Clustering acc. to Genre (f)",
  xlim = c(-20, 20),
  ylim = c(-10, 10),
  cex = 1.3
)
# Write each cluster's name at the position of its centre in PC space.
text(pr.out$x[, 1:2], col = 1:optClust, labels = clust.names, cex = 1.0)

Clustering

# Profile of every cluster centre across the feature columns: the first centre
# sets up the plot, the remaining ones are added on top, each with its own
# colour and plotting symbol.
plot(
  km.o$centers[1, ],
  type = "b",
  col = 1,
  pch = 2,
  ylim = c(0, 0.3),
  xaxt = "n",  # the x axis is drawn by hand below
  main = "Clusters for Genres (Flix)",
  xlab = "genre",
  ylab = "popularity"
)
labs <- colnames(km.o$centers)
# Tick labels are the column names, drawn perpendicular to the axis. Their size
# is controlled by `cex.axis`; a plain `cex` does not scale axis annotation.
axis(1, at = 0:length(labs), labels = c(0, labs), las = 2, cex.axis = 0.5)
grid(length(labs) / 2 + 1)
# seq_len(optClust)[-1] is 2, 3, ..., optClust and is empty for a single
# cluster, whereas 2:optClust would count down (2, 1) in that case.
for (g in seq_len(optClust)[-1]) {
  points(km.o$centers[g, ], type = "b", col = g, pch = g + 1)
}

#plot(2:nclust,sapply(2:nclust,FUN=function(x)km.o[[x]]$tot.withinss ))