## load packages ############################################### library(randomForest) library(MASS) ## read training and test data ################################# train <- read.table(file="roe-train.txt") test <- read.table(file="roe-test.txt") ## training events (remove bad events) ######################## train <- train[-nrow(train),] X <- train[,1:50] rem <- which(apply(X,1,sd)==0) X <- X[-rem,] Y <- train[-rem,51] ## test events (again remove bad events) ####################### Xt <- test[,1:50] rem <- which(apply(Xt,1,sd)==0) Xt <- Xt[-rem,] Yt <- test[-rem,51] ## Random Forest on a reduced size of events ##################### ind <- sample(nrow(X),800,replace=FALSE) Xn <- X[ind,] Yn <- Y[ind] # fit random forest object rfn <- randomForest(Xn,as.factor(Yn)) # predict test cases Yp <- predict(rfn,newdata=Xt,type="prob")[,2] par(mfrow=c(2,1)) # plot par(mfrow=c(2,1)) hist(Yp[Yt==0],40,col=2,main="background") hist(Yp[Yt==1],40,col=3,main="signal") print(mean(Yp[Yt==0]>median(Yp[Yt==1]))) ## proximity matrix ############################################## ind <- sample(nrow(X),4000,replace=FALSE) Xr <- X[ind,] Yr <- Y[ind] # fit random forest rf <- randomForest(Xr,as.factor(Yr),replace=FALSE,proximity=TRUE,importance=TRUE) load(file="mds.rda") # plot variable importance varImpPlot(rf) # Multi Dimensional Scaling mds <- cmdscale(1-rf$proximity,eig=TRUE) par(mfrow=c(1,1)) plot(mds$points,col=as.numeric(predict(rf))+1,cex=0.5,pch=4,main="predicted signal green, predicted background red") X11() plot(mds$points,col=Yr+2,cex=0.5,pch=4,main="signal green, background red") ### LDA ########################################################### ld <- lda(Yr ~ ., data=Xr) Yp <- predict( ld, newdata=Xt ) Ypp <- Yp$posterior[,2] # plot par(mfrow=c(2,1)) hist(Ypp[Yt==0],40,col=2,main="background") hist(Ypp[Yt==1],40,col=3,main="signal") ## LOGISTIC REGRESSION ######################################## gl <- glm(Yr ~ ., data=Xr,family="binomial") Yp <- predict(gl,newdata=Xt,type="response") # plot hist(Yp[Yt==0],40,col=2,main="background") hist(Yp[Yt==1],40,col=3,main="signal")