taitanic004.r

yamamoto — Feb 9, 2014, 12:30 PM

# Titanic

rm(list=ls(all=TRUE))

# load file
    filepath <- "C:/R/data/train_mv.csv"
    X0 <- read.csv(filepath)
    attach(X0)

# data
    Sex <- as.numeric(Sex)
    X1 <- cbind(Sex,Age,Pclass,SibSp,Parch,Fare)
  Y1 <- X0$Survived
    X1 <- data.frame(X1)
    X1_ALL <- data.frame(X1,Y1)

# Logistic regression
  x0 <- glm(as.factor(Y1)~Sex+Age+Pclass+SibSp+Parch+Fare,data=X1_ALL, family=binomial(link=logit))

# variable selection(AIC)
  x_AIC <- step(x0)
Start:  AIC=799.4
as.factor(Y1) ~ Sex + Age + Pclass + SibSp + Parch + Fare

         Df Deviance  AIC
- Parch   1      786  798
- Fare    1      787  799
<none>           785  799
- SibSp   1      799  811
- Age     1      817  829
- Pclass  1      849  861
- Sex     1     1026 1038

Step:  AIC=798.2
as.factor(Y1) ~ Sex + Age + Pclass + SibSp + Fare

         Df Deviance  AIC
- Fare    1      787  797
<none>           786  798
- SibSp   1      803  813
- Age     1      817  827
- Pclass  1      853  863
- Sex     1     1030 1040

Step:  AIC=797.4
as.factor(Y1) ~ Sex + Age + Pclass + SibSp

         Df Deviance  AIC
<none>           787  797
- SibSp   1      803  811
- Age     1      819  827
- Pclass  1      902  910
- Sex     1     1037 1045

# Cross-validation, ROC
  library(Epi)

Attaching package: 'Epi'

 以下のオブジェクトはマスクされています (from 'package:base') : 

     merge.data.frame 

  cross <- 10 # 10-fold CV
  pp <- NaN;AUC <- NaN;
  for(i in 1:cross){
    x <- glm(x_AIC$formula,data=X1_ALL[-seq(i,nrow(X1_ALL),cross),], family=binomial(link=logit))
    pp[seq(i,nrow(X1_ALL),cross)] <- predict(x,X1_ALL[seq(i,nrow(X1_ALL),cross),], type="response")
  }
  ROC(test=pp,stat=Y1)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1


# clear
    detach(X0)
  rm(Sex)

# test data
    filepath_test <- "C:/R/data/test_mv.csv"
    X1_test <- read.csv(filepath_test)
    attach(X1_test)

    # data
    Sex <- as.numeric(Sex)
    X_test <- cbind(Sex,Age,Pclass,SibSp,Parch,Fare)
    X2_ALL <- data.frame(X_test)

    # prediction
    qqq1 <- predict(x_AIC,X2_ALL, type="response")
  qqq2 <- as.numeric(qqq1>=0.5)