yamamoto — Feb 9, 2014, 12:30 PM
# Titanic
rm(list=ls(all=TRUE))
# load file
filepath <- "C:/R/data/train_mv.csv"
X0 <- read.csv(filepath)
attach(X0)
# data
Sex <- as.numeric(Sex)
X1 <- cbind(Sex,Age,Pclass,SibSp,Parch,Fare)
Y1 <- X0$Survived
X1 <- data.frame(X1)
X1_ALL <- data.frame(X1,Y1)
# Logistic regression
x0 <- glm(as.factor(Y1)~Sex+Age+Pclass+SibSp+Parch+Fare,data=X1_ALL, family=binomial(link=logit))
# variable selection(AIC)
x_AIC <- step(x0)
Start: AIC=799.4
as.factor(Y1) ~ Sex + Age + Pclass + SibSp + Parch + Fare
Df Deviance AIC
- Parch 1 786 798
- Fare 1 787 799
<none> 785 799
- SibSp 1 799 811
- Age 1 817 829
- Pclass 1 849 861
- Sex 1 1026 1038
Step: AIC=798.2
as.factor(Y1) ~ Sex + Age + Pclass + SibSp + Fare
Df Deviance AIC
- Fare 1 787 797
<none> 786 798
- SibSp 1 803 813
- Age 1 817 827
- Pclass 1 853 863
- Sex 1 1030 1040
Step: AIC=797.4
as.factor(Y1) ~ Sex + Age + Pclass + SibSp
Df Deviance AIC
<none> 787 797
- SibSp 1 803 811
- Age 1 819 827
- Pclass 1 902 910
- Sex 1 1037 1045
# Cross-validation, ROC
library(Epi)
Attaching package: 'Epi'
以下のオブジェクトはマスクされています (from 'package:base') :
merge.data.frame
cross <- 10 # 10-fold CV
pp <- NaN;AUC <- NaN;
for(i in 1:cross){
x <- glm(x_AIC$formula,data=X1_ALL[-seq(i,nrow(X1_ALL),cross),], family=binomial(link=logit))
pp[seq(i,nrow(X1_ALL),cross)] <- predict(x,X1_ALL[seq(i,nrow(X1_ALL),cross),], type="response")
}
ROC(test=pp,stat=Y1)
# clear
detach(X0)
rm(Sex)
# test data
filepath_test <- "C:/R/data/test_mv.csv"
X1_test <- read.csv(filepath_test)
attach(X1_test)
# data
Sex <- as.numeric(Sex)
X_test <- cbind(Sex,Age,Pclass,SibSp,Parch,Fare)
X2_ALL <- data.frame(X_test)
# prediction
qqq1 <- predict(x_AIC,X2_ALL, type="response")
qqq2 <- as.numeric(qqq1>=0.5)