-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSTEM_Imp_VI.R
193 lines (150 loc) · 7.03 KB
/
STEM_Imp_VI.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
source("Imputation_VI_Functions.R")
<<<<<<< HEAD
=======
library(randomForest)
library(randomForestSRC)
library(CALIBERrfimpute)
library(mice)
library(MASS)
>>>>>>> dada33f66033f7caf3341265e7f78ed3b010401d
STEM <- readRDS("STEM1.rds") #Include all students, not only those who stayed at ISU
STEM <- STEM[STEM$Year==2016, ]
STEM$`Learning Community Participation` <- as.numeric(STEM$`Learning Community Participation`!=0)
STEM <- STEM[, c(3, 5, 6, 26, 27, 30, 31, 32, 40, 47, 50, 54, 55, 56, 57, 62)]
varnames <- c("ACT",
"GPA",
"Gender",
"Athlete",
"Age",
"ALEKS",
"LC",
"STEM Interest",
"Remedial Math",
"Parent Education",
"Hrs. Studying*",
"Academic Skills*",
"Social Integration*",
"Peer Connections*",
"Self Efficacy*",
"Class" )
names(STEM) <- c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "y")
Propmiss <- function(x){
sum(is.na(x)/length(x))
}
round(apply(STEM, 2, Propmiss),3)
#This function does the imputation and VI once. Apply it nmult times for multiple imputation, as is done in Impute_VI
CaliberVImvt <- function(x, y){
vecdata <- unlist(c(x))
ImputedX <-x #first set imputed dataset equal to one with missing values then fill missing values
ImputeInfo <- mice(ImputedX, method = c('rfcont'), m = 1, maxit = 5)
Imputedvals <- unlist(ImputeInfo$imp)
vecdata[is.na(vecdata)] <- Imputedvals
ImputedX <- data.frame(matrix(vecdata, nrow=nrow(x), ncol=ncol(x), byrow=FALSE))
names(ImputedX) <- names(x)
Imputed <- cbind(ImputedX, y)
rfMiss=rfsrc(y~., data <- Imputed, ntree=500,importance=TRUE)
if(is.factor(y)){VIvec <- rfMiss$importance[,1]}else{VIvec <- rfMiss$importance} #If y is a factor, permutation importance is given in 3rd col. Otherwise first
return(VIvec)
}
#Function to impute missing values and compute variable importance using Doove's method
#This function does the imputation and VI once. Apply it nmult times for multiple imputation, as is done in Impute_VI
miceVImvt <- function(x,y){
vecdata <- unlist(c(x))
ImputeInfo <- mice(data=x, m=1, method="rf")
Imputedvals <- unlist(ImputeInfo$imp)
vecdata[is.na(vecdata)] <- Imputedvals
ImputedX <- data.frame(matrix(vecdata, nrow=nrow(x), ncol=ncol(x), byrow=FALSE))
names(ImputedX) <- names(x)
Imputed <- cbind(ImputedX, y)
rfMiss=rfsrc(y~., data <- Imputed, ntree=500,importance=TRUE)
if(is.factor(y)){VIvec <- rfMiss$importance[,1]}else{VIvec <- rfMiss$importance} #If y is a factor, permutation importance is given in 3rd col. Otherwise first
return(VIvec)
}
ntechs <- 10
nreps <- 10
x <- STEM[,-ncol(STEM)]
y <- STEM[,ncol(STEM)]
VI <- array(NA, dim=c(nreps, ncol(x), ntechs)) #rows correspond to variables, columns to imputation techniques
set.seed(12062019)
for (it in 1:nreps){
#Strawman-median imputation
tech <- 1
Imputed <- cbind(na.roughfix(x), y)
rfMiss <- rfsrc(y~., data=Imputed, ntree=500,importance=TRUE)
if(is.factor(y)){VI[it,,tech]<-rfMiss$importance[,1]}else{VI[it,,tech]<-rfMiss$importance}
#rfimpute
tech <- 2
Imputed <- rfImpute(x, y, iter=5)[,c(2:(ncol(x)+1),1)]
rfMiss <- rfsrc(y~., data=Imputed, ntree=500,importance=TRUE)
if(is.factor(y)){VI[it,,tech]<-rfMiss$importance[,1]}else{VI[it,,tech]<-rfMiss$importance}
#Impute using missForest
tech <- 3
ImputedX <- impute(data = x, mf.q = 1/ncol(x))
Imputed <- cbind(ImputedX, y)
rfMiss <- rfsrc(y~., data=Imputed, ntree=500,importance=TRUE)
if(is.factor(y)){VI[it,,tech]<-rfMiss$importance[,1]}else{VI[it,,tech]<-rfMiss$importance} #If y is a factor, permutation importance is given in 1st col. Otherwise just a vector
#Impute using RFSRC-1 iteration
tech <- 4
rfMiss <- rfsrc(y~., data=STEM, ntree=500,importance=c("permute"), na.action=c("na.impute"), nimpute=1)
if(is.factor(y)){VI[it,,tech]<-rfMiss$importance[,1]}else{VI[it,,tech]<-rfMiss$importance} #If y is a factor, permutation importance is given in 1st col. Otherwise just a vector
#Impute using RFSRC-5 iterations
tech <- 5
rfMiss <- rfsrc(y~., data=STEM, ntree=500,importance=c("permute"), na.action=c("na.impute"), nimpute=5)
if(is.factor(y)){VI[it,,tech]<-rfMiss$importance[,1]}else{VI[it,,tech]<-rfMiss$importance} #If y is a factor, permutation importance is given in 1st col. Otherwise just a vector
# RFSRC unsupervised - 1 iteration
tech <- 6
ImputedX <- impute(data = x, nimpute = 1)
Imputed <- cbind(ImputedX, y)
rfMiss <- rfsrc(y~., data=Imputed, ntree=500,importance=TRUE)
if(is.factor(y)){VI[it,,tech]<-rfMiss$importance[,1]}else{VI[it,,tech]<-rfMiss$importance} #If y is a factor, permutation importance is given in 3rd col. Otherwise first
# RFSRC unsupervised - 5 iterations
tech <- 7
ImputedX <- impute(data = x, nimpute = 5)
Imputed <- cbind(ImputedX, y)
rfMiss <- rfsrc(y~., data=Imputed, ntree=500,importance=TRUE)
if(is.factor(y)){VI[it,,tech]<-rfMiss$importance[,1]}else{VI[it,,tech]<-rfMiss$importance} #If y is a factor, permutation importance is given in 3rd col. Otherwise first
#Impute using CALIBER
#since this is a multiple imputation technique, perform nmult times then average VI
tech <- 8
VImat <- replicate(n=5, CaliberVImvt(x,y))
VImat1 <- apply(VImat, 2, function(x){x/sum(x)}) #scale so each rep counts equally
VI[it,,tech] <- rowMeans(VImat)
#Impute using mice
#since this is a multiple imputation technique, perform nmult times then average VI
tech <- 9
VImat <- replicate(n=5, miceVImvt(x,y))
VImat1 <- apply(VImat, 2, function(x){x/sum(x)}) #scale so each rep counts equally
VI[it,,tech] <- rowMeans(VImat)
#Complete Cases
tech <- 10
STEMsubset <- STEM[complete.cases(STEM),]
RF <- rfrsc(y~., data=STEMsubset, ntree=500,importance=TRUE)
VI[it,,tech] <- RF$importance/sum(RF$importance)
print(it)
}
###############################################
VI <- readRDS("STEMVIRes.rds")
#1st index is for repetition
#2nd index is for variable
#3rd index is for imputation technique
VI[VI<0]<-0
for (i in 1:10){
VI[i,,] <- apply(VI[i,,], 2, function(x){x/sum(x)}) #scale so each rep counts equally
}
MeanVI <- apply(VI, c(2,3), mean) #average over reps
SEVI <- apply(VI, c(2,3), function(x){sd(x)/sqrt(length(x))})
round(MeanVI, 3)
round(MeanVI-qt(.975,10)*SEVI, 3)
round(MeanVI+qt(.975,10)*SEVI, 3)
#Setup dataframe to show results
Technique <- rep(1:10, each=15)
Variable <- rep(1:15, 10)
Importance <- c(MeanVI)
IMPdf <- data.frame(Technique, Variable, Importance)
ggplot(data=IMPdf, aes(x=Technique, y=Importance)) + geom_col() + facet_grid(. ~ Variable)
ggplot(data=IMPdf, aes(x=Variable, y=Importance)) + geom_col() + facet_grid(. ~ Technique)
STEM1 <- STEM[complete.cases(STEM),]
library(corrplot)
STEM2 <- STEM1[, c(1, 2, 5, 6, 11:15) ]
M <- cor(STEM2)
corrplot(M)