library(tidyverse)
# Read the `training_data` target from the targets pipeline store.
train <- targets::tar_read(training_data)

# Preview the first six rows of the training set.
head(train, n = 6L)
## # A tibble: 6 × 31
## Y Xb1 Xb2 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7 Xc8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 1 NA 0.997 -1.05 NA 0.177 1.98 1.15 NA -1.91
## 2 2 1 0 NA 0.235 -0.875 1.74 NA -1.02 -0.216 NA
## 3 0 1 0 -1.19 NA NA NA -1.87 0.322 -0.109 NA
## 4 5 0 0 0.215 1.51 0.117 -0.112 0.881 -1.50 0.708 1.06
## 5 0 1 0 -0.0831 1.48 -0.381 0.810 -1.80 -1.79 0.148 1.43
## 6 0 0 1 -0.944 0.114 -0.336 -1.03 1.60 0.0605 NA -1.06
## # … with 20 more variables: Xc9 <dbl>, Xc10 <dbl>, Xc11 <dbl>, Xc12 <dbl>,
## # Xc13 <dbl>, Xc14 <dbl>, Xc15 <dbl>, Xc16 <dbl>, Xc17 <dbl>, Xc18 <dbl>,
## # Xc19 <dbl>, Xc20 <dbl>, Xc21 <dbl>, Xc22 <dbl>, Xc23 <dbl>, Xc24 <dbl>,
## # Xc25 <dbl>, Xc26 <dbl>, Xc27 <dbl>, Xc28 <dbl>
# Per-column summary statistics, including the NA count of each column.
train %>%
  summary()
## Y Xb1 Xb2 Xc1
## Min. : 0.00 Min. :0.0000 Min. :0.0000 Min. :-2.42262
## 1st Qu.: 0.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:-0.77685
## Median : 2.00 Median :1.0000 Median :0.0000 Median :-0.08307
## Mean : 3.92 Mean :0.5326 Mean :0.4588 Mean :-0.04817
## 3rd Qu.: 6.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 0.74471
## Max. :25.00 Max. :1.0000 Max. :1.0000 Max. : 2.38876
## NA's :8 NA's :15 NA's :13
## Xc2 Xc3 Xc4 Xc5
## Min. :-2.22244 Min. :-2.65138 Min. :-2.994136 Min. :-2.611865
## 1st Qu.:-0.76658 1st Qu.:-0.75554 1st Qu.:-0.665072 1st Qu.:-0.793311
## Median : 0.06364 Median :-0.21222 Median : 0.008279 Median : 0.111528
## Mean : 0.06045 Mean :-0.04582 Mean :-0.009028 Mean : 0.001503
## 3rd Qu.: 0.67083 3rd Qu.: 0.58221 3rd Qu.: 0.662160 3rd Qu.: 0.643325
## Max. : 2.24748 Max. : 2.81411 Max. : 2.680929 Max. : 2.232133
## NA's :11 NA's :11 NA's :13 NA's :15
## Xc6 Xc7 Xc8 Xc9
## Min. :-2.33651 Min. :-2.3337 Min. :-3.71406 Min. :-2.62379
## 1st Qu.:-0.86994 1st Qu.:-0.7635 1st Qu.:-0.62434 1st Qu.:-0.46509
## Median :-0.00598 Median :-0.1548 Median : 0.14861 Median :-0.02001
## Mean : 0.03411 Mean :-0.1729 Mean : 0.03631 Mean : 0.03218
## 3rd Qu.: 0.80659 3rd Qu.: 0.5018 3rd Qu.: 0.76780 3rd Qu.: 0.63033
## Max. : 2.47438 Max. : 1.6673 Max. : 2.57667 Max. : 2.50040
## NA's :13 NA's :12 NA's :11 NA's :14
## Xc10 Xc11 Xc12 Xc13
## Min. :-2.17129 Min. :-2.97353 Min. :-2.021350 Min. :-2.11388
## 1st Qu.:-0.96595 1st Qu.:-0.68268 1st Qu.:-0.617916 1st Qu.:-0.53245
## Median : 0.02689 Median :-0.06923 Median :-0.014719 Median : 0.01151
## Mean :-0.02638 Mean :-0.08105 Mean : 0.000896 Mean : 0.08188
## 3rd Qu.: 0.83884 3rd Qu.: 0.59591 3rd Qu.: 0.649330 3rd Qu.: 0.74255
## Max. : 2.69826 Max. : 2.04709 Max. : 2.009133 Max. : 2.61611
## NA's :4 NA's :8 NA's :10 NA's :6
## Xc14 Xc15 Xc16 Xc17
## Min. :-3.822673 Min. :-2.33111 Min. :-2.2892 Min. :-2.07666
## 1st Qu.:-0.504155 1st Qu.:-0.75604 1st Qu.:-0.3268 1st Qu.:-0.75959
## Median : 0.190520 Median : 0.04651 Median : 0.1328 Median : 0.01229
## Mean : 0.007986 Mean : 0.01799 Mean : 0.1469 Mean : 0.03996
## 3rd Qu.: 0.536209 3rd Qu.: 0.81281 3rd Qu.: 0.6115 3rd Qu.: 0.73229
## Max. : 2.423473 Max. : 2.62742 Max. : 2.3000 Max. : 2.75841
## NA's :6 NA's :9 NA's :7 NA's :9
## Xc18 Xc19 Xc20 Xc21
## Min. :-1.963741 Min. :-2.557614 Min. :-2.3967 Min. :-2.14738
## 1st Qu.:-0.436281 1st Qu.:-0.714337 1st Qu.:-0.4963 1st Qu.:-0.56835
## Median : 0.009714 Median : 0.040600 Median : 0.3228 Median : 0.07179
## Mean : 0.075030 Mean : 0.004843 Mean : 0.1679 Mean : 0.10608
## 3rd Qu.: 0.678930 3rd Qu.: 0.709491 3rd Qu.: 0.7862 3rd Qu.: 0.90248
## Max. : 3.031002 Max. : 2.431430 Max. : 2.6652 Max. : 2.41865
## NA's :10 NA's :10 NA's :6 NA's :10
## Xc22 Xc23 Xc24 Xc25
## Min. :-2.27797 Min. :-2.45074 Min. :-3.32592 Min. :-2.30911
## 1st Qu.:-0.58014 1st Qu.:-0.81507 1st Qu.:-0.37031 1st Qu.:-0.66183
## Median : 0.04339 Median :-0.03349 Median : 0.26846 Median :-0.06358
## Mean : 0.04936 Mean :-0.09579 Mean : 0.06763 Mean :-0.05510
## 3rd Qu.: 0.73982 3rd Qu.: 0.66651 3rd Qu.: 0.70255 3rd Qu.: 0.52815
## Max. : 1.96397 Max. : 2.70206 Max. : 2.03726 Max. : 3.20248
## NA's :14 NA's :7 NA's :15 NA's :10
## Xc26 Xc27 Xc28
## Min. :-3.3002 Min. :-3.336178 Min. :-2.27482
## 1st Qu.:-0.6705 1st Qu.:-0.722137 1st Qu.:-0.64359
## Median :-0.1275 Median :-0.008069 Median : 0.11982
## Mean :-0.1679 Mean :-0.091117 Mean :-0.01474
## 3rd Qu.: 0.4258 3rd Qu.: 0.529088 3rd Qu.: 0.52878
## Max. : 2.0646 Max. : 2.096923 Max. : 1.71450
## NA's :11 NA's :7 NA's :9
# Scatterplot matrix of the first ten columns of the training set.
first_ten <- seq_len(10)
pairs(train[, first_ten])
# Heatmap of missingness: one tile per (row, predictor) pair, filled according
# to whether that predictor value is NA. The outcome column Y is excluded.
train %>%
  select(-Y) %>%
  is.na() %>%
  as_tibble() %>%
  # seq_len() stays valid for zero rows, where 1:n() would yield c(1, 0).
  mutate(Index = seq_len(n())) %>%
  pivot_longer(-Index) %>%
  # Keep variables in data-column order on the x axis; plain character names
  # would be sorted alphabetically (e.g. Xc10 before Xc2).
  mutate(name = factor(name, levels = unique(name))) %>%
  ggplot(aes(x = name, y = Index, fill = value)) +
  geom_tile() +
  labs(x = "Variable", fill = "Missing") +
  coord_cartesian(expand = FALSE) +
  # Name the colours by level so the mapping does not depend on which of
  # FALSE/TRUE happen to be present in the data.
  scale_fill_manual(values = c("FALSE" = "#000000", "TRUE" = "#E69F00")) +
  theme_classic(base_size = 15)
For plotting, we impute missing values with 0, which is approximately the mean of each continuous feature.
# Correlation plot of the Xc* columns: drop the outcome and the two Xb*
# columns, fill every NA with 0, then plot the correlation matrix.
train %>%
  select(!c(Y, Xb1, Xb2)) %>%
  mutate(across(everything(), ~ replace_na(.x, 0))) %>%
  cor() %>%
  corrplot::corrplot.mixed()
# Bar chart of how often each distinct value of the outcome Y occurs.
outcome_counts <- table(train$Y)
barplot(outcome_counts, main = "Count of outcome variable")