library(tidyverse)

# Read the `training_data` target through the targets package and keep it as
# `train`, then preview its first rows.
train <- targets::tar_read(training_data)
head(train)
## # A tibble: 6 × 31
##       Y   Xb1   Xb2     Xc1    Xc2    Xc3    Xc4    Xc5     Xc6    Xc7   Xc8
##   <dbl> <dbl> <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl> <dbl>
## 1     6     1    NA  0.997  -1.05  NA      0.177  1.98   1.15   NA     -1.91
## 2     2     1     0 NA       0.235 -0.875  1.74  NA     -1.02   -0.216 NA   
## 3     0     1     0 -1.19   NA     NA     NA     -1.87   0.322  -0.109 NA   
## 4     5     0     0  0.215   1.51   0.117 -0.112  0.881 -1.50    0.708  1.06
## 5     0     1     0 -0.0831  1.48  -0.381  0.810 -1.80  -1.79    0.148  1.43
## 6     0     0     1 -0.944   0.114 -0.336 -1.03   1.60   0.0605 NA     -1.06
## # … with 20 more variables: Xc9 <dbl>, Xc10 <dbl>, Xc11 <dbl>, Xc12 <dbl>,
## #   Xc13 <dbl>, Xc14 <dbl>, Xc15 <dbl>, Xc16 <dbl>, Xc17 <dbl>, Xc18 <dbl>,
## #   Xc19 <dbl>, Xc20 <dbl>, Xc21 <dbl>, Xc22 <dbl>, Xc23 <dbl>, Xc24 <dbl>,
## #   Xc25 <dbl>, Xc26 <dbl>, Xc27 <dbl>, Xc28 <dbl>
# Per-column summary statistics of `train`; NA counts are reported per column.
summary(train)
##        Y              Xb1              Xb2              Xc1          
##  Min.   : 0.00   Min.   :0.0000   Min.   :0.0000   Min.   :-2.42262  
##  1st Qu.: 0.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:-0.77685  
##  Median : 2.00   Median :1.0000   Median :0.0000   Median :-0.08307  
##  Mean   : 3.92   Mean   :0.5326   Mean   :0.4588   Mean   :-0.04817  
##  3rd Qu.: 6.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 0.74471  
##  Max.   :25.00   Max.   :1.0000   Max.   :1.0000   Max.   : 2.38876  
##                  NA's   :8        NA's   :15       NA's   :13        
##       Xc2                Xc3                Xc4                 Xc5           
##  Min.   :-2.22244   Min.   :-2.65138   Min.   :-2.994136   Min.   :-2.611865  
##  1st Qu.:-0.76658   1st Qu.:-0.75554   1st Qu.:-0.665072   1st Qu.:-0.793311  
##  Median : 0.06364   Median :-0.21222   Median : 0.008279   Median : 0.111528  
##  Mean   : 0.06045   Mean   :-0.04582   Mean   :-0.009028   Mean   : 0.001503  
##  3rd Qu.: 0.67083   3rd Qu.: 0.58221   3rd Qu.: 0.662160   3rd Qu.: 0.643325  
##  Max.   : 2.24748   Max.   : 2.81411   Max.   : 2.680929   Max.   : 2.232133  
##  NA's   :11         NA's   :11         NA's   :13          NA's   :15         
##       Xc6                Xc7               Xc8                Xc9          
##  Min.   :-2.33651   Min.   :-2.3337   Min.   :-3.71406   Min.   :-2.62379  
##  1st Qu.:-0.86994   1st Qu.:-0.7635   1st Qu.:-0.62434   1st Qu.:-0.46509  
##  Median :-0.00598   Median :-0.1548   Median : 0.14861   Median :-0.02001  
##  Mean   : 0.03411   Mean   :-0.1729   Mean   : 0.03631   Mean   : 0.03218  
##  3rd Qu.: 0.80659   3rd Qu.: 0.5018   3rd Qu.: 0.76780   3rd Qu.: 0.63033  
##  Max.   : 2.47438   Max.   : 1.6673   Max.   : 2.57667   Max.   : 2.50040  
##  NA's   :13         NA's   :12        NA's   :11         NA's   :14        
##       Xc10               Xc11               Xc12                Xc13         
##  Min.   :-2.17129   Min.   :-2.97353   Min.   :-2.021350   Min.   :-2.11388  
##  1st Qu.:-0.96595   1st Qu.:-0.68268   1st Qu.:-0.617916   1st Qu.:-0.53245  
##  Median : 0.02689   Median :-0.06923   Median :-0.014719   Median : 0.01151  
##  Mean   :-0.02638   Mean   :-0.08105   Mean   : 0.000896   Mean   : 0.08188  
##  3rd Qu.: 0.83884   3rd Qu.: 0.59591   3rd Qu.: 0.649330   3rd Qu.: 0.74255  
##  Max.   : 2.69826   Max.   : 2.04709   Max.   : 2.009133   Max.   : 2.61611  
##  NA's   :4          NA's   :8          NA's   :10          NA's   :6         
##       Xc14                Xc15               Xc16              Xc17         
##  Min.   :-3.822673   Min.   :-2.33111   Min.   :-2.2892   Min.   :-2.07666  
##  1st Qu.:-0.504155   1st Qu.:-0.75604   1st Qu.:-0.3268   1st Qu.:-0.75959  
##  Median : 0.190520   Median : 0.04651   Median : 0.1328   Median : 0.01229  
##  Mean   : 0.007986   Mean   : 0.01799   Mean   : 0.1469   Mean   : 0.03996  
##  3rd Qu.: 0.536209   3rd Qu.: 0.81281   3rd Qu.: 0.6115   3rd Qu.: 0.73229  
##  Max.   : 2.423473   Max.   : 2.62742   Max.   : 2.3000   Max.   : 2.75841  
##  NA's   :6           NA's   :9          NA's   :7         NA's   :9         
##       Xc18                Xc19                Xc20              Xc21         
##  Min.   :-1.963741   Min.   :-2.557614   Min.   :-2.3967   Min.   :-2.14738  
##  1st Qu.:-0.436281   1st Qu.:-0.714337   1st Qu.:-0.4963   1st Qu.:-0.56835  
##  Median : 0.009714   Median : 0.040600   Median : 0.3228   Median : 0.07179  
##  Mean   : 0.075030   Mean   : 0.004843   Mean   : 0.1679   Mean   : 0.10608  
##  3rd Qu.: 0.678930   3rd Qu.: 0.709491   3rd Qu.: 0.7862   3rd Qu.: 0.90248  
##  Max.   : 3.031002   Max.   : 2.431430   Max.   : 2.6652   Max.   : 2.41865  
##  NA's   :10          NA's   :10          NA's   :6         NA's   :10        
##       Xc22               Xc23               Xc24               Xc25         
##  Min.   :-2.27797   Min.   :-2.45074   Min.   :-3.32592   Min.   :-2.30911  
##  1st Qu.:-0.58014   1st Qu.:-0.81507   1st Qu.:-0.37031   1st Qu.:-0.66183  
##  Median : 0.04339   Median :-0.03349   Median : 0.26846   Median :-0.06358  
##  Mean   : 0.04936   Mean   :-0.09579   Mean   : 0.06763   Mean   :-0.05510  
##  3rd Qu.: 0.73982   3rd Qu.: 0.66651   3rd Qu.: 0.70255   3rd Qu.: 0.52815  
##  Max.   : 1.96397   Max.   : 2.70206   Max.   : 2.03726   Max.   : 3.20248  
##  NA's   :14         NA's   :7          NA's   :15         NA's   :10        
##       Xc26              Xc27                Xc28         
##  Min.   :-3.3002   Min.   :-3.336178   Min.   :-2.27482  
##  1st Qu.:-0.6705   1st Qu.:-0.722137   1st Qu.:-0.64359  
##  Median :-0.1275   Median :-0.008069   Median : 0.11982  
##  Mean   :-0.1679   Mean   :-0.091117   Mean   :-0.01474  
##  3rd Qu.: 0.4258   3rd Qu.: 0.529088   3rd Qu.: 0.52878  
##  Max.   : 2.0646   Max.   : 2.096923   Max.   : 1.71450  
##  NA's   :11        NA's   :7           NA's   :9
# Scatterplot matrix of the first ten columns of `train`
# (Y, Xb1, Xb2 and Xc1 to Xc7 in the preview above).
train %>%
  select(1:10) %>%
  pairs()

Visualising missing values

# Heatmap of missingness: one tile per (observation, predictor) pair, coloured
# by whether that value is NA. The outcome Y is excluded.
train %>%
  select(-Y) %>%
  is.na() %>%
  as_tibble() %>%
  # row_number() stays correct for zero-row input, whereas 1:n() would
  # produce c(1, 0).
  mutate(Index = row_number()) %>%
  pivot_longer(-Index) %>%
  # Keep predictors in their column order; as plain character they would be
  # sorted lexicographically on the axis (Xc10 before Xc2).
  mutate(name = fct_inorder(name)) %>%
  ggplot(aes(x = name, y = Index, fill = value)) +
  geom_tile() +
  labs(x = "Variable", fill = "Missing") +
  coord_cartesian(expand = FALSE) +
  # Named colours stay tied to the right level even when only one of
  # FALSE/TRUE occurs in the data.
  scale_fill_manual(values = c("FALSE" = "#000000", "TRUE" = "#E69F00")) +
  theme_classic(base_size = 15)

Multicollinearity

For plotting only, we impute missing values in the continuous features with 0, which is approximately the mean of each of these features (see the summary above).

# Correlation plot of the predictors that remain after dropping the outcome Y
# and the Xb1/Xb2 columns. Missing entries are filled with 0 before cor() is
# called.
train %>%
  select(-c(Y, Xb1, Xb2)) %>%
  mutate(across(everything(), function(col) replace_na(col, 0))) %>%
  cor() %>%
  corrplot::corrplot.mixed()

Distribution of the outcome variable

# Bar chart of how often each distinct value of the outcome Y occurs.
barplot(
  table(train$Y),
  main = "Count of outcome variable"
)