# Set CRAN mirror and Install kernlab, ggplot2, and dplyr packages
options(repos = c(CRAN = "https://cloud.r-project.org"))

if (!requireNamespace("h2o", quietly = TRUE)) {
  cat("The 'h2o' package is not installed.\n")
} else {
  cat("The 'h2o' package is already installed.\n")
}

## The 'h2o' package is already installed.

if (!requireNamespace("rJavaEnv", quietly = TRUE)) {
  cat("The 'rJavaEnv' package is not installed.\n")
} else {
  cat("The 'rJavaEnv' package is already installed.\n")
}

## The 'rJavaEnv' package is already installed.

#import the bank data
library(readr)

#Import the dataset.  Change the file path to where you saved your bank.txt file
bank <- read_csv("C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/bank.txt", 
                      col_types = cols(b_tgt = col_character(), 
                                                 int_tgt = col_number(), cnt_tgt = col_double(), 
                                                 demog_homeval = col_number(), demog_inc = col_number(), 
                                                 rfm1 = col_number(), rfm2 = col_number(), 
                                                 rfm3 = col_number(), rfm4 = col_number(), 
                                                 demog_genf = col_character(), demog_genm = col_character(), 
                                                 dataset = col_character()))

#Caret and other packages don't like 0,1 factor levels and will treat them as numeric, lets fix that
bank$b_tgt<-ifelse(bank$b_tgt=="1", "yes", "no")
bank$demog_ho<-ifelse(bank$demog_ho=='1',"yes","no")
bank$demog_genf<-ifelse(bank$demog_genf=="1", "yes","no")
bank$demog_genm<-ifelse(bank$demog_genm=="1", "yes","no")

#Convert factor columns to factors
cols<-c("b_tgt", "cat_input1", "cat_input2","demog_ho","demog_genf", "demog_genm")
bank[cols] <- lapply(bank[cols], factor)
str(bank) #Make sure the data structure looks good

head(bank)

## # A tibble: 6 × 26
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 yes      7000      NA X          A                 NA no               57600
## 2 yes      7000       2 X          A                 NA no               57587
## 3 yes     15000       2 X          A                 NA no               44167
## 4 no         NA       0 X          A                 68 no               90587
## 5 no         NA       0 X          A                 NA no              100313
## 6 no         NA       0 X          A                 26 no               26622
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>

summary(bank$int_tgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0    6000   10000   11236   16000  500000  848529

Imputation

int_tgt

class(bank)

## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

#replace to nas with 0 for int_tgt
bank$int_tgt[is.na(bank$int_tgt)] <- 0

Demog_age

#use Linear regression to impute missing values
## Create a linear model to predict demog_age using other variables
lm_model <- lm(demog_age ~ demog_homeval + demog_inc + demog_pr + demog_ho + rfm12 + rfm1, data = bank)

# Predict missing values
predicted_values <- predict(lm_model, newdata = bank[is.na(bank$demog_age), ])
# Replace missing values with predicted values
bank$demog_age[is.na(bank$demog_age)] <- predicted_values
 
# Check the summary of the imputed variable
cat("Summary of imputed Customer age:\n")

## Summary of imputed Customer age:

summary(bank$demog_age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -14.45   50.00   58.00   58.55   69.00   89.00

library(stargazer)

# summarize the regression output
summary(lm_model)

## 
## Call:
## lm(formula = demog_age ~ demog_homeval + demog_inc + demog_pr + 
##     demog_ho + rfm12 + rfm1, data = bank)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.038 -11.833   1.174  13.161  59.490 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.836e+01  7.805e-02 619.527  < 2e-16 ***
## demog_homeval  3.217e-06  2.166e-07  14.854  < 2e-16 ***
## demog_inc     -2.069e-05  8.543e-07 -24.218  < 2e-16 ***
## demog_pr       1.336e-01  1.637e-03  81.651  < 2e-16 ***
## demog_hoyes   -2.948e-01  4.242e-02  -6.949 3.68e-12 ***
## rfm12          1.045e-01  4.962e-04 210.696  < 2e-16 ***
## rfm1          -2.222e-02  1.262e-03 -17.612  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.3 on 793170 degrees of freedom
##   (266861 observations deleted due to missingness)
## Multiple R-squared:  0.06459,    Adjusted R-squared:  0.06458 
## F-statistic:  9127 on 6 and 793170 DF,  p-value: < 2.2e-16

#create a histogram of demog_age
library(ggplot2)
ggplot(bank, aes(x = demog_age)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Customer Age", x = "Customer Age", y = "Frequency") +
  theme_minimal()

summary(bank$demog_age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -14.45   50.00   58.00   58.55   69.00   89.00

#count the number of negative values
count_negative_values <- sum(bank$demog_age < 0)
cat("Number of negative values in demog_age:", count_negative_values, "\n")

## Number of negative values in demog_age: 20

#create a new data frame that only includes the rows with demog_age < 0
bank_demog_age_neg <- bank[bank$demog_age == -1, ]
head(bank_demog_age_neg)

## # A tibble: 6 × 26
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 no          0       0 X          B                 -1 yes              88071
## 2 yes      2000       2 X          B                 -1 yes                  0
## 3 yes      2000       2 X          B                 -1 yes                  0
## 4 no          0       0 X          C                 -1 no               41927
## 5 yes      2000       2 X          B                 -1 yes                  0
## 6 no          0       0 X          B                 -1 yes              88099
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>

#Winsorized demog_age < 0 to the 1st quartile 
bank$demog_age[bank$demog_age < 0] <- quantile(bank$demog_age, 0.25, na.rm = TRUE)
# Check the summary of the imputed variable
cat("Summary of imputed demog_age:\n")

## Summary of imputed demog_age:

summary(bank$demog_age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   50.00   58.00   58.55   69.00   89.00

quantile(bank$demog_age, 0.25, na.rm = TRUE)

## 25% 
##  50

#plot the distribution of demog_age
ggplot(bank, aes(x = demog_age)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Customer Age", x = "Customer Age", y = "Frequency") +
  theme_minimal()

rfm3

#use linear regression to impute the missing values for rfm3
lm_model_rfm3 <- lm(rfm3 ~ rfm1 + rfm2 + rfm4 + rfm5 + rfm7 + rfm8 , data = bank)

# Predict missing values
predicted_values_rfm3 <- predict(lm_model_rfm3, newdata = bank[is.na(bank$rfm3), ])
# Replace missing values with predicted values
bank$rfm3[is.na(bank$rfm3)] <- predicted_values_rfm3

# Check the summary of the imputed variable
cat("Summary of imputed rfm3:\n")

## Summary of imputed rfm3:

summary(bank$rfm3)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -63.73   10.00   14.88   15.83   20.00 3713.31

#count the number of negative values
count_negative_values_rfm3 <- sum(bank$rfm3 < 0)
cat("Number of negative values in rfm3:", count_negative_values_rfm3, "\n")

## Number of negative values in rfm3: 14041

#impute the negative values for rfm3 with the .05 quantile
bank$rfm3[bank$rfm3 < 0] <- quantile(bank$rfm3, 0.05, na.rm = TRUE)
# Check the summary of the imputed variable
cat("Summary of imputed rfm3:\n")

## Summary of imputed rfm3:

summary(bank$rfm3)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   10.00   14.88   15.98   20.00 3713.31

#plot the distribution of rfm3
ggplot(bank, aes(x = rfm3)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of rfm3", x = "rfm3", y = "Frequency") +
  theme_minimal()

cnt_tgt

#filter the data to only include the rows with cnt_tgt as NA
bank_cnt_tgt_na <- bank[is.na(bank$cnt_tgt), ]

head(bank_cnt_tgt_na)

## # A tibble: 1 × 26
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 yes      7000      NA X          A               60.2 no               57600
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>

library(ggplot2)
#mean of cnt_tgt when demog_genm == yes and demog_homeval < 100000
mean_cnt_tgt_genm_yes_homeval_lt_100k <- mean(bank$cnt_tgt[bank$demog_genm == "yes" & bank$demog_homeval < 100000], na.rm = TRUE)
mean_cnt_tgt_genm_yes_homeval_lt_100k

## [1] 0.2450341

#round the imputed value to the nearest whole number
mean_cnt_tgt_genm_yes_homeval_lt_100k <- round(mean_cnt_tgt_genm_yes_homeval_lt_100k)

mean_cnt_tgt_genm_yes_homeval_lt_100k

## [1] 0

summary(bank$cnt_tgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.3118  0.0000  6.0000       1

#create histogram of cnt_tgt
ggplot(bank, aes(x = cnt_tgt)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of cnt_tgt", x = "cnt_tgt", y = "Frequency") +
  theme_minimal()

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

#impute the cnt_tgt variable as 1
bank$cnt_tgt[is.na(bank$cnt_tgt)] <- 1
# Check the summary of the imputed variable
cat("Summary of imputed cnt_tgt:\n")

## Summary of imputed cnt_tgt:

summary(bank$cnt_tgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3118  0.0000  6.0000

the single NA value for cnt_tgt was imputed as 1, which is the mean of cnt_tgt when demog_genm == yes and demog_homeval < 100000.

Categorical Predictor analysis

library(vcd)

## Loading required package: grid

#create a contingency table with Categorical variables
bank_cat1 <- table(bank$b_tgt, bank$cat_input1)
bank_cat2 <- table(bank$b_tgt, bank$cat_input2)



# perform a Chi-squared test
chisq_test_cat1 <- chisq.test(bank_cat1)
chisq_test_cat2 <- chisq.test(bank_cat2)
 

# print the results
cat("Chi-squared test for b_tgt and cat_input1:\n")

## Chi-squared test for b_tgt and cat_input1:

print(chisq_test_cat1)

## 
##  Pearson's Chi-squared test
## 
## data:  bank_cat1
## X-squared = 20362, df = 2, p-value < 2.2e-16

cat("Chi-squared test for b_tgt and cat_input2:\n")

## Chi-squared test for b_tgt and cat_input2:

print(chisq_test_cat2)

## 
##  Pearson's Chi-squared test
## 
## data:  bank_cat2
## X-squared = 16146, df = 4, p-value < 2.2e-16

n <- nrow(bank)
v2 <- chisq_test_cat1$statistic / (n *4)
v1 <- chisq_test_cat2$statistic / (n *4)

cat("Cramer's V for cat_input1:", sqrt(v2), "\n")

## Cramer's V for cat_input1: 0.06929801

cat("Cramer's V for cat_input2:", sqrt(v1), "\n")

## Cramer's V for cat_input2: 0.0617084

Cramer’s V for cat_input1 and cat_input2 show that they likely have negligible effects on the b_tgt response variable given our sample size.

#perform ANOVA test for cnt_tgt
anova_test_cnt_tgt <- aov(cnt_tgt ~ cat_input1 + cat_input2, data = bank)

#perform ANOVA test for int_tgt
anova_test_int_tgt <- aov(int_tgt ~  cat_input1 + cat_input2, data = bank)

# print the results
cat("ANOVA test for cnt_tgt:\n")

## ANOVA test for cnt_tgt:

print(summary(anova_test_cnt_tgt))

##                  Df Sum Sq Mean Sq F value Pr(>F)    
## cat_input1        2  10212    5106   10719 <2e-16 ***
## cat_input2        4   3761     940    1974 <2e-16 ***
## Residuals   1060031 504973       0                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("ANOVA test for int_tgt:\n")

## ANOVA test for int_tgt:

print(summary(anova_test_int_tgt))

##                  Df    Sum Sq   Mean Sq F value Pr(>F)    
## cat_input1        2 3.274e+11 1.637e+11    4832 <2e-16 ***
## cat_input2        4 3.851e+11 9.629e+10    2842 <2e-16 ***
## Residuals   1060031 3.591e+13 3.388e+07                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Cat input 1 is only explaining 1.97% of the overall variance in cnt_tgt, and Cat Input 2 is only explaining .72% of the variance in cnt_tgt.

Cat INput 1 is only explaining .89% of overall variance of int_tgt, and Cat Input 2 is explaining 1.05% of the overall variance in int_tgt.
This is not enough variation in the data for me to include these variables in the candidate models for cnt_tgt or int_tgt.

# Calculate effect sizes (eta-squared)
eta_cnt_input1 <- 10212 / (10212 + 3761 + 504973)
eta_cnt_input2 <- 3761 / (10212 + 3761 + 504973)

# Post-hoc analysis (Tukey HSD)
library(multcomp)
Tukey_cnt <- glht(aov(cnt_tgt ~ cat_input1 + cat_input2, data=bank), 
                linfct = mcp(cat_input1 = "Tukey"))
summary(Tukey_cnt)

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = cnt_tgt ~ cat_input1 + cat_input2, data = bank)
## 
## Linear Hypotheses:
##             Estimate Std. Error t value Pr(>|t|)    
## Y - X == 0 -0.062475   0.002595  -24.07   <2e-16 ***
## Z - X == 0 -0.283639   0.001932 -146.81   <2e-16 ***
## Z - Y == 0 -0.221164   0.003054  -72.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

#perform ANOVA test for cnt_tgt
anova_test_cnt_tgt <- aov(cnt_tgt ~ rfm1 + rfm2 + rfm8 + demog_age, data = bank)

#perform ANOVA test for int_tgt
anova_test_int_tgt <- aov(int_tgt ~  rfm1 + rfm2 + rfm8 + demog_age ,  data = bank)

# print the results
cat("ANOVA test for cnt_tgt:\n")

## ANOVA test for cnt_tgt:

print(summary(anova_test_cnt_tgt))

##                  Df Sum Sq Mean Sq F value Pr(>F)    
## rfm1              1   9065    9065 20729.7 <2e-16 ***
## rfm2              1  15766   15766 36050.8 <2e-16 ***
## rfm8              1  30465   30465 69663.6 <2e-16 ***
## demog_age         1     78      78   178.5 <2e-16 ***
## Residuals   1060033 463572       0                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("ANOVA test for int_tgt:\n")

## ANOVA test for int_tgt:

print(summary(anova_test_int_tgt))

##                  Df    Sum Sq   Mean Sq F value   Pr(>F)    
## rfm1              1 3.909e+10 3.909e+10  1161.7  < 2e-16 ***
## rfm2              1 2.243e+11 2.243e+11  6663.7  < 2e-16 ***
## rfm8              1 6.887e+11 6.887e+11 20465.7  < 2e-16 ***
## demog_age         1 5.586e+08 5.586e+08    16.6 4.62e-05 ***
## Residuals   1060033 3.567e+13 3.365e+07                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Outliers

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#create a new data frame with only the numeric variables from the data frame
numeric_vars <- bank %>% select(where(is.numeric))


#filter the numeric_vars dataframe to exclude missing values
numeric_vars <- numeric_vars %>% filter(complete.cases(.))

#compute the IQR for the numeric variables
IQR_values <- sapply(numeric_vars, IQR) 


#Create a 3 column table data  showing the 25, 50, 75 percentiles for each numeric variable
# Create a data frame with the 25th, 50th, and 75th percentiles
percentiles <- data.frame(
  Variable = names(numeric_vars),
  Q25 = sapply(numeric_vars, quantile, probs = 0.25),
  Q50 = sapply(numeric_vars, quantile, probs = 0.50),
  Q75 = sapply(numeric_vars, quantile, probs = 0.75)
)

# Create the datatable and round the values to 2 decimal places
library(DT)
datatable(percentiles, options = list(pageLength = 10), rownames = FALSE) %>%
  formatRound(columns = 2:4, digits = 2) %>%
  formatStyle(columns = 1, fontWeight = 'bold') %>%
  formatStyle(columns = 2:4, backgroundColor = 'white') %>%
  formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
  formatStyle(columns = 1, textAlign = 'center') %>%
  formatStyle(columns = 2:4, textAlign = 'center')

library(tibble)

#find the count of observations that are greater than 2 times the Q75 IQR label them as "high outliers"
high_outliers <- sapply(numeric_vars, function(x) sum(x > (quantile(x, 0.75) + 2 * IQR(x)))) 

# Convert to a data frame and rename the columns
high_outliers <- as.data.frame(high_outliers)


#find the count of observations that are less than 2 times the Q25 IQR label them as "low outliers"
low_outliers <- sapply(numeric_vars, function(x) sum(x < (quantile(x, 0.25) - 2 * IQR(x))))

# Convert to a data frame and rename the columns
low_outliers <- as.data.frame(low_outliers)
# Combine the two data frames
outliers <- cbind(high_outliers, low_outliers)
# Rename the columns
colnames(outliers) <- c("High Outliers", "Low Outliers")
# Create a data frame with the variable names
outliers <- tibble::rownames_to_column(outliers, "Variable")
# Create the datatable
library(DT)
#exclude the variables with no outliers
outliers <- outliers %>% filter(`High Outliers` > 0 | `Low Outliers` > 0)
# Create the datatable
datatable(outliers, options = list(pageLength = 10)) %>%
  formatRound(columns = 2:3, digits = 2) %>%
  formatStyle(columns = 1, fontWeight = 'bold') %>%
  formatStyle(columns = 2:3, backgroundColor = 'white') %>%
  formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
  formatStyle(columns = 1, textAlign = 'center') %>%
  formatStyle(columns = 2:3, textAlign = 'center')

#export the outliers data frame to a csv
write.csv(outliers, "outliers.csv", row.names = FALSE)

#view the complete observation for when int_tgt == 500000
bank[bank$int_tgt > 100000, ]

## # A tibble: 55 × 26
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes    200000       1 X          A               57   yes             169771
##  2 yes    200000       1 Y          B               76   no              100111
##  3 yes    200000       1 X          A               71   yes             217081
##  4 yes    500000       1 X          E               49.0 no               64972
##  5 yes    112000       1 X          C               42   yes                  0
##  6 yes    200000       1 X          A               58   yes             169803
##  7 yes    200000       1 X          A               76   no              100090
##  8 yes    200000       1 X          A               72   yes             217106
##  9 yes    500000       1 X          E               48.8 no               64995
## 10 yes    112000       1 X          C               42   yes                  0
## # ℹ 45 more rows
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>

there are 11 observations where int_tgt = 500K.

This data point seems off when we look at the values of the other predictors when int_tgt is $500k. There is little to no variation in the income, demographics,and new sales data that that makes up these observations.

When we look at observations where int_tgt > $100k , we see significantly more variation in the predictor set.

It is only 11 obs, but they’re erroneous.

I am going to reclassify them based on income. Int_tgt == 500K will now take the average value of Int_tgt when deomog_inc is between $60k - $70K

The other high value Int_tgt observations over $100k are somewhat more realistic, as there is variation in the homevalue, income, and age demographic predictors that accompany these observations. There are a sparse number of these observations (44 in total), but they could be true customers of these banks.

They might have bought a home loan or education loans.

#reclassify the int_tgt == 500K observations based on income
bank$int_tgt[bank$int_tgt == 500000] <- mean(bank$int_tgt[bank$demog_inc > 60000 & bank$demog_inc < 70000], na.rm = TRUE)
# Check the summary of the imputed variable
cat("Summary of imputed int_tgt:\n")

## Summary of imputed int_tgt:

summary(bank$int_tgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    2237       0  200000

#bin the observations of int_tgt into 6 bins - up to $200k
library(dplyr)
bank <- bank %>%
  mutate(int_tgt_bin = case_when(
    int_tgt <= 20000 ~ "0-20k",
    int_tgt > 20000 & int_tgt <= 40000 ~ "20k-40k",
    int_tgt > 40000 & int_tgt <= 60000 ~ "40k-60k",
    int_tgt > 60000 & int_tgt <= 80000 ~ "60k-80k",
    int_tgt > 80000 & int_tgt <= 100000 ~ "80k-100k",
    TRUE ~ ">100k"
  ))
# Create a histogram of the binned int_tgt variable
ggplot(bank, aes(x = int_tgt_bin)) +
  geom_bar(fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of binned int_tgt", x = "int_tgt Bin", y = "Frequency") +
  theme_minimal()

#add the exact count number for each bin to the histogram
ggplot(bank, aes(x = int_tgt_bin)) +
  geom_bar(fill = "blue", color = "black", alpha = 0.7) +
  geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
  labs(title = "Histogram of binned int_tgt", x = "int_tgt Bin", y = "Frequency") +
  theme_minimal()

summary(bank$cnt_tgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3118  0.0000  6.0000

#calculate 2x over the 3Q for cnt_tgt
cnt_tgt_3Q <- quantile(bank$cnt_tgt, 0.75) + 2 * IQR(bank$cnt_tgt)

#count the number of cnt_tgt observations that are greater than 2x the 3Q
count_high_cnt_tgt <- sum(bank$cnt_tgt > cnt_tgt_3Q)

print(paste("Number of cnt_tgt observations greater than 2x the 3Q:", count_high_cnt_tgt))

## [1] "Number of cnt_tgt observations greater than 2x the 3Q: 211509"

print(paste("cnt_tgt 3Q:", cnt_tgt_3Q))

## [1] "cnt_tgt 3Q: 0"

#show the observations wher cnt_tgt = 6
bank[bank$cnt_tgt == 6, ]

## # A tibble: 11 × 27
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes     16000       6 X          E               56.7 yes             127994
##  2 yes     16000       6 X          E               56.7 yes             127991
##  3 yes     16000       6 X          E               56.7 yes             127989
##  4 yes     16000       6 X          E               56.5 yes             127988
##  5 yes     16000       6 X          E               56.7 yes             127989
##  6 yes     16000       6 X          E               56.7 yes             127991
##  7 yes     16000       6 X          E               56.7 yes             127995
##  8 yes     16000       6 X          E               56.7 yes             128000
##  9 yes     16000       6 X          E               56.7 yes             128006
## 10 yes     16000       6 X          E               56.7 yes             127995
## 11 yes     16000       6 X          E               56.7 yes             128039
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>

#reclassify the observations when cnt_tgt == 6 based on income 
bank$cnt_tgt[bank$cnt_tgt == 6] <- mean(bank$cnt_tgt[bank$demog_inc > 30000 & bank$demog_inc < 35000], na.rm = TRUE)

#round the imputed values to the nearest whole number
bank$cnt_tgt[bank$cnt_tgt == 6] <- round(bank$cnt_tgt[bank$cnt_tgt == 6])

#show the imputed data points

when cnt_tgt = 6, there is little variation in the predictor set… these appear to be erroneous. Again only 11 obs, but better to put them to use than to throw them out.

And better to use them in a meaningful way rather than use them as they’re given (errant!)

#show the observations wher cnt_tgt = 5
bank[bank$cnt_tgt == 5, ]

## # A tibble: 297 × 27
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes     17000       5 X          A                 82 yes              91509
##  2 yes     10000       5 X          A                 84 no               28767
##  3 yes     20000       5 X          A                 79 yes              68634
##  4 yes      5000       5 X          A                 80 yes              74290
##  5 yes      2000       5 X          A                 48 no              228917
##  6 yes     12000       5 X          A                 67 no              224969
##  7 yes      2000       5 X          A                 66 yes              75412
##  8 yes     12000       5 X          D                 81 no               89601
##  9 yes     14000       5 X          E                 72 no               49445
## 10 yes     11000       5 X          E                 41 no                   0
## # ℹ 287 more rows
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>

when cnt_tgt = 5, there is more variation in the predictor set… these appear to be true observations

#show the observations wher cnt_tgt = 4
bank[bank$cnt_tgt == 4, ]

## # A tibble: 2,552 × 27
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes      9000       4 X          A               58   no               51401
##  2 yes     18000       4 X          A               80   no               59612
##  3 yes     16000       4 X          A               35   no               24720
##  4 yes      8000       4 X          A               85   no               99583
##  5 yes     20000       4 X          A               61.4 no              136386
##  6 yes      6000       4 X          A               55.7 no              167885
##  7 yes      2000       4 X          A               67   yes              94492
##  8 yes     12000       4 X          A               56.9 no              157205
##  9 yes     20000       4 X          A               56.7 no               36902
## 10 yes      6000       4 X          A               63   no               33204
## # ℹ 2,542 more rows
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>

#plot a histogram of demog_inc
ggplot(bank, aes(x = demog_inc)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Customer Income", x = "Customer Income", y = "Frequency") +
  theme_minimal()

#count the number of observations where demog_inc == 0
count_zero_income <- sum(bank$demog_inc == 0)
cat("Number of observations with demog_inc == 0:", count_zero_income, "\n")

## Number of observations with demog_inc == 0: 253253

#plot the distribution of demog_homeval when demog_inc == 0
ggplot(bank[bank$demog_inc == 0, ], aes(x = demog_homeval)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Customer Home Value when Income == 0", x = "Customer Home Value", y = "Frequency") +
  theme_minimal()

summary(bank$demog_homeval[bank$demog_inc == 0])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0   49710   76698  112377  137999  600056

summary(bank$demog_homeval)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0   51107   73880  106104  122214  600067

#show observations where demog_inc == 0
bank[bank$demog_inc == 0, ]

## # A tibble: 253,253 × 27
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 no          0       0 X          A               61.9 no              100313
##  2 no          0       0 X          A               74   yes              95496
##  3 no          0       0 X          A               77   yes                  0
##  4 no          0       0 X          A               57.3 no               42583
##  5 no          0       0 X          A               83   no              181710
##  6 no          0       0 X          A               81   no               61176
##  7 yes     10000       1 X          A               57.4 no              316292
##  8 yes     17000       2 Y          B               55.7 no               92482
##  9 yes      9000       2 X          A               55.2 no               88296
## 10 no          0       0 X          A               79   no               63998
## # ℹ 253,243 more rows
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>

#count the number cnt_tgt == 0 when demog_inc == 0
count_zero_income_cnt_tgt <- sum(bank$cnt_tgt[bank$demog_inc == 0] == 0)
cat("Number of observations with cnt_tgt == 0 when demog_inc == 0:", count_zero_income_cnt_tgt, "\n")

## Number of observations with cnt_tgt == 0 when demog_inc == 0: 201674

#show unique values for b_tgt
unique(bank$b_tgt[bank$demog_inc == 0])

## [1] no  yes
## Levels: no yes

#count the number of demog_inc == 0  when b_tgt == 2
count_zero_income_b_tgt <- sum(bank$b_tgt[bank$demog_inc == 0] == 2)
cat("Number of observations with b_tgt == 2 when demog_inc == 0:", count_zero_income_b_tgt, "\n")

## Number of observations with b_tgt == 2 when demog_inc == 0: 0

#count the number of demog_inc == 0  when demog_ho == 0
count_zero_income_demog_ho <- sum(bank$demog_ho[bank$demog_inc == 0] == 0)
cat("Number of observations with demog_ho == 0 when demog_inc == 0:", count_zero_income_demog_ho, "\n")

## Number of observations with demog_ho == 0 when demog_inc == 0: 0

#plot the distribution of demog_inc
ggplot(bank, aes(x = demog_inc)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Customer Income", x = "Customer Income", y = "Frequency") +
  theme_minimal()

library(psych)
#describe demog_inc
describe(bank$demog_inc)

##    vars       n     mean       sd median  trimmed      mad min    max  range
## X1    1 1060038 40368.69 28029.02  43174 39283.05 21628.17   0 200007 200007
##    skew kurtosis    se
## X1 0.23     0.56 27.22

#Create a new variable called demog_inc2
#copy it from the demog_inc variable
bank$demog_inc2 <- bank$demog_inc

#use linear regression to impute the values where demog_inc2 == 0
lm_model_inc2 <- lm(demog_inc2 ~ demog_homeval + demog_pr + demog_ho + rfm12 + rfm1, data = bank)

summary(lm_model_inc2)

## 
## Call:
## lm(formula = demog_inc2 ~ demog_homeval + demog_pr + demog_ho + 
##     rfm12 + rfm1, data = bank)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83362 -19114  -2588  16252 176286 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.507e+04  8.441e+01 178.524   <2e-16 ***
## demog_homeval  7.637e-02  2.410e-04 316.852   <2e-16 ***
## demog_pr       4.197e+01  1.950e+00  21.521   <2e-16 ***
## demog_hoyes    2.795e+04  4.517e+01 618.685   <2e-16 ***
## rfm12          8.504e+00  6.005e-01  14.163   <2e-16 ***
## rfm1          -2.804e+00  1.162e+00  -2.414   0.0158 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23040 on 1060032 degrees of freedom
## Multiple R-squared:  0.3244, Adjusted R-squared:  0.3244 
## F-statistic: 1.018e+05 on 5 and 1060032 DF,  p-value: < 2.2e-16

# Predict missing values
predicted_values_inc <- predict(lm_model_inc2, newdata = bank[bank$demog_inc2 == 0, ])

# Replace missing values with predicted values
bank$demog_inc2[bank$demog_inc2 == 0] <- predicted_values_inc
# Check the summary of the imputed variable
cat("Summary of imputed demog_inc2:\n")

## Summary of imputed demog_inc2:

describe(bank$demog_inc2)

##    vars       n     mean       sd median  trimmed      mad  min    max  range
## X1    1 1060038 46706.98 20544.33  44034 44691.92 18341.24 2493 200007 197514
##    skew kurtosis    se
## X1 1.32     3.71 19.95

#create a data frame of the predicted values
predicted_values_df <- data.frame(
  demog_inc2 = bank$demog_inc2[bank$demog_inc == 0],
  demog_homeval = bank$demog_homeval[bank$demog_inc == 0],
  demog_pr = bank$demog_pr[bank$demog_inc == 0],
  demog_ho = bank$demog_ho[bank$demog_inc == 0],
  rfm12 = bank$rfm12[bank$demog_inc == 0],
  rfm1 = bank$rfm1[bank$demog_inc == 0]
)

#plot the distribution of the predicted values for demog_inc2
ggplot(predicted_values_df, aes(x = demog_inc2)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Predicted Values for Customer Income", x = "Predicted Customer Income", y = "Frequency") +
  theme_minimal()

#show the observations where demog_inc > 200k
bank[bank$demog_inc > 200000, ]

## # A tibble: 203 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes     13000       1 X          A               46   yes             500005
##  2 no          0       0 X          A               50   yes             499981
##  3 no          0       0 X          C               58.9 yes                  0
##  4 yes      3000       1 Z          A               79   no              287506
##  5 no          0       0 X          B               85   yes             499986
##  6 yes      8000       3 X          B               49   yes             500009
##  7 no          0       0 X          A               73   yes             419308
##  8 yes      4000       1 X          A                6   no              323914
##  9 no          0       0 X          A               78   yes             471412
## 10 no          0       0 X          A               79   yes             368976
## # ℹ 193 more rows
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#show the observations where rfm4 > 8000
bank[bank$rfm4 > 8000, ]

## # A tibble: 11 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 no          0       0 X          E                 69 no               36275
##  2 no          0       0 X          E                 67 no               36295
##  3 no          0       0 Y          E                 68 no               36331
##  4 no          0       0 X          E                 68 no               36323
##  5 no          0       0 X          E                 68 no               36299
##  6 no          0       0 X          E                 68 no               36255
##  7 no          0       0 Y          E                 68 no               36284
##  8 no          0       0 Y          E                 69 no               36309
##  9 no          0       0 X          E                 67 no               36352
## 10 no          0       0 X          E                 67 no               36308
## 11 no          0       0 X          E                 68 no               36298
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#reclassify the observations where rfm4 > 8000 as 0
bank$rfm4[bank$rfm4 > 8000] <- 0

#show the observations where rfm2 > 1000
bank[bank$rfm2 > 500, ]

## # A tibble: 11 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 no          0       0 X          A               27.6 no              233171
##  2 no          0       0 Y          B               27.3 no              233151
##  3 no          0       0 X          A               27.6 no              233192
##  4 no          0       0 X          A               27.8 no              233204
##  5 no          0       0 X          A               27.3 no              233229
##  6 no          0       0 X          A               27.6 no              233212
##  7 no          0       0 X          A               27.8 no              233218
##  8 no          0       0 X          A               27.6 no              233211
##  9 no          0       0 X          A               27.3 no              233198
## 10 no          0       0 X          A               27.3 no              233177
## 11 no          0       0 X          A               27.6 no              233218
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#reclassify the observations where rfm2 > 500 as mean rfm2
bank$rfm2[bank$rfm2 > 500] <- mean(bank$rfm2[bank$rfm2 < 500], na.rm = TRUE)

#show the observations where rfm3 > 3000
bank[bank$rfm3 > 3000, ]

## # A tibble: 11 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 no          0       0 X          C                 50 no               60072
##  2 no          0       0 X          C                 50 no               60119
##  3 no          0       0 X          C                 50 no               60092
##  4 no          0       0 X          C                 50 no               60126
##  5 no          0       0 X          C                 50 no               60089
##  6 no          0       0 X          C                 50 no               60102
##  7 no          0       0 X          C                 50 no               60110
##  8 no          0       0 X          C                 50 no               60084
##  9 no          0       0 X          C                 50 no               60115
## 10 no          0       0 X          C                 50 no               60090
## 11 no          0       0 X          B                 50 no               60085
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#classify these as the mean of rfm3
bank$rfm3[bank$rfm3 > 3000] <- mean(bank$rfm3[bank$rfm3 < 3000], na.rm = TRUE)

#show the observations where rfm5 > 10
bank[bank$rfm5 == 18, ]

## # A tibble: 5 × 28
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 yes     17000       5 X          A                 82 yes              91509
## 2 no          0       0 X          B                 69 yes              83471
## 3 yes     17000       5 X          A                 82 yes              91463
## 4 no          0       0 X          B                 69 yes              83509
## 5 yes     17000       5 X          A                 82 yes              91486
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#classify these obs when in_tgt == 5 as mean of rfm5 for obs where int_tgt == 5
bank$rfm5[bank$rfm5 == 18] <- mean(bank$rfm5[bank$int_tgt == 5], na.rm = TRUE)

#classify these obs when int_tgt == 0 as mean of rfm5 for obs where int_tgt == 0
bank$rfm5[bank$rfm5 == 18] <- mean(bank$rfm5[bank$int_tgt == 0], na.rm = TRUE)

#show observations where rfm6 > 100
bank[bank$rfm6 > 120, ]

## # A tibble: 11 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes     19000       2 X          A                 37 yes              48604
##  2 yes     19000       2 X          A                 37 yes              48592
##  3 yes     19000       2 X          A                 37 yes              48581
##  4 yes     19000       2 X          A                 37 yes              48624
##  5 yes     19000       2 X          A                 38 yes              48588
##  6 yes     19000       2 X          A                 37 yes              48598
##  7 yes     19000       2 X          A                 37 yes              48657
##  8 yes     19000       2 X          A                 38 yes              48565
##  9 yes     19000       2 X          A                 37 yes              48609
## 10 yes     19000       2 X          A                 37 yes              48628
## 11 yes     19000       2 X          A                 36 yes              48624
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#reclassify these as the mean of rfm6 when int_tgt_bin == 0-20k
bank$rfm6[bank$rfm6 > 120] <- mean(bank$rfm6[bank$int_tgt_bin == "0-20k"], na.rm = TRUE)

#show observations where rfm7 == 11
bank[bank$rfm7 == 11, ]

## # A tibble: 1 × 28
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 yes     20000       4 X          B                 48 yes              56480
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#show observations where rfm8 == 46
bank[bank$rfm8 == 46, ]

## # A tibble: 5 × 28
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 no          0       0 Y          E                  8 no               36504
## 2 no          0       0 X          E                  8 no               36485
## 3 no          0       0 X          E                  8 no               36490
## 4 no          0       0 X          E                  8 no               36497
## 5 no          0       0 X          E                  8 no               36500
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#reclassify these as 0 
bank$rfm8[bank$rfm8 == 46] <- 0

#show observations for when demog_age < 21
bank[bank$demog_age < 21, ]

## # A tibble: 12,066 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes      5000       3 X          A                 16 no               96274
##  2 no          0       0 X          A                  8 no               59047
##  3 no          0       0 X          A                 18 no               75077
##  4 no          0       0 X          A                 17 yes             114000
##  5 no          0       0 X          A                 17 yes             499986
##  6 no          0       0 X          A                 17 no               40615
##  7 no          0       0 X          A                 18 no                   0
##  8 no          0       0 Y          B                  8 no               83072
##  9 no          0       0 X          A                 17 no              153013
## 10 no          0       0 X          A                  7 yes              95624
## # ℹ 12,056 more rows
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#reclassify rfm9 as 0 when demog_age < 21
bank$rfm9[bank$demog_age < 21] <- 0

#show observations when rfm12 >500
bank[bank$rfm12 > 500, ]

## # A tibble: 11 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes      9000       1 X          E                 82 no               23102
##  2 yes      9000       1 X          E                 81 no               23093
##  3 yes      9000       1 X          E                 81 no               23075
##  4 yes      9000       1 X          E                 81 no               23122
##  5 yes      9000       1 X          E                 81 no               23124
##  6 yes      9000       1 X          E                 81 no               23095
##  7 yes      9000       1 X          E                 82 no               23083
##  8 yes      9000       1 X          E                 80 no               23097
##  9 yes      9000       1 X          E                 81 no               23153
## 10 yes      9000       1 X          E                 81 no               23134
## 11 yes      9000       1 X          E                 81 no               23103
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#show observations where rfm3 < 0
bank[bank$rfm3 < 0, ]

## # A tibble: 0 × 28
## # ℹ 28 variables: b_tgt <fct>, int_tgt <dbl>, cnt_tgt <dbl>, cat_input1 <fct>,
## #   cat_input2 <fct>, demog_age <dbl>, demog_ho <fct>, demog_homeval <dbl>,
## #   demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>, rfm3 <dbl>,
## #   rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>, rfm9 <dbl>,
## #   rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>, demog_genm <fct>,
## #   account <dbl>, dataset <chr>, int_tgt_bin <chr>, demog_inc2 <dbl>

#count the number of observations where demog_age < 21
count_demog_age_lt_21 <- sum(bank$demog_age < 21)
cat("Number of observations with demog_age < 21:", count_demog_age_lt_21, "\n")

## Number of observations with demog_age < 21: 12066

#show the observations where demog_age < 21
bank[bank$demog_age < 21, ]

## # A tibble: 12,066 × 28
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 yes      5000       3 X          A                 16 no               96274
##  2 no          0       0 X          A                  8 no               59047
##  3 no          0       0 X          A                 18 no               75077
##  4 no          0       0 X          A                 17 yes             114000
##  5 no          0       0 X          A                 17 yes             499986
##  6 no          0       0 X          A                 17 no               40615
##  7 no          0       0 X          A                 18 no                   0
##  8 no          0       0 Y          B                  8 no               83072
##  9 no          0       0 X          A                 17 no              153013
## 10 no          0       0 X          A                  7 yes              95624
## # ℹ 12,056 more rows
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>

#exclude the observations where demog_age < 21
bank <- bank[bank$demog_age >= 21, ]

Feature Engineering

#Create a variable that is income squared
bank$demog_inc2_sq <- bank$demog_inc2^2

#create a variable that is rfm6 squared
bank$rfm6_sq <- bank$rfm6^2

#create a single binary indicator if the 3 conditions are true: 
#demog_age < 40
#if demog_pr <= 25 
#demog_ho == no
bank$prospect_ho <- ifelse(bank$demog_age < 40 & bank$demog_pr <= 25 & bank$demog_ho == "no", 1, 0)

#Create an interaction term between rfm2 and demog_inc
bank$rfm2_inc2 <- bank$rfm2 * bank$demog_inc2

Skewness

# Calculate skewness for each numeric variable
skewness_values <- sapply(bank, function(x) if(is.numeric(x)) e1071::skewness(x, na.rm = TRUE) else NA)

# Create a data frame with the variable names and skewness values
skewness_df <- data.frame(
  Variable = names(skewness_values),
  Skewness = skewness_values
)
# Filter out non-numeric variables
skewness_df <- skewness_df[!is.na(skewness_df$Skewness), ]
# Create the datatable
library(DT)
# Create the datatable
datatable(skewness_df, options = list(pageLength = 10)) %>%
  formatRound(columns = 2, digits = 2) %>%
  formatStyle(columns = 1, fontWeight = 'bold') %>%
  formatStyle(columns = 2, backgroundColor = 'white') %>%
  formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
  formatStyle(columns = 1, textAlign = 'center') %>%
  formatStyle(columns = 2, textAlign = 'center')

#export the datatable to a csv
write.csv(skewness_df, "skewness_df.csv", row.names = FALSE)

#Create a second data frame from the original data set called bank_log
bank_log <- bank

# Apply log transformation and add a small constant
bank_log$int_tgt <- log(bank_log$int_tgt + 1)
bank_log$cnt_tgt <- log(bank_log$cnt_tgt + 1)
bank_log$demog_homeval <- log(bank_log$demog_homeval + 1)
bank_log$demog_inc <- log(bank_log$demog_inc + 1)
bank_log$demog_age <- log(bank_log$demog_age + 1)
bank_log$rfm1 <- log(bank_log$rfm1 + 1)
bank_log$rfm2 <- log(bank_log$rfm2 + 1)
bank_log$rfm3 <- log(bank_log$rfm3 + 1)
bank_log$rfm4 <- log(bank_log$rfm4 + 1)
bank_log$rfm5 <- log(bank_log$rfm5 + 1)
bank_log$rfm6 <- log(bank_log$rfm6 + 1)
bank_log$rfm7 <- log(bank_log$rfm7 + 1)
bank_log$rfm8 <- log(bank_log$rfm8 + 1)
bank_log$rfm9 <- log(bank_log$rfm9 + 1)
bank_log$rfm10 <- log(bank_log$rfm10 + 1)
bank_log$rfm11 <- log(bank_log$rfm11 + 1)
bank_log$rfm12 <- log(bank_log$rfm12 + 1)
bank_log$rfm2_inc2 <- log(bank_log$rfm2_inc2 + 1)
bank_log$demog_inc2_sq <- log(bank_log$demog_inc2_sq + 1)
bank_log$rfm6_sq <- log(bank_log$rfm6_sq + 1)
bank_log$demog_inc2 <- log(bank_log$demog_inc2 + 1)


#check for NaNs
na_count <- sum(is.na(bank_log))

cat("Number of NaN values in bank_log:", na_count, "\n")

## Number of NaN values in bank_log: 5

#count the NaNs for the log transformed variables
log_vars <- c("int_tgt", "cnt_tgt", "demog_homeval", "demog_inc", "demog_age", "rfm1", "rfm2", "rfm3", "rfm4", "rfm5", "rfm6", "rfm7", "rfm8", "rfm9", "rfm10", "rfm11", "rfm12","demog_inc2_sq","rfm2_inc2")
na_counts <- sapply(bank_log[log_vars], function(x) sum(is.na(x)))
# Create a data frame with the variable names and NaN counts
na_counts_df <- data.frame(
  Variable = names(na_counts),
  NaN_Count = na_counts
)
# Create the datatable
library(DT)
# Create the datatable
datatable(na_counts_df, options = list(pageLength = 10)) %>%
  formatRound(columns = 2, digits = 2) %>%
  formatStyle(columns = 1, fontWeight = 'bold') %>%
  formatStyle(columns = 2, backgroundColor = 'white') %>%
  formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
  formatStyle(columns = 1, textAlign = 'center') %>%
  formatStyle(columns = 2, textAlign = 'center')

#create a datatable for the skewness of each variable in the bank_log data set 
# Calculate skewness for each numeric variable
skewness_values_log <- sapply(bank_log, function(x) if(is.numeric(x)) e1071::skewness(x, na.rm = TRUE) else NA)
# Create a data frame with the variable names and skewness values
skewness_df_log <- data.frame(
  Variable = names(skewness_values_log),
  Skewness = skewness_values_log
)

#Create a data table
# Filter out non-numeric variables
skewness_df_log <- skewness_df_log[!is.na(skewness_df_log$Skewness), ]
# Create the datatable
library(DT)
# Create the datatable
datatable(skewness_df_log, options = list(pageLength = 10)) %>%
  formatRound(columns = 2, digits = 2) %>%
  formatStyle(columns = 1, fontWeight = 'bold') %>%
  formatStyle(columns = 2, backgroundColor = 'white') %>%
  formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
  formatStyle(columns = 1, textAlign = 'center') %>%
  formatStyle(columns = 2, textAlign = 'center')

#export the datatable to a csv
write.csv(skewness_df_log, "skewness_df_log.csv", row.names = FALSE)

#show observations where rfm5 is na
bank_log[is.na(bank_log$rfm5), ]

## # A tibble: 5 × 32
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 yes      9.74    1.79 X          A               4.42 yes               11.4
## 2 no       0       0    X          B               4.25 yes               11.3
## 3 yes      9.74    1.79 X          A               4.42 yes               11.4
## 4 no       0       0    X          B               4.25 yes               11.3
## 5 yes      9.74    1.79 X          A               4.42 yes               11.4
## # ℹ 24 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>, demog_inc2_sq <dbl>, rfm6_sq <dbl>, prospect_ho <dbl>,
## #   rfm2_inc2 <dbl>

#impute the NANs as the mean of rfm5 when int_tgt_bin == 0-20k
bank_log$rfm5[is.na(bank_log$rfm5)] <- mean(bank_log$rfm5[bank_log$int_tgt_bin == "0-20k"], na.rm = TRUE)

# Check the summary of the imputed variable
summary(bank_log$rfm5)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.6931  1.0986  1.2321  1.6094  2.8904

head(bank_log)

## # A tibble: 6 × 32
##   b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##   <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
## 1 yes      8.85   0.693 X          A               4.11 no                11.0
## 2 yes      8.85   1.10  X          A               4.11 no                11.0
## 3 yes      9.62   1.10  X          A               4.06 no                10.7
## 4 no       0      0     X          A               4.23 no                11.4
## 5 no       0      0     X          A               4.14 no                11.5
## 6 no       0      0     X          A               3.30 no                10.2
## # ℹ 24 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>, demog_inc2_sq <dbl>, rfm6_sq <dbl>, prospect_ho <dbl>,
## #   rfm2_inc2 <dbl>

#show the values where demog_inc == 0
#use the bank_log data set
bank_log[bank_log$demog_inc == 0, ]

## # A tibble: 251,175 × 32
##    b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
##    <fct>   <dbl>   <dbl> <fct>      <fct>          <dbl> <fct>            <dbl>
##  1 no       0      0     X          A               4.14 no                11.5
##  2 no       0      0     X          A               4.32 yes               11.5
##  3 no       0      0     X          A               4.36 yes                0  
##  4 no       0      0     X          A               4.07 no                10.7
##  5 no       0      0     X          A               4.43 no                12.1
##  6 no       0      0     X          A               4.41 no                11.0
##  7 yes      9.21   0.693 X          A               4.07 no                12.7
##  8 yes      9.74   1.10  Y          B               4.04 no                11.4
##  9 yes      9.11   1.10  X          A               4.03 no                11.4
## 10 no       0      0     X          A               4.38 no                11.1
## # ℹ 251,165 more rows
## # ℹ 24 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## #   rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## #   rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## #   demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## #   demog_inc2 <dbl>, demog_inc2_sq <dbl>, rfm6_sq <dbl>, prospect_ho <dbl>,
## #   rfm2_inc2 <dbl>

#un log the cnt_tgt variable in the bank_log data set
bank_log$cnt_tgt <- exp(bank_log$cnt_tgt) - 1
#un log the demog_pr variable in the bank_log data set
bank_log$demog_pr <- exp(bank_log$demog_pr) - 1

#plot the density for demog_age
#use the bank_log data set
ggplot(bank_log, aes(x = demog_age)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(title = "Density Plot of Customer Age", x = "Customer Age", y = "Density") +
  theme_minimal()

#label the x axis for increments of .25
ggplot(bank_log, aes(x = demog_age)) +
  geom_density(fill = "blue", alpha = 0.5) +
  scale_x_continuous(breaks = seq(0, 10, by = 0.25)) +
  labs(title = "Density Plot of Customer Age", x = "Customer Age", y = "Density") +
  theme_minimal()

#plot the density for rfm2
#label the x axis for increments of .25
ggplot(bank_log, aes(x = rfm2)) +
  geom_density(fill = "blue", alpha = 0.5) +
  scale_x_continuous(breaks = seq(0, 10, by = 0.25)) +
  labs(title = "Density Plot of Customer RFM2", x = "Customer RFM2", y = "Density") +
  theme_minimal()

#plot the density of demog_homeval
#label the x axis for increments of .25
ggplot(bank_log, aes(x = demog_homeval)) +
  geom_density(fill = "blue", alpha = 0.5) +
  scale_x_continuous(breaks = seq(0, 50, by = 1)) +
  labs(title = "Density Plot of Customer Home Value", x = "Customer Home Value", y = "Density") +
  theme_minimal()

#plot the density of demog_inc2
#label the x axis for increments of .25
ggplot(bank_log, aes(x = demog_inc2)) +
  geom_density(fill = "blue", alpha = 0.5) +
  scale_x_continuous(breaks = seq(0, 50, by = 1)) +
  labs(title = "Density Plot of Customer Income", x = "Customer Income", y = "Density") +
  theme_minimal()

#plot the scatter plot of demog_inc2 and demog_homeval
#highlight the points where B_tgt == 1
ggplot(bank_log, aes(x = demog_inc2, y = demog_homeval, color = b_tgt)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter Plot of Customer Income vs Home Value", x = "Customer Income", y = "Customer Home Value") +
  theme_minimal() +
  scale_color_manual(values = c("red", "blue"))

#create an interaction term of demog_inc and demog_homeval
bank_log$demog_inc_homeval <- bank_log$demog_inc * bank_log$demog_homeval

library(psych)

describe(bank_log)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##                   vars       n         mean           sd       median       min
## b_tgt                1 1047972          NaN           NA           NA       Inf
## int_tgt              2 1047972 1.780000e+00 3.630000e+00 0.000000e+00 0.000e+00
## cnt_tgt              3 1047972 3.100000e-01 7.000000e-01 0.000000e+00 0.000e+00
## cat_input1           4 1047972          NaN           NA           NA       Inf
## cat_input2           5 1047972          NaN           NA           NA       Inf
## demog_age            6 1047972 4.070000e+00 2.500000e-01 4.080000e+00 3.090e+00
## demog_ho             7 1047972          NaN           NA           NA       Inf
## demog_homeval        8 1047972 1.120000e+01 1.380000e+00 1.121000e+01 0.000e+00
## demog_inc            9 1047972 8.230000e+00 4.630000e+00 1.067000e+01 0.000e+00
## demog_pr            10 1047972 1.517608e+39 1.707822e+41 2.904885e+13 0.000e+00
## rfm1                11 1047972 2.690000e+00 5.700000e-01 2.770000e+00 0.000e+00
## rfm2                12 1047972 2.550000e+00 4.700000e-01 2.540000e+00 9.500e-01
## rfm3                13 1047972 2.690000e+00 5.200000e-01 2.770000e+00 0.000e+00
## rfm4                14 1047972 2.760000e+00 5.400000e-01 2.770000e+00 0.000e+00
## rfm5                15 1047972 1.230000e+00 5.200000e-01 1.100000e+00 0.000e+00
## rfm6                16 1047972 2.040000e+00 8.100000e-01 2.080000e+00 0.000e+00
## rfm7                17 1047972 8.200000e-01 5.700000e-01 6.900000e-01 0.000e+00
## rfm8                18 1047972 1.520000e+00 7.600000e-01 1.610000e+00 0.000e+00
## rfm9                19 1047972 2.930000e+00 2.600000e-01 2.940000e+00 1.100e+00
## rfm10               20 1047972 2.590000e+00 2.700000e-01 2.560000e+00 0.000e+00
## rfm11               21 1047972 1.820000e+00 2.300000e-01 1.950000e+00 0.000e+00
## rfm12               22 1047972 4.060000e+00 6.300000e-01 4.170000e+00 0.000e+00
## demog_genf          23 1047972          NaN           NA           NA       Inf
## demog_genm          24 1047972          NaN           NA           NA       Inf
## account             25 1047972 1.005300e+08 3.060058e+05 1.005300e+08 1.000e+08
## dataset             26 1047972          NaN           NA           NA       Inf
## int_tgt_bin         27 1047972          NaN           NA           NA       Inf
## demog_inc2          28 1047972 1.066000e+01 4.300000e-01 1.069000e+01 7.820e+00
## demog_inc2_sq       29 1047972 2.132000e+01 8.600000e-01 2.138000e+01 1.564e+01
## rfm6_sq             30 1047972 3.800000e+00 1.790000e+00 3.910000e+00 0.000e+00
## prospect_ho         31 1047972 1.000000e-02 1.100000e-01 0.000000e+00 0.000e+00
## rfm2_inc2           32 1047972 1.311000e+01 6.900000e-01 1.311000e+01 9.780e+00
## demog_inc_homeval   33 1047972 9.292000e+01 5.318000e+01 1.175800e+02 0.000e+00
##                           max        range   skew kurtosis           se
## b_tgt                    -Inf         -Inf     NA       NA           NA
## int_tgt           1.22100e+01 1.221000e+01   1.57     0.51 0.000000e+00
## cnt_tgt           5.00000e+00 5.000000e+00   2.40     5.57 0.000000e+00
## cat_input1               -Inf         -Inf     NA       NA           NA
## cat_input2               -Inf         -Inf     NA       NA           NA
## demog_age         4.50000e+00 1.410000e+00  -0.77     0.54 0.000000e+00
## demog_ho                 -Inf         -Inf     NA       NA           NA
## demog_homeval     1.33000e+01 1.330000e+01  -5.99    46.94 0.000000e+00
## demog_inc         1.22100e+01 1.221000e+01  -1.21    -0.52 0.000000e+00
## demog_pr          7.30706e+43 7.307060e+43 178.52 46856.25 1.668276e+38
## rfm1              8.22000e+00 8.220000e+00  -1.11     6.10 0.000000e+00
## rfm2              6.08000e+00 5.130000e+00   0.36     1.28 0.000000e+00
## rfm3              7.60000e+00 7.600000e+00   0.12     1.38 0.000000e+00
## rfm4              8.22000e+00 8.220000e+00  -0.48     3.80 0.000000e+00
## rfm5              2.89000e+00 2.890000e+00  -0.22    -0.02 0.000000e+00
## rfm6              4.80000e+00 4.800000e+00  -0.21    -0.48 0.000000e+00
## rfm7              2.48000e+00 2.480000e+00  -0.03    -0.81 0.000000e+00
## rfm8              3.61000e+00 3.610000e+00  -0.14    -0.62 0.000000e+00
## rfm9              3.40000e+00 2.300000e+00  -2.52    10.10 0.000000e+00
## rfm10             4.36000e+00 4.360000e+00   0.72     4.25 0.000000e+00
## rfm11             3.14000e+00 3.140000e+00  -1.35     5.47 0.000000e+00
## rfm12             6.35000e+00 6.350000e+00  -0.38    -1.04 0.000000e+00
## demog_genf               -Inf         -Inf     NA       NA           NA
## demog_genm               -Inf         -Inf     NA       NA           NA
## account           1.01060e+08 1.060037e+06   0.00    -1.20 2.989200e+02
## dataset                  -Inf         -Inf     NA       NA           NA
## int_tgt_bin              -Inf         -Inf     NA       NA           NA
## demog_inc2        1.22100e+01 4.380000e+00  -0.10    -0.23 0.000000e+00
## demog_inc2_sq     2.44100e+01 8.770000e+00  -0.10    -0.23 0.000000e+00
## rfm6_sq           9.58000e+00 9.580000e+00  -0.20    -0.73 0.000000e+00
## prospect_ho       1.00000e+00 1.000000e+00   8.84    76.19 0.000000e+00
## rfm2_inc2         1.72100e+01 7.430000e+00   0.13     0.44 0.000000e+00
## demog_inc_homeval 1.62340e+02 1.623400e+02  -1.11    -0.61 5.000000e-02

Data Conversion to H2o

library(rJavaEnv)
java_install(
  java_distrib_path = "C:/Program Files/R/R-4.4.3/amazon-corretto-17-x64-windows-jdk.zip",
  autoset_java_env = TRUE
)

## Java distribution amazon-corretto-17-x64-windows-jdk.zip already unpacked at
## 'C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17'
## Junction creation failed. This is likely because the project directory is not
## on the same disk as the R package cache directory. Java files will instead be
## copied to
## 'C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/rjavaenv/windows/x64/17'

## Warning in dir.create(project_version_path, recursive = TRUE):
## 'C:\Users\andre\OneDrive\Desktop\School\MSQE\1_Coursework\Data\ECON_562\Projects\Final
## Project\R_files\rjavaenv\windows\x64\17' already exists

## Java copied to project
## 'C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/rjavaenv/windows/x64/17'
## ✔ Current R Session: JAVA_HOME and PATH set to 'C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17'
## 
## ✔ Current R Project/Working Directory: JAVA_HOME and PATH set to ''C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17'' in .Rprofile at ''C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files''
## 
## Java 17 (amazon-corretto-17-x64-windows-jdk.zip) for windows x64 installed at
## 'C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17' and
## symlinked to
## 'C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/rjavaenv/windows/x64/17'

## [1] "C:\\Users\\andre\\AppData\\Local/R/cache/R/rJavaEnv/installed/windows/x64/17"

#initialize h20
library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

# Start H2O
h2o.init(nthreads = 5, max_mem_size = "16G")

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\andre\AppData\Local\Temp\RtmpQnd2cQ\file5e4c29bc7169/h2o_andre_started_from_r.out
##     C:\Users\andre\AppData\Local\Temp\RtmpQnd2cQ\file5e4ce054b37/h2o_andre_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 830 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year and 5 months 
##     H2O cluster name:           H2O_started_from_R_andre_zxa961 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   16.00 GB 
##     H2O cluster total cores:    16 
##     H2O cluster allowed cores:  5 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.3 (2025-02-28 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year and 5 months) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Convert the data frame to an H2O frame
bank_h2o <- as.h2o(bank_log)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Check if 'dataset' column exists
print("dataset" %in% colnames(bank_h2o))

## [1] TRUE

# Look at all column names to confirm
print(colnames(bank_h2o))

##  [1] "b_tgt"             "int_tgt"           "cnt_tgt"          
##  [4] "cat_input1"        "cat_input2"        "demog_age"        
##  [7] "demog_ho"          "demog_homeval"     "demog_inc"        
## [10] "demog_pr"          "rfm1"              "rfm2"             
## [13] "rfm3"              "rfm4"              "rfm5"             
## [16] "rfm6"              "rfm7"              "rfm8"             
## [19] "rfm9"              "rfm10"             "rfm11"            
## [22] "rfm12"             "demog_genf"        "demog_genm"       
## [25] "account"           "dataset"           "int_tgt_bin"      
## [28] "demog_inc2"        "demog_inc2_sq"     "rfm6_sq"          
## [31] "prospect_ho"       "rfm2_inc2"         "demog_inc_homeval"

# Get description of your H2O frame including types
h2o.describe(bank_h2o)

##                Label   Type Missing   Zeros PosInf NegInf          Min
## 1              b_tgt   enum       0  839461      0      0 0.000000e+00
## 2            int_tgt   real       0  843311      0      0 0.000000e+00
## 3            cnt_tgt   real       0  839461      0      0 0.000000e+00
## 4         cat_input1   enum       0  820863      0      0 0.000000e+00
## 5         cat_input2   enum       0  185840      0      0 0.000000e+00
## 6          demog_age   real       0       0      0      0 3.091042e+00
## 7           demog_ho   enum       0  470985      0      0 0.000000e+00
## 8      demog_homeval   real       0   12016      0      0 0.000000e+00
## 9          demog_inc   real       0  251175      0      0 0.000000e+00
## 10          demog_pr   real       0   30838      0      0 0.000000e+00
## 11              rfm1   real       0   14096      0      0 0.000000e+00
## 12              rfm2   real       0       0      0      0 9.477894e-01
## 13              rfm3   real       0       2      0      0 0.000000e+00
## 14              rfm4   real       0    4489      0      0 0.000000e+00
## 15              rfm5   real       0   52785      0      0 0.000000e+00
## 16              rfm6   real       0   17583      0      0 0.000000e+00
## 17              rfm7   real       0  246969      0      0 0.000000e+00
## 18              rfm8   real       0   76595      0      0 0.000000e+00
## 19              rfm9   real       0       0      0      0 1.098612e+00
## 20             rfm10   real       0       4      0      0 0.000000e+00
## 21             rfm11   real       0     756      0      0 0.000000e+00
## 22             rfm12   real       0      11      0      0 0.000000e+00
## 23        demog_genf   enum       0  459070      0      0 0.000000e+00
## 24        demog_genm   enum       0  588902      0      0 0.000000e+00
## 25           account    int       0       0      0      0 1.000000e+08
## 26           dataset string       0       0      0      0          NaN
## 27       int_tgt_bin string       0       0      0      0          NaN
## 28        demog_inc2   real       0       0      0      0 7.821643e+00
## 29     demog_inc2_sq   real       0       0      0      0 1.564248e+01
## 30           rfm6_sq   real       0   17583      0      0 0.000000e+00
## 31       prospect_ho    int       0 1035062      0      0 0.000000e+00
## 32         rfm2_inc2   real       0       0      0      0 9.779775e+00
## 33 demog_inc_homeval   real       0  253346      0      0 0.000000e+00
##             Max         Mean        Sigma Cardinality
## 1  1.000000e+00 1.989662e-01 3.992227e-01           2
## 2  1.220608e+01 1.778894e+00 3.627034e+00          NA
## 3  5.000000e+00 3.106866e-01 6.984116e-01          NA
## 4  2.000000e+00           NA           NA           3
## 5  4.000000e+00           NA           NA           5
## 6  4.499810e+00 4.065895e+00 2.520369e-01          NA
## 7  1.000000e+00 5.505748e-01 4.974358e-01           2
## 8  1.330480e+01 1.119889e+01 1.379983e+00          NA
## 9  1.220611e+01 8.229451e+00 4.629188e+00          NA
## 10 7.307060e+43 1.517608e+39 1.707822e+41          NA
## 11 8.219948e+00 2.686471e+00 5.712160e-01          NA
## 12 6.079933e+00 2.545268e+00 4.677873e-01          NA
## 13 7.601902e+00 2.688487e+00 5.150642e-01          NA
## 14 8.219948e+00 2.760929e+00 5.441262e-01          NA
## 15 2.890372e+00 1.232083e+00 5.197502e-01          NA
## 16 4.795791e+00 2.044768e+00 8.119793e-01          NA
## 17 2.484907e+00 8.231268e-01 5.667126e-01          NA
## 18 3.610918e+00 1.518190e+00 7.623620e-01          NA
## 19 3.401197e+00 2.934654e+00 2.609206e-01          NA
## 20 4.356709e+00 2.589426e+00 2.737642e-01          NA
## 21 3.135494e+00 1.824366e+00 2.349549e-01          NA
## 22 6.349139e+00 4.057001e+00 6.258354e-01          NA
## 23 1.000000e+00 5.619444e-01 4.961483e-01           2
## 24 1.000000e+00 4.380556e-01 4.961483e-01           2
## 25 1.010600e+08 1.005300e+08 3.060058e+05          NA
## 26          NaN           NA           NA          NA
## 27          NaN           NA           NA          NA
## 28 1.220611e+01 1.065957e+01 4.314523e-01          NA
## 29 2.441222e+01 2.131909e+01 8.629272e-01          NA
## 30 9.575053e+00 3.797591e+00 1.789999e+00          NA
## 31 1.000000e+00 1.231903e-02 1.103054e-01          NA
## 32 1.721277e+01 1.311257e+01 6.858268e-01          NA
## 33 1.623359e+02 9.292136e+01 5.318459e+01          NA

# This creates a numeric column based on the string values
bank_h2o$dataset_num <- h2o.asnumeric(bank_h2o$dataset)

# Check the levels of the 'dataset' column
print(h2o.levels(bank_h2o["dataset_num"]))

## NULL

#check unique values in the dataset column
print(h2o.unique(bank_h2o["dataset_num"]))

##   C1
## 1  2
## 2  1
## 3  3
## 
## [3 rows x 1 column]

Data Partitioning

# Training set: where dataset == 1
bank_train <- bank_h2o[bank_h2o$dataset_num == 1, ]

# Validation set: where dataset == 2
bank_test <- bank_h2o[bank_h2o$dataset_num == 2, ]

# Test set: where dataset == 3
bank_test_final <- bank_h2o[bank_h2o$dataset_num == 3, ]

#check the dimensions of the resulting data frames
print(dim(bank_train))

## [1] 628940     34

print(dim(bank_test))

## [1] 209366     34

print(dim(bank_test_final))

## [1] 209666     34

#count the number of observations where b_tgt == 1
count_b_tgt_1 <- h2o.nrow(bank_train[bank_train$b_tgt == "yes", ])
cat("Number of observations where b_tgt == 1 in the training set:", count_b_tgt_1, "\n")

## Number of observations where b_tgt == 1 in the training set: 125285

#count the number of observations where b_tgt == "no"
count_b_tgt_0 <- h2o.nrow(bank_train[bank_train$b_tgt == "no", ])
cat("Number of observations where b_tgt == 0 in the training set:", count_b_tgt_0, "\n")

## Number of observations where b_tgt == 0 in the training set: 503655

SMOTE for training data

#check if the themis package is installed 
if (!requireNamespace("themis", quietly = TRUE)) {
  install.packages("themis")
}

library(recipes)

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(themis)

# Convert H2OFrame to R dataframe
bank_train_df <- as.data.frame(bank_train)
#remove int_tgt_bin and  column
bank_train_df <- bank_train_df[, !colnames(bank_train_df) %in% c("int_tgt_bin", "dataset_num")]

# Apply SMOTE to balance the classes
bank_train_smote <- bank_train_df %>%
  recipe(b_tgt ~ .) %>%
  step_rm(dataset ) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(b_tgt, over_ratio = 1) %>%
  prep() %>%
  bake(new_data = NULL)

# Convert back to H2OFrame if needed
bank_train_balanced <- as.h2o(bank_train_smote)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

#verify the balance of the classes
# Count the number of observations where b_tgt == 1
count_b_tgt_1 <- h2o.nrow(bank_train_balanced[bank_train_balanced$b_tgt == "yes", ])
cat("Number of observations where b_tgt == 1 in the training set:", count_b_tgt_1, "\n")

## Number of observations where b_tgt == 1 in the training set: 503655

# Count the number of observations where b_tgt == 0
count_b_tgt_0 <- h2o.nrow(bank_train_balanced[bank_train_balanced$b_tgt == "no", ])
cat("Number of observations where b_tgt == 0 in the training set:", count_b_tgt_0, "\n")

## Number of observations where b_tgt == 0 in the training set: 503655

# Check the distribution of the target variable
h2o.table(bank_train_balanced$b_tgt)

##   b_tgt  Count
## 1    no 503655
## 2   yes 503655
## 
## [2 rows x 2 columns]

#check the dimensions of the balanced data frame
print(dim(bank_train_balanced))

## [1] 1007310      35

column_headers <- names(bank_train)
print(column_headers)

##  [1] "b_tgt"             "int_tgt"           "cnt_tgt"          
##  [4] "cat_input1"        "cat_input2"        "demog_age"        
##  [7] "demog_ho"          "demog_homeval"     "demog_inc"        
## [10] "demog_pr"          "rfm1"              "rfm2"             
## [13] "rfm3"              "rfm4"              "rfm5"             
## [16] "rfm6"              "rfm7"              "rfm8"             
## [19] "rfm9"              "rfm10"             "rfm11"            
## [22] "rfm12"             "demog_genf"        "demog_genm"       
## [25] "account"           "dataset"           "int_tgt_bin"      
## [28] "demog_inc2"        "demog_inc2_sq"     "rfm6_sq"          
## [31] "prospect_ho"       "rfm2_inc2"         "demog_inc_homeval"
## [34] "dataset_num"

print(dim(bank_train))

## [1] 628940     34

print(dim(bank_test))

## [1] 209366     34

print(dim(bank_h2o))

## [1] 1047972      34

print(dim(bank_train_balanced))

## [1] 1007310      35

col_headers <- names(bank_train_balanced)
print(col_headers)

##  [1] "int_tgt"           "cnt_tgt"           "demog_age"        
##  [4] "demog_homeval"     "demog_inc"         "demog_pr"         
##  [7] "rfm1"              "rfm2"              "rfm3"             
## [10] "rfm4"              "rfm5"              "rfm6"             
## [13] "rfm7"              "rfm8"              "rfm9"             
## [16] "rfm10"             "rfm11"             "rfm12"            
## [19] "account"           "demog_inc2"        "demog_inc2_sq"    
## [22] "rfm6_sq"           "prospect_ho"       "rfm2_inc2"        
## [25] "demog_inc_homeval" "cat_input1_Y"      "cat_input1_Z"     
## [28] "cat_input2_B"      "cat_input2_C"      "cat_input2_D"     
## [31] "cat_input2_E"      "demog_ho_yes"      "demog_genf_yes"   
## [34] "demog_genm_yes"    "b_tgt"

B_Tgt Candidate Models

AutoML for B_tgt Regular training data

# Set the response variable
response <- "b_tgt"

# Set the predictor variables
#include all variables from the bank_train_balanced data set
predictors <- c(  "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr",  
                "rfm2", "rfm3",  "rfm10", "rfm11", "rfm4", "rfm6", "rfm8", "rfm9",
                  "rfm12",  "demog_genf", "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" , "cat_input1" , "cat_input2")

# Set the training and validation sets
train_set <- bank_train
valid_set <- bank_test
# Set the test set
test_set <- bank_test_final

# Set the stopping criteria
stopping_criteria <- list(
  stopping_rounds = 10,
  stopping_metric = "AUC",
  stopping_tolerance = 0.01
)

#Algorithms to include
algorithms <- c("GLM", "DRF", "XGBoost", "GBM")

# Set exploitation and exploration parameters
exploration_params <- list(
  max_runtime_secs = 2700, # Run for maximum 1 hour
  max_models = 50,         # Build maximum 50 models
  seed = 1234              # Set a seed for reproducibility
)

exploration_ratio <- 0.3
sort_metric <- "AUC"

# Set the AutoML parameters
automl_params_B_tgt <- list(
  x = predictors,
  y = response,
  training_frame = train_set,
  validation_frame = valid_set,
  leaderboard_frame = test_set,
  max_runtime_secs = exploration_params$max_runtime_secs,
  max_models = exploration_params$max_models,
  seed = exploration_params$seed,
  stopping_rounds = stopping_criteria$stopping_rounds,
  stopping_metric = stopping_criteria$stopping_metric,
  stopping_tolerance = stopping_criteria$stopping_tolerance,
  sort_metric = sort_metric,
  exclude_algos = c("DeepLearning", "StackedEnsemble"),
  include_algos = algorithms
)

# Run AutoML
automl_B_tgt_org <- h2o.automl(
  x = predictors,
  y = response,
  training_frame = train_set,
  validation_frame = valid_set,  # For validation during training
  max_runtime_secs = exploration_params$max_runtime_secs,
  max_models = exploration_params$max_models,
  seed = exploration_params$seed,
  stopping_rounds = stopping_criteria$stopping_rounds,
  stopping_metric = stopping_criteria$stopping_metric,
  stopping_tolerance = stopping_criteria$stopping_tolerance,
  sort_metric = sort_metric,
  include_algos = algorithms
)

# 2. Get the leaderboard (ranked on validation data)
leaderboard <- h2o.get_leaderboard(automl_B_tgt_org)
print("AutoML Leaderboard (ranked on validation data):")
print(leaderboard)

# 3. Access and evaluate the best model on validation data
best_model <- automl_B_tgt_org@leader
val_perf <- h2o.performance(best_model, valid_set)
# 4. Print detailed validation metrics
print("Best Model Performance on Validation Data:")
print(val_perf)
print(paste("Validation AUC:", h2o.auc(val_perf)))
print(paste("Validation F1 Score:", h2o.F1(val_perf)))
print(paste("Validation Accuracy:", h2o.accuracy(val_perf)))
print(paste("Validation LogLoss:", h2o.logloss(val_perf)))
print(paste("Validation RMSE:", h2o.rmse(val_perf)))

# Get comprehensive model information
model_details <- h2o.getModel(best_model@model_id)

# Print summary of the model (includes parameters and performance metrics)
summary(model_details)

# For a more structured view of just the parameters
params_table <- as.data.frame(model_details@parameters)
print(params_table)

#export the summary to a csv
write.csv(params_table, "model_details.csv", row.names = FALSE)

# Get the variable importance
varimp <- h2o.varimp(best_model)

# Convert to a data frame
varimp_df <- as.data.frame(varimp)
# Sort by relative importance
varimp_df <- varimp_df[order(varimp_df$relative_importance, decreasing = TRUE), ]

#plot the variable importance
library(ggplot2)
ggplot(varimp_df, aes(x = reorder(variable, -relative_importance), y = relative_importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance: Best B_tgt Model (DRF)", x = "Variables", y = "Relative Importance") +
  theme_minimal()

# show the confusion matrix for the best model
confusion_matrix <- h2o.confusionMatrix(best_model, valid_set)
print("Confusion Matrix:")
print(confusion_matrix)

#pull the threshold for the best model
threshold <- h2o.find_threshold_by_max_metric(val_perf, "f1")
print(paste("Best Threshold for F1 Score:", threshold))

library(h2o)
# Generate partial dependence plots for rfm3
drf_model <- h2o.getModel("DRF_1_AutoML_1_20250517_94015")

pdp_rfm3 <- h2o.partialPlot(
  object = drf_model,
  data = train_set,
  cols = c("rfm3")
)

# Generate partial dependence plots for rfm2
pdp_rfm2 <- h2o.partialPlot(
  object = drf_model,
  data = train_set,
  cols = c("rfm2")
)

# Plot the results
plot(pdp_rfm3)
plot(pdp_rfm2)

# Get all models from your AutoML run
all_models <- as.data.frame(automl_B_tgt_org@leaderboard)

# Filter for GLM models
glm_models <- all_models[grepl("GLM", all_models$model_id), ]

# Get the top GLM model ID
if (nrow(glm_models) > 0) {
  top_glm_id <- as.character(glm_models$model_id[1])
  best_glm_model <- h2o.getModel(top_glm_id)
  
  # View the model summary
  summary(best_glm_model)
} else {
  print("No GLM models found in the AutoML run")
}

#export the confusion matrix 
write.csv(confusion_matrix, "confusion_matrix.csv", row.names = FALSE)

#pull the threshold for the GLM model
threshold_glm <- h2o.find_threshold_by_max_metric(val_perf, "f1")
print(paste("Best Threshold for F1 Score (GLM):", threshold_glm))

# Filter for GBM models
gbm_models <- all_models[grepl("GBM", all_models$model_id), ]
# Get the top GBM model ID
if (nrow(gbm_models) > 0) {
  top_gbm_id <- as.character(gbm_models$model_id[1])
  best_gbm_model <- h2o.getModel(top_gbm_id)
  
  # View the model summary
  summary(best_gbm_model)
} else {
  print("No GBM models found in the AutoML run")
}

#pull the threshold for the GBM model
threshold_gbm <- h2o.find_threshold_by_max_metric(val_perf, "f1")
print(paste("Best Threshold for F1 Score (GBM):", threshold_gbm))

B_TGT: Test Data Evaluation

#Evaluate DRF model DRF_1_AutoML_1_20250517_94015 on the test data
#predict B-tgt
#Get Model ID
model_id <- automl_B_tgt_org@leaderboard[1, "model_id"]

# Get the model
best_model <- h2o.getModel(model_id)
#Print model ID
print(paste("Model ID:", model_id))

#check dimensions of the test set
print(dim(bank_test_final))

# Make predictions on the test set
predictions <- h2o.predict(best_model, newdata = bank_test_final)
# Convert predictions to a data frame
predictions_df <- as.data.frame(predictions)
# Add the original test set data to the predictions
predictions_df <- cbind(as.data.frame(bank_test_final), predictions_df)

#Confusion Matrix
confusion_matrix_test <- h2o.confusionMatrix(best_model, bank_test_final)
print("Confusion Matrix for Test Set:")
print(confusion_matrix_test)

# AUC and ROC curve
roc_curve <- h2o.performance(best_model, newdata = bank_test_final)
auc_value <- h2o.auc(roc_curve)

#-------- ADDITIONAL METRICS --------#

# Assuming positive class is 1 (you may need to adjust based on your actual class labels)
tryCatch({
  # Extract values from confusion matrix
  tn <- confusion_matrix_test[1,1]  # True Negatives
  fp <- confusion_matrix_test[1,2]  # False Positives
  fn <- confusion_matrix_test[2,1]  # False Negatives
  tp <- confusion_matrix_test[2,2]  # True Positives
  
  # Calculate metrics
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  f1 <- 2 * precision * recall / (precision + recall)
  accuracy <- (tp + tn) / (tp + tn + fp + fn)
  sensitivity <- recall
  specificity <- tn / (tn + fp)
  
  cat("\nMetrics calculated from confusion matrix:\n")
  cat("Precision:", precision, "\n")
  cat("Recall:", recall, "\n")
  cat("F1 Score:", f1, "\n")
  cat("Accuracy:", accuracy, "\n")
  cat("Sensitivity:", sensitivity, "\n")
  cat("Specificity:", specificity, "\n")
  cat("AUC:", auc_value, "\n")
  
}, error = function(e) {
  cat("Could not calculate metrics from confusion matrix:", e$message, "\n")
})

# Plot ROC curve
plot(roc_curve, main = "DRF: ROC Curve for Test Set", col = "blue", lwd = 2)

Neural Net for B_tgt

# Set the response variable
response <- "b_tgt"
# Set the predictor variables
predictors <- c(  "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr",  
                "rfm2", "rfm3",  "rfm10", "rfm11", "rfm4", "rfm6", "rfm8", "rfm9",
                  "rfm12",  "demog_genf", "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" , "cat_input1" , "cat_input2")

# Set the training and validation sets
train_set <- bank_train
valid_set <- bank_test
# Set the test set
test_set <- bank_test_final

# Set the stopping criteria
stopping_criteria <- list(
  stopping_rounds = 10,
  stopping_metric = "AUC",
  stopping_tolerance = 0.01
)

#Algorithms to include
algorithms <- c("DeepLearning")

# Set exploitation and exploration parameters
exploration_params <- list(
  max_runtime_secs = 1200, # Run for maximum
  max_models = 40,         # Build maximum 50 models
  seed = 1234              # Set a seed for reproducibility
)

exploration_ratio <- 0.3
sort_metric <- "AUC"

# Train a deep learning model with h2o
automl_params <- list(
  x = predictors,
  y = response,
  training_frame = train_set,
  validation_frame = valid_set,
  leaderboard_frame = valid_set,
  max_runtime_secs = exploration_params$max_runtime_secs,
  max_models = exploration_params$max_models,
  seed = exploration_params$seed,
  stopping_rounds = stopping_criteria$stopping_rounds,
  stopping_metric = stopping_criteria$stopping_metric,
  stopping_tolerance = stopping_criteria$stopping_tolerance,
  sort_metric = sort_metric
)

# Run AutoML
automl_B_tgt <- h2o.automl(
  x = predictors,
  y = response,
  training_frame = train_set,
  validation_frame = valid_set,  # For validation during training
  max_runtime_secs = exploration_params$max_runtime_secs,
  max_models = exploration_params$max_models,
  seed = exploration_params$seed,
  stopping_rounds = stopping_criteria$stopping_rounds,
  stopping_metric = stopping_criteria$stopping_metric,
  stopping_tolerance = stopping_criteria$stopping_tolerance,
  sort_metric = sort_metric,
  include_algos = algorithms
)

# 2. Get the leaderboard (ranked on validation data)
leaderboard <- h2o.get_leaderboard(automl_B_tgt)

# Get comprehensive model information
# select the best model
best_model <- automl_B_tgt@leader
# Print summary of the model (includes parameters and performance metrics)
summary(best_model)

#compute the R squared value for the best model
r_squared <- h2o.r2(val_perf)
print(paste("Validation R-squared:", r_squared))

#get the confusion matrix
confusion_matrix <- h2o.confusionMatrix(best_model, valid_set)
print("Confusion Matrix:")
print(confusion_matrix)

#compute sensitivity and specificity
#select threshold where max AUC
threshold <- h2o.find_threshold_by_max_metric(val_perf, "AUC")
print(paste("Best Threshold for AUC:", threshold))
#compute the sensitivity and specificity
sensitivity <- h2o.sensitivity(confusion_matrix)
specificity <- h2o.specificity(confusion_matrix)

3 Cnt_tgt Candidate Models

AutoML

#show the column names for the bank_train data set
colnames(bank_train)

# Set the response variable
response_cnt_tgt <- "cnt_tgt"
# Set the predictor variables
#include all variables from the bank_train data set
predictors_cnt_tgt <- c( "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr", "rfm1" ,  
                "rfm2", "rfm3",  "rfm10", "rfm4" , "rfm6", "rfm8", "rfm9", "rfm10",
                  "rfm12",  "demog_genf", "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" , "cat_input1" , "cat_input2")

# Set the training and validation sets
train_set_cnt_tgt <- bank_train
valid_set_cnt_tgt <- bank_test
# Set the test set
test_set_cnt_tgt <- bank_test_final

# Set the stopping criteria
stopping_criteria_cnt_tgt <- list(
  stopping_rounds = 10,
  stopping_metric = "RMSE",
  stopping_tolerance = 0.01
)

#Algorithms to include
algorithms_cnt_tgt <- c("GLM", "DRF", "DeepLearning")
# Set exploitation and exploration parameters
exploration_params_cnt_tgt <- list(
  max_runtime_secs = 2500,
  max_models = 50,         # Build maximum 50 models
  seed = 1234              # Set a seed for reproducibility
)

exploration_ratio_cnt_tgt <- 0.3
sort_metric_cnt_tgt <- "RMSE"
# Set the AutoML parameters
automl_params_cnt_tgt <- list(
  x = predictors_cnt_tgt,
  y = response_cnt_tgt,
  training_frame = train_set_cnt_tgt,
  validation_frame = valid_set_cnt_tgt,
  leaderboard_frame = test_set_cnt_tgt,
  max_runtime_secs = exploration_params_cnt_tgt$max_runtime_secs,
  max_models = exploration_params_cnt_tgt$max_models,
  seed = exploration_params_cnt_tgt$seed,
  stopping_rounds = stopping_criteria_cnt_tgt$stopping_rounds,
  stopping_metric = stopping_criteria_cnt_tgt$stopping_metric,
  stopping_tolerance = stopping_criteria_cnt_tgt$stopping_tolerance,
  sort_metric = sort_metric_cnt_tgt,
    include_algos = algorithms_cnt_tgt
)

# Run AutoML
automl_cnt_tgt <- h2o.automl(
  x = predictors_cnt_tgt,
  y = response_cnt_tgt,
  training_frame = train_set_cnt_tgt,
  validation_frame = valid_set_cnt_tgt,  # For validation during training
  max_runtime_secs = exploration_params_cnt_tgt$max_runtime_secs,
  max_models = exploration_params_cnt_tgt$max_models,
  seed = exploration_params_cnt_tgt$seed,
  stopping_rounds = stopping_criteria_cnt_tgt$stopping_rounds,
  stopping_metric = stopping_criteria_cnt_tgt$stopping_metric,
  stopping_tolerance = stopping_criteria_cnt_tgt$stopping_tolerance,
  sort_metric = sort_metric_cnt_tgt,
  include_algos = algorithms_cnt_tgt
)

# 2. Get the leaderboard (ranked on validation data)
leaderboard_cnt_tgt <- h2o.get_leaderboard(automl_cnt_tgt)
print("AutoML Leaderboard (ranked on validation data):")
print(leaderboard_cnt_tgt)

# 3. Access and evaluate the best model on validation data
best_model_cnt_tgt <- automl_cnt_tgt@leader
val_perf_cnt_tgt <- h2o.performance(best_model_cnt_tgt, valid_set_cnt_tgt)

# 4. Print detailed validation metrics
print("Best Model Performance on Validation Data:")
print(val_perf_cnt_tgt)

# Get comprehensive model information
model_details_cnt_tgt <- h2o.getModel(best_model_cnt_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_cnt_tgt)
# For a more structured view of just the parameters
params_table_cnt_tgt <- as.data.frame(model_details_cnt_tgt@parameters)
print(params_table_cnt_tgt)
#export the summary to a csv
write.csv(params_table_cnt_tgt, "model_details_cnt_tgt.csv", row.names = FALSE)

#compute the R squared value
r_squared_cnt_tgt <- h2o.r2(val_perf_cnt_tgt)
print(paste("Validation R-squared:", r_squared_cnt_tgt))

#show trees for the best model
h2o.varimp_plot(best_model_cnt_tgt, num_of_features = 20)

#Get predictions for the validation set
predictions_cnt_tgt <- h2o.predict(best_model_cnt_tgt, valid_set_cnt_tgt)

#create a data frame with the predictions
predictions_cnt_tgt_df <- as.data.frame(predictions_cnt_tgt)
# Add the actual values to the predictions data frame
predictions_cnt_tgt_df$actual <- as.data.frame(valid_set_cnt_tgt)$cnt_tgt

#create the residuals
predictions_cnt_tgt_df$residuals <- predictions_cnt_tgt_df$predict - predictions_cnt_tgt_df$actual
# Create a scatter plot of the predictions vs actual values
library(ggplot2)
ggplot(predictions_cnt_tgt_df, aes(x = actual, y = predict)) +
  geom_point(alpha = 0.5) +
  labs(title = "Predictions vs Actual Values", x = "Actual Values", y = "Predicted Values") +
  theme_minimal() +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line

# Get all models from your AutoML run
all_models_cnt_tgt <- as.data.frame(automl_cnt_tgt@leaderboard)
# Filter for GLM models
glm_models_cnt_tgt <- all_models_cnt_tgt[grepl("GLM", all_models_cnt_tgt$model_id), ]
# Get the top GLM model ID
if (nrow(glm_models_cnt_tgt) > 0) {
  top_glm_id_cnt_tgt <- as.character(glm_models_cnt_tgt$model_id[1])
  best_glm_model_cnt_tgt <- h2o.getModel(top_glm_id_cnt_tgt)
  
  # View the model summary
  summary(best_glm_model_cnt_tgt)
} else {
  print("No GLM models found in the AutoML run")
}

#Get comprehensive Model details
model_details_cnt_tgt_glm <- h2o.getModel(best_glm_model_cnt_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_cnt_tgt_glm)

# Get all models from your AutoML run
all_models_cnt_tgt <- as.data.frame(automl_cnt_tgt@leaderboard)
# Filter for DeepLearning models
dl_models_cnt_tgt <- all_models_cnt_tgt[grepl("DeepLearning", all_models_cnt_tgt$model_id), ]
# Get the top DeepLearning model ID
if (nrow(dl_models_cnt_tgt) > 0) {
  top_dl_id_cnt_tgt <- as.character(dl_models_cnt_tgt$model_id[1])
  best_dl_model_cnt_tgt <- h2o.getModel(top_dl_id_cnt_tgt)
  
  # View the model summary
  summary(best_dl_model_cnt_tgt)
} else {
  print("No DeepLearning models found in the AutoML run")
}

#Get comprehensive Model details
model_details_cnt_tgt_dl <- h2o.getModel(best_dl_model_cnt_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_cnt_tgt_dl)

#compute the R squared value
#for the best deep learning model
val_perf_cnt_tgt <- h2o.performance(best_dl_model_cnt_tgt, valid_set_cnt_tgt)
#compute the R squared value
r_squared_cnt_tgt_dl <- h2o.r2(val_perf_cnt_tgt)
print(paste("Validation R-squared:", r_squared_cnt_tgt_dl))

#get the model parameters
# Method 1: Get model details as a list
model_details <- h2o.getModel(model_details_cnt_tgt_dl@model_id)
print(model_details)

# Method 2: Get hyperparameters
hyper_params <- h2o.getModel(model_details_cnt_tgt_dl@model_id)@allparameters
print(hyper_params)

#Get model parameters for the DRF model
model_details_cnt_tgt_drf <- h2o.getModel(best_model_cnt_tgt@model_id)
print(model_details_cnt_tgt_drf)

#Get Hyperparameters
hyper_params_drf <- h2o.getModel(model_details_cnt_tgt_drf@model_id)@allparameters
print(hyper_params_drf)

1 GAM for cnt_tgt

# Method 1: Manually exclude known categorical variables
categorical_vars <- c("demog_inc_homeval", "demog_ho", "demog_genf", "cat_input1", "cat_input2" , "rfm10" , "rfm4" , "rfm8" ,"rfm9") # Modify this list based on your knowledge
numeric_predictors <- setdiff(predictors_cnt_tgt, categorical_vars )




# Train a GAM model 
gam_model <- h2o.gam(
  x = predictors_cnt_tgt,
  y = response_cnt_tgt,
  training_frame = train_set_cnt_tgt,
  validation_frame = valid_set_cnt_tgt,
  gam_columns = numeric_predictors,
  lambda_search = TRUE,
  lambda = c(0.01, 0.1, 1, 10),
  alpha = c(0.1, 0.5, 0.9),
  max_iterations = 100,
  max_runtime_secs = 9000  # Set maximum runtime to 1 hour (3600 seconds)
)
# Print the model summary
summary(gam_model)

Cnt_tgt: Test Data Evaluation

#Make predictions on the test data using DRF_1_AutoML_2_20250517_164950 
#predict cnt_tgt
#Get Model ID
model_id_cnt_tgt <- automl_cnt_tgt@leaderboard[1, "model_id"]
# Get the model
best_model_cnt_tgt <- h2o.getModel(model_id_cnt_tgt)
#Print model ID
print(paste("Model ID:", model_id_cnt_tgt))

#check dimensions of the test set
print(dim(bank_test_final))

# Make predictions on the test set
predictions_cnt_tgt_test <- h2o.predict(best_model_cnt_tgt, newdata = bank_test_final)
# Convert predictions to a data frame
predictions_cnt_tgt_test_df <- as.data.frame(predictions_cnt_tgt_test)
# Add the original test set data to the predictions
predictions_cnt_tgt_test_df <- cbind(as.data.frame(bank_test_final), predictions_cnt_tgt_test_df)
# Create a scatter plot of the predictions vs actual values
library(ggplot2)
ggplot(predictions_cnt_tgt_test_df, aes(x = cnt_tgt, y = predict)) +
  geom_point(alpha = 0.5) +
  labs(title = "CNT_tgt: Predictions vs Actual Values", x = "Actual Values", y = "Predicted Values") +
  theme_minimal() +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line

#Compute the model metrics for the test data
# Get the model performance on the test set
test_perf_cnt_tgt <- h2o.performance(best_model_cnt_tgt, newdata = bank_test_final)
# Print the model performance metrics
print("Model Performance on Test Data:")
print(test_perf_cnt_tgt)

# Get model ID for best performing cnt_tgt model
model_id_cnt_tgt <- automl_cnt_tgt@leaderboard[1, "model_id"]

# Get the model
best_model_cnt_tgt <- h2o.getModel(model_id_cnt_tgt)

#Generate partial dependence plots for rfm1 and demog_homeval
pdp_rfm1 <- h2o.partialPlot(
  object = best_model_cnt_tgt,
  data = bank_test_final,
  cols = c("rfm1")
)

pdp_demog_homeval <- h2o.partialPlot(
  object = best_model_cnt_tgt,
  data = bank_test_final,
  cols = c("demog_homeval")
)

3 Int_tgt Candidate Models

AutoML

#show the mean int_tgt for the test final data set
mean_int_tgt <- h2o.mean(bank_test_final$int_tgt)

print(paste("Mean int_tgt for test set:", mean_int_tgt))

# Set the response variable
response_int_tgt <- "int_tgt"
# Set the predictor variables
#include all variables from the bank_train data set
predictors_int_tgt <- c( "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr", "rfm1" ,  
                "rfm2", "rfm3",  "rfm10", "rfm4" , "rfm6", "rfm6_sq" , "rfm8", "rfm9", "rfm10",
                  "rfm12",   "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" )
# Set the training and validation sets
train_set_int_tgt <- bank_train
valid_set_int_tgt <- bank_test
# Set the test set
test_set_int_tgt <- bank_test_final
# Set the stopping criteria
stopping_criteria_int_tgt <- list(
  stopping_rounds = 10,
  stopping_metric = "RMSE",
  stopping_tolerance = 0.01
)
#Algorithms to include
algorithms_int_tgt <- c("DeepLearning", "GLM", "DRF")
# Set exploitation and exploration parameters
exploration_params_int_tgt <- list(
  max_runtime_secs = 2700, # Run for maximum 1 hour
  max_models = 50,         # Build maximum 50 models
  seed = 1234              # Set a seed for reproducibility
)
exploration_ratio_int_tgt <- 0.3
sort_metric_int_tgt <- "RMSE"
# Set the AutoML parameters
automl_params_int_tgt <- list(
  x = predictors_int_tgt,
  y = response_int_tgt,
  training_frame = train_set_int_tgt,
  validation_frame = valid_set_int_tgt,
  leaderboard_frame = valid_set_int_tgt,
  max_runtime_secs = exploration_params_int_tgt$max_runtime_secs,
  max_models = exploration_params_int_tgt$max_models,
  seed = exploration_params_int_tgt$seed,
  stopping_rounds = stopping_criteria_int_tgt$stopping_rounds,
  stopping_metric = stopping_criteria_int_tgt$stopping_metric,
  stopping_tolerance = stopping_criteria_int_tgt$stopping_tolerance,
  sort_metric = sort_metric_int_tgt,
    include_algos = algorithms_int_tgt
)

# Run AutoML
automl_int_tgt <- h2o.automl(
  x = predictors_int_tgt,
  y = response_int_tgt,
  training_frame = train_set_int_tgt,
  validation_frame = valid_set_int_tgt,
  leaderboard_frame = valid_set_int_tgt, # For validation during training
  max_runtime_secs = exploration_params_int_tgt$max_runtime_secs,
  max_models = exploration_params_int_tgt$max_models,
  seed = exploration_params_int_tgt$seed,
  stopping_rounds = stopping_criteria_int_tgt$stopping_rounds,
  stopping_metric = stopping_criteria_int_tgt$stopping_metric,
  stopping_tolerance = stopping_criteria_int_tgt$stopping_tolerance,
  sort_metric = sort_metric_int_tgt,
  include_algos = algorithms_int_tgt
)

# 2. Get the leaderboard (ranked on validation data)
leaderboard_int_tgt <- h2o.get_leaderboard(automl_int_tgt)
print("AutoML Leaderboard (ranked on validation data):")
print(leaderboard_int_tgt)

#convert the leaderboard to a dataframe 
leaderboard_int_tgt_df <- as.data.frame(leaderboard_int_tgt)

#export the leaderboard to a csv
write.csv(leaderboard_int_tgt_df, "leaderboard_int_tgt.csv", row.names = FALSE)
# Confirm the file was created
cat("File exported to:", file.path(getwd(), "leaderboard_int_tgt.csv"))

Best GLM for int_tgt

# Get all models from your AutoML run
all_models_int_tgt <- as.data.frame(automl_int_tgt@leaderboard)
# Filter for GLM models
glm_models_int_tgt <- all_models_int_tgt[grepl("GLM", all_models_int_tgt$model_id), ]
# Get the top GLM model ID
if (nrow(glm_models_int_tgt) > 0) {
  top_glm_id_int_tgt <- as.character(glm_models_int_tgt$model_id[1])
  best_glm_model_int_tgt <- h2o.getModel(top_glm_id_int_tgt)
  
  # View the model summary
  summary(best_glm_model_int_tgt)
} else {
  print("No GLM models found in the AutoML run")
}

#Get comprehensive Model details
model_details_int_tgt_glm <- h2o.getModel(best_glm_model_int_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_int_tgt_glm)
 

#extract the model coefficients for the glm model
model_coefficients_int_tgt_glm <- h2o.coef(best_glm_model_int_tgt)
# Convert to a data frame
model_coefficients_int_tgt_glm_df <- as.data.frame(model_coefficients_int_tgt_glm)
# Add the variable names
model_coefficients_int_tgt_glm_df$variable <- rownames(model_coefficients_int_tgt_glm_df)
# Rename the columns
colnames(model_coefficients_int_tgt_glm_df) <- c("coefficient", "variable")
# Sort by absolute value of coefficients
model_coefficients_int_tgt_glm_df <- model_coefficients_int_tgt_glm_df[order(abs(model_coefficients_int_tgt_glm_df$coefficient), decreasing = TRUE), ]
# Print the coefficients
print(model_coefficients_int_tgt_glm_df)

#export the coefficients to a csv
write.csv(model_coefficients_int_tgt_glm_df, "model_coefficients_int_tgt_glm.csv", row.names = FALSE)

Best Neural Net Model for int_tgt

#filer for DeepLearning models
dl_models_int_tgt <- all_models_int_tgt[grepl("DeepLearning", all_models_int_tgt$model_id), ]
# Get the top DeepLearning model ID
#ranked on validation data
# Get the top DeepLearning model ID
if (nrow(dl_models_int_tgt) > 0) {
  top_dl_id_int_tgt <- as.character(dl_models_int_tgt$model_id[1])
  best_dl_model_int_tgt <- h2o.getModel(top_dl_id_int_tgt)
  
  # View the model summary
  summary(best_dl_model_int_tgt)
} else {
  print("No DeepLearning models found in the AutoML run")
}

#Get comprehensive Model details
model_details_int_tgt_dl <- h2o.getModel(best_dl_model_int_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_int_tgt_dl)

#compute the R squared value
#for the best performing deep learning model
val_perf_int_tgt <- h2o.performance(best_dl_model_int_tgt, valid_set_int_tgt)
r_squared_int_tgt_dl <- h2o.r2(val_perf_int_tgt)
print(paste("Validation R-squared (Deep Learning):", r_squared_int_tgt_dl))

Best Random Forest for int_tgt

#filter for DRF models
drf_models_int_tgt <- all_models_int_tgt[grepl("DRF", all_models_int_tgt$model_id), ]
# Get the top DRF model ID
if (nrow(drf_models_int_tgt) > 0) {
  top_drf_id_int_tgt <- as.character(drf_models_int_tgt$model_id[1])
  best_drf_model_int_tgt <- h2o.getModel(top_drf_id_int_tgt)
  
  # View the model summary
  summary(best_drf_model_int_tgt)
} else {
  print("No DRF models found in the AutoML run")
}
#Get comprehensive Model details
model_details_int_tgt_drf <- h2o.getModel(best_drf_model_int_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_int_tgt_drf)
#compute the R squared value
#for int_tgt
val_perf_int_tgt_drf <- h2o.performance(best_drf_model_int_tgt, valid_set_int_tgt)
#compute the R squared value
r_squared_int_tgt_drf <- h2o.r2(val_perf_int_tgt_drf)
print(paste("Validation R-squared (DRF):", r_squared_int_tgt_drf))

#plot variable importance for the best DRF model
h2o.varimp_plot(best_drf_model_int_tgt, num_of_features = 20)

# Get the model ID for best performing DRF
model_id_int_tgt_drf <- best_drf_model_int_tgt@model_id

# Get your DRF model
drf_model_int_tgt <- h2o.getModel(model_id_int_tgt_drf)

#Generate partial dependence plots for rfm2 and rfm9
pdp_rfm2_int_tgt <- h2o.partialPlot(
  object = drf_model_int_tgt,
  data = train_set_int_tgt,
  cols = c("rfm2")
)

pdp_rfm9_int_tgt <- h2o.partialPlot(
  object = drf_model_int_tgt,
  data = train_set_int_tgt,
  cols = c("rfm9")
)

Int_tgt: Ensemble Model

# Model IDs i want to use for the ensemble
model_ids <- c(
  "DRF_1_AutoML_6_20250518_35511",
  "XRT_1_AutoML_6_20250518_35511",
  "DeepLearning_1_AutoML_6_20250518_35511",
  "GLM_1_AutoML_6_20250518_35511",
"DeepLearning_grid_1_AutoML_6_20250518_35511_model_1")



# Create an ensemble using these specific models
custom_ensemble <- h2o.stackedEnsemble(
  x = predictors_int_tgt,
  y = response_int_tgt,
  training_frame = train_set_int_tgt,
  blending_frame = valid_set_int_tgt,  # Use validation data as blending frame instead
  base_models = model_ids,
  metalearner_algorithm = "glm"  # Default metalearner
)

# Evaluate ensemble performance on validation data
ensemble_perf <- h2o.performance(custom_ensemble, valid_set_int_tgt)
print(h2o.rmse(ensemble_perf))
print(h2o.r2(ensemble_perf))
print(h2o.mae(ensemble_perf))
print(h2o.mse(ensemble_perf))
print(h2o.mean_residual_deviance(ensemble_perf))


# Compare with individual model performances
for (model_id in model_ids) {
  model <- h2o.getModel(model_id)
  model_perf <- h2o.performance(model, valid_set_int_tgt)
  cat("\nPerformance for model:", model_id, "\n")
  cat("RMSE:", h2o.rmse(model_perf), "\n")
  cat("R-squared:", h2o.r2(model_perf), "\n")
  cat("MAE:", h2o.mae(model_perf), "\n")
  cat("MSE:", h2o.mse(model_perf), "\n")
}

Int_tgt: Ensemble Model Test Set Evaluation

# Make predictions with the ensemble model
ensemble_preds <- h2o.predict(custom_ensemble, test_set_int_tgt)

# Convert predictions to a data frame
ensemble_preds_df <- as.data.frame(ensemble_preds)
# Add the original test set data to the predictions
ensemble_preds_df <- cbind(as.data.frame(bank_test_final), ensemble_preds_df)

#exponentiate the predicteds
ensemble_preds_df$predict <- exp(ensemble_preds_df$predict)
#exponentiate the actual values
ensemble_preds_df$int_tgt <- exp(ensemble_preds_df$int_tgt)

# Create a scatter plot of the predictions vs actual values
library(ggplot2)
ggplot(ensemble_preds_df, aes(x = int_tgt, y = predict)) +
  geom_point(alpha = 0.5) +
  labs(title = "Ensemble Model: Predictions vs Actual Values", x = "Actual Values", y = "Predicted Values") +
  theme_minimal() +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line

#compute the RMSE
ensemble_perf_test <- h2o.performance(custom_ensemble, newdata = bank_test_final)

# Print the model performance metrics
print("Ensemble Model Performance on Test Data:")
print(ensemble_perf_test)
#compute the RMSE
rmse_ensemble <- h2o.rmse(ensemble_perf_test)
print(paste("Ensemble Model RMSE on Test Data:", rmse_ensemble))
#compute the R squared value
r_squared_ensemble <- h2o.r2(ensemble_perf_test)
print(paste("Ensemble Model R-squared on Test Data:", r_squared_ensemble))
#copmute the MAE
mae_ensemble <- h2o.mae(ensemble_perf_test)
print(paste("Ensemble Model MAE on Test Data:", mae_ensemble))
#compute the MSE
mse_ensemble <- h2o.mse(ensemble_perf_test)
print(paste("Ensemble Model MSE on Test Data:", mse_ensemble))

Ensemble Model Coefficients

#Get model id for ensemble 
model_id_ensemble <- custom_ensemble@model_id
#Print the model id
print(paste("Ensemble Model ID:", model_id_ensemble))
# Access the metalearner model
ensemble_model <- h2o.getModel("StackedEnsemble_model_R_1747498932373_10579")
metalearner <- h2o.getModel(ensemble_model@model$metalearner$name)

# If metalearner is GLM, you can get coefficients
if(metalearner@algorithm == "glm") {
  # This shows how each BASE MODEL is weighted
  coefs <- h2o.coef(metalearner)
  print(coefs)
}

# Convert to a data frame
coefs_df <- as.data.frame(coefs)
# Add the variable names
coefs_df$variable <- rownames(coefs_df)
# Rename the columns
colnames(coefs_df) <- c("coefficient", "variable")
# Sort by absolute value of coefficients
coefs_df <- coefs_df[order(abs(coefs_df$coefficient), decreasing = TRUE), ]
# Print the coefficients
print(coefs_df)
#export the coefficients to a csv
write.csv(coefs_df, "model_coefficients_ensemble.csv", row.names = FALSE)

Ensemble Model Variable Importance

# Get the variable importance for the ensemble model
varimp_ensemble <- h2o.varimp(custom_ensemble)
# Convert to a data frame
varimp_ensemble_df <- as.data.frame(varimp_ensemble)
# Add the variable names
varimp_ensemble_df$variable <- rownames(varimp_ensemble_df)
# Rename the columns
colnames(varimp_ensemble_df) <- c("relative_importance", "variable")
# Sort by relative importance
varimp_ensemble_df <- varimp_ensemble_df[order(varimp_ensemble_df$relative_importance, decreasing = TRUE), ]
# Print the variable importance
print(varimp_ensemble_df)




#plot the variable importance
library(ggplot2)
ggplot(varimp_ensemble_df, aes(x = reorder(variable, -relative_importance), y = relative_importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance: Ensemble Model", x = "Variables", y = "Relative Importance") +
  theme_minimal()

Int_tgt GAM Model

# Method 1: Manually exclude known categorical variables
categorical_vars <- c("demog_inc_homeval", "demog_ho", "demog_genf",  "rfm10" , "rfm4" , "rfm8" ,"rfm9" ) # Modify this list based on your knowledge
numeric_predictors <- setdiff(predictors_cnt_tgt, categorical_vars )
 
 x <- c(numeric_predictors, categorical_vars)
  y <- response_int_tgt


# Define a very simple hyperparameter grid
# Just tuning lambda (regularization strength)
hyper_params <- list(
  lambda = c(0, 0.001, 0.01, 0.1),
   # Alpha mixing parameter - controls L1 vs L2 regularization
  alpha = c(0, 0.3, 0.7, 1.0),
  
  # Scale parameter - controls fitting (affects convergence)
  scale = c(0.5, 1.0, 1.5)

)
# Define search criteria for a more extensive search
search_criteria <- list(
  strategy = "RandomDiscrete",    # Use random search for efficiency with larger grid
  max_models = 100,               # Maximum number of models to try
  max_runtime_secs = 1800,       # 1 hour time limit
  seed = 1234                     # For reproducibility
)

# Run a simple GAM grid search with search_criteria
gam_grid <- h2o.grid(
  algorithm = "gam",
  x = x,
  y = y,
  training_frame = train_set_int_tgt,
  validation_frame = valid_set_int_tgt,
  hyper_params = hyper_params,
  search_criteria = search_criteria,
  gam_columns = list(numeric_predictors[1:3]), # Specify the columns for GAM
  lambda_search = TRUE,             # Enable lambda search
  
)


# Print grid results
print(gam_grid)

# Get the best model by validation MSE
best_model <- h2o.getModel(gam_grid@model_ids[[1]])

# Print model details
print(summary(best_model))

# Model performance on validation data
perf <- h2o.performance(best_model, valid_set_int_tgt)
print(perf)

Best GAM Model for INT_tgt

# Sort the grid by your preferred metric (lower MSE is better)
sorted_grid <- h2o.getGrid(gam_grid@grid_id, sort_by = "RMSE", decreasing = FALSE)
# Get the best model from the grid
best_model <- h2o.getModel(sorted_grid@model_ids[[1]])

# Print the best model's details
print(summary(best_model))
# Get the variable importance
varimp <- h2o.varimp(best_model)
# Convert to a data frame
varimp_df <- as.data.frame(varimp)

Variable importance for GAM Model

#plot the variable importance
library(ggplot2)
ggplot(varimp_df, aes(x = reorder(variable, -relative_importance), y = relative_importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance: Best Int_tgt Model (GAM)", x = "Variables", y = "Relative Importance") +
  theme_minimal()

Misc Plots

#plot rfm6 vs int_tgt
ggplot(bank_log, aes(x = rfm6, y = int_tgt)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter Plot of rfm6 vs int_tgt", x = "rfm6", y = "int_tgt") +
  theme_minimal() +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line

#plot rfm6 vs cnt_tgt
ggplot(bank_log, aes(x = rfm6, y = cnt_tgt)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter Plot of rfm6 vs cnt_tgt", x = "rfm6", y = "cnt_tgt") +
  theme_minimal() +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line

ECON_562_FINAL_PROJ_MOD_AF

Andrew Frohner

2025-05-13

Imputation

int_tgt

Demog_age

rfm3

cnt_tgt

the single NA value for cnt_tgt was imputed as 1, which is the mean of cnt_tgt when demog_genm == yes and demog_homeval < 100000.

Categorical Predictor analysis

Cramer’s V for cat_input1 and cat_input2 show that they likely have negligible effects on the b_tgt response variable given our sample size.

Outliers

Feature Engineering

Skewness

Data Conversion to H2o

Data Partitioning

SMOTE for training data

B_Tgt Candidate Models

AutoML for B_tgt Regular training data

B_TGT: Test Data Evaluation

Neural Net for B_tgt

3 Cnt_tgt Candidate Models

AutoML

1 GAM for cnt_tgt

Cnt_tgt: Test Data Evaluation

3 Int_tgt Candidate Models

AutoML

Best GLM for int_tgt

Best Neural Net Model for int_tgt

Best Random Forest for int_tgt

Int_tgt: Ensemble Model

Int_tgt: Ensemble Model Test Set Evaluation

Ensemble Model Coefficients

Ensemble Model Variable Importance

Int_tgt GAM Model

Best GAM Model for INT_tgt

Variable importance for GAM Model

Misc Plots