# Set CRAN mirror and Install kernlab, ggplot2, and dplyr packages
options(repos = c(CRAN = "https://cloud.r-project.org"))
if (!requireNamespace("h2o", quietly = TRUE)) {
cat("The 'h2o' package is not installed.\n")
} else {
cat("The 'h2o' package is already installed.\n")
}
## The 'h2o' package is already installed.
if (!requireNamespace("rJavaEnv", quietly = TRUE)) {
cat("The 'rJavaEnv' package is not installed.\n")
} else {
cat("The 'rJavaEnv' package is already installed.\n")
}
## The 'rJavaEnv' package is already installed.
#import the bank data
library(readr)
#Import the dataset. Change the file path to where you saved your bank.txt file
bank <- read_csv("C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/bank.txt",
col_types = cols(b_tgt = col_character(),
int_tgt = col_number(), cnt_tgt = col_double(),
demog_homeval = col_number(), demog_inc = col_number(),
rfm1 = col_number(), rfm2 = col_number(),
rfm3 = col_number(), rfm4 = col_number(),
demog_genf = col_character(), demog_genm = col_character(),
dataset = col_character()))
#Caret and other packages don't like 0,1 factor levels and will treat them as numeric, lets fix that
bank$b_tgt<-ifelse(bank$b_tgt=="1", "yes", "no")
bank$demog_ho<-ifelse(bank$demog_ho=='1',"yes","no")
bank$demog_genf<-ifelse(bank$demog_genf=="1", "yes","no")
bank$demog_genm<-ifelse(bank$demog_genm=="1", "yes","no")
#Convert factor columns to factors
cols<-c("b_tgt", "cat_input1", "cat_input2","demog_ho","demog_genf", "demog_genm")
bank[cols] <- lapply(bank[cols], factor)
str(bank) #Make sure the data structure looks good
head(bank)
## # A tibble: 6 × 26
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 7000 NA X A NA no 57600
## 2 yes 7000 2 X A NA no 57587
## 3 yes 15000 2 X A NA no 44167
## 4 no NA 0 X A 68 no 90587
## 5 no NA 0 X A NA no 100313
## 6 no NA 0 X A 26 no 26622
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>
summary(bank$int_tgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 6000 10000 11236 16000 500000 848529
class(bank)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
#replace to nas with 0 for int_tgt
bank$int_tgt[is.na(bank$int_tgt)] <- 0
#use Linear regression to impute missing values
## Create a linear model to predict demog_age using other variables
lm_model <- lm(demog_age ~ demog_homeval + demog_inc + demog_pr + demog_ho + rfm12 + rfm1, data = bank)
# Predict missing values
predicted_values <- predict(lm_model, newdata = bank[is.na(bank$demog_age), ])
# Replace missing values with predicted values
bank$demog_age[is.na(bank$demog_age)] <- predicted_values
# Check the summary of the imputed variable
cat("Summary of imputed Customer age:\n")
## Summary of imputed Customer age:
summary(bank$demog_age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.45 50.00 58.00 58.55 69.00 89.00
library(stargazer)
# summarize the regression output
summary(lm_model)
##
## Call:
## lm(formula = demog_age ~ demog_homeval + demog_inc + demog_pr +
## demog_ho + rfm12 + rfm1, data = bank)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.038 -11.833 1.174 13.161 59.490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.836e+01 7.805e-02 619.527 < 2e-16 ***
## demog_homeval 3.217e-06 2.166e-07 14.854 < 2e-16 ***
## demog_inc -2.069e-05 8.543e-07 -24.218 < 2e-16 ***
## demog_pr 1.336e-01 1.637e-03 81.651 < 2e-16 ***
## demog_hoyes -2.948e-01 4.242e-02 -6.949 3.68e-12 ***
## rfm12 1.045e-01 4.962e-04 210.696 < 2e-16 ***
## rfm1 -2.222e-02 1.262e-03 -17.612 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.3 on 793170 degrees of freedom
## (266861 observations deleted due to missingness)
## Multiple R-squared: 0.06459, Adjusted R-squared: 0.06458
## F-statistic: 9127 on 6 and 793170 DF, p-value: < 2.2e-16
#create a histogram of demog_age
library(ggplot2)
ggplot(bank, aes(x = demog_age)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Customer Age", x = "Customer Age", y = "Frequency") +
theme_minimal()
summary(bank$demog_age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.45 50.00 58.00 58.55 69.00 89.00
#count the number of negative values
count_negative_values <- sum(bank$demog_age < 0)
cat("Number of negative values in demog_age:", count_negative_values, "\n")
## Number of negative values in demog_age: 20
#create a new data frame that only includes the rows with demog_age < 0
bank_demog_age_neg <- bank[bank$demog_age == -1, ]
head(bank_demog_age_neg)
## # A tibble: 6 × 26
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 no 0 0 X B -1 yes 88071
## 2 yes 2000 2 X B -1 yes 0
## 3 yes 2000 2 X B -1 yes 0
## 4 no 0 0 X C -1 no 41927
## 5 yes 2000 2 X B -1 yes 0
## 6 no 0 0 X B -1 yes 88099
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>
#Winsorized demog_age < 0 to the 1st quartile
bank$demog_age[bank$demog_age < 0] <- quantile(bank$demog_age, 0.25, na.rm = TRUE)
# Check the summary of the imputed variable
cat("Summary of imputed demog_age:\n")
## Summary of imputed demog_age:
summary(bank$demog_age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 50.00 58.00 58.55 69.00 89.00
quantile(bank$demog_age, 0.25, na.rm = TRUE)
## 25%
## 50
#plot the distribution of demog_age
ggplot(bank, aes(x = demog_age)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Customer Age", x = "Customer Age", y = "Frequency") +
theme_minimal()
#use linear regression to impute the missing values for rfm3
lm_model_rfm3 <- lm(rfm3 ~ rfm1 + rfm2 + rfm4 + rfm5 + rfm7 + rfm8 , data = bank)
# Predict missing values
predicted_values_rfm3 <- predict(lm_model_rfm3, newdata = bank[is.na(bank$rfm3), ])
# Replace missing values with predicted values
bank$rfm3[is.na(bank$rfm3)] <- predicted_values_rfm3
# Check the summary of the imputed variable
cat("Summary of imputed rfm3:\n")
## Summary of imputed rfm3:
summary(bank$rfm3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -63.73 10.00 14.88 15.83 20.00 3713.31
#count the number of negative values
count_negative_values_rfm3 <- sum(bank$rfm3 < 0)
cat("Number of negative values in rfm3:", count_negative_values_rfm3, "\n")
## Number of negative values in rfm3: 14041
#impute the negative values for rfm3 with the .05 quantile
bank$rfm3[bank$rfm3 < 0] <- quantile(bank$rfm3, 0.05, na.rm = TRUE)
# Check the summary of the imputed variable
cat("Summary of imputed rfm3:\n")
## Summary of imputed rfm3:
summary(bank$rfm3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 10.00 14.88 15.98 20.00 3713.31
#plot the distribution of rfm3
ggplot(bank, aes(x = rfm3)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of rfm3", x = "rfm3", y = "Frequency") +
theme_minimal()
#filter the data to only include the rows with cnt_tgt as NA
bank_cnt_tgt_na <- bank[is.na(bank$cnt_tgt), ]
head(bank_cnt_tgt_na)
## # A tibble: 1 × 26
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 7000 NA X A 60.2 no 57600
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>
library(ggplot2)
#mean of cnt_tgt when demog_genm == yes and demog_homeval < 100000
mean_cnt_tgt_genm_yes_homeval_lt_100k <- mean(bank$cnt_tgt[bank$demog_genm == "yes" & bank$demog_homeval < 100000], na.rm = TRUE)
mean_cnt_tgt_genm_yes_homeval_lt_100k
## [1] 0.2450341
#round the imputed value to the nearest whole number
mean_cnt_tgt_genm_yes_homeval_lt_100k <- round(mean_cnt_tgt_genm_yes_homeval_lt_100k)
mean_cnt_tgt_genm_yes_homeval_lt_100k
## [1] 0
summary(bank$cnt_tgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.3118 0.0000 6.0000 1
#create histogram of cnt_tgt
ggplot(bank, aes(x = cnt_tgt)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of cnt_tgt", x = "cnt_tgt", y = "Frequency") +
theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
#impute the cnt_tgt variable as 1
bank$cnt_tgt[is.na(bank$cnt_tgt)] <- 1
# Check the summary of the imputed variable
cat("Summary of imputed cnt_tgt:\n")
## Summary of imputed cnt_tgt:
summary(bank$cnt_tgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3118 0.0000 6.0000
library(vcd)
## Loading required package: grid
#create a contingency table with Categorical variables
bank_cat1 <- table(bank$b_tgt, bank$cat_input1)
bank_cat2 <- table(bank$b_tgt, bank$cat_input2)
# perform a Chi-squared test
chisq_test_cat1 <- chisq.test(bank_cat1)
chisq_test_cat2 <- chisq.test(bank_cat2)
# print the results
cat("Chi-squared test for b_tgt and cat_input1:\n")
## Chi-squared test for b_tgt and cat_input1:
print(chisq_test_cat1)
##
## Pearson's Chi-squared test
##
## data: bank_cat1
## X-squared = 20362, df = 2, p-value < 2.2e-16
cat("Chi-squared test for b_tgt and cat_input2:\n")
## Chi-squared test for b_tgt and cat_input2:
print(chisq_test_cat2)
##
## Pearson's Chi-squared test
##
## data: bank_cat2
## X-squared = 16146, df = 4, p-value < 2.2e-16
n <- nrow(bank)
v2 <- chisq_test_cat1$statistic / (n *4)
v1 <- chisq_test_cat2$statistic / (n *4)
cat("Cramer's V for cat_input1:", sqrt(v2), "\n")
## Cramer's V for cat_input1: 0.06929801
cat("Cramer's V for cat_input2:", sqrt(v1), "\n")
## Cramer's V for cat_input2: 0.0617084
#perform ANOVA test for cnt_tgt
anova_test_cnt_tgt <- aov(cnt_tgt ~ cat_input1 + cat_input2, data = bank)
#perform ANOVA test for int_tgt
anova_test_int_tgt <- aov(int_tgt ~ cat_input1 + cat_input2, data = bank)
# print the results
cat("ANOVA test for cnt_tgt:\n")
## ANOVA test for cnt_tgt:
print(summary(anova_test_cnt_tgt))
## Df Sum Sq Mean Sq F value Pr(>F)
## cat_input1 2 10212 5106 10719 <2e-16 ***
## cat_input2 4 3761 940 1974 <2e-16 ***
## Residuals 1060031 504973 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("ANOVA test for int_tgt:\n")
## ANOVA test for int_tgt:
print(summary(anova_test_int_tgt))
## Df Sum Sq Mean Sq F value Pr(>F)
## cat_input1 2 3.274e+11 1.637e+11 4832 <2e-16 ***
## cat_input2 4 3.851e+11 9.629e+10 2842 <2e-16 ***
## Residuals 1060031 3.591e+13 3.388e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cat input 1 is only explaining 1.97% of the overall variance in cnt_tgt, and Cat Input 2 is only explaining .72% of the variance in cnt_tgt.
Cat INput 1 is only explaining .89% of overall variance of int_tgt, and Cat Input 2 is explaining 1.05% of the overall variance in int_tgt.
This is not enough variation in the data for me to include these variables in the candidate models for cnt_tgt or int_tgt.# Calculate effect sizes (eta-squared)
eta_cnt_input1 <- 10212 / (10212 + 3761 + 504973)
eta_cnt_input2 <- 3761 / (10212 + 3761 + 504973)
# Post-hoc analysis (Tukey HSD)
library(multcomp)
Tukey_cnt <- glht(aov(cnt_tgt ~ cat_input1 + cat_input2, data=bank),
linfct = mcp(cat_input1 = "Tukey"))
summary(Tukey_cnt)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Multiple Comparisons of Means: Tukey Contrasts
##
##
## Fit: aov(formula = cnt_tgt ~ cat_input1 + cat_input2, data = bank)
##
## Linear Hypotheses:
## Estimate Std. Error t value Pr(>|t|)
## Y - X == 0 -0.062475 0.002595 -24.07 <2e-16 ***
## Z - X == 0 -0.283639 0.001932 -146.81 <2e-16 ***
## Z - Y == 0 -0.221164 0.003054 -72.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
#perform ANOVA test for cnt_tgt
anova_test_cnt_tgt <- aov(cnt_tgt ~ rfm1 + rfm2 + rfm8 + demog_age, data = bank)
#perform ANOVA test for int_tgt
anova_test_int_tgt <- aov(int_tgt ~ rfm1 + rfm2 + rfm8 + demog_age , data = bank)
# print the results
cat("ANOVA test for cnt_tgt:\n")
## ANOVA test for cnt_tgt:
print(summary(anova_test_cnt_tgt))
## Df Sum Sq Mean Sq F value Pr(>F)
## rfm1 1 9065 9065 20729.7 <2e-16 ***
## rfm2 1 15766 15766 36050.8 <2e-16 ***
## rfm8 1 30465 30465 69663.6 <2e-16 ***
## demog_age 1 78 78 178.5 <2e-16 ***
## Residuals 1060033 463572 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("ANOVA test for int_tgt:\n")
## ANOVA test for int_tgt:
print(summary(anova_test_int_tgt))
## Df Sum Sq Mean Sq F value Pr(>F)
## rfm1 1 3.909e+10 3.909e+10 1161.7 < 2e-16 ***
## rfm2 1 2.243e+11 2.243e+11 6663.7 < 2e-16 ***
## rfm8 1 6.887e+11 6.887e+11 20465.7 < 2e-16 ***
## demog_age 1 5.586e+08 5.586e+08 16.6 4.62e-05 ***
## Residuals 1060033 3.567e+13 3.365e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#create a new data frame with only the numeric variables from the data frame
numeric_vars <- bank %>% select(where(is.numeric))
#filter the numeric_vars dataframe to exclude missing values
numeric_vars <- numeric_vars %>% filter(complete.cases(.))
#compute the IQR for the numeric variables
IQR_values <- sapply(numeric_vars, IQR)
#Create a 3 column table data showing the 25, 50, 75 percentiles for each numeric variable
# Create a data frame with the 25th, 50th, and 75th percentiles
percentiles <- data.frame(
Variable = names(numeric_vars),
Q25 = sapply(numeric_vars, quantile, probs = 0.25),
Q50 = sapply(numeric_vars, quantile, probs = 0.50),
Q75 = sapply(numeric_vars, quantile, probs = 0.75)
)
# Create the datatable and round the values to 2 decimal places
library(DT)
datatable(percentiles, options = list(pageLength = 10), rownames = FALSE) %>%
formatRound(columns = 2:4, digits = 2) %>%
formatStyle(columns = 1, fontWeight = 'bold') %>%
formatStyle(columns = 2:4, backgroundColor = 'white') %>%
formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
formatStyle(columns = 1, textAlign = 'center') %>%
formatStyle(columns = 2:4, textAlign = 'center')
library(tibble)
#find the count of observations that are greater than 2 times the Q75 IQR label them as "high outliers"
high_outliers <- sapply(numeric_vars, function(x) sum(x > (quantile(x, 0.75) + 2 * IQR(x))))
# Convert to a data frame and rename the columns
high_outliers <- as.data.frame(high_outliers)
#find the count of observations that are less than 2 times the Q25 IQR label them as "low outliers"
low_outliers <- sapply(numeric_vars, function(x) sum(x < (quantile(x, 0.25) - 2 * IQR(x))))
# Convert to a data frame and rename the columns
low_outliers <- as.data.frame(low_outliers)
# Combine the two data frames
outliers <- cbind(high_outliers, low_outliers)
# Rename the columns
colnames(outliers) <- c("High Outliers", "Low Outliers")
# Create a data frame with the variable names
outliers <- tibble::rownames_to_column(outliers, "Variable")
# Create the datatable
library(DT)
#exclude the variables with no outliers
outliers <- outliers %>% filter(`High Outliers` > 0 | `Low Outliers` > 0)
# Create the datatable
datatable(outliers, options = list(pageLength = 10)) %>%
formatRound(columns = 2:3, digits = 2) %>%
formatStyle(columns = 1, fontWeight = 'bold') %>%
formatStyle(columns = 2:3, backgroundColor = 'white') %>%
formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
formatStyle(columns = 1, textAlign = 'center') %>%
formatStyle(columns = 2:3, textAlign = 'center')
#export the outliers data frame to a csv
write.csv(outliers, "outliers.csv", row.names = FALSE)
#view the complete observation for when int_tgt == 500000
bank[bank$int_tgt > 100000, ]
## # A tibble: 55 × 26
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 200000 1 X A 57 yes 169771
## 2 yes 200000 1 Y B 76 no 100111
## 3 yes 200000 1 X A 71 yes 217081
## 4 yes 500000 1 X E 49.0 no 64972
## 5 yes 112000 1 X C 42 yes 0
## 6 yes 200000 1 X A 58 yes 169803
## 7 yes 200000 1 X A 76 no 100090
## 8 yes 200000 1 X A 72 yes 217106
## 9 yes 500000 1 X E 48.8 no 64995
## 10 yes 112000 1 X C 42 yes 0
## # ℹ 45 more rows
## # ℹ 18 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>
there are 11 observations where int_tgt = 500K.
This data point seems off when we look at the values of the other predictors when int_tgt is $500k. There is little to no variation in the income, demographics,and new sales data that that makes up these observations.
When we look at observations where int_tgt > $100k , we see significantly more variation in the predictor set.
It is only 11 obs, but they’re erroneous.
I am going to reclassify them based on income. Int_tgt == 500K will now take the average value of Int_tgt when deomog_inc is between $60k - $70K
The other high value Int_tgt observations over $100k are somewhat more realistic, as there is variation in the homevalue, income, and age demographic predictors that accompany these observations. There are a sparse number of these observations (44 in total), but they could be true customers of these banks.
They might have bought a home loan or education loans.
#reclassify the int_tgt == 500K observations based on income
bank$int_tgt[bank$int_tgt == 500000] <- mean(bank$int_tgt[bank$demog_inc > 60000 & bank$demog_inc < 70000], na.rm = TRUE)
# Check the summary of the imputed variable
cat("Summary of imputed int_tgt:\n")
## Summary of imputed int_tgt:
summary(bank$int_tgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 2237 0 200000
#bin the observations of int_tgt into 6 bins - up to $200k
library(dplyr)
bank <- bank %>%
mutate(int_tgt_bin = case_when(
int_tgt <= 20000 ~ "0-20k",
int_tgt > 20000 & int_tgt <= 40000 ~ "20k-40k",
int_tgt > 40000 & int_tgt <= 60000 ~ "40k-60k",
int_tgt > 60000 & int_tgt <= 80000 ~ "60k-80k",
int_tgt > 80000 & int_tgt <= 100000 ~ "80k-100k",
TRUE ~ ">100k"
))
# Create a histogram of the binned int_tgt variable
ggplot(bank, aes(x = int_tgt_bin)) +
geom_bar(fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of binned int_tgt", x = "int_tgt Bin", y = "Frequency") +
theme_minimal()
#add the exact count number for each bin to the histogram
ggplot(bank, aes(x = int_tgt_bin)) +
geom_bar(fill = "blue", color = "black", alpha = 0.7) +
geom_text(stat = 'count', aes(label = ..count..), vjust = -0.5) +
labs(title = "Histogram of binned int_tgt", x = "int_tgt Bin", y = "Frequency") +
theme_minimal()
summary(bank$cnt_tgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3118 0.0000 6.0000
#calculate 2x over the 3Q for cnt_tgt
cnt_tgt_3Q <- quantile(bank$cnt_tgt, 0.75) + 2 * IQR(bank$cnt_tgt)
#count the number of cnt_tgt observations that are greater than 2x the 3Q
count_high_cnt_tgt <- sum(bank$cnt_tgt > cnt_tgt_3Q)
print(paste("Number of cnt_tgt observations greater than 2x the 3Q:", count_high_cnt_tgt))
## [1] "Number of cnt_tgt observations greater than 2x the 3Q: 211509"
print(paste("cnt_tgt 3Q:", cnt_tgt_3Q))
## [1] "cnt_tgt 3Q: 0"
#show the observations wher cnt_tgt = 6
bank[bank$cnt_tgt == 6, ]
## # A tibble: 11 × 27
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 16000 6 X E 56.7 yes 127994
## 2 yes 16000 6 X E 56.7 yes 127991
## 3 yes 16000 6 X E 56.7 yes 127989
## 4 yes 16000 6 X E 56.5 yes 127988
## 5 yes 16000 6 X E 56.7 yes 127989
## 6 yes 16000 6 X E 56.7 yes 127991
## 7 yes 16000 6 X E 56.7 yes 127995
## 8 yes 16000 6 X E 56.7 yes 128000
## 9 yes 16000 6 X E 56.7 yes 128006
## 10 yes 16000 6 X E 56.7 yes 127995
## 11 yes 16000 6 X E 56.7 yes 128039
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>
#reclassify the observations when cnt_tgt == 6 based on income
bank$cnt_tgt[bank$cnt_tgt == 6] <- mean(bank$cnt_tgt[bank$demog_inc > 30000 & bank$demog_inc < 35000], na.rm = TRUE)
#round the imputed values to the nearest whole number
bank$cnt_tgt[bank$cnt_tgt == 6] <- round(bank$cnt_tgt[bank$cnt_tgt == 6])
#show the imputed data points
when cnt_tgt = 6, there is little variation in the predictor set… these appear to be erroneous. Again only 11 obs, but better to put them to use than to throw them out.
And better to use them in a meaningful way rather than use them as they’re given (errant!)
#show the observations wher cnt_tgt = 5
bank[bank$cnt_tgt == 5, ]
## # A tibble: 297 × 27
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 17000 5 X A 82 yes 91509
## 2 yes 10000 5 X A 84 no 28767
## 3 yes 20000 5 X A 79 yes 68634
## 4 yes 5000 5 X A 80 yes 74290
## 5 yes 2000 5 X A 48 no 228917
## 6 yes 12000 5 X A 67 no 224969
## 7 yes 2000 5 X A 66 yes 75412
## 8 yes 12000 5 X D 81 no 89601
## 9 yes 14000 5 X E 72 no 49445
## 10 yes 11000 5 X E 41 no 0
## # ℹ 287 more rows
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>
when cnt_tgt = 5, there is more variation in the predictor set… these appear to be true observations
#show the observations wher cnt_tgt = 4
bank[bank$cnt_tgt == 4, ]
## # A tibble: 2,552 × 27
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 9000 4 X A 58 no 51401
## 2 yes 18000 4 X A 80 no 59612
## 3 yes 16000 4 X A 35 no 24720
## 4 yes 8000 4 X A 85 no 99583
## 5 yes 20000 4 X A 61.4 no 136386
## 6 yes 6000 4 X A 55.7 no 167885
## 7 yes 2000 4 X A 67 yes 94492
## 8 yes 12000 4 X A 56.9 no 157205
## 9 yes 20000 4 X A 56.7 no 36902
## 10 yes 6000 4 X A 63 no 33204
## # ℹ 2,542 more rows
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>
#plot a histogram of demog_inc
ggplot(bank, aes(x = demog_inc)) +
geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Customer Income", x = "Customer Income", y = "Frequency") +
theme_minimal()
#count the number of observations where demog_inc == 0
count_zero_income <- sum(bank$demog_inc == 0)
cat("Number of observations with demog_inc == 0:", count_zero_income, "\n")
## Number of observations with demog_inc == 0: 253253
#plot the distribution of demog_homeval when demog_inc == 0
ggplot(bank[bank$demog_inc == 0, ], aes(x = demog_homeval)) +
geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Customer Home Value when Income == 0", x = "Customer Home Value", y = "Frequency") +
theme_minimal()
summary(bank$demog_homeval[bank$demog_inc == 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 49710 76698 112377 137999 600056
summary(bank$demog_homeval)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 51107 73880 106104 122214 600067
#show observations where demog_inc == 0
bank[bank$demog_inc == 0, ]
## # A tibble: 253,253 × 27
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 no 0 0 X A 61.9 no 100313
## 2 no 0 0 X A 74 yes 95496
## 3 no 0 0 X A 77 yes 0
## 4 no 0 0 X A 57.3 no 42583
## 5 no 0 0 X A 83 no 181710
## 6 no 0 0 X A 81 no 61176
## 7 yes 10000 1 X A 57.4 no 316292
## 8 yes 17000 2 Y B 55.7 no 92482
## 9 yes 9000 2 X A 55.2 no 88296
## 10 no 0 0 X A 79 no 63998
## # ℹ 253,243 more rows
## # ℹ 19 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>
#count the number cnt_tgt == 0 when demog_inc == 0
count_zero_income_cnt_tgt <- sum(bank$cnt_tgt[bank$demog_inc == 0] == 0)
cat("Number of observations with cnt_tgt == 0 when demog_inc == 0:", count_zero_income_cnt_tgt, "\n")
## Number of observations with cnt_tgt == 0 when demog_inc == 0: 201674
#show unique values for b_tgt
unique(bank$b_tgt[bank$demog_inc == 0])
## [1] no yes
## Levels: no yes
#count the number of demog_inc == 0 when b_tgt == 2
count_zero_income_b_tgt <- sum(bank$b_tgt[bank$demog_inc == 0] == 2)
cat("Number of observations with b_tgt == 2 when demog_inc == 0:", count_zero_income_b_tgt, "\n")
## Number of observations with b_tgt == 2 when demog_inc == 0: 0
#count the number of demog_inc == 0 when demog_ho == 0
count_zero_income_demog_ho <- sum(bank$demog_ho[bank$demog_inc == 0] == 0)
cat("Number of observations with demog_ho == 0 when demog_inc == 0:", count_zero_income_demog_ho, "\n")
## Number of observations with demog_ho == 0 when demog_inc == 0: 0
#plot the distribution of demog_inc
ggplot(bank, aes(x = demog_inc)) +
geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Customer Income", x = "Customer Income", y = "Frequency") +
theme_minimal()
library(psych)
#describe demog_inc
describe(bank$demog_inc)
## vars n mean sd median trimmed mad min max range
## X1 1 1060038 40368.69 28029.02 43174 39283.05 21628.17 0 200007 200007
## skew kurtosis se
## X1 0.23 0.56 27.22
#Create a new variable called demog_inc2
#copy it from the demog_inc variable
bank$demog_inc2 <- bank$demog_inc
#use linear regression to impute the values where demog_inc2 == 0
lm_model_inc2 <- lm(demog_inc2 ~ demog_homeval + demog_pr + demog_ho + rfm12 + rfm1, data = bank)
summary(lm_model_inc2)
##
## Call:
## lm(formula = demog_inc2 ~ demog_homeval + demog_pr + demog_ho +
## rfm12 + rfm1, data = bank)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83362 -19114 -2588 16252 176286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.507e+04 8.441e+01 178.524 <2e-16 ***
## demog_homeval 7.637e-02 2.410e-04 316.852 <2e-16 ***
## demog_pr 4.197e+01 1.950e+00 21.521 <2e-16 ***
## demog_hoyes 2.795e+04 4.517e+01 618.685 <2e-16 ***
## rfm12 8.504e+00 6.005e-01 14.163 <2e-16 ***
## rfm1 -2.804e+00 1.162e+00 -2.414 0.0158 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23040 on 1060032 degrees of freedom
## Multiple R-squared: 0.3244, Adjusted R-squared: 0.3244
## F-statistic: 1.018e+05 on 5 and 1060032 DF, p-value: < 2.2e-16
# Predict missing values
predicted_values_inc <- predict(lm_model_inc2, newdata = bank[bank$demog_inc2 == 0, ])
# Replace missing values with predicted values
bank$demog_inc2[bank$demog_inc2 == 0] <- predicted_values_inc
# Check the summary of the imputed variable
cat("Summary of imputed demog_inc2:\n")
## Summary of imputed demog_inc2:
describe(bank$demog_inc2)
## vars n mean sd median trimmed mad min max range
## X1 1 1060038 46706.98 20544.33 44034 44691.92 18341.24 2493 200007 197514
## skew kurtosis se
## X1 1.32 3.71 19.95
#create a data frame of the predicted values
predicted_values_df <- data.frame(
demog_inc2 = bank$demog_inc2[bank$demog_inc == 0],
demog_homeval = bank$demog_homeval[bank$demog_inc == 0],
demog_pr = bank$demog_pr[bank$demog_inc == 0],
demog_ho = bank$demog_ho[bank$demog_inc == 0],
rfm12 = bank$rfm12[bank$demog_inc == 0],
rfm1 = bank$rfm1[bank$demog_inc == 0]
)
#plot the distribution of the predicted values for demog_inc2
ggplot(predicted_values_df, aes(x = demog_inc2)) +
geom_histogram(binwidth = 1000, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Histogram of Predicted Values for Customer Income", x = "Predicted Customer Income", y = "Frequency") +
theme_minimal()
#show the observations where demog_inc > 200k
bank[bank$demog_inc > 200000, ]
## # A tibble: 203 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 13000 1 X A 46 yes 500005
## 2 no 0 0 X A 50 yes 499981
## 3 no 0 0 X C 58.9 yes 0
## 4 yes 3000 1 Z A 79 no 287506
## 5 no 0 0 X B 85 yes 499986
## 6 yes 8000 3 X B 49 yes 500009
## 7 no 0 0 X A 73 yes 419308
## 8 yes 4000 1 X A 6 no 323914
## 9 no 0 0 X A 78 yes 471412
## 10 no 0 0 X A 79 yes 368976
## # ℹ 193 more rows
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#show the observations where rfm4 > 8000
bank[bank$rfm4 > 8000, ]
## # A tibble: 11 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 no 0 0 X E 69 no 36275
## 2 no 0 0 X E 67 no 36295
## 3 no 0 0 Y E 68 no 36331
## 4 no 0 0 X E 68 no 36323
## 5 no 0 0 X E 68 no 36299
## 6 no 0 0 X E 68 no 36255
## 7 no 0 0 Y E 68 no 36284
## 8 no 0 0 Y E 69 no 36309
## 9 no 0 0 X E 67 no 36352
## 10 no 0 0 X E 67 no 36308
## 11 no 0 0 X E 68 no 36298
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#reclassify the observations where rfm4 > 8000 as 0
bank$rfm4[bank$rfm4 > 8000] <- 0
#show the observations where rfm2 > 1000
bank[bank$rfm2 > 500, ]
## # A tibble: 11 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 no 0 0 X A 27.6 no 233171
## 2 no 0 0 Y B 27.3 no 233151
## 3 no 0 0 X A 27.6 no 233192
## 4 no 0 0 X A 27.8 no 233204
## 5 no 0 0 X A 27.3 no 233229
## 6 no 0 0 X A 27.6 no 233212
## 7 no 0 0 X A 27.8 no 233218
## 8 no 0 0 X A 27.6 no 233211
## 9 no 0 0 X A 27.3 no 233198
## 10 no 0 0 X A 27.3 no 233177
## 11 no 0 0 X A 27.6 no 233218
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#reclassify the observations where rfm2 > 500 as mean rfm2
bank$rfm2[bank$rfm2 > 500] <- mean(bank$rfm2[bank$rfm2 < 500], na.rm = TRUE)
#show the observations where rfm3 > 3000
bank[bank$rfm3 > 3000, ]
## # A tibble: 11 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 no 0 0 X C 50 no 60072
## 2 no 0 0 X C 50 no 60119
## 3 no 0 0 X C 50 no 60092
## 4 no 0 0 X C 50 no 60126
## 5 no 0 0 X C 50 no 60089
## 6 no 0 0 X C 50 no 60102
## 7 no 0 0 X C 50 no 60110
## 8 no 0 0 X C 50 no 60084
## 9 no 0 0 X C 50 no 60115
## 10 no 0 0 X C 50 no 60090
## 11 no 0 0 X B 50 no 60085
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#classify these as the mean of rfm3
bank$rfm3[bank$rfm3 > 3000] <- mean(bank$rfm3[bank$rfm3 < 3000], na.rm = TRUE)
#show the observations where rfm5 > 10
bank[bank$rfm5 == 18, ]
## # A tibble: 5 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 17000 5 X A 82 yes 91509
## 2 no 0 0 X B 69 yes 83471
## 3 yes 17000 5 X A 82 yes 91463
## 4 no 0 0 X B 69 yes 83509
## 5 yes 17000 5 X A 82 yes 91486
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#classify these obs when in_tgt == 5 as mean of rfm5 for obs where int_tgt == 5
bank$rfm5[bank$rfm5 == 18] <- mean(bank$rfm5[bank$int_tgt == 5], na.rm = TRUE)
#classify these obs when int_tgt == 0 as mean of rfm5 for obs where int_tgt == 0
bank$rfm5[bank$rfm5 == 18] <- mean(bank$rfm5[bank$int_tgt == 0], na.rm = TRUE)
#show observations where rfm6 > 100
bank[bank$rfm6 > 120, ]
## # A tibble: 11 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 19000 2 X A 37 yes 48604
## 2 yes 19000 2 X A 37 yes 48592
## 3 yes 19000 2 X A 37 yes 48581
## 4 yes 19000 2 X A 37 yes 48624
## 5 yes 19000 2 X A 38 yes 48588
## 6 yes 19000 2 X A 37 yes 48598
## 7 yes 19000 2 X A 37 yes 48657
## 8 yes 19000 2 X A 38 yes 48565
## 9 yes 19000 2 X A 37 yes 48609
## 10 yes 19000 2 X A 37 yes 48628
## 11 yes 19000 2 X A 36 yes 48624
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#reclassify these as the mean of rfm6 when int_tgt_bin == 0-20k
bank$rfm6[bank$rfm6 > 120] <- mean(bank$rfm6[bank$int_tgt_bin == "0-20k"], na.rm = TRUE)
#show observations where rfm7 == 11
bank[bank$rfm7 == 11, ]
## # A tibble: 1 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 20000 4 X B 48 yes 56480
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#show observations where rfm8 == 46
bank[bank$rfm8 == 46, ]
## # A tibble: 5 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 no 0 0 Y E 8 no 36504
## 2 no 0 0 X E 8 no 36485
## 3 no 0 0 X E 8 no 36490
## 4 no 0 0 X E 8 no 36497
## 5 no 0 0 X E 8 no 36500
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#reclassify these as 0
bank$rfm8[bank$rfm8 == 46] <- 0
#show observations for when demog_age < 21
bank[bank$demog_age < 21, ]
## # A tibble: 12,066 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 5000 3 X A 16 no 96274
## 2 no 0 0 X A 8 no 59047
## 3 no 0 0 X A 18 no 75077
## 4 no 0 0 X A 17 yes 114000
## 5 no 0 0 X A 17 yes 499986
## 6 no 0 0 X A 17 no 40615
## 7 no 0 0 X A 18 no 0
## 8 no 0 0 Y B 8 no 83072
## 9 no 0 0 X A 17 no 153013
## 10 no 0 0 X A 7 yes 95624
## # ℹ 12,056 more rows
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#reclassify rfm9 as 0 when demog_age < 21
bank$rfm9[bank$demog_age < 21] <- 0
#show observations when rfm12 >500
bank[bank$rfm12 > 500, ]
## # A tibble: 11 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 9000 1 X E 82 no 23102
## 2 yes 9000 1 X E 81 no 23093
## 3 yes 9000 1 X E 81 no 23075
## 4 yes 9000 1 X E 81 no 23122
## 5 yes 9000 1 X E 81 no 23124
## 6 yes 9000 1 X E 81 no 23095
## 7 yes 9000 1 X E 82 no 23083
## 8 yes 9000 1 X E 80 no 23097
## 9 yes 9000 1 X E 81 no 23153
## 10 yes 9000 1 X E 81 no 23134
## 11 yes 9000 1 X E 81 no 23103
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#show observations where rfm3 < 0
bank[bank$rfm3 < 0, ]
## # A tibble: 0 × 28
## # ℹ 28 variables: b_tgt <fct>, int_tgt <dbl>, cnt_tgt <dbl>, cat_input1 <fct>,
## # cat_input2 <fct>, demog_age <dbl>, demog_ho <fct>, demog_homeval <dbl>,
## # demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>, rfm3 <dbl>,
## # rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>, rfm9 <dbl>,
## # rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>, demog_genm <fct>,
## # account <dbl>, dataset <chr>, int_tgt_bin <chr>, demog_inc2 <dbl>
#count the number of observations where demog_age < 21
count_demog_age_lt_21 <- sum(bank$demog_age < 21)
cat("Number of observations with demog_age < 21:", count_demog_age_lt_21, "\n")
## Number of observations with demog_age < 21: 12066
#show the observations where demog_age < 21
bank[bank$demog_age < 21, ]
## # A tibble: 12,066 × 28
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 5000 3 X A 16 no 96274
## 2 no 0 0 X A 8 no 59047
## 3 no 0 0 X A 18 no 75077
## 4 no 0 0 X A 17 yes 114000
## 5 no 0 0 X A 17 yes 499986
## 6 no 0 0 X A 17 no 40615
## 7 no 0 0 X A 18 no 0
## 8 no 0 0 Y B 8 no 83072
## 9 no 0 0 X A 17 no 153013
## 10 no 0 0 X A 7 yes 95624
## # ℹ 12,056 more rows
## # ℹ 20 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>
#exclude the observations where demog_age < 21
bank <- bank[bank$demog_age >= 21, ]
#Create a variable that is income squared
bank$demog_inc2_sq <- bank$demog_inc2^2
#create a variable that is rfm6 squared
bank$rfm6_sq <- bank$rfm6^2
#create a single binary indicator if the 3 conditions are true:
#demog_age < 40
#if demog_pr <= 25
#demog_ho == no
bank$prospect_ho <- ifelse(bank$demog_age < 40 & bank$demog_pr <= 25 & bank$demog_ho == "no", 1, 0)
#Create an interaction term between rfm2 and demog_inc
bank$rfm2_inc2 <- bank$rfm2 * bank$demog_inc2
# Calculate skewness for each numeric variable
skewness_values <- sapply(bank, function(x) if(is.numeric(x)) e1071::skewness(x, na.rm = TRUE) else NA)
# Create a data frame with the variable names and skewness values
skewness_df <- data.frame(
Variable = names(skewness_values),
Skewness = skewness_values
)
# Filter out non-numeric variables
skewness_df <- skewness_df[!is.na(skewness_df$Skewness), ]
# Create the datatable
library(DT)
# Create the datatable
datatable(skewness_df, options = list(pageLength = 10)) %>%
formatRound(columns = 2, digits = 2) %>%
formatStyle(columns = 1, fontWeight = 'bold') %>%
formatStyle(columns = 2, backgroundColor = 'white') %>%
formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
formatStyle(columns = 1, textAlign = 'center') %>%
formatStyle(columns = 2, textAlign = 'center')
#export the datatable to a csv
write.csv(skewness_df, "skewness_df.csv", row.names = FALSE)
#Create a second data frame from the original data set called bank_log
bank_log <- bank
# Apply log transformation and add a small constant
bank_log$int_tgt <- log(bank_log$int_tgt + 1)
bank_log$cnt_tgt <- log(bank_log$cnt_tgt + 1)
bank_log$demog_homeval <- log(bank_log$demog_homeval + 1)
bank_log$demog_inc <- log(bank_log$demog_inc + 1)
bank_log$demog_age <- log(bank_log$demog_age + 1)
bank_log$rfm1 <- log(bank_log$rfm1 + 1)
bank_log$rfm2 <- log(bank_log$rfm2 + 1)
bank_log$rfm3 <- log(bank_log$rfm3 + 1)
bank_log$rfm4 <- log(bank_log$rfm4 + 1)
bank_log$rfm5 <- log(bank_log$rfm5 + 1)
bank_log$rfm6 <- log(bank_log$rfm6 + 1)
bank_log$rfm7 <- log(bank_log$rfm7 + 1)
bank_log$rfm8 <- log(bank_log$rfm8 + 1)
bank_log$rfm9 <- log(bank_log$rfm9 + 1)
bank_log$rfm10 <- log(bank_log$rfm10 + 1)
bank_log$rfm11 <- log(bank_log$rfm11 + 1)
bank_log$rfm12 <- log(bank_log$rfm12 + 1)
bank_log$rfm2_inc2 <- log(bank_log$rfm2_inc2 + 1)
bank_log$demog_inc2_sq <- log(bank_log$demog_inc2_sq + 1)
bank_log$rfm6_sq <- log(bank_log$rfm6_sq + 1)
bank_log$demog_inc2 <- log(bank_log$demog_inc2 + 1)
#check for NaNs
na_count <- sum(is.na(bank_log))
cat("Number of NaN values in bank_log:", na_count, "\n")
## Number of NaN values in bank_log: 5
#count the NaNs for the log transformed variables
log_vars <- c("int_tgt", "cnt_tgt", "demog_homeval", "demog_inc", "demog_age", "rfm1", "rfm2", "rfm3", "rfm4", "rfm5", "rfm6", "rfm7", "rfm8", "rfm9", "rfm10", "rfm11", "rfm12","demog_inc2_sq","rfm2_inc2")
na_counts <- sapply(bank_log[log_vars], function(x) sum(is.na(x)))
# Create a data frame with the variable names and NaN counts
na_counts_df <- data.frame(
Variable = names(na_counts),
NaN_Count = na_counts
)
# Create the datatable
library(DT)
# Create the datatable
datatable(na_counts_df, options = list(pageLength = 10)) %>%
formatRound(columns = 2, digits = 2) %>%
formatStyle(columns = 1, fontWeight = 'bold') %>%
formatStyle(columns = 2, backgroundColor = 'white') %>%
formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
formatStyle(columns = 1, textAlign = 'center') %>%
formatStyle(columns = 2, textAlign = 'center')
#create a datatable for the skewness of each variable in the bank_log data set
# Calculate skewness for each numeric variable
skewness_values_log <- sapply(bank_log, function(x) if(is.numeric(x)) e1071::skewness(x, na.rm = TRUE) else NA)
# Create a data frame with the variable names and skewness values
skewness_df_log <- data.frame(
Variable = names(skewness_values_log),
Skewness = skewness_values_log
)
#Create a data table
# Filter out non-numeric variables
skewness_df_log <- skewness_df_log[!is.na(skewness_df_log$Skewness), ]
# Create the datatable
library(DT)
# Create the datatable
datatable(skewness_df_log, options = list(pageLength = 10)) %>%
formatRound(columns = 2, digits = 2) %>%
formatStyle(columns = 1, fontWeight = 'bold') %>%
formatStyle(columns = 2, backgroundColor = 'white') %>%
formatStyle(columns = 1, backgroundColor = 'lightgreen') %>%
formatStyle(columns = 1, textAlign = 'center') %>%
formatStyle(columns = 2, textAlign = 'center')
#export the datatable to a csv
write.csv(skewness_df_log, "skewness_df_log.csv", row.names = FALSE)
#show observations where rfm5 is na
bank_log[is.na(bank_log$rfm5), ]
## # A tibble: 5 × 32
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 9.74 1.79 X A 4.42 yes 11.4
## 2 no 0 0 X B 4.25 yes 11.3
## 3 yes 9.74 1.79 X A 4.42 yes 11.4
## 4 no 0 0 X B 4.25 yes 11.3
## 5 yes 9.74 1.79 X A 4.42 yes 11.4
## # ℹ 24 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>, demog_inc2_sq <dbl>, rfm6_sq <dbl>, prospect_ho <dbl>,
## # rfm2_inc2 <dbl>
#impute the NANs as the mean of rfm5 when int_tgt_bin == 0-20k
bank_log$rfm5[is.na(bank_log$rfm5)] <- mean(bank_log$rfm5[bank_log$int_tgt_bin == "0-20k"], na.rm = TRUE)
# Check the summary of the imputed variable
summary(bank_log$rfm5)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.6931 1.0986 1.2321 1.6094 2.8904
head(bank_log)
## # A tibble: 6 × 32
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 yes 8.85 0.693 X A 4.11 no 11.0
## 2 yes 8.85 1.10 X A 4.11 no 11.0
## 3 yes 9.62 1.10 X A 4.06 no 10.7
## 4 no 0 0 X A 4.23 no 11.4
## 5 no 0 0 X A 4.14 no 11.5
## 6 no 0 0 X A 3.30 no 10.2
## # ℹ 24 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>, demog_inc2_sq <dbl>, rfm6_sq <dbl>, prospect_ho <dbl>,
## # rfm2_inc2 <dbl>
#show the values where demog_inc == 0
#use the bank_log data set
bank_log[bank_log$demog_inc == 0, ]
## # A tibble: 251,175 × 32
## b_tgt int_tgt cnt_tgt cat_input1 cat_input2 demog_age demog_ho demog_homeval
## <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 no 0 0 X A 4.14 no 11.5
## 2 no 0 0 X A 4.32 yes 11.5
## 3 no 0 0 X A 4.36 yes 0
## 4 no 0 0 X A 4.07 no 10.7
## 5 no 0 0 X A 4.43 no 12.1
## 6 no 0 0 X A 4.41 no 11.0
## 7 yes 9.21 0.693 X A 4.07 no 12.7
## 8 yes 9.74 1.10 Y B 4.04 no 11.4
## 9 yes 9.11 1.10 X A 4.03 no 11.4
## 10 no 0 0 X A 4.38 no 11.1
## # ℹ 251,165 more rows
## # ℹ 24 more variables: demog_inc <dbl>, demog_pr <dbl>, rfm1 <dbl>, rfm2 <dbl>,
## # rfm3 <dbl>, rfm4 <dbl>, rfm5 <dbl>, rfm6 <dbl>, rfm7 <dbl>, rfm8 <dbl>,
## # rfm9 <dbl>, rfm10 <dbl>, rfm11 <dbl>, rfm12 <dbl>, demog_genf <fct>,
## # demog_genm <fct>, account <dbl>, dataset <chr>, int_tgt_bin <chr>,
## # demog_inc2 <dbl>, demog_inc2_sq <dbl>, rfm6_sq <dbl>, prospect_ho <dbl>,
## # rfm2_inc2 <dbl>
#un log the cnt_tgt variable in the bank_log data set
bank_log$cnt_tgt <- exp(bank_log$cnt_tgt) - 1
#un log the demog_pr variable in the bank_log data set
bank_log$demog_pr <- exp(bank_log$demog_pr) - 1
#plot the density for demog_age
#use the bank_log data set
ggplot(bank_log, aes(x = demog_age)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(title = "Density Plot of Customer Age", x = "Customer Age", y = "Density") +
theme_minimal()
#label the x axis for increments of .25
ggplot(bank_log, aes(x = demog_age)) +
geom_density(fill = "blue", alpha = 0.5) +
scale_x_continuous(breaks = seq(0, 10, by = 0.25)) +
labs(title = "Density Plot of Customer Age", x = "Customer Age", y = "Density") +
theme_minimal()
#plot the density for rfm2
#label the x axis for increments of .25
ggplot(bank_log, aes(x = rfm2)) +
geom_density(fill = "blue", alpha = 0.5) +
scale_x_continuous(breaks = seq(0, 10, by = 0.25)) +
labs(title = "Density Plot of Customer RFM2", x = "Customer RFM2", y = "Density") +
theme_minimal()
#plot the density of demog_homeval
#label the x axis for increments of .25
ggplot(bank_log, aes(x = demog_homeval)) +
geom_density(fill = "blue", alpha = 0.5) +
scale_x_continuous(breaks = seq(0, 50, by = 1)) +
labs(title = "Density Plot of Customer Home Value", x = "Customer Home Value", y = "Density") +
theme_minimal()
#plot the density of demog_inc2
#label the x axis for increments of .25
ggplot(bank_log, aes(x = demog_inc2)) +
geom_density(fill = "blue", alpha = 0.5) +
scale_x_continuous(breaks = seq(0, 50, by = 1)) +
labs(title = "Density Plot of Customer Income", x = "Customer Income", y = "Density") +
theme_minimal()
#plot the scatter plot of demog_inc2 and demog_homeval
#highlight the points where B_tgt == 1
ggplot(bank_log, aes(x = demog_inc2, y = demog_homeval, color = b_tgt)) +
geom_point(alpha = 0.5) +
labs(title = "Scatter Plot of Customer Income vs Home Value", x = "Customer Income", y = "Customer Home Value") +
theme_minimal() +
scale_color_manual(values = c("red", "blue"))
#create an interaction term of demog_inc and demog_homeval
bank_log$demog_inc_homeval <- bank_log$demog_inc * bank_log$demog_homeval
library(psych)
describe(bank_log)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median min
## b_tgt 1 1047972 NaN NA NA Inf
## int_tgt 2 1047972 1.780000e+00 3.630000e+00 0.000000e+00 0.000e+00
## cnt_tgt 3 1047972 3.100000e-01 7.000000e-01 0.000000e+00 0.000e+00
## cat_input1 4 1047972 NaN NA NA Inf
## cat_input2 5 1047972 NaN NA NA Inf
## demog_age 6 1047972 4.070000e+00 2.500000e-01 4.080000e+00 3.090e+00
## demog_ho 7 1047972 NaN NA NA Inf
## demog_homeval 8 1047972 1.120000e+01 1.380000e+00 1.121000e+01 0.000e+00
## demog_inc 9 1047972 8.230000e+00 4.630000e+00 1.067000e+01 0.000e+00
## demog_pr 10 1047972 1.517608e+39 1.707822e+41 2.904885e+13 0.000e+00
## rfm1 11 1047972 2.690000e+00 5.700000e-01 2.770000e+00 0.000e+00
## rfm2 12 1047972 2.550000e+00 4.700000e-01 2.540000e+00 9.500e-01
## rfm3 13 1047972 2.690000e+00 5.200000e-01 2.770000e+00 0.000e+00
## rfm4 14 1047972 2.760000e+00 5.400000e-01 2.770000e+00 0.000e+00
## rfm5 15 1047972 1.230000e+00 5.200000e-01 1.100000e+00 0.000e+00
## rfm6 16 1047972 2.040000e+00 8.100000e-01 2.080000e+00 0.000e+00
## rfm7 17 1047972 8.200000e-01 5.700000e-01 6.900000e-01 0.000e+00
## rfm8 18 1047972 1.520000e+00 7.600000e-01 1.610000e+00 0.000e+00
## rfm9 19 1047972 2.930000e+00 2.600000e-01 2.940000e+00 1.100e+00
## rfm10 20 1047972 2.590000e+00 2.700000e-01 2.560000e+00 0.000e+00
## rfm11 21 1047972 1.820000e+00 2.300000e-01 1.950000e+00 0.000e+00
## rfm12 22 1047972 4.060000e+00 6.300000e-01 4.170000e+00 0.000e+00
## demog_genf 23 1047972 NaN NA NA Inf
## demog_genm 24 1047972 NaN NA NA Inf
## account 25 1047972 1.005300e+08 3.060058e+05 1.005300e+08 1.000e+08
## dataset 26 1047972 NaN NA NA Inf
## int_tgt_bin 27 1047972 NaN NA NA Inf
## demog_inc2 28 1047972 1.066000e+01 4.300000e-01 1.069000e+01 7.820e+00
## demog_inc2_sq 29 1047972 2.132000e+01 8.600000e-01 2.138000e+01 1.564e+01
## rfm6_sq 30 1047972 3.800000e+00 1.790000e+00 3.910000e+00 0.000e+00
## prospect_ho 31 1047972 1.000000e-02 1.100000e-01 0.000000e+00 0.000e+00
## rfm2_inc2 32 1047972 1.311000e+01 6.900000e-01 1.311000e+01 9.780e+00
## demog_inc_homeval 33 1047972 9.292000e+01 5.318000e+01 1.175800e+02 0.000e+00
## max range skew kurtosis se
## b_tgt -Inf -Inf NA NA NA
## int_tgt 1.22100e+01 1.221000e+01 1.57 0.51 0.000000e+00
## cnt_tgt 5.00000e+00 5.000000e+00 2.40 5.57 0.000000e+00
## cat_input1 -Inf -Inf NA NA NA
## cat_input2 -Inf -Inf NA NA NA
## demog_age 4.50000e+00 1.410000e+00 -0.77 0.54 0.000000e+00
## demog_ho -Inf -Inf NA NA NA
## demog_homeval 1.33000e+01 1.330000e+01 -5.99 46.94 0.000000e+00
## demog_inc 1.22100e+01 1.221000e+01 -1.21 -0.52 0.000000e+00
## demog_pr 7.30706e+43 7.307060e+43 178.52 46856.25 1.668276e+38
## rfm1 8.22000e+00 8.220000e+00 -1.11 6.10 0.000000e+00
## rfm2 6.08000e+00 5.130000e+00 0.36 1.28 0.000000e+00
## rfm3 7.60000e+00 7.600000e+00 0.12 1.38 0.000000e+00
## rfm4 8.22000e+00 8.220000e+00 -0.48 3.80 0.000000e+00
## rfm5 2.89000e+00 2.890000e+00 -0.22 -0.02 0.000000e+00
## rfm6 4.80000e+00 4.800000e+00 -0.21 -0.48 0.000000e+00
## rfm7 2.48000e+00 2.480000e+00 -0.03 -0.81 0.000000e+00
## rfm8 3.61000e+00 3.610000e+00 -0.14 -0.62 0.000000e+00
## rfm9 3.40000e+00 2.300000e+00 -2.52 10.10 0.000000e+00
## rfm10 4.36000e+00 4.360000e+00 0.72 4.25 0.000000e+00
## rfm11 3.14000e+00 3.140000e+00 -1.35 5.47 0.000000e+00
## rfm12 6.35000e+00 6.350000e+00 -0.38 -1.04 0.000000e+00
## demog_genf -Inf -Inf NA NA NA
## demog_genm -Inf -Inf NA NA NA
## account 1.01060e+08 1.060037e+06 0.00 -1.20 2.989200e+02
## dataset -Inf -Inf NA NA NA
## int_tgt_bin -Inf -Inf NA NA NA
## demog_inc2 1.22100e+01 4.380000e+00 -0.10 -0.23 0.000000e+00
## demog_inc2_sq 2.44100e+01 8.770000e+00 -0.10 -0.23 0.000000e+00
## rfm6_sq 9.58000e+00 9.580000e+00 -0.20 -0.73 0.000000e+00
## prospect_ho 1.00000e+00 1.000000e+00 8.84 76.19 0.000000e+00
## rfm2_inc2 1.72100e+01 7.430000e+00 0.13 0.44 0.000000e+00
## demog_inc_homeval 1.62340e+02 1.623400e+02 -1.11 -0.61 5.000000e-02
library(rJavaEnv)
java_install(
java_distrib_path = "C:/Program Files/R/R-4.4.3/amazon-corretto-17-x64-windows-jdk.zip",
autoset_java_env = TRUE
)
## Java distribution amazon-corretto-17-x64-windows-jdk.zip already unpacked at
## 'C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17'
## Junction creation failed. This is likely because the project directory is not
## on the same disk as the R package cache directory. Java files will instead be
## copied to
## 'C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/rjavaenv/windows/x64/17'
## Warning in dir.create(project_version_path, recursive = TRUE):
## 'C:\Users\andre\OneDrive\Desktop\School\MSQE\1_Coursework\Data\ECON_562\Projects\Final
## Project\R_files\rjavaenv\windows\x64\17' already exists
## Java copied to project
## 'C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/rjavaenv/windows/x64/17'
## ✔ Current R Session: JAVA_HOME and PATH set to 'C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17'
##
## ✔ Current R Project/Working Directory: JAVA_HOME and PATH set to ''C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17'' in .Rprofile at ''C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files''
##
## Java 17 (amazon-corretto-17-x64-windows-jdk.zip) for windows x64 installed at
## 'C:\Users\andre\AppData\Local/R/cache/R/rJavaEnv/installed/windows/x64/17' and
## symlinked to
## 'C:/Users/andre/OneDrive/Desktop/School/MSQE/1_Coursework/Data/ECON_562/Projects/Final Project/R_files/rjavaenv/windows/x64/17'
## [1] "C:\\Users\\andre\\AppData\\Local/R/cache/R/rJavaEnv/installed/windows/x64/17"
#initialize h20
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start H2O
h2o.init(nthreads = 5, max_mem_size = "16G")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\andre\AppData\Local\Temp\RtmpQnd2cQ\file5e4c29bc7169/h2o_andre_started_from_r.out
## C:\Users\andre\AppData\Local\Temp\RtmpQnd2cQ\file5e4ce054b37/h2o_andre_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 830 milliseconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year and 5 months
## H2O cluster name: H2O_started_from_R_andre_zxa961
## H2O cluster total nodes: 1
## H2O cluster total memory: 16.00 GB
## H2O cluster total cores: 16
## H2O cluster allowed cores: 5
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.3 (2025-02-28 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year and 5 months) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Convert the data frame to an H2O frame
bank_h2o <- as.h2o(bank_log)
## | | | 0% | |======================================================================| 100%
# Check if 'dataset' column exists
print("dataset" %in% colnames(bank_h2o))
## [1] TRUE
# Look at all column names to confirm
print(colnames(bank_h2o))
## [1] "b_tgt" "int_tgt" "cnt_tgt"
## [4] "cat_input1" "cat_input2" "demog_age"
## [7] "demog_ho" "demog_homeval" "demog_inc"
## [10] "demog_pr" "rfm1" "rfm2"
## [13] "rfm3" "rfm4" "rfm5"
## [16] "rfm6" "rfm7" "rfm8"
## [19] "rfm9" "rfm10" "rfm11"
## [22] "rfm12" "demog_genf" "demog_genm"
## [25] "account" "dataset" "int_tgt_bin"
## [28] "demog_inc2" "demog_inc2_sq" "rfm6_sq"
## [31] "prospect_ho" "rfm2_inc2" "demog_inc_homeval"
# Get description of your H2O frame including types
h2o.describe(bank_h2o)
## Label Type Missing Zeros PosInf NegInf Min
## 1 b_tgt enum 0 839461 0 0 0.000000e+00
## 2 int_tgt real 0 843311 0 0 0.000000e+00
## 3 cnt_tgt real 0 839461 0 0 0.000000e+00
## 4 cat_input1 enum 0 820863 0 0 0.000000e+00
## 5 cat_input2 enum 0 185840 0 0 0.000000e+00
## 6 demog_age real 0 0 0 0 3.091042e+00
## 7 demog_ho enum 0 470985 0 0 0.000000e+00
## 8 demog_homeval real 0 12016 0 0 0.000000e+00
## 9 demog_inc real 0 251175 0 0 0.000000e+00
## 10 demog_pr real 0 30838 0 0 0.000000e+00
## 11 rfm1 real 0 14096 0 0 0.000000e+00
## 12 rfm2 real 0 0 0 0 9.477894e-01
## 13 rfm3 real 0 2 0 0 0.000000e+00
## 14 rfm4 real 0 4489 0 0 0.000000e+00
## 15 rfm5 real 0 52785 0 0 0.000000e+00
## 16 rfm6 real 0 17583 0 0 0.000000e+00
## 17 rfm7 real 0 246969 0 0 0.000000e+00
## 18 rfm8 real 0 76595 0 0 0.000000e+00
## 19 rfm9 real 0 0 0 0 1.098612e+00
## 20 rfm10 real 0 4 0 0 0.000000e+00
## 21 rfm11 real 0 756 0 0 0.000000e+00
## 22 rfm12 real 0 11 0 0 0.000000e+00
## 23 demog_genf enum 0 459070 0 0 0.000000e+00
## 24 demog_genm enum 0 588902 0 0 0.000000e+00
## 25 account int 0 0 0 0 1.000000e+08
## 26 dataset string 0 0 0 0 NaN
## 27 int_tgt_bin string 0 0 0 0 NaN
## 28 demog_inc2 real 0 0 0 0 7.821643e+00
## 29 demog_inc2_sq real 0 0 0 0 1.564248e+01
## 30 rfm6_sq real 0 17583 0 0 0.000000e+00
## 31 prospect_ho int 0 1035062 0 0 0.000000e+00
## 32 rfm2_inc2 real 0 0 0 0 9.779775e+00
## 33 demog_inc_homeval real 0 253346 0 0 0.000000e+00
## Max Mean Sigma Cardinality
## 1 1.000000e+00 1.989662e-01 3.992227e-01 2
## 2 1.220608e+01 1.778894e+00 3.627034e+00 NA
## 3 5.000000e+00 3.106866e-01 6.984116e-01 NA
## 4 2.000000e+00 NA NA 3
## 5 4.000000e+00 NA NA 5
## 6 4.499810e+00 4.065895e+00 2.520369e-01 NA
## 7 1.000000e+00 5.505748e-01 4.974358e-01 2
## 8 1.330480e+01 1.119889e+01 1.379983e+00 NA
## 9 1.220611e+01 8.229451e+00 4.629188e+00 NA
## 10 7.307060e+43 1.517608e+39 1.707822e+41 NA
## 11 8.219948e+00 2.686471e+00 5.712160e-01 NA
## 12 6.079933e+00 2.545268e+00 4.677873e-01 NA
## 13 7.601902e+00 2.688487e+00 5.150642e-01 NA
## 14 8.219948e+00 2.760929e+00 5.441262e-01 NA
## 15 2.890372e+00 1.232083e+00 5.197502e-01 NA
## 16 4.795791e+00 2.044768e+00 8.119793e-01 NA
## 17 2.484907e+00 8.231268e-01 5.667126e-01 NA
## 18 3.610918e+00 1.518190e+00 7.623620e-01 NA
## 19 3.401197e+00 2.934654e+00 2.609206e-01 NA
## 20 4.356709e+00 2.589426e+00 2.737642e-01 NA
## 21 3.135494e+00 1.824366e+00 2.349549e-01 NA
## 22 6.349139e+00 4.057001e+00 6.258354e-01 NA
## 23 1.000000e+00 5.619444e-01 4.961483e-01 2
## 24 1.000000e+00 4.380556e-01 4.961483e-01 2
## 25 1.010600e+08 1.005300e+08 3.060058e+05 NA
## 26 NaN NA NA NA
## 27 NaN NA NA NA
## 28 1.220611e+01 1.065957e+01 4.314523e-01 NA
## 29 2.441222e+01 2.131909e+01 8.629272e-01 NA
## 30 9.575053e+00 3.797591e+00 1.789999e+00 NA
## 31 1.000000e+00 1.231903e-02 1.103054e-01 NA
## 32 1.721277e+01 1.311257e+01 6.858268e-01 NA
## 33 1.623359e+02 9.292136e+01 5.318459e+01 NA
# This creates a numeric column based on the string values
bank_h2o$dataset_num <- h2o.asnumeric(bank_h2o$dataset)
# Check the levels of the 'dataset' column
print(h2o.levels(bank_h2o["dataset_num"]))
## NULL
#check unique values in the dataset column
print(h2o.unique(bank_h2o["dataset_num"]))
## C1
## 1 2
## 2 1
## 3 3
##
## [3 rows x 1 column]
# Training set: where dataset == 1
bank_train <- bank_h2o[bank_h2o$dataset_num == 1, ]
# Validation set: where dataset == 2
bank_test <- bank_h2o[bank_h2o$dataset_num == 2, ]
# Test set: where dataset == 3
bank_test_final <- bank_h2o[bank_h2o$dataset_num == 3, ]
#check the dimensions of the resulting data frames
print(dim(bank_train))
## [1] 628940 34
print(dim(bank_test))
## [1] 209366 34
print(dim(bank_test_final))
## [1] 209666 34
#count the number of observations where b_tgt == 1
count_b_tgt_1 <- h2o.nrow(bank_train[bank_train$b_tgt == "yes", ])
cat("Number of observations where b_tgt == 1 in the training set:", count_b_tgt_1, "\n")
## Number of observations where b_tgt == 1 in the training set: 125285
#count the number of observations where b_tgt == "no"
count_b_tgt_0 <- h2o.nrow(bank_train[bank_train$b_tgt == "no", ])
cat("Number of observations where b_tgt == 0 in the training set:", count_b_tgt_0, "\n")
## Number of observations where b_tgt == 0 in the training set: 503655
#check if the themis package is installed
if (!requireNamespace("themis", quietly = TRUE)) {
install.packages("themis")
}
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(themis)
# Convert H2OFrame to R dataframe
bank_train_df <- as.data.frame(bank_train)
#remove int_tgt_bin and column
bank_train_df <- bank_train_df[, !colnames(bank_train_df) %in% c("int_tgt_bin", "dataset_num")]
# Apply SMOTE to balance the classes
bank_train_smote <- bank_train_df %>%
recipe(b_tgt ~ .) %>%
step_rm(dataset ) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(b_tgt, over_ratio = 1) %>%
prep() %>%
bake(new_data = NULL)
# Convert back to H2OFrame if needed
bank_train_balanced <- as.h2o(bank_train_smote)
## | | | 0% | |======================================================================| 100%
#verify the balance of the classes
# Count the number of observations where b_tgt == 1
count_b_tgt_1 <- h2o.nrow(bank_train_balanced[bank_train_balanced$b_tgt == "yes", ])
cat("Number of observations where b_tgt == 1 in the training set:", count_b_tgt_1, "\n")
## Number of observations where b_tgt == 1 in the training set: 503655
# Count the number of observations where b_tgt == 0
count_b_tgt_0 <- h2o.nrow(bank_train_balanced[bank_train_balanced$b_tgt == "no", ])
cat("Number of observations where b_tgt == 0 in the training set:", count_b_tgt_0, "\n")
## Number of observations where b_tgt == 0 in the training set: 503655
# Check the distribution of the target variable
h2o.table(bank_train_balanced$b_tgt)
## b_tgt Count
## 1 no 503655
## 2 yes 503655
##
## [2 rows x 2 columns]
#check the dimensions of the balanced data frame
print(dim(bank_train_balanced))
## [1] 1007310 35
column_headers <- names(bank_train)
print(column_headers)
## [1] "b_tgt" "int_tgt" "cnt_tgt"
## [4] "cat_input1" "cat_input2" "demog_age"
## [7] "demog_ho" "demog_homeval" "demog_inc"
## [10] "demog_pr" "rfm1" "rfm2"
## [13] "rfm3" "rfm4" "rfm5"
## [16] "rfm6" "rfm7" "rfm8"
## [19] "rfm9" "rfm10" "rfm11"
## [22] "rfm12" "demog_genf" "demog_genm"
## [25] "account" "dataset" "int_tgt_bin"
## [28] "demog_inc2" "demog_inc2_sq" "rfm6_sq"
## [31] "prospect_ho" "rfm2_inc2" "demog_inc_homeval"
## [34] "dataset_num"
print(dim(bank_train))
## [1] 628940 34
print(dim(bank_test))
## [1] 209366 34
print(dim(bank_h2o))
## [1] 1047972 34
print(dim(bank_train_balanced))
## [1] 1007310 35
col_headers <- names(bank_train_balanced)
print(col_headers)
## [1] "int_tgt" "cnt_tgt" "demog_age"
## [4] "demog_homeval" "demog_inc" "demog_pr"
## [7] "rfm1" "rfm2" "rfm3"
## [10] "rfm4" "rfm5" "rfm6"
## [13] "rfm7" "rfm8" "rfm9"
## [16] "rfm10" "rfm11" "rfm12"
## [19] "account" "demog_inc2" "demog_inc2_sq"
## [22] "rfm6_sq" "prospect_ho" "rfm2_inc2"
## [25] "demog_inc_homeval" "cat_input1_Y" "cat_input1_Z"
## [28] "cat_input2_B" "cat_input2_C" "cat_input2_D"
## [31] "cat_input2_E" "demog_ho_yes" "demog_genf_yes"
## [34] "demog_genm_yes" "b_tgt"
# Set the response variable
response <- "b_tgt"
# Set the predictor variables
#include all variables from the bank_train_balanced data set
predictors <- c( "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr",
"rfm2", "rfm3", "rfm10", "rfm11", "rfm4", "rfm6", "rfm8", "rfm9",
"rfm12", "demog_genf", "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" , "cat_input1" , "cat_input2")
# Set the training and validation sets
train_set <- bank_train
valid_set <- bank_test
# Set the test set
test_set <- bank_test_final
# Set the stopping criteria
stopping_criteria <- list(
stopping_rounds = 10,
stopping_metric = "AUC",
stopping_tolerance = 0.01
)
#Algorithms to include
algorithms <- c("GLM", "DRF", "XGBoost", "GBM")
# Set exploitation and exploration parameters
exploration_params <- list(
max_runtime_secs = 2700, # Run for maximum 1 hour
max_models = 50, # Build maximum 50 models
seed = 1234 # Set a seed for reproducibility
)
exploration_ratio <- 0.3
sort_metric <- "AUC"
# Set the AutoML parameters
automl_params_B_tgt <- list(
x = predictors,
y = response,
training_frame = train_set,
validation_frame = valid_set,
leaderboard_frame = test_set,
max_runtime_secs = exploration_params$max_runtime_secs,
max_models = exploration_params$max_models,
seed = exploration_params$seed,
stopping_rounds = stopping_criteria$stopping_rounds,
stopping_metric = stopping_criteria$stopping_metric,
stopping_tolerance = stopping_criteria$stopping_tolerance,
sort_metric = sort_metric,
exclude_algos = c("DeepLearning", "StackedEnsemble"),
include_algos = algorithms
)
# Run AutoML
automl_B_tgt_org <- h2o.automl(
x = predictors,
y = response,
training_frame = train_set,
validation_frame = valid_set, # For validation during training
max_runtime_secs = exploration_params$max_runtime_secs,
max_models = exploration_params$max_models,
seed = exploration_params$seed,
stopping_rounds = stopping_criteria$stopping_rounds,
stopping_metric = stopping_criteria$stopping_metric,
stopping_tolerance = stopping_criteria$stopping_tolerance,
sort_metric = sort_metric,
include_algos = algorithms
)
# 2. Get the leaderboard (ranked on validation data)
leaderboard <- h2o.get_leaderboard(automl_B_tgt_org)
print("AutoML Leaderboard (ranked on validation data):")
print(leaderboard)
# 3. Access and evaluate the best model on validation data
best_model <- automl_B_tgt_org@leader
val_perf <- h2o.performance(best_model, valid_set)
# 4. Print detailed validation metrics
print("Best Model Performance on Validation Data:")
print(val_perf)
print(paste("Validation AUC:", h2o.auc(val_perf)))
print(paste("Validation F1 Score:", h2o.F1(val_perf)))
print(paste("Validation Accuracy:", h2o.accuracy(val_perf)))
print(paste("Validation LogLoss:", h2o.logloss(val_perf)))
print(paste("Validation RMSE:", h2o.rmse(val_perf)))
# Get comprehensive model information
model_details <- h2o.getModel(best_model@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details)
# For a more structured view of just the parameters
params_table <- as.data.frame(model_details@parameters)
print(params_table)
#export the summary to a csv
write.csv(params_table, "model_details.csv", row.names = FALSE)
# Get the variable importance
varimp <- h2o.varimp(best_model)
# Convert to a data frame
varimp_df <- as.data.frame(varimp)
# Sort by relative importance
varimp_df <- varimp_df[order(varimp_df$relative_importance, decreasing = TRUE), ]
#plot the variable importance
library(ggplot2)
ggplot(varimp_df, aes(x = reorder(variable, -relative_importance), y = relative_importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Variable Importance: Best B_tgt Model (DRF)", x = "Variables", y = "Relative Importance") +
theme_minimal()
# show the confusion matrix for the best model
confusion_matrix <- h2o.confusionMatrix(best_model, valid_set)
print("Confusion Matrix:")
print(confusion_matrix)
#pull the threshold for the best model
threshold <- h2o.find_threshold_by_max_metric(val_perf, "f1")
print(paste("Best Threshold for F1 Score:", threshold))
library(h2o)
# Generate partial dependence plots for rfm3
drf_model <- h2o.getModel("DRF_1_AutoML_1_20250517_94015")
pdp_rfm3 <- h2o.partialPlot(
object = drf_model,
data = train_set,
cols = c("rfm3")
)
# Generate partial dependence plots for rfm2
pdp_rfm2 <- h2o.partialPlot(
object = drf_model,
data = train_set,
cols = c("rfm2")
)
# Plot the results
plot(pdp_rfm3)
plot(pdp_rfm2)
# Get all models from your AutoML run
all_models <- as.data.frame(automl_B_tgt_org@leaderboard)
# Filter for GLM models
glm_models <- all_models[grepl("GLM", all_models$model_id), ]
# Get the top GLM model ID
if (nrow(glm_models) > 0) {
top_glm_id <- as.character(glm_models$model_id[1])
best_glm_model <- h2o.getModel(top_glm_id)
# View the model summary
summary(best_glm_model)
} else {
print("No GLM models found in the AutoML run")
}
#export the confusion matrix
write.csv(confusion_matrix, "confusion_matrix.csv", row.names = FALSE)
#pull the threshold for the GLM model
threshold_glm <- h2o.find_threshold_by_max_metric(val_perf, "f1")
print(paste("Best Threshold for F1 Score (GLM):", threshold_glm))
# Filter for GBM models
gbm_models <- all_models[grepl("GBM", all_models$model_id), ]
# Get the top GBM model ID
if (nrow(gbm_models) > 0) {
top_gbm_id <- as.character(gbm_models$model_id[1])
best_gbm_model <- h2o.getModel(top_gbm_id)
# View the model summary
summary(best_gbm_model)
} else {
print("No GBM models found in the AutoML run")
}
#pull the threshold for the GBM model
threshold_gbm <- h2o.find_threshold_by_max_metric(val_perf, "f1")
print(paste("Best Threshold for F1 Score (GBM):", threshold_gbm))
#Evaluate DRF model DRF_1_AutoML_1_20250517_94015 on the test data
#predict B-tgt
#Get Model ID
model_id <- automl_B_tgt_org@leaderboard[1, "model_id"]
# Get the model
best_model <- h2o.getModel(model_id)
#Print model ID
print(paste("Model ID:", model_id))
#check dimensions of the test set
print(dim(bank_test_final))
# Make predictions on the test set
predictions <- h2o.predict(best_model, newdata = bank_test_final)
# Convert predictions to a data frame
predictions_df <- as.data.frame(predictions)
# Add the original test set data to the predictions
predictions_df <- cbind(as.data.frame(bank_test_final), predictions_df)
#Confusion Matrix
confusion_matrix_test <- h2o.confusionMatrix(best_model, bank_test_final)
print("Confusion Matrix for Test Set:")
print(confusion_matrix_test)
# AUC and ROC curve
roc_curve <- h2o.performance(best_model, newdata = bank_test_final)
auc_value <- h2o.auc(roc_curve)
#-------- ADDITIONAL METRICS --------#
# Assuming positive class is 1 (you may need to adjust based on your actual class labels)
tryCatch({
# Extract values from confusion matrix
tn <- confusion_matrix_test[1,1] # True Negatives
fp <- confusion_matrix_test[1,2] # False Positives
fn <- confusion_matrix_test[2,1] # False Negatives
tp <- confusion_matrix_test[2,2] # True Positives
# Calculate metrics
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
f1 <- 2 * precision * recall / (precision + recall)
accuracy <- (tp + tn) / (tp + tn + fp + fn)
sensitivity <- recall
specificity <- tn / (tn + fp)
cat("\nMetrics calculated from confusion matrix:\n")
cat("Precision:", precision, "\n")
cat("Recall:", recall, "\n")
cat("F1 Score:", f1, "\n")
cat("Accuracy:", accuracy, "\n")
cat("Sensitivity:", sensitivity, "\n")
cat("Specificity:", specificity, "\n")
cat("AUC:", auc_value, "\n")
}, error = function(e) {
cat("Could not calculate metrics from confusion matrix:", e$message, "\n")
})
# Plot ROC curve
plot(roc_curve, main = "DRF: ROC Curve for Test Set", col = "blue", lwd = 2)
# Set the response variable
response <- "b_tgt"
# Set the predictor variables
predictors <- c( "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr",
"rfm2", "rfm3", "rfm10", "rfm11", "rfm4", "rfm6", "rfm8", "rfm9",
"rfm12", "demog_genf", "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" , "cat_input1" , "cat_input2")
# Set the training and validation sets
train_set <- bank_train
valid_set <- bank_test
# Set the test set
test_set <- bank_test_final
# Set the stopping criteria
stopping_criteria <- list(
stopping_rounds = 10,
stopping_metric = "AUC",
stopping_tolerance = 0.01
)
#Algorithms to include
algorithms <- c("DeepLearning")
# Set exploitation and exploration parameters
exploration_params <- list(
max_runtime_secs = 1200, # Run for maximum
max_models = 40, # Build maximum 50 models
seed = 1234 # Set a seed for reproducibility
)
exploration_ratio <- 0.3
sort_metric <- "AUC"
# Train a deep learning model with h2o
automl_params <- list(
x = predictors,
y = response,
training_frame = train_set,
validation_frame = valid_set,
leaderboard_frame = valid_set,
max_runtime_secs = exploration_params$max_runtime_secs,
max_models = exploration_params$max_models,
seed = exploration_params$seed,
stopping_rounds = stopping_criteria$stopping_rounds,
stopping_metric = stopping_criteria$stopping_metric,
stopping_tolerance = stopping_criteria$stopping_tolerance,
sort_metric = sort_metric
)
# Run AutoML
automl_B_tgt <- h2o.automl(
x = predictors,
y = response,
training_frame = train_set,
validation_frame = valid_set, # For validation during training
max_runtime_secs = exploration_params$max_runtime_secs,
max_models = exploration_params$max_models,
seed = exploration_params$seed,
stopping_rounds = stopping_criteria$stopping_rounds,
stopping_metric = stopping_criteria$stopping_metric,
stopping_tolerance = stopping_criteria$stopping_tolerance,
sort_metric = sort_metric,
include_algos = algorithms
)
# 2. Get the leaderboard (ranked on validation data)
leaderboard <- h2o.get_leaderboard(automl_B_tgt)
# Get comprehensive model information
# select the best model
best_model <- automl_B_tgt@leader
# Print summary of the model (includes parameters and performance metrics)
summary(best_model)
#compute the R squared value for the best model
r_squared <- h2o.r2(val_perf)
print(paste("Validation R-squared:", r_squared))
#get the confusion matrix
confusion_matrix <- h2o.confusionMatrix(best_model, valid_set)
print("Confusion Matrix:")
print(confusion_matrix)
#compute sensitivity and specificity
#select threshold where max AUC
threshold <- h2o.find_threshold_by_max_metric(val_perf, "AUC")
print(paste("Best Threshold for AUC:", threshold))
#compute the sensitivity and specificity
sensitivity <- h2o.sensitivity(confusion_matrix)
specificity <- h2o.specificity(confusion_matrix)
#show the column names for the bank_train data set
colnames(bank_train)
# Set the response variable
response_cnt_tgt <- "cnt_tgt"
# Set the predictor variables
#include all variables from the bank_train data set
predictors_cnt_tgt <- c( "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr", "rfm1" ,
"rfm2", "rfm3", "rfm10", "rfm4" , "rfm6", "rfm8", "rfm9", "rfm10",
"rfm12", "demog_genf", "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" , "cat_input1" , "cat_input2")
# Set the training and validation sets
train_set_cnt_tgt <- bank_train
valid_set_cnt_tgt <- bank_test
# Set the test set
test_set_cnt_tgt <- bank_test_final
# Set the stopping criteria
stopping_criteria_cnt_tgt <- list(
stopping_rounds = 10,
stopping_metric = "RMSE",
stopping_tolerance = 0.01
)
#Algorithms to include
algorithms_cnt_tgt <- c("GLM", "DRF", "DeepLearning")
# Set exploitation and exploration parameters
exploration_params_cnt_tgt <- list(
max_runtime_secs = 2500,
max_models = 50, # Build maximum 50 models
seed = 1234 # Set a seed for reproducibility
)
exploration_ratio_cnt_tgt <- 0.3
sort_metric_cnt_tgt <- "RMSE"
# Set the AutoML parameters
automl_params_cnt_tgt <- list(
x = predictors_cnt_tgt,
y = response_cnt_tgt,
training_frame = train_set_cnt_tgt,
validation_frame = valid_set_cnt_tgt,
leaderboard_frame = test_set_cnt_tgt,
max_runtime_secs = exploration_params_cnt_tgt$max_runtime_secs,
max_models = exploration_params_cnt_tgt$max_models,
seed = exploration_params_cnt_tgt$seed,
stopping_rounds = stopping_criteria_cnt_tgt$stopping_rounds,
stopping_metric = stopping_criteria_cnt_tgt$stopping_metric,
stopping_tolerance = stopping_criteria_cnt_tgt$stopping_tolerance,
sort_metric = sort_metric_cnt_tgt,
include_algos = algorithms_cnt_tgt
)
# Run AutoML
automl_cnt_tgt <- h2o.automl(
x = predictors_cnt_tgt,
y = response_cnt_tgt,
training_frame = train_set_cnt_tgt,
validation_frame = valid_set_cnt_tgt, # For validation during training
max_runtime_secs = exploration_params_cnt_tgt$max_runtime_secs,
max_models = exploration_params_cnt_tgt$max_models,
seed = exploration_params_cnt_tgt$seed,
stopping_rounds = stopping_criteria_cnt_tgt$stopping_rounds,
stopping_metric = stopping_criteria_cnt_tgt$stopping_metric,
stopping_tolerance = stopping_criteria_cnt_tgt$stopping_tolerance,
sort_metric = sort_metric_cnt_tgt,
include_algos = algorithms_cnt_tgt
)
# 2. Get the leaderboard (ranked on validation data)
leaderboard_cnt_tgt <- h2o.get_leaderboard(automl_cnt_tgt)
print("AutoML Leaderboard (ranked on validation data):")
print(leaderboard_cnt_tgt)
# 3. Access and evaluate the best model on validation data
best_model_cnt_tgt <- automl_cnt_tgt@leader
val_perf_cnt_tgt <- h2o.performance(best_model_cnt_tgt, valid_set_cnt_tgt)
# 4. Print detailed validation metrics
print("Best Model Performance on Validation Data:")
print(val_perf_cnt_tgt)
# Get comprehensive model information
model_details_cnt_tgt <- h2o.getModel(best_model_cnt_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_cnt_tgt)
# For a more structured view of just the parameters
params_table_cnt_tgt <- as.data.frame(model_details_cnt_tgt@parameters)
print(params_table_cnt_tgt)
#export the summary to a csv
write.csv(params_table_cnt_tgt, "model_details_cnt_tgt.csv", row.names = FALSE)
#compute the R squared value
r_squared_cnt_tgt <- h2o.r2(val_perf_cnt_tgt)
print(paste("Validation R-squared:", r_squared_cnt_tgt))
#show trees for the best model
h2o.varimp_plot(best_model_cnt_tgt, num_of_features = 20)
#Get predictions for the validation set
predictions_cnt_tgt <- h2o.predict(best_model_cnt_tgt, valid_set_cnt_tgt)
#create a data frame with the predictions
predictions_cnt_tgt_df <- as.data.frame(predictions_cnt_tgt)
# Add the actual values to the predictions data frame
predictions_cnt_tgt_df$actual <- as.data.frame(valid_set_cnt_tgt)$cnt_tgt
#create the residuals
predictions_cnt_tgt_df$residuals <- predictions_cnt_tgt_df$predict - predictions_cnt_tgt_df$actual
# Create a scatter plot of the predictions vs actual values
library(ggplot2)
ggplot(predictions_cnt_tgt_df, aes(x = actual, y = predict)) +
geom_point(alpha = 0.5) +
labs(title = "Predictions vs Actual Values", x = "Actual Values", y = "Predicted Values") +
theme_minimal() +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line
# Get all models from your AutoML run
all_models_cnt_tgt <- as.data.frame(automl_cnt_tgt@leaderboard)
# Filter for GLM models
glm_models_cnt_tgt <- all_models_cnt_tgt[grepl("GLM", all_models_cnt_tgt$model_id), ]
# Get the top GLM model ID
if (nrow(glm_models_cnt_tgt) > 0) {
top_glm_id_cnt_tgt <- as.character(glm_models_cnt_tgt$model_id[1])
best_glm_model_cnt_tgt <- h2o.getModel(top_glm_id_cnt_tgt)
# View the model summary
summary(best_glm_model_cnt_tgt)
} else {
print("No GLM models found in the AutoML run")
}
#Get comprehensive Model details
model_details_cnt_tgt_glm <- h2o.getModel(best_glm_model_cnt_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_cnt_tgt_glm)
# Get all models from your AutoML run
all_models_cnt_tgt <- as.data.frame(automl_cnt_tgt@leaderboard)
# Filter for DeepLearning models
dl_models_cnt_tgt <- all_models_cnt_tgt[grepl("DeepLearning", all_models_cnt_tgt$model_id), ]
# Get the top DeepLearning model ID
if (nrow(dl_models_cnt_tgt) > 0) {
top_dl_id_cnt_tgt <- as.character(dl_models_cnt_tgt$model_id[1])
best_dl_model_cnt_tgt <- h2o.getModel(top_dl_id_cnt_tgt)
# View the model summary
summary(best_dl_model_cnt_tgt)
} else {
print("No DeepLearning models found in the AutoML run")
}
#Get comprehensive Model details
model_details_cnt_tgt_dl <- h2o.getModel(best_dl_model_cnt_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_cnt_tgt_dl)
#compute the R squared value
#for the best deep learning model
val_perf_cnt_tgt <- h2o.performance(best_dl_model_cnt_tgt, valid_set_cnt_tgt)
#compute the R squared value
r_squared_cnt_tgt_dl <- h2o.r2(val_perf_cnt_tgt)
print(paste("Validation R-squared:", r_squared_cnt_tgt_dl))
#get the model parameters
# Method 1: Get model details as a list
model_details <- h2o.getModel(model_details_cnt_tgt_dl@model_id)
print(model_details)
# Method 2: Get hyperparameters
hyper_params <- h2o.getModel(model_details_cnt_tgt_dl@model_id)@allparameters
print(hyper_params)
#Get model parameters for the DRF model
model_details_cnt_tgt_drf <- h2o.getModel(best_model_cnt_tgt@model_id)
print(model_details_cnt_tgt_drf)
#Get Hyperparameters
hyper_params_drf <- h2o.getModel(model_details_cnt_tgt_drf@model_id)@allparameters
print(hyper_params_drf)
# Method 1: Manually exclude known categorical variables
categorical_vars <- c("demog_inc_homeval", "demog_ho", "demog_genf", "cat_input1", "cat_input2" , "rfm10" , "rfm4" , "rfm8" ,"rfm9") # Modify this list based on your knowledge
numeric_predictors <- setdiff(predictors_cnt_tgt, categorical_vars )
# Train a GAM model
gam_model <- h2o.gam(
x = predictors_cnt_tgt,
y = response_cnt_tgt,
training_frame = train_set_cnt_tgt,
validation_frame = valid_set_cnt_tgt,
gam_columns = numeric_predictors,
lambda_search = TRUE,
lambda = c(0.01, 0.1, 1, 10),
alpha = c(0.1, 0.5, 0.9),
max_iterations = 100,
max_runtime_secs = 9000 # Set maximum runtime to 1 hour (3600 seconds)
)
# Print the model summary
summary(gam_model)
#Make predictions on the test data using DRF_1_AutoML_2_20250517_164950
#predict cnt_tgt
#Get Model ID
model_id_cnt_tgt <- automl_cnt_tgt@leaderboard[1, "model_id"]
# Get the model
best_model_cnt_tgt <- h2o.getModel(model_id_cnt_tgt)
#Print model ID
print(paste("Model ID:", model_id_cnt_tgt))
#check dimensions of the test set
print(dim(bank_test_final))
# Make predictions on the test set
predictions_cnt_tgt_test <- h2o.predict(best_model_cnt_tgt, newdata = bank_test_final)
# Convert predictions to a data frame
predictions_cnt_tgt_test_df <- as.data.frame(predictions_cnt_tgt_test)
# Add the original test set data to the predictions
predictions_cnt_tgt_test_df <- cbind(as.data.frame(bank_test_final), predictions_cnt_tgt_test_df)
# Create a scatter plot of the predictions vs actual values
library(ggplot2)
ggplot(predictions_cnt_tgt_test_df, aes(x = cnt_tgt, y = predict)) +
geom_point(alpha = 0.5) +
labs(title = "CNT_tgt: Predictions vs Actual Values", x = "Actual Values", y = "Predicted Values") +
theme_minimal() +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line
#Compute the model metrics for the test data
# Get the model performance on the test set
test_perf_cnt_tgt <- h2o.performance(best_model_cnt_tgt, newdata = bank_test_final)
# Print the model performance metrics
print("Model Performance on Test Data:")
print(test_perf_cnt_tgt)
# Get model ID for best performing cnt_tgt model
model_id_cnt_tgt <- automl_cnt_tgt@leaderboard[1, "model_id"]
# Get the model
best_model_cnt_tgt <- h2o.getModel(model_id_cnt_tgt)
#Generate partial dependence plots for rfm1 and demog_homeval
pdp_rfm1 <- h2o.partialPlot(
object = best_model_cnt_tgt,
data = bank_test_final,
cols = c("rfm1")
)
pdp_demog_homeval <- h2o.partialPlot(
object = best_model_cnt_tgt,
data = bank_test_final,
cols = c("demog_homeval")
)
#show the mean int_tgt for the test final data set
mean_int_tgt <- h2o.mean(bank_test_final$int_tgt)
print(paste("Mean int_tgt for test set:", mean_int_tgt))
# Set the response variable
response_int_tgt <- "int_tgt"
# Set the predictor variables
#include all variables from the bank_train data set
predictors_int_tgt <- c( "demog_age", "demog_ho", "demog_homeval", "demog_inc2", "demog_pr", "rfm1" ,
"rfm2", "rfm3", "rfm10", "rfm4" , "rfm6", "rfm6_sq" , "rfm8", "rfm9", "rfm10",
"rfm12", "demog_inc2_sq" , "rfm2_inc2" , "demog_inc_homeval" )
# Set the training and validation sets
train_set_int_tgt <- bank_train
valid_set_int_tgt <- bank_test
# Set the test set
test_set_int_tgt <- bank_test_final
# Set the stopping criteria
stopping_criteria_int_tgt <- list(
stopping_rounds = 10,
stopping_metric = "RMSE",
stopping_tolerance = 0.01
)
#Algorithms to include
algorithms_int_tgt <- c("DeepLearning", "GLM", "DRF")
# Set exploitation and exploration parameters
exploration_params_int_tgt <- list(
max_runtime_secs = 2700, # Run for maximum 1 hour
max_models = 50, # Build maximum 50 models
seed = 1234 # Set a seed for reproducibility
)
exploration_ratio_int_tgt <- 0.3
sort_metric_int_tgt <- "RMSE"
# Set the AutoML parameters
automl_params_int_tgt <- list(
x = predictors_int_tgt,
y = response_int_tgt,
training_frame = train_set_int_tgt,
validation_frame = valid_set_int_tgt,
leaderboard_frame = valid_set_int_tgt,
max_runtime_secs = exploration_params_int_tgt$max_runtime_secs,
max_models = exploration_params_int_tgt$max_models,
seed = exploration_params_int_tgt$seed,
stopping_rounds = stopping_criteria_int_tgt$stopping_rounds,
stopping_metric = stopping_criteria_int_tgt$stopping_metric,
stopping_tolerance = stopping_criteria_int_tgt$stopping_tolerance,
sort_metric = sort_metric_int_tgt,
include_algos = algorithms_int_tgt
)
# Run AutoML
automl_int_tgt <- h2o.automl(
x = predictors_int_tgt,
y = response_int_tgt,
training_frame = train_set_int_tgt,
validation_frame = valid_set_int_tgt,
leaderboard_frame = valid_set_int_tgt, # For validation during training
max_runtime_secs = exploration_params_int_tgt$max_runtime_secs,
max_models = exploration_params_int_tgt$max_models,
seed = exploration_params_int_tgt$seed,
stopping_rounds = stopping_criteria_int_tgt$stopping_rounds,
stopping_metric = stopping_criteria_int_tgt$stopping_metric,
stopping_tolerance = stopping_criteria_int_tgt$stopping_tolerance,
sort_metric = sort_metric_int_tgt,
include_algos = algorithms_int_tgt
)
# 2. Get the leaderboard (ranked on validation data)
leaderboard_int_tgt <- h2o.get_leaderboard(automl_int_tgt)
print("AutoML Leaderboard (ranked on validation data):")
print(leaderboard_int_tgt)
#convert the leaderboard to a dataframe
leaderboard_int_tgt_df <- as.data.frame(leaderboard_int_tgt)
#export the leaderboard to a csv
write.csv(leaderboard_int_tgt_df, "leaderboard_int_tgt.csv", row.names = FALSE)
# Confirm the file was created
cat("File exported to:", file.path(getwd(), "leaderboard_int_tgt.csv"))
# Get all models from your AutoML run
all_models_int_tgt <- as.data.frame(automl_int_tgt@leaderboard)
# Filter for GLM models
glm_models_int_tgt <- all_models_int_tgt[grepl("GLM", all_models_int_tgt$model_id), ]
# Get the top GLM model ID
if (nrow(glm_models_int_tgt) > 0) {
top_glm_id_int_tgt <- as.character(glm_models_int_tgt$model_id[1])
best_glm_model_int_tgt <- h2o.getModel(top_glm_id_int_tgt)
# View the model summary
summary(best_glm_model_int_tgt)
} else {
print("No GLM models found in the AutoML run")
}
#Get comprehensive Model details
model_details_int_tgt_glm <- h2o.getModel(best_glm_model_int_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_int_tgt_glm)
#extract the model coefficients for the glm model
model_coefficients_int_tgt_glm <- h2o.coef(best_glm_model_int_tgt)
# Convert to a data frame
model_coefficients_int_tgt_glm_df <- as.data.frame(model_coefficients_int_tgt_glm)
# Add the variable names
model_coefficients_int_tgt_glm_df$variable <- rownames(model_coefficients_int_tgt_glm_df)
# Rename the columns
colnames(model_coefficients_int_tgt_glm_df) <- c("coefficient", "variable")
# Sort by absolute value of coefficients
model_coefficients_int_tgt_glm_df <- model_coefficients_int_tgt_glm_df[order(abs(model_coefficients_int_tgt_glm_df$coefficient), decreasing = TRUE), ]
# Print the coefficients
print(model_coefficients_int_tgt_glm_df)
#export the coefficients to a csv
write.csv(model_coefficients_int_tgt_glm_df, "model_coefficients_int_tgt_glm.csv", row.names = FALSE)
#filer for DeepLearning models
dl_models_int_tgt <- all_models_int_tgt[grepl("DeepLearning", all_models_int_tgt$model_id), ]
# Get the top DeepLearning model ID
#ranked on validation data
# Get the top DeepLearning model ID
if (nrow(dl_models_int_tgt) > 0) {
top_dl_id_int_tgt <- as.character(dl_models_int_tgt$model_id[1])
best_dl_model_int_tgt <- h2o.getModel(top_dl_id_int_tgt)
# View the model summary
summary(best_dl_model_int_tgt)
} else {
print("No DeepLearning models found in the AutoML run")
}
#Get comprehensive Model details
model_details_int_tgt_dl <- h2o.getModel(best_dl_model_int_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_int_tgt_dl)
#compute the R squared value
#for the best performing deep learning model
val_perf_int_tgt <- h2o.performance(best_dl_model_int_tgt, valid_set_int_tgt)
r_squared_int_tgt_dl <- h2o.r2(val_perf_int_tgt)
print(paste("Validation R-squared (Deep Learning):", r_squared_int_tgt_dl))
#filter for DRF models
drf_models_int_tgt <- all_models_int_tgt[grepl("DRF", all_models_int_tgt$model_id), ]
# Get the top DRF model ID
if (nrow(drf_models_int_tgt) > 0) {
top_drf_id_int_tgt <- as.character(drf_models_int_tgt$model_id[1])
best_drf_model_int_tgt <- h2o.getModel(top_drf_id_int_tgt)
# View the model summary
summary(best_drf_model_int_tgt)
} else {
print("No DRF models found in the AutoML run")
}
#Get comprehensive Model details
model_details_int_tgt_drf <- h2o.getModel(best_drf_model_int_tgt@model_id)
# Print summary of the model (includes parameters and performance metrics)
summary(model_details_int_tgt_drf)
#compute the R squared value
#for int_tgt
val_perf_int_tgt_drf <- h2o.performance(best_drf_model_int_tgt, valid_set_int_tgt)
#compute the R squared value
r_squared_int_tgt_drf <- h2o.r2(val_perf_int_tgt_drf)
print(paste("Validation R-squared (DRF):", r_squared_int_tgt_drf))
#plot variable importance for the best DRF model
h2o.varimp_plot(best_drf_model_int_tgt, num_of_features = 20)
# Get the model ID for best performing DRF
model_id_int_tgt_drf <- best_drf_model_int_tgt@model_id
# Get your DRF model
drf_model_int_tgt <- h2o.getModel(model_id_int_tgt_drf)
#Generate partial dependence plots for rfm2 and rfm9
pdp_rfm2_int_tgt <- h2o.partialPlot(
object = drf_model_int_tgt,
data = train_set_int_tgt,
cols = c("rfm2")
)
pdp_rfm9_int_tgt <- h2o.partialPlot(
object = drf_model_int_tgt,
data = train_set_int_tgt,
cols = c("rfm9")
)
# Model IDs i want to use for the ensemble
model_ids <- c(
"DRF_1_AutoML_6_20250518_35511",
"XRT_1_AutoML_6_20250518_35511",
"DeepLearning_1_AutoML_6_20250518_35511",
"GLM_1_AutoML_6_20250518_35511",
"DeepLearning_grid_1_AutoML_6_20250518_35511_model_1")
# Create an ensemble using these specific models
custom_ensemble <- h2o.stackedEnsemble(
x = predictors_int_tgt,
y = response_int_tgt,
training_frame = train_set_int_tgt,
blending_frame = valid_set_int_tgt, # Use validation data as blending frame instead
base_models = model_ids,
metalearner_algorithm = "glm" # Default metalearner
)
# Evaluate ensemble performance on validation data
ensemble_perf <- h2o.performance(custom_ensemble, valid_set_int_tgt)
print(h2o.rmse(ensemble_perf))
print(h2o.r2(ensemble_perf))
print(h2o.mae(ensemble_perf))
print(h2o.mse(ensemble_perf))
print(h2o.mean_residual_deviance(ensemble_perf))
# Compare with individual model performances
for (model_id in model_ids) {
model <- h2o.getModel(model_id)
model_perf <- h2o.performance(model, valid_set_int_tgt)
cat("\nPerformance for model:", model_id, "\n")
cat("RMSE:", h2o.rmse(model_perf), "\n")
cat("R-squared:", h2o.r2(model_perf), "\n")
cat("MAE:", h2o.mae(model_perf), "\n")
cat("MSE:", h2o.mse(model_perf), "\n")
}
# Make predictions with the ensemble model
ensemble_preds <- h2o.predict(custom_ensemble, test_set_int_tgt)
# Convert predictions to a data frame
ensemble_preds_df <- as.data.frame(ensemble_preds)
# Add the original test set data to the predictions
ensemble_preds_df <- cbind(as.data.frame(bank_test_final), ensemble_preds_df)
#exponentiate the predicteds
ensemble_preds_df$predict <- exp(ensemble_preds_df$predict)
#exponentiate the actual values
ensemble_preds_df$int_tgt <- exp(ensemble_preds_df$int_tgt)
# Create a scatter plot of the predictions vs actual values
library(ggplot2)
ggplot(ensemble_preds_df, aes(x = int_tgt, y = predict)) +
geom_point(alpha = 0.5) +
labs(title = "Ensemble Model: Predictions vs Actual Values", x = "Actual Values", y = "Predicted Values") +
theme_minimal() +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line
#compute the RMSE
ensemble_perf_test <- h2o.performance(custom_ensemble, newdata = bank_test_final)
# Print the model performance metrics
print("Ensemble Model Performance on Test Data:")
print(ensemble_perf_test)
#compute the RMSE
rmse_ensemble <- h2o.rmse(ensemble_perf_test)
print(paste("Ensemble Model RMSE on Test Data:", rmse_ensemble))
#compute the R squared value
r_squared_ensemble <- h2o.r2(ensemble_perf_test)
print(paste("Ensemble Model R-squared on Test Data:", r_squared_ensemble))
#copmute the MAE
mae_ensemble <- h2o.mae(ensemble_perf_test)
print(paste("Ensemble Model MAE on Test Data:", mae_ensemble))
#compute the MSE
mse_ensemble <- h2o.mse(ensemble_perf_test)
print(paste("Ensemble Model MSE on Test Data:", mse_ensemble))
#Get model id for ensemble
model_id_ensemble <- custom_ensemble@model_id
#Print the model id
print(paste("Ensemble Model ID:", model_id_ensemble))
# Access the metalearner model
ensemble_model <- h2o.getModel("StackedEnsemble_model_R_1747498932373_10579")
metalearner <- h2o.getModel(ensemble_model@model$metalearner$name)
# If metalearner is GLM, you can get coefficients
if(metalearner@algorithm == "glm") {
# This shows how each BASE MODEL is weighted
coefs <- h2o.coef(metalearner)
print(coefs)
}
# Convert to a data frame
coefs_df <- as.data.frame(coefs)
# Add the variable names
coefs_df$variable <- rownames(coefs_df)
# Rename the columns
colnames(coefs_df) <- c("coefficient", "variable")
# Sort by absolute value of coefficients
coefs_df <- coefs_df[order(abs(coefs_df$coefficient), decreasing = TRUE), ]
# Print the coefficients
print(coefs_df)
#export the coefficients to a csv
write.csv(coefs_df, "model_coefficients_ensemble.csv", row.names = FALSE)
# Get the variable importance for the ensemble model
varimp_ensemble <- h2o.varimp(custom_ensemble)
# Convert to a data frame
varimp_ensemble_df <- as.data.frame(varimp_ensemble)
# Add the variable names
varimp_ensemble_df$variable <- rownames(varimp_ensemble_df)
# Rename the columns
colnames(varimp_ensemble_df) <- c("relative_importance", "variable")
# Sort by relative importance
varimp_ensemble_df <- varimp_ensemble_df[order(varimp_ensemble_df$relative_importance, decreasing = TRUE), ]
# Print the variable importance
print(varimp_ensemble_df)
#plot the variable importance
library(ggplot2)
ggplot(varimp_ensemble_df, aes(x = reorder(variable, -relative_importance), y = relative_importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Variable Importance: Ensemble Model", x = "Variables", y = "Relative Importance") +
theme_minimal()
# Method 1: Manually exclude known categorical variables
categorical_vars <- c("demog_inc_homeval", "demog_ho", "demog_genf", "rfm10" , "rfm4" , "rfm8" ,"rfm9" ) # Modify this list based on your knowledge
numeric_predictors <- setdiff(predictors_cnt_tgt, categorical_vars )
x <- c(numeric_predictors, categorical_vars)
y <- response_int_tgt
# Define a very simple hyperparameter grid
# Just tuning lambda (regularization strength)
hyper_params <- list(
lambda = c(0, 0.001, 0.01, 0.1),
# Alpha mixing parameter - controls L1 vs L2 regularization
alpha = c(0, 0.3, 0.7, 1.0),
# Scale parameter - controls fitting (affects convergence)
scale = c(0.5, 1.0, 1.5)
)
# Define search criteria for a more extensive search
search_criteria <- list(
strategy = "RandomDiscrete", # Use random search for efficiency with larger grid
max_models = 100, # Maximum number of models to try
max_runtime_secs = 1800, # 1 hour time limit
seed = 1234 # For reproducibility
)
# Run a simple GAM grid search with search_criteria
gam_grid <- h2o.grid(
algorithm = "gam",
x = x,
y = y,
training_frame = train_set_int_tgt,
validation_frame = valid_set_int_tgt,
hyper_params = hyper_params,
search_criteria = search_criteria,
gam_columns = list(numeric_predictors[1:3]), # Specify the columns for GAM
lambda_search = TRUE, # Enable lambda search
)
# Print grid results
print(gam_grid)
# Get the best model by validation MSE
best_model <- h2o.getModel(gam_grid@model_ids[[1]])
# Print model details
print(summary(best_model))
# Model performance on validation data
perf <- h2o.performance(best_model, valid_set_int_tgt)
print(perf)
# Sort the grid by your preferred metric (lower MSE is better)
sorted_grid <- h2o.getGrid(gam_grid@grid_id, sort_by = "RMSE", decreasing = FALSE)
# Get the best model from the grid
best_model <- h2o.getModel(sorted_grid@model_ids[[1]])
# Print the best model's details
print(summary(best_model))
# Get the variable importance
varimp <- h2o.varimp(best_model)
# Convert to a data frame
varimp_df <- as.data.frame(varimp)
#plot the variable importance
library(ggplot2)
ggplot(varimp_df, aes(x = reorder(variable, -relative_importance), y = relative_importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Variable Importance: Best Int_tgt Model (GAM)", x = "Variables", y = "Relative Importance") +
theme_minimal()
#plot rfm6 vs int_tgt
ggplot(bank_log, aes(x = rfm6, y = int_tgt)) +
geom_point(alpha = 0.5) +
labs(title = "Scatter Plot of rfm6 vs int_tgt", x = "rfm6", y = "int_tgt") +
theme_minimal() +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line
#plot rfm6 vs cnt_tgt
ggplot(bank_log, aes(x = rfm6, y = cnt_tgt)) +
geom_point(alpha = 0.5) +
labs(title = "Scatter Plot of rfm6 vs cnt_tgt", x = "rfm6", y = "cnt_tgt") +
theme_minimal() +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") # Add a reference line