# Set the CRAN mirror and load the packages used in this analysis
options(repos = c(CRAN = "https://cloud.r-project.org"))
library(kernlab)
library(ggplot2)
library(dplyr)
library(tidyr)
library(caret)
library(e1071)
library(ggpubr)
#load the glmnet package
library(glmnet)
#load the MASS package
library(MASS)
#install the lars package (provides the diabetes data set)
install.packages("lars")
## package 'lars' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\andre\AppData\Local\Temp\RtmpkBu0N7\downloaded_packages
library(lars)

#load the diabetes data set
data(diabetes)

data.all <- data.frame(cbind(diabetes$x, y=diabetes$y))

Data Quality Check

#dimensions of the data set
dim(data.all)
## [1] 442  11
cat("The dimensions of the DataSet are: ", dim(data.all)[1], " rows and ", dim(data.all)[2], " columns.\n")
## The dimensions of the DataSet are:  442  rows and  11  columns.
#Generate summary statistics of the data using the describe function
# from the psych package
#install.packages("psych")
library(psych)
#summary statistics
describe(data.all)
##     vars   n   mean    sd median trimmed   mad   min    max  range  skew
## age    1 442   0.00  0.05   0.01    0.00  0.05 -0.11   0.11   0.22 -0.23
## sex    2 442   0.00  0.05  -0.04    0.00  0.00 -0.04   0.05   0.10  0.13
## bmi    3 442   0.00  0.05  -0.01    0.00  0.05 -0.09   0.17   0.26  0.59
## map    4 442   0.00  0.05  -0.01    0.00  0.05 -0.11   0.13   0.24  0.29
## tc     5 442   0.00  0.05   0.00    0.00  0.05 -0.13   0.15   0.28  0.38
## ldl    6 442   0.00  0.05   0.00    0.00  0.04 -0.12   0.20   0.31  0.43
## hdl    7 442   0.00  0.05  -0.01    0.00  0.05 -0.10   0.18   0.28  0.79
## tch    8 442   0.00  0.05   0.00    0.00  0.05 -0.08   0.19   0.26  0.73
## ltg    9 442   0.00  0.05   0.00    0.00  0.05 -0.13   0.13   0.26  0.29
## glu   10 442   0.00  0.05   0.00    0.00  0.04 -0.14   0.14   0.27  0.21
## y     11 442 152.13 77.09 140.50  147.54 88.21 25.00 346.00 321.00  0.44
##     kurtosis   se
## age    -0.69 0.00
## sex    -1.99 0.00
## bmi     0.07 0.00
## map    -0.55 0.00
## tc      0.20 0.00
## ldl     0.56 0.00
## hdl     0.94 0.00
## tch     0.41 0.00
## ltg    -0.16 0.00
## glu     0.21 0.00
## y      -0.90 3.67
#Check for missing values
missing_values <- colSums(is.na(data.all))
missing_values <- missing_values[missing_values > 0]
if (length(missing_values) > 0) {
  cat("Missing values found in the following columns:\n")
  print(missing_values)
} else {
  cat("No missing values found in the dataset.\n")
}
## No missing values found in the dataset.

Quartiles

#compute the 25th, 50th, and 75th percentiles for all variables
quantiles <- apply(data.all, 2, function(x) quantile(x, probs = c(0.25, 0.5, 0.75)))
#round the quantiles to 3 decimal places
quantiles <- round(quantiles, 3)
#transpose so each variable is a row
quantiles <- as.data.frame(t(quantiles))
#label the percentile columns
colnames(quantiles) <- c("25th Percentile", "50th Percentile", "75th Percentile")
#display the quantiles
quantiles
##     25th Percentile 50th Percentile 75th Percentile
## age          -0.037           0.005           0.038
## sex          -0.045          -0.045           0.051
## bmi          -0.034          -0.007           0.031
## map          -0.037          -0.006           0.036
## tc           -0.034          -0.004           0.028
## ldl          -0.030          -0.004           0.030
## hdl          -0.035          -0.007           0.029
## tch          -0.039          -0.003           0.034
## ltg          -0.033          -0.002           0.032
## glu          -0.033          -0.001           0.028
## y            87.000         140.500         211.500
library(e1071)
library(dplyr)
library(tidyr)

#calculate skewness for numeric variables
skewness_values <- sapply(data.all, function(x) if(is.numeric(x)) skewness(x, na.rm = TRUE) else NA)
#calculate kurtosis for numeric variables
kurtosis_values <- sapply(data.all, function(x) if(is.numeric(x)) kurtosis(x, na.rm = TRUE) else NA)
#combine skewness and kurtosis into a data frame
skew_kurt <- data.frame(Skewness = skewness_values, Kurtosis = kurtosis_values)
#remove NA values
skew_kurt <- skew_kurt[complete.cases(skew_kurt), ]
#round the skewness and kurtosis values to 3 decimal places
skew_kurt <- round(skew_kurt, 3)
#display the skewness and kurtosis values
skew_kurt
##     Skewness Kurtosis
## age   -0.230   -0.688
## sex    0.127   -1.988
## bmi    0.594    0.067
## map    0.289   -0.551
## tc     0.376    0.202
## ldl    0.434    0.565
## hdl    0.794    0.939
## tch    0.730    0.410
## ltg    0.290   -0.159
## glu    0.207    0.206
## y      0.438   -0.896

Distributions

#plot the distribution of the target variable y
ggplot(data.all, aes(x = y)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of Target Variable (y)", x = "y", y = "Density") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#grid these distributions
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)
library(ggplot2)
# Create individual plots
p1 <- ggplot(data.all, aes(x = tc)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of TC", x = "TC", y = "Density") +
  theme_minimal()
p2 <- ggplot(data.all, aes(x = ldl)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of LDL", x = "LDL", y = "Density") +
  theme_minimal()
p3 <- ggplot(data.all, aes(x = hdl)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of HDL", x = "HDL", y = "Density") +
  theme_minimal()
p4 <- ggplot(data.all, aes(x = tch)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of TCH", x = "TCH", y = "Density") +
  theme_minimal()
p5 <- ggplot(data.all, aes(x = ltg)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of LTG", x = "LTG", y = "Density") +
  theme_minimal()
p6 <- ggplot(data.all, aes(x = glu)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of GLU", x = "GLU", y = "Density") +
  theme_minimal()
# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

#create individual plots for the age, sex, bmi, and map variables (continuing at p7)
p7 <- ggplot(data.all, aes(x = age)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of Age", x = "age", y = "Density") +
  theme_minimal()

p8 <- ggplot(data.all, aes(x = sex)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of Sex", x = "sex", y = "Density") +
  theme_minimal()

p9 <- ggplot(data.all, aes(x = bmi)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of BMI", x = "bmi", y = "Density") +
  theme_minimal()

p10 <- ggplot(data.all, aes(x = map)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  labs(title = "Distribution of MAP", x = "map", y = "Density") +
  theme_minimal()

#arrange the plots in a grid
grid.arrange(p7, p8, p9, p10, ncol = 2)

Correlation Matrix

#Create a heatmap of the correlations using corrplot
#install.packages("corrplot")
library(corrplot)
## corrplot 0.95 loaded
# Compute the correlation matrix
correlation_matrix <- cor(data.all, use = "pairwise.complete.obs")
# Create a heatmap of the correlation matrix
corrplot(correlation_matrix, method = "color", tl.col = "black", tl.srt = 45, addCoef.col = "black", number.cex = 0.7, title = "Correlation Heatmap", mar = c(0, 0, 1, 0))

Box Plots

#Generate Boxplots for each variable
#install.packages("ggpubr")
library(ggpubr)
# Create boxplots for age and overlay the observations
bp_age <- ggplot(data.all, aes(x = "", y = age)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of Age", x = "", y = "Age") +
  theme_minimal()



bp_bmi <- ggplot(data.all, aes(x = "", y = bmi)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of BMI", x = "", y = "BMI") +
  theme_minimal()

bp_map <- ggplot(data.all, aes(x = "", y = map)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of MAP", x = "", y = "MAP") +
  theme_minimal()

bp_tc <- ggplot(data.all, aes(x = "", y = tc)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of TC", x = "", y = "TC") +
  theme_minimal()

bp_ldl <- ggplot(data.all, aes(x = "", y = ldl)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of LDL", x = "", y = "LDL") +
  theme_minimal()

bp_hdl <- ggplot(data.all, aes(x = "", y = hdl)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of HDL", x = "", y = "HDL") +
  theme_minimal()
bp_tch <- ggplot(data.all, aes(x = "", y = tch)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of TCH", x = "", y = "TCH") +
  theme_minimal()
bp_ltg <- ggplot(data.all, aes(x = "", y = ltg)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of LTG", x = "", y = "LTG") +
  theme_minimal()
bp_glu <- ggplot(data.all, aes(x = "", y = glu)) +
  geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
  geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
  labs(title = "Boxplot of GLU", x = "", y = "GLU") +
  theme_minimal()


# Arrange the boxplots in a grid
library(gridExtra)
library(grid)
library(ggplot2)

#Arrange the boxplots in a 3x3 grid
grid.arrange(bp_age, bp_bmi, bp_map, bp_tc, bp_ldl, bp_hdl, ncol = 3)

# Arrange the boxplots in a 3x3 grid
grid.arrange(bp_tch, bp_ltg, bp_glu, ncol = 3)

Outliers

#count the number of observations more than twice the 75th percentile
#for each variable
#load the dplyr package
library(dplyr)

# Create a function to count the number of observations more than twice the 75th percentile
count_above_75th <- function(x) {
  threshold <- quantile(x, 0.75, na.rm = TRUE)
  sum(x > (2 * threshold), na.rm = TRUE)
}


#apply the function to each variable
outliers_count <- sapply(data.all, count_above_75th)

# Create a data frame to display the results
outliers_df <- data.frame(Variable = names(outliers_count), Outliers_Count = outliers_count)
# Display the results
outliers_df <- outliers_df %>%
  arrange(desc(Outliers_Count)) %>%
  mutate(Outliers_Count = as.integer(Outliers_Count))

#replace zero counts with NA
outliers_df <- outliers_df %>%
  mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))

print(outliers_df)
##     Variable Outliers_Count
## tch      tch             58
## glu      glu             55
## tc        tc             52
## hdl      hdl             52
## bmi      bmi             45
## ldl      ldl             43
## ltg      ltg             42
## map      map             31
## age      age             13
## sex      sex             NA
## y          y             NA
#count the number of observations less than twice the 25th percentile
#for each variable
# Create a function to count the number of observations less than twice the 25th percentile
count_below_25th <- function(x) {
  threshold <- quantile(x, 0.25, na.rm = TRUE)
  sum(x < (2 * threshold), na.rm = TRUE)
}

#apply the function to each variable
outliers_count_below <- sapply(data.all, count_below_25th)
# Create a data frame to display the results
outliers_df_below <- data.frame(Variable = names(outliers_count_below), Outliers_Count = outliers_count_below)
# Display the results
outliers_df_below <- outliers_df_below %>%
  arrange(desc(Outliers_Count)) %>%
  mutate(Outliers_Count = as.integer(Outliers_Count))
#replace zero counts with NA
outliers_df_below <- outliers_df_below %>%
  mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))
print(outliers_df_below)
##     Variable Outliers_Count
## y          y            275
## ldl      ldl             39
## glu      glu             35
## ltg      ltg             31
## age      age             30
## tc        tc             28
## map      map             23
## bmi      bmi             21
## hdl      hdl             15
## sex      sex             NA
## tch      tch             NA
#apply a Yeo-Johnson power transformation to age, bmi, map, tc, ldl, hdl, tch, ltg, and glu
#install.packages("robustbase")
library(robustbase)
# Load required library
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
# Perform Yeo-Johnson transformation
vars_to_transform <- setdiff(names(data.all), c("sex", "y"))
yj <- powerTransform(data.all[vars_to_transform], family = "yjPower")  # Estimate lambda
data.all.scaled <- data.all
data.all.scaled[vars_to_transform] <- yjPower(data.all[vars_to_transform], coef(yj))
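# For reference, a minimal sketch (not car's implementation) of the Yeo-Johnson
# mapping that yjPower() applies for a single, fixed lambda:
yj_sketch <- function(x, lambda) {
  pos <- if (abs(lambda) > 1e-8) ((x + 1)^lambda - 1) / lambda else log(x + 1)
  neg <- if (abs(lambda - 2) > 1e-8) -(((-x + 1)^(2 - lambda) - 1) / (2 - lambda)) else -log(-x + 1)
  ifelse(x >= 0, pos, neg)
}
# e.g. (assuming the estimated lambdas are named after the columns):
# yj_sketch(data.all$bmi, coef(yj)[["bmi"]])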

#check the scaled data
head(data.all.scaled)
##            age         sex         bmi          map           tc         ldl
## 1  0.039431878  0.05068012  0.05221748  0.021247857 -0.044206270 -0.03481480
## 2 -0.001878745 -0.04464164 -0.05962698 -0.027270752 -0.008448088 -0.01916152
## 3  0.092193787  0.05068012  0.03935396 -0.005713851 -0.045581142 -0.03418871
## 4 -0.082295234 -0.04464164 -0.01198357 -0.038494846  0.012191892  0.02499368
## 5  0.005409915 -0.04464164 -0.04036247  0.021247857  0.003934990  0.01559734
## 6 -0.085387666 -0.04464164 -0.04570613 -0.019954320 -0.068949058 -0.07925735
##            hdl          tch          ltg          glu   y
## 1 -0.043396525 -0.002615009  0.022352654 -0.017874473 151
## 2  0.074424140 -0.045162796 -0.047668264 -0.098508964  75
## 3 -0.032353522 -0.002615009  0.002911379 -0.026424040 141
## 4 -0.036034584  0.030670249  0.025899440 -0.009426102 206
## 5  0.008142237 -0.002615009 -0.026815883 -0.048243186 135
## 6  0.041280735 -0.099198741 -0.032898611 -0.103234502  97
#check the dimensions of the scaled data
cat("The dimensions of the scaled DataSet are: ", dim(data.all.scaled)[1], " rows and ", dim(data.all.scaled)[2], " columns.\n")
## The dimensions of the scaled DataSet are:  442  rows and  11  columns.
#check the summary statistics of the scaled data
summary(data.all.scaled)
##       age                 sex                bmi                 map           
##  Min.   :-0.097570   Min.   :-0.04464   Min.   :-0.116944   Min.   :-0.130421  
##  1st Qu.:-0.036054   1st Qu.:-0.04464   1st Qu.:-0.037738   1st Qu.:-0.038495  
##  Median : 0.005410   Median :-0.04464   Median :-0.007436   Median :-0.005714  
##  Mean   : 0.002037   Mean   : 0.00000   Mean   :-0.006053   Mean   :-0.002939  
##  3rd Qu.: 0.039432   3rd Qu.: 0.05068   3rd Qu.: 0.028658   3rd Qu.: 0.034012  
##  Max.   : 0.122426   Max.   : 0.05068   Max.   : 0.111448   Max.   : 0.111950  
##        tc                  ldl                  hdl            
##  Min.   :-1.266e-01   Min.   :-1.155e-01   Min.   :-1.023e-01  
##  1st Qu.:-3.424e-02   1st Qu.:-3.035e-02   1st Qu.:-3.511e-02  
##  Median :-4.321e-03   Median :-3.819e-03   Median :-6.584e-03  
##  Mean   : 1.972e-05   Mean   : 1.098e-05   Mean   : 5.130e-06  
##  3rd Qu.: 2.837e-02   3rd Qu.: 2.985e-02   3rd Qu.: 2.931e-02  
##  Max.   : 1.541e-01   Max.   : 1.990e-01   Max.   : 1.813e-01  
##       tch                 ltg                 glu                  y        
##  Min.   :-0.099199   Min.   :-0.067887   Min.   :-0.151938   Min.   : 25.0  
##  1st Qu.:-0.045163   1st Qu.:-0.027685   1st Qu.:-0.033991   1st Qu.: 87.0  
##  Median :-0.002615   Median :-0.001925   Median :-0.001079   Median :140.5  
##  Mean   :-0.007068   Mean   : 0.014251   Mean   :-0.001608   Mean   :152.1  
##  3rd Qu.: 0.030670   3rd Qu.: 0.039220   3rd Qu.: 0.027360   3rd Qu.:211.5  
##  Max.   : 0.108560   Max.   : 0.303423   Max.   : 0.123502   Max.   :346.0
#check the skewness and kurtosis of the scaled data
skewness_values_scaled <- sapply(data.all.scaled, function(x) if(is.numeric(x)) skewness(x, na.rm = TRUE) else NA)
kurtosis_values_scaled <- sapply(data.all.scaled, function(x) if(is.numeric(x)) kurtosis(x, na.rm = TRUE) else NA)
#combine skewness and kurtosis into a data frame
skew_kurt_scaled <- data.frame(Skewness = skewness_values_scaled, Kurtosis = kurtosis_values_scaled)
#remove NA values
skew_kurt_scaled <- skew_kurt_scaled[complete.cases(skew_kurt_scaled), ]
#display the skewness and kurtosis values
skew_kurt_scaled <- round(skew_kurt_scaled, 3)
#display the skewness and kurtosis values
skew_kurt_scaled
##     Skewness Kurtosis
## age   -0.067   -0.709
## sex    0.127   -1.988
## bmi   -0.006   -0.584
## map    0.033   -0.586
## tc     0.378    0.205
## ldl    0.435    0.568
## hdl    0.795    0.941
## tch   -0.022   -0.510
## ltg    1.940    5.160
## glu   -0.011    0.154
## y      0.438   -0.896
#count the number of observations with an absolute z-score greater than 3
#for each variable
# Create a function to count the number of observations with absolute z-score > 3
count_above_3 <- function(x) {
  z_scores <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
  sum(abs(z_scores) > 3, na.rm = TRUE)
}

#apply the function to the scaled data
outliers_count_3 <- sapply(data.all.scaled, count_above_3)
# Create a data frame to display the results
outliers_df_3 <- data.frame(Variable = names(outliers_count_3), Outliers_Count = outliers_count_3)
# Display the results
outliers_df_3 <- outliers_df_3 %>%
  arrange(desc(Outliers_Count)) %>%
  mutate(Outliers_Count = as.integer(Outliers_Count))
#replace zero counts with NA
outliers_df_3 <- outliers_df_3 %>%
  mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))
print(outliers_df_3)
##     Variable Outliers_Count
## ltg      ltg              9
## hdl      hdl              5
## tc        tc              2
## ldl      ldl              2
## glu      glu              1
## age      age             NA
## sex      sex             NA
## bmi      bmi             NA
## map      map             NA
## tch      tch             NA
## y          y             NA
#replace values with an absolute z-score greater than 3 with the 90th percentile value
#for each variable
# Create a function to replace outliers with the 90th percentile value
replace_outliers <- function(x) {
  z_scores <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
  threshold <- quantile(x, 0.90, na.rm = TRUE)
  x[abs(z_scores) > 3] <- threshold
  return(x)
}

#apply the function to the scaled data
data.all.scaled <- data.all.scaled %>%
  mutate(across(c(age, bmi, map, tc, ldl, hdl, tch, ltg, glu), 
                ~ replace_outliers(.)))
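# Note: every flagged value in a column is replaced by that column's 90th percentile;
# because this shrinks the column's standard deviation, re-checking below can still
# flag a small number of points (ltg and hdl).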

#count the number of observations with an absolute z-score greater than 3
#for each variable
# Re-create the function that counts observations with absolute z-score > 3
count_above_3 <- function(x) {
  z_scores <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
  sum(abs(z_scores) > 3, na.rm = TRUE)
}

#apply this to the scaled data
outliers_count_3 <- sapply(data.all.scaled, count_above_3)
# Create a data frame to display the results
outliers_df_3 <- data.frame(Variable = names(outliers_count_3), Outliers_Count = outliers_count_3)
# Display the results
outliers_df_3 <- outliers_df_3 %>%
  arrange(desc(Outliers_Count)) %>%
  mutate(Outliers_Count = as.integer(Outliers_Count))
#replace zero counts with NA
outliers_df_3 <- outliers_df_3 %>%
  mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))
print(outliers_df_3)
##     Variable Outliers_Count
## ltg      ltg              4
## hdl      hdl              2
## age      age             NA
## sex      sex             NA
## bmi      bmi             NA
## map      map             NA
## tc        tc             NA
## ldl      ldl             NA
## tch      tch             NA
## glu      glu             NA
## y          y             NA
#Partition the scaled data into training and test sets
set.seed(123) # Set seed for reproducibility
train_index <- createDataPartition(data.all.scaled$y, p = 0.7, list = FALSE)
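# Note: for a numeric outcome, createDataPartition() samples within quantile-based
# groups of y, so the training and test sets have similar distributions of y.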
train_data <- as.data.frame(data.all.scaled[train_index, ])
test_data <- as.data.frame(data.all.scaled[-train_index, ])
# Check the dimensions of the training and test sets
cat("Training set dimensions are: ", dim(train_data)[1], " rows and ", dim(train_data)[2], " columns.\n")
## Training set dimensions are:  311  rows and  11  columns.
cat("Test set dimensions : ", dim(test_data)[1], " rows and ", dim(test_data)[2], " columns.\n")
## Test set dimensions :  131  rows and  11  columns.
dim(test_data)
## [1] 131  11

Linear Model

library(caret)
#fit a linear model to predict y using all other variables
#apply 10-fold cross-validation
set.seed(123) # Set seed for reproducibility
cv_LM <- train(y ~ ., data = train_data, method = "lm", trControl = trainControl(method = "cv", number = 10))
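# Note: the 10-fold CV estimates out-of-sample performance; cv_LM$finalModel is the
# linear model refit on the full training set, which is what stargazer summarizes below.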
#check the summary of the linear model using stargazer
#install.packages("stargazer")
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(cv_LM$finalModel, type = "text", title = "Raw Data Regression Summary", digits = 3, single.row = TRUE)
## 
## Raw Data Regression Summary
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                              .outcome          
## -----------------------------------------------
## age                       -7.809 (74.624)      
## sex                     -178.116** (75.318)    
## bmi                     532.159*** (85.155)    
## map                     306.876*** (81.933)    
## tc                      -319.161 (359.229)     
## ldl                      136.715 (315.395)     
## hdl                      -68.716 (220.546)     
## tch                      115.339 (229.445)     
## ltg                    521.633*** (145.345)    
## glu                       92.887 (79.166)      
## Constant                152.877*** (4.028)     
## -----------------------------------------------
## Observations                    311            
## R2                             0.491           
## Adjusted R2                    0.474           
## Residual Std. Error      55.401 (df = 300)     
## F Statistic          28.967*** (df = 10; 300)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
library(stargazer)
#fit a null model
null_model <- lm(y ~ 1, data = train_data)
#check the summary of the null model
stargazer(null_model, type = "text", title = "Null Model Summary", digits = 3, single.row = TRUE)
## 
## Null Model Summary
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  y             
## -----------------------------------------------
## Constant                151.887*** (4.333)     
## -----------------------------------------------
## Observations                    311            
## R2                             0.000           
## Adjusted R2                    0.000           
## Residual Std. Error      76.409 (df = 310)     
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
#predict the y values using the linear model
predictions_LM <- predict(cv_LM, newdata = test_data)

#compute the mean prediction error
mean_prediction_error_LM <- mean(abs(predictions_LM - test_data$y))
#compute the mean squared error
mean_squared_error_LM <- mean((predictions_LM - test_data$y)^2)
#compute the RSS
residual_sum_of_squares_LM <- sum((predictions_LM - test_data$y)^2)
#compute the R squared value
r_squared_LM <- 1 - (residual_sum_of_squares_LM / sum((test_data$y - mean(test_data$y))^2))
#compute the BIC
bic_LM <- BIC(cv_LM$finalModel)
#Compute the MAD
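#(mad() is the median absolute deviation of the prediction errors, scaled by R's
# default constant 1.4826 so it is a robust analogue of the standard deviation)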
mad_LM <- mad(predictions_LM - test_data$y)
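#the LR statistic below compares the full linear model with the intercept-only (null)
#model: 2 * (logLik(full) - logLik(null)); larger values indicate a better fit than the null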
LR_stat <- 2 * (logLik(cv_LM$finalModel) - logLik(null_model))

#combine the results into a data frame
results_LM <- data.frame(
  Model = "Linear Model",
  Mean_Prediction_Error = mean_prediction_error_LM,
  MSE = mean_squared_error_LM,
  RSS = residual_sum_of_squares_LM,
  MAD = mad_LM,
  Rsquared = r_squared_LM,
  BIC = bic_LM,
  LR_stat = LR_stat
)


#format as a kable
#install.packages("kableExtra")
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
results_LM %>%
  kable("html", caption = "Linear Model Results") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:5, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(results_LM), background = "lightblue")
Linear Model Results

Model          Mean_Prediction_Error       MSE       RSS       MAD   Rsquared       BIC   LR_stat
Linear Model                44.78567  3090.399  404842.3  56.35199  0.5008519  3437.342  210.1672
#Calculate the standard error of the mean prediction error
#using the bootstrap method
#install.packages("boot")
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:lattice':
## 
##     melanoma
# Define a function to calculate the mean prediction error
mean_prediction_error <- function(data, indices) {
  # Resample the data
  resampled_data <- data[indices, ]
  # Fit the model on the resampled data
  model <- lm(y ~ ., data = resampled_data)
  # Predict on the test set
  predictions <- predict(model, newdata = test_data)
  # Calculate the mean prediction error
  mean(abs(predictions - test_data$y))
}
# Perform bootstrap resampling
set.seed(123) # Set seed for reproducibility
boot_results <- boot(data = train_data, statistic = mean_prediction_error, R = 1000)
# Calculate the standard error of the mean prediction error
mean_prediction_error_se <- sd(boot_results$t)
cat("The standard error of the mean prediction error is:", mean_prediction_error_se, "\n")
## The standard error of the mean prediction error is: 1.009269
#plot the residual vs predicted
library(ggplot2)
# Create a data frame with fitted values and residuals
residuals_plot_data <- data.frame(
  Fitted = cv_LM$finalModel$fitted.values,
  Residuals = cv_LM$finalModel$residuals
)

# Create the residuals vs fitted plot
ggplot(residuals_plot_data, aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Residuals vs Fitted Values",
    x = "Fitted Values",
    y = "Residuals"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Best Subset

#set up parallel processing
library(doParallel)
#detect the number of available cores
numCores <- detectCores()
#set up a cluster with 5 workers
cl <- makeCluster(5)
#register the cluster
registerDoParallel(cl)


#Apply best subset selection 
install.packages("leaps")
## package 'leaps' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\andre\AppData\Local\Temp\RtmpkBu0N7\downloaded_packages
library(leaps)
library(caret)
# Create 10 folds for cross-validation
set.seed(562)  # For reproducibility
folds <- createFolds(train_data$y, k = 10, list = TRUE, returnTrain = TRUE)

# Initialize a matrix to store validation errors for each fold and subset size
cv_bic <- matrix(NA, nrow = 10, ncol = 10)  # 10 folds, 10 subset sizes (nvmax)

# Perform cross-validation
for (i in seq_along(folds)) {
  # Split the data into training and validation sets
  training_indices <- folds[[i]]
  training_data <- train_data[training_indices, ]
  validation_data <- train_data[-training_indices, ]

  # Loop through each subset size
  for (j in 1:10) {
    # Fit the best subset model
    best_subset <- regsubsets(y ~ ., data = training_data, nvmax = j)
    
    # Get the coefficients for the best model
    best_model <- which.min(summary(best_subset)$bic)
    best_model_coefficients <- coef(best_subset, best_model)
    
    # Create a linear model using the selected variables
    selected_vars <- names(best_model_coefficients)[-1]  # Exclude intercept
    formula <- as.formula(paste("y ~", paste(selected_vars, collapse = "+")))
    lm_model <- lm(formula, data = training_data)
    
    # Predict on the validation set
    predictions <- predict(lm_model, newdata = validation_data)
    
    # Calculate BIC for the model
    bic_value <- BIC(lm_model)
    
    # Store the BIC value
    cv_bic[i, j] <- bic_value
  }
}
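# Note: the validation-set predictions above are not used when choosing the subset size;
# cv_bic stores the BIC of each candidate model refit on the fold's training data,
# so the criterion averaged below is a training-fit BIC rather than a validation error.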

# Calculate the average BIC for each subset size
avg_bic <- colMeans(cv_bic, na.rm = TRUE)

# Find the best subset size based on average BIC
best_subset_size <- which.min(avg_bic)

cat("The best subset size based on 10-fold cross-validated BIC is:", best_subset_size, "\n")
## The best subset size based on 10-fold cross-validated BIC is: 5
# Stop the cluster
stopCluster(cl)
#Get the coefficients to extract the predictors for the best subset size
best_model_coefficients <- coef(best_subset, best_subset_size)

# Get the names of the selected variables
selected_vars <- names(best_model_coefficients)[-1]  # Exclude intercept
cat("The selected variables for the best subset size are:", selected_vars, "\n")
## The selected variables for the best subset size are: sex bmi map hdl ltg
library(stargazer)
#Create linear model from the best subset selection
# Create a formula for the best subset model
formula_best_subset <- as.formula(paste("y ~", paste(selected_vars, collapse = "+")))
# Fit the best subset model using the training data
best_model_LM <- lm(formula_best_subset, data = train_data)
# Check the summary of the best subset model
stargazer(best_model_LM, type = "text", title = "Best Subset Selection Model Summary", digits = 3, single.row = TRUE)
## 
## Best Subset Selection Model Summary
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  y             
## -----------------------------------------------
## sex                     -164.834** (73.783)    
## bmi                     540.858*** (82.457)    
## map                     310.934*** (78.433)    
## hdl                    -257.417*** (84.008)    
## ltg                     436.692*** (72.293)    
## Constant                152.577*** (3.271)     
## -----------------------------------------------
## Observations                    311            
## R2                             0.485           
## Adjusted R2                    0.477           
## Residual Std. Error      55.258 (df = 305)     
## F Statistic           57.548*** (df = 5; 305)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
#predict the y values using the best model
predictions_best_model <- predict(best_model_LM, newdata = test_data)
#compute the mean prediction error
mean_prediction_error_best_model <- mean(abs(predictions_best_model - test_data$y))
#compute the mean squared error
mean_squared_error_best_model <- mean((predictions_best_model - test_data$y)^2)
#compute the RSS
residual_sum_of_squares_best_model <- sum((predictions_best_model - test_data$y)^2)
#compute the R squared value
r_squared_best_model <- 1 - (residual_sum_of_squares_best_model / sum((test_data$y - mean(test_data$y))^2))
#compute the BIC
bic_best_model <- BIC(best_model_LM)
#compute the MAD
mad_best_model <- mad(predictions_best_model - test_data$y)
#compute the LR statistic
LR_stat_best_model <- 2 * (logLik(cv_LM$finalModel) - logLik(best_model_LM)) 
#combine the results into a data frame
results_best_model <- data.frame(
  Model = "Best Subset Selection Model",
  Mean_Prediction_Error = mean_prediction_error_best_model,
  MSE = mean_squared_error_best_model,
  MAD = mad_best_model,
  RSS = residual_sum_of_squares_best_model,
  Rsquared = r_squared_best_model,
  BIC = bic_best_model,
  LR_stat = LR_stat_best_model
)

#format as a kable
results_best_model %>%
  kable("html", caption = "Best Subset Selection Model Results") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:5, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(results_best_model), background = "lightblue")
Best Subset Selection Model Results

Model                        Mean_Prediction_Error       MSE       MAD       RSS   Rsquared       BIC   LR_stat
Best Subset Selection Model               45.22154  3129.867  62.59345  410012.5  0.4944773  3412.167  3.524188
#plot the residual vs predicted
library(ggplot2)
# Create a data frame with predictions and residuals
ggplot(data = data.frame(Fitted = best_model_LM$fitted.values, Residuals = best_model_LM$residuals), aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Residuals vs Fitted Values (Best Subset Model)", x = "Fitted Values", y = "Residuals") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Ridge

#install glmnet
#install.packages("glmnet")
library(glmnet)

# Create a matrix of predictors and a vector of response variable
x <- as.matrix(train_data[, -which(names(train_data) == "y")])
y <- train_data$y

#Perform Ridge regression with cross-validation
set.seed(562) # Set seed for reproducibility
cv_ridge <- cv.glmnet(x, y, alpha = 0, nfolds = 10)

# Plot the cross-validation results
plot(cv_ridge)

#highlight the 1SE lambda with green vertical line
abline(v = log(cv_ridge$lambda.1se), col = "green", lty = 2, lwd = 2)
#add a legend to the plot denoting the 1SE lambda
legend("topright", legend = "1SE Lambda", col = "green", lty = 2, lwd = 2)

#Extract the 1SE lambda
lambda_ridge <- cv_ridge$lambda.1se
cat("The optimal lambda for Ridge regression is:", lambda_ridge, "\n")
## The optimal lambda for Ridge regression is: 66.66497
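# For reference, a minimal sketch (assuming the standard cv.glmnet fields lambda,
# cvm and cvsd) of how the one-standard-error rule chooses lambda: the largest
# lambda whose cross-validated error is within one SE of the minimum CV error.
idx_min <- which.min(cv_ridge$cvm)
one_se_cutoff <- cv_ridge$cvm[idx_min] + cv_ridge$cvsd[idx_min]
lambda_1se_sketch <- max(cv_ridge$lambda[cv_ridge$cvm <= one_se_cutoff])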
library(glmnet)
library(kableExtra)
library(dplyr)
# Fit the Ridge regression model using the 1SE lambda
ridge_model <- glmnet(x, y, alpha = 0, lambda = lambda_ridge)
# Predict on the test set
predictions_ridge <- predict(ridge_model, s = lambda_ridge, newx = as.matrix(test_data[, -which(names(test_data) == "y")]))
# Compute the mean prediction error
mean_prediction_error_ridge <- mean(abs(predictions_ridge - test_data$y))
# Compute the mean squared error
mean_squared_error_ridge <- mean((predictions_ridge - test_data$y)^2)
# Compute the RSS
residual_sum_of_squares_ridge <- sum((predictions_ridge - test_data$y)^2)
# Compute the R squared value
r_squared_ridge <- 1 - (residual_sum_of_squares_ridge / sum((test_data$y - mean(test_data$y))^2))
# Compute the MAD
mad_ridge <- mad(predictions_ridge - test_data$y)
# Compute the LR statistic
n <- length(y)
sigma_squared_Ridge <- residual_sum_of_squares_ridge/n
logLik_ridge <- -n/2 * log(2 * pi) - n/2 * log(sigma_squared_Ridge) - 1/(2 * sigma_squared_Ridge) * residual_sum_of_squares_ridge
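# (the expression above is the Gaussian log-likelihood with the variance fixed at its
# MLE, sigma^2 = RSS/n, so the last term equals -n/2; note that RSS here comes from
# the test set while n is the training-set size)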

LR_stat_ridge <- 2 * ( logLik(cv_LM$finalModel) - logLik_ridge)
# Compute the BIC
# Manually calculate BIC
n <- nrow(train_data)  # Number of observations
k <- length(coef(ridge_model, s = lambda_ridge)) 
# Number of parameters (including intercept)
bic_ridge <- n * log(residual_sum_of_squares_ridge / n) + log(n) * k
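# (this uses the common shortcut BIC ~ n * log(RSS/n) + k * log(n); it drops the
# additive Gaussian constant and does not count the error variance as a parameter,
# so its scale is comparable across the penalized fits below but not directly to
# stats::BIC() as reported for the linear models above)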

# Combine the results into a data frame
results_ridge <- data.frame(
  Model = "Ridge Regression Model",
  Mean_Prediction_Error = mean_prediction_error_ridge,
  MSE = mean_squared_error_ridge,
  MAD = mad_ridge,
  RSS = residual_sum_of_squares_ridge,
  Rsquared = r_squared_ridge,
  BIC = bic_ridge,
  LR_stat = LR_stat_ridge
  
)
#format as a kable
results_ridge %>%
  kable("html", caption = "Ridge Regression Model Results") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:5, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(results_ridge), background = "lightblue")
Ridge Regression Model Results

Model                   Mean_Prediction_Error       MSE       MAD       RSS   Rsquared       BIC    LR_stat
Ridge Regression Model               49.69468  3551.386  67.61069  465231.6  0.4263953  2336.703  -212.3201
#install the broom package
#install.packages("broom")
library(broom)
#tidy the ridge model
ridge_model_tidy <- tidy(ridge_model, s = lambda_ridge)
#format the ridge model coefficients as a kable
ridge_model_tidy %>%
  kable("html", caption = "Ridge Regression Model Coefficients") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(ridge_model_tidy), background = "lightblue")
Ridge Regression Model Coefficients
term step estimate lambda dev.ratio
(Intercept) 1 153.45143 66.66497 0.4416468
age 1 37.52676 66.66497 0.4416468
sex 1 -56.19174 66.66497 0.4416468
bmi 1 317.73133 66.66497 0.4416468
map 1 206.82558 66.66497 0.4416468
tc 1 23.15439 66.66497 0.4416468
ldl 1 -15.80633 66.66497 0.4416468
hdl 1 -141.18176 66.66497 0.4416468
tch 1 125.91879 66.66497 0.4416468
ltg 1 243.45232 66.66497 0.4416468
glu 1 115.65020 66.66497 0.4416468
#store the results and coefficient tables as kables (kable objects are HTML and are not arranged with grid.arrange)
library(gridExtra)
library(grid)
library(htmltools)
# Create individual plots
p12 <- results_ridge %>%
  kable("html", caption = "Ridge Regression Model Results") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:5, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(results_ridge), background = "lightblue")

p13 <- ridge_model_tidy %>%
  kable("html", caption = "Ridge Regression Model Coefficients") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(ridge_model_tidy), background = "lightblue")
#plot the residual vs predicted from the ridge model
library(ggplot2)
# Create a data frame with predictions and residuals
predictions_ridge <- as.vector(predictions_ridge)
residuals_ridge <- as.vector(predictions_ridge - test_data$y)
ggplot(data = data.frame(Fitted = predictions_ridge, Residuals = residuals_ridge), aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Residuals vs Fitted Values (Ridge Model)", x = "Fitted Values", y = "Residuals") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

LASSO

library(glmnet)
#Perform Lasso regression with cross validation
set.seed(562) # Set seed for reproducibility
cv_lasso <- cv.glmnet(x, y, alpha = 1, nfolds = 10)
# Plot the cross-validation results
plot(cv_lasso)

#Extract the 1SE lambda
lambda_lasso <- cv_lasso$lambda.1se
cat("The optimal lambda for Lasso regression is:", lambda_lasso, "\n")
## The optimal lambda for Lasso regression is: 10.13248
#highlight the 1SE lambda with green vertical line
abline(v = log(cv_lasso$lambda.1se), col = "green", lty = 2, lwd = 2)

#add legend to the plot denoting the 1SE lambda
legend("topright", legend = "1SE Lambda", col = "green", lty = 2, lwd = 2)

# Fit the Lasso regression model using the 1SE lambda
lasso_model <- glmnet(x, y, alpha = 1, lambda = lambda_lasso)
# Predict on the test set
predictions_lasso <- predict(lasso_model, s = lambda_lasso, newx = as.matrix(test_data[, -which(names(test_data) == "y")]))
# Compute the mean prediction error
mean_prediction_error_lasso <- mean(abs(predictions_lasso - test_data$y))
# Compute the mean squared error
mean_squared_error_lasso <- mean((predictions_lasso - test_data$y)^2)
# Compute the RSS
residual_sum_of_squares_lasso <- sum((predictions_lasso - test_data$y)^2)
# Compute the R squared value
r_squared_lasso <- 1 - (residual_sum_of_squares_lasso / sum((test_data$y - mean(test_data$y))^2))
#compute the MAD
mad_lasso <- mad(predictions_lasso - test_data$y)
# Compute the LR statistic
n <- length(y)
sigma_squared_Lasso <- residual_sum_of_squares_lasso/n
logLik_lasso <- -n/2 * log(2 * pi) - n/2 * log(sigma_squared_Lasso) - 1/(2 * sigma_squared_Lasso) * residual_sum_of_squares_lasso
LR_stat_lasso <- 2 * ( logLik(cv_LM$finalModel) - logLik_lasso)

# Compute the BIC
# Manually calculate BIC
n <- nrow(train_data)  # Number of observations
k <- length(coef(lasso_model, s = lambda_lasso))
# Number of parameters (including intercept)
bic_lasso <- n * log(residual_sum_of_squares_lasso / n) + log(n) * k
# Combine the results into a data frame
results_lasso <- data.frame(
  Model = "Lasso Regression Model",
  Mean_Prediction_Error = mean_prediction_error_lasso,
  MSE = mean_squared_error_lasso,
  MAD = mad_lasso,
  RSS = residual_sum_of_squares_lasso,
  Rsquared = r_squared_lasso,
  BIC = bic_lasso,
  LR_stat = LR_stat_lasso
)

#format as a kable
results_lasso %>%
  kable("html", caption = "Lasso Regression Model Results") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:5, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(results_lasso), background = "lightblue")
Lasso Regression Model Results

Model                   Mean_Prediction_Error       MSE       MAD       RSS  Rsquared       BIC    LR_stat
Lasso Regression Model               50.03535  3631.114  65.17592  475675.9  0.413518  2343.607  -205.4155
#install the broom package
#install.packages("broom")
library(broom)
#tidy the lasso model
lasso_model_tidy <- tidy(lasso_model, s = lambda_lasso)
#format the lasso model coefficients as a kable
lasso_model_tidy %>%
  kable("html", caption = "Lasso Regression Model Coefficients") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(lasso_model_tidy), background = "lightblue")
Lasso Regression Model Coefficients
term step estimate lambda dev.ratio
(Intercept) 1 152.22289 10.13248 0.4429363
bmi 1 487.03940 10.13248 0.4429363
map 1 146.51797 10.13248 0.4429363
hdl 1 -50.29042 10.13248 0.4429363
ltg 1 354.59762 10.13248 0.4429363
#create a vector of the predicted values
predictions_lasso <- as.vector(predictions_lasso)
residuals_lasso <- as.vector(predictions_lasso - test_data$y)
#plot the residual vs predicted from the lasso model
library(ggplot2)
# Create a data frame with predictions and residuals
ggplot(data = data.frame(Fitted = predictions_lasso, Residuals = residuals_lasso), aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Residuals vs Fitted Values (Lasso Model)", x = "Fitted Values", y = "Residuals") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Metrics Table

library(dplyr)
#create 1 data table with all the results
# Combine all results into a single data frame
all_results <- rbind(results_LM, results_best_model, results_ridge, results_lasso)
#format as a kable
all_results %>%
  kable("html", caption = "All Model Results") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:5, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(all_results), background = "lightblue")
All Model Results

Model                        Mean_Prediction_Error       MSE       RSS       MAD   Rsquared       BIC      LR_stat
Linear Model                              44.78567  3090.399  404842.3  56.35199  0.5008519  3437.342   210.167174
Best Subset Selection Model               45.22154  3129.867  410012.5  62.59345  0.4944773  3412.167     3.524188
Ridge Regression Model                    49.69468  3551.386  465231.6  67.61069  0.4263953  2336.703  -212.320094
Lasso Regression Model                    50.03535  3631.114  475675.9  65.17592  0.4135180  2343.607  -205.415464

Scatter Plots

# Create a line plot for each variable with y
library(ggplot2)
library(gridExtra)
# Create individual plots for each variable
p7 <- ggplot(data.all, aes(x = age, y = y)) +
  geom_line(color = "blue") +
  labs(title = "Age vs Y", x = "Age", y = "Y") +
  theme_minimal()
p8 <- ggplot(data.all, aes(x = bmi, y = y)) +
  geom_line(color = "blue") +
  labs(title = "BMI vs Y", x = "BMI", y = "Y") +
  theme_minimal()
p9 <- ggplot(data.all, aes(x = map, y = y)) +
  geom_line(color = "blue") +
  labs(title = "MAP vs Y", x = "MAP", y = "Y") +
  theme_minimal()
p10 <- ggplot(data.all, aes(x = tc, y = y)) +
  geom_line(color = "blue") +
  labs(title = "TC vs Y", x = "TC", y = "Y") +
  theme_minimal()
p11 <- ggplot(data.all, aes(x = ldl, y = y)) +
  geom_line(color = "blue") +
  labs(title = "LDL vs Y", x = "LDL", y = "Y") +
  theme_minimal()
p12 <- ggplot(data.all, aes(x = hdl, y = y)) +
  geom_line(color = "blue") +
  labs(title = "HDL vs Y", x = "HDL", y = "Y") +
  theme_minimal()
p13 <- ggplot(data.all, aes(x = tch, y = y)) +
  geom_line(color = "blue") +
  labs(title = "TCH vs Y", x = "TCH", y = "Y") +
  theme_minimal()
p14 <- ggplot(data.all, aes(x = ltg, y = y)) +
  geom_line(color = "blue") +
  labs(title = "LTG vs Y", x = "LTG", y = "Y") +
  theme_minimal()
p15 <- ggplot(data.all, aes(x = glu, y = y)) +
  geom_line(color = "blue") +
  labs(title = "GLU vs Y", x = "GLU", y = "Y") +
  theme_minimal()
# Arrange the plots in a grid
grid.arrange(p7, p8, p9, p10, p11, p12, p13, p14, p15, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p7, p8, p9, p10, p11, p12, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p13, p14, p15, ncol = 3)

# Create a scatter plot for each variable with y
library(ggplot2)
library(gridExtra)
# Create individual scatter plots for each variable
p1 <- ggplot(data.all, aes(x = age, y = y)) +
  geom_point(color = "blue") +
  labs(title = "Age vs Y", x = "Age", y = "Y") +
  theme_minimal()
p2 <- ggplot(data.all, aes(x = bmi, y = y)) +
  geom_point(color = "blue") +
  labs(title = "BMI vs Y", x = "BMI", y = "Y") +
  theme_minimal()
p3 <- ggplot(data.all, aes(x = map, y = y)) +
  geom_point(color = "blue") +
  labs(title = "MAP vs Y", x = "MAP", y = "Y") +
  theme_minimal()
p4 <- ggplot(data.all, aes(x = tc, y = y)) +
  geom_point(color = "blue") +
  labs(title = "TC vs Y", x = "TC", y = "Y") +
  theme_minimal()
p5 <- ggplot(data.all, aes(x = ldl, y = y)) +
  geom_point(color = "blue") +
  labs(title = "LDL vs Y", x = "LDL", y = "Y") +
  theme_minimal()
p6 <- ggplot(data.all, aes(x = hdl, y = y)) +
  geom_point(color = "blue") +
  labs(title = "HDL vs Y", x = "HDL", y = "Y") +
  theme_minimal()
p7 <- ggplot(data.all, aes(x = tch, y = y)) +
  geom_point(color = "blue") +
  labs(title = "TCH vs Y", x = "TCH", y = "Y") +
  theme_minimal()
p8 <- ggplot(data.all, aes(x = ltg, y = y)) +
  geom_point(color = "blue") +
  labs(title = "LTG vs Y", x = "LTG", y = "Y") +
  theme_minimal()
p9 <- ggplot(data.all, aes(x = glu, y = y)) +
  geom_point(color = "blue") +
  labs(title = "GLU vs Y", x = "GLU", y = "Y") +
  theme_minimal()
# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p7, p8, p9, ncol = 3)

Alternate Ridge: Quadratics

#add squared predictor terms to a copy of the scaled data
#load the dplyr package
library(dplyr)


#Create a new variable for bmi squared
data.all.scaled2 <- data.all.scaled %>%
  mutate(bmi_squared = bmi^2)

#create a new variable for hdl squared (stored as htl_squared)
data.all.scaled2 <- data.all.scaled2 %>%
  mutate(htl_squared = hdl^2)

str(data.all.scaled2)
## 'data.frame':    442 obs. of  12 variables:
##  $ age        : num  0.03943 -0.00188 0.09219 -0.0823 0.00541 ...
##  $ sex        : num  0.0507 -0.0446 0.0507 -0.0446 -0.0446 ...
##  $ bmi        : num  0.0522 -0.0596 0.0394 -0.012 -0.0404 ...
##  $ map        : num  0.02125 -0.02727 -0.00571 -0.03849 0.02125 ...
##  $ tc         : num  -0.04421 -0.00845 -0.04558 0.01219 0.00393 ...
##  $ ldl        : num  -0.0348 -0.0192 -0.0342 0.025 0.0156 ...
##  $ hdl        : num  -0.0434 0.07442 -0.03235 -0.03603 0.00814 ...
##  $ tch        : num  -0.00262 -0.04516 -0.00262 0.03067 -0.00262 ...
##  $ ltg        : num  0.02235 -0.04767 0.00291 0.0259 -0.02682 ...
##  $ glu        : num  -0.01787 -0.09851 -0.02642 -0.00943 -0.04824 ...
##  $ y          : num  151 75 141 206 135 97 138 63 110 310 ...
##  $ htl_squared: num  1.88e-03 5.54e-03 1.05e-03 1.30e-03 6.63e-05 ...
head(data.all.scaled2)
##            age         sex         bmi          map           tc         ldl
## 1  0.039431878  0.05068012  0.05221748  0.021247857 -0.044206270 -0.03481480
## 2 -0.001878745 -0.04464164 -0.05962698 -0.027270752 -0.008448088 -0.01916152
## 3  0.092193787  0.05068012  0.03935396 -0.005713851 -0.045581142 -0.03418871
## 4 -0.082295234 -0.04464164 -0.01198357 -0.038494846  0.012191892  0.02499368
## 5  0.005409915 -0.04464164 -0.04036247  0.021247857  0.003934990  0.01559734
## 6 -0.085387666 -0.04464164 -0.04570613 -0.019954320 -0.068949058 -0.07925735
##            hdl          tch          ltg          glu   y  htl_squared
## 1 -0.043396525 -0.002615009  0.022352654 -0.017874473 151 1.883258e-03
## 2  0.074424140 -0.045162796 -0.047668264 -0.098508964  75 5.538953e-03
## 3 -0.032353522 -0.002615009  0.002911379 -0.026424040 141 1.046750e-03
## 4 -0.036034584  0.030670249  0.025899440 -0.009426102 206 1.298491e-03
## 5  0.008142237 -0.002615009 -0.026815883 -0.048243186 135 6.629603e-05
## 6  0.041280735 -0.099198741 -0.032898611 -0.103234502  97 1.704099e-03
#partition the data into training and test sets
set.seed(123) # Set seed for reproducibility
train_index2 <- createDataPartition(data.all.scaled2$y, p = 0.7, list = FALSE)
train_data2 <- as.data.frame(data.all.scaled2[train_index2, ])
test_data2<- as.data.frame(data.all.scaled2[-train_index2, ])
# Check the dimensions of the training and test sets
cat("Training set dimensions are: ", dim(train_data2)[1], " rows and ", dim(train_data2)[2], " columns.\n")
## Training set dimensions are:  311  rows and  12  columns.
cat("Test set dimensions : ", dim(test_data2)[1], " rows and ", dim(test_data2)[2], " columns.\n")
## Test set dimensions :  131  rows and  12  columns.
#perform a ridge model with cross validation
# Create a matrix of predictors and a vector of response variable
x2 <- as.matrix(train_data2[, -which(names(train_data2) == "y")])
y2 <- train_data2$y
#Perform Ridge regression with cross-validation
set.seed(562) # Set seed for reproducibility
cv_ridge2 <- cv.glmnet(x2, y2, alpha = 0, nfolds = 10)
# Plot the cross-validation results
plot(cv_ridge2)

#highlight the 1SE lambda with green vertical line
abline(v = log(cv_ridge2$lambda.1se), col = "green", lty = 2, lwd = 2)
#add a legend to the plot denoting the 1SE lambda
legend("topright", legend = "1SE Lambda", col = "green", lty = 2, lwd = 2)

#Extract the 1SE lambda
lambda_ridge2 <- cv_ridge2$lambda.1se
cat("The optimal lambda for Ridge regression is:", lambda_ridge2, "\n")
## The optimal lambda for Ridge regression is: 66.66497
# Fit the Ridge regression model using the 1SE lambda
ridge_model2 <- glmnet(x2, y2, alpha = 0, lambda = lambda_ridge2)
# Predict on the test set
predictions_ridge2 <- predict(ridge_model2, s = lambda_ridge2, newx = as.matrix(test_data2[, -which(names(test_data2) == "y")]))
# Compute the mean prediction error
mean_prediction_error_ridge2 <- mean(abs(predictions_ridge2 - test_data2$y))
# Compute the mean squared error
mean_squared_error_ridge2 <- mean((predictions_ridge2 - test_data2$y)^2)
# Compute the RSS
residual_sum_of_squares_ridge2 <- sum((predictions_ridge2 - test_data2$y)^2)
# Compute the R squared value
r_squared_ridge2 <- 1 - (residual_sum_of_squares_ridge2 / sum((test_data2$y - mean(test_data2$y))^2))
# Compute the MAD
mad_ridge2 <- mad(predictions_ridge2 - test_data2$y)
# Compute the LR statistic
n2 <- length(y2)
sigma_squared_Ridge2 <- residual_sum_of_squares_ridge2/n2
logLik_ridge2 <- -n2/2 * log(2 * pi) - n2/2 * log(sigma_squared_Ridge2) - 1/(2 * sigma_squared_Ridge2) * residual_sum_of_squares_ridge2
LR_stat_ridge2 <- 2 * ( logLik(cv_LM$finalModel) - logLik_ridge2)
# Compute the BIC
# Manually calculate BIC
n2 <- nrow(train_data2)  # Number of observations
k2 <- length(coef(ridge_model2, s = lambda_ridge2))
# Number of parameters (including intercept)
bic_ridge2 <- n2 * log(residual_sum_of_squares_ridge2 / n2) + log(n2) * k2
# Combine the results into a data frame
results_ridge2 <- data.frame(
  Model = "Ridge Regression Model with BMI Squared",
  Mean_Prediction_Error = mean_prediction_error_ridge2,
  MSE = mean_squared_error_ridge2,
  MAD = mad_ridge2,
  RSS = residual_sum_of_squares_ridge2,
  Rsquared = r_squared_ridge2,
  BIC = bic_ridge2,
  LR_stat = LR_stat_ridge2
)
#format as a kable
results_ridge2 %>%
  kable("html", caption = "Ridge Regression Model with BMI Squared Results") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:5, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(results_ridge2), background = "lightblue")
Ridge Regression Model with BMI Squared Results

Model                                     Mean_Prediction_Error       MSE       MAD       RSS   Rsquared       BIC    LR_stat
Ridge Regression Model with BMI Squared                49.67256  3550.158  67.59386  465070.8  0.4265936  2342.335  -212.4276
#show the coefficient estimates
#install the broom package
#install.packages("broom")
library(broom)
#tidy the ridge model
ridge_model2_tidy <- tidy(ridge_model2, s = lambda_ridge2)
#format the ridge model coefficients as a kable
ridge_model2_tidy %>%
  kable("html", caption = "Ridge Regression Model with BMI Squared Coefficients") %>%
  kable_styling("striped", full_width = F) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, color = "black") %>%
  row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
  row_spec(1:nrow(ridge_model2_tidy), background = "lightblue")
Ridge Regression Model with BMI Squared Coefficients
term step estimate lambda dev.ratio
(Intercept) 1 153.55218 66.66497 0.4416078
age 1 37.77190 66.66497 0.4416078
sex 1 -56.17585 66.66497 0.4416078
bmi 1 317.63017 66.66497 0.4416078
map 1 206.66046 66.66497 0.4416078
tc 1 23.19310 66.66497 0.4416078
ldl 1 -16.09537 66.66497 0.4416078
hdl 1 -140.60619 66.66497 0.4416078
tch 1 125.97219 66.66497 0.4416078
ltg 1 243.58352 66.66497 0.4416078
glu 1 115.71980 66.66497 0.4416078
htl_squared 1 -52.20919 66.66497 0.4416078
# Create a data frame with predictions and residuals
predictions_ridge2 <- as.vector(predictions_ridge2)
residuals_ridge2 <- as.vector(predictions_ridge2 - test_data2$y)
#plot the residual vs predicted from the ridge model
library(ggplot2)
# Create a data frame with predictions and residuals
ggplot(data = data.frame(Fitted = predictions_ridge2, Residuals = residuals_ridge2), aes(x = Fitted, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Residuals vs Fitted Values (Ridge Model with BMI Squared)", x = "Fitted Values", y = "Residuals") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'