# Set CRAN mirror and load the packages used throughout the analysis
options(repos = c(CRAN = "https://cloud.r-project.org"))
library(kernlab)
library(ggplot2)
library(dplyr)
library(tidyr)
library(caret)
library(e1071)
library(ggpubr)
#load the glmnet package
library(glmnet)
#load the MASS package
library(MASS)
#install the lars package (source of the diabetes data set)
install.packages("lars")
## package 'lars' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\andre\AppData\Local\Temp\RtmpkBu0N7\downloaded_packages
library(lars)
#load the diabetes data set
data(diabetes)
data.all <- data.frame(cbind(diabetes$x, y=diabetes$y))
Data Quality Check
#dimensions of the data set
dim(data.all)
## [1] 442 11
cat("The dimensions of the DataSet are: ", dim(data.all)[1], " rows and ", dim(data.all)[2], " columns.\n")
## The dimensions of the DataSet are: 442 rows and 11 columns.
#Generate summary statistics of the data using the describe function
# from the psych package
#install.packages("psych")
library(psych)
#summary statistics
describe(data.all)
## vars n mean sd median trimmed mad min max range skew
## age 1 442 0.00 0.05 0.01 0.00 0.05 -0.11 0.11 0.22 -0.23
## sex 2 442 0.00 0.05 -0.04 0.00 0.00 -0.04 0.05 0.10 0.13
## bmi 3 442 0.00 0.05 -0.01 0.00 0.05 -0.09 0.17 0.26 0.59
## map 4 442 0.00 0.05 -0.01 0.00 0.05 -0.11 0.13 0.24 0.29
## tc 5 442 0.00 0.05 0.00 0.00 0.05 -0.13 0.15 0.28 0.38
## ldl 6 442 0.00 0.05 0.00 0.00 0.04 -0.12 0.20 0.31 0.43
## hdl 7 442 0.00 0.05 -0.01 0.00 0.05 -0.10 0.18 0.28 0.79
## tch 8 442 0.00 0.05 0.00 0.00 0.05 -0.08 0.19 0.26 0.73
## ltg 9 442 0.00 0.05 0.00 0.00 0.05 -0.13 0.13 0.26 0.29
## glu 10 442 0.00 0.05 0.00 0.00 0.04 -0.14 0.14 0.27 0.21
## y 11 442 152.13 77.09 140.50 147.54 88.21 25.00 346.00 321.00 0.44
## kurtosis se
## age -0.69 0.00
## sex -1.99 0.00
## bmi 0.07 0.00
## map -0.55 0.00
## tc 0.20 0.00
## ldl 0.56 0.00
## hdl 0.94 0.00
## tch 0.41 0.00
## ltg -0.16 0.00
## glu 0.21 0.00
## y -0.90 3.67
#Check for missing values
missing_values <- colSums(is.na(data.all))
missing_values <- missing_values[missing_values > 0]
if (length(missing_values) > 0) {
cat("Missing values found in the following columns:\n")
print(missing_values)
} else {
cat("No missing values found in the dataset.\n")
}
## No missing values found in the dataset.
Quartiles
#compute the 25th, 50th, and 75th percentiles for all variables
quantiles <- apply(data.all, 2, function(x) quantile(x, probs = c(0.25, 0.5, 0.75)))
#round the quantiles to 3 decimal places
quantiles <- round(quantiles, 3)
#transpose so each variable is a row
quantiles <- as.data.frame(t(quantiles))
#label the percentile columns
colnames(quantiles) <- c("25th Percentile", "50th Percentile", "75th Percentile")
#display the quantiles
quantiles
## 25th Percentile 50th Percentile 75th Percentile
## age -0.037 0.005 0.038
## sex -0.045 -0.045 0.051
## bmi -0.034 -0.007 0.031
## map -0.037 -0.006 0.036
## tc -0.034 -0.004 0.028
## ldl -0.030 -0.004 0.030
## hdl -0.035 -0.007 0.029
## tch -0.039 -0.003 0.034
## ltg -0.033 -0.002 0.032
## glu -0.033 -0.001 0.028
## y 87.000 140.500 211.500
library(e1071)
library(dplyr)
library(tidyr)
#calculate skewness for numeric variables
skewness_values <- sapply(data.all, function(x) if(is.numeric(x)) skewness(x, na.rm = TRUE) else NA)
#calculate kurtosis for numeric variables
kurtosis_values <- sapply(data.all, function(x) if(is.numeric(x)) kurtosis(x, na.rm = TRUE) else NA)
#combine skewness and kurtosis into a data frame
skew_kurt <- data.frame(Skewness = skewness_values, Kurtosis = kurtosis_values)
#remove NA values
skew_kurt <- skew_kurt[complete.cases(skew_kurt), ]
#round the skewness and kurtosis values to 3 decimal places
skew_kurt <- round(skew_kurt, 3)
#display the skewness and kurtosis values
skew_kurt
## Skewness Kurtosis
## age -0.230 -0.688
## sex 0.127 -1.988
## bmi 0.594 0.067
## map 0.289 -0.551
## tc 0.376 0.202
## ldl 0.434 0.565
## hdl 0.794 0.939
## tch 0.730 0.410
## ltg 0.290 -0.159
## glu 0.207 0.206
## y 0.438 -0.896
Distributions
#plot the distribution of the target variable y
ggplot(data.all, aes(x = y)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of Target Variable (y)", x = "y", y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#grid these distributions
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
library(ggplot2)
# Create individual plots
p1 <- ggplot(data.all, aes(x = tc)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of TC", x = "TC", y = "Density") +
theme_minimal()
p2 <- ggplot(data.all, aes(x = ldl)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of LDL", x = "LDL", y = "Density") +
theme_minimal()
p3 <- ggplot(data.all, aes(x = hdl)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of HDL", x = "HDL", y = "Density") +
theme_minimal()
p4 <- ggplot(data.all, aes(x = tch)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of TCH", x = "TCH", y = "Density") +
theme_minimal()
p5 <- ggplot(data.all, aes(x = ltg)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of LTG", x = "LTG", y = "Density") +
theme_minimal()
p6 <- ggplot(data.all, aes(x = glu)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of GLU", x = "GLU", y = "Density") +
theme_minimal()
# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

#create individual plots for the age, sex, bmi, and map variables, starting at p7
p7 <- ggplot(data.all, aes(x = age)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of Age", x = "age", y = "Density") +
theme_minimal()
p8 <- ggplot(data.all, aes(x = sex)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of Sex", x = "sex", y = "Density") +
theme_minimal()
p9 <- ggplot(data.all, aes(x = bmi)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of BMI", x = "bmi", y = "Density") +
theme_minimal()
p10 <- ggplot(data.all, aes(x = map)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of MAP", x = "map", y = "Density") +
theme_minimal()
#arrange the plots in a grid
grid.arrange(p7, p8, p9, p10, ncol = 2)

Correlation Matrix
#Create a heatmap of the correlations using corrplot
#install.packages("corrplot")
library(corrplot)
## corrplot 0.95 loaded
# Compute the correlation matrix
correlation_matrix <- cor(data.all, use = "pairwise.complete.obs")
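As a quick sketch before the heatmap (output not shown), the predictors can be ranked by the absolute value of their correlation with the target:
#rank the predictors by absolute correlation with y (a quick sketch)
sort(abs(correlation_matrix[setdiff(rownames(correlation_matrix), "y"), "y"]), decreasing = TRUE)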
# Create a heatmap of the correlation matrix
corrplot(correlation_matrix, method = "color", tl.col = "black", tl.srt = 45, addCoef.col = "black", number.cex = 0.7, title = "Correlation Heatmap", mar = c(0, 0, 1, 0))

Box Plots
#Generate Boxplots for each variable
#install.packages("ggpubr")
library(ggpubr)
# Create boxplots for age and overlay the observations
bp_age <- ggplot(data.all, aes(x = "", y = age)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of Age", x = "", y = "Age") +
theme_minimal()
bp_bmi <- ggplot(data.all, aes(x = "", y = bmi)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of BMI", x = "", y = "BMI") +
theme_minimal()
bp_map <- ggplot(data.all, aes(x = "", y = map)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of MAP", x = "", y = "MAP") +
theme_minimal()
bp_tc <- ggplot(data.all, aes(x = "", y = tc)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of TC", x = "", y = "TC") +
theme_minimal()
bp_ldl <- ggplot(data.all, aes(x = "", y = ldl)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of LDL", x = "", y = "LDL") +
theme_minimal()
bp_hdl <- ggplot(data.all, aes(x = "", y = hdl)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of HDL", x = "", y = "HDL") +
theme_minimal()
bp_tch <- ggplot(data.all, aes(x = "", y = tch)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of TCH", x = "", y = "TCH") +
theme_minimal()
bp_ltg <- ggplot(data.all, aes(x = "", y = ltg)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of LTG", x = "", y = "LTG") +
theme_minimal()
bp_glu <- ggplot(data.all, aes(x = "", y = glu)) +
geom_boxplot(fill = "lightblue", outlier.colour = "red", outlier.size = 1.5) +
geom_jitter(color = "black", size = 0.5, alpha = 0.5) +
labs(title = "Boxplot of GLU", x = "", y = "GLU") +
theme_minimal()
# Arrange the boxplots in a grid
library(gridExtra)
library(grid)
library(ggplot2)
#Arrange the first six boxplots in a 2x3 grid
grid.arrange(bp_age, bp_bmi, bp_map, bp_tc, bp_ldl, bp_hdl, ncol = 3)

# Arrange the remaining three boxplots in a single row
grid.arrange(bp_tch, bp_ltg, bp_glu, ncol = 3)
Outliers
#count the number of observations 2x above the 75th percentile
#for each variable
#load the dplyr package
library(dplyr)
# Create a function to count the number of observations 2x above the 75th percentile
count_above_75th <- function(x) {
threshold <- quantile(x, 0.75, na.rm = TRUE)
sum(x > (2 * threshold), na.rm = TRUE)
}
#apply the function to each variable
outliers_count <- sapply(data.all, count_above_75th)
# Create a data frame to display the results
outliers_df <- data.frame(Variable = names(outliers_count), Outliers_Count = outliers_count)
# Display the results
outliers_df <- outliers_df %>%
arrange(desc(Outliers_Count)) %>%
mutate(Outliers_Count = as.integer(Outliers_Count))
#count the outliers for each variable
outliers_df <- outliers_df %>%
mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))
print(outliers_df)
## Variable Outliers_Count
## tch tch 58
## glu glu 55
## tc tc 52
## hdl hdl 52
## bmi bmi 45
## ldl ldl 43
## ltg ltg 42
## map map 31
## age age 13
## sex sex NA
## y y NA
#count the number of observations 2x below the 25th percentile
#for each variable
# Create a function to count the number of observations 2x below the 25th percentile
count_below_25th <- function(x) {
threshold <- quantile(x, 0.25, na.rm = TRUE)
sum(x < (2 * threshold), na.rm = TRUE)
}
#apply the function to each variable
outliers_count_below <- sapply(data.all, count_below_25th)
# Create a data frame to display the results
outliers_df_below <- data.frame(Variable = names(outliers_count_below), Outliers_Count = outliers_count_below)
# Display the results
outliers_df_below <- outliers_df_below %>%
arrange(desc(Outliers_Count)) %>%
mutate(Outliers_Count = as.integer(Outliers_Count))
#count the outliers for each variable
outliers_df_below <- outliers_df_below %>%
mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))
print(outliers_df_below)
## Variable Outliers_Count
## y y 275
## ldl ldl 39
## glu glu 35
## ltg ltg 31
## age age 30
## tc tc 28
## map map 23
## bmi bmi 21
## hdl hdl 15
## sex sex NA
## tch tch NA
#transform age, bmi, map, tc, ldl, hdl, tch, ltg, and glu with a Yeo-Johnson power transformation
#install.packages("robustbase")
library(robustbase)
# Load required library
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
# Perform Yeo-Johnson transformation
vars_to_transform <- setdiff(names(data.all), c("sex", "y"))
yj <- powerTransform(data.all[vars_to_transform], family = "yjPower") # Estimate lambda
data.all.scaled <- data.all
data.all.scaled[vars_to_transform] <- yjPower(data.all[vars_to_transform], coef(yj))
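For reference, the Yeo-Johnson family applied by powerTransform/yjPower maps a value $x$ with estimated power $\lambda$ to

$$
\psi(x,\lambda)=
\begin{cases}
\dfrac{(x+1)^{\lambda}-1}{\lambda}, & x \ge 0,\ \lambda \ne 0\\[4pt]
\log(x+1), & x \ge 0,\ \lambda = 0\\[4pt]
-\dfrac{(1-x)^{2-\lambda}-1}{2-\lambda}, & x < 0,\ \lambda \ne 2\\[4pt]
-\log(1-x), & x < 0,\ \lambda = 2
\end{cases}
$$

so it behaves like a Box-Cox transform while accommodating the negative values in these centered predictors.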
#check the scaled data
head(data.all.scaled)
## age sex bmi map tc ldl
## 1 0.039431878 0.05068012 0.05221748 0.021247857 -0.044206270 -0.03481480
## 2 -0.001878745 -0.04464164 -0.05962698 -0.027270752 -0.008448088 -0.01916152
## 3 0.092193787 0.05068012 0.03935396 -0.005713851 -0.045581142 -0.03418871
## 4 -0.082295234 -0.04464164 -0.01198357 -0.038494846 0.012191892 0.02499368
## 5 0.005409915 -0.04464164 -0.04036247 0.021247857 0.003934990 0.01559734
## 6 -0.085387666 -0.04464164 -0.04570613 -0.019954320 -0.068949058 -0.07925735
## hdl tch ltg glu y
## 1 -0.043396525 -0.002615009 0.022352654 -0.017874473 151
## 2 0.074424140 -0.045162796 -0.047668264 -0.098508964 75
## 3 -0.032353522 -0.002615009 0.002911379 -0.026424040 141
## 4 -0.036034584 0.030670249 0.025899440 -0.009426102 206
## 5 0.008142237 -0.002615009 -0.026815883 -0.048243186 135
## 6 0.041280735 -0.099198741 -0.032898611 -0.103234502 97
#check the dimensions of the scaled data
cat("The dimensions of the scaled DataSet are: ", dim(data.all.scaled)[1], " rows and ", dim(data.all.scaled)[2], " columns.\n")
## The dimensions of the scaled DataSet are: 442 rows and 11 columns.
#check the summary statistics of the scaled data
summary(data.all.scaled)
## age sex bmi map
## Min. :-0.097570 Min. :-0.04464 Min. :-0.116944 Min. :-0.130421
## 1st Qu.:-0.036054 1st Qu.:-0.04464 1st Qu.:-0.037738 1st Qu.:-0.038495
## Median : 0.005410 Median :-0.04464 Median :-0.007436 Median :-0.005714
## Mean : 0.002037 Mean : 0.00000 Mean :-0.006053 Mean :-0.002939
## 3rd Qu.: 0.039432 3rd Qu.: 0.05068 3rd Qu.: 0.028658 3rd Qu.: 0.034012
## Max. : 0.122426 Max. : 0.05068 Max. : 0.111448 Max. : 0.111950
## tc ldl hdl
## Min. :-1.266e-01 Min. :-1.155e-01 Min. :-1.023e-01
## 1st Qu.:-3.424e-02 1st Qu.:-3.035e-02 1st Qu.:-3.511e-02
## Median :-4.321e-03 Median :-3.819e-03 Median :-6.584e-03
## Mean : 1.972e-05 Mean : 1.098e-05 Mean : 5.130e-06
## 3rd Qu.: 2.837e-02 3rd Qu.: 2.985e-02 3rd Qu.: 2.931e-02
## Max. : 1.541e-01 Max. : 1.990e-01 Max. : 1.813e-01
## tch ltg glu y
## Min. :-0.099199 Min. :-0.067887 Min. :-0.151938 Min. : 25.0
## 1st Qu.:-0.045163 1st Qu.:-0.027685 1st Qu.:-0.033991 1st Qu.: 87.0
## Median :-0.002615 Median :-0.001925 Median :-0.001079 Median :140.5
## Mean :-0.007068 Mean : 0.014251 Mean :-0.001608 Mean :152.1
## 3rd Qu.: 0.030670 3rd Qu.: 0.039220 3rd Qu.: 0.027360 3rd Qu.:211.5
## Max. : 0.108560 Max. : 0.303423 Max. : 0.123502 Max. :346.0
#check the skewness and kurtosis of the scaled data
skewness_values_scaled <- sapply(data.all.scaled, function(x) if(is.numeric(x)) skewness(x, na.rm = TRUE) else NA)
kurtosis_values_scaled <- sapply(data.all.scaled, function(x) if(is.numeric(x)) kurtosis(x, na.rm = TRUE) else NA)
#combine skewness and kurtosis into a data frame
skew_kurt_scaled <- data.frame(Skewness = skewness_values_scaled, Kurtosis = kurtosis_values_scaled)
#remove NA values
skew_kurt_scaled <- skew_kurt_scaled[complete.cases(skew_kurt_scaled), ]
#display the skewness and kurtosis values
skew_kurt_scaled <- round(skew_kurt_scaled, 3)
#display the skewness and kurtosis values
skew_kurt_scaled
## Skewness Kurtosis
## age -0.067 -0.709
## sex 0.127 -1.988
## bmi -0.006 -0.584
## map 0.033 -0.586
## tc 0.378 0.205
## ldl 0.435 0.568
## hdl 0.795 0.941
## tch -0.022 -0.510
## ltg 1.940 5.160
## glu -0.011 0.154
## y 0.438 -0.896
#count the number of observations with an absolute z-score greater than 3
#for each variable
# Create a function to count the number of observations with absolute z-score > 3
count_above_3 <- function(x) {
z_scores <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
sum(abs(z_scores) > 3, na.rm = TRUE)
}
#apply the function to the scaled data
outliers_count_3 <- sapply(data.all.scaled, count_above_3)
# Create a data frame to display the results
outliers_df_3 <- data.frame(Variable = names(outliers_count_3), Outliers_Count = outliers_count_3)
# Display the results
outliers_df_3 <- outliers_df_3 %>%
arrange(desc(Outliers_Count)) %>%
mutate(Outliers_Count = as.integer(Outliers_Count))
#count the outliers for each variable
outliers_df_3 <- outliers_df_3 %>%
mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))
print(outliers_df_3)
## Variable Outliers_Count
## ltg ltg 9
## hdl hdl 5
## tc tc 2
## ldl ldl 2
## glu glu 1
## age age NA
## sex sex NA
## bmi bmi NA
## map map NA
## tch tch NA
## y y NA
#replace values with absolute z-scores greater than 3 with the 90th percentile value
#for each variable
# Create a function to replace outliers with the 90th percentile value
replace_outliers <- function(x) {
z_scores <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
threshold <- quantile(x, 0.90, na.rm = TRUE)
x[abs(z_scores) > 3] <- threshold
return(x)
}
#apply the function to the scaled data
data.all.scaled <- data.all.scaled %>%
mutate(across(c(age, bmi, map, tc, ldl, hdl, tch, ltg, glu),
~ replace_outliers(.)))
#count the number of observations with an absolute z-score greater than 3
#for each variable
# Create a function to count the number of observations with absolute z-score > 3
count_above_3 <- function(x) {
z_scores <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
sum(abs(z_scores) > 3, na.rm = TRUE)
}
#apply this to the scaled data
outliers_count_3 <- sapply(data.all.scaled, count_above_3)
# Create a data frame to display the results
outliers_df_3 <- data.frame(Variable = names(outliers_count_3), Outliers_Count = outliers_count_3)
# Display the results
outliers_df_3 <- outliers_df_3 %>%
arrange(desc(Outliers_Count)) %>%
mutate(Outliers_Count = as.integer(Outliers_Count))
#count the outliers for each variable
outliers_df_3 <- outliers_df_3 %>%
mutate(Outliers_Count = ifelse(Outliers_Count > 0, Outliers_Count, NA))
print(outliers_df_3)
## Variable Outliers_Count
## ltg ltg 4
## hdl hdl 2
## age age NA
## sex sex NA
## bmi bmi NA
## map map NA
## tc tc NA
## ldl ldl NA
## tch tch NA
## glu glu NA
## y y NA
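As a small sanity check (a sketch; output not shown), the skewness can be recomputed after the replacement to confirm the heavy tails were reduced:
#recompute skewness on the predictors after the outlier replacement (e1071 is already loaded)
round(sapply(data.all.scaled[setdiff(names(data.all.scaled), "y")],
function(x) skewness(x, na.rm = TRUE)), 3)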
#Partition the scaled data into training and test sets
set.seed(123) # Set seed for reproducibility
train_index <- createDataPartition(data.all.scaled$y, p = 0.7, list = FALSE)
train_data <- as.data.frame(data.all.scaled[train_index, ])
test_data <- as.data.frame(data.all.scaled[-train_index, ])
# Check the dimensions of the training and test sets
cat("Training set dimensions are: ", dim(train_data)[1], " rows and ", dim(train_data)[2], " columns.\n")
## Training set dimensions are: 311 rows and 11 columns.
cat("Test set dimensions : ", dim(test_data)[1], " rows and ", dim(test_data)[2], " columns.\n")
## Test set dimensions : 131 rows and 11 columns.
dim(test_data)
## [1] 131 11
Linear Model
library(caret)
#fit linear model to predict y using all other variables
#apply 10 fold cross validation
set.seed(123) # Set seed for reproducibility
cv_LM <- train(y ~ ., data = train_data, method = "lm", trControl = trainControl(method = "cv", number = 10))
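Before the coefficient table, the fold-averaged resampling metrics that caret stores on the train object can be inspected directly (a quick sketch; output not shown):
#cross-validated RMSE, R-squared, and MAE averaged over the 10 folds
cv_LM$results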
#check the summary of the linear model using stargazer
#install.packages("stargazer")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(cv_LM$finalModel, type = "text", title = "Raw Data Regression Summary", digits = 3, single.row = TRUE)
##
## Raw Data Regression Summary
## ===============================================
## Dependent variable:
## ---------------------------
## .outcome
## -----------------------------------------------
## age -7.809 (74.624)
## sex -178.116** (75.318)
## bmi 532.159*** (85.155)
## map 306.876*** (81.933)
## tc -319.161 (359.229)
## ldl 136.715 (315.395)
## hdl -68.716 (220.546)
## tch 115.339 (229.445)
## ltg 521.633*** (145.345)
## glu 92.887 (79.166)
## Constant 152.877*** (4.028)
## -----------------------------------------------
## Observations 311
## R2 0.491
## Adjusted R2 0.474
## Residual Std. Error 55.401 (df = 300)
## F Statistic 28.967*** (df = 10; 300)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
library(stargazer)
#fit a null model
null_model <- lm(y ~ 1, data = train_data)
#check the summary of the null model
stargazer(null_model, type = "text", title = "Null Model Summary", digits = 3, single.row = TRUE)
##
## Null Model Summary
## ===============================================
## Dependent variable:
## ---------------------------
## y
## -----------------------------------------------
## Constant 151.887*** (4.333)
## -----------------------------------------------
## Observations 311
## R2 0.000
## Adjusted R2 0.000
## Residual Std. Error 76.409 (df = 310)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#predict the y values using the linear model
predictions_LM <- predict(cv_LM, newdata = test_data)
#compute the mean prediction error
mean_prediction_error_LM <- mean(abs(predictions_LM - test_data$y))
#compute the mean squared error
mean_squared_error_LM <- mean((predictions_LM - test_data$y)^2)
#compute the RSS
residual_sum_of_squares_LM <- sum((predictions_LM - test_data$y)^2)
#compute the R squared value
r_squared_LM <- 1 - (residual_sum_of_squares_LM / sum((test_data$y - mean(test_data$y))^2))
#compute the BIC
bic_LM <- BIC(cv_LM$finalModel)
#Compute the MAD
mad_LM <- mad(predictions_LM - test_data$y)
LR_stat <- 2 * (logLik(cv_LM$finalModel) - logLik(null_model))
#combine the results into a data frame
results_LM <- data.frame(
Model = "Linear Model",
Mean_Prediction_Error = mean_prediction_error_LM,
MSE = mean_squared_error_LM,
RSS = residual_sum_of_squares_LM,
MAD = mad_LM,
Rsquared = r_squared_LM,
BIC = bic_LM,
LR_stat = LR_stat
)
#format as a kable
#install.packages("kableExtra")
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
results_LM %>%
kable("html", caption = "Linear Model Results") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:5, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(results_LM), background = "lightblue")
Linear Model Results

| Model | Mean_Prediction_Error | MSE | RSS | MAD | Rsquared | BIC | LR_stat |
|---|---|---|---|---|---|---|---|
| Linear Model | 44.78567 | 3090.399 | 404842.3 | 56.35199 | 0.5008519 | 3437.342 | 210.1672 |
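For reference, the test-set metrics reported in this table (and in the later model tables) are the quantities computed in the chunk above: with $m$ test observations, predictions $\hat y_i$, and test-set mean $\bar y$,

$$
\text{MPE}=\frac{1}{m}\sum_{i}\lvert\hat y_i-y_i\rvert,\qquad
\text{MSE}=\frac{1}{m}\sum_{i}(\hat y_i-y_i)^2,\qquad
\text{RSS}=\sum_{i}(\hat y_i-y_i)^2,
$$

$$
R^2=1-\frac{\text{RSS}}{\sum_{i}(y_i-\bar y)^2},\qquad
\text{LR}=2\left(\ell_{\text{full}}-\ell_{\text{null}}\right),
$$

where MAD is stats::mad() applied to the test residuals and the LR statistic compares the cross-validated full model with the intercept-only null model via logLik().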
#Calculate the standard error of the mean prediction error
#using the bootstrap method
#install.packages("boot")
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:lattice':
##
## melanoma
# Define a function to calculate the mean prediction error
mean_prediction_error <- function(data, indices) {
# Resample the data
resampled_data <- data[indices, ]
# Fit the model on the resampled data
model <- lm(y ~ ., data = resampled_data)
# Predict on the test set
predictions <- predict(model, newdata = test_data)
# Calculate the mean prediction error
mean(abs(predictions - test_data$y))
}
# Perform bootstrap resampling
set.seed(123) # Set seed for reproducibility
boot_results <- boot(data = train_data, statistic = mean_prediction_error, R = 1000)
# Calculate the standard error of the mean prediction error
mean_prediction_error_se <- sd(boot_results$t)
cat("The standard error of the mean prediction error is:", mean_prediction_error_se, "\n")
## The standard error of the mean prediction error is: 1.009269
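If an interval is wanted in addition to the standard error, a percentile bootstrap confidence interval can be read from the same boot object (a minimal sketch; output not shown):
#percentile bootstrap confidence interval for the mean prediction error
boot.ci(boot_results, type = "perc")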
#plot the residual vs predicted
library(ggplot2)
# Create a data frame with fitted values and residuals
residuals_plot_data <- data.frame(
Fitted = cv_LM$finalModel$fitted.values,
Residuals = cv_LM$finalModel$residuals
)
# Create the residuals vs fitted plot
ggplot(residuals_plot_data, aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Residuals vs Fitted Values",
x = "Fitted Values",
y = "Residuals"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Best Subset
#set up parallel processing
library(doParallel)
#detect the number of cores
numCores <- detectCores()
#setup the cluster
cl <- makeCluster(5)
#register the cluster
registerDoParallel(cl)
#Apply best subset selection
install.packages("leaps")
## package 'leaps' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\andre\AppData\Local\Temp\RtmpkBu0N7\downloaded_packages
library(leaps)
library(caret)
# Create 10 folds for cross-validation
set.seed(562) # For reproducibility
folds <- createFolds(train_data$y, k = 10, list = TRUE, returnTrain = TRUE)
# Initialize a matrix to store validation errors for each fold and subset size
cv_bic <- matrix(NA, nrow = 10, ncol = 10) # 10 folds, 10 subset sizes (nvmax)
# Perform cross-validation
for (i in seq_along(folds)) {
# Split the data into training and validation sets
training_indices <- folds[[i]]
training_data <- train_data[training_indices, ]
validation_data <- train_data[-training_indices, ]
#Loop through each subset size
for (j in 1:10) {
# Fit the best subset model
best_subset <- regsubsets(y ~ ., data = training_data, nvmax = j)
# Get the coefficients for the best model
best_model <- which.min(summary(best_subset)$bic)
best_model_coefficients <- coef(best_subset, best_model)
# Create a linear model using the selected variables
selected_vars <- names(best_model_coefficients)[-1] # Exclude intercept
formula <- as.formula(paste("y ~", paste(selected_vars, collapse = "+")))
lm_model <- lm(formula, data = training_data)
# Predict on the validation set (not used further below; only the BIC is stored)
predictions <- predict(lm_model, newdata = validation_data)
# Calculate BIC for the model
bic_value <- BIC(lm_model)
# Store the BIC value
cv_bic[i, j] <- bic_value
}
}
# Calculate the average BIC for each subset size
avg_bic <- colMeans(cv_bic, na.rm = TRUE)
# Find the best subset size based on average BIC
best_subset_size <- which.min(avg_bic)
cat("The best subset size based on 10-fold cross-validated BIC is:", best_subset_size, "\n")
## The best subset size based on 10-fold cross-validated BIC is: 5
# Stop the cluster
stopCluster(cl)
#Get the coefficients for the best subset size (from the regsubsets fit in the final fold)
best_model_coefficients <- coef(best_subset, best_subset_size)
# Get the names of the selected variables
selected_vars <- names(best_model_coefficients)[-1] # Exclude intercept
cat("The selected variables for the best subset size are:", selected_vars, "\n")
## The selected variables for the best subset size are: sex bmi map hdl ltg
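A small sketch (output not shown) to visualize how the fold-averaged BIC changes with subset size and where the minimum falls:
#plot the mean cross-validated BIC against subset size and mark the chosen size
plot(seq_along(avg_bic), avg_bic, type = "b",
xlab = "Subset size", ylab = "Mean BIC across folds",
main = "Cross-Validated BIC by Subset Size")
abline(v = best_subset_size, col = "green", lty = 2)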
library(stargazer)
#Create linear model from the best subset selection
# Create a formula for the best subset model
formula_best_subset <- as.formula(paste("y ~", paste(selected_vars, collapse = "+")))
# Fit the best subset model using the training data
best_model_LM <- lm(formula_best_subset, data = train_data)
# Check the summary of the best subset model
stargazer(best_model_LM, type = "text", title = "Best Subset Selection Model Summary", digits = 3, single.row = TRUE)
##
## Best Subset Selection Model Summary
## ===============================================
## Dependent variable:
## ---------------------------
## y
## -----------------------------------------------
## sex -164.834** (73.783)
## bmi 540.858*** (82.457)
## map 310.934*** (78.433)
## hdl -257.417*** (84.008)
## ltg 436.692*** (72.293)
## Constant 152.577*** (3.271)
## -----------------------------------------------
## Observations 311
## R2 0.485
## Adjusted R2 0.477
## Residual Std. Error 55.258 (df = 305)
## F Statistic 57.548*** (df = 5; 305)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#predict the y values using the best model
predictions_best_model <- predict(best_model_LM, newdata = test_data)
#compute the mean prediction error
mean_prediction_error_best_model <- mean(abs(predictions_best_model - test_data$y))
#compute the mean squared error
mean_squared_error_best_model <- mean((predictions_best_model - test_data$y)^2)
#compute the RSS
residual_sum_of_squares_best_model <- sum((predictions_best_model - test_data$y)^2)
#compute the R squared value
r_squared_best_model <- 1 - (residual_sum_of_squares_best_model / sum((test_data$y - mean(test_data$y))^2))
#compute the BIC
bic_best_model <- BIC(best_model_LM)
#compute the MAD
mad_best_model <- mad(predictions_best_model - test_data$y)
#compute the LR statistic
LR_stat_best_model <- 2 * (logLik(cv_LM$finalModel) - logLik(best_model_LM))
#combine the results into a data frame
results_best_model <- data.frame(
Model = "Best Subset Selection Model",
Mean_Prediction_Error = mean_prediction_error_best_model,
MSE = mean_squared_error_best_model,
MAD = mad_best_model,
RSS = residual_sum_of_squares_best_model,
Rsquared = r_squared_best_model,
BIC = bic_best_model,
LR_stat = LR_stat_best_model
)
#format as a kable
results_best_model %>%
kable("html", caption = "Best Subset Selection Model Results") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:5, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(results_best_model), background = "lightblue")
Best Subset Selection Model Results

| Model | Mean_Prediction_Error | MSE | MAD | RSS | Rsquared | BIC | LR_stat |
|---|---|---|---|---|---|---|---|
| Best Subset Selection Model | 45.22154 | 3129.867 | 62.59345 | 410012.5 | 0.4944773 | 3412.167 | 3.524188 |
#plot the residual vs predicted
library(ggplot2)
# Create a data frame with predictions and residuals
ggplot(data = data.frame(Fitted = best_model_LM$fitted.values, Residuals = best_model_LM$residuals), aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Residuals vs Fitted Values (Best Subset Model)", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Ridge
#install glmnet
#install.packages("glmnet")
library(glmnet)
# Create a matrix of predictors and a vector of response variable
x <- as.matrix(train_data[, -which(names(train_data) == "y")])
y <- train_data$y
#Perform Ridge regression with cross-validation
set.seed(562) # Set seed for reproducibility
cv_ridge <- cv.glmnet(x, y, alpha = 0, nfolds = 10)
# Plot the cross-validation results
plot(cv_ridge)
#highlight the 1SE lambda with green vertical line
abline(v = log(cv_ridge$lambda.1se), col = "green", lty = 2, lwd = 2)
#add legend to the plot denoting the 1SE lambda
legend("topright", legend = "1SE Lambda", col = "green", lty = 2, lwd = 2)

#Extract the 1SE lambda
lambda_ridge <- cv_ridge$lambda.1se
cat("The optimal lambda for Ridge regression is:", lambda_ridge, "\n")
## The optimal lambda for Ridge regression is: 66.66497
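For comparison, both candidate penalties returned by cv.glmnet can be listed side by side (a quick sketch; output not shown). lambda.min minimizes the cross-validation error, while lambda.1se is the largest penalty within one standard error of that minimum and gives a more heavily shrunken fit:
#compare the minimum-CV-error lambda with the more conservative 1SE lambda
data.frame(rule = c("lambda.min", "lambda.1se"),
lambda = c(cv_ridge$lambda.min, cv_ridge$lambda.1se))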
library(glmnet)
library(kableExtra)
library(dplyr)
# Fit the Ridge regression model using the 1SE lambda
ridge_model <- glmnet(x, y, alpha = 0, lambda = lambda_ridge)
# Predict on the test set
predictions_ridge <- predict(ridge_model, s = lambda_ridge, newx = as.matrix(test_data[, -which(names(test_data) == "y")]))
# Compute the mean prediction error
mean_prediction_error_ridge <- mean(abs(predictions_ridge - test_data$y))
# Compute the mean squared error
mean_squared_error_ridge <- mean((predictions_ridge - test_data$y)^2)
# Compute the RSS
residual_sum_of_squares_ridge <- sum((predictions_ridge - test_data$y)^2)
# Compute the R squared value
r_squared_ridge <- 1 - (residual_sum_of_squares_ridge / sum((test_data$y - mean(test_data$y))^2))
# Compute the MAD
mad_ridge <- mad(predictions_ridge - test_data$y)
# Compute the LR statistic
n <- length(y)
sigma_squared_Ridge <- residual_sum_of_squares_ridge/n
logLik_ridge <- -n/2 * log(2 * pi) - n/2 * log(sigma_squared_Ridge) - 1/(2 * sigma_squared_Ridge) * residual_sum_of_squares_ridge
LR_stat_ridge <- 2 * ( logLik(cv_LM$finalModel) - logLik_ridge)
# Compute the BIC
# Manually calculate BIC
n <- nrow(train_data) # Number of observations
k <- length(coef(ridge_model, s = lambda_ridge))
# Number of parameters (including intercept)
bic_ridge <- n * log(residual_sum_of_squares_ridge / n) + log(n) * k
# Combine the results into a data frame
results_ridge <- data.frame(
Model = "Ridge Regression Model",
Mean_Prediction_Error = mean_prediction_error_ridge,
MSE = mean_squared_error_ridge,
MAD = mad_ridge,
RSS = residual_sum_of_squares_ridge,
Rsquared = r_squared_ridge,
BIC = bic_ridge,
LR_stat = LR_stat_ridge
)
#format as a kable
results_ridge %>%
kable("html", caption = "Ridge Regression Model Results") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:5, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(results_ridge), background = "lightblue")
Ridge Regression Model Results

| Model | Mean_Prediction_Error | MSE | MAD | RSS | Rsquared | BIC | LR_stat |
|---|---|---|---|---|---|---|---|
| Ridge Regression Model | 49.69468 | 3551.386 | 67.61069 | 465231.6 | 0.4263953 | 2336.703 | -212.3201 |
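Because glmnet fits do not carry a logLik, the chunk above plugs the residual sum of squares into a Gaussian log-likelihood with $\hat\sigma^2=\text{RSS}/n$ and computes BIC by hand:

$$
\ell=-\frac{n}{2}\log(2\pi)-\frac{n}{2}\log\hat\sigma^2-\frac{\text{RSS}}{2\hat\sigma^2},
\qquad
\text{BIC}=n\log\!\left(\frac{\text{RSS}}{n}\right)+k\log n,
$$

where $n$ is taken as the number of training observations, the RSS comes from the test-set predictions, and $k$ counts the coefficients including the intercept. The same recipe is reused for the lasso and the quadratic ridge model below.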
#install the broom package
#install.packages("broom")
library(broom)
#tidy the ridge model
ridge_model_tidy <- tidy(ridge_model, s = lambda_ridge)
#format the ridge model coefficients as a kable
ridge_model_tidy %>%
kable("html", caption = "Ridge Regression Model Coefficients") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:3, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(ridge_model_tidy), background = "lightblue")
Ridge Regression Model Coefficients

| term | step | estimate | lambda | dev.ratio |
|---|---|---|---|---|
| (Intercept) | 1 | 153.45143 | 66.66497 | 0.4416468 |
| age | 1 | 37.52676 | 66.66497 | 0.4416468 |
| sex | 1 | -56.19174 | 66.66497 | 0.4416468 |
| bmi | 1 | 317.73133 | 66.66497 | 0.4416468 |
| map | 1 | 206.82558 | 66.66497 | 0.4416468 |
| tc | 1 | 23.15439 | 66.66497 | 0.4416468 |
| ldl | 1 | -15.80633 | 66.66497 | 0.4416468 |
| hdl | 1 | -141.18176 | 66.66497 | 0.4416468 |
| tch | 1 | 125.91879 | 66.66497 | 0.4416468 |
| ltg | 1 | 243.45232 | 66.66497 | 0.4416468 |
| glu | 1 | 115.65020 | 66.66497 | 0.4416468 |
#store the results and coefficient kables as objects (not arranged further here)
library(gridExtra)
library(grid)
library(htmltools)
# Create individual plots
p12 <- results_ridge %>%
kable("html", caption = "Ridge Regression Model Results") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:5, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(results_ridge), background = "lightblue")
p13 <- ridge_model_tidy %>%
kable("html", caption = "Ridge Regression Model Coefficients") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:3, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(ridge_model_tidy), background = "lightblue")
#plot the residual vs predicted from the ridge model
library(ggplot2)
# Create a data frame with predictions and residuals
predictions_ridge <- as.vector(predictions_ridge)
residuals_ridge <- as.vector(predictions_ridge - test_data$y)
ggplot(data = data.frame(Fitted = predictions_ridge, Residuals = residuals_ridge), aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Residuals vs Fitted Values (Ridge Model)", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

LASSO
library(glmnet)
#Perform Lasso regression with cross validation
set.seed(562) # Set seed for reproducibility
cv_lasso <- cv.glmnet(x, y, alpha = 1, nfolds = 10)
# Plot the cross-validation results
plot(cv_lasso)
#Extract the 1SE lambda
lambda_lasso <- cv_lasso$lambda.1se
cat("The optimal lambda for Lasso regression is:", lambda_lasso, "\n")
## The optimal lambda for Lasso regression is: 10.13248
#highlight the 1SE lambda with green vertical line
abline(v = log(cv_lasso$lambda.1se), col = "green", lty = 2, lwd = 2)
#add legend to the plot denoting the 1SE lambda
legend("topright", legend = "1SE Lambda", col = "green", lty = 2, lwd = 2)

# Fit the Lasso regression model using the 1SE lambda
lasso_model <- glmnet(x, y, alpha = 1, lambda = lambda_lasso)
# Predict on the test set
predictions_lasso <- predict(lasso_model, s = lambda_lasso, newx = as.matrix(test_data[, -which(names(test_data) == "y")]))
# Compute the mean prediction error
mean_prediction_error_lasso <- mean(abs(predictions_lasso - test_data$y))
# Compute the mean squared error
mean_squared_error_lasso <- mean((predictions_lasso - test_data$y)^2)
# Compute the RSS
residual_sum_of_squares_lasso <- sum((predictions_lasso - test_data$y)^2)
# Compute the R squared value
r_squared_lasso <- 1 - (residual_sum_of_squares_lasso / sum((test_data$y - mean(test_data$y))^2))
#compute the MAD
mad_lasso <- mad(predictions_lasso - test_data$y)
# Compute the LR statistic
n <- length(y)
sigma_squared_Lasso <- residual_sum_of_squares_lasso/n
logLik_lasso <- -n/2 * log(2 * pi) - n/2 * log(sigma_squared_Lasso) - 1/(2 * sigma_squared_Lasso) * residual_sum_of_squares_lasso
LR_stat_lasso <- 2 * ( logLik(cv_LM$finalModel) - logLik_lasso)
# Compute the BIC
# Manually calculate BIC
n <- nrow(train_data) # Number of observations
k <- length(coef(lasso_model, s = lambda_lasso))
# Number of parameters (including intercept)
bic_lasso <- n * log(residual_sum_of_squares_lasso / n) + log(n) * k
# Combine the results into a data frame
results_lasso <- data.frame(
Model = "Lasso Regression Model",
Mean_Prediction_Error = mean_prediction_error_lasso,
MSE = mean_squared_error_lasso,
MAD = mad_lasso,
RSS = residual_sum_of_squares_lasso,
Rsquared = r_squared_lasso,
BIC = bic_lasso,
LR_stat = LR_stat_lasso
)
#format as a kable
results_lasso %>%
kable("html", caption = "Lasso Regression Model Results") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:5, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(results_lasso), background = "lightblue")
Lasso Regression Model Results

| Model | Mean_Prediction_Error | MSE | MAD | RSS | Rsquared | BIC | LR_stat |
|---|---|---|---|---|---|---|---|
| Lasso Regression Model | 50.03535 | 3631.114 | 65.17592 | 475675.9 | 0.413518 | 2343.607 | -205.4155 |
#install the broom package
#install.packages("broom")
library(broom)
#tidy the lasso model
lasso_model_tidy <- tidy(lasso_model, s = lambda_lasso)
#format the lasso model coefficients as a kable
lasso_model_tidy %>%
kable("html", caption = "Lasso Regression Model Coefficients") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:3, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(lasso_model_tidy), background = "lightblue")
Lasso Regression Model Coefficients

| term | step | estimate | lambda | dev.ratio |
|---|---|---|---|---|
| (Intercept) | 1 | 152.22289 | 10.13248 | 0.4429363 |
| bmi | 1 | 487.03940 | 10.13248 | 0.4429363 |
| map | 1 | 146.51797 | 10.13248 | 0.4429363 |
| hdl | 1 | -50.29042 | 10.13248 | 0.4429363 |
| ltg | 1 | 354.59762 | 10.13248 | 0.4429363 |
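The same selection can be pulled programmatically; a minimal sketch (output not shown) counting the predictors the 1SE lasso retains:
#count the nonzero coefficients (excluding the intercept) at the 1SE lambda
lasso_coefs <- coef(lasso_model, s = lambda_lasso)
sum(lasso_coefs[-1, 1] != 0)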
#create a vector of the predicted values
predictions_lasso <- as.vector(predictions_lasso)
residuals_lasso <- as.vector(predictions_lasso - test_data$y)
#plot the residual vs predicted from the lasso model
library(ggplot2)
# Create a data frame with predictions and residuals
ggplot(data = data.frame(Fitted = predictions_lasso, Residuals = residuals_lasso), aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Residuals vs Fitted Values (Lasso Model)", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Metrics Table
library(dplyr)
#create 1 data table with all the results
# Combine all results into a single data frame
all_results <- rbind(results_LM, results_best_model, results_ridge, results_lasso)
#format as a kable
all_results %>%
kable("html", caption = "All Model Results") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:5, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(all_results), background = "lightblue")
All Model Results

| Model | Mean_Prediction_Error | MSE | RSS | MAD | Rsquared | BIC | LR_stat |
|---|---|---|---|---|---|---|---|
| Linear Model | 44.78567 | 3090.399 | 404842.3 | 56.35199 | 0.5008519 | 3437.342 | 210.167174 |
| Best Subset Selection Model | 45.22154 | 3129.867 | 410012.5 | 62.59345 | 0.4944773 | 3412.167 | 3.524188 |
| Ridge Regression Model | 49.69468 | 3551.386 | 465231.6 | 67.61069 | 0.4263953 | 2336.703 | -212.320094 |
| Lasso Regression Model | 50.03535 | 3631.114 | 475675.9 | 65.17592 | 0.4135180 | 2343.607 | -205.415464 |
Scatter Plots
# Create a line plot of each predictor against y
library(ggplot2)
library(gridExtra)
# Create individual plots for each variable
p7 <- ggplot(data.all, aes(x = age, y = y)) +
geom_line(color = "blue") +
labs(title = "Age vs Y", x = "Age", y = "Y") +
theme_minimal()
p8 <- ggplot(data.all, aes(x = bmi, y = y)) +
geom_line(color = "blue") +
labs(title = "BMI vs Y", x = "BMI", y = "Y") +
theme_minimal()
p9 <- ggplot(data.all, aes(x = map, y = y)) +
geom_line(color = "blue") +
labs(title = "MAP vs Y", x = "MAP", y = "Y") +
theme_minimal()
p10 <- ggplot(data.all, aes(x = tc, y = y)) +
geom_line(color = "blue") +
labs(title = "TC vs Y", x = "TC", y = "Y") +
theme_minimal()
p11 <- ggplot(data.all, aes(x = ldl, y = y)) +
geom_line(color = "blue") +
labs(title = "LDL vs Y", x = "LDL", y = "Y") +
theme_minimal()
p12 <- ggplot(data.all, aes(x = hdl, y = y)) +
geom_line(color = "blue") +
labs(title = "HDL vs Y", x = "HDL", y = "Y") +
theme_minimal()
p13 <- ggplot(data.all, aes(x = tch, y = y)) +
geom_line(color = "blue") +
labs(title = "TCH vs Y", x = "TCH", y = "Y") +
theme_minimal()
p14 <- ggplot(data.all, aes(x = ltg, y = y)) +
geom_line(color = "blue") +
labs(title = "LTG vs Y", x = "LTG", y = "Y") +
theme_minimal()
p15 <- ggplot(data.all, aes(x = glu, y = y)) +
geom_line(color = "blue") +
labs(title = "GLU vs Y", x = "GLU", y = "Y") +
theme_minimal()
# Arrange the plots in a grid
grid.arrange(p7, p8, p9, p10, p11, p12, p13, p14, p15, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p7, p8, p9, p10, p11, p12, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p13, p14, p15, ncol = 3)

# Create a scatter plot of each predictor against y
library(ggplot2)
library(gridExtra)
# Create individual scatter plots for each variable
p1 <- ggplot(data.all, aes(x = age, y = y)) +
geom_point(color = "blue") +
labs(title = "Age vs Y", x = "Age", y = "Y") +
theme_minimal()
p2 <- ggplot(data.all, aes(x = bmi, y = y)) +
geom_point(color = "blue") +
labs(title = "BMI vs Y", x = "BMI", y = "Y") +
theme_minimal()
p3 <- ggplot(data.all, aes(x = map, y = y)) +
geom_point(color = "blue") +
labs(title = "MAP vs Y", x = "MAP", y = "Y") +
theme_minimal()
p4 <- ggplot(data.all, aes(x = tc, y = y)) +
geom_point(color = "blue") +
labs(title = "TC vs Y", x = "TC", y = "Y") +
theme_minimal()
p5 <- ggplot(data.all, aes(x = ldl, y = y)) +
geom_point(color = "blue") +
labs(title = "LDL vs Y", x = "LDL", y = "Y") +
theme_minimal()
p6 <- ggplot(data.all, aes(x = hdl, y = y)) +
geom_point(color = "blue") +
labs(title = "HDL vs Y", x = "HDL", y = "Y") +
theme_minimal()
p7 <- ggplot(data.all, aes(x = tch, y = y)) +
geom_point(color = "blue") +
labs(title = "TCH vs Y", x = "TCH", y = "Y") +
theme_minimal()
p8 <- ggplot(data.all, aes(x = ltg, y = y)) +
geom_point(color = "blue") +
labs(title = "LTG vs Y", x = "LTG", y = "Y") +
theme_minimal()
p9 <- ggplot(data.all, aes(x = glu, y = y)) +
geom_point(color = "blue") +
labs(title = "GLU vs Y", x = "GLU", y = "Y") +
theme_minimal()
# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

# Arrange the plots in a grid
grid.arrange(p7, p8, p9, ncol = 3)
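As a more compact alternative to building nine separate plot objects, the same scatter plots can be faceted in a single call (a sketch using tidyr, which is already loaded; output not shown):
#faceted scatter plot of every predictor against y
data.all %>%
pivot_longer(cols = -y, names_to = "predictor", values_to = "value") %>%
ggplot(aes(x = value, y = y)) +
geom_point(color = "blue", alpha = 0.6) +
facet_wrap(~ predictor, scales = "free_x", ncol = 3) +
labs(title = "Predictors vs Y", x = "Predictor value", y = "Y") +
theme_minimal()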

Alternate Ridge: Quadratic Terms
#add squared terms from the scaled data to a new data frame
#load the dplyr package
library(dplyr)
#Create a new variable for bmi squared
data.all.scaled2 <- data.all.scaled %>%
mutate(bmi_squared = bmi^2)
#Create a new variable for hdl squared (note: this rebuilds data.all.scaled2 from
#data.all.scaled, so bmi_squared is dropped and only the hdl^2 term is kept)
data.all.scaled2 <- data.all.scaled %>%
mutate(htl_squared = hdl^2)
str(data.all.scaled2)
## 'data.frame': 442 obs. of 12 variables:
## $ age : num 0.03943 -0.00188 0.09219 -0.0823 0.00541 ...
## $ sex : num 0.0507 -0.0446 0.0507 -0.0446 -0.0446 ...
## $ bmi : num 0.0522 -0.0596 0.0394 -0.012 -0.0404 ...
## $ map : num 0.02125 -0.02727 -0.00571 -0.03849 0.02125 ...
## $ tc : num -0.04421 -0.00845 -0.04558 0.01219 0.00393 ...
## $ ldl : num -0.0348 -0.0192 -0.0342 0.025 0.0156 ...
## $ hdl : num -0.0434 0.07442 -0.03235 -0.03603 0.00814 ...
## $ tch : num -0.00262 -0.04516 -0.00262 0.03067 -0.00262 ...
## $ ltg : num 0.02235 -0.04767 0.00291 0.0259 -0.02682 ...
## $ glu : num -0.01787 -0.09851 -0.02642 -0.00943 -0.04824 ...
## $ y : num 151 75 141 206 135 97 138 63 110 310 ...
## $ htl_squared: num 1.88e-03 5.54e-03 1.05e-03 1.30e-03 6.63e-05 ...
head(data.all.scaled2)
## age sex bmi map tc ldl
## 1 0.039431878 0.05068012 0.05221748 0.021247857 -0.044206270 -0.03481480
## 2 -0.001878745 -0.04464164 -0.05962698 -0.027270752 -0.008448088 -0.01916152
## 3 0.092193787 0.05068012 0.03935396 -0.005713851 -0.045581142 -0.03418871
## 4 -0.082295234 -0.04464164 -0.01198357 -0.038494846 0.012191892 0.02499368
## 5 0.005409915 -0.04464164 -0.04036247 0.021247857 0.003934990 0.01559734
## 6 -0.085387666 -0.04464164 -0.04570613 -0.019954320 -0.068949058 -0.07925735
## hdl tch ltg glu y htl_squared
## 1 -0.043396525 -0.002615009 0.022352654 -0.017874473 151 1.883258e-03
## 2 0.074424140 -0.045162796 -0.047668264 -0.098508964 75 5.538953e-03
## 3 -0.032353522 -0.002615009 0.002911379 -0.026424040 141 1.046750e-03
## 4 -0.036034584 0.030670249 0.025899440 -0.009426102 206 1.298491e-03
## 5 0.008142237 -0.002615009 -0.026815883 -0.048243186 135 6.629603e-05
## 6 0.041280735 -0.099198741 -0.032898611 -0.103234502 97 1.704099e-03
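If both quadratic terms were meant to be carried forward together, a single mutate would keep them in one frame (a sketch with a hypothetical object name, not used below):
#hypothetical: keep both bmi^2 and hdl^2 in one data frame
data.all.scaled.quad <- data.all.scaled %>%
mutate(bmi_squared = bmi^2,
hdl_squared = hdl^2)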
#partition the data into training and test sets
set.seed(123) # Set seed for reproducibility
train_index2 <- createDataPartition(data.all.scaled2$y, p = 0.7, list = FALSE)
train_data2 <- as.data.frame(data.all.scaled2[train_index2, ])
test_data2<- as.data.frame(data.all.scaled2[-train_index2, ])
# Check the dimensions of the training and test sets
cat("Training set dimensions are: ", dim(train_data2)[1], " rows and ", dim(train_data2)[2], " columns.\n")
## Training set dimensions are: 311 rows and 12 columns.
cat("Test set dimensions : ", dim(test_data2)[1], " rows and ", dim(test_data2)[2], " columns.\n")
## Test set dimensions : 131 rows and 12 columns.
#perform a ridge model with cross validation
# Create a matrix of predictors and a vector of response variable
x2 <- as.matrix(train_data2[, -which(names(train_data2) == "y")])
y2 <- train_data2$y
#Perform Ridge regression with cross-validation
set.seed(562) # Set seed for reproducibility
cv_ridge2 <- cv.glmnet(x2, y2, alpha = 0, nfolds = 10)
# Plot the cross-validation results
plot(cv_ridge2)
#highlight the 1SE lambda with green vertical line
abline(v = log(cv_ridge2$lambda.1se), col = "green", lty = 2, lwd = 2)
#add legend to the plot denoting the 1SE lambda
legend("topright", legend = "1SE Lambda", col = "green", lty = 2, lwd = 2)

#Extract the 1SE lambda
lambda_ridge2 <- cv_ridge2$lambda.1se
cat("The optimal lambda for Ridge regression is:", lambda_ridge2, "\n")
## The optimal lambda for Ridge regression is: 66.66497
# Fit the Ridge regression model using the 1SE lambda
ridge_model2 <- glmnet(x2, y2, alpha = 0, lambda = lambda_ridge2)
# Predict on the test set
predictions_ridge2 <- predict(ridge_model2, s = lambda_ridge2, newx = as.matrix(test_data2[, -which(names(test_data2) == "y")]))
# Compute the mean prediction error
mean_prediction_error_ridge2 <- mean(abs(predictions_ridge2 - test_data2$y))
# Compute the mean squared error
mean_squared_error_ridge2 <- mean((predictions_ridge2 - test_data2$y)^2)
# Compute the RSS
residual_sum_of_squares_ridge2 <- sum((predictions_ridge2 - test_data2$y)^2)
# Compute the R squared value
r_squared_ridge2 <- 1 - (residual_sum_of_squares_ridge2 / sum((test_data2$y - mean(test_data2$y))^2))
# Compute the MAD
mad_ridge2 <- mad(predictions_ridge2 - test_data2$y)
# Compute the LR statistic
n2 <- length(y2)
sigma_squared_Ridge2 <- residual_sum_of_squares_ridge2/n2
logLik_ridge2 <- -n2/2 * log(2 * pi) - n2/2 * log(sigma_squared_Ridge2) - 1/(2 * sigma_squared_Ridge2) * residual_sum_of_squares_ridge2
LR_stat_ridge2 <- 2 * ( logLik(cv_LM$finalModel) - logLik_ridge2)
# Compute the BIC
# Manually calculate BIC
n2 <- nrow(train_data2) # Number of observations
k2 <- length(coef(ridge_model2, s = lambda_ridge2))
# Number of parameters (including intercept)
bic_ridge2 <- n2 * log(residual_sum_of_squares_ridge2 / n2) + log(n2) * k2
# Combine the results into a data frame
results_ridge2 <- data.frame(
Model = "Ridge Regression Model with BMI Squared",
Mean_Prediction_Error = mean_prediction_error_ridge2,
MSE = mean_squared_error_ridge2,
MAD = mad_ridge2,
RSS = residual_sum_of_squares_ridge2,
Rsquared = r_squared_ridge2,
BIC = bic_ridge2,
LR_stat = LR_stat_ridge2
)
#format as a kable
results_ridge2 %>%
kable("html", caption = "Ridge Regression Model with BMI Squared Results") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:5, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(results_ridge2), background = "lightblue")
Ridge Regression Model with BMI Squared Results

| Model | Mean_Prediction_Error | MSE | MAD | RSS | Rsquared | BIC | LR_stat |
|---|---|---|---|---|---|---|---|
| Ridge Regression Model with BMI Squared | 49.67256 | 3550.158 | 67.59386 | 465070.8 | 0.4265936 | 2342.335 | -212.4276 |
#show the coefficient estimates
#install the broom package
#install.packages("broom")
library(broom)
#tidy the ridge model
ridge_model2_tidy <- tidy(ridge_model2, s = lambda_ridge2)
#format the ridge model coefficients as a kable
ridge_model2_tidy %>%
kable("html", caption = "Ridge Regression Model with BMI Squared Coefficients") %>%
kable_styling("striped", full_width = F) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:3, color = "black") %>%
row_spec(0, bold = TRUE, color = "white", background = "Grey") %>%
row_spec(1:nrow(ridge_model2_tidy), background = "lightblue")
Ridge Regression Model with BMI Squared Coefficients

| term | step | estimate | lambda | dev.ratio |
|---|---|---|---|---|
| (Intercept) | 1 | 153.55218 | 66.66497 | 0.4416078 |
| age | 1 | 37.77190 | 66.66497 | 0.4416078 |
| sex | 1 | -56.17585 | 66.66497 | 0.4416078 |
| bmi | 1 | 317.63017 | 66.66497 | 0.4416078 |
| map | 1 | 206.66046 | 66.66497 | 0.4416078 |
| tc | 1 | 23.19310 | 66.66497 | 0.4416078 |
| ldl | 1 | -16.09537 | 66.66497 | 0.4416078 |
| hdl | 1 | -140.60619 | 66.66497 | 0.4416078 |
| tch | 1 | 125.97219 | 66.66497 | 0.4416078 |
| ltg | 1 | 243.58352 | 66.66497 | 0.4416078 |
| glu | 1 | 115.71980 | 66.66497 | 0.4416078 |
| htl_squared | 1 | -52.20919 | 66.66497 | 0.4416078 |
# Create a data frame with predictions and residuals
predictions_ridge2 <- as.vector(predictions_ridge2)
residuals_ridge2 <- as.vector(predictions_ridge2 - test_data2$y)
#plot the residual vs predicted from the ridge model
library(ggplot2)
# Create a data frame with predictions and residuals
ggplot(data = data.frame(Fitted = predictions_ridge2, Residuals = residuals_ridge2), aes(x = Fitted, y = Residuals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Residuals vs Fitted Values (Ridge Model with BMI Squared)", x = "Fitted Values", y = "Residuals") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
