Chemical Signatures of Quality: Predicting White Wine Ratings Using Machine Learning

Yimeng Sun

Introduction

White wine quality reflects a complex interplay of several chemical factors. During the production and storage of white wine, oxidation reactions occur, destroying certain fruit aromas and lowering the wine's quality. In this study, I use the k-nearest neighbors (KNN) method to investigate whether the quality of white wines can be predicted from different chemical traits of the wine. Wine quality is rated on a 9-point scale. The predictors selected are volatile acidity, chlorides, density, pH, and sulphates; the process of choosing them is shown in the exploratory data analysis.

The dataset used in this study covers white variants of the Portuguese "Vinho Verde" wine: https://archive.ics.uci.edu/dataset/186/wine+quality. (Note: the original dataset contains both red wine and white wine; I focus on white wine in this study.) The columns and their explanations are as follows:

  • fixed acidity: The set of low-volatility organic acids in wine, such as malic, lactic, tartaric, and citric acids.
  • volatile acidity: The wine's gaseous acids, which contribute to the smell and taste of vinegar in wine.
  • citric acid: The amount of citric acid in wine.
  • residual sugar: The sugar remaining in the wine after fermentation by yeasts.
  • chlorides: The amount of chlorides in wine.
  • free sulfur dioxide: The unreacted sulfur dioxide, made up mostly of the molecular (SO2) and bisulfite (HSO3−) forms.
  • total sulfur dioxide: The total amount of sulfur dioxide in wine.
  • density: The density of the wine.
  • pH: A measure of how acidic or basic the wine is.
  • sulphates: The sulphate content of the wine.
  • alcohol: The alcohol content of the wine.

Methods and Results

Preprocessing and exploratory data analysis

  1. Importing libraries and downloading the dataset from the internet.

  2. Splitting the data into training and testing sets, working only with the training set until the very end.

  3. Summarizing the training set to form expectations about how the classifier should work.

  4. Visualizing the relationship between each variable and wine quality, and choosing predictors.

Building the KNN classification model

  1. Balancing the data set.

Since the original dataset is not evenly distributed across quality levels (most observations fall around qualities 5-7), the dataset is balanced so that the prediction model can predict each quality level properly.

  2. Using the recipe function to center and scale the data.

  3. Performing 5-fold cross-validation with vfold_cv on the balanced training data. I used 5 folds because a single split makes the results depend strongly on which observations end up in the validation set; averaging over several folds gives a more reliable accuracy estimate.

  4. Creating a k-nearest neighbors model with neighbours = tune() instead of a fixed value, in order to find the best value of k.

  5. Adding the recipe and model to a workflow and fitting it with tune_grid. This workflow runs cross-validation over the range of k values specified in k_vals.

  6. Finding the best k value by filtering for accuracy and plotting the accuracy estimate (y-axis) against neighbours (x-axis). I also filtered for roc_auc and plotted the roc_auc score against neighbours. Based on the two plots, I selected k = 1, k = 2, k = 4, and k = 5 as candidates.

Testing our classifier and choosing the best k value

  1. Retraining the model with the selected k values and choosing as the target model the one with the highest accuracy on the testing set.

  2. Making a new model specification for each of the four selected k values, combining it with the recipe made earlier in a workflow, and fitting the classifier to the balanced training set.

  3. Using predict() on the testing set to evaluate each classifier's prediction accuracy on data it had not seen before, and producing a confusion matrix to get a sense of which classifier was more accurate.

  4. Comparing the accuracy of the four models. Among the models with k = 1, 2, 4, and 5, accuracy on the testing set is highest when k = 1 and k = 2. Since the model with k = 2 is less sensitive to single noisy neighbours than the model with k = 1, the classifier chosen is the model with k = 2. Our classifier was 90.36% accurate and labeled 1107 out of 1225 observations correctly.

Preprocessing and Exploratory Data Analysis

In [1]:
install.packages("cowplot")
install.packages("themis")
install.packages("kknn")
Here we installed the libraries kknn to create our prediction model, cowplot to arrange the plots from the exploratory analysis, and themis to balance the dataset.

In [2]:
# Loading package

library(tidyverse)
library(repr)
library(tidymodels)
library(cowplot)
library(themis)
library(kknn)
options(repr.matrix.max.rows = 9)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.2     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.1
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──

✔ broom        1.0.2     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.2
✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
✔ parsnip      1.0.3     ✔ yardstick    1.1.0
✔ recipes      1.0.4     

── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Dig deeper into tidy modeling with R at https://www.tmwr.org

In [8]:
# Import dataset

download.file("https://github.com/PedroMMHernandes/GroupProject_DSCI/raw/main/wine.zip", "wine.zip")
unzip("wine.zip", "winequality-white.csv")
In [9]:
set.seed(2024)

white_wine_data <- read_csv2("winequality-white.csv", show_col_types = FALSE) |> # read the ';'-delimited file
    mutate_if(is.character, as.numeric) |>
    mutate(quality = as.factor(quality))

head(white_wine_data)
ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.

A tibble: 6 × 12
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
<dbl> | <chr> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <chr> | <dbl> | <chr> | <dbl> | <dbl>
70.270.3620.70.045451701.001 30.45 886
630.3 0.341.6 0.049141320.994 330.49 956
810.280.4 6.9 0.05 30 970.99513260.441016
720.230.328.5 0.058471860.99563190.4 996
720.230.328.5 0.058471860.99563190.4 996
810.280.4 6.9 0.05 30 970.99513260.441016

Table 1: Original data set

The variables in this data set each bear some relationship to wine quality. We will now explore which of them are suitable predictors of white wine quality.

Splitting our data into training and testing sets

Before working on our model, we need to split our data into training and testing sets. We will use initial_split to split our dataframe into 75% training and 25% testing, since it shuffles the data for us and, with strata, ensures a consistent proportion of each class in both sets.

In [9]:
#split of the unbalanced dataset

wine_data_split <- white_wine_data |>           
    initial_split(prop = 0.75, strata = quality)  
wine_data_train <- training(wine_data_split)    
wine_data_test <- testing(wine_data_split)
In [17]:
# the summary of the training data set 
summary(wine_data_train)

wine_data_train |>
    group_by(quality) |>
    summarise(across(`fixed acidity`:alcohol, \(x) mean(x, na.rm = TRUE)))
 fixed acidity    volatile acidity  citric acid    residual sugar  
 Min.   :  5.00   Min.   :0.0800   Min.   :0.000   Min.   : 0.600  
 1st Qu.: 61.00   1st Qu.:0.2100   1st Qu.:0.270   1st Qu.: 1.700  
 Median : 67.00   Median :0.2600   Median :0.320   Median : 5.250  
 Mean   : 62.45   Mean   :0.2781   Mean   :0.334   Mean   : 6.401  
 3rd Qu.: 73.00   3rd Qu.:0.3200   3rd Qu.:0.390   3rd Qu.: 9.850  
 Max.   :715.00   Max.   :1.1000   Max.   :1.660   Max.   :65.800  
                                                                   
   chlorides       free sulfur dioxide total sulfur dioxide    density      
 Min.   :0.00900   Min.   :   2.00     Min.   :  18.0       Min.   :0.9874  
 1st Qu.:0.03600   1st Qu.:  23.00     1st Qu.: 108.0       1st Qu.:0.9918  
 Median :0.04300   Median :  34.00     Median : 134.0       Median :0.9938  
 Mean   :0.04595   Mean   :  41.25     Mean   : 148.8       Mean   :0.9940  
 3rd Qu.:0.05000   3rd Qu.:  46.00     3rd Qu.: 168.0       3rd Qu.:0.9961  
 Max.   :0.34600   Max.   :1465.00     Max.   :3075.0       Max.   :1.0390  
                                                                            
       pH          sulphates       alcohol          quality 
 Min.   :  3.0   Min.   :0.22   Min.   :8.000e+00   3:  15  
 1st Qu.:304.0   1st Qu.:0.41   1st Qu.:9.200e+01   4: 124  
 Median :316.0   Median :0.48   Median :1.010e+02   5:1100  
 Mean   :287.5   Mean   :0.49   Mean   :1.760e+12   6:1644  
 3rd Qu.:326.0   3rd Qu.:0.55   3rd Qu.:1.130e+02   7: 656  
 Max.   :382.0   Max.   :1.08   Max.   :9.733e+14   8: 132  
                                                    9:   2  
A tibble: 7 × 12
quality | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl>
373.400000.33166670.34200005.1666670.05680000204.40000335.60000.9943493283.46670.45333338.240000e+01
464.637100.38745970.29153234.5383060.05045161 42.43548142.23390.9941348289.68550.46798399.595968e+01
564.509090.30058180.33818187.3085910.05202364 40.52727154.66090.9952705287.66550.48235453.130606e+12
661.814480.26003950.33789546.4786500.04511618 40.05049146.06200.9939711286.68310.49077861.280211e+12
760.809450.26473320.32550305.2119660.03814024 42.33384132.63870.9924701286.05180.50641771.398171e+12
858.000000.27333330.32931825.7318180.03918182 37.23485200.12880.9923600301.18180.48924241.035076e+02
972.500000.25000000.42500002.1000000.03150000 29.00000126.00000.9904250332.50000.45000001.270000e+02

Tables 2 and 3: The summary of the data set

From these tables we can examine the initial behavior of each variable. Table 2 shows how spread out each variable is, which guides how to continue. The possible predictors that showed warning signs were fixed acidity, free sulfur dioxide, total sulfur dioxide, and residual sugar, because all of them have outliers far from the mean, which could distort the data after being scaled.

To confirm this, and to take a closer look at the other variables, we plotted density graphs of each variable in relation to quality, as seen below:

Visualizing the relationship between each variable and wine quality, and choosing predictors

In [21]:
#GGpair plot set 1

options(repr.plot.width = 10, repr.plot.height = 4)

plot_va <- wine_data_train |>
    ggplot(aes(x = `volatile acidity`, fill = quality)) +
    geom_density() +
    labs(x = "Volatile acidity %", fill = "Quality on a scale of 1 to 10")

plot_fa <- wine_data_train |>
    ggplot(aes(x = `fixed acidity`, fill = quality)) +
    geom_density() +
    labs(x = "Fixed acidity U/L", fill = "Quality on a scale of 1 to 10") +
    xlim(0, 200)

plot_ca <- wine_data_train |>
    ggplot(aes(x = `citric acid`, fill = quality)) +
    geom_density() +
    labs(x = "Citric acid %", fill = "Quality on a scale of 1 to 10")

plot_rs <- wine_data_train |>
    ggplot(aes(x = `residual sugar`, fill = quality)) +
    geom_density() +
    labs(x = "Residual sugar U/L", fill = "Quality on a scale of 1 to 10")

plot_ch <- wine_data_train |>
    ggplot(aes(x = chlorides, fill = quality)) +
    geom_density() +
    labs(x = "Chlorides %", fill = "Quality on a scale of 1 to 10")

plot_fsd <- wine_data_train |>
    ggplot(aes(x = `free sulfur dioxide`, fill = quality)) +
    geom_density() +
    labs(x = "Free sulfur dioxide U/L", fill = "Quality on a scale of 1 to 10")

plot_tsd <- wine_data_train |>
    ggplot(aes(x = `total sulfur dioxide`, fill = quality)) +
    geom_density() +
    labs(x = "Total sulfur dioxide U/L", fill = "Quality on a scale of 1 to 10")

plot_de <- wine_data_train |>
    ggplot(aes(x = density, fill = quality)) +
    geom_density() +
    labs(x = "Density %", fill = "Quality on a scale of 1 to 10")

plot_ph <- wine_data_train |>
    ggplot(aes(x = pH, fill = quality)) +
    geom_density() +
    labs(x = "pH", fill = "Quality on a scale of 1 to 10")

plot_su <- wine_data_train |>
    ggplot(aes(x = sulphates, fill = quality)) +
    geom_density() +
    labs(x = "Sulphates %", fill = "Quality on a scale of 1 to 10")

plot_al <- wine_data_train |>
    ggplot(aes(x = alcohol, fill = quality)) +
    geom_density() +
    #scale_x_log10() +
    labs(x = "Alcohol %", fill = "Quality on a scale of 1 to 10")
    

plot_grid(plot_va, plot_fa, plot_ca, plot_rs) 
plot_grid(plot_fsd, plot_tsd, plot_de, plot_ch) 
plot_grid(plot_ph, plot_su, plot_al)
Warning message:
“Removed 3 rows containing non-finite values (`stat_density()`).”
Warning message:
“Removed 55 rows containing non-finite values (`stat_density()`).”

Figure 1: GGplot set 1

Caption: The density plots of volatile acidity, chlorides, density, pH, and sulphates show clear distinctions between the different white wine quality levels. These factors therefore appear to drive differences in quality, indicating that these variables are suitable predictors.

We eliminated variables that showed little variation across quality levels, as well as predictors with uneven scales.
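As a toy sketch of why uneven scales are a problem (illustrative numbers only, not the analysis data), a single extreme outlier like the 715 seen in the fixed acidity summary dominates a standardized predictor:

```r
# Toy illustration (not the analysis data): one extreme outlier distorts a scaled predictor
x <- c(5, 6, 7, 715)     # mimics the fixed acidity column, whose maximum was 715
round(scale(x)[, 1], 2)  # the outlier dominates; the other values collapse together: -0.5 -0.5 -0.5 1.5
```

After centering and scaling, the three typical values become nearly indistinguishable, so distances computed by KNN would be driven almost entirely by the outlier.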

To confirm the existence of possible connections between the predictors, we plotted some of their relationships as scatterplots.

In [1]:
##GGpair plot set 2

options(repr.plot.width = 10, repr.plot.height = 2)

plot_ch_by_va <- wine_data_train |>
    ggplot(aes(x = chlorides, y = `volatile acidity`)) +
    facet_grid(cols = vars(quality)) +
    geom_point() +
    labs(x = "Chlorides %", y = "Volatile acidity %")

plot_va_by_de <- wine_data_train |>
    ggplot(aes(x = `volatile acidity`, y = density)) +
    facet_grid(cols = vars(quality)) +
    geom_point() +
    labs(x = "Volatile acidity %", y = "Density %")

plot_ph_by_su <- wine_data_train |>
    ggplot(aes(x = pH, y = sulphates)) +
    facet_grid(cols = vars(quality)) +
    geom_point() +
    labs(x = "pH", y = "Sulphates %") +
    xlim(0, 200)


plot_ch_by_va
plot_va_by_de
plot_ph_by_su
Figure 2: GGplot set 2

Caption: In these plots we can see a relatively strong relationship between chlorides and volatile acidity. In each case the points scatter more widely for the middle qualities and concentrate for the edge qualities. This confirms that our choice of predictors is consistent enough to use in the model.

Now that we have determined that volatile acidity, chlorides, density, pH, and sulphates are suitable predictors, we filter the data set, keeping only these 5 variables and removing the rest.

In [11]:
# filter the data: keep quality and the five chosen predictors

wine_data_clean <- read_csv2("winequality-white.csv", show_col_types = FALSE) |>
    mutate_if(is.character, as.numeric) |>
    mutate(quality = as.factor(quality)) |>
    select(quality, `volatile acidity`, chlorides, density, pH, sulphates)

wine_data_clean
ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.

A tibble: 4898 × 6
quality | volatile acidity | chlorides | density | pH | sulphates
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl>
60.270.0451.0010 30.45
60.300.0490.9940 330.49
60.280.0500.99513260.44
60.230.0580.99563190.40
60.230.0580.99563190.40
⋮⋮⋮⋮⋮⋮
50.320.0470.994903150.46
60.240.0410.992542990.46
70.290.0220.988693340.38
60.210.0200.989413260.32

Table 4: Cleaned data

Building the KNN classification model

Balancing the data set

Since the original dataset is not evenly distributed across quality levels (most observations fall around qualities 5-7), we balance it so that our prediction model can predict each quality level properly.

In [14]:
#Creates a recipe to balance the data

wine_recipe_balance <- recipe(quality ~ ., data = wine_data_clean)|>
    step_upsample(quality, over_ratio = 1, skip = FALSE) |>
    prep()
wine_recipe_balance
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          5

Training data contained 4898 data points and no missing data.

Operations:

Up-sampling based on quality [trained]
In [15]:
#Creates the balanced dataset

balanced_wine <- bake(wine_recipe_balance, wine_data_clean)
balanced_wine |> group_by(quality) |> summarise(n = n())
A tibble: 7 × 2
quality     n
<fct>   <int>
3        2198
4        2198
5        2198
6        2198
7        2198
8        2198
9        2198

Table 5: Balanced data set

Building the classification model

In [16]:
set.seed(2024)

wine_data_split_2 <- balanced_wine |>            # split the balanced dataset
    initial_split(prop = 0.75, strata = quality)


wine_data_train_2 <- training(wine_data_split_2) # balanced training set

wine_data_test_2 <- testing(wine_data_split_2)   # balanced testing set

wine_vfold <- vfold_cv(wine_data_train_2, v = 5, strata = quality) # 5-fold split of the balanced training data
In [17]:
# model specification with k (neighbors) left to be tuned

knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")
In [18]:
# recipe that centers and scales the predictors

wine_recipe <- recipe(quality ~ ., data = wine_data_train_2 )|>
    step_center(all_predictors()) |>
    step_scale(all_predictors())
In [19]:
# tune the model over k from 1 to 20
k_vals <- tibble(neighbors = seq(from = 1, to = 20))
wine_tune <- workflow() |>
    add_recipe(wine_recipe) |>
    add_model(knn_tune) |>
    tune_grid(resamples = wine_vfold, grid = k_vals) |>
    collect_metrics()
wine_tune
A tibble: 40 × 7
neighbors | .metric | .estimator | mean | n | std_err | .config
<int> | <chr> | <chr> | <dbl> | <int> | <dbl> | <chr>
1accuracymulticlass0.901172950.003663154Preprocessor1_Model01
1roc_auc hand_till 0.942350950.002136840Preprocessor1_Model01
2accuracymulticlass0.901172950.003663154Preprocessor1_Model02
2roc_auc hand_till 0.948040450.002351791Preprocessor1_Model02
3accuracymulticlass0.851153850.003946463Preprocessor1_Model03
⋮⋮⋮⋮⋮⋮⋮
19accuracymulticlass0.693646250.0053361430Preprocessor1_Model19
19roc_auc hand_till 0.928734450.0008900950Preprocessor1_Model19
20accuracymulticlass0.688446250.0045748370Preprocessor1_Model20
20roc_auc hand_till 0.927493250.0008140696Preprocessor1_Model20

Table 6: Cross-validation accuracy and ROC AUC estimates for k = 1 to 20

Visualizing the accuracies of different K and choosing the best K value candidates

In [29]:
# plot the cross-validation accuracy estimates against k

accuracy_versus_k <- wine_tune |>
    filter(.metric == "accuracy") |>
    ggplot(aes(x = neighbors, y = mean))+
       geom_point() +
       geom_line() +
       labs(x = "Neighbors", y = "Accuracy Estimate") 

accuracy_versus_k

Figure 3: Plot of estimated accuracy versus the number of neighbors

From the plot we see that the accuracy is identical and highest when k = 1 or k = 2, so we take k = 1 and k = 2 as candidates. However, such small k values can make the model overly sensitive to individual noisy points, so we also examine the ROC AUC metric before selecting the best k.

Visualizing the ROC AUC score of different K and choosing the best K value candidates

In [25]:
# plot the cross-validation ROC AUC estimates against k

roc_auc_versus_k <- wine_tune |>
    filter(.metric == "roc_auc") |>
    ggplot(aes(x = neighbors, y = mean))+
       geom_point() +
       geom_line() +
       labs(x = "Neighbors", y = "ROC_AUC Score") 

roc_auc_versus_k

Figure 4: Plot of estimated ROC_AUC score versus the number of neighbors

The higher the ROC AUC score, the better the model distinguishes between the classes. Since the ROC AUC score is highest at k = 4 and k = 5, we also keep k = 4 and k = 5 as candidates.

Retraining the model

Now we retrain our model using k = 1, k = 2, k = 4, and k = 5. The model with the highest prediction accuracy on the testing set will be our target model.

Retraining the model with k = 1

In [20]:
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 1) |>
    set_engine("kknn") |>
    set_mode("classification")

wine_fit <- workflow() |>
    add_recipe(wine_recipe) |>
    add_model(knn_spec) |>
    fit(wine_data_train_2)

wine_fit

# run the test data and measure the model's accuracy and confusion matrix

wine_predictions <- wine_fit |>
    predict(wine_data_test) |>
    bind_cols(wine_data_test)
wine_predictions

wine_metrics <- wine_predictions |>
    metrics(truth = quality, estimate = .pred_class) |>
    filter(.metric == "accuracy")
wine_metrics

wine_predictions |> 
    conf_mat(truth = quality, estimate = .pred_class)
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_center()
• step_scale()

── Model ───────────────────────────────────────────────────────────────────────

Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(1,     data, 5), kernel = ~"rectangular")

Type of response variable: nominal
Minimal misclassification: 0.08157074
Best kernel: rectangular
Best k: 1
A tibble: 1225 × 13
.pred_class | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <fct>
6810.280.40 6.90.05030 970.99513260.441016
6720.230.32 8.50.058471860.99563190.40 996
6620.320.16 7.00.045301360.99493180.47 966
6 70.270.3620.70.045451701.0010 30.45 886
6810.220.43 1.50.044281290.99383220.45 116
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
6570.210.321.60.030331220.990443330.521.190000e+026
6660.340.408.10.046681700.994943150.509.533333e+146
6490.470.171.90.035601480.989643270.351.150000e+026
6650.240.191.20.041301110.992542990.469.400000e+016
A tibble: 1 × 3
.metric    .estimator   .estimate
<chr>      <chr>            <dbl>
accuracy   multiclass   0.9036735
          Truth
Prediction   3   4   5   6   7   8   9
         3   5   0   2   2   0   0   0
         4   0  39   8   0   0   0   0
         5   0   0 302  24   1   0   0
         6   0   0  39 507  15   0   0
         7   0   0   5  16 208   0   0
         8   0   0   1   4   0  43   0
         9   0   0   0   1   0   0   3

Table 7: The prediction on testing set with k=1

Table 8: The accuracy of predicting testing set with k=1

Table 9: The confusion matrix of predicting testing set with k=1

Retraining the model with k = 2

In [23]:
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 2) |>
    set_engine("kknn") |>
    set_mode("classification")

wine_fit <- workflow() |>
    add_recipe(wine_recipe) |>
    add_model(knn_spec) |>
    fit(wine_data_train_2)

wine_fit

# run the test data and measure the model's accuracy and confusion matrix

wine_predictions <- wine_fit |>
    predict(wine_data_test) |>
    bind_cols(wine_data_test)
wine_predictions

wine_metrics <- wine_predictions |>
    metrics(truth = quality, estimate = .pred_class) |>
    filter(.metric == "accuracy")
wine_metrics

wine_predictions |> 
    conf_mat(truth = quality, estimate = .pred_class)
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_center()
• step_scale()

── Model ───────────────────────────────────────────────────────────────────────

Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(2,     data, 5), kernel = ~"rectangular")

Type of response variable: nominal
Minimal misclassification: 0.1149445
Best kernel: rectangular
Best k: 2
A tibble: 1225 × 13
.pred_class | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <fct>
6810.280.40 6.90.05030 970.99513260.441016
6720.230.32 8.50.058471860.99563190.40 996
6620.320.16 7.00.045301360.99493180.47 966
6 70.270.3620.70.045451701.0010 30.45 886
6810.220.43 1.50.044281290.99383220.45 116
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
6570.210.321.60.030331220.990443330.521.190000e+026
6660.340.408.10.046681700.994943150.509.533333e+146
6490.470.171.90.035601480.989643270.351.150000e+026
6650.240.191.20.041301110.992542990.469.400000e+016
A tibble: 1 × 3
.metric    .estimator   .estimate
<chr>      <chr>            <dbl>
accuracy   multiclass   0.9036735
          Truth
Prediction   3   4   5   6   7   8   9
         3   5   0   2   2   0   0   0
         4   0  39   8   0   0   0   0
         5   0   0 302  24   1   0   0
         6   0   0  39 507  15   0   0
         7   0   0   5  16 208   0   0
         8   0   0   1   4   0  43   0
         9   0   0   0   1   0   0   3

Table 10: The prediction on testing set with k=2

Table 11: The accuracy of predicting testing set with k=2

Table 12: The confusion matrix of predicting testing set with k=2

Retraining the model with k = 4

In [28]:
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 4) |>
    set_engine("kknn") |>
    set_mode("classification")

wine_fit <- workflow() |>
    add_recipe(wine_recipe) |>
    add_model(knn_spec) |>
    fit(wine_data_train_2)

wine_fit

# run the test data and measure the model's accuracy and confusion matrix

wine_predictions <- wine_fit |>
    predict(wine_data_test) |>
    bind_cols(wine_data_test)
wine_predictions

wine_metrics <- wine_predictions |>
    metrics(truth = quality, estimate = .pred_class) |>
    filter(.metric == "accuracy")
wine_metrics

wine_predictions |> 
    conf_mat(truth = quality, estimate = .pred_class)
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_center()
• step_scale()

── Model ───────────────────────────────────────────────────────────────────────

Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(4,     data, 5), kernel = ~"rectangular")

Type of response variable: nominal
Minimal misclassification: 0.1566401
Best kernel: rectangular
Best k: 4
A tibble: 1225 × 13
.pred_class | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <fct>
6810.280.40 6.90.05030 970.99513260.441016
6720.230.32 8.50.058471860.99563190.40 996
6620.320.16 7.00.045301360.99493180.47 966
6 70.270.3620.70.045451701.0010 30.45 886
6810.220.43 1.50.044281290.99383220.45 116
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
6570.210.321.60.030331220.990443330.521.190000e+026
6660.340.408.10.046681700.994943150.509.533333e+146
8490.470.171.90.035601480.989643270.351.150000e+026
6650.240.191.20.041301110.992542990.469.400000e+016
A tibble: 1 × 3
.metric    .estimator   .estimate
<chr>      <chr>            <dbl>
accuracy   multiclass   0.7020408
          Truth
Prediction   3   4   5   6   7   8   9
         3   5   0   4   3   1   0   0
         4   0  39  23  16   1   0   0
         5   0   0 253  90   6   0   0
         6   0   0  50 329  26   0   0
         7   0   0  19  85 188   0   0
         8   0   0   7  29   2  43   0
         9   0   0   1   2   0   0   3

Table 13: The prediction on testing set with k=4

Table 14: The accuracy of predicting testing set with k=4

Table 15: The confusion matrix of predicting testing set with k=4

Retraining the model with k = 5

In [27]:
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 5) |>
    set_engine("kknn") |>
    set_mode("classification")

wine_fit <- workflow() |>
    add_recipe(wine_recipe) |>
    add_model(knn_spec) |>
    fit(wine_data_train_2)

wine_fit

# run the test data and measure the model's accuracy and confusion matrix

wine_predictions <- wine_fit |>
    predict(wine_data_test) |>
    bind_cols(wine_data_test)
wine_predictions

wine_metrics <- wine_predictions |>
    metrics(truth = quality, estimate = .pred_class) |>
    filter(.metric == "accuracy")
wine_metrics

wine_predictions |> 
    conf_mat(truth = quality, estimate = .pred_class)
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_center()
• step_scale()

── Model ───────────────────────────────────────────────────────────────────────

Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(5,     data, 5), kernel = ~"rectangular")

Type of response variable: nominal
Minimal misclassification: 0.17207
Best kernel: rectangular
Best k: 5
A tibble: 1225 × 13
.pred_class | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <fct>
6810.280.40 6.90.05030 970.99513260.441016
6720.230.32 8.50.058471860.99563190.40 996
5620.320.16 7.00.045301360.99493180.47 966
5 70.270.3620.70.045451701.0010 30.45 886
6810.220.43 1.50.044281290.99383220.45 116
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
6570.210.321.60.030331220.990443330.521.190000e+026
6660.340.408.10.046681700.994943150.509.533333e+146
8490.470.171.90.035601480.989643270.351.150000e+026
6650.240.191.20.041301110.992542990.469.400000e+016
A tibble: 1 × 3
.metric    .estimator   .estimate
<chr>      <chr>            <dbl>
accuracy   multiclass   0.6310204
          Truth
Prediction   3   4   5   6   7   8   9
         3   5   0   4   4   1   1   0
         4   0  39  30  28   1   0   0
         5   0   0 237 112   7   0   0
         6   0   0  46 265  31   0   0
         7   0   0  30 107 182   0   0
         8   0   0   9  36   2  42   0
         9   0   0   1   2   0   0   3

Table 16: The prediction on testing set with k=5

Table 17: The accuracy of predicting testing set with k=5

Table 18: The confusion matrix of predicting testing set with k=5

Results and Conclusions

Among the four models with k = 1, 2, 4, and 5, the accuracy of predicting the testing set is highest when k = 1 and k = 2. Since the model with k = 2 is less sensitive to single noisy neighbours than the model with k = 1, the best classifier we choose is the model with k = 2. Our classifier was 90.36% accurate and labeled 1107 out of 1225 observations correctly.
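The reported accuracy can be checked directly from the confusion matrix: the diagonal entries are the correctly labeled observations.

```r
# sanity check of the reported accuracy for the chosen k = 2 model
correct <- 5 + 39 + 302 + 507 + 208 + 43 + 3  # diagonal of the confusion matrix (Table 12)
correct / 1225                                # 0.9036735, i.e. 1107 of 1225 labeled correctly
```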

Since the model is built on a specific data set, it fits this data set well but might not perform as well on other white wine data sets. In the future, the classification model could be improved if more data were available.

Discussion

Summary of findings vs expectations:

This study used a k-nearest neighbors (KNN) classification approach to predict the quality of white wines based on various chemical attributes. We expected a high accuracy for the classifier. After the exploratory data analysis, volatile acidity, chlorides, density, pH, and sulphates were selected as the predictors.

The findings indicate that the KNN classification model, with k = 1 or k = 2, demonstrated an accuracy of approximately 90.36% in predicting the quality of white wines from the chemical attributes listed above. This result aligns with expectations: a lower k value means the model considers fewer neighbors when making predictions, which can be beneficial when the dataset has distinct clusters.
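The flip side of a small k is sensitivity to noise. As a toy sketch (illustrative data only, not the wine dataset), a hand-rolled majority vote shows how k = 1 follows a single mislabeled neighbour while a larger k smooths it out:

```r
# Toy illustration (not the wine data): with k = 1 a single mislabeled point flips the prediction
knn_vote <- function(train_x, train_y, new_x, k) {
    nn <- order(abs(train_x - new_x))[1:k]                 # indices of the k nearest neighbours
    names(sort(table(train_y[nn]), decreasing = TRUE))[1]  # majority vote among their labels
}
x <- c(1.0, 1.1, 1.2, 5.0, 5.1, 1.24)   # the last point sits among the "low" cluster...
y <- c("low", "low", "low", "high", "high", "high")  # ...but carries a noisy "high" label
knn_vote(x, y, 1.25, k = 1)  # "high": the single nearest neighbour is the noisy point
knn_vote(x, y, 1.25, k = 3)  # "low": the majority vote smooths the noise out
```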

Existing studies report at least 49.9% accuracy in correctly predicting the quality of wine given parameters similar to those we selected (Yan, 2023). The accuracy in our study is considerably higher.

Impact of our findings:

These findings could have implications for winemakers and consumers, providing insights into which wine characteristics contribute significantly to quality ratings. Winemakers could use this information to adjust their production processes to enhance specific traits associated with higher quality. For consumers, understanding the influential factors in wine quality may aid in making more informed choices based on personal preferences.

What future questions could this lead to?

Future questions arising from this study could delve into the impact of additional features on wine quality, explore the applicability of different classification algorithms, or investigate how predictive models generalize to wines from different regions or varieties. Additionally, understanding the nuances of individual preferences within each quality category could lead to more tailored recommendations for both winemakers and consumers.

In conclusion, the study provides a foundation for predicting white wine quality based on chemical attributes, and the findings open avenues for further research and application in the winemaking industry (Cortez et al.).

References

  • Cortez, Paulo, et al. “Modeling Wine Preferences by Data Mining from Physicochemical Properties.” Decision Support Systems, vol. 47, no. 4, Nov. 2009, pp. 547–553, https://doi.org/10.1016/j.dss.2009.05.016. Accessed 24 Nov. 2019.

  • “Wine Quality Dataset.” UCI Machine Learning Repository, archive.ics.uci.edu/dataset/186/wine+quality.

  • Yan, J. (2023). White Wine Quality Prediction and Feature Importance Analysis Based on Chemical Composition and Machine Learning Models. Highlights in Science, Engineering and Technology, 41, 151–157. https://doi.org/10.54097/hset.v41i.6800