In this notebook we analyse the wine quality dataset.
Two datasets were created, using red and white wine samples. The inputs are objective tests (e.g. pH values) and the output is based on sensory data (the median of at least 3 evaluations made by wine experts). Each expert graded the wine quality between 0 (very bad) and 10 (very excellent).
Number of Instances: red wine - 1599; white wine - 4898.
Number of Attributes: 11 input attributes + 1 output attribute
Several of the attributes may be correlated, so it makes sense to apply some form of feature selection.
Attribute information:
For more information, read [Cortez et al., 2009].
Input variables (based on physicochemical tests):
fixed acidity
volatile acidity
citric acid
residual sugar
chlorides
free sulfur dioxide
total sulfur dioxide
density
pH
sulphates
alcohol
type (R=red, W=white)
Output variable (based on sensory data): quality (score between 0 and 10)
Design a model that predicts the quality of RED wine based on the input features.
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load wine dataset
obs = pd.read_csv('wine_quality.csv')
# Show available properties of the dataset
obs.keys()
Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'type'], dtype='object')
# Show the first few rows of the dataset
obs.head()
 | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | type
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | W |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | W |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | W |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | W |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | W |
# Drop duplicate measurements -- this makes sense here, since rows with
# identical values would imply identical wines (at least in composition)
# that would otherwise be double-counted in the dataset
obs = obs.drop_duplicates()
obs.shape
(5320, 13)
obs["type"].value_counts()
R    3961
W    1359
Name: type, dtype: int64
obs.describe()
 | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality
---|---|---|---|---|---|---|---|---|---|---|---|---
count | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 | 5320.000000 |
mean | 7.215179 | 0.344130 | 0.318494 | 5.048477 | 0.056690 | 30.036654 | 114.109023 | 0.994535 | 3.224664 | 0.533357 | 10.549241 | 5.795677 |
std | 1.319671 | 0.168248 | 0.147157 | 4.500180 | 0.036863 | 17.805045 | 56.774223 | 0.002966 | 0.160379 | 0.149743 | 1.185933 | 0.879772 |
min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 1.000000 | 6.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
25% | 6.400000 | 0.230000 | 0.240000 | 1.800000 | 0.038000 | 16.000000 | 74.000000 | 0.992200 | 3.110000 | 0.430000 | 9.500000 | 5.000000 |
50% | 7.000000 | 0.300000 | 0.310000 | 2.700000 | 0.047000 | 28.000000 | 116.000000 | 0.994650 | 3.210000 | 0.510000 | 10.400000 | 6.000000 |
75% | 7.700000 | 0.410000 | 0.400000 | 7.500000 | 0.066000 | 41.000000 | 153.250000 | 0.996770 | 3.330000 | 0.600000 | 11.400000 | 6.000000 |
max | 15.900000 | 1.580000 | 1.660000 | 65.800000 | 0.611000 | 289.000000 | 440.000000 | 1.038980 | 4.010000 | 2.000000 | 14.900000 | 9.000000 |
obs.describe()["quality"][["min","max"]]
min    3.0
max    9.0
Name: quality, dtype: float64
obs.groupby("type").mean()["quality"]
type
R    5.854835
W    5.623252
Name: quality, dtype: float64
obs.groupby(["type","quality"])["fixed_acidity"].count()
type  quality
R     3            20
      4           153
      5          1175
      6          1788
      7           689
      8           131
      9             5
W     3            10
      4            53
      5           577
      6           535
      7           167
      8            17
Name: fixed_acidity, dtype: int64
Prepare the data so that good and bad quality wines are defined relative to the median quality of each wine type. Make two datasets, named white and red, as follows: add a column good_qual, which is 0 for bad (quality less than the median) and 1 for good (quality greater than or equal to the median), then drop the quality and type columns, as they are no longer needed, using the drop method.
obs
 | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | type
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | W |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | W |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | W |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | W |
5 | 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13.0 | 40.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | W |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6492 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 6 | R |
6493 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 5 | R |
6494 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 6 | R |
6495 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 7 | R |
6496 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 6 | R |
5320 rows × 13 columns
# Group by type
groups = obs.groupby("type")
red = groups.get_group("R").copy()
white = groups.get_group("W").copy()
red["good_qual"] = (red["quality"] >= red["quality"].median()).astype(int)
white["good_qual"] = (white["quality"] >= white["quality"].median()).astype(int)
red.drop(columns=["quality","type"],inplace=True)
white.drop(columns=["quality","type"],inplace=True)
red["good_qual"].value_counts()
1    2613
0    1348
Name: good_qual, dtype: int64
red
 | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | good_qual
---|---|---|---|---|---|---|---|---|---|---|---|---
1599 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.00100 | 3.00 | 0.45 | 8.8 | 1 |
1600 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.99400 | 3.30 | 0.49 | 9.5 | 1 |
1601 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.99510 | 3.26 | 0.44 | 10.1 | 1 |
1602 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.99560 | 3.19 | 0.40 | 9.9 | 1 |
1605 | 6.2 | 0.32 | 0.16 | 7.0 | 0.045 | 30.0 | 136.0 | 0.99490 | 3.18 | 0.47 | 9.6 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6492 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 1 |
6493 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 0 |
6494 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 1 |
6495 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 1 |
6496 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 1 |
3961 rows × 12 columns
red.groupby("good_qual").describe()[["alcohol","sulphates"]]
good_qual | alcohol count | alcohol mean | alcohol std | alcohol min | alcohol 25% | alcohol 50% | alcohol 75% | alcohol max | sulphates count | sulphates mean | sulphates std | sulphates min | sulphates 25% | sulphates 50% | sulphates 75% | sulphates max
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1348.0 | 9.909172 | 0.895594 | 8.0 | 9.2 | 9.7 | 10.5 | 13.6 | 1348.0 | 0.481825 | 0.100946 | 0.25 | 0.41 | 0.47 | 0.53 | 0.88 |
1 | 2613.0 | 10.940254 | 1.212499 | 8.5 | 10.0 | 10.9 | 12.0 | 14.2 | 2613.0 | 0.494749 | 0.119275 | 0.22 | 0.41 | 0.48 | 0.56 | 1.08 |
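The summary above suggests that good wines have noticeably higher alcohol on average. As a small illustrative addition (not part of the original analysis), a boxplot makes that gap visible:
# Illustrative addition: compare the alcohol distribution across the two classes
sns.boxplot(data=red, x="good_qual", y="alcohol")
plt.xlabel("good_qual (0 = Poor, 1 = Good)")
plt.ylabel("alcohol")
plt.show()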
colormap = sns.diverging_palette(220, 10, as_cmap=True)
red.corr()
 | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | good_qual
---|---|---|---|---|---|---|---|---|---|---|---|---
fixed_acidity | 1.000000 | -0.019214 | 0.298959 | 0.083620 | 0.024036 | -0.058396 | 0.082425 | 0.266091 | -0.431274 | -0.017453 | -0.110788 | -0.097136 |
volatile_acidity | -0.019214 | 1.000000 | -0.163228 | 0.098340 | 0.086287 | -0.102471 | 0.102315 | 0.060603 | -0.046954 | -0.021150 | 0.046815 | -0.223359 |
citric_acid | 0.298959 | -0.163228 | 1.000000 | 0.106269 | 0.132590 | 0.091681 | 0.122845 | 0.160076 | -0.183015 | 0.049442 | -0.076514 | 0.010923 |
residual_sugar | 0.083620 | 0.098340 | 0.106269 | 1.000000 | 0.076091 | 0.306835 | 0.409583 | 0.820498 | -0.165997 | -0.020503 | -0.398167 | -0.109543 |
chlorides | 0.024036 | 0.086287 | 0.132590 | 0.076091 | 1.000000 | 0.101272 | 0.191145 | 0.253088 | -0.090573 | 0.017871 | -0.356928 | -0.192464 |
free_sulfur_dioxide | -0.058396 | -0.102471 | 0.091681 | 0.306835 | 0.101272 | 1.000000 | 0.619437 | 0.294638 | -0.007750 | 0.037932 | -0.251768 | 0.003781 |
total_sulfur_dioxide | 0.082425 | 0.102315 | 0.122845 | 0.409583 | 0.191145 | 0.619437 | 1.000000 | 0.536868 | 0.008239 | 0.136544 | -0.446643 | -0.170925 |
density | 0.266091 | 0.060603 | 0.160076 | 0.820498 | 0.253088 | 0.294638 | 0.536868 | 1.000000 | -0.063734 | 0.082048 | -0.760162 | -0.292018 |
pH | -0.431274 | -0.046954 | -0.183015 | -0.165997 | -0.090573 | -0.007750 | 0.008239 | -0.063734 | 1.000000 | 0.142353 | 0.093095 | 0.103796 |
sulphates | -0.017453 | -0.021150 | 0.049442 | -0.020503 | 0.017871 | 0.037932 | 0.136544 | 0.082048 | 0.142353 | 1.000000 | -0.022850 | 0.053950 |
alcohol | -0.110788 | 0.046815 | -0.076514 | -0.398167 | -0.356928 | -0.251768 | -0.446643 | -0.760162 | 0.093095 | -0.022850 | 1.000000 | 0.401458 |
good_qual | -0.097136 | -0.223359 | 0.010923 | -0.109543 | -0.192464 | 0.003781 | -0.170925 | -0.292018 | 0.103796 | 0.053950 | 0.401458 | 1.000000 |
# Get the correlation matrix
corr_matrix = red.corr()
# Plot figsize
fig, ax = plt.subplots(figsize=(10, 10))
# Generate heat map; annotate each cell with its correlation to two decimals
sns.heatmap(corr_matrix, cmap=colormap, annot=True, fmt=".2f")
plt.show()
corr_abs = corr_matrix.abs()
# The six features most correlated (in absolute value) with chlorides, itself included
high_corr_chlorides = list(corr_abs["chlorides"].drop("good_qual").sort_values(ascending=False)[0:6].index)
print(high_corr_chlorides)
['chlorides', 'alcohol', 'density', 'total_sulfur_dioxide', 'citric_acid', 'free_sulfur_dioxide']
# The same selection as a one-liner
list(corr_matrix.abs().drop("good_qual")["chlorides"].sort_values(ascending=False)[0:6].index)
['chlorides', 'alcohol', 'density', 'total_sulfur_dioxide', 'citric_acid', 'free_sulfur_dioxide']
sns.pairplot(red[high_corr_chlorides + ["good_qual"]], hue="good_qual");
# Most correlated feature pair, excluding the self-correlations on the diagonal
most_corr = corr_abs[corr_abs < 1].unstack().sort_values(ascending=False).head(1)
print(most_corr)
residual_sugar  density    0.820498
dtype: float64
label = ["Poor","Good"]
feat1 = most_corr.index[0][1]
feat2 = most_corr.index[0][0]
# Create figure and axis objects.
fig, ax = plt.subplots()
for t in red['good_qual'].unique():
    mask = red['good_qual'] == t
    ax.scatter(red[feat1][mask], red[feat2][mask], label=label[t], alpha=0.5)
ax.set_xlabel('Density')
ax.set_ylabel('Residual Sugars')
ax.legend();
plt.show()
red[(red["good_qual"] ==1)].shape
(2613, 12)
Highly correlated variables tend not to offer unique information, so we can generally consider dropping one of them to simplify our models.
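As a minimal sketch of that idea (an addition, not a step the original analysis takes): density correlates strongly with both residual_sugar (0.82) and alcohol (-0.76), so one candidate simplification is to drop it and check how much correlation remains among the rest.
# Minimal sketch: drop density, the feature entangled with two others, and
# report the largest absolute correlation left among the remaining columns
reduced = red.drop(columns=["density"])
remaining = reduced.corr().abs()
print(remaining[remaining < 1].unstack().sort_values(ascending=False).head(1))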
# define data and features
features = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
'pH', 'sulphates', 'alcohol']
target = ['good_qual']
wtype = ["Poor","Good"]
# Combine features and target to create a smaller dataset
dataset = red[features + target]
# Drop duplicates and null values
dataset = dataset.drop_duplicates().dropna()
# Shuffle (note: no random_state is set, so the order varies between runs)
dataset = dataset.sample(frac=1).reset_index(drop=True)
# Get the first 1000 observations of each class (a balanced sample; the models below use the full dataset instead)
frames = [dataset[dataset.good_qual == 0].head(1000),
dataset[dataset.good_qual == 1].head(1000)]
sample = pd.concat(frames)
# Convert pandas.DataFrame to numpy.array
#X = sample[features].values
#y = sample[target].values.flatten()
X = dataset[features].values
y = dataset[target].values.flatten()
# Check shapes
X.shape, y.shape
((3961, 11), (3961,))
sample[target].values.flatten()
array([0, 0, 0, ..., 1, 1, 1])
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.tree import export_graphviz
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics # Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing # Import preprocessing for String-Int conversion
y
array([0, 1, 0, ..., 1, 0, 1])
# split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
print(x_train.shape,x_test.shape)
(2772, 11) (1189, 11)
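The classes are imbalanced (roughly 66% good), so a stratified split is a variant worth knowing: it preserves the class ratio in both subsets. A minimal sketch follows (the models below keep the unstratified split):
# Optional variant: stratify=y keeps the 0/1 ratio the same in train and test
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)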
# Create a decision tree classifier object with these parameters
dt = DecisionTreeClassifier(criterion='entropy', max_depth=4)
# Train the decision tree classifier
dt = dt.fit(x_train,y_train)
# Evaluate on the training and test sets
y_pred = dt.predict(x_train)
print("Accuracy Training:",metrics.accuracy_score(y_train, y_pred))
y_pred = dt.predict(x_test)
print("Accuracy Testing:",metrics.accuracy_score(y_test, y_pred))
Accuracy Training: 0.7633477633477633
Accuracy Testing: 0.7417998317914214
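Accuracy alone hides how the errors split between the two classes. As a small addition, a confusion matrix over the test predictions separates false positives from false negatives:
# Rows are the true class (0 = Poor, 1 = Good); columns are the predicted class
print(metrics.confusion_matrix(y_test, y_pred))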
export_graphviz(dt,
out_file="tree.dot",
rounded=True,
filled=True
)
# Save the decision tree as a PNG
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(dt,
feature_names=features,
class_names=wtype,
fontsize=20,
filled=True)
fig.savefig("decision_tree.png")
features[dt.feature_importances_.argmax()]
'alcohol'
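argmax reports only the single most important feature; as a quick addition, the full ranking is easy to inspect:
# Rank every feature by its impurity-based importance in the fitted tree
importances = pd.Series(dt.feature_importances_, index=features).sort_values(ascending=False)
print(importances)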
abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.1)
abc.fit(x_train, y_train)
y_pred = abc.predict(x_train)
print("Accuracy Training:",metrics.accuracy_score(y_train, y_pred))
y_pred = abc.predict(x_test)
print("Accuracy Testing:",metrics.accuracy_score(y_test, y_pred))
Accuracy Training: 0.7633477633477633
Accuracy Testing: 0.7611438183347351
# Here we use the gradient boosting classifier
max_depth = 4
n_estimators = 100
bdt = GradientBoostingClassifier(max_depth=max_depth, n_estimators=n_estimators)
bdt.fit(x_train, y_train)
GradientBoostingClassifier(max_depth=4)
y_pred = bdt.predict(x_train)
print("Accuracy Training:",metrics.accuracy_score(y_train, y_pred))
y_pred = bdt.predict(x_test)
print("Accuracy Testing:",metrics.accuracy_score(y_test, y_pred))
Accuracy Training: 0.8531746031746031
Accuracy Testing: 0.7678721614802355
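The gap between the training (0.85) and testing (0.77) accuracies hints at overfitting as stages are added. As an extra diagnostic (not part of the original run), GradientBoostingClassifier exposes staged_predict, which scores the ensemble after each boosting stage:
# Track test accuracy after each boosting stage to see where the gains flatten out
test_acc = [metrics.accuracy_score(y_test, yp) for yp in bdt.staged_predict(x_test)]
plt.plot(range(1, n_estimators + 1), test_acc)
plt.xlabel("Number of boosting stages")
plt.ylabel("Test accuracy")
plt.show()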
# Fit a random forest classifier
rf = RandomForestClassifier(n_estimators=n_estimators, criterion='gini', max_features=5, max_depth=4, random_state=1)
rf.fit(x_train, y_train)
RandomForestClassifier(max_depth=4, max_features=5, random_state=1)
y_pred = rf.predict(x_train)
print("Accuracy Training:",metrics.accuracy_score(y_train, y_pred))
y_pred = rf.predict(x_test)
print("Accuracy Testing:",metrics.accuracy_score(y_test, y_pred))
Accuracy Training: 0.7763347763347763
Accuracy Testing: 0.7569386038687973
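Random forests can also estimate generalisation without touching the test set, using out-of-bag samples. A minimal sketch with the same hyperparameters (an addition to the original analysis):
# Out-of-bag estimate: each tree is scored on the bootstrap samples it never saw
rf_oob = RandomForestClassifier(n_estimators=n_estimators, max_depth=4, max_features=5,
                                oob_score=True, random_state=1)
rf_oob.fit(x_train, y_train)
print("OOB score:", rf_oob.oob_score_)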
from sklearn.model_selection import GridSearchCV
ada_dic = {
    "n_estimators": [10, 50, 100, 200, 500, 1000],
    "learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
}
grid_search = GridSearchCV(AdaBoostClassifier(),ada_dic,n_jobs=7)
grid_search.fit(x_train, y_train)
GridSearchCV(estimator=AdaBoostClassifier(), n_jobs=7, param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.5, 1], 'n_estimators': [10, 50, 100, 200, 500, 1000]})
grid_search.best_estimator_
AdaBoostClassifier(learning_rate=0.05, n_estimators=200)
grid_search.best_estimator_.get_params()
{'algorithm': 'SAMME.R', 'base_estimator': None, 'learning_rate': 0.05, 'n_estimators': 200, 'random_state': None}
y_pred = grid_search.predict(x_train)
print("Accuracy Training:",metrics.accuracy_score(y_train, y_pred))
y_pred = grid_search.predict(x_test)
print("Accuracy Testing:",metrics.accuracy_score(y_test, y_pred))
Accuracy Training: 0.7640692640692641
Accuracy Testing: 0.7603027754415476
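The grid search also records the cross-validated score of the winning combination, which is a less optimistic reference point than the training accuracy above (a small addition):
# Mean cross-validated accuracy of the best parameter combination
print("Best CV accuracy:", grid_search.best_score_)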