In [1]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt  
%matplotlib inline
In [2]:
# Read the white-wine quality CSV, splitting fields on ';'.
df = pd.read_csv("winequality-white.csv", sep=";")
In [3]:
# Preview the first five rows.
df.head(n=5)
Out[3]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.0 0.27 0.36 20.7 0.045 45.0 170.0 1.0010 3.00 0.45 8.8 6
1 6.3 0.30 0.34 1.6 0.049 14.0 132.0 0.9940 3.30 0.49 9.5 6
2 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44 10.1 6
3 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
4 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
In [4]:
# Preview the last five rows.
df.tail(n=5)
Out[4]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
4893 6.2 0.21 0.29 1.6 0.039 24.0 92.0 0.99114 3.27 0.50 11.2 6
4894 6.6 0.32 0.36 8.0 0.047 57.0 168.0 0.99490 3.15 0.46 9.6 5
4895 6.5 0.24 0.19 1.2 0.041 30.0 111.0 0.99254 2.99 0.46 9.4 6
4896 5.5 0.29 0.30 1.1 0.022 20.0 110.0 0.98869 3.34 0.38 12.8 7
4897 6.0 0.21 0.38 0.8 0.020 22.0 98.0 0.98941 3.26 0.32 11.8 6
In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape
Out[5]:
(4898, 12)
In [6]:
# Print each column's non-null count and dtype, plus overall memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()
Out[7]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000
mean 6.854788 0.278241 0.334192 6.391415 0.045772 35.308085 138.360657 0.994027 3.188267 0.489847 10.514267 5.877909
std 0.843868 0.100795 0.121020 5.072058 0.021848 17.007137 42.498065 0.002991 0.151001 0.114126 1.230621 0.885639
min 3.800000 0.080000 0.000000 0.600000 0.009000 2.000000 9.000000 0.987110 2.720000 0.220000 8.000000 3.000000
25% 6.300000 0.210000 0.270000 1.700000 0.036000 23.000000 108.000000 0.991723 3.090000 0.410000 9.500000 5.000000
50% 6.800000 0.260000 0.320000 5.200000 0.043000 34.000000 134.000000 0.993740 3.180000 0.470000 10.400000 6.000000
75% 7.300000 0.320000 0.390000 9.900000 0.050000 46.000000 167.000000 0.996100 3.280000 0.550000 11.400000 6.000000
max 14.200000 1.100000 1.660000 65.800000 0.346000 289.000000 440.000000 1.038980 3.820000 1.080000 14.200000 9.000000
In [8]:
# Distinct quality scores present in the data, in order of first appearance.
df["quality"].unique()
Out[8]:
array([6, 5, 7, 8, 4, 3, 9])
In [9]:
# Number of wines per quality score, most frequent first.
df["quality"].value_counts()
Out[9]:
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64
In [10]:
# Pairwise correlation between all columns, shown as a table.
df.corr()
Out[10]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
fixed acidity 1.000000 -0.022697 0.289181 0.089021 0.023086 -0.049396 0.091070 0.265331 -0.425858 -0.017143 -0.120881 -0.113663
volatile acidity -0.022697 1.000000 -0.149472 0.064286 0.070512 -0.097012 0.089261 0.027114 -0.031915 -0.035728 0.067718 -0.194723
citric acid 0.289181 -0.149472 1.000000 0.094212 0.114364 0.094077 0.121131 0.149503 -0.163748 0.062331 -0.075729 -0.009209
residual sugar 0.089021 0.064286 0.094212 1.000000 0.088685 0.299098 0.401439 0.838966 -0.194133 -0.026664 -0.450631 -0.097577
chlorides 0.023086 0.070512 0.114364 0.088685 1.000000 0.101392 0.198910 0.257211 -0.090439 0.016763 -0.360189 -0.209934
free sulfur dioxide -0.049396 -0.097012 0.094077 0.299098 0.101392 1.000000 0.615501 0.294210 -0.000618 0.059217 -0.250104 0.008158
total sulfur dioxide 0.091070 0.089261 0.121131 0.401439 0.198910 0.615501 1.000000 0.529881 0.002321 0.134562 -0.448892 -0.174737
density 0.265331 0.027114 0.149503 0.838966 0.257211 0.294210 0.529881 1.000000 -0.093591 0.074493 -0.780138 -0.307123
pH -0.425858 -0.031915 -0.163748 -0.194133 -0.090439 -0.000618 0.002321 -0.093591 1.000000 0.155951 0.121432 0.099427
sulphates -0.017143 -0.035728 0.062331 -0.026664 0.016763 0.059217 0.134562 0.074493 0.155951 1.000000 -0.017433 0.053678
alcohol -0.120881 0.067718 -0.075729 -0.450631 -0.360189 -0.250104 -0.448892 -0.780138 0.121432 -0.017433 1.000000 0.435575
quality -0.113663 -0.194723 -0.009209 -0.097577 -0.209934 0.008158 -0.174737 -0.307123 0.099427 0.053678 0.435575 1.000000
In [11]:
# Correlation matrix drawn as a heatmap on a fixed [-1, 1] scale centred at 0.
corr = df.corr()
diverging_cmap = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(
    corr,
    cmap=diverging_cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
)
# Rotate the x tick labels by 45 degrees and right-align them.
x_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(x_tick_labels, rotation=45, horizontalalignment='right');
In [12]:
# Vertical boxplot of the 'fixed acidity' column.
b_plot = sns.boxplot(data=df, y='fixed acidity', width=0.5)
In [13]:
# Vertical boxplot of the 'volatile acidity' column.
b_plot = sns.boxplot(data=df, y='volatile acidity', width=0.5)
In [14]:
# Vertical boxplot of the 'alcohol' column.
b_plot = sns.boxplot(data=df, y='alcohol', width=0.5)
In [15]:
# Draw one vertical boxplot per DataFrame column on a shared grid.
#
# The names `l`, `number_of_columns` and `number_of_rows` are kept as
# notebook-level variables in case later cells reference them.
l = df.columns.values
number_of_columns = 12

# Count of grid rows beyond the first. The previous expression
# `12-1/number_of_columns` was parsed as `12 - (1/12)` (division binds tighter
# than subtraction), which produced a float row count for the subplot grid and
# a figure roughly 119 inches tall. Integer division yields a whole number,
# and the column count now comes from the frame instead of a literal 12.
number_of_rows = max(len(l) - 1, 0) // number_of_columns
grid_rows = number_of_rows + 1

# Apply the style once, before any axes exist, so every subplot picks it up.
sns.set_style('whitegrid')

fig = plt.figure(figsize=(number_of_columns, 10 * grid_rows))
for i, column_name in enumerate(l):
    box_ax = fig.add_subplot(grid_rows, number_of_columns, i + 1)
    # Pass the series as `y` explicitly so the plot is vertical regardless of
    # how the installed seaborn version interprets a positional argument.
    sns.boxplot(y=df[column_name], color='green', ax=box_ax)

# Lay out once after all axes are drawn rather than on every iteration.
fig.tight_layout()