from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import numpy as np
import plotly.express as px
import plotly.offline as py
import pandas as pd
from sklearn import decomposition
from pymongo import MongoClient
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy import stats
from sklearn.preprocessing import PowerTransformer
import numpy as np
from sklearn.manifold import Isomap
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_percentage_error
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import PowerTransformer
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import plotly.graph_objects as go
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
from IPython.core.display import display, HTML
from plotly.subplots import make_subplots
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Enable inline plotly rendering inside the notebook.
init_notebook_mode(connected = True)
# Shared plotly config: hide the edit link and the mode bar on every figure.
config={'showLink': False, 'displayModeBar': False}
# Numeric attributes, including the target column 'Price'.
num_att = ['First Registration',
'Mileage',
'Power(hp)',
'Displacement', 'Price']
# Numeric feature attributes only (target excluded).
X_num_att = ['First Registration',
'Mileage',
'Power(hp)',
'Displacement']
# Categorical attributes used for one-hot encoding.
cat_att = ['Make', 'Model', 'Body', 'Fuel','Gearing Type']
def df_to_plotly(df):
    """Convert a DataFrame into the z/x/y dict expected by plotly heatmaps."""
    heatmap = {
        'z': df.values.tolist(),
        'x': df.columns.tolist(),
        'y': df.index.tolist(),
    }
    return heatmap
class InitalCleaning(BaseEstimator, TransformerMixin):
    """Drop bookkeeping columns, coerce numeric types, and filter bad rows."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, play_df, y=None):
        play_df = play_df.drop(columns=['ID', 'Loaded_in_DW', 'Model Code'])
        # Cast numeric columns from their raw string representation.
        for col in ['Power(hp)', 'Displacement', 'Mileage', 'Price']:
            play_df[col] = pd.to_numeric(play_df[col])
        # Drop rows missing essential attributes.
        missing_make_or_model = play_df['Make'].isna() | play_df['Model'].isna()
        play_df = play_df[~missing_make_or_model]
        missing_engine_info = play_df['Displacement'].isna() & play_df['Power(hp)'].isna()
        play_df = play_df[~missing_engine_info]
        for col in ['Body', 'Fuel', 'Price']:
            play_df = play_df[~play_df[col].isna()]
        # Discard implausible ("fake ad") listings; NaN comparisons are False,
        # so rows with missing values survive these filters, as before.
        play_df = play_df[~(play_df['Displacement'] < 900)]
        play_df = play_df[~(play_df['Displacement'] > 70000)]
        premium_make = (play_df['Make'] == 'Audi') | (play_df['Make'] == 'BMW') | (play_df['Make'] == 'Mercedes-Benz')
        play_df = play_df[~((play_df['Power(hp)'] > 500) & ~premium_make)]
        play_df = play_df[~((play_df['Power(hp)'] < 30) & ~(play_df['Fuel'] == 'Electricity'))]
        play_df = play_df[~(play_df['Price'] > 300000)]
        # Keep only the primary fuel type (text before the first '/').
        play_df['Fuel'] = play_df['Fuel'].str.split('/').str[0]
        # Missing gearing type defaults to 'Manual'.
        play_df['Gearing Type'] = play_df['Gearing Type'].replace({np.nan: 'Manual'})
        return play_df
class AdjustEquip(BaseEstimator, TransformerMixin):
    """Normalize the equipment flag columns (iloc 9:79) to 0/1 integers.

    Missing values become 0; any other marker value means the equipment is
    present and becomes 1.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, play_df, y=None):
        # Equipment flags live in the fixed column slice 9:79.
        equipment = play_df.iloc[:, 9:79]
        equipment = equipment.replace({np.nan: 0})
        equipment = equipment.replace({'1': 1})
        # Collect the remaining marker values (anything besides 0/1).
        markers = set(equipment['Warranty'])
        # Fix: discard() is safe when 0 or 1 is absent from the column;
        # the original remove() raised KeyError in that case.
        markers.discard(0)
        markers.discard(1)
        if markers:
            equipment = equipment.replace(list(markers), 1)
        # Convert each column's type to int.
        for col in equipment.columns:
            equipment[col] = equipment[col].astype(int)
        play_df.iloc[:, 9:79] = equipment
        return play_df
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical car attributes."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, play_df, y=None):
        categorical = ['Make', 'Model', 'Body', 'Fuel', 'Gearing Type']
        # dummy_na=False: NaN rows get all-zero indicator columns.
        return pd.get_dummies(data=play_df, columns=categorical, dummy_na=False)
class RegistrationTransformer(BaseEstimator, TransformerMixin):
    """Convert 'First Registration' to a fractional year and filter to BMWs.

    tip == 0 keeps BMW 1-series/3-series registered after 2005;
    tip == 1 keeps all BMWs registered after 2005.
    """

    def __init__(self, tip=0):
        self.tip = tip

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        # Reset the index so the converted dates align row-for-row on assignment.
        play_df.reset_index(drop=True, inplace=True)
        registration = date_magic(play_df['First Registration'])
        registration.reset_index(drop=True, inplace=True)
        play_df['First Registration'] = registration
        is_bmw = play_df['Make'] == 'BMW'
        is_recent = play_df['First Registration'] > 2005
        if self.tip == 0:
            series_1_or_3 = play_df['Model'].str.startswith('3') | play_df['Model'].str.startswith('1')
            play_df = play_df[is_bmw & is_recent & series_1_or_3]
        elif self.tip == 1:
            play_df = play_df[is_bmw & is_recent]
        return play_df
class IQROutlierRemoval_new(BaseEstimator, TransformerMixin):
    """Pass-through selector that re-frames the input with the numeric columns.

    The IQR-based filtering is intentionally disabled (kept commented out);
    the transformer currently only rebuilds the DataFrame with the configured
    columns.
    """

    def __init__(self, num_att):
        # Columns the output DataFrame is built with.
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        # Q1 = play_df.quantile(0.25)
        # Q3 = play_df.quantile(0.75)
        # IQR = Q3 - Q1
        # play_df = play_df[~((play_df < (Q1 - 1.5 * IQR)) | (play_df > (Q3 + 1.5 * IQR))).any(axis=1)]
        # Fix: use the configured attribute list instead of the module-level
        # global `num_att`, so the constructor argument is honored.
        return pd.DataFrame(play_df, columns=self.num_att)
class IQROutlierRemoval(BaseEstimator, TransformerMixin):
    """Drop rows where any configured numeric attribute is an IQR outlier.

    A value is an outlier when it falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    Only the `num_att` columns are returned.
    """

    def __init__(self, num_att):
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        bmw_num = play_df[self.num_att]
        Q1 = bmw_num.quantile(0.25)
        Q3 = bmw_num.quantile(0.75)
        IQR = Q3 - Q1
        is_outlier = (bmw_num < (Q1 - 1.5 * IQR)) | (bmw_num > (Q3 + 1.5 * IQR))
        bmw_num = bmw_num[~is_outlier.any(axis=1)]
        # Fix: use self.num_att (was the module-level global `num_att`);
        # the unused `bmw_cat` intermediate was also removed.
        return pd.DataFrame(bmw_num, columns=self.num_att)
class ZScoreOutlierRemoval(BaseEstimator, TransformerMixin):
    """Keep only rows whose every column has an absolute z-score below 3."""

    def __init__(self, num_att):
        # Kept for pipeline-interface symmetry; not used by transform.
        self.num_att = num_att

    def fit(self, X, y=None):
        return self

    def transform(self, play_df, y=None):
        abs_z = np.abs(stats.zscore(play_df, nan_policy='omit'))
        # NaN comparisons evaluate False, so rows still containing NaN
        # after the omit policy are dropped as well.
        within_bounds = (abs_z < 3).all(axis=1)
        return play_df[within_bounds]
class StandardScalerIndices(BaseEstimator, TransformerMixin):
    """StandardScaler wrapper that preserves the DataFrame index and columns."""

    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X)
        return self

    def transform(self, play_df, y=None):
        scaled = self.scaler.transform(play_df)
        return pd.DataFrame(scaled, columns=play_df.columns, index=play_df.index)
class PowerTransformerIndices(BaseEstimator, TransformerMixin):
    """PowerTransformer wrapper that preserves the DataFrame index and columns."""

    def __init__(self, method):
        self.method = method
        self.transformer = PowerTransformer(method=self.method)

    def fit(self, X, y=None):
        self.transformer.fit(X)
        return self

    def transform(self, play_df, y=None):
        transformed = self.transformer.transform(play_df)
        return pd.DataFrame(transformed, columns=play_df.columns, index=play_df.index)
class RemovePrice(BaseEstimator, TransformerMixin):
    """Drop the target column 'Price' from the feature frame."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, play_df, y=None):
        without_price = play_df.drop('Price', axis=1)
        return without_price
class IterativeImputerIndices(BaseEstimator, TransformerMixin):
    """IterativeImputer wrapper that preserves the DataFrame index and columns."""

    def __init__(self, estimator):
        self.imputer = IterativeImputer(estimator=estimator)

    def fit(self, X, y=None):
        self.imputer.fit(X)
        return self

    def transform(self, play_df, y=None):
        imputed = self.imputer.transform(play_df)
        return pd.DataFrame(imputed, columns=play_df.columns, index=play_df.index)
class JoinTransformer(BaseEstimator, TransformerMixin):
    """Join the columns of a reference frame (minus `new_att`) back onto X."""

    def __init__(self, BMW_df, new_att):
        self.BMW_df = BMW_df
        self.new_att = new_att

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        remaining = self.BMW_df.drop(self.new_att, axis=1)
        return X.join(remaining)
def date_magic(d):
    """Parse mixed-format registration dates into fractional years.

    Accepts 'Mon-YY' (e.g. 'Jan-15'), 'M/YYYY' (e.g. '3/2010'), or a bare
    year; returns year + month/12 as a float Series. Two-digit years above
    20 are interpreted as 19xx, otherwise as 20xx.
    """
    months_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                   'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    years = []
    months = []
    for el in d:
        text = str(el)
        dash_parts = text.split('-')
        if len(dash_parts) > 1:
            # 'Mon-YY' form: month name, two-digit year.
            months.append(months_dict[dash_parts[0]])
            century = '19' if int(dash_parts[1]) > 20 else '20'
            years.append(int(century + dash_parts[1]))
            continue
        slash_parts = text.split('/')
        if len(slash_parts) > 1:
            # 'M/YYYY' form: numeric month and full year.
            months.append(int(slash_parts[0]))
            years.append(int(slash_parts[1]))
            continue
        # Bare year: assume January.
        months.append(1)
        years.append(int(text))
    return pd.Series(years) + pd.Series(months) / 12
def readData():
    """Load not-yet-warehoused car ads from MongoDB into a DataFrame.

    Drops the Mongo `_id` column, keeps only rows with Loaded_in_DW == False,
    and de-duplicates on the ad 'ID'.
    """
    # SECURITY NOTE(review): credentials are embedded in the connection string
    # (here as <Name>/<Pass> placeholders, with an apparently stray 'v' after
    # <Pass>) — move them to environment variables or a secrets store.
    client = MongoClient('mongodb+srv://<Name>:<Pass>v@dwprojectcluster.lpqbf.mongodb.net/cars_database?retryWrites=true&w=majority')
    df_cars = pd.DataFrame(list(client.cars_database.cars.find({})))
    df_cars.drop('_id', axis = 1, inplace = True)
    df_cars = df_cars[df_cars['Loaded_in_DW'].eq(False)]
    df_cars.drop_duplicates(subset=['ID'], inplace = True)
    return df_cars
# Accumulator for the cross-validation results reported by randomforestCV.
recnik = {
    'Method': [],
    'Mean Percentage Error': [],
    'Standard Deviation': [],
}
def randomforestCV(max_features, n_estimators, X, y, message, scoring='neg_mean_absolute_percentage_error', recnik=recnik):
    """Cross-validate a RandomForestRegressor and record the scores.

    Prints the per-fold scores, their mean, and the standard deviation,
    then appends the results to `recnik` under the given `message` label.

    Note: `recnik` defaults to the module-level dict and is mutated in place.
    """
    rfr = RandomForestRegressor(max_features=max_features, n_estimators=n_estimators, random_state=2)
    # Fix: cross_val_score clones and fits the estimator per fold, so the
    # previous standalone rfr.fit(X, y) was redundant work and was removed.
    rfr_scores = cross_val_score(rfr, X, y, scoring=scoring, cv=5, n_jobs=-1)
    # Scores come back negated (neg_* scorer); flip the sign for reporting.
    print("Scores:", -rfr_scores)
    print("Mean:", -rfr_scores.mean())
    print("Standard deviation:", rfr_scores.std())
    recnik['Method'].append(message)
    recnik['Mean Percentage Error'].append(-rfr_scores.mean())
    recnik['Standard Deviation'].append(rfr_scores.std())
def gridSearch(param_grid, model, X, y):
    """Run a 5-fold grid search scored by MAPE; print results, return the search."""
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_mean_absolute_percentage_error',
        return_train_score=True,
        verbose=10,
        n_jobs=-1,
    )
    grid_search.fit(X, y)
    results = grid_search.cv_results_
    for mean_score, params in zip(results["mean_test_score"], results["params"]):
        # Negate so a positive MAPE is printed.
        print(-mean_score, params)
    return grid_search
Here we are trying an unsupervised learning technique called k-means clustering which, by definition, aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean (cluster centers or cluster centroid), serving as a prototype of the cluster.
1. Forming a dataset
For this part we will be using a reduced dataset containing only BMW listings. Moreover, only the numeric attributes will be taken into consideration.
# Load the raw ads and run the base cleaning pipeline restricted to
# BMW 1-/3-series registered after 2005 (tip = 0).
df_cars = readData()
type0_pipeline = Pipeline([
    ('initial', InitalCleaning()),
    ('equipment', AdjustEquip()),
    ('date', RegistrationTransformer(tip = 0))
])
BMW_df = type0_pipeline.fit_transform(df_cars)
# Cluster only on the numeric attributes (Price included).
clustering = BMW_df[num_att]
# IQR outlier removal -> scaling -> yeo-johnson -> z-score filter -> imputation.
final_pipeline = Pipeline([
    ('IQR_removal',IQROutlierRemoval(num_att = num_att)),
    ('std_scaler', StandardScalerIndices()),
    ('yeo-johnson',PowerTransformerIndices(method='yeo-johnson')),
    ('z_score_removal', ZScoreOutlierRemoval(num_att = num_att)),
    ('Imputer', IterativeImputerIndices(estimator = ExtraTreesRegressor(n_estimators=10, random_state=0))),
])
ready = final_pipeline.fit_transform(clustering)
2. Selecting the number of clusters
As the name of the algorithm suggests, the most important part is choosing the optimal number of clusters, $k$. To do so, we have to choose a suitable metric. One approach is to use the model's inertia, which is available as an attribute of the sklearn KMeans class. Inertia is the sum of squared errors for each cluster, so the smaller the inertia, the denser the cluster. However, if we plot the inertia against the number of clusters $k$, we end up with a monotonically decreasing function that reaches zero when every point forms its own cluster, $k = \text{number of points}$. For this reason, we will use a more precise approach called the silhouette score. This technique computes the silhouette coefficient over all instances, which is equal to:
- $\text{silhouette\_coefficient} = { {(b-a)} \over {\max(b,a)} } $, where:
- $a$ is the mean distance to the other instances in the same cluster
- $b$ is the mean distance to the instances of the next closest cluster
Knowing this, we can conclude that the silhouette coefficient can have values between -1 and 1, where:
- Result close to $1$ means the instance is well inside its own cluster and far from other clusters
- Result close to $0$ suggests the instance is close to cluster boundary
- Result close to $-1$ implies that the instance has been placed in a wrong cluster
The silhouette score is the mean silhouette coefficient over all the instances.
# Fit KMeans for every k in 2..9 and score each clustering by its
# mean silhouette coefficient.
kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(ready) for k in range(2, 10)]
silhouette_scores = [silhouette_score(ready, model.labels_) for model in kmeans_per_k]
The figure below shows the optimal number of clusters is 5.
# Plot the silhouette score against the number of clusters k.
fig = go.Figure()
# Fix: supply the actual k values (2..9) on the x axis; previously the trace
# defaulted to 0-based point indices, so the axis labelled "k" was offset
# by 2 from the true cluster counts.
fig.add_trace(go.Scatter(x = list(range(2, 10)), y = silhouette_scores, name = 'Scores'))
fig.update_layout(plot_bgcolor='rgb(255,255,255)',xaxis_title="k",
                  yaxis_title='Silhouette Scores')
fig.update_xaxes(ticks = 'outside', showline=True, linecolor='black')
fig.update_yaxes(ticks = 'outside', showline=True, linecolor='black')
plot(fig, filename = 'fig.html', config = config)
display(HTML('fig.html'))
3. Feature Importances
3.1 Decision Tree to predict the clusters' labels
In order to see which features have the biggest impact on the cluster definition, we will transform the problem from an unsupervised to a supervised task. This means that the attributes that were used to create the clusters will now serve as independent values, $X$, whereas the labels that we got from the last step will represent the dependent value $y$ .Next we are fitting a Decision Tree Classifier on this data, which will yield the features' importances, which are represented in the following bar plot.
# kmeans_per_k[3] corresponds to k = 5 (the k range starts at 2).
labels = kmeans_per_k[3].labels_
model = DecisionTreeClassifier()
# Fit the tree to predict the cluster labels from the clustering features,
# turning the task supervised so feature importances can be read off.
model.fit(ready, labels)
# Bar plot of the tree's feature importances, labelled by feature name.
fig2 = px.bar(y = model.feature_importances_)
fig2.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [0, 1, 2, 3, 4],
        ticktext = ['First Registration', 'Mileage', 'Power(hp)', 'Displacement','Price']
    )
)
fig2.update_layout(plot_bgcolor='rgb(255,255,255)',xaxis_title="Feature Name",
                   yaxis_title='Importance')
plot(fig2, filename = 'fig2.html', config = config)
display(HTML('fig2.html'))
3.2 Correlation Matrix
As a second approach to measure the features significance in defining the clusters, we have decided to use the correlation matrix between the features. This brings us to the same conclusion as the previous approach, which is that the most important features are the: Price, Displacement and the Power(hp).
# Attach the cluster labels and show the correlation matrix as a heatmap.
ready['Labels'] = labels
corr2 = df_to_plotly(ready.corr())
fig3 = px.imshow(corr2['z'], x = corr2['x'], y = corr2['y'], color_continuous_scale=px.colors.sequential.RdBu)
plot(fig3, filename = 'fig3.html', config = config)
display(HTML('fig3.html'))
# Join the labels back onto the cleaned BMW frame by index; rows that were
# dropped by the outlier-removal steps receive NaN labels and are discarded.
# NOTE(review): this relies on `ready` and `BMW_df` sharing index values —
# verify the pipeline's index handling preserves that alignment.
lab_id = ready['Labels']
joinn = BMW_df.join(lab_id)
joinn = joinn[~(joinn['Labels'].isna())]
join = joinn[num_att]
# 3D scatter of the clusters over the three most important features.
# NOTE(review): marker colors use `labels` (length of `ready`) while the
# coordinates come from the filtered `join` frame — confirm both have the
# same length and row order, otherwise colors are misaligned.
trace1 = go.Scatter3d(
    x= join['Power(hp)'],
    y= join['Displacement'],
    z= join['Price'],
    mode='markers',
    marker=dict(
        color = labels,
        size= 10,
        line=dict(
            color= labels,
            width= 12
        ),
        opacity=0.8
    )
)
# Tight margins; axis titles name the plotted features.
layout = go.Layout(
    title = 'Clusters',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    scene = dict(
        xaxis = dict(title = 'Power(hp)'),
        yaxis = dict(title = 'Displacement'),
        zaxis = dict(title = 'Price')
    )
)
fig4 = go.Figure(data = trace1, layout = layout)
plot(fig4, filename = 'fig4.html', config = config)
display(HTML('fig4.html'))