In this part we expand the number of features in our dataset, first by joining the Sellers and Ads collections in our MongoDB database.
This yields a dataset that additionally includes the city where each listing is located.
Next this table is further expanded with a dataset downloaded from Kaggle which contains information about all the cities in Europe.
The columns of this dataset are as follows:
- Country
- City
- Latitude
- Longitude
We will join our dataset to this one on the City field. This way the resulting dataset will be enriched with the longitude and latitude of each city.
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Registration-age buckets ("<n> months") as they appear in the raw data.
# NOTE(review): 'tbr' is not referenced anywhere in this chunk — presumably
# "to be removed/replaced" categories consumed by a later cell; confirm
# before deleting.
tbr = ['1','10 months','11 months','112 months','12 months','13 months','14 months','15 months','16 months','17 months','18 months',
'19 months','2 months','20 months','21 months','22 months','23 months','24 months','25 months','26 months','27 months',
'28 months','29 months','3 months','30 months','31 months','32 months','33 months','34 months','35 months','36 months',
'38 months','4 months','40 months','41 months','42 months','43 months','44 months','45 months','46 months','47 months',
'48 months','5 months','50 months','52 months','53 months','54 months','55 months','56 months','58 months','59 months',
'6 months','60 months','7 months','72 months','8 months','84 months','88 months','9 months', '0 months','1 months']
# Maps the lower-case ISO-3166 alpha-2 country codes used in the Kaggle
# cities file to full country names; used in merge() to fill in missing
# seller countries.
# NOTE(review): 'Luxemburg' (sic) and 'Great Britain' are non-standard
# names that will appear verbatim in the output data — confirm intended
# before changing, since downstream code may match on these strings.
countries_dict = {'it': 'Italy','de':'Germany','fr':'France','cz':'Czech Republic','tr':'Turkey','at':'Austria','lu':'Luxemburg',
'be':'Belgium','nl':'Netherlands','gb':'Great Britain','es':'Spain','bg':'Bulgaria','ru':'Russia','pl':'Poland',
'ro':'Romania','hu':'Hungary'}
def readData(uri='mongodb+srv://<User>:<Pass>@dwprojectcluster.lpqbf.mongodb.net/cars_database?retryWrites=true&w=majority'):
    """Read the cars, ads and sellers collections from MongoDB.

    Parameters
    ----------
    uri : str
        MongoDB connection string; defaults to the project cluster.
        SECURITY NOTE(review): credentials should come from an
        environment variable or config file, not from source code.

    Returns
    -------
    tuple of pandas.DataFrame
        (df_cars, df_ads, df_sellers), each with Mongo's internal
        '_id' column removed and de-duplicated on its key column.
        df_cars is restricted to rows not yet loaded into the DW.
    """
    client = MongoClient(uri)
    try:
        df_cars = pd.DataFrame(list(client.cars_database.cars.find({})))
        df_cars.drop('_id', axis=1, inplace=True)
        # Only process listings that have not been loaded into the DW yet.
        df_cars = df_cars[df_cars['Loaded_in_DW'].eq(False)]
        df_ads = pd.DataFrame(list(client.cars_database.ads.find({})))
        # Drop 'Price' here — presumably duplicated on the cars
        # collection; confirm against the schema.
        df_ads.drop(['_id', 'Price'], axis=1, inplace=True)
        df_sellers = pd.DataFrame(list(client.cars_database.sellers.find({})))
        df_sellers.drop('_id', axis=1, inplace=True)
        # Lower-case city names so they join cleanly with the Kaggle city list.
        df_sellers['City'] = df_sellers['City'].str.lower()
        df_ads.drop_duplicates(subset=['CarID'], inplace=True)
        df_cars.drop_duplicates(subset=['ID'], inplace=True)
        df_sellers.drop_duplicates(subset=['SellerID'], inplace=True)
    finally:
        # Fix: the original never released the connection pool.
        client.close()
    return df_cars, df_ads, df_sellers
def readCities(path='cities2.csv'):
    """Load the Kaggle world-cities file and keep only European countries.

    Parameters
    ----------
    path : str or file-like
        CSV source with at least 'Country' and 'City' columns, where
        country codes are lower-case ISO-3166 alpha-2 ('de', 'it', ...).
        Defaults to the original hard-coded filename.

    Returns
    -------
    pandas.DataFrame
        Rows for the selected countries, one row per distinct City
        (the first occurrence wins).
    """
    # Same country set as the original 17-term OR-chain (which listed
    # 'cz' twice); isin() makes the membership test obvious.
    europe = ['de', 'it', 'at', 'nl', 'be', 'lu', 'pl', 'ru', 'es', 'fr',
              'cz', 'ro', 'gb', 'hu', 'tr', 'bg']
    df_cities = pd.read_csv(path)
    # .copy() so the in-place drop_duplicates below operates on an
    # independent frame instead of a view (avoids SettingWithCopyWarning).
    df_cities_eu = df_cities[df_cities['Country'].isin(europe)].copy()
    df_cities_eu.drop_duplicates(subset=['City'], inplace=True)
    return df_cities_eu
def merge(df_cars, df_ads, df_sellers, df_cities_eu):
    """Join cars -> ads -> sellers and enrich sellers with city data.

    Parameters
    ----------
    df_cars, df_ads, df_sellers : pandas.DataFrame
        Output of readData(); joined on ID == CarID and SellerID.
    df_cities_eu : pandas.DataFrame
        Output of readCities(); provides Country/Latitude/Longitude per City.

    Returns
    -------
    pandas.DataFrame
        Inner join of the three frames, with seller rows carrying the
        city's coordinates and a filled-in 'Country' column
        ('Undefined' when no country could be determined).
    """
    # Fix: work on copies — the original converted the key columns
    # in place, silently mutating the caller's DataFrames.
    df_cars = df_cars.copy()
    df_ads = df_ads.copy()
    df_sellers = df_sellers.copy()
    # Normalise the join keys to numeric dtype.
    df_cars['ID'] = pd.to_numeric(df_cars['ID'])
    df_ads['CarID'] = pd.to_numeric(df_ads['CarID'])
    df_ads['SellerID'] = pd.to_numeric(df_ads['SellerID'])
    df_sellers['SellerID'] = pd.to_numeric(df_sellers['SellerID'])
    # Left-join city coordinates onto sellers; the city table's own
    # 'Country' column survives as 'Country_other'.
    sellers_extended = df_sellers.join(df_cities_eu.set_index('City'),
                                       on='City', rsuffix='_other')
    # Fill missing seller countries from the city table's country code,
    # translated to a full name via the module-level countries_dict.
    missing = sellers_extended['Country'].isnull()
    sellers_extended.loc[missing, 'Country'] = (
        sellers_extended['Country_other'].map(countries_dict))
    # Anything still unmapped becomes 'Undefined'.
    sellers_extended['Country'] = sellers_extended['Country'].replace({np.nan: 'Undefined'})
    join1 = df_cars.merge(df_ads, left_on='ID', right_on='CarID', how='inner')
    return join1.merge(sellers_extended, on='SellerID', how='inner')
# --- Build the enriched dataset and aggregate ad counts per city ---
df_cars, df_ads, df_sellers = readData()
df_cities_eu = readCities()
final = merge(df_cars, df_ads, df_sellers, df_cities_eu)

# Keep only rows that received coordinates from the city join.
lat_lon = final[['Latitude', 'Longitude', 'City']].dropna().reset_index(drop=True)

# Number of ads per (Latitude, Longitude, City) triple.
gr = (lat_lon
      .groupby(['Latitude', 'Longitude', 'City'])
      .agg(count=('Latitude', 'count'))
      .reset_index())
# String copy of the count, used later for plot hover text.
gr['count_str'] = gr['count'].apply(str)
gr = gr.sort_values('count', ascending=False)

# Top 20 cities by ad count (bare expression: rendered as notebook output).
gr[0:20]
# --- Visualise the number of ads per city on a map of Europe ---
import plotly.graph_objects as go
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
from IPython.core.display import display, HTML
init_notebook_mode(connected = True)
# Hide plotly chrome (edit link / mode bar) in the exported figure.
config={'showLink': False, 'displayModeBar': False}
fig = go.Figure(data=go.Scattergeo(
lon = gr['Longitude'],
lat = gr['Latitude'],
# Hover text: city name plus its ad count.
text = 'City name: ' + gr['City'] + '<br> Number of ads: '+ gr['count_str'],
# Log-scale the colour so a few huge cities don't wash out the rest.
# NOTE(review): the [0:len(gr)-2] slice shortens the colour array only —
# lon/lat/text are unsliced — which looks unintentional; confirm.
marker = dict (color = np.log(gr['count'][0:len(gr)-2]), colorbar=dict(
title="Colorbar"
),
)
))
fig.update_layout(
# Restrict the base map to Europe.
geo_scope='europe',
)
# Write the interactive figure to a standalone HTML file (offline mode).
plot(fig, filename = 'fig.html', config = config)
#Binder
#iplot(fig4, filename = 'plotly_figures/fig4.html', config = config)
#ThebeLab
display(HTML('fig.html'))
# Quick sanity check of row counts before vs. after the joins.
df_cars.info()
final.info()
Conclusion
Seeing the results above, it is obvious that the transformations we made in order to add another feature (location) to each listing resulted in a shrinkage of the dataset. From the initial 265,099 listings we are down to 86,313, and of those only 55,260 have Latitude and Longitude information for their city. The initial reduction in size is due to the fact that we do not have information about every seller, and the majority of listings have an unknown seller. The missing Latitude and Longitude fields stem from the inability to find the corresponding city in the Kaggle dataset.
Knowing this, we decided it is best to stick with the initial dataset and keep as many entries as possible.