Introduction
Wish is an American online e-commerce platform that facilitates product transactions between sellers and buyers. This dataset comes from Kaggle and contains information scraped from the Wish platform. The products listed are those that appear when you type “summer” in the platform's search field.
The dataset contains 1573 rows and 43 columns covering product listings, product ratings, sales performance, and more. With this information, I can explore correlations and patterns around what makes a product successful. For example, which features play an important role in driving sales? Can we validate the established idea that buyers are sensitive to price drops? Do products with bad ratings but a price drop still sell, and if so, how far does the seller need to drop the price to attract buyers? Does the product listing affect sales? Are consumers attracted by certain words? Which product categories sell best? What price range is most attractive to buyers?
The dataset can be found here: Wish database
import os
import warnings
warnings.filterwarnings('ignore')

os.getcwd()
os.chdir('C:\\Users\\linli\\Desktop\\In progress project\\Wish')

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
from plotly import graph_objs as go
from wordcloud import WordCloud

from scipy.stats import uniform
from scipy.stats import randint as sp_randint

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import linear_model, metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (BaggingRegressor, RandomForestRegressor,
                              AdaBoostRegressor, GradientBoostingRegressor)
from sklearn.svm import SVR
from xgboost import XGBRegressor
from mlxtend.regressor import StackingRegressor
random_seed=748
random_state=random_seed
df = pd.read_csv('summer-products-with-rating-and-performance_2020-08.csv')
print(df.shape)
df.head(2)
df.columns
df.describe()
print(df.info())
#detect which cells have missing values, and then count how many in each column
missing_val_count_by_column = (df.isnull().sum())
missing_val_count_by_column[missing_val_count_by_column > 0]
#check how many uniques and nulls for all variables
def unique_null(df):
    unique = pd.DataFrame(df.nunique(), columns=['unique#'])
    null = pd.DataFrame(df.isnull().sum(), columns=['null#'])
    tb = pd.concat([unique, null], axis=1)
    tb['observation#'] = df.shape[0]
    if tb['null#'].sum() > 0:
        tb = tb[['observation#', 'unique#', 'null#']].sort_values(by=['null#'], ascending=False)
    elif tb['unique#'].sum() != tb['observation#'].sum():
        tb = tb[['observation#', 'unique#', 'null#']].sort_values(by=['unique#'], ascending=True)
    else:
        tb = tb[['observation#', 'unique#', 'null#']].sort_index()
    return tb
unique_null(df)
The columns theme, currency_buyer, and crawl_month each contain only one unique value, so I drop these zero-variance columns.
df.drop(['theme','currency_buyer','crawl_month'], inplace=True,axis=1)
"has_urgency_banner" has a lot of missing values because there simply isn't a urgency flag on it, so we'll fillna with 0 in this case.
df.has_urgency_banner.head(10)
df.has_urgency_banner.fillna(0, inplace=True)
df[df['rating_five_count'].isnull()==True][['rating', 'rating_count',
'rating_five_count', 'rating_four_count', 'rating_three_count',
'rating_two_count', 'rating_one_count']]
It turns out that the rows with missing rating counts all have a rating_count of 0 and a rating of 5, which is odd: if nobody has submitted a rating for these products, where does the rating of 5 come from? I decided to fill the missing rating counts with 0.
for number in ['one', 'two', 'three', 'four', 'five']:
    column_name = 'rating_' + number + '_count'
    df[column_name].fillna(0, inplace=True)
unique_null(df)
df.urgency_text.head(10)
The missing values simply correspond to products that have no urgency text.
df.urgency_text.fillna('0', inplace=True)
df.merchant_profile_picture.fillna('unknown',inplace=True)
df.merchant_name.fillna('unknown',inplace=True)
df.merchant_info_subtitle.fillna('unknown',inplace=True)
df.product_color.value_counts()
df.origin_country.value_counts()
df.product_variation_size_id.value_counts()
df.origin_country.fillna('CN', inplace=True)
df.product_color.fillna('black', inplace=True)
df.product_variation_size_id.fillna('S', inplace=True)
unique_null(df)
There are no more missing values, but there are duplicate product IDs.
df = df.drop_duplicates(subset='product_id').reset_index(drop=True)
unique_null(df)
Next, look at ratings versus units_sold.
plt.figure(figsize=(28, 12))
rating_cols = ['rating_one_count', 'rating_two_count', 'rating_three_count',
               'rating_four_count', 'rating_five_count', 'rating_count', 'rating']
for i, col in enumerate(rating_cols, start=1):
    ax = plt.subplot(1, 7, i)
    sns.scatterplot(x=col, y="units_sold", data=df, ax=ax);
df1 = pd.DataFrame(df, columns = ['rating', 'rating_count','rating_five_count','rating_four_count','rating_three_count','rating_two_count',
'rating_one_count','units_sold'])
fig = px.scatter(df1, x="rating", y="units_sold",color="rating_count",
size='units_sold')
fig.show()
df1.corr()
The five individual rating counts and the total rating count are all strongly associated with units_sold. However, they are also highly correlated with one another, so I will keep only rating and rating_count.
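As a quick visual check of this multicollinearity, the correlation matrix of df1 can be drawn as a heatmap (an illustrative sketch using the seaborn and matplotlib objects already imported):
# heatmap of pairwise correlations among the rating columns and units_sold
plt.figure(figsize=(10, 8))
sns.heatmap(df1.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation among rating variables and units_sold")
plt.show()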
fig = px.scatter(df, x="rating_count", y="price",color="units_sold",
size='units_sold')
fig.show()
print(df.price.describe(percentiles = [0.25,0.50,0.75,0.85,0.90,1]))
Products with high rating counts and prices below 10 euros have the highest sales.
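One way to back up this observation is to bucket prices and compare average units sold per bucket. The bin edges below are an assumption chosen for illustration, not part of the original analysis:
# hypothetical price buckets (in euros); compare average units sold per bucket
price_bins = pd.cut(df['price'], bins=[0, 5, 10, 20, np.inf], include_lowest=True)
print(df.groupby(price_bins)['units_sold'].agg(['mean', 'count']))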
df["discount"] = df["retail_price"]-df["price"]
plt.figure(figsize=(16, 8))
ax = plt.subplot(1,3,1)
sns.scatterplot(x="discount", y="units_sold", data=df, ax= ax);
ax = plt.subplot(1,3,2)
sns.scatterplot(x="discount", y="rating_five_count", data=df, ax= ax);
ax = plt.subplot(1,3,3)
sns.scatterplot(x="rating_count", y="price", data=df, ax= ax);
It's surprising to note that discount is not an important factor for high sales or good ratings.
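A quick numeric check of the same point (a sketch; correlations near zero would support what the scatter plots show):
# correlation of discount with sales and ratings
print(df[['discount', 'units_sold', 'rating', 'rating_five_count']].corr()['discount'])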
#check outliers
plt.figure(figsize=(16, 8))
ax = plt.subplot(1,3,1)
sns.boxplot(y = df.units_sold, ax= ax);
ax = plt.subplot(1,3,2)
sns.boxplot(y = df.price, ax= ax);
ax = plt.subplot(1,3,3)
sns.boxplot(y = df.retail_price, ax= ax);
plt.figure(figsize=(16, 8))
ax = plt.subplot(1,3,1)
sns.boxplot(y = df.rating, ax= ax);
ax = plt.subplot(1,3,2)
sns.boxplot(y = df.rating_count, ax= ax);
ax = plt.subplot(1,3,3)
sns.boxplot(y = df.merchant_rating_count, ax= ax);
def out_iqr(df, column):
    global lower, upper
    q25, q75 = np.quantile(df[column], 0.25), np.quantile(df[column], 0.75)
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    print('The IQR is', iqr)
    print('The lower bound value is', lower)
    print('The upper bound value is', upper)
    df1 = df[df[column] > upper]
    df2 = df[df[column] < lower]
    print('Total number of outliers are', df1.shape[0] + df2.shape[0])
out_iqr(df,'units_sold')
plt.figure(figsize = (10,6))
sns.distplot(df.units_sold, kde=False)
plt.axvspan(xmin = lower,xmax= df.units_sold.min(),alpha=0.2, color='red')
plt.axvspan(xmin = upper,xmax= df.units_sold.max(),alpha=0.2, color='red')
I am not going to remove the outliers because they appear to be valid entries that reflect true sales. I will use Mean Absolute Error, which is less sensitive to outliers, as the final metric.
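To illustrate why MAE is the safer choice when keeping outliers, here is a toy example with made-up residuals showing how a single extreme value inflates RMSE far more than MAE:
# toy residuals: the second array adds one extreme outlier
residuals_typical = np.array([100, 120, 90, 110])
residuals_outlier = np.array([100, 120, 90, 110, 5000])
for res in (residuals_typical, residuals_outlier):
    print('MAE:', np.mean(np.abs(res)), ' RMSE:', round(np.sqrt(np.mean(res**2)), 1))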
def replace_name(a, b):
    df.origin_country.replace(a, b, inplace=True)

replace_name('CN', "China")
replace_name("US", "United States of America")
replace_name("unknown", "unknown")
replace_name("VE", "Venezuela")
replace_name('GB', "Great Britain")
replace_name('SG', "Singapore")
replace_name('AT', "Austria")
labels = df.origin_country.value_counts(normalize=True).index.values
values = df.origin_country.value_counts().values
fig = go.Figure()
fig.add_trace(go.Pie(labels=labels, values=values))
fig.update_layout(title="Product origin country", legend_title="Country names", template="plotly_dark")
df[df.origin_country=="China"]['price'].describe()
About 75% of products coming from China are under 11 euros.
color_data=df['product_color'].value_counts().loc[lambda x : x>10]
color_data
labels = color_data.index.values
values = color_data.values
fig = go.Figure()
fig.add_trace(go.Pie(labels=labels, values=values))
fig.update_layout(title="Product color", legend_title="Colors", template="plotly_dark")
fig
Black and white are the most popular colors.
plt.figure(figsize=(6, 6))
sns.barplot(x='uses_ad_boosts',y='units_sold',data=df)
Interestingly, sellers who do not use ad boosts have higher sales.
plt.figure(figsize=(25, 8))
ax = plt.subplot(1,5,1)
sns.barplot(x='badge_local_product',y='units_sold',data=df,ax= ax);
ax = plt.subplot(1,5,2)
sns.barplot(x='badge_product_quality',y='units_sold',data=df,ax= ax);
ax = plt.subplot(1,5,3)
sns.barplot(x='badge_fast_shipping',y='units_sold',data=df,ax= ax);
ax = plt.subplot(1,5,4)
sns.barplot(x='shipping_is_express',y='units_sold',data=df,ax= ax);
ax = plt.subplot(1,5,5)
sns.barplot(x='has_urgency_banner',y='units_sold',data=df,ax= ax);
Sellers with a product quality badge have higher sales, while products without an urgency banner, express shipping, a local product badge, or a fast shipping badge also sell better.
prices_by_country = df[['price','discount','retail_price','origin_country']].groupby('origin_country').mean()
fig = go.Figure()
fig.add_trace(go.Bar(x=prices_by_country.index.values, y=prices_by_country.price, name="Price"))
fig.add_trace(go.Scatter(x=prices_by_country.index.values, y=prices_by_country.discount, name="Discount"))
fig.add_trace(go.Bar(x=prices_by_country.index.values, y=prices_by_country.retail_price, name="Retail Price"))
fig.update_layout(title="Prices Categories By Country", xaxis_title="Countries", yaxis_title="Discount", template="plotly_dark", legend_title="Legend")
U.S. products have the highest average discount.
df['shipping_option_name'].value_counts()
livraison_prices = df[df.shipping_option_name == 'Livraison standard']['shipping_option_price'].value_counts().index.values
livraison_prices_frequency = df[df.shipping_option_name == 'Livraison standard']['shipping_option_price'].value_counts().values
fig = go.Figure()
fig.add_trace(go.Pie(labels=livraison_prices, values=livraison_prices_frequency))
fig.update_layout(title="Livraison Standard Prices", legend_title="Prices In Euros", template="plotly_dark")
Livraison standard is by far the most popular shipping option, and most shipping prices fall between 1 and 3 euros.
# description of the tags
plt.rcParams['figure.figsize'] = (10,10)
plt.style.use('fast')
wc = WordCloud(background_color = 'orange', width = 1500, height = 1500).generate(' '.join(df['tags'].astype(str)))
plt.title('Description of the Tag', fontsize = 20)
plt.imshow(wc)
plt.axis('off')
plt.show()
plt.rcParams['figure.figsize'] = (10,10)
plt.style.use('fast')
wc = WordCloud(background_color = 'green', width = 1500, height = 1500).generate(' '.join(df['title_orig'].astype(str)))
plt.title('Description of the Product Titles', fontsize = 20)
plt.imshow(wc)
plt.axis('off')
plt.show()
Add a tag_count column to the dataframe.
def tag_count(tags):
    # number of comma-separated tags in the tags string
    prod_tags = tags.split(',')
    return len(prod_tags)
df['tag_count'] = df['tags'].apply(tag_count)
df.head(2)
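As a side note, the same count can be obtained with pandas string methods instead of a custom function; a quick equivalence check (sketch):
# vectorized equivalent of the custom tag_count function above
print(df['tags'].str.split(',').str.len().equals(df['tag_count']))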
Explore the product-related columns.
product_cat_columns = df.loc[:, df.columns.str.startswith("product")].columns.values
df[product_cat_columns].head()
df.drop(['product_picture','product_url'], inplace=True, axis=1)
df_products = df[['tags', 'price','discount','uses_ad_boosts', 'units_sold', 'rating','rating_count', 'product_id','badges_count', 'badge_product_quality','merchant_rating']].copy().sort_values(['units_sold','badges_count'], ascending=False)
products_by_id = df_products.set_index('product_id')
The top six products each sold around 100k units, while the others are at 50k or below, a massive difference.
# Top 10 products sold for women
df_products.loc[df_products.tags.str.contains('[Ww]omen')].head(10).index
# Top 10 products in general
df_products.head(10).index
The indices are the same for the general list and the women-specific list, so the top buyers on Wish are women or people shopping for women's wear.
A list of the top 10 items:
df[['title', 'units_sold','price','product_color','origin_country','rating','rating_count','merchant_rating_count']].sort_values(by = 'units_sold',
ascending = False).head(10)
scaler = MinMaxScaler()
plot_data = products_by_id.copy()
plot_data.iloc[:,1:] = scaler.fit_transform(plot_data.iloc[:,1:])
fig = go.Figure()
fig.add_trace(go.Bar(x=plot_data.head(10).index.values,y=plot_data.head(10).units_sold,name="Units Sold" ))
fig.add_trace(go.Scatter(x=plot_data.head(10).index.values,y=plot_data.head(10).price, mode="lines+markers", name="Price" ))
fig.add_trace(go.Scatter(x=plot_data.head(10).index.values,y=plot_data.head(10).rating_count,mode="lines+markers",name="Product rating counts" ))
fig.add_trace(go.Scatter(x=plot_data.head(10).index.values,y=plot_data.head(10).rating,mode="lines+markers",name="Product rating" ))
fig.add_trace(go.Scatter(x=plot_data.head(10).index.values,y=plot_data.head(10).merchant_rating,mode="lines+markers",name="Merchant rating" ))
fig.update_layout(title="Top 10 Products Sold", legend_title="Features")
Check the correlation between units_sold and three categorical variables.
#use one hot encoding to change categorical variables to dummy variable
dummies_color = pd.get_dummies(df['product_color'], drop_first=True)
dummies_variation = pd.get_dummies(df['product_variation_size_id'])
dummies_origin = pd.get_dummies(df['origin_country'])
feat_onehot = pd.concat([dummies_color, dummies_variation, dummies_origin, df['units_sold']], axis=1)
feat_onehot.head(1)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
feat_onehot_corr = feat_onehot.corr()
feat_onehot_corr['units_sold'].sort_values(ascending=False).head(6)
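Before dropping these columns in the next step, a complementary sanity check is to compare average sales across the most common levels of each categorical variable (sketch):
# average units sold for the five most frequent levels of each categorical column
for col in ['product_color', 'product_variation_size_id', 'origin_country']:
    top_levels = df[col].value_counts().head(5).index
    print(df[df[col].isin(top_levels)].groupby(col)['units_sold'].mean().sort_values(ascending=False), '\n')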
df.drop(labels = ['product_color', 'product_variation_size_id', 'origin_country'],
axis=1,
inplace=True)
The correlations between units_sold and the three categorical variables (color, size, origin country) are not very high, so they will not be included in the model. Based on the EDA, I also drop some other unimportant variables.
df.drop(labels = ['tags','title', 'title_orig', 'urgency_text', 'merchant_title',
'merchant_name','merchant_info_subtitle','merchant_id',
'product_id','merchant_profile_picture',
'shipping_option_name','rating_five_count',
'rating_four_count','rating_three_count',
'rating_two_count','rating_one_count','discount'], axis=1, inplace=True)
df.head(2)
The data is now ready for modeling.
y=df.pop('units_sold')
X=df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
scaler_x = MinMaxScaler(feature_range=(0,1))
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test, y_pred))}')
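The same three metrics are printed for every model below, so a small helper could cut the repetition. report_metrics is a hypothetical name and is not used in the rest of the notebook; the sketch only wraps the metric functions already imported:
def report_metrics(y_true, y_pred, label=''):
    # print R squared, MAE and RMSE for one set of predictions
    print(f'{label} R squared: {r2_score(y_true, y_pred)}')
    print(f'{label} Mean absolute error: {mean_absolute_error(y_true, y_pred)}')
    print(f'{label} Root mean squared error: {np.sqrt(mean_squared_error(y_true, y_pred))}')

report_metrics(y_test, y_pred, label='Linear Regression')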
# Visualize the predictions (in blue) against the actual values (in red)
plt.figure(figsize=(8, 8))
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(y_pred, hist=False, color='b',label='prediction', ax=ax1).set_title('Linear Regression')
X_1 = sm.add_constant(X)
model = sm.OLS(y,X_1).fit()
#Backward Elimination
cols = list(X.columns)
pmax = 1
while len(cols) > 0:
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y, X_1).fit()
    p = pd.Series(model.pvalues.values[1:], index=cols)
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if pmax > 0.05:
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
print(selected_features_BE)
X_backward_elimination = X[['retail_price', 'uses_ad_boosts', 'rating_count', 'badge_fast_shipping', 'merchant_rating_count', 'tag_count']]
#refit the model using variables selected by backward elimination
X_2 = sm.add_constant(X_backward_elimination)
#Fitting sm.OLS model
model = sm.OLS(y,X_2).fit()
model.summary()
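Since variance_inflation_factor is already imported, a quick multicollinearity check on the backward-elimination features can complement the p-value screen (a sketch; VIF values well above 5-10 would signal a problem):
# VIF for each selected feature (constant added, then skipped in interpretation)
X_vif = sm.add_constant(X_backward_elimination)
vif = pd.DataFrame({'feature': X_vif.columns,
                    'VIF': [variance_inflation_factor(X_vif.values, i)
                            for i in range(X_vif.shape[1])]})
print(vif)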
#no of features to try
nof_list = np.arange(1, 13)
high_score = 0
#variable to store the optimum number of features
nof = 0
score_list = []
for n in range(len(nof_list)):
    model = XGBRegressor()
    rfe = RFE(model, n_features_to_select=nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" % nof)
print("Score with %d features: %f" % (nof, high_score))
cols = list(X.columns)
model = LinearRegression()
#Initializing RFE model
rfe = RFE(model, n_features_to_select=9)
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)
#Fitting the data to model
model.fit(X_rfe,y)
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)
from sklearn.linear_model import LassoCV
reg = LassoCV(random_state=random_seed)
reg.fit(X, y)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X,y))
coef = pd.Series(reg.coef_, index = X.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
imp_coef = coef.sort_values()
matplotlib.rcParams['figure.figsize'] = (8,8)
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model")
#ridge
Ridge = linear_model.Ridge(random_state=random_seed)
Ridge.fit(X_train,y_train)
ridge_pred= Ridge.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,ridge_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,ridge_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,ridge_pred))}')
params_Ridge = {'alpha': np.array([0.01,0.1,1,5,10,15,20,25,30,35,40,45,50])}
Ridge_GS = GridSearchCV(Ridge, param_grid=params_Ridge)
Ridge_GS.fit(X_train,y_train)
Ridge_GS.best_params_
pred_Ridge_GS = Ridge_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_Ridge_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_Ridge_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_Ridge_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(ridge_pred, hist=False, color='b',label='prediction', ax=ax1).set_title('Ridge Regression');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_Ridge_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('Ridge Regression After Parameter Tuning');
Lasso = linear_model.Lasso(alpha=0.01, random_state=random_seed)
Lasso.fit(X_train,y_train)
lasso_pred = Lasso.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,lasso_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,lasso_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,lasso_pred))}')
params_Lasso = {'alpha': np.array([0.01,0.1,1,5,10,15,20,25,30,35,40,45,50,60,70,80,100])}
Lasso_GS = GridSearchCV(Lasso, param_grid=params_Lasso)
Lasso_GS.fit(X_train,y_train)
Lasso_GS.best_params_
pred_Lasso_GS = Lasso_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_Lasso_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_Lasso_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_Lasso_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(lasso_pred, hist=False, color='b',label='prediction', ax=ax1).set_title('Lasso Regression');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_Lasso_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('Lasso Regression After Parameter Tuning');
EN = linear_model.ElasticNet(random_state=random_seed)
EN.fit(X_train,y_train)
pred_EN = EN.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_EN)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_EN)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_EN))}')
params_EN_RS = {'alpha':np.array([0.0001,0.001,0.01,0.1,1,5,10,15,20,25,30,35,40,45,50]),
'l1_ratio':uniform(0.0001,1) }
EN_RS = RandomizedSearchCV(linear_model.ElasticNet(), param_distributions=params_EN_RS,n_iter=100)
EN_RS.fit(X_train,y_train)
EN_RS.best_params_
pred_EN_RS = EN_RS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_EN_RS )}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_EN_RS )}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_EN_RS ))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_EN, hist=False, color='b',label='prediction', ax=ax1).set_title('Elastic Net');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_EN_RS, hist=False, color='b',label='prediction', ax=ax2).set_title('Elastic Net After Parameter Tuning');
knnr = KNeighborsRegressor()
knnr.fit(X_train,y_train)
pred_knnr = knnr.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_knnr)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_knnr)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_knnr))}')
params_knn = {'n_neighbors':[5,6,7,8,9,10],
'leaf_size':[1,2,3,5],
'weights':['uniform', 'distance'],
'algorithm':['auto', 'ball_tree','kd_tree','brute']}
model_knn1 = GridSearchCV(knnr, param_grid=params_knn)
model_knn1.fit(X_train,y_train)
model_knn1.best_params_
pred_knnr_GS = model_knn1.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_knnr_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_knnr_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_knnr_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_knnr, hist=False, color='b',label='prediction', ax=ax1).set_title('KNN Regressor');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_knnr_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('KNN Regressor After Parameter Tuning');
DTR = DecisionTreeRegressor(max_depth=5,random_state=random_seed)
DTR.fit(X_train,y_train)
Pred_DTR = DTR.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,Pred_DTR)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,Pred_DTR)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,Pred_DTR))}')
params = {'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15],
'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11],
'max_depth':[2,3,4,5,6,7,8]}
DTR_GS = GridSearchCV(DTR, param_grid=params)
DTR_GS.fit(X_train,y_train)
DTR_GS.best_params_
pred_DTR_GS = DTR_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_DTR_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_DTR_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_DTR_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(Pred_DTR, hist=False, color='b',label='prediction', ax=ax1).set_title('Decision Tree');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_DTR_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('Decision Tree After Parameter Tuning');
baggingR = BaggingRegressor(random_state=random_seed)
baggingR.fit(X_train,y_train)
bag_test_pred = baggingR.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,bag_test_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,bag_test_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,bag_test_pred))}')
params_bag_GS = {"n_estimators": [1,2,5,10],
"max_features":[0.5,1],
"max_samples": [0.1,0.5,1],
"bootstrap": [True, False],
"bootstrap_features": [True, False]}
Bag_model_GS = GridSearchCV(baggingR, param_grid=params_bag_GS)
Bag_model_GS.fit(X_train,y_train)
Bag_model_GS.best_params_
pred_bag_GS = Bag_model_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_bag_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_bag_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_bag_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(bag_test_pred, hist=False, color='b',label='prediction', ax=ax1).set_title('Bagging');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_bag_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('Bagging After Parameter Tuning');
rfr = RandomForestRegressor(random_state=random_seed)
rfr.fit(X_train,y_train)
rfr_test_pred = rfr.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,rfr_test_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,rfr_test_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,rfr_test_pred))}')
params_RF = {"max_depth": [3,5,6,7,8,9],
"max_features":['auto', 'sqrt', 'log2'],
"min_samples_split": [2, 3,5,7],
"min_samples_leaf": [1, 3,5,6]}
model_RF_GS = GridSearchCV(rfr, param_grid=params_RF)
model_RF_GS.fit(X_train,y_train)
model_RF_GS.best_params_
pred_RF_GS = model_RF_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_RF_GS )}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_RF_GS )}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_RF_GS ))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(rfr_test_pred, hist=False, color='b',label='prediction', ax=ax1).set_title('Random Forest');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_RF_GS , hist=False, color='b',label='prediction', ax=ax2).set_title('Random Forest After Parameter Tuning');
AdaBoost = AdaBoostRegressor(random_state=random_seed)
AdaBoost.fit(X_train,y_train)
AdaBoost_test_pred = AdaBoost.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,AdaBoost_test_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,AdaBoost_test_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,AdaBoost_test_pred))}')
params_AdbR_GS = {'learning_rate':[0.05,0.1,0.2,0.6,0.8,1],
'n_estimators': [50,60,100],
'loss' : ['linear', 'square', 'exponential']}
model_AdaR_GS = GridSearchCV(AdaBoostRegressor(), param_grid=params_AdbR_GS)
model_AdaR_GS.fit(X_train,y_train)
model_AdaR_GS.best_params_
pred_AdaR_GS = model_AdaR_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_AdaR_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_AdaR_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_AdaR_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(AdaBoost_test_pred, hist=False, color='b',label='prediction', ax=ax1).set_title('Ada Boosting');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_AdaR_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('Ada Boosting After Parameter Tuning');
GBR = GradientBoostingRegressor(random_state=random_seed)
GBR.fit(X_train,y_train)
GBR_test_pred = GBR.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,GBR_test_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,GBR_test_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,GBR_test_pred))}')
params_GBR_GS = {"max_depth": [3,5,6,7],
"max_features":['auto', 'sqrt', 'log2'],
"min_samples_split": [2, 3, 10],
"min_samples_leaf": [1, 3, 10],
'learning_rate':[0.05,0.1,0.2],
'n_estimators': [10,30,50,70]}
model_GradR2_GS = GridSearchCV(GradientBoostingRegressor(), param_grid=params_GBR_GS)
model_GradR2_GS.fit(X_train,y_train)
model_GradR2_GS.best_params_
pred_GradR_GS = model_GradR2_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_GradR_GS )}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_GradR_GS )}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_GradR_GS ))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(GBR_test_pred, hist=False, color='b',label='prediction', ax=ax1).set_title('Gradient Boosting');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_GradR_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('Gradient Boosting After Parameter Tuning');
xgbr = XGBRegressor(random_state=random_seed)
xgbr.fit(X_train,y_train)
pred_xgbr = xgbr.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_xgbr)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_xgbr )}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_xgbr ))}')
params_xgbR_GS = {"max_depth": [3,4,5,6,7,8],
"min_child_weight" : [4,5,6,7,8],
'learning_rate':[0.05,0.1,0.2,0.25,0.8,1],
'n_estimators': [10,30,50,70,80,100]}
model_xgbR_GS = GridSearchCV(XGBRegressor(), param_grid=params_xgbR_GS)
model_xgbR_GS.fit(X_train,y_train)
model_xgbR_GS.best_params_
pred_xgbR_GS = model_xgbR_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_xgbR_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_xgbR_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_xgbR_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_xgbr, hist=False, color='b',label='prediction', ax=ax1).set_title('XgBoost');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_xgbR_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('XgBoost After Parameter Tuning');
svr= SVR(C=1, cache_size=500, epsilon=1, kernel='linear')
svr.fit(X_train, y_train)
pred_svr = svr.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_svr)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_svr)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_svr))}')
params_svr_GS ={"gamma" : ['auto', 'scale'],
"C" : [0.1, 0.5, 1, 50, 100, 1000],
"epsilon" : [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
estimator = SVR(kernel='linear', gamma='auto')
svr_GS = GridSearchCV(estimator, params_svr_GS)
svr_GS.fit(X_train, y_train)
svr_GS.best_params_
pred_svr_GS = svr_GS.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,pred_svr_GS)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,pred_svr_GS)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,pred_svr_GS))}')
plt.figure(figsize=(14, 6))
ax1 = plt.subplot(1,2,1)
sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_svr, hist=False, color='b',label='prediction', ax=ax1).set_title('SVR');
ax2 = plt.subplot(1,2,2)
ax1 = sns.distplot(y_test, hist=False, color='r', label='actual')
sns_plot = sns.distplot(pred_svr_GS, hist=False, color='b',label='prediction', ax=ax2).set_title('SVR After Parameter Tuning');
Regressors=['Linear','Ridge','Lasso','ElasticNet','KNN','Decision Tree','Bagging','RF','AdaBoost','GradientB','XgBoost','SVR']
mae=[1881,1921,1881,1907,3492,1703,1442,1478,1653,1454,1438,2305]
df = pd.DataFrame({"Regressors":Regressors,
"Mean Absolute Errors":mae})
plt.figure(figsize=(12,12))
# make barplot and sort bars
sns.barplot(x='Regressors',
y="Mean Absolute Errors",
data=df,
order=df.sort_values('Mean Absolute Errors').Regressors)
# set labels
plt.xlabel("Regressor Names", size=15)
plt.ylabel("Mean Absolute Errors", size=15)
plt.title("Mean absolute errors for different regressors in ascending order", size=18)
mod1 = XGBRegressor(random_state=random_seed)
mod2 = BaggingRegressor(random_state=random_seed)
mod3 = GradientBoostingRegressor(random_state=random_seed)
mod4 = RandomForestRegressor(random_state=random_seed)
mod5 = XGBRegressor(random_state=random_seed)
sr = StackingRegressor(regressors=[mod1, mod2,mod3,mod4,mod5],
meta_regressor=mod5)
sr.fit(X_train,y_train)
sr_pred = sr.predict(X_test)
print(f'R squared: {metrics.r2_score(y_test,sr_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,sr_pred)}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test,sr_pred))}')
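For reference, recent scikit-learn versions (0.22+) provide their own StackingRegressor, which could replace the mlxtend one; a sketch under that assumption, stacking the same kinds of base learners:
from sklearn.ensemble import StackingRegressor as SkStackingRegressor

# native scikit-learn stack with an XGBoost meta-learner
sk_stack = SkStackingRegressor(
    estimators=[('xgb', XGBRegressor(random_state=random_seed)),
                ('bag', BaggingRegressor(random_state=random_seed)),
                ('gbr', GradientBoostingRegressor(random_state=random_seed)),
                ('rf', RandomForestRegressor(random_state=random_seed))],
    final_estimator=XGBRegressor(random_state=random_seed))
sk_stack.fit(X_train, y_train)
print(f'Mean absolute error: {mean_absolute_error(y_test, sk_stack.predict(X_test))}')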
Some interesting findings from the dataset: