The dataset can be found here: Adult income database
Many learning algorithms were designed assuming well-balanced class distributions, i.e. no significant differences in class prior probabilities. However, this is not always the case in real-world data, since one class might be represented by a large number of examples while the others are represented by only a few, and this is the case for this dataset. The target variable "income" has an imbalanced distribution, with about 25% being ">50K" and 75% being "<=50K".
Solutions to imbalanced data fall into two main types: data-level methods and algorithm-level methods.
Data level methods
Data level methods consist of balancing classes by resampling the original data set, such that under-represented classes are over-sampled, and over-represented classes are under-sampled.
Oversampling(heuristic method)
There are two approaches supported for generating new data points, Synthetic Minority Over-sampling Technique (SMOTE) and Adaptive Synthetic Sampling (ADASYN). Both techniques use interpolation to generate new datapoints. To be specific, SMOTE is an over-sampling method with synthetic data generation. Its main idea is to form new minority class examples by interpolating between several examples from the minority class that lie together. And ADASYN uses a weighted distribution for different minority class examples according to their level of difficulty in learning, where more synthetic data is generated for minority class examples that are harder to learn compared to those minority examples that are easier to learn.
Random under-sampling(non-heuristic method)
It aims to balance the class distribution through the random elimination of majority-class examples. A common criticism is that under-sampling can discard data that is potentially important for learning.
Algorithmic level methods:
Idea: adapt existing algorithms and techniques to the special characteristics of imbalanced data.
These proposals include cost-sensitive learning, one-class classifiers, and ensembles of classifiers, among others.
#import packages
import os
from os import getcwd
getcwd()
import pandas as pd
import seaborn as sns
import numpy as np
import plotly as py
py.offline.init_notebook_mode(connected = True)
from plotly import graph_objs as go
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')
# Switch to the project folder so the relative CSV path below resolves.
# NOTE(review): hard-coded absolute Windows path; it only exists on the
# original author's machine.
os.chdir('C:\\Users\\linli\\Desktop\\In progress project\\Income Classification')
# Load the raw income data.
df = pd.read_csv('income_evaluation.csv')
# Report (rows, columns), then preview the first two records and the raw
# column labels.
print(df.shape)
df.head(2)
df.columns
The column names have a leading space, so rename them.
# Replace the raw header positionally with clean snake_case names.
df.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]
Replace the "?" in cells with NaN.
# Turn the ' ?' placeholder (note the leading space) into a real missing value.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
df.replace(' ?', np.nan, inplace=True)
# Count missing values per column and show only the columns that have any.
missing_val_count_by_column = df.isnull().sum()
missing_val_count_by_column[missing_val_count_by_column > 0]
#check how many uniques and nulls we got for all variables in a dataframe
def unique_null(df):
    """Summarise every column of *df* by row count, distinct values and nulls.

    Returns a DataFrame indexed by column name with the columns
    'observation#', 'unique#' and 'null#'. Row order depends on the data:
    most nulls first when any null exists; otherwise fewest unique values
    first; and alphabetical by column name when every column is fully unique.
    """
    distinct_counts = pd.DataFrame(df.nunique(), columns=['unique#'])
    null_counts = pd.DataFrame(df.isnull().sum(), columns=['null#'])
    summary = pd.concat([distinct_counts, null_counts], axis=1)
    summary['observation#'] = df.shape[0]
    summary = summary[['observation#', 'unique#', 'null#']]

    # Guard clauses pick the sort key that is most informative for this data.
    if summary['null#'].sum() > 0:
        return summary.sort_values(by=['null#'], ascending=False)
    if summary['unique#'].sum() != summary['observation#'].sum():
        return summary.sort_values(by=['unique#'], ascending=True)
    return summary.sort_index()
# Display the per-column observation / unique / null summary for the data.
unique_null(df)
Three attributes have missing values, let's take care of them one by one.
# Frequency of each occupation category.
df.occupation.value_counts()
The counts for the top few occupations are very close, so impute the missing values with "unknown" instead of the most frequent category.
# Assign each filled column back to df instead of calling
# fillna(inplace=True) on the attribute: the in-place form operates on an
# intermediate Series and is not guaranteed to update df itself.
df['occupation'] = df['occupation'].fillna('unknown')
df.workclass.value_counts()
# The fill labels keep the leading space used by the raw category values.
df['workclass'] = df['workclass'].fillna(' Private')
df.workclass.value_counts()
df.native_country.value_counts()
df['native_country'] = df['native_country'].fillna(' United-States')
# Re-run the summary to check the null counts after imputation.
unique_null(df)
No more missing value!
# Visualize the frequency distribution of the income variable:
# a pie chart of class shares on the first subplot, raw counts on the other.
f,ax=plt.subplots(1,2,figsize=(10,6))
ax[0] = df['income'].value_counts().plot.pie(explode=[0,0],autopct='%1.1f%%',ax=ax[0],shadow=True)
ax[0].set_title('Income Share')
# NOTE(review): no ax= is passed here, so the count plot lands on whatever
# axes pyplot considers current — presumably the second subplot; confirm.
ax[1] = sns.countplot(x="income", data=df, palette="Set1")
ax[1].set_title("Frequency distribution of income variable")
plt.show()
# Pairwise plots of the columns, colored by income class.
sns.pairplot(df, hue = "income")
Numerical variables
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps corr() from failing on the string-valued categorical columns, and the
# matrix is computed once and reused for both the mask and the heatmap.
corr = df.select_dtypes(include='number').corr()
# Hide the upper triangle (including the diagonal); it mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Correlation Matrix HeatMap for Numerical Variables", fontweight="bold")
plt.show()
# Histograms of the numerical attributes.
p = df.hist(figsize=(14, 14))
# Joint distributions of selected numeric pairs.
sns.jointplot(x='age', y='hours_per_week', data=df, kind='hex', color='Green')
sns.jointplot(x='hours_per_week', y='education_num', data=df, kind='hex', color='blue')
# Mean weekly hours per occupation, highest first.
df1 = pd.DataFrame(df.groupby(['occupation'])['hours_per_week'].mean().sort_values(ascending=False))
df1.plot.bar(figsize=(8, 6))
plt.title('Occupation vs Hours per week')
plt.show()
Categorical variables
# Frequency charts for the categorical attributes. The chart type is passed
# as kind=...: Series.plot() does not accept it as a positional argument in
# current pandas.
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt1 = df.workclass.value_counts().plot(kind='bar')
plt.title('Workclass Histogram')
plt1.set(xlabel='Workclass', ylabel='Frequency of workclass')
plt.subplot(1, 2, 2)
plt1 = df.education.value_counts().plot(kind='bar')
plt.title('Education Histogram')
plt1.set(xlabel='Education', ylabel='Frequency of education')
plt.show()

plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt1 = df.marital_status.value_counts().plot(kind='bar')
plt.title('Marital status Histogram')
plt1.set(xlabel='Marital status', ylabel='Frequency of marital status')
plt.subplot(1, 2, 2)
plt1 = df.occupation.value_counts().plot(kind='bar')
plt.title('Occupation Histogram')
plt1.set(xlabel='Occupation', ylabel='Frequency of occupation')
plt.show()

# relationship, race
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt1 = df.relationship.value_counts().plot(kind='pie')
plt.title('Relationship Pie Chart')
plt.subplot(1, 2, 2)
plt1 = df.race.value_counts().plot(kind='pie')
plt.title('Race Pie Chart')
plt.show()
Numerical and categorical variables' relationships with target variable "income"
# Box plots of age and education years, split by income class.
plt.figure(figsize=(15,6))
plt.subplot(1,2,1)
plt.title('Age vs Income')
sns.boxplot(x=df.income, y=df.age, palette=("cubehelix"))
plt.subplot(1,2,2)
plt.title('Number of education years vs Income')
sns.boxplot(x=df.income, y=df.education_num, palette=("cubehelix"))
plt.show()
# Box plots of capital gain, capital loss and weekly hours, split by income class.
plt.figure(figsize=(15,6))
plt.subplot(1,3,1)
plt.title('Capital gain vs Income')
sns.boxplot(x=df.income, y=df.capital_gain, palette=("cubehelix"))
plt.subplot(1,3,2)
plt.title('Capital loss vs Income')
sns.boxplot(x=df.income, y=df.capital_loss, palette=("cubehelix"))
plt.subplot(1,3,3)
plt.title('Hours per week vs Income')
sns.boxplot(x=df.income, y=df.hours_per_week, palette=("cubehelix"))
plt.show()
# Income class counts broken down by sex (left) and by race (right).
plt.figure(figsize=(16,6))
plt.subplot(1,2,1)
plt.title('Income vs Sex')
sns.countplot(x="income", hue="sex", data=df)
plt.subplot(1,2,2)
plt.title('Income vs Race')
sns.countplot(x="income", hue="race", data=df)
plt.show()
# Age distribution per race (left) and per income class within race (right).
plt.figure(figsize=(16,6))
plt.subplot(1,2,1)
plt.title("Age wrt race")
sns.boxplot(x ='race', y="age", data = df)
plt.subplot(1,2,2)
plt.title("Age, race and income")
sns.boxplot(x="income", y="age", hue="race",
data=df, palette="Set2")
plt.show()
# Workclass counts, split by income class.
plt.figure(figsize=(10,6))
sns.countplot(y="workclass", hue="income", data=df)
The Private workclass has a much higher count than any other workclass in both income classes.
# Education level counts, split by income class.
plt.figure(figsize=(16,8))
sns.countplot(y="education",hue="income",data=df)
People with advanced degrees such as Masters, Doctorate, or Prof-school tend to have higher income. But an advanced degree is not a necessity for high income: compared with those groups, people whose highest education is a bachelor's degree, high school, or college have higher counts for ">50K".
# Category counts split by income class.
plt.figure(figsize=(12, 6))
sns.countplot(y="marital_status", hue="income", data=df)
plt.figure(figsize=(12, 6))
sns.countplot(y="relationship", hue="income", data=df)
plt.figure(figsize=(12, 8))
sns.countplot(y="income", hue="occupation", data=df, palette="Set3")

# For each numeric feature below: show the class means, then overlay the
# kernel density estimates of the two income classes. The y axis of a KDE is
# a density (not income), and the second legend entry is the " <=50K" class,
# so both labels are written accordingly.
df[df['income'] == ' <=50K']['age'].mean()
df[df['income'] == ' >50K']['age'].mean()
# Explore Age distribution
g = sns.kdeplot(df["age"][(df["income"] == ' >50K')], color="Red", shade=True)
g = sns.kdeplot(df["age"][(df["income"] == " <=50K")], ax=g, color="Blue", shade=True)
g.set_xlabel("Age")
g.set_ylabel("Density")
g = g.legend([">50K", "<=50K"])

df[df['income'] == ' <=50K']['hours_per_week'].mean()
df[df['income'] == ' >50K']['hours_per_week'].mean()
g = sns.kdeplot(df["hours_per_week"][(df["income"] == ' >50K')], color="Red", shade=True)
g = sns.kdeplot(df["hours_per_week"][(df["income"] == " <=50K")], ax=g, color="Blue", shade=True)
g.set_xlabel("Hours Per Week")
g.set_ylabel("Density")
g = g.legend([">50K", "<=50K"])

df[df['income'] == ' <=50K']['education_num'].mean()
df[df['income'] == ' >50K']['education_num'].mean()
g = sns.kdeplot(df['education_num'][(df["income"] == ' >50K')], color="Red", shade=True)
g = sns.kdeplot(df['education_num'][(df["income"] == " <=50K")], ax=g, color="Blue", shade=True)
g.set_xlabel("Year of education")
g.set_ylabel("Density")
g = g.legend([">50K", "<=50K"])
One hot encode categorical variables and scale numerical variables to [0,1].
# Defining the map function
def dummies(x,df):
temp = pd.get_dummies(df[x], drop_first = True)
df = pd.concat([df, temp], axis = 1)
df.drop([x], axis = 1, inplace = True)
return df
# One-hot encode every categorical predictor (first level dropped as baseline).
categorical_vars = ['workclass', 'education', 'marital_status', 'occupation',
                    'relationship', 'race', 'sex', 'native_country']
for column in categorical_vars:
    df = dummies(column, df)

from sklearn.preprocessing import MinMaxScaler
# Rescale the numeric predictors to [0, 1].
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set ranges influence the training features.
scaler = MinMaxScaler()
num_vars = ['age', 'education_num', 'hours_per_week', 'capital_gain', 'capital_loss', 'fnlwgt']
df[num_vars] = scaler.fit_transform(df[num_vars])

X = df.drop(['income'], axis=1)
# Encode the target as ONE 0/1 column: 1 is <=50K, 0 is >50K.
# pd.get_dummies(df.income) yields a two-column frame, which cannot be
# assigned to the single 'income' column; comparing the stripped label builds
# the intended indicator directly.
df['income'] = (df['income'].str.strip() == '<=50K').astype(int)
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# getting the shapes
print("Shape of x_train :", X_train.shape)
print("Shape of x_test :", X_test.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_test :", y_test.shape)
X_train.head(6)
Use RandomForestClassifier for feature selection.
# Build a 100-tree random forest with a fixed seed and fit it on the
# training split.
clf = RandomForestClassifier(random_state=0, n_estimators=100)
clf.fit(X_train, y_train)
# Pair each importance with its feature name and rank from most to least
# important.
feature_scores = pd.Series(data=clf.feature_importances_, index=X_train.columns)
feature_scores = feature_scores.sort_values(ascending=False)
feature_scores
Use K-means for clustering. When the inertia is plotted as a function of the number of clusters, it drops very quickly as the number of clusters increases up to 4, but decreases much more slowly beyond that. Thus I picked the inflexion point, the "elbow", at number of clusters = 4.
from sklearn.cluster import KMeans

# Features used for clustering. The 3-D scatter below plots these same three
# columns so that the data, the axis titles and the cluster labels agree.
cluster_features = ['fnlwgt', 'age', 'capital_gain']
X1 = df[cluster_features].values

# Elbow search: record the inertia for 1..10 clusters.
inertia = []
for n in range(1, 11):
    algorithm = KMeans(n_clusters=n, init='k-means++', n_init=10, max_iter=300,
                       tol=0.0001, random_state=111, algorithm='elkan')
    algorithm.fit(X1)
    inertia.append(algorithm.inertia_)
plt.figure(1, figsize=(15, 6))
plt.plot(np.arange(1, 11), inertia, 'o')
plt.plot(np.arange(1, 11), inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters'), plt.ylabel('Inertia')
plt.show()

# Final model with the chosen number of clusters.
algorithm = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300,
                   tol=0.0001, random_state=111, algorithm='elkan')
algorithm.fit(X1)
labels3 = algorithm.labels_
centroids3 = algorithm.cluster_centers_
df['label3'] = labels3

# Previously x/y/z were age / fnlwgt / education_num while the axes were
# titled fnlwgt / age / capital_gain; each axis now shows the column it names.
trace1 = go.Scatter3d(
    x=df['fnlwgt'],
    y=df['age'],
    z=df['capital_gain'],
    mode='markers',
    marker=dict(
        color=df['label3'],
        size=20,
        line=dict(
            color=df['label3'],
            width=12
        ),
        opacity=0.8
    )
)
data = [trace1]
layout = go.Layout(
    title='Clusters',
    scene=dict(
        xaxis=dict(title='fnlwgt'),
        yaxis=dict(title='age'),
        zaxis=dict(title='capital_gain')
    )
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)
The target variable "income" has only two categories in this dataset, either ">50K" or "<=50K", but in reality it can be divided into many more categories, for example, 40K,60K,80K,100K, 4 categories in total. But from the plot, we can see that there is no clear boundary for each cluster.
Simple classifiers, weighted classifiers, oversampling, undersampling
Build a pipeline for trying various sampling methods
from sklearn import metrics
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks, EditedNearestNeighbours
def _score_predictions(y_test, predictions):
    """Summarise hard-label predictions against y_test as a metrics dict."""
    accuracy = metrics.accuracy_score(y_test, predictions)
    precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, predictions)
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()
    # NOTE: the ROC curve is built from the predicted labels, not from
    # probability scores, so the reported AUC reflects a single threshold.
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions, pos_label=1)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall,
            'fscore': fscore, 'n_occurences': support,
            'predictions_count': Counter(predictions),
            'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn,
            'auc': metrics.auc(fpr, tpr)}


def _fit_and_score(model, X_fit, y_fit, X_test, y_test):
    """Fit model on (X_fit, y_fit) and score its predictions on the test split."""
    model.fit(X_fit, y_fit)
    return _score_predictions(y_test, model.predict(X_test))


def _resample(sampler, X, y):
    """Resample (X, y) with an imbalanced-learn sampler, whatever its version."""
    # fit_resample is the current method name; fit_sample is the older alias
    # that recent imbalanced-learn releases no longer provide.
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    return resample(X, y)


def _evaluate_samplers(samplers, model, X_train, X_test, y_train, y_test):
    """Refit and score model once per sampler; return {technique name: metrics}."""
    scores = {}
    for sampler in samplers:
        technique = sampler.__class__.__name__
        if technique == 'NearMiss':
            # NearMiss appears once per version; keep the result keys distinct.
            technique += str(sampler.version)
        print(f'Technique: {technique}')
        print(f'Before resampling: {sorted(Counter(y_train).items())}')
        X_resampled, y_resampled = _resample(sampler, X_train, y_train)
        print(f'After resampling: {sorted(Counter(y_resampled).items())}')
        scores[technique] = _fit_and_score(model, X_resampled, y_resampled, X_test, y_test)
    return scores


def model_resampling_pipeline(X_train, X_test, y_train, y_test, model):
    """Compare one classifier across several class-imbalance strategies.

    The model is refitted in place for each strategy and always scored on the
    untouched test split.

    Parameters:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for every score.
        model: estimator exposing fit, predict, get_params and set_params.

    Returns:
        dict with four keys:
          'ordinary'     - metrics without any balancing
          'class_weight' - metrics with class_weight='balanced', or {} when
                           the model has no such parameter
          'oversample'   - {technique name: metrics}
          'undersample'  - {technique name: metrics}
        Each metrics dict holds accuracy, per-class precision / recall /
        fscore / n_occurences, predictions_count, tp / tn / fp / fn and auc.
    """
    results = {'ordinary': {},
               'class_weight': {},
               'oversample': {},
               'undersample': {}}

    # ------ No balancing ------
    results['ordinary'] = _fit_and_score(model, X_train, y_train, X_test, y_test)

    # ------ Class weight ------
    params = model.get_params()
    if 'class_weight' in params:
        original_class_weight = params['class_weight']
        model.set_params(class_weight='balanced')
        try:
            results['class_weight'] = _fit_and_score(model, X_train, y_train, X_test, y_test)
        finally:
            # Restore the caller's setting so the resampling runs below measure
            # resampling alone rather than resampling plus class weighting.
            model.set_params(class_weight=original_class_weight)

    # ------------ OVERSAMPLING TECHNIQUES ------------
    print('------ Oversampling methods ------')
    results['oversample'] = _evaluate_samplers(
        [RandomOverSampler(), SMOTE(), ADASYN()],
        model, X_train, X_test, y_train, y_test)

    # ------------ UNDERSAMPLING TECHNIQUES ------------
    print('------ Undersampling methods ------')
    results['undersample'] = _evaluate_samplers(
        [RandomUnderSampler(),
         NearMiss(version=1),
         NearMiss(version=2),
         TomekLinks(),
         EditedNearestNeighbours()],
        model, X_train, X_test, y_train, y_test)

    return results
Build tool to visualize results
def evaluate_method(results, method, metrics=('precision', 'recall', 'fscore')):
    """Plot per-class metrics and AUC for one family of resampling techniques.

    Two subplots are drawn per metric (class index 0 titled '>50K', class
    index 1 titled '<=50K'), followed by one subplot for AUC. In every
    subplot the un-resampled baseline is a horizontal line, the class-weight
    run (when present) is the bar at x=0, and each technique in
    results[method] is a bar at x=1, 2, ...

    Parameters:
        results: dict as returned by model_resampling_pipeline.
        method: key of results to plot, e.g. 'oversample' or 'undersample'.
        metrics: names of per-class metrics to plot; the subplot count adapts
            to its length.
    """
    class_titles = ('>50K', '<=50K')
    n_axes = 2 * len(metrics) + 1
    # squeeze=False keeps a 2-D array of axes even for a single subplot.
    fig, grid = plt.subplots(1, n_axes, sharey=True, figsize=(16, 6), squeeze=False)
    ax = grid[0]
    has_class_weight = bool(results['class_weight'])

    for i, metric in enumerate(metrics):
        for class_idx, class_title in enumerate(class_titles):
            axis = ax[i * 2 + class_idx]
            axis.axhline(results['ordinary'][metric][class_idx], label='No Resampling')
            if has_class_weight:
                axis.bar(0, results['class_weight'][metric][class_idx], label='Adjust Class Weight')
            for j, (technique, result) in enumerate(results[method].items()):
                axis.bar(j + 1, result[metric][class_idx], label=technique)
            axis.set_title(f'{class_title}: \n{metric}')

    # AUC vis
    auc_axis = ax[-1]
    auc_axis.set_title('Area under curve')
    auc_axis.axhline(results['ordinary']['auc'], label='No Resampling')
    if has_class_weight:
        auc_axis.bar(0, results['class_weight']['auc'], label='Adjust Class Weight')
    for j, (technique, result) in enumerate(results[method].items()):
        auc_axis.bar(j + 1, result['auc'], label=technique)

    # Draw the legend once, after every artist exists and regardless of
    # whether a class-weight run is available; it is anchored in units of the
    # first subplot's width so it sits to the right of the last subplot.
    ax[0].legend(loc='upper center', bbox_to_anchor=(n_axes + 2, 1.01),
                 ncol=1, fancybox=True, shadow=True)
# Classifiers to benchmark, in the order their result figures are produced.
candidate_models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    KNeighborsClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    MLPClassifier(hidden_layer_sizes=(50, 50), activation='relu', solver='lbfgs'),
]

# For every classifier: run the full resampling comparison, then draw the
# oversampling figure followed by the undersampling figure.
for model in candidate_models:
    results = model_resampling_pipeline(X_train, X_test, y_train, y_test, model)
    for method in ('oversample', 'undersample'):
        evaluate_method(results, method)