In this HR analytics problem, we try to understand what causes the employees of a big company to leave prematurely, and we build predictive models of which employees are likely to leave next. With the help of actionable insights and accurate predictions, companies can take steps to keep their talent happy and engaged, and can rethink their retention strategies to stop employees from leaving.
Based on our analysis, we found that the Random Forests model outperformed all the other models in terms of predictive accuracy and AUC score. It is built on the technique of ensemble learning, where several base estimators are trained independently and their predictions are averaged. This reduces variance and improves generalizability and robustness over a single estimator. Furthermore, we obtain an importance score for each feature used in the classification process, which helps in interpreting the model's results. In our case, the model gave the most weight to Satisfaction Level, Time Spent at the Company, and Number of Projects. These attributes align with intuition, which gives us additional confidence in the model's predictions.
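To make the averaging idea concrete, here is a minimal, self-contained sketch of bagging a handful of decision trees and majority-voting their predictions. It runs on synthetic data from scikit-learn's make_classification rather than on our HR dataset, and the names X_toy, y_toy, and rng are illustrative only; it shows the mechanism and is not part of the analysis pipeline.
#Illustrative sketch only: bag several decision trees on synthetic data and
#average (majority-vote) their predictions, as Random Forests do internally.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_toy, y_toy = make_classification(n_samples=500, n_features=8, random_state=5)
rng = np.random.RandomState(5)
all_preds = []
for _ in range(25):
    #Each tree sees a different bootstrap sample (rows drawn with replacement)
    idx = rng.randint(0, len(X_toy), len(X_toy))
    tree = DecisionTreeClassifier(random_state=5).fit(X_toy[idx], y_toy[idx])
    all_preds.append(tree.predict(X_toy))
#Averaging the 25 votes smooths out the quirks of any single tree (lower variance)
ensemble_pred = (np.mean(all_preds, axis=0) >= 0.5).astype(int)
print("Ensemble accuracy on the toy data: %.3f" % (ensemble_pred == y_toy).mean())
Random Forests add one more ingredient on top of plain bagging: each split also considers only a random subset of the features, which further decorrelates the individual trees.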
We'll start by loading in all the required packages and defining a few helper functions that we'll use later on while performing machine learning.
from __future__ import division
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, validation_curve, learning_curve
import seaborn as sns
#sns.set(style="white", context="talk")
sns.set(style="whitegrid")
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus
# Some helper functions for performing machine learning
def classify(grid, X_train, y_train, X_test, y_test):
    results = dict()
    #Training the model using grid search & cross validation
    start_time = time.time()
    grid.fit(X_train, y_train)
    end_time = time.time() - start_time
    results['training_time'] = end_time
    #Testing the model on the held out test data set
    start_time = time.time()
    grid_test = grid.predict(X_test)
    end_time = time.time() - start_time
    results['testing_time'] = end_time
    results['accuracy'] = metrics.accuracy_score(y_test, grid_test)
    results['report'] = metrics.classification_report(y_test, grid_test)
    results['matrix'] = metrics.confusion_matrix(y_test, grid_test)
    results['grid'] = grid
    results['grid_test'] = grid_test
    return(results)
def get_feature_importances(grid, X_test):
    #Returns a dataframe with feature importance info
    ls = list()
    for a, b in enumerate(grid.best_estimator_.feature_importances_):
        ls.append({'feature':X_test.columns[a], 'importance':b})
    feature_importances = pd.DataFrame(ls).sort_values(by = ['importance'], ascending=False)
    return(feature_importances)
def plot_feature_importances(feature_importances):
    ax = sns.stripplot(x = "importance", y = "feature", data = feature_importances)
    ax.set(xlabel = 'Importance', ylabel='Feature')
    return(ax)
def plot_validation_curve(model, X, y, items):
    train_scores, test_scores = validation_curve(model,
                                                 X,
                                                 y,
                                                 param_name=items['param_name'],
                                                 param_range=items['param_range'],
                                                 cv=10,
                                                 scoring=items['scoring'],
                                                 n_jobs=-1)
    train_score_means = np.mean(train_scores, axis=1)
    test_score_means = np.mean(test_scores, axis=1)
    plt.title(items['title'])
    plt.xlabel(items['param_name'])
    plt.ylabel(items['scoring'])
    plt.ylim(0.0, 1.0)
    plt.plot(items['param_range'], train_score_means, color="darkorange", label="Training Score")
    plt.plot(items['param_range'], test_score_means, color="navy", label="Testing Score")
    plt.legend(loc="best")
    return(plt)
def plot_learning_curve(model, X, y, items):
    #Pass the scoring metric explicitly so the y-axis label matches what is plotted
    train_sizes_abs, train_scores, test_scores = learning_curve(model,
                                                                X,
                                                                y,
                                                                train_sizes=items['train_sizes'],
                                                                cv=items['cv'],
                                                                scoring=items['scoring'],
                                                                n_jobs=1)
    train_score_means = np.mean(train_scores, axis=1)
    test_score_means = np.mean(test_scores, axis=1)
    plt.title(items['title'])
    plt.xlabel('train_sizes')
    plt.ylabel(items['scoring'])
    plt.ylim(0.0, 1.0)
    plt.plot(train_sizes_abs, train_score_means, color="darkorange", label="Training Score")
    plt.plot(train_sizes_abs, test_score_means, color="navy", label="Testing Score")
    plt.legend(loc="best")
    return(plt)
def plot_roc_curve(items):
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(items['y_test'], items['y_pred'])
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.title(items['title'])
    plt.plot(false_positive_rate, true_positive_rate, "b", label='AUC = %0.2f'% roc_auc)
    plt.plot([0,1],[0,1],'r--')
    plt.legend(loc='lower right')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    return(plt)
def print_results(results):
    print "Best Estimator: \n\n%s"%(results['grid'].best_estimator_)
    print "\nTraining Time: \n\n%s seconds"%(results['training_time'])
    print "\nTesting Time: \n\n%s seconds"%(results['testing_time'])
    print "\nAccuracy: \n\n%s"%(results['accuracy'])
    print "\nConfusion Matrix: \n\n%s"%(results['matrix'])
    print "\nClassification Report: \n\n%s"%(results['report'])
Let's first explore the dataset, which was obtained from Kaggle.
In our dataset we have the records of approximately 15,000 employees, with information on attributes such as “Satisfaction Level”, “Salary”, “Department”, and “Average Monthly Hours”, to name a few. The target variable is “left”, a binary variable that tells us whether an employee has quit or is still working for the company.
#Reading in the data
df = pd.read_csv("....csv")
print df.shape
print df.info()
Let's take a quick look at some high-level stats about our dataset.
We can see that the median satisfaction level is about 61% and that employees work roughly 200 hours a month on average.
df.describe()
The column “sales” represents the department to which the employees belong.
Let's go ahead and change the column name to something more intuitive, and find out how many departments there are.
print pd.unique(df.sales.ravel())
# We'll rename the "sales" column to "department".
df.rename(columns={"sales":"department"}, inplace=True)
We can see that there are over 4000 employees belonging to the sales department, making it the largest group in the workforce.
ax = None
ax = sns.countplot(y="department", data=df)
sns.plt.show()
What are the average monthly hours for different salary groups? Let's find out by comparing the employees who have left with those who haven't.
From the plot below we can see that the mean of average monthly hours is around 200 and is fairly consistent across groups.
Former employees with low and medium salaries seem to have put in slightly more hours a month than everyone else, while former employees with high salaries worked slightly fewer hours.
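As a quick numeric cross-check of the bar plot below, here is a minimal sketch that computes the same group means directly, reusing the dataset's own column names (including its "average_montly_hours" spelling):
#Optional cross-check: mean monthly hours per salary group, split by attrition status
print(df.groupby(['salary', 'left'])['average_montly_hours'].mean().unstack('left'))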
ax = None
ax = sns.factorplot(data=df, x="salary", y="average_montly_hours", col="left", kind="bar")
(ax
.set_axis_labels("Salary", "Mean Average Monthly Hour")
.set_xticklabels(["Low", "Medium", "High"])
.despine(left=True, right=True))
sns.plt.show()
What scores do employees get on their performance reviews? Is it the same across the different salary groups?
The plot below answers these questions for us. We can see that employees in the different salary groups who've stayed have similar average performance scores, all well above 70%.
It's interesting to note that low- and medium-salaried employees who've left also held average performance scores above 70%, whereas those of high-salaried former employees fall well below 70%.
ax = None
ax = sns.factorplot(data=df, x="salary", y="last_evaluation", col="left", kind="point")
(ax
.set_axis_labels("Salary", "Mean Last Evaluation")
.set_xticklabels(["Low", "Medium", "High"])
.despine(left=True, right=True))
sns.plt.show()
We wish to find out whether satisfaction levels play a role in employees leaving their companies. Let's see what the distribution looks like in this dataset.
It appears that the employees who have left have lower average satisfaction levels, ranging from roughly 0.37 to 0.46, while employees who stayed report noticeably higher satisfaction.
ax = None
ax = sns.factorplot(data=df, x="salary", y="satisfaction_level", col="left")
(ax
.set_axis_labels("Salary", "Satisfaction Level")
.set_xticklabels(["Low", "Medium", "High"])
.despine(left=True, right=True))
sns.plt.show()
Does the satisfaction level of the employees depend on how many projects they've been assigned in a year? As it turns out, it does.
It appears that the people who are most satisfied with their jobs are assigned around 2 to 4 projects a year, and this holds almost consistently across the salary groups. Employees assigned just one project, or more than four, report noticeably lower satisfaction.
ax = None
ax = sns.factorplot(data=df,
x="number_project",
y="satisfaction_level", col="salary")
(ax
.set_axis_labels("Number of Projects", "Satisfaction Level")
.set_xticklabels([1, 2, 3, 4, 5, 6, 7])
.despine(left=True, right=True))
sns.plt.show()
In this section we'll develop supervised classification models that will predict whether a given employee will or won't leave the company.
Before we go on to develop those models, we'll have to do a bit of data preprocessing.
The columns "salary" and "department" are both categorical and will need to be encoded as binary variables. We perform One Hot Encoding to obtain their binary representations.
print pd.unique(df.salary.ravel())
print pd.unique(df.department.ravel())
#One hot encoding - Transforming all the categorical variables to their binary representations
salary = pd.get_dummies(df['salary'], drop_first=False)
department = pd.get_dummies(df['department'], drop_first=False)
df.drop(['salary', 'department'], axis=1, inplace=True)
df = pd.concat([df, salary, department], axis=1)
print df.info()
The plot below shows us how the class label "left" is distributed. We see that the vast majority of data points belong to class 0, the label which represents the employees who have not left the company.
The remaining belong to class 1, the label which indicates that they have left.
ax = None
ax = sns.countplot(x="left", data=df)
sns.plt.show()
Since the dataset is imbalanced and skewed towards class 0, we'll use stratified sampling to construct our training and test data sets.
Stratified sampling ensures that the proportion of each class remains consistent across the training and test sets.
train, test = train_test_split(df, test_size = 0.3, random_state = 5, stratify = df['left'])
X_train = train.ix[:, df.columns.difference(['left'])]
y_train = train.ix[:, 'left']
X_test = test.ix[:, df.columns.difference(['left'])]
y_test = test.ix[:, 'left']
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape
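As a quick sanity check (a small optional sketch using the y_train and y_test defined above), we can confirm that stratification kept the class proportions of "left" nearly identical in the two sets:
#Optional sanity check: class proportions should match across train and test
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))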
We'll use four different classification algorithms to develop the predictive models and find out which one performs the best.
Let's start with the Random Forests algorithm, which works by building several decision trees and averaging their results.
Apart from being relatively interpretable, decision-tree-based techniques also give us a measure of how important each feature in the dataset is.
#Random Forests classification
n = range(1, 101)
param_grid = dict(n_estimators=n)
random_forests = RandomForestClassifier(random_state=5)
grid = GridSearchCV(random_forests, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
results1 = classify(grid, X_train, y_train, X_test, y_test)
#Visualizing the importance of the features
#The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature.
#It is also known as the Gini importance.
feature_importances = get_feature_importances(results1['grid'], X_test)
ax = None
ax = plot_feature_importances(feature_importances)
sns.plt.show()
Our Random Forests model tells us that the feature "satisfaction_level" plays the most important role in determining the class of an observation.
The processing times and accuracy scores are shown below.
print_results(results1)
Let's drop the features with relatively lower scores, generate new train/test sets, and retrain the Random Forests model to see if there are any changes in the accuracy.
top_features = feature_importances[:5]['feature'].tolist() + ['left']
new_df = df[top_features]
train, test = train_test_split(new_df, test_size = 0.3, random_state = 5, stratify = new_df['left'])
X_train = train.ix[:, new_df.columns.difference(['left'])]
y_train = train.ix[:, 'left']
X_test = test.ix[:, new_df.columns.difference(['left'])]
y_test = test.ix[:, 'left']
#Random Forests classification with reduced features
n = range(1, 101)
param_grid = dict(n_estimators=n)
random_forests = RandomForestClassifier(random_state=5)
grid = GridSearchCV(random_forests, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
results1 = classify(grid, X_train, y_train, X_test, y_test)
feature_importances = get_feature_importances(results1['grid'], X_test)
ax = None
ax = plot_feature_importances(feature_importances)
sns.plt.show()
We can see that reducing the number of features hasn't affected the overall scores.
Although the model takes quite a while to train, it produces pretty impressive class precision and recall scores.
print_results(results1)
Let's take a look at what the ROC curve looks like for our Random Forests model.
The ROC curve plots the True Positive Rate against the False Positive Rate of a binary classifier.
AUC (Area Under the Curve) summarizes the curve in a single number: the closer it is to 1.0, the better the model is at separating the two classes.
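Note that the curve below is built from the hard class predictions returned by classify. As a small optional alternative (a sketch, not the approach used in the rest of this notebook; best_rf and y_scores are illustrative names), one could score the test set with predicted probabilities instead, which traces out the full curve rather than a single operating point:
#Optional alternative: compute AUC from the best model's predicted probabilities
best_rf = results1['grid'].best_estimator_
y_scores = best_rf.predict_proba(X_test)[:, 1]  #probability of class 1 ("left")
print("AUC from probabilities: %.3f" % metrics.roc_auc_score(y_test, y_scores))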
items = {"title":"Receiver Operating Characteristic (Random Forests)",
"y_test":y_test,
"y_pred":results1['grid_test']}
plt = plot_roc_curve(items)
plt.show()
Validation curves help us visualize how the performance of our model during training and testing varies with changes in its hyperparameter values.
#Random Forests Validation Curve
random_forests_classifier = RandomForestClassifier(random_state=5)
n_estimators = range(1, 101)
items = {'title':'Random Forests Validation Curve',
'param_range':n_estimators,
'param_name':'n_estimators',
'scoring':'accuracy'}
plt = plot_validation_curve(random_forests_classifier, X_train, y_train, items)
plt.show()
Learning curves help us visualize how the performance of our model during training and testing varies with changes in the training set size.
#Random Forests Learning Curve
random_forests_classifier = RandomForestClassifier(random_state=5, n_estimators=10)
train_sizes = np.linspace(.05, 1.0, 10)
items = {'title':'Random Forests Learning Curve',
'train_sizes':train_sizes,
'cv':10,
'scoring':'accuracy'}
plt = plot_learning_curve(random_forests_classifier, X_train, y_train, items)
plt.show()
We'll now develop prediction models using SVM, K Nearest Neighbors, and a standard Decision Tree, and also generate ROC, learning and validation curves for each of the models to find out how they perform.
#SVM Classification
kernel = ['sigmoid', 'rbf']
param_grid = dict(kernel=kernel)
support_vector_classifier = svm.SVC(random_state=5)
grid = GridSearchCV(support_vector_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
results2 = classify(grid, X_train, y_train, X_test, y_test)
print_results(results2)
#ROC Curve for SVM
items = {"title":"Receiver Operating Characteristic (SVM)",
"y_test":y_test,
"y_pred":results2['grid_test']}
plt = plot_roc_curve(items)
plt.show()
#SVM Validation Curve
support_vector_classifier = svm.SVC(random_state=5, kernel="rbf")
gamma = np.logspace(-6, -1, 5)
items = {'title':'SVM Validation Curve',
'param_range':gamma,
'param_name':'gamma',
'scoring':'accuracy'}
plt = plot_validation_curve(support_vector_classifier, X_train, y_train, items)
plt.show()
#SVM Learning Curve
support_vector_classifier = svm.SVC(random_state=5, kernel="rbf")
train_sizes = np.linspace(.05, 1.0, 10)
items = {'title':'SVM Learning Curve',
'train_sizes':train_sizes,
'cv':10,
'scoring':'accuracy'}
plt = plot_learning_curve(support_vector_classifier, X_train, y_train, items)
plt.show()
#KNN Classification
p = range(1, 3)
n_neighbors = range(1, 101)
param_grid = dict(p=p, n_neighbors=n_neighbors)
knn_classifier = KNeighborsClassifier()
grid = GridSearchCV(knn_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
results3 = classify(grid, X_train, y_train, X_test, y_test)
print_results(results3)
items = {"title":"Receiver Operating Characteristic (K Nearest Neighbors)",
"y_test":y_test,
"y_pred":results3['grid_test']}
plt = plot_roc_curve(items)
plt.show()
#KNN Validation curve
knn_classifier = KNeighborsClassifier()
n_neighbors = range(1, 50)
items = {'title':'KNN Validation Curve',
'param_range':n_neighbors,
'param_name':'n_neighbors',
'scoring':'accuracy'}
plt = plot_validation_curve(knn_classifier, X_train, y_train, items)
plt.show()
#KNN Learning curve
knn_classifier = KNeighborsClassifier(n_neighbors=2)
train_sizes = np.linspace(.05, 1.0, 10)
items = {'title':'KNN Learning Curve',
'train_sizes':train_sizes,
'cv':10,
'scoring':'accuracy'}
plt = plot_learning_curve(knn_classifier, X_train, y_train, items)
plt.show()
#Decision Tree Classifier
max_depth = range(10, 31)
param_grid = dict(max_depth=max_depth)
decision_tree_classifier = DecisionTreeClassifier(random_state=5)
grid = GridSearchCV(decision_tree_classifier, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
results4 = classify(grid, X_train, y_train, X_test, y_test)
print_results(results4)
items = {"title":"Receiver Operating Characteristic (Decision Tree)",
"y_test":y_test,
"y_pred":results3['grid_test']}
plt = plot_roc_curve(items)
plt.show()
#Printing out the decision tree
my_tree = results4['grid'].best_estimator_
dot_data = export_graphviz(my_tree, out_file=None, feature_names=X_test.columns, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf(os.path.expanduser("~/Desktop/tree.pdf"))  #expand "~" to the actual home directory
Image(graph.create_png())
Let's visualise the ROC curves of all our models together to find out which one performed the best.
We can see that all the models performed well, but the Random Forests model performed the best, with an AUC score of 0.98.
colors = ['cyan', 'indigo','blue', 'darkorange']
titles = ["Random Forests", "Support Vector Machines", "K Nearest Neighbors", "Decision Trees"]
results = [results1, results2, results3, results4]
models = zip(colors, results, titles)
for model in models:
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, model[1]['grid_test'])
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.plot(false_positive_rate, true_positive_rate, model[0], label=model[2]+'/AUC = %0.2f'% roc_auc)
plt.plot([0,1],[0,1],'r--')
plt.title("Receiver Operating Characteristic")
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()