# -- Imports & configuration ------------------------------------------------
# ``from __future__`` imports must be the very first statements in a module;
# placed after other code (as in the original) they raise a SyntaxError.
from __future__ import division

import os
import time

import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing, svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (GridSearchCV, train_test_split,
                                     validation_curve, learning_curve)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Path to the training CSV (expects at least ``id`` and ``target`` columns,
# plus ``*_cat`` categorical features -- TODO confirm against the data source).
train_file_path = "/train.csv"
def get_feature_importances(grid, X_test):
    """Return a DataFrame of per-feature importances, sorted descending.

    Parameters
    ----------
    grid : fitted GridSearchCV whose ``best_estimator_`` exposes
        ``feature_importances_`` (e.g. a tree ensemble).
    X_test : DataFrame whose columns are aligned, by position, with the
        importances vector.

    Returns
    -------
    DataFrame with ``feature`` and ``importance`` columns, highest first.
    """
    # NOTE(review): the original referenced loop variables ``a``/``b`` that
    # were never bound -- the enumerate loop below restores the evident intent.
    importances = grid.best_estimator_.feature_importances_
    ls = [{'feature': X_test.columns[i], 'importance': imp}
          for i, imp in enumerate(importances)]
    feature_importances = pd.DataFrame(ls).sort_values(by=['importance'],
                                                       ascending=False)
    return feature_importances
def classify(grid, X_train, y_train, X_test, y_test):
    """Fit ``grid`` on the training split and score it on the test split.

    Returns a dict with wall-clock ``training_time`` and ``testing_time``,
    ``accuracy``, a text classification ``report``, the confusion ``matrix``,
    the fitted ``grid`` itself, and its raw test predictions (``grid_test``).
    """
    # Time the grid-search fit (cross-validation happens inside ``fit``).
    fit_start = time.time()
    grid.fit(X_train, y_train)
    fit_elapsed = time.time() - fit_start

    # Time prediction on the held-out test set.
    predict_start = time.time()
    predictions = grid.predict(X_test)
    predict_elapsed = time.time() - predict_start

    return {
        'training_time': fit_elapsed,
        'testing_time': predict_elapsed,
        'accuracy': metrics.accuracy_score(y_test, predictions),
        'report': metrics.classification_report(y_test, predictions),
        'matrix': metrics.confusion_matrix(y_test, predictions),
        'grid': grid,
        'grid_test': predictions,
    }
# -- Load and sub-sample the training data ----------------------------------
data = pd.read_csv(train_file_path)

# Down-sample both classes (class 0 more aggressively) to soften the class
# imbalance; fixed random_state keeps the sample reproducible.
grouping = data[["id", "target"]].groupby("target").size()
class_0_num = int(round(grouping[0] * 0.15))
class_1_num = int(round(grouping[1] * 0.75))
train_sampled = pd.concat([data.loc[data.target == 0].sample(class_0_num, random_state=10),
                           data.loc[data.target == 1].sample(class_1_num, random_state=10)])
del data  # release the full frame; only the sample is used from here on

# One-hot encode the categorical columns (names ending in "_cat").
# ``list(...)`` is essential: under Python 3 a bare ``filter`` object is an
# iterator, so reusing it after the first pass would yield nothing and
# ``get_dummies`` would silently encode no columns.
categorical_columns = list(filter(lambda column_name: column_name.endswith("_cat"),
                                  train_sampled.columns))
mappings = {column: column for column in categorical_columns}
train_encoded = pd.get_dummies(train_sampled, columns=categorical_columns,
                               prefix=mappings, dummy_na=False)

# Drop the dummy columns that encode the -1 "missing" sentinel.
nan_columns = list(filter(lambda column_name: column_name.endswith("-1"),
                          train_encoded.columns))
train_encoded.drop(labels=nan_columns, axis=1, inplace=True)

# Stratified 70/30 split.  ``.loc`` replaces the deprecated/removed ``.ix``.
train, test = train_test_split(train_encoded,
                               test_size=0.3,
                               random_state=10,
                               stratify=train_encoded["target"])
feature_columns = train_encoded.columns.difference(['id', 'target'])
X_train = train.loc[:, feature_columns]
y_train = train.loc[:, 'target']
X_test = test.loc[:, feature_columns]
# A Series (not a one-column DataFrame) so y_test is shaped like y_train.
y_test = test.loc[:, 'target']
# -- Model: PCA (100 components) + logistic regression, C tuned by CV -------
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

C_OPTIONS = [0.01, 0.1, 1.0, 10.0, 100.0]
pipe = Pipeline([('reduce_dim', PCA(n_components=100)),
                 ('classify', LogisticRegression())])
param_grid = [{"classify__C": C_OPTIONS}]
grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid)
grid.fit(X_train, y_train)

# Evaluate the best model on the held-out test split.
# ``print()`` calls are valid under both Python 2 and 3; the original
# print statements are a SyntaxError under Python 3.
grid_test = grid.predict(X_test)
print(grid.best_estimator_)
print(metrics.classification_report(y_test, grid_test))
print(metrics.confusion_matrix(y_test, grid_test, labels=[0, 1]))

# ``data`` was deleted earlier (``del data``), so referencing it here raised
# a NameError; report the class balance of the sampled frame instead.
print(train_sampled[["id", "target"]].groupby("target").size())