1.1.0
Import packages¶import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
# Load the raw admissions dataset from CSV.
df = pd.read_csv("data/RAW_Admit.csv")
df.head()
# One-hot encode the 'rank' column; drop_first=True omits the first category's indicator column.
ohe_df = pd.get_dummies(df['rank'],prefix='rank',drop_first=True)
# Append the indicator columns to the original frame (the original 'rank' column is kept).
df = pd.concat([df,ohe_df],axis=1)
df.head()
# Pairwise correlation matrix (not covariance) of the columns left after dropping 'admit', 'rank' and 'gpa'.
correlation = df.drop(['admit','rank','gpa'],axis=1).corr()
# Heatmap of the correlation matrix, with each cell annotated by its value.
sns.heatmap(correlation,annot=True)
2.1.0
Pre-Processing data¶The issue with the features `gre`, `gpa` and `rank` is that they are of drastically different scales,
so each feature column is min-max normalized using its `min()` and `max()`.
3.1.0
Splitting data sets¶Final Feature space |
---|
7 |
Training Data | Validation Data | Test Data | Total Data Points |
---|---|---|---|
70% | 20% | 10% | 400 |
# Randomized 70/20/10 split of the dataset so as to reduce training bias.
# The frame is shuffled once and then cut into three disjoint slices: drawing
# each subset with its own df.sample() call would let the same row land in
# several subsets and leak validation/test rows into the training data.
df_all = df.sample(df.shape[0])
n_total = df_all.shape[0]
# Integer arithmetic keeps the subset sizes exact (280/80/40 for 400 rows)
# and follows the actual row count instead of a hard-coded 400.
n_train = n_total * 70 // 100
n_validation = n_total * 20 // 100
# .copy() gives each subset its own data, so columns added to a subset later
# are not written into a slice of df_all.
df_train = df_all.iloc[:n_train].copy()
df_validation = df_all.iloc[n_train:n_train + n_validation].copy()
# The test set takes every remaining row (10%, plus any rounding remainder).
df_test = df_all.iloc[n_train + n_validation:].copy()
# Data point counts
print('{0:15} : ({1},{2})'.format('Training Data',df_train.shape[0],df_train.shape[1]))
print('{0:15} : ({1},{2})'.format('Validation Data',df_validation.shape[0],df_validation.shape[1]))
print('{0:15} : ({1},{2})'.format('Test Data',df_test.shape[0],df_test.shape[1]))
4.1.0
Logistic Regression - Gradient Ascent - Maximizing the log-likelihood¶# helper functions
def sigmoid(_X):
    """Return the logistic function 1 / (1 + exp(-x)) applied element-wise.

    The value is computed in a numerically stable form: only exp(-|x|) is
    ever evaluated, so large-magnitude negative inputs no longer overflow
    (the naive form raises a RuntimeWarning there).

    Args:
        _X: A real scalar or array-like of real numbers.

    Returns:
        Floating-point values in [0, 1]: a NumPy scalar for scalar input,
        otherwise an array with the same shape as the input.
    """
    z = np.asarray(_X, dtype=float)
    decay = np.exp(-np.abs(z))  # always in (0, 1], cannot overflow
    # For z >= 0 use 1 / (1 + e^-z); for z < 0 use the equivalent e^z / (1 + e^z).
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as arrays.
    return result[()]
def gradient_descent(_X, _Y, _epochs, _alpha, _verbose):
    """Fit logistic-regression weights by batch gradient steps on the log-likelihood.

    Despite the name, each step moves the weights *along* the gradient of the
    log-likelihood (W + alpha * X^T (y - h)), i.e. gradient ascent.

    Args:
        _X: 2-D array-like of features, one row per sample. It is not
            modified; a float copy is min-max normalised per column.
        _Y: Labels, either shape (n, 1) or (n,); reshaped to a column.
        _epochs: Number of update steps to run.
        _alpha: Step size.
        _verbose: If true, print the mean cross-entropy loss every 1000 epochs.

    Returns:
        Weight column vector of shape (n_features + 1, 1); row 0 is the bias w0.
    """
    # Work on a float copy. Writing normalised values back into an integer
    # array would truncate them to 0/1, and the caller's data would be changed.
    features = np.array(_X, dtype=float)
    # Force a column vector so (targets - h) stays (n, 1) instead of
    # broadcasting to (n, n) when a flat label array is passed.
    targets = np.asarray(_Y, dtype=float).reshape(-1, 1)

    # normalizing features (min-max per column)
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    # A constant column has zero range; divide by 1 so it maps to 0, not NaN.
    col_range[col_range == 0] = 1.0
    features = (features - col_min) / col_range

    # initializing weights to 1.0 and accounting for w0
    W = np.ones([features.shape[1] + 1, 1])
    X = np.concatenate((np.ones([features.shape[0], 1]), features), axis=1)

    # gradient steps
    for i in range(_epochs):
        h = sigmoid(np.dot(X, W))
        W = W + _alpha * X.T.dot(targets - h)
        if _verbose and i % 1000 == 0:
            # Clip so log() never sees exactly 0 or 1 once the sigmoid saturates.
            p = np.clip(h, 1e-12, 1 - 1e-12)
            cost = -targets * np.log(p) - (1 - targets) * np.log(1 - p)
            # Report the plain mean loss so values are comparable across epochs.
            print('epoch( {0:4d}/{1:} ) loss_avg : {2}'.format(i, _epochs, cost.mean()))
    return W
def train(_X, _Y, _epochs, _alpha, _verbose=True):
    """Train the logistic-regression model and return its weights.

    Forwards every argument unchanged to ``gradient_descent``; the only
    difference is that ``_verbose`` defaults to True here.
    """
    weights = gradient_descent(_X, _Y, _epochs, _alpha, _verbose)
    return weights
def predict(_X, _W, show_prob=False):
    """Apply trained logistic-regression weights to a feature matrix.

    Args:
        _X: 2-D array-like of features, one row per sample. It is not
            modified; a float copy is min-max normalised per column.
        _W: Weight column vector whose first row is the bias term.
        show_prob: If true, return the sigmoid outputs instead of labels.

    Returns:
        The sigmoid of X.W when ``show_prob`` is true, otherwise those
        values rounded and cast to int (0/1 labels).

    NOTE(review): the columns are rescaled with the min/max of the array
    passed in here, not with the statistics of the training data, so the
    result depends on which rows are predicted together - confirm that
    this is intended.
    """
    # Work on a float copy. Writing normalised values back into an integer
    # array would truncate them to 0/1, and the caller's data would be changed.
    features = np.array(_X, dtype=float)

    # normalizing features (min-max per column)
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    # A constant column (or single-row input) has zero range; divide by 1
    # so it maps to 0 instead of NaN.
    col_range[col_range == 0] = 1.0
    features = (features - col_min) / col_range

    # Prepend the constant-1 column that pairs with the bias weight.
    X = np.insert(features, 0, np.ones(features.shape[0]), axis=1)
    probabilities = sigmoid(np.dot(X, _W))
    if show_prob:
        return probabilities
    return probabilities.round().astype(int)
def plot(df, feature):
    """Plot the real and the predicted admit labels against one feature column.

    Expects ``df`` to hold the columns ``feature``, 'admit' and
    'predicted admit'; shows the figure immediately.
    """
    # (column, marker, colour, legend label) for each series, in draw order
    series = (
        ('admit', '.', 'xkcd:azure', 'real admit'),
        ('predicted admit', '+', 'xkcd:orange', 'predicted admit'),
    )
    x_values = df[feature]
    for column, marker, colour, legend_label in series:
        plt.plot(x_values, df[column], marker, color=colour, label=legend_label)
    plt.xlabel(feature)
    plt.ylabel('admit')
    plt.legend(loc='best')
    plt.show()
def pair_plot(df, f1, f2):
    """Scatter feature ``f1`` against ``f2``, one series per real admit label."""
    # Split the rows by the real label once, then draw each group.
    admitted = df[df['admit'] == 1]
    not_admitted = df[df['admit'] == 0]
    plt.scatter(admitted[f1], admitted[f2], s=10, label='admit')
    plt.scatter(not_admitted[f1], not_admitted[f2], s=10, label='not admit')
    plt.legend()
    plt.show()
4.2.0
Logistic Regression - Model Training¶x_cols = ['gre','rank_2','rank_3','rank_4']
# Target column, kept as a one-element list so df[y_col] selects a DataFrame and to_numpy() yields a 2-D array.
y_col = ['admit']
# Fit the weights on the training split: 20000 epochs, step size 0.001, verbose progress output on.
W = train(df_train[x_cols].to_numpy(),df_train[y_col].to_numpy(),20000,0.001,True)
# Store the model's predicted labels for the training rows in a new column.
df_train['predicted admit'] = predict(df_train[x_cols].to_numpy(),W)
4.3.0
Logistic Regression - Model Prediction¶df_validation['predicted admit'] = predict(df_validation[x_cols].to_numpy(),W)
df_validation.head()
4.4.0
Logistic Regression - Model Validation against scikit
¶We've finished training our model and running it on the validation data, but without a standard to compare it against we don't know whether the model is working as it should.
Let's use scikit-learn on the same data and check its predictions.
# Reference model: scikit-learn's logistic regression, fitted on the same training features and labels
clf = LogisticRegression(random_state=0)
clf.fit(df_train[x_cols].to_numpy(), df_train[y_col].to_numpy().ravel())
# scikit-learn's predictions for the validation rows, stored next to our own
df_validation['scikit predicted admit'] = clf.predict(df_validation[x_cols].to_numpy())
df_validation.head()
5.1.0
Model Results¶plot(df_validation,'gre')
# Real vs. predicted admit on the validation rows, plotted against 'gpa'.
plot(df_validation,'gpa')
# The same comparison plotted against the original 'rank' column.
plot(df_validation,'rank')
# Scatter of 'gre' against 'gpa', split by the real admit label.
pair_plot(df_validation,'gre','gpa')
# NOTE(review): repeats the 'rank' plot drawn two calls earlier - confirm whether this is intentional.
plot(df_validation,'rank')
# Column dtypes of the validation frame.
df_validation.dtypes
5.2.0
Model Performance and comparison¶#test data predictions
df_test['predicted admit'] = predict(df_test[x_cols].to_numpy(),W)
df_test['scikit predicted admit'] = clf.predict(df_test[x_cols].to_numpy())
# accuracy scores
# accuracy_score is declared as (y_true, y_pred); pass both by keyword so the
# real labels and the predictions cannot be swapped.
my_accuracy = accuracy_score(
    y_true=df_test[y_col].to_numpy(),
    y_pred=df_test[['predicted admit']].to_numpy(),
)
scikit_accuracy = accuracy_score(
    y_true=df_test[y_col].to_numpy(),
    y_pred=df_test[['scikit predicted admit']].to_numpy(),
)
# One-row summary table comparing the two models' test accuracy.
df_accuracy = pd.DataFrame([[my_accuracy,scikit_accuracy]],columns=['MY ACCURACY','SCIKIT ACCURACY'])
df_accuracy.head()