Implementing a Naive Bayes Classifier from scratch.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('diabetes.csv')
df.head(10)
df.info()
df.isnull().any().any()
Q1. Fill out this function which splits the dataset into X_train, y_train, X_test, y_test.
def splitDataset(dataset, split, target_label):
    '''
    Split the dataset into train/test features and labels,
    using the first `split` fraction of rows for training.
    '''
    # Optionally shuffle the dataset before splitting
    # (left disabled so the split is reproducible):
    # dataset = dataset.sample(frac=1)
    train_size = int(len(dataset) * split)
    X = dataset.drop(target_label, axis=1)
    y = dataset[target_label]
    X_train = X[:train_size].values
    X_test = X[train_size:].values
    y_train = y[:train_size].values
    y_test = y[train_size:].values
    return X_train, y_train, X_test, y_test
dataset = pd.DataFrame({
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [3, 4, 5, 6, 7],
    'Target': [0, 0, 1, 1, 1]
})
dataset
split = 0.67
target_label = 'Target'
X_train, y_train, X_test, y_test = splitDataset(dataset, split, target_label)
print(('Split {0} rows into train with {1} and test with {2}').format(len(dataset), len(y_train), len(y_test)))
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
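If the rows happen to be ordered (for example, sorted by the target), this deterministic split can produce unrepresentative train and test sets. A minimal sketch of shuffling first, with a fixed seed for reproducibility:
# Shuffle the rows with a fixed seed, then split as before
shuffled = dataset.sample(frac=1, random_state=0).reset_index(drop=True)
X_train, y_train, X_test, y_test = splitDataset(shuffled, split, target_label)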
Q2. Fill out this function which separates the data by class (label). Create a dictionary whose keys are the class values and whose values are lists of all the records belonging to that class.
def separateByClass(X, y):
    '''
    Return a dict mapping each class label to the list
    of rows in X belonging to that class.
    '''
    separated = {}
    for i in range(len(X)):
        if y[i] not in separated:
            separated[y[i]] = []
        separated[y[i]].append(X[i])
    return separated
# This cell should run properly
X = [[1, 20], [2, 21], [3, 22]]
y = [1, 0, 1]
separated = separateByClass(X, y)
print("Rows belonging to class 0 : ", separated[0])
print("Rows belonging to class 1 : ", separated[1])
Q3. Fill out these functions which calculate the mean and the sample standard deviation of a list of numbers.
import math
def mean(numbers):
    '''Return the mean of numbers.'''
    return sum(numbers) / float(len(numbers))
def stdev(numbers):
    '''
    Return the standard deviation of numbers.
    NOTE : this is the sample standard deviation
    (the variance divides by n - 1, not n).
    '''
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)
# This cell should run properly
numbers = [1, 2, 3, 4, 5]
print(('Summary of {0}: mean={1}, stdev={2}').format(numbers, mean(numbers), stdev(numbers)))
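As a quick sanity check, these hand-rolled statistics can be compared against NumPy (ddof=1 makes np.std compute the sample standard deviation, matching stdev above):
# np.std defaults to the population standard deviation; ddof=1 matches stdev()
print(np.mean(numbers), np.std(numbers, ddof=1))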
Q4. Fill out a function which calculates the mean and standard deviation of each column in a dataset. Store the mean and standard deviation for each column as a tuple or list. Return a list that contains each column's statistics (mean, stdev).
def summarize(X):
    '''
    Return a list of shape (num_cols, 2) where the ith
    element is (mean_col_i, stdev_col_i).
    '''
    summary = []
    for column in np.array(X).T:
        summary.append((mean(column), stdev(column)))
    return summary
# This cell should run properly
dataset = [[1, 20, 4], [2, 21, 0], [3, 22, 10], [4,20,7]]
print(np.array(dataset))
summary = summarize(dataset)
print(('\n Attribute summaries: {0}').format(summary))
Q5. Summarize the columns in the dataset organized by class values. Split the dataset by class, then calculate statistics on each subset. Return a dictionary that contains the results in the form of a list of tuples of statistics for each class value.
def summarizeByClass(X, y):
    '''
    Return a dict mapping each class label to its list of
    per-column (mean, stdev) summaries.
    Use `separateByClass` and then `summarize` on
    each class subset.
    '''
    separated = separateByClass(X, y)
    summaries = {}
    for label in separated:
        summaries[label] = summarize(separated[label])
    return summaries
# This should work properly
X = [[1, 20, 4], [2, 21, 0], [3, 22, 10], [4,20,7]]
y = [1, 0, 1, 0]
print(np.array(X))
summary = summarizeByClass(X, y)
print('\nSummary by class value:')
print("\t Class0 : ", summary[0])
print("\t Class1 : ", summary[1])
We will now calculate the likelihood that a data point belongs to a certain class.
One way we can do this is to assume that each attribute is drawn from a known distribution, such as a bell curve or Gaussian distribution.
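For reference, the Gaussian probability density of a value x with a given mean and standard deviation is:
P(x | mean, stdev) = (1 / (sqrt(2 * pi) * stdev)) * exp(-((x - mean)^2) / (2 * stdev^2))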
Q6. Fill out the function which calculates the likelihood of a data point using the Gaussian density function.
def calculateProbability(x, mean, stdev):
    '''
    Use the Gaussian probability density function
    to estimate the likelihood of a point belonging
    to a certain class.
    '''
    exp_in = (x - mean) / float(stdev)
    expon = (-0.5) * (exp_in ** 2)
    return np.exp(expon) / float(np.sqrt(2 * np.pi) * stdev)
# This cell should run properly
x1 = 70.5
mean1 = 73
stdev1 = 10
probability = calculateProbability(x1, mean1, stdev1)
print(('Probability of belonging to this class: {0}').format(probability))
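If SciPy is available, the density can be sanity-checked against scipy.stats.norm.pdf, which evaluates the same Gaussian:
from scipy.stats import norm
# Should print the same value as calculateProbability(x1, mean1, stdev1)
print(norm.pdf(x1, loc=mean1, scale=stdev1))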
Q7. Fill out the function which calculates the probability that a data point belongs to each class. We can calculate the probability of an attribute value given a class using the above function, and we can combine the per-attribute probabilities by multiplying them (the "naive" conditional-independence assumption). Thus, this function returns a dictionary mapping each class to the (unnormalized) probability that the input vector belongs to it. Note that the implementation below omits the prior P(class), which amounts to assuming equal class priors.
P(class=0 | X1, X2) ∝ P(X1 | class=0) * P(X2 | class=0) * P(class=0)
def calculateClassProbabilities(summaries, inputVector):
    '''
    Map each class label to the probability of the point
    belonging to that particular class. Use the Naive
    Bayes assumption of conditional independence.
    Also use the `calculateProbability` function defined
    above.
    '''
    probabilities = {}
    # Iterate over classes
    for classValue, classSummaries in summaries.items():
        # Initialize P(class | attribute_vec) to 1
        # (no prior term, i.e. equal class priors are assumed)
        probabilities[classValue] = 1
        # Iterate over columns and update P(class | attribute_vec)
        for i in range(len(classSummaries)):
            # Obtain (mean, stdev) for this [class, attribute] combination
            mean, stdev = classSummaries[i]
            # Multiply P(class | attribute_vec) by P(attribute | class)
            probabilities[classValue] *= calculateProbability(inputVector[i], mean, stdev)
    return probabilities
# This cell should run properly
# Single Attribute, Two classes
summaries = {0: [(1, 0.5)], 1: [(20, 5.0)]}
# One attribute, and one label to predict.
inputVector = [1.1, '?']
probabilities = calculateClassProbabilities(summaries, inputVector)
print(probabilities)
print(('\nProbabilities for each class: '))
print("\t Class0 : ", probabilities[0])
print("\t Class1 : ", probabilities[1])
Q8a. Fill out the function which predicts which class a data point belongs to.
def predict(summaries, inputVector):
    '''
    Return the class with maximum probability.
    The returned label should match the key used
    in `summaries`.
    Hint : Use the `calculateClassProbabilities`
    function.
    '''
    probabilities = calculateClassProbabilities(summaries, inputVector)
    # Pick the label whose score is largest
    max_class = max(probabilities, key=probabilities.get)
    return max_class
# This cell should run properly
# When our dataset has 2 attributes/features
summaries = {
    'A': [(1, 0.5), (2, 1)],
    'B': [(20, 5.0), (20, 1.0)]
}
inputVector1 = [1.1, 4]
result1 = predict(summaries, inputVector1)
print(('Prediction for vec1: {0}').format(result1))
print()
inputVector2 = [18.0, 20.0]
result2 = predict(summaries, inputVector2)
print(('Prediction for vec2: {0}').format(result2))
Q8b. Fill out this function for generating predictions for a list of test datapoints.
def getPredictions(summaries, X_test):
    '''
    Get predictions for multiple data points
    using the `predict` function.
    '''
    predictions = []
    for i in range(len(X_test)):
        predictions.append(predict(summaries, X_test[i]))
    return predictions
# This cell should run properly
summaries = {'A': [(1, 0.5), (2, 1)], 'B': [(20, 5.0), (20, 1.0)]}
testSet = [[1.1,3], [19.1, 16]]
predictions = getPredictions(summaries, testSet)
print(('Predictions: {0}').format(predictions))
Q9. Fill out this function which returns the accuracy of the predictions generated by the Naive Bayes Classifier.
def getAccuracy(y_test, y_pred):
    '''Return the percentage of predictions that match the true labels.'''
    return sum([y_test[i] == y_pred[i] for i in range(len(y_test))]) / float(len(y_test)) * 100
# This cell should run properly
test = ['a', 'a', 'b']
predictions = ['a', 'b', 'b']
accuracy = getAccuracy(test, predictions)
print(('Accuracy: {0}').format(accuracy))
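The same figure can be cross-checked with a one-line NumPy expression:
# Elementwise comparison, then the mean of the boolean array, as a percentage
print(np.mean(np.array(test) == np.array(predictions)) * 100)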
Q10. Fill out this Naive Bayes function which takes the dataframe and target_label as parameters and returns its accuracy.
def NaiveBayesClassifier(dataset, target_label):
    split = 0.7
    X_train, y_train, X_test, y_test = splitDataset(dataset, split, target_label)
    print(('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(y_train), len(y_test)))
    summaries = summarizeByClass(X_train, y_train)
    y_pred = getPredictions(summaries, X_test)
    return getAccuracy(y_test, y_pred)
# This cell should run properly
NaiveBayesClassifier(df, "Outcome")
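As an optional sanity check (assuming scikit-learn is installed), the result can be compared against scikit-learn's GaussianNB. Exact agreement is not expected, since GaussianNB also incorporates class priors estimated from the training data:
from sklearn.naive_bayes import GaussianNB
X_train, y_train, X_test, y_test = splitDataset(df, 0.7, "Outcome")
model = GaussianNB().fit(X_train, y_train)
# score() returns the fraction of correct predictions; scale to a percentage
print(model.score(X_test, y_test) * 100)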