import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the insurance dataset from the local file 'insurance.csv' into a DataFrame.
insurance = pd.read_csv('insurance.csv')
# Inspect the data: first rows, summary statistics, then column dtypes/non-null counts.
# NOTE(review): head() and describe() are bare expressions, so their output is only
# rendered when each is the last statement of a notebook cell — confirm cell boundaries.
insurance.head()
insurance.describe()
insurance.info()
# Draw a scatterplot for insurance dataset
# Column 'X' goes on the horizontal axis and column 'Y' on the vertical axis;
# regplot also overlays a fitted regression line on the scatter.
sns.regplot(x='X',y='Y',data=insurance)
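This tutorial is broken down into the following parts:
1. Calculate the mean and variance.
2. Calculate the covariance.
3. Estimate the coefficients.
4. Make predictions.
5. Evaluate the model on the insurance dataset.
These steps will give you the foundation you need to implement and train simple linear regression models for your own prediction problems.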
# Calculate the mean value of a list of numbers
def mean(values):
    """Return the arithmetic mean of ``values``.

    ``values`` must work with both ``sum()`` and ``len()``. An empty
    collection raises ``ZeroDivisionError`` because the count is zero.
    """
    total = sum(values)
    count = len(values)
    return total / count
# Calculate the variance of a list of numbers
def variance(values, mean):
    """Return the population variance of ``values`` around ``mean``.

    The squared deviations from ``mean`` are summed and divided by the
    number of values (not by ``n - 1``). The caller supplies the mean, so
    it is not recomputed here.
    """
    squared_deviations = [(value - mean) ** 2 for value in values]
    return sum(squared_deviations) / len(values)
# Check your mean and variance functions
# Small toy dataset: each row is an [x, y] pair.
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
x = [pair[0] for pair in dataset]
y = [pair[1] for pair in dataset]
# Means first, because each variance is measured around its own mean.
x_mean = mean(x)
y_mean = mean(y)
x_var = variance(x, x_mean)
y_var = variance(y, y_mean)
print('x stats: mean=%.3f variance=%.3f' % (x_mean, x_var))
print('y stats: mean=%.3f variance=%.3f' % (y_mean, y_var))
# Calculate covariance between x and y
def covariance(x, x_mean, y, y_mean):
    """Return the population covariance of the paired sequences ``x`` and ``y``.

    Each pair contributes ``(x_i - x_mean) * (y_i - y_mean)``; the products
    are summed and divided by the number of pairs.

    Args:
        x: sequence of x values.
        x_mean: mean of ``x``, supplied by the caller.
        y: sequence of y values, paired element-wise with ``x``.
        y_mean: mean of ``y``, supplied by the caller.

    Returns:
        The covariance as a number.

    Raises:
        ValueError: if ``x`` and ``y`` have different lengths.
        ZeroDivisionError: if both sequences are empty.
    """
    # Fail loudly on unpaired data rather than silently ignoring surplus
    # values or surfacing an unrelated IndexError.
    if len(x) != len(y):
        raise ValueError(
            'x and y must have the same length, got %d and %d' % (len(x), len(y))
        )
    products = [(x_i - x_mean) * (y_i - y_mean) for x_i, y_i in zip(x, y)]
    return sum(products) / len(x)
# Check your covariance function
# Same toy [x, y] pairs as before, rebuilt so this check is self-contained.
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
x = [pair[0] for pair in dataset]
y = [pair[1] for pair in dataset]
x_mean = mean(x)
y_mean = mean(y)
covar = covariance(x, x_mean, y, y_mean)
print('Covariance: %.3f' % covar)
# Calculate coefficients
def coefficients(dataset):
    """Estimate the line ``y = b0 + b1 * x`` for the rows of ``dataset``.

    The x value is read from index 0 and the y value from index 1 of each
    row. The slope ``b1`` is covariance(x, y) divided by variance(x), and
    the intercept ``b0`` is chosen so the line passes through the point of
    means.

    Returns:
        ``[b0, b1]`` (intercept first, then slope).
    """
    xs = [record[0] for record in dataset]
    ys = [record[1] for record in dataset]
    mean_x = mean(xs)
    mean_y = mean(ys)
    slope = covariance(xs, mean_x, ys, mean_y) / variance(xs, mean_x)
    intercept = mean_y - (slope * mean_x)
    return [intercept, slope]
# Check your coefficients function
# Expected result for this toy dataset: b0=0.4 and b1=0.8.
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
(b0, b1) = coefficients(dataset)
print('Coefficients: b0=%.3f, b1=%.3f' % (b0, b1))
# Split a dataset into a train and test set (without scikit-learn)
def train_test_split(dataset, split):
    """Split ``dataset`` into a leading train part and a trailing test part.

    Args:
        dataset: a sliceable collection providing ``copy()`` and ``len()``
            (for example a list or a NumPy array).
        split: fraction of the rows assigned to the train part; must lie
            in the closed interval [0, 1].

    Returns:
        ``[train_data, test_data]``, where ``train_data`` holds the first
        ``round(split * len(dataset))`` rows and ``test_data`` the rest.

    Raises:
        ValueError: if ``split`` is outside [0, 1]. A negative or >1 value
            would otherwise slice from the wrong end or yield an empty
            test set without any warning.
    """
    if not 0 <= split <= 1:
        raise ValueError('split must be between 0 and 1, got %r' % (split,))
    # Slice a copy so the returned parts never alias the caller's data
    # (slices of a NumPy array are views of the array they come from).
    rows = dataset.copy()
    # The rows are deliberately NOT shuffled: the split must be
    # deterministic so the reported error is reproducible.
    # Note: round() rounds exact halves to the nearest even integer.
    boundary = round(split * len(rows))
    return [rows[:boundary], rows[boundary:]]
# Use simple linear regression to return predictions on test set.
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict a y value for every x in ``test``.

    Args:
        train: rows passed unchanged to ``coefficients`` to estimate the
            intercept and slope.
        test: iterable of x values to predict for.

    Returns:
        A list with one prediction ``b0 + b1 * x`` per element of ``test``,
        in the same order.
    """
    intercept, slope = coefficients(train)
    return [intercept + (slope * x_value) for x_value in test]
# Calculate root mean squared error
def rmse_metric(actual, predicted):
    """Return the root mean squared error between two equal-length sequences.

    Args:
        actual: sequence of observed values.
        predicted: sequence of predicted values, paired element-wise with
            ``actual``.

    Returns:
        The square root of the mean squared difference, as returned by
        ``np.sqrt``.

    Raises:
        ValueError: if the two sequences have different lengths.
        ZeroDivisionError: if both sequences are empty.
    """
    # A length mismatch means the pairs are misaligned; averaging over only
    # part of the data would report a misleading error, so reject it.
    if len(actual) != len(predicted):
        raise ValueError(
            'actual and predicted must have the same length, got %d and %d'
            % (len(actual), len(predicted))
        )
    squared_errors = [
        (predicted_value - actual_value) ** 2
        for actual_value, predicted_value in zip(actual, predicted)
    ]
    # Start the sum at 0.0 so the accumulation is done in floating point.
    mean_error = sum(squared_errors, 0.0) / len(actual)
    return np.sqrt(mean_error)
# Evaluate an algorithm using a train/test split
def evaluate_algorithm(dataset, algorithm, split, error_metric, *args):
    """Score ``algorithm`` on a single train/test split of ``dataset``.

    Args:
        dataset: rows to split with ``train_test_split``.
        algorithm: callable invoked as ``algorithm(train, x_test, *args)``
            that returns one prediction per test input.
        split: fraction of rows used for training.
        error_metric: callable invoked as ``error_metric(y_test, predicted)``.
        *args: extra positional arguments forwarded to ``algorithm``.

    Returns:
        Whatever ``error_metric`` returns for the test rows.
    """
    train_rows, test_rows = train_test_split(dataset, split)
    # The algorithm only sees the inputs (first column) of the test rows.
    test_inputs = [record[0] for record in test_rows]
    predictions = algorithm(train_rows, test_inputs, *args)
    # The expected outputs are taken from the last column of each test row.
    test_targets = [record[-1] for record in test_rows]
    return error_metric(test_targets, predictions)
# Find rmse for Insurance dataset
# Train on the first 70% of the insurance rows and score on the remainder.
split = 0.7
rmse = evaluate_algorithm(
    insurance.values,
    simple_linear_regression,
    split,
    rmse_metric,
)
print('RMSE: %.3f' % rmse)
NOTE: If you are getting an incorrect value, make sure that you don't shuffle the dataset.
Also make sure that you use the insurance dataset.