Information

Prelims

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
from sklearn import preprocessing
import pickle
from sklearn.metrics import roc_auc_score
In [3]:
!ls ../input/eval-lab-3-f464
sample_submission.csv  test.csv  train.csv
In [4]:
train_raw = pd.read_csv("train.csv")
test_raw = pd.read_csv("test.csv")
In [5]:
print(train_raw.shape)
train_raw.head()
(4930, 21)
Out[5]:
custId gender SeniorCitizen Married Children TVConnection Channel1 Channel2 Channel3 Channel4 ... Channel6 Internet HighSpeed AddedServices Subscription tenure PaymentMethod MonthlyCharges TotalCharges Satisfied
0 888 Male 0 No No Cable Yes No No No ... No Yes No Yes Monthly 1 Cash 80.85 80.85 0
1 4093 Male 1 No No Cable No Yes No No ... Yes Yes No Yes Monthly 6 Bank transfer 85.15 503.6 0
2 4966 Male 1 No No Cable Yes Yes Yes No ... No Yes Yes Yes Monthly 9 Net Banking 100.50 918.6 0
3 4788 Male 0 No No Cable No Yes No No ... No Yes No Yes Monthly 5 Cash 80.10 398.55 1
4 1531 Male 0 No No No No tv connection No tv connection No tv connection No tv connection ... No tv connection Yes No No Biannually 16 Credit card 18.95 326.8 1

5 rows × 21 columns

In [6]:
print(test_raw.shape)
test_raw.head()
(2113, 20)
Out[6]:
custId gender SeniorCitizen Married Children TVConnection Channel1 Channel2 Channel3 Channel4 Channel5 Channel6 Internet HighSpeed AddedServices Subscription tenure PaymentMethod MonthlyCharges TotalCharges
0 3904 Female 0 No No DTH Yes No Yes No No No Yes No Yes Monthly 52 Net Banking 59.45 3043.7
1 5496 Female 0 Yes No Cable Yes No No No No No Yes No Yes Monthly 9 Net Banking 79.75 769.1
2 497 Female 0 Yes Yes Cable Yes Yes No No No No Yes No Yes Monthly 10 Net Banking 88.85 929.45
3 4260 Female 0 No No DTH Yes Yes No No No Yes Yes No Yes Monthly 2 Net Banking 70.75 146.9
4 4748 Male 0 No No Cable Yes Yes Yes No Yes Yes Yes Yes Yes Monthly 45 Credit card 108.45 4964.7

EDA

In [7]:
train_labels = train_raw['Satisfied']
In [8]:
labels, counts = np.unique(train_labels, return_counts=True)
print(counts)
plt.bar(labels, counts)
plt.show()
[1309 3621]
In [9]:
data_raw = pd.concat([train_raw.iloc[:,0:-1], test_raw])
print(data_raw.shape)
(7043, 20)
In [10]:
data_raw
Out[10]:
custId gender SeniorCitizen Married Children TVConnection Channel1 Channel2 Channel3 Channel4 Channel5 Channel6 Internet HighSpeed AddedServices Subscription tenure PaymentMethod MonthlyCharges TotalCharges
0 888 Male 0 No No Cable Yes No No No No No Yes No Yes Monthly 1 Cash 80.85 80.85
1 4093 Male 1 No No Cable No Yes No No No Yes Yes No Yes Monthly 6 Bank transfer 85.15 503.6
2 4966 Male 1 No No Cable Yes Yes Yes No No No Yes Yes Yes Monthly 9 Net Banking 100.50 918.6
3 4788 Male 0 No No Cable No Yes No No No No Yes No Yes Monthly 5 Cash 80.10 398.55
4 1531 Male 0 No No No No tv connection No tv connection No tv connection No tv connection No tv connection No tv connection Yes No No Biannually 16 Credit card 18.95 326.8
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2108 3957 Female 0 No No Cable Yes Yes No No Yes No Yes No Yes Monthly 1 Net Banking 95.85 95.85
2109 3376 Female 0 Yes Yes DTH No No No No No No Yes No Yes Monthly 1 Bank transfer 44.35 44.35
2110 2396 Male 0 Yes No Cable Yes Yes Yes Yes No No Yes Yes Yes Monthly 24 Net Banking 104.65 2542.45
2111 1011 Male 1 No No Cable Yes Yes Yes Yes No No Yes Yes Yes Monthly 24 Credit card 102.95 2496.7
2112 6457 Male 0 No No No No tv connection No tv connection No tv connection No tv connection No tv connection No tv connection Yes No No Monthly 5 Cash 21.05 113.85

7043 rows × 20 columns

In [11]:
for col in data_raw.columns:
    t = data_raw[col]
    print(col,":", t[t.isna()].shape[0])
custId : 0
gender : 0
SeniorCitizen : 0
Married : 0
Children : 0
TVConnection : 0
Channel1 : 0
Channel2 : 0
Channel3 : 0
Channel4 : 0
Channel5 : 0
Channel6 : 0
Internet : 0
HighSpeed : 0
AddedServices : 0
Subscription : 0
tenure : 0
PaymentMethod : 0
MonthlyCharges : 0
TotalCharges : 0
In [12]:
for col in data_raw.columns:
    data_raw.loc[data_raw[col] == 'No tv connection',col] = 'No'
    data_raw.loc[data_raw[col] == 'No internet',col] = 'No'
/opt/conda/lib/python3.6/site-packages/pandas/core/ops/__init__.py:1115: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  result = method(y)
In [13]:
from matplotlib import gridspec

fig = plt.figure(figsize=[30,10])
gs = gridspec.GridSpec(3, 6) 

for i,col in enumerate(data_raw.columns):
    if(col in ['custId', 'MonthlyCharges', 'TotalCharges']): continue

    t = data_raw[col]
    labels, counts = np.unique(t,return_counts=True)
    plt.subplot(gs[i-1])
    plt.title(col)
    plt.bar(labels, counts)
plt.show()

Feature Generation

In [14]:
# isMale

gender = data_raw['gender']
isMale_idx = np.where(gender == 'Male')[0]
isMale = pd.DataFrame(np.zeros_like(gender), columns=['isMale'])
isMale.iloc[isMale_idx,0] = 1
isMale.head()
Out[14]:
isMale
0 1
1 1
2 1
3 1
4 1
In [15]:
# isSenior

isSenior = pd.DataFrame(np.array(data_raw['SeniorCitizen']), columns=['isSenior'])
isSenior.head()
Out[15]:
isSenior
0 0
1 1
2 1
3 0
4 0
In [16]:
# isMarried

res = data_raw['Married']
isMarried_idx = np.where(res == 'Yes')[0]
isMarried = pd.DataFrame(np.zeros_like(res), columns=['isMarried'])
isMarried.iloc[isMarried_idx,0] = 1
isMarried.head()
Out[16]:
isMarried
0 0
1 0
2 0
3 0
4 0
In [17]:
# hasChildren

res = data_raw['Children']
hasChildren_idx = np.where(res == 'Yes')[0]
hasChildren = pd.DataFrame(np.zeros_like(res), columns=['hasChildren'])
hasChildren.iloc[hasChildren_idx,0] = 1
hasChildren.head()
Out[17]:
hasChildren
0 0
1 0
2 0
3 0
4 0
In [18]:
# tvConnection

res = data_raw['TVConnection']
choices = ['No', 'Cable', 'DTH']
tvConnection =  pd.DataFrame(np.zeros([res.shape[0], 3]), columns=['hasConnection_'+choice for choice in choices])

for i,choice in enumerate(choices):
    choice_idx = np.where(res == choice)[0]
    tvConnection.iloc[choice_idx,i] = 1

tvConnection
Out[18]:
hasConnection_No hasConnection_Cable hasConnection_DTH
0 0.0 1.0 0.0
1 0.0 1.0 0.0
2 0.0 1.0 0.0
3 0.0 1.0 0.0
4 1.0 0.0 0.0
... ... ... ...
7038 0.0 1.0 0.0
7039 0.0 0.0 1.0
7040 0.0 1.0 0.0
7041 0.0 1.0 0.0
7042 1.0 0.0 0.0

7043 rows × 3 columns

In [19]:
# channels

channel_nos = ['Channel1', 'Channel2', 'Channel3', 'Channel4', 'Channel5', 'Channel6']
channels = pd.DataFrame(np.zeros([len(data_raw),6]), columns=[channel.lower() for channel in channel_nos])

for i,channel in enumerate(channel_nos):
    res = data_raw[channel]
    choice_idx = np.where(res == 'Yes')[0]
    channels.iloc[choice_idx,i] = 1
    
channels
Out[19]:
channel1 channel2 channel3 channel4 channel5 channel6
0 1.0 0.0 0.0 0.0 0.0 0.0
1 0.0 1.0 0.0 0.0 0.0 1.0
2 1.0 1.0 1.0 0.0 0.0 0.0
3 0.0 1.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ...
7038 1.0 1.0 0.0 0.0 1.0 0.0
7039 0.0 0.0 0.0 0.0 0.0 0.0
7040 1.0 1.0 1.0 1.0 0.0 0.0
7041 1.0 1.0 1.0 1.0 0.0 0.0
7042 0.0 0.0 0.0 0.0 0.0 0.0

7043 rows × 6 columns

In [20]:
# hasInternet

res = data_raw['Internet']
internet_idx = np.where(res == 'Yes')[0]
hasInternet = pd.DataFrame(np.zeros_like(res), columns=['hasInternet'])
hasInternet.iloc[internet_idx,0] = 1
hasInternet.head(10)
Out[20]:
hasInternet
0 1
1 1
2 1
3 1
4 1
5 1
6 0
7 1
8 1
9 1
In [21]:
# isHighSpeed

res = data_raw['HighSpeed']
highSpeed_idx = np.where(res == 'Yes')[0]
isHighSpeed = pd.DataFrame(np.zeros_like(res), columns=['isHighSpeed'])
isHighSpeed.iloc[highSpeed_idx,0] = 1
isHighSpeed.head(10)
Out[21]:
isHighSpeed
0 0
1 0
2 1
3 0
4 0
5 1
6 0
7 1
8 1
9 1
In [22]:
# addedServices

res = data_raw['AddedServices']
addedServices_idx = np.where(res == 'Yes')[0]
addedServices = pd.DataFrame(np.zeros_like(res), columns=['addedServices'])
addedServices.iloc[addedServices_idx,0] = 1
addedServices.head(10)
Out[22]:
addedServices
0 1
1 1
2 1
3 1
4 0
5 1
6 1
7 1
8 1
9 0
In [23]:
# subscription

res = data_raw['Subscription']
choices = ['Annually', 'Biannually', 'Monthly']
subscription =  pd.DataFrame(np.zeros([res.shape[0], 3]), columns=['billed'+choice for choice in choices])

for i,choice in enumerate(choices):
    choice_idx = np.where(res == choice)[0]
    subscription.iloc[choice_idx,i] = 1

subscription
Out[23]:
billedAnnually billedBiannually billedMonthly
0 0.0 0.0 1.0
1 0.0 0.0 1.0
2 0.0 0.0 1.0
3 0.0 0.0 1.0
4 0.0 1.0 0.0
... ... ... ...
7038 0.0 0.0 1.0
7039 0.0 0.0 1.0
7040 0.0 0.0 1.0
7041 0.0 0.0 1.0
7042 0.0 0.0 1.0

7043 rows × 3 columns

In [24]:
# paymentMethod

res = data_raw['PaymentMethod']
choices = ['Bank transfer', 'Cash', 'Credit card', 'Net Banking']
paymentMethod =  pd.DataFrame(np.zeros([res.shape[0], 4]), columns=['pay'+choice.replace(' ','') for choice in choices])

for i,choice in enumerate(choices):
    choice_idx = np.where(res == choice)[0]
    paymentMethod.iloc[choice_idx,i] = 1

paymentMethod
Out[24]:
payBanktransfer payCash payCreditcard payNetBanking
0 0.0 1.0 0.0 0.0
1 1.0 0.0 0.0 0.0
2 0.0 0.0 0.0 1.0
3 0.0 1.0 0.0 0.0
4 0.0 0.0 1.0 0.0
... ... ... ... ...
7038 0.0 0.0 0.0 1.0
7039 1.0 0.0 0.0 0.0
7040 0.0 0.0 0.0 1.0
7041 0.0 0.0 1.0 0.0
7042 0.0 1.0 0.0 0.0

7043 rows × 4 columns

In [25]:
dataframes = [
    'isMale',
    'isSenior',
    'isMarried',
    'hasChildren',
    'tvConnection',
    'channels',
    'hasInternet',
    'isHighSpeed',
    'addedServices',
    'subscription',
    'paymentMethod'
]

free = [
    'tenure', 
    'custId', 
    'MonthlyCharges', 
    'TotalCharges'
]

data = pd.concat([eval(dataframe) for dataframe in dataframes], sort=False, axis=1)
data2 = pd.concat([data_raw[free_idx] for free_idx in free], sort=False, axis=1).reset_index()
print(data.shape)
print(data2.shape)
data = pd.concat([data, data2], axis=1)
data = data.drop(['index'], axis=1)
(7043, 23)
(7043, 5)
In [26]:
for i,val in enumerate(data["TotalCharges"]):
    
    if(val != ' '):
        data.loc[i,"TotalCharges"] = float(val)
        
    else:
        data.loc[i,"TotalCharges"] = float('nan')
In [27]:
notna = data["TotalCharges"][data["TotalCharges"].notna()]
mean = notna.sum() / len(notna)
# print(mean)
data = data.fillna(mean)
# data[data['TotalCharges'].isna()]
In [28]:
data.columns
Out[28]:
Index(['isMale', 'isSenior', 'isMarried', 'hasChildren', 'hasConnection_No',
       'hasConnection_Cable', 'hasConnection_DTH', 'channel1', 'channel2',
       'channel3', 'channel4', 'channel5', 'channel6', 'hasInternet',
       'isHighSpeed', 'addedServices', 'billedAnnually', 'billedBiannually',
       'billedMonthly', 'payBanktransfer', 'payCash', 'payCreditcard',
       'payNetBanking', 'tenure', 'custId', 'MonthlyCharges', 'TotalCharges'],
      dtype='object')
In [29]:
data = data.astype({
    'isMale' : np.int8,
    'isSenior' : np.int8,
    'isMarried' : np.int8,
    'hasChildren' : np.int8,
    'hasConnection_No' : np.int8,
    'hasConnection_Cable' : np.int8,
    'hasConnection_DTH' : np.int8,
    'channel1' : np.int8,
    'channel2' : np.int8,
    'channel3' : np.int8,
    'channel4' : np.int8,
    'channel5' : np.int8,
    'channel6' : np.int8,
    'hasInternet' : np.int8,
    'isHighSpeed' : np.int8,
    'addedServices' : np.int8,
    'billedAnnually' : np.int8,
    'billedBiannually' : np.int8,
    'billedMonthly' : np.int8,
    'payBanktransfer' : np.int8,
    'payCash' : np.int8,
    'payCreditcard' : np.int8,
    'payNetBanking' : np.int8,
    'tenure' : np.int8,
    'custId' : np.int16,
    'MonthlyCharges' : np.float64,
    'TotalCharges' : np.float64,
})
In [30]:
data.transpose()
Out[30]:
0 1 2 3 4 5 6 7 8 9 ... 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042
isMale 1.00 1.00 1.0 1.00 1.00 1.00 1.00 0.00 0.0 1.00 ... 0.00 0.00 0.00 0.0 0.00 0.00 0.00 1.00 1.00 1.00
isSenior 0.00 1.00 1.0 0.00 0.00 0.00 0.00 1.00 0.0 0.00 ... 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 1.00 0.00
isMarried 0.00 0.00 0.0 0.00 0.00 1.00 1.00 0.00 1.0 0.00 ... 0.00 0.00 0.00 0.0 1.00 0.00 1.00 1.00 0.00 0.00
hasChildren 0.00 0.00 0.0 0.00 0.00 0.00 1.00 0.00 0.0 0.00 ... 0.00 0.00 0.00 0.0 1.00 0.00 1.00 0.00 0.00 0.00
hasConnection_No 0.00 0.00 0.0 0.00 1.00 0.00 0.00 0.00 0.0 0.00 ... 0.00 0.00 1.00 0.0 1.00 0.00 0.00 0.00 0.00 1.00
hasConnection_Cable 1.00 1.00 1.0 1.00 0.00 0.00 0.00 1.00 0.0 0.00 ... 1.00 1.00 0.00 1.0 0.00 1.00 0.00 1.00 1.00 0.00
hasConnection_DTH 0.00 0.00 0.0 0.00 0.00 1.00 1.00 0.00 1.0 1.00 ... 0.00 0.00 0.00 0.0 0.00 0.00 1.00 0.00 0.00 0.00
channel1 1.00 0.00 1.0 0.00 0.00 1.00 0.00 1.00 1.0 0.00 ... 0.00 0.00 0.00 0.0 0.00 1.00 0.00 1.00 1.00 0.00
channel2 0.00 1.00 1.0 1.00 0.00 1.00 1.00 0.00 1.0 0.00 ... 0.00 1.00 0.00 0.0 0.00 1.00 0.00 1.00 1.00 0.00
channel3 0.00 0.00 1.0 0.00 0.00 0.00 1.00 0.00 1.0 0.00 ... 0.00 0.00 0.00 0.0 0.00 0.00 0.00 1.00 1.00 0.00
channel4 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.0 0.00 ... 0.00 1.00 0.00 1.0 0.00 0.00 0.00 1.00 1.00 0.00
channel5 0.00 0.00 0.0 0.00 0.00 1.00 1.00 0.00 1.0 0.00 ... 0.00 0.00 0.00 1.0 0.00 1.00 0.00 0.00 0.00 0.00
channel6 0.00 1.00 0.0 0.00 0.00 1.00 0.00 0.00 1.0 0.00 ... 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00
hasInternet 1.00 1.00 1.0 1.00 1.00 1.00 0.00 1.00 1.0 1.00 ... 1.00 1.00 1.00 1.0 1.00 1.00 1.00 1.00 1.00 1.00
isHighSpeed 0.00 0.00 1.0 0.00 0.00 1.00 0.00 1.00 1.0 1.00 ... 0.00 1.00 0.00 1.0 1.00 0.00 0.00 1.00 1.00 0.00
addedServices 1.00 1.00 1.0 1.00 0.00 1.00 1.00 1.00 1.0 0.00 ... 1.00 1.00 0.00 0.0 1.00 1.00 1.00 1.00 1.00 0.00
billedAnnually 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.0 0.00 ... 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00
billedBiannually 0.00 0.00 0.0 0.00 1.00 1.00 0.00 0.00 1.0 0.00 ... 0.00 0.00 0.00 0.0 1.00 0.00 0.00 0.00 0.00 0.00
billedMonthly 1.00 1.00 1.0 1.00 0.00 0.00 1.00 1.00 0.0 1.00 ... 1.00 1.00 1.00 1.0 0.00 1.00 1.00 1.00 1.00 1.00
payBanktransfer 0.00 1.00 0.0 0.00 0.00 0.00 0.00 0.00 0.0 0.00 ... 0.00 0.00 0.00 0.0 0.00 0.00 1.00 0.00 0.00 0.00
payCash 1.00 0.00 0.0 1.00 0.00 0.00 0.00 0.00 0.0 0.00 ... 0.00 0.00 1.00 0.0 0.00 0.00 0.00 0.00 0.00 1.00
payCreditcard 0.00 0.00 0.0 0.00 1.00 1.00 0.00 0.00 1.0 0.00 ... 0.00 1.00 0.00 1.0 1.00 0.00 0.00 0.00 1.00 0.00
payNetBanking 0.00 0.00 1.0 0.00 0.00 0.00 1.00 1.00 0.0 1.00 ... 1.00 0.00 0.00 0.0 0.00 1.00 0.00 1.00 0.00 0.00
tenure 1.00 6.00 9.0 5.00 16.00 70.00 34.00 9.00 64.0 5.00 ... 2.00 45.00 15.00 35.0 23.00 1.00 1.00 24.00 24.00 5.00
custId 888.00 4093.00 4966.0 4788.00 1531.00 3063.00 6889.00 875.00 1048.0 1084.00 ... 6137.00 4598.00 607.00 1604.0 1636.00 3957.00 3376.00 2396.00 1011.00 6457.00
MonthlyCharges 80.85 85.15 100.5 80.10 18.95 76.95 44.85 84.45 85.0 47.15 ... 70.95 89.30 19.90 85.3 25.60 95.85 44.35 104.65 102.95 21.05
TotalCharges 80.85 503.60 918.6 398.55 326.80 5289.80 1442.60 762.50 5484.4 223.15 ... 137.95 4016.85 320.45 2917.5 514.75 95.85 44.35 2542.45 2496.70 113.85

27 rows × 7043 columns

In [31]:
data.dtypes
Out[31]:
isMale                    int8
isSenior                  int8
isMarried                 int8
hasChildren               int8
hasConnection_No          int8
hasConnection_Cable       int8
hasConnection_DTH         int8
channel1                  int8
channel2                  int8
channel3                  int8
channel4                  int8
channel5                  int8
channel6                  int8
hasInternet               int8
isHighSpeed               int8
addedServices             int8
billedAnnually            int8
billedBiannually          int8
billedMonthly             int8
payBanktransfer           int8
payCash                   int8
payCreditcard             int8
payNetBanking             int8
tenure                    int8
custId                   int16
MonthlyCharges         float64
TotalCharges           float64
dtype: object
In [32]:
labels = train_raw['Satisfied']
labels = np.array(labels)
In [33]:
file = open('data.pkl', 'ab') 
pickle.dump(data, file)

file = open('labels.pkl', 'ab') 
pickle.dump(labels, file)       

Train Test

In [34]:
file = open('data.pkl', 'rb')      
data = pickle.load(file) 

file = open('labels.pkl', 'rb')      
labels = pickle.load(file)

train_len = 4930
test_len = 2113

fdata = data
In [35]:
from sklearn.preprocessing import StandardScaler, RobustScaler
data = StandardScaler().fit_transform(fdata)
data = pd.DataFrame(data, columns=fdata.columns)
data.transpose()
Out[35]:
0 1 2 3 4 5 6 7 8 9 ... 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042
isMale 0.990532 0.990532 0.990532 0.990532 0.990532 0.990532 0.990532 -1.009559 -1.009559 0.990532 ... -1.009559 -1.009559 -1.009559 -1.009559 -1.009559 -1.009559 -1.009559 0.990532 0.990532 0.990532
isSenior -0.439916 2.273159 2.273159 -0.439916 -0.439916 -0.439916 -0.439916 2.273159 -0.439916 -0.439916 ... -0.439916 -0.439916 -0.439916 -0.439916 -0.439916 -0.439916 -0.439916 -0.439916 2.273159 -0.439916
isMarried -0.966622 -0.966622 -0.966622 -0.966622 -0.966622 1.034530 1.034530 -0.966622 1.034530 -0.966622 ... -0.966622 -0.966622 -0.966622 -0.966622 1.034530 -0.966622 1.034530 1.034530 -0.966622 -0.966622
hasChildren -0.654012 -0.654012 -0.654012 -0.654012 -0.654012 -0.654012 1.529024 -0.654012 -0.654012 -0.654012 ... -0.654012 -0.654012 -0.654012 -0.654012 1.529024 -0.654012 1.529024 -0.654012 -0.654012 -0.654012
hasConnection_No -0.525927 -0.525927 -0.525927 -0.525927 1.901403 -0.525927 -0.525927 -0.525927 -0.525927 -0.525927 ... -0.525927 -0.525927 1.901403 -0.525927 1.901403 -0.525927 -0.525927 -0.525927 -0.525927 1.901403
hasConnection_Cable 1.129102 1.129102 1.129102 1.129102 -0.885660 -0.885660 -0.885660 1.129102 -0.885660 -0.885660 ... 1.129102 1.129102 -0.885660 1.129102 -0.885660 1.129102 -0.885660 1.129102 1.129102 -0.885660
hasConnection_DTH -0.723740 -0.723740 -0.723740 -0.723740 -0.723740 1.381712 1.381712 -0.723740 1.381712 1.381712 ... -0.723740 -0.723740 -0.723740 -0.723740 -0.723740 -0.723740 1.381712 -0.723740 -0.723740 -0.723740
channel1 1.256171 -0.796070 1.256171 -0.796070 -0.796070 1.256171 -0.796070 1.256171 1.256171 -0.796070 ... -0.796070 -0.796070 -0.796070 -0.796070 -0.796070 1.256171 -0.796070 1.256171 1.256171 -0.796070
channel2 -0.790132 1.265612 1.265612 1.265612 -0.790132 1.265612 1.265612 -0.790132 1.265612 -0.790132 ... -0.790132 1.265612 -0.790132 -0.790132 -0.790132 1.265612 -0.790132 1.265612 1.265612 -0.790132
channel3 -0.723968 -0.723968 1.381277 -0.723968 -0.723968 -0.723968 1.381277 -0.723968 1.381277 -0.723968 ... -0.723968 -0.723968 -0.723968 -0.723968 -0.723968 -0.723968 -0.723968 1.381277 1.381277 -0.723968
channel4 -0.725563 -0.725563 -0.725563 -0.725563 -0.725563 -0.725563 -0.725563 -0.725563 -0.725563 -0.725563 ... -0.725563 1.378241 -0.725563 1.378241 -0.725563 -0.725563 -0.725563 1.378241 1.378241 -0.725563
channel5 -0.639439 -0.639439 -0.639439 -0.639439 -0.639439 1.563872 1.563872 -0.639439 1.563872 -0.639439 ... -0.639439 -0.639439 -0.639439 1.563872 -0.639439 1.563872 -0.639439 -0.639439 -0.639439 -0.639439
channel6 -0.633933 1.577454 -0.633933 -0.633933 -0.633933 1.577454 -0.633933 -0.633933 1.577454 -0.633933 ... -0.633933 -0.633933 -0.633933 -0.633933 -0.633933 -0.633933 -0.633933 -0.633933 -0.633933 -0.633933
hasInternet 0.327438 0.327438 0.327438 0.327438 0.327438 0.327438 -3.054010 0.327438 0.327438 0.327438 ... 0.327438 0.327438 0.327438 0.327438 0.327438 0.327438 0.327438 0.327438 0.327438 0.327438
isHighSpeed -0.854176 -0.854176 1.170719 -0.854176 -0.854176 1.170719 -0.854176 1.170719 1.170719 1.170719 ... -0.854176 1.170719 -0.854176 1.170719 1.170719 -0.854176 -0.854176 1.170719 1.170719 -0.854176
addedServices 0.829798 0.829798 0.829798 0.829798 -1.205113 0.829798 0.829798 0.829798 0.829798 -1.205113 ... 0.829798 0.829798 -1.205113 -1.205113 0.829798 0.829798 0.829798 0.829798 0.829798 -1.205113
billedAnnually -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 ... -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249 -0.514249
billedBiannually -0.562975 -0.562975 -0.562975 -0.562975 1.776278 1.776278 -0.562975 -0.562975 1.776278 -0.562975 ... -0.562975 -0.562975 -0.562975 -0.562975 1.776278 -0.562975 -0.562975 -0.562975 -0.562975 -0.562975
billedMonthly 0.904184 0.904184 0.904184 0.904184 -1.105970 -1.105970 0.904184 0.904184 -1.105970 0.904184 ... 0.904184 0.904184 0.904184 0.904184 -1.105970 0.904184 0.904184 0.904184 0.904184 0.904184
payBanktransfer -0.529885 1.887201 -0.529885 -0.529885 -0.529885 -0.529885 -0.529885 -0.529885 -0.529885 -0.529885 ... -0.529885 -0.529885 -0.529885 -0.529885 -0.529885 -0.529885 1.887201 -0.529885 -0.529885 -0.529885
payCash 1.835513 -0.544807 -0.544807 1.835513 -0.544807 -0.544807 -0.544807 -0.544807 -0.544807 -0.544807 ... -0.544807 -0.544807 1.835513 -0.544807 -0.544807 -0.544807 -0.544807 -0.544807 -0.544807 1.835513
payCreditcard -0.525047 -0.525047 -0.525047 -0.525047 1.904590 1.904590 -0.525047 -0.525047 1.904590 -0.525047 ... -0.525047 1.904590 -0.525047 1.904590 1.904590 -0.525047 -0.525047 -0.525047 1.904590 -0.525047
payNetBanking -0.711026 -0.711026 1.406418 -0.711026 -0.711026 -0.711026 1.406418 1.406418 -0.711026 1.406418 ... 1.406418 -0.711026 -0.711026 -0.711026 -0.711026 1.406418 -0.711026 1.406418 -0.711026 -0.711026
tenure -1.277445 -1.073843 -0.951682 -1.114563 -0.666639 1.532261 0.066327 -0.951682 1.287938 -1.114563 ... -1.236724 0.514251 -0.707359 0.107048 -0.381597 -1.277445 -1.277445 -0.340876 -0.340876 -1.114563
custId -1.295042 0.281338 0.710724 0.623174 -0.978782 -0.225267 1.656552 -1.301436 -1.216346 -1.198639 ... 1.286680 0.529723 -1.433252 -0.942877 -0.927138 0.214447 -0.071318 -0.553332 -1.234544 1.444072
MonthlyCharges 0.534710 0.677625 1.187796 0.509783 -1.522595 0.405090 -0.661784 0.654359 0.672639 -0.585341 ... 0.205674 0.815554 -1.491021 0.682610 -1.301576 1.033249 -0.678402 1.325725 1.269224 -1.452799
TotalCharges -0.972453 -0.785795 -0.602559 -0.832178 -0.863858 1.327467 -0.371196 -0.671483 1.413389 -0.909623 ... -0.947242 0.765418 -0.866662 0.280020 -0.780872 -0.965830 -0.988569 0.114423 0.094223 -0.957883

27 rows × 7043 columns

In [36]:
cols = [
        'isMale', 
        'isSenior', 
        'isMarried', 
        'hasChildren', 
        'hasConnection_No',
        'hasConnection_Cable', 
        'hasConnection_DTH', 
        'channel1', 
        'channel2',
        'channel3', 
        'channel4', 
        'channel5', 
        'channel6', 
        'hasInternet',
        'isHighSpeed', 
        'addedServices', 
        'billedAnnually', 
        'billedBiannually',
        'billedMonthly', 
        'payBanktransfer', 
        'payCash', 
        'payCreditcard',
        'payNetBanking', 
        'custId', 
        'tenure', 
        'MonthlyCharges', 
        'TotalCharges'
       ]
In [37]:
new_col = [
#         'isMale', 
        'isSenior', 
#         'isMarried', 
#         'hasChildren', 
        'hasConnection_No',
        'hasConnection_Cable', 
#         'hasConnection_DTH', 
#         'channel1', 
#         'channel2',
#         'channel3', 
#         'channel4', 
        'channel5', 
        'channel6', 
#         'hasInternet',
#         'isHighSpeed', 
        'addedServices', 
#         'billedAnnually', 
#         'billedBiannually',
        'billedMonthly', 
#         'payBanktransfer', 
#         'payCash', 
#         'payCreditcard',
        'payNetBanking', 
#         'custId', 
#         'tenure', 
#         'MonthlyCharges', 
#         'TotalCharges'
       ]
In [38]:
lda_cols = [
#         'isSenior', 
#         'isMarried', 
#         'hasChildren', 
        'hasConnection_No',
        'hasConnection_Cable', 
#         'channel5', 
#         'channel6', 
        'addedServices', 
#         'billedAnnually', 
        'tenure', 
        'billedMonthly',
#         'TotalCharges'
       ]
In [39]:
insane_cols = [
        'billedMonthly'
       ]
In [40]:
non_corr = [
        'isMale', 
#         'isSenior', 
#         'isMarried', 
#         'hasChildren', 
#         'hasConnection_No',
#         'hasConnection_Cable', 
#         'hasConnection_DTH', 
#         'channel1', 
#         'channel2',
#         'channel3', 
#         'channel4', 
#         'channel5', 
#         'channel6', 
#         'hasInternet',
#         'isHighSpeed', 
#         'addedServices', 
#         'billedAnnually', 
#         'billedBiannually',
#         'billedMonthly', 
#         'payBanktransfer', 
#         'payCash', 
#         'payCreditcard',
#         'payNetBanking', 
#         'custId', 
        'tenure', 
        'MonthlyCharges', 
        'TotalCharges'
       ]
In [41]:
data = fdata[cols]
In [42]:
import seaborn as sns
check = pd.concat([data[:train_len], pd.DataFrame(labels, columns=['label'])], sort=False, axis=1)

plt.figure(figsize=(30,18))
sns.heatmap(check.corr(), annot=True, cmap='RdYlGn', linewidths=0.2, cbar=False) 
plt.yticks(fontsize="20")
plt.xticks(fontsize="15", rotation=30)
plt.show()
In [43]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.decomposition import KernelPCA, PCA


def rand(data, pca_num=None):
#     data = PCA(pca_num).fit_transform(data) 
    X_train, X_val, y_train, y_val = train_test_split(
                                data[:train_len], 
                                labels, 
                                test_size=0.1, 
                                random_state=None, 
                        )
    
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)
#     X_train, y_train = ADASYN().fit_resample(X_train, y_train)
    return X_train, X_val, y_train, y_val

X_test = data[train_len:]
Using TensorFlow backend.

Model1

In [44]:
# %%time

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

skf = StratifiedKFold(n_splits=5)
train_data = data[:train_len]
# pca = PCA(n_components=8)
# train_data = pca.fit_transform(train_data)
# print(pca.explained_variance_ratio_)
# print("done")

train_avg = 0
val_avg = 0

train_data = np.asarray(train_data)
skf.get_n_splits(train_data, labels)

for train_idx,val_idx in skf.split(train_data, labels):

    X_train, y_train = train_data[train_idx,:], labels[train_idx]
    X_val, y_val = train_data[val_idx,:], labels[val_idx]
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    model = LinearDiscriminantAnalysis()
    t = model.fit(X_train, y_train) 

    train_score = roc_auc_score(y_train, model.predict(X_train))
    val_score = roc_auc_score(y_val, model.predict(X_val))
    train_avg += train_score
    val_avg += val_score

train_avg = train_avg / 5
val_avg = val_avg / 5

print(train_avg)
print(val_avg)
print(log_loss(y_train, model.predict_proba(X_train)))
print(log_loss(y_val, model.predict_proba(X_val)))
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
0.7773407239724082
0.7625261489284038
0.47452418037640354
0.5245791469667758
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")

Model2

In [45]:
# %%time

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix


skf = StratifiedKFold(n_splits=5)
train_data = data[:train_len]

lda = LinearDiscriminantAnalysis()
train_data = lda.fit_transform(data[:train_len], labels)
print(train_data.shape)

# pca = KernelPCA(n_components=8)
# train_data = pca.fit_transform(train_data)
# print(pca.explained_variance_ratio_)

# print("done")

train_avg = 0
val_avg = 0

train_data = np.asarray(train_data)
skf.get_n_splits(train_data, labels)

for train_idx,val_idx in skf.split(train_data, labels):

    X_train, y_train = train_data[train_idx,:], labels[train_idx]
    X_val, y_val = train_data[val_idx,:], labels[val_idx]
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    kmeans = KMeans(n_clusters=10, random_state=0)
    kmeans.fit(X_train)

    train_preds = kmeans.predict(X_train)

    cm = confusion_matrix(y_train,train_preds).astype(float)
    cm[0,] = cm[0,] / cm[0,].sum() * 100
    cm[1,] = cm[1,] / cm[1,].sum() * 100
    cluster_idx = np.argmax(cm, axis=0)

    train_preds = [cluster_idx[i] for i in train_preds]
    train_score = roc_auc_score(y_train, train_preds)

    val_preds = kmeans.predict(X_val)
    val_preds = [cluster_idx[i] for i in val_preds]
    val_score = roc_auc_score(y_val, val_preds)
    
    train_avg += train_score
    val_avg += val_score

train_avg = train_avg / 5
val_avg = val_avg / 5

print(train_avg)
print(val_avg)
# print(log_loss(y_train, model.predict_proba(X_train)))
# print(log_loss(y_val, model.predict_proba(X_val)))
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
(4930, 1)
0.7680546960372417
0.7655279272064586

Model3

In [46]:
# %%time

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

skf = StratifiedKFold(n_splits=5)
train_data = data[:train_len]

lda = LinearDiscriminantAnalysis()
train_data = lda.fit_transform(data[:train_len], labels)

train_avg = 0
val_avg = 0

train_data = np.asarray(train_data)
skf.get_n_splits(train_data, labels)

for train_idx,val_idx in skf.split(train_data, labels):

    X_train, y_train = train_data[train_idx,:], labels[train_idx]
    X_val, y_val = train_data[val_idx,:], labels[val_idx]
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    kmeans = KMeans(n_clusters=10, random_state=0)
    kmeans.fit(X_train)

    train_preds = kmeans.predict(X_train)

    cm = confusion_matrix(y_train,train_preds).astype(float)
    cm[0,] = cm[0,] / cm[0,].sum() * 100
    cm[1,] = cm[1,] / cm[1,].sum() * 100
    cluster_idx = np.argmax(cm, axis=0)

    train_preds = [cluster_idx[i] for i in train_preds]
    train_score = roc_auc_score(y_train, train_preds)

    val_preds = kmeans.predict(X_val)
    val_preds = [cluster_idx[i] for i in val_preds]
    val_score = roc_auc_score(y_val, val_preds)
    
    train_avg += train_score
    val_avg += val_score

train_avg = train_avg / 5
val_avg = val_avg / 5

print(train_avg)
print(val_avg)
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
0.7664322088767767
0.7652448751092942

Model4

In [47]:
%%time

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.metrics import confusion_matrix

skf = StratifiedKFold(n_splits=3)
train_data = data[:train_len]

# lda = LinearDiscriminantAnalysis()
# train_data = lda.fit_transform(data[:train_len], labels)

pca = KernelPCA(n_components=5)
train_data = pca.fit_transform(data[:train_len], labels)

train_avg = 0
val_avg = 0

train_data = np.asarray(train_data)
skf.get_n_splits(train_data, labels)

for train_idx,val_idx in skf.split(train_data, labels):

    X_train, y_train = train_data[train_idx,:], labels[train_idx]
    X_val, y_val = train_data[val_idx,:], labels[val_idx]
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    kmeans = AffinityPropagation(verbose=True,convergence_iter=50, max_iter=300, damping=0.7)
    kmeans.fit(X_train)

    train_preds = kmeans.predict(X_train)

    cm = confusion_matrix(y_train,train_preds).astype(float)
    cm[0,] = cm[0,] / cm[0,].sum() * 100
    cm[1,] = cm[1,] / cm[1,].sum() * 100
    cluster_idx = np.argmax(cm, axis=0)

    train_preds = [cluster_idx[i] for i in train_preds]
    train_score = roc_auc_score(y_train, train_preds)

    val_preds = kmeans.predict(X_val)
    val_preds = [cluster_idx[i] for i in val_preds]
    val_score = roc_auc_score(y_val, val_preds)
    
    train_avg += train_score
    val_avg += val_score
    break

train_avg = train_avg / 1
val_avg = val_avg / 1

print(train_avg)
print(val_avg)
Converged after 208 iterations.
0.6354598177299089
0.5921919997573271
CPU times: user 1min 47s, sys: 1.87 s, total: 1min 49s
Wall time: 1min 48s

Model5

In [48]:
# %%time

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.cluster import KMeans, Birch
from sklearn.metrics import confusion_matrix

skf = StratifiedKFold(n_splits=5)
train_data = data[:train_len]

# lda = LinearDiscriminantAnalysis()
# train_data = lda.fit_transform(data[:train_len], labels)

train_avg = 0
val_avg = 0

train_data = np.asarray(train_data)
skf.get_n_splits(train_data, labels)

for train_idx,val_idx in skf.split(train_data, labels):

    X_train, y_train = train_data[train_idx,:], labels[train_idx]
    X_val, y_val = train_data[val_idx,:], labels[val_idx]
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    kmeans = Birch(threshold=0.01)
    kmeans.fit(X_train)

    train_preds = kmeans.predict(X_train)

    cm = confusion_matrix(y_train,train_preds).astype(float)
    cm[0,] = cm[0,] / cm[0,].sum() * 100
    cm[1,] = cm[1,] / cm[1,].sum() * 100
    cluster_idx = np.argmax(cm, axis=0)

    train_preds = [cluster_idx[i] for i in train_preds]
    train_score = roc_auc_score(y_train, train_preds)

    val_preds = kmeans.predict(X_val)
    val_preds = [cluster_idx[i] for i in val_preds]
    val_score = roc_auc_score(y_val, val_preds)
    
    train_avg += train_score
    val_avg += val_score

train_avg = train_avg / 5
val_avg = val_avg / 5

print(train_avg)
print(val_avg)
0.591272310658578
0.5835043902703871
In [ ]:
 
In [ ]:
 
In [ ]:
 

Predict

Model 1

In [49]:
# train_data = data[:train_len]

# # lda = LinearDiscriminantAnalysis()
# # train_data = lda.fit_transform(data[:train_len], labels)

# pca = KernelPCA(n_components=5)
# train_data = pca.fit(data[:train_len], labels)
# test_data = pca.transform(data[:test_len], labels)

# X_train, y_train = SMOTE().fit_resample(X_train, y_train)

# kmeans = AffinityPropagation(verbose=True,convergence_iter=50, max_iter=300, damping=0.7)
# kmeans.fit(X_train)

# train_preds = kmeans.predict(X_train)

# cm = confusion_matrix(y_train,train_preds).astype(float)
# cm[0,] = cm[0,] / cm[0,].sum() * 100
# cm[1,] = cm[1,] / cm[1,].sum() * 100
# cluster_idx = np.argmax(cm, axis=0)

# train_preds = [cluster_idx[i] for i in train_preds]
# train_score = roc_auc_score(y_train, train_preds)

# val_preds = kmeans.predict(X_val)
# val_preds = [cluster_idx[i] for i in val_preds]
# val_score = roc_auc_score(y_val, val_preds)
In [ ]:
 

Model2

In [50]:
pca_num = None
# data = PCA(pca_num).fit_transform(data)

# pca = KernelPCA(n_components=8)
# data = pca.fit_transform(data)

X_train = data[:train_len]
X_test = data[train_len:]
y_train = labels
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

model = LinearDiscriminantAnalysis()
t = model.fit(X_train, y_train)
roc_auc_score(y_train, model.predict(X_train))
/opt/conda/lib/python3.6/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
Out[50]:
0.7752002209334439
In [51]:
def generate_submission(model, fname):
    pred_test = model.predict(X_test)

    sub = pd.DataFrame({
        'custId' : test_raw['custId'],
        'Satisfied': pred_test
    })
    
    sub.to_csv(fname, index=False)
    return sub
In [52]:
# generate_submission(model, 'pred21-all_features-lda-no_pca-SMOTE-full_data-st_normalized.csv')
generate_submission(model, 'predxx-Final.csv')
Out[52]:
custId Satisfied
0 3904 1
1 5496 0
2 497 0
3 4260 0
4 4748 0
... ... ...
2108 3957 0
2109 3376 0
2110 2396 0
2111 1011 0
2112 6457 1

2113 rows × 2 columns

In [ ]: