# MachineLearningSamples-ChurnPrediction/CATelcoCustomerChurnModelingWithoutDprep.py at master · sethmott/MachineLearningSamples-ChurnPrediction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Customer Churn Prediction
import pickle

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import csv
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

from azureml.logging import get_azureml_logger

# Create the Azure ML run logger and record this sample's identifier
# (logged with the string value 'true') for the current run.
run_logger = get_azureml_logger()
run_logger.log(
    'amlrealworld.ChurnPrediction.CATelcoCustomerChurnModelingWithoutDprep',
    'true',
)

# Perform Data Preparation
# Load the training sample, replace missing values with 0 and remove exact
# duplicate rows.
df = pd.read_csv('data/CATelcoCustomerChurnTrainingSample.csv')
df = df.fillna(0)
df = df.drop_duplicates()
# Remove the 'year' and 'month' columns before modeling. `axis` is passed by
# keyword: the positional form drop(label, 1) was deprecated in pandas 1.3
# and removed in pandas 2.0.
df = df.drop(['year', 'month'], axis=1)

# One-Hot Encoding
# Replace every categorical/object column with one indicator column per
# distinct value, named "<original column>_<value>".
columns_to_encode = list(df.select_dtypes(include=['category', 'object']))
for column_to_encode in columns_to_encode:
    dummies = pd.get_dummies(df[column_to_encode])
    # str() is required: filling missing values with 0 can leave the integer 0
    # among the values of a text column, and concatenating a non-str value to
    # the column name would raise TypeError.
    dummies.columns = [
        column_to_encode + '_' + str(col_name) for col_name in dummies.columns
    ]
    # dummies shares df's index, so the index-based join lines rows up 1:1.
    df = df.drop(column_to_encode, axis=1).join(dummies)

# Train and evaluate a Gaussian Naive Bayes classifier on the prepared data.
model = GaussianNB()

random_seed = 42
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(df, random_state=random_seed, test_size=0.3)

# Separate the 'churn' label from the features. `axis` is passed by keyword
# because the positional form drop(label, 1) was removed in pandas 2.0.
target = train['churn'].values
train = train.drop('churn', axis=1).values
model.fit(train, target)

expected = test['churn'].values
# Convert the held-out features to an ndarray as well, so fit() and predict()
# receive the same input type (newer scikit-learn warns when a model fitted
# on a plain array is later given a DataFrame with column names).
test = test.drop('churn', axis=1).values
predicted = model.predict(test)
# Compute the score once and reuse it for both the console and the run log.
nb_accuracy = accuracy_score(expected, predicted)
print("Naive Bayes Classification Accuracy", nb_accuracy)
# Log the Naive Bayes accuracy
run_logger.log("Naive Bayes Accuracy", nb_accuracy)

# Fit a decision tree on the same training split and score it on the same
# held-out rows used for the Naive Bayes model.
dt = DecisionTreeClassifier(random_state=99, min_samples_split=20).fit(train, target)
predicted = dt.predict(test)
dt_accuracy = accuracy_score(expected, predicted)
print("Decision Tree Classification Accuracy", dt_accuracy)
# Record the decision tree accuracy in the run log
run_logger.log("DTree Accuracy", dt_accuracy)

# serialize the model on disk in the special 'outputs' folder
# NOTE: `model` is the Gaussian Naive Bayes classifier; the decision tree
# `dt` is not persisted.
print("Export the model to outputs/model.pkl")
# The context manager closes the file even if pickle.dump raises.
with open('./outputs/model.pkl', 'wb') as f:
    pickle.dump(model, f)