-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinearregression.py
More file actions
135 lines (104 loc) · 4.88 KB
/
linearregression.py
File metadata and controls
135 lines (104 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# -*- coding: utf-8 -*-
"""LinearRegression.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/188IWg8peo3MaDh-R479doxR30dmUghgR
"""
# import necessary packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import itertools
import numpy as np
# Widen pandas display limits so full frames print during exploration.
# ('display.max_rows' is the canonical option name.)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
# Column names for the UCI Boston Housing dataset: 13 features plus the
# target column 'medv' (median home value).
COLUMNS = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm',
           'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# The raw file is whitespace-delimited and has no header row.
df = pd.read_csv(url, delim_whitespace=True, names=COLUMNS)
# BUG FIX: the shuffled frame must be assigned back to df -- the original
# call discarded its result, so the data was never actually shuffled and the
# later positional train/test/predict split was not randomized.
df = df.sample(frac=1).reset_index(drop=True)
# Quick sanity checks: first rows, dtypes/nulls, and summary statistics.
print(df.head())
print(df.info())
print(df.describe().transpose())
# Exploratory visualization: per-column boxplots and pairwise scatter plots.
sns.set(style='ticks')
sns.boxplot(data=df)
sns.pairplot(data=df)
# Pearson correlation between every pair of columns.
corr_data = df.corr(method='pearson')
print(corr_data)
# Render the correlation matrix as a heatmap with column names on both axes.
plt.matshow(corr_data)
tick_positions = range(len(corr_data.columns))
plt.xticks(tick_positions, corr_data.columns)
plt.yticks(tick_positions, corr_data.columns)
plt.colorbar()
plt.show()
# Columns to rescale: everything except 'age' (bucketized later, so it keeps
# its raw year values) and the target 'medv' (scaled with its own scaler so
# predictions can be inverse-transformed back to the original range).
feature_mask = ~df.columns.isin(['age', 'medv'])
df_subcategories = df.loc[:, feature_mask]
# Min-max scale the independent variables into [0, 1].
x_scaler = MinMaxScaler()
df.loc[:, feature_mask] = x_scaler.fit_transform(df_subcategories)
# Fit a separate scaler on the dependent variable 'medv'; keeping this
# scaler around lets us map model outputs back to real prices later.
y_scaler = MinMaxScaler()
df.loc[:, 'medv'] = y_scaler.fit_transform(df[['medv']].values)
print(df)
# Every continuous feature becomes a plain numeric feature column. 'age' is
# excluded here because it is discretized into buckets instead, and 'medv'
# is the label so it gets no feature column at all.
continuous_columns = [c for c in df.columns if c not in ['age', 'medv']]
bucketized_columns = ['age']
cont_feature_cols = [tf.feature_column.numeric_column(c) for c in continuous_columns]
print(cont_feature_cols)
# 'age' is first wrapped as a numeric column, then discretized into
# 5-year-wide buckets covering 0..100.
age_numeric_cols = tf.feature_column.numeric_column('age')
age_bucketized_cols = [tf.feature_column.bucketized_column(
    source_column=age_numeric_cols,
    boundaries=list(range(0, 105, 5)))]
print(age_bucketized_cols)
# Positional split: first 80% train, next 18% test, final 2% held out for
# prediction (relies on the earlier shuffle for randomization).
n_rows = len(df)
train_end = int(0.8 * n_rows)
test_end = int(0.98 * n_rows)
train_data = df.iloc[:train_end, :]
test_data = df.iloc[train_end:test_end, :]
prediction_data = df.iloc[test_end:, :]
print(train_data.shape, test_data.shape, prediction_data.shape)
# The last column ('medv') is the label; everything before it is a feature.
cols = list(df.columns)
LABEL = cols.pop(-1)
FEATURES = cols
print(cols)
print('FEATURES: ', FEATURES)
print('LABEL: ', LABEL)
# Linear regressor over the numeric + bucketized feature columns; checkpoints
# and event files go to the 'linear_train_4' directory.
estimator = tf.estimator.LinearRegressor(
    feature_columns=cont_feature_cols + age_bucketized_cols,
    model_dir='linear_train_4')


def get_input_fn(data, num_epochs=None, n_batch=128, shuffle=True):
    """Build a pandas-backed input_fn for training, evaluation or prediction.

    Args:
        data: DataFrame containing both the FEATURES columns and the LABEL.
        num_epochs: passes over the data; None repeats indefinitely.
        n_batch: batch size fed to the estimator.
        shuffle: whether rows are shuffled each epoch.
    """
    feature_frame = pd.DataFrame({name: data[name].values for name in FEATURES})
    label_series = pd.Series(data[LABEL].values)
    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=label_series,
        batch_size=n_batch,
        num_epochs=num_epochs,
        shuffle=shuffle,
    )
# Train the linear model: 10 epochs over the training split, shuffled.
estimator.train(input_fn=get_input_fn(train_data, num_epochs=10, n_batch=128, shuffle=True))
# Evaluate once over the test split; no shuffling so results are repeatable.
ev = estimator.evaluate(input_fn=get_input_fn(test_data, num_epochs=1, n_batch=128, shuffle=False))
loss = ev['loss']
print('Loss: ', loss)
# BUG FIX: predict with shuffle=False. The original passed shuffle=True,
# which scrambles the prediction input, so the yielded predictions could not
# be matched back to the rows of prediction_data (and were printed in a
# meaningless order below).
y_hat = estimator.predict(input_fn=get_input_fn(prediction_data, num_epochs=1, n_batch=128, shuffle=False))
# Materialize exactly one prediction per held-out row.
predictions = [p['predictions'] for p in itertools.islice(y_hat, prediction_data.shape[0])]
# Undo the MinMax scaling that was applied to 'medv' so the printed values
# are back on the original price scale.
print(y_scaler.inverse_transform(predictions))