-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathRegression.py
More file actions
136 lines (94 loc) · 3.76 KB
/
Regression.py
File metadata and controls
136 lines (94 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#pip install numpy
#pip install scipy
#pip install scikit-learn
#pip install matplotlib
#pip install quandl
import pandas as pd
import quandl
# Download the GOOGL table from Quandl's WIKI dataset and preview it.
df = quandl.get("WIKI/GOOGL")
print(df.head())

# Narrow the frame to the five 'Adj.' columns, derive two percentage
# features from them, and keep only the four columns used from here on:
#   HL_PCT     - (high - low) as a percentage of the close
#   PCT_change - (close - open) as a percentage of the open
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].assign(
    HL_PCT=lambda prices: (
        (prices['Adj. High'] - prices['Adj. Low']) / prices['Adj. Close'] * 100.0
    ),
    PCT_change=lambda prices: (
        (prices['Adj. Close'] - prices['Adj. Open']) / prices['Adj. Open'] * 100.
    ),
)[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
print(df.head())
#Regression - Features and Labels
import quandl, math
import numpy as np
import pandas as pd
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression
# Features and label.
# Every current column is treated as a feature. The label is the value of
# forecast_col taken forecast_out rows into the future, where forecast_out
# is 1% of the dataset length rounded up with math.ceil (the smallest
# integer greater than or equal to its argument).
forecast_col = 'Adj. Close'
forecast_out = int(math.ceil(len(df) * 0.01))

# Replace missing values with a sentinel before the label column exists,
# so this fill never touches the NaNs that the shift below introduces.
df.fillna(-99999, inplace=True)

# A negative shift moves values up: each row receives the forecast_col
# value from forecast_out rows later, and the final forecast_out rows,
# which have no future value, become NaN.
df['label'] = df[forecast_col].shift(periods=-forecast_out)
#Regression - Training and Testing
#We'll then drop any still NaN information from the dataframe:
#df.dropna(inplace=True)
#X = np.array(df.drop(['label'], 1))
#y = np.array(df['label'])
#X = preprocessing.scale(X)
#y = np.array(df['label'])
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#clf = svm.SVR()
#clf = LinearRegression()
#clf.fit(X_train, y_train)
#confidence = clf.score(X_test, y_test)
#print(confidence)
#clf = LinearRegression()
#clf = LinearRegression(n_jobs=-1)
#for k in ['linear','poly','rbf','sigmoid']:
# clf = svm.SVR(kernel=k)
# clf.fit(X_train, y_train)
# confidence = clf.score(X_test, y_test)
# print(k,confidence)
#Predict
# Feature matrix: every column except the label.
# The columns are named by keyword because pandas 2.0 removed the positional
# axis argument, so the older spelling df.drop(['label'], 1) raises TypeError.
X = np.array(df.drop(columns=['label']))

# Scale all rows together, before splitting, so the rows to be forecast are
# standardized with the same statistics as the rows used for training.
X = preprocessing.scale(X)

# The last forecast_out rows have no label (their future price is unknown):
# set them aside as the inputs to predict, and train on everything earlier.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Drop the rows with a NaN label in place so that y lines up row-for-row
# with the truncated X. Later code sees the shortened df as well.
df.dropna(inplace=True)
y = np.array(df['label'])

# Hold out 20% of the labeled rows for scoring.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)

# For LinearRegression, score() is the R^2 on the held-out rows.
confidence = clf.score(X_test, y_test)
print(confidence)

# One predicted value per row of X_lately.
forecast_set = clf.predict(X_lately)
print(forecast_set, confidence, forecast_out)
#visualizing this information
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

# New, empty column that will hold the predicted prices. It is the last
# column of df, which the row construction below relies on.
df['Forecast'] = np.nan

# NOTE(review): df has already had its unlabeled tail removed by the
# in-place dropna earlier in the script, so this is the date of the last
# *labeled* row, not the last downloaded row. The forecast therefore starts
# forecast_out rows before the true end of the data - confirm whether that
# offset is intended.
last_date = df.iloc[-1].name

# Every appended row is NaN in all columns except the trailing Forecast.
blank_features = [np.nan] * (len(df.columns) - 1)

# Append one row per prediction, spaced one calendar day apart (weekends
# and holidays are not skipped). Dates are computed with timedelta
# arithmetic rather than a timestamp()/fromtimestamp() round trip, because
# fromtimestamp() converts to the machine's local timezone and would shift
# the dates off midnight (or onto the wrong day) outside UTC.
for day_offset, prediction in enumerate(forecast_set, start=1):
    next_date = last_date + datetime.timedelta(days=day_offset)
    df.loc[next_date] = blank_features + [prediction]

df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()