#Source: DataAnalysisAndMachineLearning/StockPricePredictorV1.py (GitHub repository RedZapdos123/DataAnalysisAndMachineLearning, main branch).
#A program that manually implements multiple linear regression, with standard scaling of the features,
# to predict a stock's closing price from the same day's opening, highest and lowest prices and trading volume,
# plus the previous day's closing price.
#The dataset should have columns in this format: Date, Open, High, Low, Close, Adjusted Close, Volume.

import numpy as np
import pandas as pd

#Reading the dataset from a CSV file, reporting loading problems instead of raising them.
def loadData(filePath):
    """Read the CSV file at filePath into a pandas DataFrame.

    Prints a success message and returns the DataFrame when the file is read.
    When the file is missing, empty or cannot be parsed, the matching error
    message is printed and None is returned instead of raising.
    """
    try:
        frame = pd.read_csv(filePath)
    except FileNotFoundError:
        print(f"Error: File '{filePath}' not found.")
    except pd.errors.EmptyDataError:
        print("Error: File is empty.")
    except pd.errors.ParserError:
        print("Error: File is not in the correct format.")
    else:
        #Only reached when read_csv raised nothing.
        print("File loaded successfully.")
        return frame

    #Every handled failure falls through to here.
    return None

#Taking the file path as user input.
filePath = input("Input the datasets' file path: ")

data = loadData(filePath)

#Stopping early when loading failed. SystemExit is raised directly because the exit() helper
# is only injected by the site module and is not guaranteed to exist in every interpreter setup.
if data is None:
    raise SystemExit("Data loading failed. Exited.")

#Reporting missing columns explicitly instead of failing with a bare KeyError traceback.
requiredColumns = ['Open', 'High', 'Low', 'Close', 'Volume']
missingColumns = [column for column in requiredColumns if column not in data.columns]
if missingColumns:
    raise SystemExit(f"Dataset is missing required column(s): {missingColumns}. Exited.")

#Keeping only the needed columns. The explicit copy makes the selection an independent DataFrame,
# so adding 'PreviousClose' below does not trigger pandas' SettingWithCopyWarning.
data = data[requiredColumns].copy()

#Adding previous day's Closing price as column by shifting the rows of 'Close' columns downwards by one.
data['PreviousClose'] = data['Close'].shift(1)
#The first row has no previous close after the shift, so rows with missing values are dropped.
data.dropna(inplace = True)

#Standardizing the feature data columns.
features = ['Open', 'High', 'Low', 'Volume', 'PreviousClose']
meanValues = data[features].mean()
#A constant column has a standard deviation of 0, and dividing by it would fill the column with NaN;
# dividing by 1 instead leaves such a column at 0 after centering.
stdValues = data[features].std().replace(0, 1)
X = (data[features] - meanValues) / stdValues

#Keeping Y (the closing prices) in original scale.
Y = data['Close'].values

#Adding the bias term: a leading column of ones, which also turns X into a NumPy array.
X = np.hstack((np.ones((X.shape[0], 1)), X))

#Shuffling the data for better training and testing sets.
def shuffleData(X, Y):
    """Return X and Y reordered by one shared random permutation.

    The same random order is applied to both arrays, so row i of the
    returned X still corresponds to element i of the returned Y. The
    order is drawn from NumPy's global random state.
    """
    #permutation(n) yields a randomly ordered arange(n), i.e. a random ordering of the row positions.
    order = np.random.permutation(X.shape[0])
    shuffledX = X[order]
    shuffledY = Y[order]
    return shuffledX, shuffledY

#Seeding NumPy's global random state before shuffling, so the row order (and with it the
# train/test split and the reported metrics) is the same on every run instead of changing each time.
np.random.seed(69)
X, Y = shuffleData(X, Y)

#Splitting data into training and testing sets, in 80:20 ratio by default.
def dataSplit(X, Y, testSize = 0.2):
    """Split X and Y into leading training parts and trailing testing parts.

    No shuffling is done here: the first rows become the training set and
    the remaining rows become the testing set.

    Parameters:
        X: Sliceable sequence of samples; its length decides the split point.
        Y: Sliceable sequence of targets, split at the same position as X.
        testSize: Fraction of the rows kept for testing, between 0 and 1.

    Returns:
        The tuple (Xtrain, Xtest, Ytrain, Ytest).

    Raises:
        ValueError: If testSize is not between 0 and 1.
    """
    if not 0 <= testSize <= 1:
        raise ValueError(f"testSize must be between 0 and 1, got {testSize}.")

    #Rounding away floating-point noise before truncating: 10 * (1 - 0.9) evaluates to
    # 0.9999999999999998, which a bare int() would truncate to 0 instead of 1.
    trainSize = int(round(len(X) * (1 - testSize), 9))
    Xtrain, Xtest = X[:trainSize], X[trainSize:]
    Ytrain, Ytest = Y[:trainSize], Y[trainSize:]
    return Xtrain, Xtest, Ytrain, Ytest

#Performing the 80:20 train/test split on the prepared data.
Xtrain, Xtest, Ytrain, Ytest = dataSplit(X, Y, testSize = 0.2)

#Initializing the weights: one value per column of the training matrix,
# drawn from the standard normal distribution after fixing the seed.
np.random.seed(69)
w = np.random.standard_normal(Xtrain.shape[1])

#The Mean Squared Error function (halved, as used for the gradient descent cost).
def MSE(X, Y, w):
    """Return half of the mean squared error of the predictions X.dot(w) against Y.

    The squared differences between the predictions and Y are summed and
    scaled by 1/(2*m), where m is the number of rows of X.
    """
    sampleCount = X.shape[0]
    residuals = X.dot(w) - Y
    squaredErrorSum = np.sum(residuals**2)
    scale = 1/(2*sampleCount)
    return scale * squaredErrorSum

#The Gradient Descent function.
def gradientDescent(X, Y, w, learningRate, epochs):
    """Fit linear-regression weights with batch gradient descent.

    Parameters:
        X: Feature matrix with one row per sample.
        Y: Target values, one per row of X.
        w: Initial weights, one per column of X. The array passed in is not modified.
        learningRate: Step size applied to the gradient on every update.
        epochs: Number of update steps to perform.

    Returns:
        The tuple (w, errors): the trained weights as a float array, and the list
        of MSE(X, Y, w) values measured after each update.
    """
    m = X.shape[0]
    errors = []

    #Working on a float copy: the in-place update below would otherwise overwrite the
    # caller's array, and would fail outright for an integer-typed initial w.
    w = np.array(w, dtype = float)

    for epoch in range(epochs):
        PredY = X.dot(w)
        #Gradient of the halved mean squared error with respect to the weights.
        dw = (1/m) * X.T.dot(PredY - Y)
        w -= learningRate*dw
        #The error is measured with the freshly updated weights.
        error = MSE(X, Y, w)
        errors.append(error)

        #Reporting progress on every hundredth epoch only.
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Error: {error}")

    return w, errors

#Training the model: the gradient descent hyperparameters, followed by the training run.
learningRate, epochs = 0.005, 5000
w, errors = gradientDescent(Xtrain, Ytrain, w, learningRate = learningRate, epochs = epochs)

#The Prediction function.
def predict(X, w):
    """Return the linear model's outputs: the product of X with the weights w."""
    predictions = X.dot(w)
    return predictions

#Predicting the closing prices of the testing set.
PredY = predict(Xtest, w)

#Evaluating the performance.
squaredErrors = (PredY - Ytest)**2
MSEValue = np.mean(squaredErrors)
print(f"Mean Squared Error: {MSEValue}")

#Using the Coefficient of Determination (R2) method: one minus the ratio of the
# residual sum of squares to the total sum of squares around the mean of the actual values.
residualSumOfSquares = np.sum((Ytest-PredY)**2)
totalSumOfSquares = np.sum((Ytest - np.mean(Ytest))**2)
R2Value = 1 - (residualSumOfSquares/totalSumOfSquares)
print(f"Coefficient of Determination (R2): {R2Value}")

#Displaying the first five predictions against actual values.
for predicted, actual in zip(PredY[:5], Ytest[:5]):
    print(f"Predicted: {predicted}; Actual: {actual}")