-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLogisticsRegressionWithNoramlization.py
More file actions
126 lines (87 loc) · 426 KB
/
LogisticsRegressionWithNoramlization.py
File metadata and controls
126 lines (87 loc) · 426 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""DataSciPart5(logisticsRegression).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1KpjK6qqdUz9dvAst5j66aOSfiaaUsbUy
# Logistic Regression
> In statistics, the logistic model is used to model the probability of a certain class or event existing, such as pass/fail, win/lose, or alive/dead.
> Linear vs. Logistic


> Sometimes we need normalization or scaling
* The goal of normalization is to change the values of numeric columns in the dataset to a common scale, without distorting the differences in the ranges of values. (e.g. min-max scaling maps 100, 200, 432, 430 -> 0.0, 0.30, 1.0, 0.99)
* Methods of Data Normalization – Decimal Scaling,Min-Max Normalization,z-Score Normalization(zero-mean Normalization)


> Types of Scaler
* Min Max Scaler(0,1) xnormalized=(x-xmin)/(xmax-xmin)
* Standard scaler (centers the data on a mean of 0): z = (x - u) / σ, where u is the mean and σ is the standard deviation
"""
# NORMALIZATION
# Demonstrates manual min-max normalization: each feature is rescaled to
# [0, 1] via (x - min) / (max - min), so features with very different
# ranges (here ~1-60 vs ~10000-70000) become directly comparable.
import numpy as np  # fix: np was used here before numpy was imported later in the file
import matplotlib.pyplot as plt

np.random.seed(0)  # fixed seed so the generated data is reproducible
x1 = np.random.randint(1, 60, 30)        # small-range feature (e.g. age-like)
x1 = np.sort(x1)                         # sorted so the plot reads left-to-right
x2 = np.random.randint(10000, 70000, 30) # large-range feature (e.g. salary-like)
plt.plot(x1, x2)

# Min-max normalize both features onto the common [0, 1] scale.
x1min = min(x1)
x1max = max(x1)
x2min = min(x2)
x2max = max(x2)
x1norm = (x1 - x1min) / (x1max - x1min)
x2norm = (x2 - x2min) / (x2max - x2min)
print(x1norm, x2norm)
plt.plot(x1, x2)  # NOTE(review): re-plots the raw data, not the normalized values — confirm intent
# Load the Social Network Ads dataset and split it into train/test sets.
import numpy as np  # fix: was imported mid-block, after it could be needed elsewhere
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/sudarshan-koirala/Logistic-Regression-Social-Network-Ads/master/Social_Network_Ads.csv')
print(df.head())
print("Columns we have: ", df.columns)  # fix: typo "Coumns"
print("info", df.info())

# Features: columns 2-3 (presumably Age and EstimatedSalary — see df.head()
# output); target: column 4 (Purchased).
# x = df.iloc[:, [2, 3]].values would be equivalent
x = df.iloc[:, 2:4].values
y = df.iloc[:, 4].values
print(x.ndim)
print("Count of purchased or not purchased: \n", df['Purchased'].value_counts())

from sklearn.model_selection import train_test_split
# test_size=0.8 -> 20% training / 80% testing (NOT 75/25 as the original
# comment claimed; the accuracy notes further below vary this fraction).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8, random_state=0)
print(f"x_train shape {x_train.shape}")
print(f"x_test shape {x_test.shape}")
print(f"y_train shape {y_train.shape}")
print(f"y_test shape {y_test.shape}")
# Class balance of each split:
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))
# Baseline: fit a logistic regression on the raw, unscaled features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(y_pred)
print(y_test)
# Accuracy is low (~68%) because salary and age have very different
# ranges and are not normalized to a common 0-1 scale yet.
accuracy_score(y_pred, y_test)
# Normalize the features, then refit the model.
# fit_transform on the TRAINING set learns the per-feature min/max and
# scales it; the TEST set is only transformed with those same training
# statistics, so no information leaks from the test set into the scaler.
from sklearn.preprocessing import MinMaxScaler

scale = MinMaxScaler()
x_train = scale.fit_transform(x_train)  # learn min/max, then scale to [0, 1]
x_test = scale.transform(x_test)        # reuse the training min/max

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(y_pred)
print(y_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)  # ~89% after scaling (vs ~68% unscaled)
# Accuracy observed for other splits:
#   test_size=0.1 -> 95%
#   test_size=0.3 -> 84%
#   test_size=0.5 -> 77.5%
#   test_size=0.8 -> 75%
# print(x_train)  # values are now scaled down to the [0, 1] range

# Confusion matrix. sklearn's signature is (y_true, y_pred); for binary
# labels the layout is [[TN, FP], [FN, TP]].
# fix: the original call passed (y_pred, y_test), which transposes the
# matrix, and its "#TP FP TN FN" comment misstated the layout.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
# To improve accuracy further: use a smaller test_size (e.g. 0.3) in
# train_test_split, or try an SVM instead.