-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdisaster_regression.py
More file actions
77 lines (56 loc) · 2.04 KB
/
disaster_regression.py
File metadata and controls
77 lines (56 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import csv
import os
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# Directory containing this script. Dataset paths are resolved relative to it
# so the script does not depend on the current working directory.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))

# Build the paths with os.path.join instead of hard-coded "/" separators so
# they use the platform's separator and never produce a doubled separator
# when BASE_PATH is a filesystem root.
TRAIN_DATASET_PATH = os.path.join(
    BASE_PATH, "datasets", "titanic", "train.csv"
)
TEST_DATASET_PATH = os.path.join(
    BASE_PATH, "datasets", "titanic", "test.csv"
)

# Columns selected from each dataset as model input, in this order.
DATASET_COLUMNS = [
    'Pclass',
    'Sex',
    'Age',
]
def _create_dataset(training=False):
    """Read one of the Titanic CSV files and apply light cleaning.

    Reads TRAIN_DATASET_PATH when ``training`` is true, otherwise
    TEST_DATASET_PATH. The 'Sex' column is replaced by 1 for 'male' and 0
    for any other value, and missing 'Age' and 'Fare' entries are filled
    with the mean of their own column.

    Returns the cleaned pandas DataFrame.
    """
    csv_path = TRAIN_DATASET_PATH if training else TEST_DATASET_PATH
    frame = pandas.read_csv(csv_path)

    def encode_sex(value):
        # Anything other than the exact string 'male' maps to 0.
        if value == 'male':
            return 1
        return 0

    frame['Sex'] = frame['Sex'].apply(encode_sex)

    # Fill gaps in the numeric columns with that column's mean.
    for column in ('Age', 'Fare'):
        frame[column] = frame[column].fillna(frame[column].mean())

    return frame
def _create_train_dataset():
    """Load the training dataset and split it into labels and features.

    Returns a ``(survived, data)`` tuple: the values of the 'Survived'
    column, and the values of the DATASET_COLUMNS columns for each row.
    """
    frame = _create_dataset(training=True)

    # Per-passenger feature rows, then the matching survival labels.
    features = frame[DATASET_COLUMNS].values
    labels = frame['Survived'].values

    return labels, features
def _create_test_dataset():
    """Load the test dataset and return its features and identifying columns.

    Returns a ``(data, passenger_id, passenger_name)`` tuple: the values of
    the DATASET_COLUMNS columns, of the 'PassengerId' column and of the
    'Name' column.
    """
    frame = _create_dataset()

    # Feature rows to predict on; this file is not read for a 'Survived' label.
    features = frame[DATASET_COLUMNS].values
    ids = frame["PassengerId"].values
    names = frame["Name"].values

    return features, ids, names
# Load the labelled training data.
survived, training_data = _create_train_dataset()

# Create the logistic-regression classifier (kept under the name `regressor`)
# and fit it on the training features and survival labels.
regressor = LogisticRegression()
regressor.fit(training_data, survived)

# Predict survival for every passenger in the test dataset.
testing_data, passenger_id, passenger_name = _create_test_dataset()
predicted = regressor.predict(testing_data)

# Collect the predictions next to each passenger's id and name.
results = pandas.DataFrame(columns=['PassengerId', 'PassengerName', 'Survived'])
results['PassengerId'] = passenger_id
results['PassengerName'] = passenger_name
results['Survived'] = predicted.astype(bool)

# Parenthesised single-argument form: same output on Python 2, and valid
# syntax on Python 3 (the bare `print results` statement is a SyntaxError).
print(results)