-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathSplitter_C.pyx
More file actions
143 lines (106 loc) · 4.6 KB
/
Copy pathSplitter_C.pyx
File metadata and controls
143 lines (106 loc) · 4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
from Predicate import Predicate
import cython
__author__ = 'popka'
from utils.Placement import Placement
import numpy as np
cimport numpy as np
from math import fabs
import time
from Impurity.RegressionImpurity import RegressionImpurity
class Splitter():
"""
Сплиттер находит оптимальное разделение "параметр предиката"
"""
@staticmethod
def split_categorial(x, y, impurity):
combiner = Placement(np.unique(x))
max_impurity = -1
best_c = []
for c in combiner:
mask = np.in1d(x, c)
y_left = y[mask]
y_right = y[np.invert(mask)]
imp = impurity.calculate_split(y_left, y_right)
if imp > max_impurity:
max_impurity = imp
best_c = c
return best_c, max_impurity
@staticmethod
def split_quantitative(x, y, impurity, steps=100):
"""
Функция делит количественную переменную так, чтобы разделение приводило к максимальному уменьшению импюрити
:param x: столбец признака
:param y: столбец целевой функции
:param impurity: объект-наследний класса Impurity
:param steps: количество шагов. Если None, то просмотреть все объекты в отдельности
:return: возвращает лучшее разделение и знаение импюрити
"""
max_impurity = -1
best_value = 0
if steps is None:
argsort = x.argsort()
x = x[argsort]
y = y[argsort]
for value in x[:-1]:
y_left = y[x<=value]
y_right = y[x>value]
if len(y_right) == 0:
continue
imp = impurity.calculate_split(y_left, y_right)
if imp > max_impurity:
max_impurity = imp
best_value = value
else:
current = min(x)
end = max(x)
step = float(end-current)/steps
current += step
# 1.1 только, чтобы оставался в конце хотя бы один элемент. Т.е. Значение было больше одного шага и меньше двух шагов
while (current < end-step*1.1):
y_left = y[x<=current]
y_right = y[x>current]
imp = impurity.calculate_split(y_left, y_right)
if imp > max_impurity:
max_impurity = imp
best_value = current
current += step
return best_value, max_impurity
#np.ndarray[np.float64_t, ndim=1]
@staticmethod
def split_quick_quantitative(x, y, impurity):
cdef float max_gain = -1
cdef float best_value = -1
cdef np.ndarray[np.longlong_t, ndim=1] argsort = x.argsort()
x = x[argsort]
y = y[argsort]
cdef float imp_full = impurity.calculate_node(y)
cdef int length = len(y)
#rs - right_split
cdef Split rs = Split(np.mean(y[1:]), imp_full, impurity.calculate_node(y[1:]), len(y)-1)
#ls - left_split
cdef Split ls = Split(y[0], 0, 0, 1)
for i in range(1, len(y)-1):
rs.prev_mean = rs.mean
rs.mean = (rs.length*rs.mean - y[i])/(rs.length-1)
rs.var = fabs((rs.var*rs.length-(y[i]-rs.mean)*(y[i]-rs.prev_mean))/(rs.length-1))
rs.length -= 1.
ls.length += 1.
ls.prev_mean = ls.mean
ls.mean = ls.prev_mean+(y[i]-ls.prev_mean)/ls.length
ls.var = fabs(((ls.length-1)*ls.var + (y[i]-ls.prev_mean)*(y[i]-ls.mean))/ls.length)
if (x[i+1]!=x[i]):
gain = imp_full - rs.var*rs.length/length - ls.var*ls.length/length
if gain > max_gain:
max_gain = gain
best_value = (x[i]+x[i+1])/2.
# если не нашли максимального (по сути, когда все значения столбца одинаковы)
# возвращаем None, None
return best_value, max_gain
cdef class Split:
cdef public float mean, prev_mean, var, length
def __init__(self, float mean, float prev_mean, float var, float length):
self.mean = mean
self.prev_mean = prev_mean
self.var = var
self.length = length