GradientBoosting/Splitter_C.pyx at master · pashna/GradientBoosting · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
from Predicate import Predicate
import cython

__author__ = 'popka'
from utils.Placement import Placement
import numpy as np
cimport numpy as np
from math import fabs
import time
from Impurity.RegressionImpurity import RegressionImpurity


class Splitter():
    """
    Сплиттер находит оптимальное разделение "параметр предиката"
    """

    @staticmethod
    def split_categorial(x, y, impurity):

        combiner = Placement(np.unique(x))
        max_impurity = -1
        best_c = []
        for c in combiner:

            mask = np.in1d(x, c)
            y_left = y[mask]
            y_right = y[np.invert(mask)]
            imp = impurity.calculate_split(y_left, y_right)

            if imp > max_impurity:
                max_impurity = imp
                best_c = c

        return best_c, max_impurity


    @staticmethod
    def split_quantitative(x, y, impurity, steps=100):
        """
        Функция делит количественную переменную так, чтобы разделение приводило к максимальному уменьшению импюрити
        :param x: столбец признака
        :param y: столбец целевой функции
        :param impurity: объект-наследний класса Impurity
        :param steps: количество шагов. Если None, то просмотреть все объекты в отдельности
        :return: возвращает лучшее разделение и знаение импюрити
        """
        max_impurity = -1
        best_value = 0

        if steps is None:
            argsort = x.argsort()
            x = x[argsort]
            y = y[argsort]


            for value in x[:-1]:

                y_left = y[x<=value]
                y_right = y[x>value]
                if len(y_right) == 0:
                    continue
                imp = impurity.calculate_split(y_left, y_right)

                if imp > max_impurity:
                    max_impurity = imp
                    best_value = value
        else:
            current = min(x)
            end = max(x)
            step = float(end-current)/steps
            current += step
            # 1.1 только, чтобы оставался в конце хотя бы один элемент. Т.е. Значение было больше одного шага и меньше двух шагов
            while (current < end-step*1.1):
                y_left = y[x<=current]
                y_right = y[x>current]
                imp = impurity.calculate_split(y_left, y_right)

                if imp > max_impurity:
                    max_impurity = imp
                    best_value = current

                current += step

        return best_value, max_impurity

    #np.ndarray[np.float64_t, ndim=1]
    @staticmethod
    def split_quick_quantitative(x, y, impurity):

        cdef float max_gain = -1
        cdef float best_value = -1

        cdef np.ndarray[np.longlong_t, ndim=1] argsort = x.argsort()
        x = x[argsort]
        y = y[argsort]

        cdef float imp_full = impurity.calculate_node(y)
        cdef int length = len(y)

        #rs - right_split
        cdef Split rs = Split(np.mean(y[1:]), imp_full, impurity.calculate_node(y[1:]), len(y)-1)

        #ls - left_split
        cdef Split ls = Split(y[0], 0, 0, 1)

        for i in range(1, len(y)-1):

            rs.prev_mean = rs.mean
            rs.mean = (rs.length*rs.mean - y[i])/(rs.length-1)
            rs.var = fabs((rs.var*rs.length-(y[i]-rs.mean)*(y[i]-rs.prev_mean))/(rs.length-1))
            rs.length -= 1.

            ls.length += 1.
            ls.prev_mean = ls.mean
            ls.mean = ls.prev_mean+(y[i]-ls.prev_mean)/ls.length
            ls.var = fabs(((ls.length-1)*ls.var + (y[i]-ls.prev_mean)*(y[i]-ls.mean))/ls.length)


            if (x[i+1]!=x[i]):
                gain = imp_full - rs.var*rs.length/length - ls.var*ls.length/length

                if gain > max_gain:
                    max_gain = gain
                    best_value = (x[i]+x[i+1])/2.


        # если не нашли максимального (по сути, когда все значения столбца одинаковы)
        # возвращаем None, None
        return best_value, max_gain


cdef class Split:

    cdef public float mean, prev_mean, var, length

    def __init__(self, float mean, float prev_mean, float var, float length):
        self.mean = mean
        self.prev_mean = prev_mean
        self.var = var
        self.length = length