LightML.jl/src/supervised_learning/randomForests.jl at master · memoiry/LightML.jl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87

# Random-forest classifier: a collection of `ClassificationTree`s, each of which is
# trained on its own random subset of feature columns (see `train!` for this type).
# The field order is the positional-constructor order used by the keyword constructor.
type randomForest
    # Forwarded to every ClassificationTree as `min_samples_split`.
    min_split::Int64
    # Forwarded to every ClassificationTree as `min_gain`.
    min_gain::Float64
    # Forwarded to every ClassificationTree as `max_depth`.
    max_depth::Integer
    # Number of trees the keyword constructor creates and `train!` / `predict` loop over.
    n_estimators::Int64
    # Number of feature columns sampled per tree. The string "nothing" is a sentinel
    # that `train!` replaces with trunc(Int64, sqrt(n_feature)).
    max_features::Union{Int64, String}
    # Maps tree number i to the column indices that tree i was trained on; filled by
    # `train!` and read back by `predict`.
    feature_index::Dict{Int64, Vector}
    # The individual trees of the ensemble.
    trees::Vector{ClassificationTree}
end

"""
    randomForest(; min_split, min_gain, max_depth, n_estimators, max_features,
                 feature_index, trees)

Keyword constructor for a `randomForest`.

Appends `n_estimators` freshly constructed `ClassificationTree`s (configured with
`min_gain`, `min_split` and `max_depth`) to `trees`, then stores all arguments in a
new `randomForest`. `max_features = "nothing"` is the sentinel that `train!` later
replaces with a value derived from the number of feature columns.
"""
function randomForest(;
                      min_split::Int64 = 2,
                      min_gain::Float64 = 1e-7,
                      max_depth::Integer = 10000000,
                      n_estimators::Int64 = 200,
                      max_features::Union{Int64, String} = "nothing",
                      feature_index::Dict{Int64,Vector} = Dict{Int64,Vector}(),
                      trees::Vector{ClassificationTree} = Vector{ClassificationTree}()
                      )
    # Build all the new trees first, then add them to `trees` in one call.
    fresh_trees = ClassificationTree[
        ClassificationTree(min_gain = min_gain,
                           min_samples_split = min_split,
                           max_depth = max_depth)
        for t = 1:n_estimators
    ]
    append!(trees, fresh_trees)
    forest = randomForest(min_split, min_gain, max_depth, n_estimators,
                          max_features, feature_index, trees)
    return forest
end


"""
    train!(model::randomForest, X::Matrix, y::Vector)

Fit every tree of `model` on its own random subset of the data.

`get_random_subsets(X, y, model.n_estimators)` is expected to return an array whose
slice `[i, :, :]` holds the i-th subset, with the labels in the last column and the
features in the remaining columns. For tree `i`, `model.max_features` feature columns
are drawn without replacement, remembered in `model.feature_index[i]` (so `predict` can
select the same columns), and the tree is trained on those columns only.

If `model.max_features` is still the sentinel string "nothing", it is replaced in
place by `trunc(Int64, sqrt(n_feature))`.

# Arguments
- `model`: forest to train; its `trees`, `feature_index` and possibly `max_features`
  are mutated.
- `X`: feature matrix, one row per sample.
- `y`: label vector, one entry per row of `X`.
"""
function train!(model::randomForest, X::Matrix, y::Vector)
    n_feature = size(X, 2)
    if model.max_features == "nothing"
        model.max_features = trunc(Int64, sqrt(n_feature))
    end
    sets = get_random_subsets(X, y, model.n_estimators)
    for i = 1:model.n_estimators
        # Each tree gets its OWN subset. Indexing with a constant 1 here would train
        # every tree on the same rows and defeat the purpose of bagging.
        subset = sets[i, :, :]
        X_sub = subset[:, 1:(end-1)]
        y_sub = subset[:, end]
        idx = sample(1:n_feature, model.max_features, replace = false)
        model.feature_index[i] = idx
        train!(model.trees[i], X_sub[:, idx], y_sub)
    end
    return nothing
end

"""
    predict(model::randomForest, x::Matrix)

Predict a label for every row of `x` by majority vote over the trees of `model`.

Tree `i` is evaluated on the columns `model.feature_index[i]` of `x`, i.e. the same
feature subset that was recorded for it during training. For each sample the label
predicted by the largest number of trees wins; on a tie the label that appears first
among that sample's tree predictions is kept.

# Arguments
- `model`: a trained `randomForest`.
- `x`: feature matrix, one row per sample.

# Returns
A `Vector{Float64}` of length `size(x, 1)`. Labels are stored as floats, so they have
to be numeric. With zero estimators every entry is `0.0`.
"""
function predict(model::randomForest,
                 x::Matrix)
    n_sample = size(x, 1)
    # votes[k, i] is the label tree i predicts for sample k.
    votes = zeros(n_sample, model.n_estimators)
    for i = 1:model.n_estimators
        idx = model.feature_index[i]
        votes[:, i] = predict(model.trees[i], x[:, idx])
    end
    result = zeros(n_sample)
    for k = 1:n_sample
        sample_votes = votes[k, :]
        most = 0.0
        max_time = 0
        for label in unique(sample_votes)
            # Count only the votes cast for THIS sample.
            n_times = count(v -> v == label, sample_votes)
            if n_times > max_time
                # Track the running maximum; otherwise the last label would always win.
                max_time = n_times
                most = label
            end
        end
        result[k] = most
    end
    return result
end


"""
    test_randomForest()

Demo / smoke test: train a default `randomForest` on the data returned by
`make_iris()`, print the classification accuracy on the test split, and plot the test
samples coloured by their predicted label after projecting them with a `PCA` model
fitted on the test features.
"""
function test_randomForest()
    X_train, X_test, y_train, y_test = make_iris()
    model = randomForest()
    train!(model, X_train, y_train)
    predictions = predict(model, X_test)
    println("classification accuracy ", accuracy(y_test, predictions))
    # PCA is used here only to obtain a 2-D view of the test data for plotting.
    pcamodel = PCA()
    train!(pcamodel, X_test)
    plot_in_2d(pcamodel, X_test, predictions, "RandomForest")
end