# homework1/naive.py at master · machinelearningpku2016/homework1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import csv

# Load the comma-separated records. The context manager closes the file
# handle, and blank lines are skipped so they cannot turn into one-field rows.
with open('data/car/car.data') as source:
    data = [line.replace("\n", "").split(',') for line in source if line.strip()]

# Deal the row indices into three disjoint groups by position modulo 7:
# remainder 0 -> check, remainder 1 -> test, every other remainder -> train.
train_list = []
check_list = []
test_list = []
for i in range(len(data)):
    remainder = i % 7
    if remainder == 0:
        check_list.append(i)
    elif remainder == 1:
        # `elif` (not a second `if`) keeps the remainder-0 rows out of the
        # `else` branch, so check rows are not also used for training.
        test_list.append(i)
    else:
        train_list.append(i)

# Materialise the three row subsets from their index lists.
train_data = [data[k] for k in train_list]
check_data = [data[k] for k in check_list]
test_data = [data[k] for k in test_list]

# Collect the distinct values observed in each column of the training rows.
n = 6  # columns 0..n-1 are used as features below; column n is the class label
N = len(train_data)

# typestatistics[i] lists the distinct values of column i in order of first
# appearance; a value's position in that list is its integer code.
typestatistics = [[] for _ in range(n + 1)]
# The loop variable is deliberately not called `data`, so the loaded
# dataset bound to that name is not overwritten by the last training row.
for row in train_data:
    for i in range(n + 1):
        atype = row[i]
        if atype not in typestatistics[i]:
            typestatistics[i].append(atype)

# typecount[i] is the number of distinct values seen in column i.
typecount = [len(values) for values in typestatistics]


# Allocate the tables indexed as [feature][feature value][class]:
# condicount holds integer tallies, condiprob the float estimates.
# Every innermost list is built separately so no rows are shared.
condicount = [
    [[0] * typecount[n] for _ in range(typecount[feature])]
    for feature in range(n)
]
condiprob = [
    [[0.0] * typecount[n] for _ in range(typecount[feature])]
    for feature in range(n)
]
# One tally per class value, all starting at zero.
ycount = [0] * typecount[n]

# Tally class frequencies and (feature value, class) co-occurrences
# over the first N training rows.
for row_index in range(N):
    sample = train_data[row_index]
    # Integer code of this row's class label.
    y = typestatistics[n].index(sample[n])
    ycount[y] += 1
    for j in range(n):
        # Integer code of the value this row takes for feature j.
        x = typestatistics[j].index(sample[j])
        condicount[j][x][y] += 1

# Estimate the smoothed conditional probabilities:
#   condiprob[i][x][y] = (condicount[i][x][y] + lamda) / (ycount[y] + Sj * lamda)
# where Sj is the number of distinct values of feature i.
lamda = 1.0
for i in range(n):
    Sj = float(typecount[i])
    for x in range(typecount[i]):
        tallies = condicount[i][x]
        estimates = condiprob[i][x]
        for y in range(typecount[n]):
            numerator = float(tallies[y]) + lamda
            denominator = float(ycount[y]) + Sj * lamda
            estimates[y] = numerator / denominator

# Evaluate the classifier on the test rows and print the accuracy.
correct = 0
wrong = 0
classdata = test_data
Ntest = len(classdata)

# Smoothed class priors P(y) taken from the training tallies. The naive
# Bayes score is P(y) * prod_i P(x_i | y); without the prior factor,
# frequent and rare classes would be weighted as if equally likely.
prior_smoothing = 1.0
prior_denominator = float(sum(ycount)) + typecount[n] * prior_smoothing
classprior = [
    (float(ycount[y]) + prior_smoothing) / prior_denominator
    for y in range(typecount[n])
]

# The loop variable is not called `data`, so the loaded dataset bound to
# that name is left intact.
for sample in classdata:
    # pridprob[y] is the unnormalised posterior score of class y.
    pridprob = []
    for y in range(typecount[n]):
        score = classprior[y]
        for i in range(n):
            # .index raises ValueError for a value never seen in training.
            x = typestatistics[i].index(sample[i])
            score *= condiprob[i][x][y]
        pridprob.append(score)

    # Highest-scoring class; on ties max() keeps the lowest class index.
    pridtype = max(range(typecount[n]), key=pridprob.__getitem__)
    realy = typestatistics[n].index(sample[n])

    if pridtype == realy:
        correct += 1
    else:
        wrong += 1

# An empty test set reports 0.0 instead of raising ZeroDivisionError.
accuracyrate = float(correct) / float(Ntest) if Ntest else 0.0
print(accuracyrate)