Decision Tree Classification with Continuous Attribute Values

The problem and data come from Gender Recognition by Voice | Kaggle; the task is to predict a speaker's gender from a number of acoustic attributes of the voice.

A preview of the data:

(dataset preview screenshot)

All of these attributes are continuous. Since a continuous attribute no longer has a finite number of possible values, it has to be discretized first; the usual strategy is bi-partitioning (splitting on a threshold). The goal of this step is to keep samples of the same class together as much as possible, which is what makes classification work. "Keeping samples of the same class together as much as possible" translates into: the sum of the information entropies of the two subsets produced by the split should be as small as possible.

First sort all values of the attribute and take the midpoint of every pair of adjacent values as a candidate threshold. For each candidate, split the dataset into the samples whose value is greater than the threshold and those whose value is not, compute the information entropy of the two subsets, and finally keep the threshold that minimizes it as the split point, binarizing the attribute against it. In practice this threshold tends to lie near the mean of the attribute values, so the search range can be narrowed accordingly to speed up the computation.
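A minimal sketch of this criterion on toy data (the helper names binary_entropy and split_cost are invented for the sketch and do not appear in the actual code below): the chosen threshold is the midpoint that minimizes the sum of the two subsets' entropies.

import math

def binary_entropy(p):
    # entropy of a two-class distribution; a pure subset contributes 0
    if p == 0 or p == 1:
        return 0.0
    return -p * math.log(p, 2) - (1 - p) * math.log(1 - p, 2)

def split_cost(values, classes, thresh):
    # sum of the entropies of the two subsets produced by the threshold
    above = [c for v, c in zip(values, classes) if v > thresh]
    below = [c for v, c in zip(values, classes) if v <= thresh]
    cost = 0.0
    for part in (above, below):
        if part:
            cost += binary_entropy(part.count('male') / len(part))
    return cost

# toy data, already sorted; candidate thresholds are midpoints of adjacent values
values = [0.12, 0.15, 0.19, 0.23]
classes = ['male', 'male', 'female', 'female']
mids = [(a + b) / 2 for a, b in zip(values, values[1:])]
best = min(mids, key=lambda t: split_cost(values, classes, t))
print(best)  # ~0.17: both subsets are pure, so the entropy sum is 0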

import math
import numpy as np
import pandas as pd


dataset = pd.read_csv("/kaggle/input/voicegender/voice.csv")
# The first half of the dataset is all male and the second half all female,
# so split out the training and test sets as follows.
dataset = list(np.array(dataset))
data = dataset[:1267]
data = data + dataset[1584:2851]
test = dataset[1267:1584]
test = test + dataset[2851:]
data = np.array(data)

# binarization threshold for each attribute
threshold = [0 for i in range(len(data[0]) - 1)]


# find the threshold for every attribute
for i in range(len(data[0]) - 1):
    min_entropy = 100  # running minimum entropy
    feature_value = list(data[:, i])  # all values of this attribute
    feature_value.sort()  # sort them
    # midpoints of adjacent values are the candidate thresholds
    feature_value_avg = [(feature_value[x] + feature_value[x + 1]) / 2
                         for x in range(len(feature_value) - 1)]
    # narrow the search range to speed up the computation
    for m in range(int(len(feature_value_avg) * 0.3), int(len(feature_value_avg) * 0.7)):
        sum_1 = 0        # number of samples above the threshold
        count_male1 = 0  # number of male samples above the threshold
        sum_2 = 0        # number of samples below the threshold
        count_male2 = 0  # number of male samples below the threshold
        for r in data:
            if r[i] > feature_value_avg[m]:
                sum_1 += 1
                if r[-1] == 'male':
                    count_male1 += 1
            else:
                sum_2 += 1
                if r[-1] == 'male':
                    count_male2 += 1
        prob_male1 = count_male1 / sum_1
        prob_male2 = count_male2 / sum_2
        # sum of the information entropies of the two subsets; a pure subset
        # (probability 0 or 1) contributes zero, which also avoids log(0)
        entropy = 0.0
        for p in (prob_male1, prob_male2):
            if 0 < p < 1:
                entropy += - p * math.log(p, 2) - (1 - p) * math.log(1 - p, 2)
        if entropy < min_entropy:
            min_entropy = entropy
            threshold[i] = feature_value_avg[m]

# binarize the training data against the thresholds just found
for i in range(len(data)):
    for j in range(len(data[0]) - 1):
        if data[i][j] > threshold[j]:
            data[i][j] = 1
        else:
            data[i][j] = 0

Building the tree

Determine the order in which the attributes are used as decision criteria:

import operator


def Split_Data(dataset, axis, value):
    '''
    Split the dataset on the given axis and feature value.
    axis is the column index of the feature, value the feature value
    that the returned subset must have.
    '''
    new_subset = []
    # collect every sample whose value on this axis equals `value`,
    # dropping that column from the returned rows
    for vec in dataset:
        if vec[axis] == value:
            feature_split = list(vec[:axis])
            feature_split.extend(vec[axis + 1:])
            new_subset.append(feature_split)
            # extend adds the elements of vec one by one to feature_split;
            # append adds feature_split as a whole list to the subset

    return new_subset


def Split_by_entropy(dataset):
    '''
    Pick the split attribute by the entropy criterion.
    @information gain: info_gain = old - new
    @best attribute:   best_feature
    @value set:        uniVal
    '''
    feature_num = len(dataset[0]) - 1
    ent_old = cal_entropy(dataset)
    best_gain = 0.0
    best_feature = -1
    # ent_old is the entropy before the split, ent_new the entropy after;
    # best_gain is updated for every feature so that the best one is kept
    for i in range(feature_num):
        feature_list = [x[i] for x in dataset]
        uniVal = set(feature_list)
        ent_new = 0.0
        # set() removes duplicates, keeping the distinct values of this feature
        for value in uniVal:
            sub_set = Split_Data(dataset, i, value)
            prob = len(sub_set) / float(len(dataset))
            # weighted entropy of the subsets after the split
            ent_new += prob * cal_entropy(sub_set)

        # the feature with the largest gain ent_old - ent_new wins
        Info_gain = ent_old - ent_new
        if Info_gain > best_gain:
            best_gain = Info_gain
            best_feature = i

    return best_feature


def cal_entropy(data):
    # entropy of the class labels of a sample set
    entries_num = len(data)
    label_count = {}  # occurrence count of each class

    for vec in data:
        cur_label = vec[-1]
        # extract the label of each sample and count it
        label_count[cur_label] = label_count.get(cur_label, 0) + 1
    Entropy = 0.0
    # for every class, compute the probability of drawing it from the set,
    # then plug the probabilities into the entropy formula
    for key in label_count:
        prob = float(label_count[key]) / entries_num
        Entropy -= prob * math.log(prob, 2)
    return Entropy


def Majority_vote(classList):
    '''
    Majority vote: if class K has the most samples in this subset,
    the subset is labelled as class K.
    '''
    classcount = {}
    for vote in classList:
        classcount[vote] = classcount.get(vote, 0) + 1
    sorted_count = sorted(classcount.items(), key=operator.itemgetter(1),
                          reverse=True)
    # count how many samples fall into each class (0 if absent), sort,
    # and return the class with the largest count
    return sorted_count[0][0]


def Create_Tree(dataset, labels):

    classList = [x[-1] for x in dataset]
    # if every sample belongs to the same class, return that class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # if no features are left, fall back to a majority vote
    if len(dataset[0]) == 1:
        return Majority_vote(classList)

    best_feature = Split_by_entropy(dataset)
    best_labels = labels[best_feature]

    myTree = {best_labels: {}}
    # The book does del(labels[bestFeat]) at this point, which mutates the
    # original label list and eventually fails with
    # "'no surfacing' is not in list"; the corrected code works on a copy.

    # copy the current label list so the original is not modified
    subLabels = labels[:]
    # remove the label of the feature used for this split
    del(subLabels[best_feature])

    # the column of the chosen feature
    f_val = [x[best_feature] for x in dataset]
    uni_val = set(f_val)
    for value in uni_val:
        # recursively build and attach the subtree
        myTree[best_labels][value] = Create_Tree(
            Split_Data(dataset, best_feature, value), subLabels)

    return myTree


def classify(inp_tree, labels, test_vec):
    first_node = list(inp_tree.keys())[0]
    second_dict = inp_tree[first_node]
    index = labels.index(first_node)
    class_label = 'male'  # default if the value is not found in the tree
    for key in second_dict.keys():
        if test_vec[index] == key:
            if type(second_dict[key]).__name__ == 'dict':
                class_label = classify(second_dict[key], labels, test_vec)
            else:
                class_label = second_dict[key]
    return class_label


def trans(input, threshold):
    # binarize a test sample with the thresholds learned on the training set
    output = [0 for i in range(len(input) - 1)]
    for i in range(len(input) - 1):
        if input[i] > threshold[i]:
            output[i] = 1
    return output


# the voice dataset has 20 attribute columns
labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
          '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']
data = list(data)
Tree = Create_Tree(data, labels)

# accuracy on the test set
acc = 0
for i in range(len(test)):
    t = trans(test[i], threshold)
    if classify(Tree, labels, t) == test[i][-1]:
        acc += 1
print(acc / len(test))
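For reference, the tree returned by Create_Tree is just a nested dict: each internal node is keyed by a feature label and maps the binarized values 0/1 to either a subtree or a class leaf, and classify simply walks down it. The tree below is made up purely for illustration (the feature choices '6' and '13' are hypothetical, not the tree actually learned from this data) and assumes the classify function and the labels list from the code above are already defined.

# a made-up two-level tree: the root splits on feature '6', the 1-branch on '13'
example_tree = {'6': {0: 'female',
                      1: {'13': {0: 'male', 1: 'female'}}}}
# a binarized test vector of all 1s follows the 1-branch at '6', then the
# 1-branch at '13', and ends at the leaf 'female'
print(classify(example_tree, labels, [1] * 20))  # female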

