1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
import operator


def Split_Data(dataset, axis, value):
    """Filter `dataset` to rows whose feature at index `axis` equals `value`.

    Each returned row has that feature column removed, so recursive
    tree-building never re-splits on the same feature.
    """
    reduced_rows = []
    for row in dataset:
        if row[axis] != value:
            continue
        # Rebuild the row without column `axis`.
        reduced_rows.append(list(row[:axis]) + list(row[axis + 1:]))
    return reduced_rows
def Split_by_entropy(dataset):
    """Pick the best feature to split on by maximising information gain.

    info_gain = H(dataset) - sum_v p(v) * H(subset_v)

    Returns the index of the best feature, or -1 when no split yields a
    positive gain.
    """
    feature_num = len(dataset[0]) - 1
    ent_old = cal_entropy(dataset)
    best_gain = 0.0
    best_feature = -1
    for i in range(feature_num):
        uniVal = set(x[i] for x in dataset)
        ent_new = 0.0
        for value in uniVal:
            sub_set = Split_Data(dataset, i, value)
            prob = len(sub_set) / float(len(dataset))
            # BUG FIX: conditional entropy is the *positive* weighted sum of
            # the subset entropies. The original added `prob * (0 - H(sub))`,
            # which made ent_new negative, so Info_gain was always >= ent_old
            # and the loop selected the feature with the WORST split.
            ent_new += prob * cal_entropy(sub_set)
        Info_gain = ent_old - ent_new
        if Info_gain > best_gain:
            best_gain = Info_gain
            best_feature = i
    return best_feature
def cal_entropy(data):
    """Return the Shannon entropy (base 2) of the class labels in `data`.

    The class label is assumed to be the last element of each row.
    """
    # BUG FIX: the file header only imports `operator`, so `math.log`
    # raised NameError at runtime; import it locally here.
    import math

    entries_num = len(data)
    label_count = {}
    for vec in data:
        cur_label = vec[-1]
        label_count[cur_label] = label_count.get(cur_label, 0) + 1
    entropy = 0.0
    for count in label_count.values():
        prob = count / float(entries_num)
        entropy -= prob * math.log(prob, 2)
    return entropy
def Majority_vote(classList):
    """Majority vote: return the class label occurring most often in classList.

    Used as a fallback when a branch runs out of features but still holds
    mixed classes.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    return ranked[0][0]
def Create_Tree(dataset, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Recursion stops when every row shares one class (pure leaf), or when
    only the class column remains (majority-vote leaf).
    """
    class_values = [row[-1] for row in dataset]
    # Pure node: every sample agrees on the class.
    if class_values.count(class_values[0]) == len(class_values):
        return class_values[0]
    # No features left to split on: fall back to majority vote.
    if len(dataset[0]) == 1:
        return Majority_vote(class_values)

    best_feature = Split_by_entropy(dataset)
    node_label = labels[best_feature]
    remaining_labels = labels[:]
    del remaining_labels[best_feature]

    branches = {}
    for value in set(row[best_feature] for row in dataset):
        subset = Split_Data(dataset, best_feature, value)
        branches[value] = Create_Tree(subset, remaining_labels)
    return {node_label: branches}
def classify(inp_tree, labels, test_vec):
    """Walk the decision tree and return the predicted label for `test_vec`.

    `labels` maps feature names (the tree's node keys) back to indices in
    `test_vec`. Returns 'male' when no branch matches the feature value —
    presumably the dataset's default class; verify against the caller.
    """
    root = list(inp_tree.keys())[0]
    branches = inp_tree[root]
    feature_index = labels.index(root)
    prediction = 'male'  # default when the value has no matching branch
    for branch_value, subtree in branches.items():
        if test_vec[feature_index] != branch_value:
            continue
        if type(subtree).__name__ == 'dict':
            # Internal node: recurse into the matching subtree.
            prediction = classify(subtree, labels, test_vec)
        else:
            # Leaf node: the stored value is the class label.
            prediction = subtree
    return prediction
def trans(input, threshold):
    """Binarise one sample: output[i] = 1 iff input[i] > threshold[i].

    The last element (the class label) is excluded from the output.
    """
    output = [0 for _ in range(len(input) - 1)]
    for i in range(len(input) - 1):
        if input[i] > threshold[i]:
            output[i] = 1
    return output


# --- build the tree once and evaluate accuracy on the test set ----------
# FIX: the original repeated the labels/data/Create_Tree setup verbatim and
# built the identical tree twice; the duplicate block is removed.
# NOTE(review): `data`, `test` and `threshold` must be defined earlier in
# the program (not visible in this chunk) — confirm.
labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
          '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']
data = list(data)
print(type(data))
Tree = Create_Tree(data, labels)

acc = 0
print(len(test))
for i in range(len(test)):
    sample = trans(test[i], threshold)
    if classify(Tree, labels, sample) == test[i][-1]:
        acc += 1
print(acc / len(test))
|