# 北京皮肤病医院电话 http://pf.39.net/bdfyy/bjzkbdfyy/
from math import log


# Entropy
def calc_ent(datasets):
    """Return the base-2 Shannon entropy of the class labels in *datasets*.

    Args:
        datasets: Sequence of rows; the last element of each row is taken
            as that row's class label.

    Returns:
        The entropy in bits. An empty sequence yields 0.
    """
    data_length = len(datasets)
    label_count = {}
    for row in datasets:
        label = row[-1]
        label_count[label] = label_count.get(label, 0) + 1
    return -sum(
        (count / data_length) * log(count / data_length, 2)
        for count in label_count.values()
    )


# Alternative NumPy formulation kept from the original notes (not used):
# def entropy(y):
#     """Entropy of a label sequence."""
#     hist = np.bincount(y)
#     ps = hist / np.sum(hist)
#     return -np.sum([p * np.log2(p) for p in ps if p > 0])


# Empirical conditional entropy
def cond_ent(datasets, axis=0):
    """Return the empirical conditional entropy of the labels given a feature.

    Rows are grouped by the value found at position *axis*; the result is
    the size-weighted average of ``calc_ent`` over those groups.

    Args:
        datasets: Sequence of rows whose last element is the class label.
        axis: Position of the feature within each row.

    Returns:
        The conditional entropy in bits. An empty sequence yields 0.
    """
    data_length = len(datasets)
    feature_sets = {}
    for row in datasets:
        feature_sets.setdefault(row[axis], []).append(row)
    return sum(
        (len(subset) / data_length) * calc_ent(subset)
        for subset in feature_sets.values()
    )


# Information gain
def info_gain(ent, cond_ent):
    """Return the information gain: entropy minus conditional entropy."""
    return ent - cond_ent


def info_gain_train(datasets, labels=None):
    """Print the information gain of every feature and report the best one.

    Args:
        datasets: Non-empty sequence of rows; every element but the last is
            a feature value and the last element is the class label.
        labels: Optional sequence of feature names used in the messages.
            When omitted, a module-level ``labels`` variable is used if one
            exists (the behaviour this function originally relied on);
            otherwise the feature's position is shown instead.

    Returns:
        A message naming the feature with the largest information gain.
        Ties are resolved in favour of the earliest feature.
    """
    if labels is None:
        # Backward compatibility: the name used to be read from the module.
        labels = globals().get("labels")

    feature_count = len(datasets[0]) - 1
    ent = calc_ent(datasets)
    # ent = entropy(datasets)
    best_feature = []
    for c in range(feature_count):
        c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))
        best_feature.append((c, c_info_gain))
        name = labels[c] if labels is not None else c
        print('特征({})-info_gain-{:.3f}'.format(name, c_info_gain))

    # Pick the feature with the largest gain.
    best_ = max(best_feature, key=lambda x: x[-1])
    best_name = labels[best_[0]] if labels is not None else best_[0]
    return '特征({})的信息增益最大,选择为根节点特征'.format(best_name)
# Names used by this section; imported here so the section runs on its own.
from math import log

import numpy as np
import pandas as pd


# Tree node class. The original note calls this a binary tree, but a node
# holds one child per distinct value of the feature it splits on.
class Node:
    """A node of the decision tree.

    Args:
        root: When True the node is a single-node tree and ``predict``
            returns ``label`` directly; when False the node branches on
            ``feature``.
        label: Class label returned by a single-node tree.
        feature_name: Name of the feature this node splits on.
        feature: Position of that feature in the feature vector passed to
            ``predict``.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        # Maps a feature value to the child Node handling that value.
        self.tree = {}
        # Shares the ``tree`` dict, so children added later show up in repr.
        self.result = {
            'label:': self.label,
            'feature': self.feature,
            'tree': self.tree,
        }

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach *node* as the child reached when the feature equals *val*."""
        self.tree[val] = node

    def predict(self, features):
        """Return the predicted label for the feature vector *features*.

        Raises:
            KeyError: If the feature value was not seen when the tree was
                built.
        """
        if self.root is True:
            return self.label
        return self.tree[features[self.feature]].predict(features)


class DTree:
    """ID3 decision tree driven by information gain.

    Args:
        epsilon: Minimum information gain required to split a node.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        self._tree = {}

    # Entropy
    @staticmethod
    def calc_ent(datasets):
        """Return the base-2 entropy of the labels (last element of each row)."""
        data_length = len(datasets)
        label_count = {}
        for row in datasets:
            label = row[-1]
            label_count[label] = label_count.get(label, 0) + 1
        return -sum(
            (count / data_length) * log(count / data_length, 2)
            for count in label_count.values()
        )

    # Empirical conditional entropy
    def cond_ent(self, datasets, axis=0):
        """Return the conditional entropy of the labels given column *axis*."""
        data_length = len(datasets)
        feature_sets = {}
        for row in datasets:
            feature_sets.setdefault(row[axis], []).append(row)
        return sum(
            (len(subset) / data_length) * self.calc_ent(subset)
            for subset in feature_sets.values()
        )

    # Information gain
    @staticmethod
    def info_gain(ent, cond_ent):
        """Return the information gain: entropy minus conditional entropy."""
        return ent - cond_ent

    def info_gain_train(self, datasets):
        """Return ``(column_index, gain)`` for the feature with the largest gain.

        *datasets* is a non-empty sequence of rows whose last element is the
        label. Ties are resolved in favour of the earliest column.
        """
        count = len(datasets[0]) - 1
        ent = self.calc_ent(datasets)
        best_feature = [
            (c, self.info_gain(ent, self.cond_ent(datasets, axis=c)))
            for c in range(count)
        ]
        # Pick the feature with the largest gain.
        return max(best_feature, key=lambda x: x[-1])

    @staticmethod
    def _majority_label(y_train):
        """Return the most frequent value of the label Series *y_train*."""
        return y_train.value_counts().sort_values(ascending=False).index[0]

    def train(self, train_data, feature_index=None):
        """Recursively build the decision tree.

        Args:
            train_data: DataFrame whose last column is the class label and
                whose other columns are the features.
            feature_index: Mapping from feature name to its position in the
                full feature vector. Callers normally omit it; it is filled
                in on the first call and passed down the recursion so that
                every node records a position valid for ``predict``, even
                after columns have been dropped from the sub-frames.

        Returns:
            The root ``Node`` of the (sub)tree.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]
        if feature_index is None:
            feature_index = {name: pos for pos, name in enumerate(features)}

        # 1. If every instance belongs to the same class Ck, the tree is a
        #    single node labelled Ck.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # 2. If no features are left, the tree is a single node labelled
        #    with the most frequent class.
        if len(features) == 0:
            return Node(root=True, label=self._majority_label(y_train))

        # 3. Compute the information gains; Ag is the feature with the
        #    largest gain.
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]

        # 4. If the gain of Ag is below the threshold, the tree is a single
        #    node labelled with the most frequent class.
        if max_info_gain < self.epsilon:
            return Node(root=True, label=self._majority_label(y_train))

        # 5. Split the data on every value of Ag.
        node_tree = Node(
            root=False,
            feature_name=max_feature_name,
            # Position in the original feature vector, not in this sub-frame.
            feature=feature_index[max_feature_name],
        )
        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[
                train_data[max_feature_name] == f
            ].drop([max_feature_name], axis=1)

            # 6. Recursively build the subtree.
            sub_tree = self.train(sub_train_df, feature_index)
            node_tree.add_node(f, sub_tree)

        # pprint.pprint(node_tree.tree)
        return node_tree

    def fit(self, train_data):
        """Build the tree from *train_data*, store it and return its root."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        """Return the predicted label for one feature vector *X_test*."""
        return self._tree.predict(X_test)


# Exercise 5.1 from the book
def create_data():
    """Return the loan-application example rows and their column names."""
    datasets = [
        ['青年', '否', '否', '一般', '否'],
        ['青年', '否', '否', '好', '否'],
        ['青年', '是', '否', '好', '是'],
        ['青年', '是', '是', '一般', '是'],
        ['青年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '好', '否'],
        ['中年', '是', '是', '好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '好', '是'],
        ['老年', '是', '否', '好', '是'],
        ['老年', '是', '否', '非常好', '是'],
        ['老年', '否', '否', '一般', '否'],
    ]
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    # Return the dataset and the name of each dimension.
    return datasets, labels


datasets, labels = create_data()
train_data = pd.DataFrame(datasets, columns=labels)
# Notebook-style display of the frame; has no effect when run as a script.
train_data
# Demo: build the tree for the book's example and classify one sample.
datasets, labels = create_data()
data_df = pd.DataFrame(datasets, columns=labels)
dt = DTree()
tree = dt.fit(data_df)
# Notebook-style display of the fitted tree; recorded output:
# {'label:': None, 'feature': 2, 'tree': {'否': {'label:': None, 'feature': 1, 'tree': {'否': {'label:': '否', 'feature': None, 'tree': {}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}
tree

# Recorded output: '否'
dt.predict(['老年', '否', '否', '一般'])

# Byline from the original post: 江苏吴彦祖