# coding=utf-8
"""Greedy forward feature selection for a paper-helpfulness classifier.

Loads a whitespace-delimited feature matrix, evaluates a GradientBoosting
baseline on the "selected" feature columns (8..second-to-last; last column is
the label), then:

1. cumulatively appends each of the 8 candidate columns (0..7) and reports
   precision / recall / F1 after each addition, and
2. runs a sequential-forward-selection loop that repeatedly adds the single
   candidate column giving the best held-out F1, stopping when no candidate
   improves the score (or the candidate pool is exhausted).
"""
import numpy as np
from random import shuffle
import sklearn
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed; model_selection is its replacement.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import nltk
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score

# --- inter-annotator kappa sanity check (kept from the original, disabled) ---
# with open('d:/pymeng/helpful_paper/final_code/kappa_test.txt') as f:
#     kappa_all = np.loadtxt(f.readlines())
# tag1 = kappa_all[:, 0]
# tag2 = kappa_all[:, 1]
# print('kappa value is %f' % cohen_kappa_score(tag1, tag2))


def _make_gbdt():
    """Fresh GBDT with the fixed hyper-parameters used throughout the search.

    NOTE: the original used min_samples_split=1, which current scikit-learn
    rejects (must be >= 2).  A node with a single sample cannot be split
    anyway, so 2 is behaviorally equivalent.
    """
    return GradientBoostingClassifier(n_estimators=23, max_features=None,
                                      min_samples_split=2, random_state=0)


def _evaluate(features, labels):
    """Train/test split, fit a GBDT, and return (precision, recall, f1).

    Uses the same fixed split (test_size=0.20, random_state=531) everywhere so
    that scores are comparable across candidate feature sets.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.20, random_state=531)
    gbdt = _make_gbdt()
    gbdt.fit(X_train, y_train)
    pred = gbdt.predict(X_test)  # predict once, reuse for all three metrics
    return (precision_score(y_test, pred),
            recall_score(y_test, pred),
            f1_score(y_test, pred))


with open('d:/pymeng/helpful_paper/final_code/all_feature_value.txt') as f:
    feature_all = np.loadtxt(f.readlines())

f_selected_X = feature_all[:, 8:-1]  # initial training set: columns 8..second-to-last
f_selected_y = feature_all[:, -1]    # label column (unchanged throughout)

# --- baseline on the initially selected features -----------------------------
# (the original labels precision_score "accuracy"; message kept as-is)
accuracy, recall, f_measure = _evaluate(f_selected_X, f_selected_y)
print("base accuracy is %f" % accuracy)
print("base recall rate is %f" % recall)
print("base F-measure is %f" % f_measure)

# --- cumulatively append every candidate column, reporting after each --------
f_selected_base = f_selected_X
f_candidate_all = feature_all[:, :8]
for i in range(8):
    f_selected_i = np.column_stack([f_selected_base, f_candidate_all[:, i]])
    accuracy, recall, f_measure = _evaluate(f_selected_i, f_selected_y)
    print("%d's accuracy is %f" % (i, accuracy))
    print(" recall rate is %f" % recall)
    print("F-measure is %f" % f_measure)
    f_selected_base = f_selected_i
    # (original had a redundant "if i == 7: break" here; range(8) ends anyway)

# --- sequential forward selection driven by held-out F1 ----------------------
f_candidate_all = feature_all[:, :8]
a, b = f_candidate_all.shape
f_selected_X = feature_all[:, 8:-1]  # reset to the initial selected set
f_selected_y = feature_all[:, -1]
_, _, E_selected = _evaluate(f_selected_X, f_selected_y)
print(a, b)

while True:
    # Score every remaining candidate appended to the current selection.
    E_can_all = {}
    for i in range(b):
        fi = f_candidate_all[:, i]
        _, _, E_selected_i = _evaluate(np.column_stack([f_selected_X, fi]),
                                       f_selected_y)
        print("E_selected_%d is %f" % (i, E_selected_i))
        E_can_all[i] = E_selected_i

    # BUG FIX: the original did E_can_all.values().index(E_all_max), which
    # returns a position in an arbitrarily ordered value list, not the
    # candidate column index.  Pick the best-scoring key directly instead.
    E_all_max_index = max(E_can_all, key=E_can_all.get)
    E_all_max = E_can_all[E_all_max_index]
    print("E_all_max_index %d " % E_all_max_index)

    if E_all_max < E_selected:
        # NOTE(review): the source was truncated mid-condition here
        # ("if b==0 or E_all_max"); the obvious SFS intent is reconstructed:
        # stop when no candidate improves the held-out F1.
        break

    # Move the winning candidate into the selected set and drop it from pool.
    fmax = f_candidate_all[:, E_all_max_index]
    f_selected_X = np.column_stack([f_selected_X, fmax])
    E_selected = E_all_max
    f_candidate_all = np.delete(f_candidate_all, E_all_max_index, 1)
    a, b = f_candidate_all.shape
    print("numpy column is %d" % b)
    print("repeat once again")
    if b == 0:
        break  # candidate pool exhausted — nothing left to add