import math
import random


class Instance:
    """One labeled DNA sequence.

    Attributes:
        label:   1 for a '+' example, 0 for a '-' example.
        fts:     list of ints encoding the sequence chars: a->0, t->1, c->2, g->3.
                 e.g. fts[3] == 2 means the 3rd feature is 'c'.
        uniqeId: unique integer id assigned by the loader.
    """

    FTSVALUERANGE = 4  # every feature takes one of 4 values (a/t/c/g)

    _CHAR_TO_CODE = {'a': 0, 't': 1, 'c': 2, 'g': 3}

    def __init__(self, line, id):
        # line looks like "atcg... +" (sequence, whitespace, class sign)
        temp = line.split()
        self.uniqeId = id
        self.label = 1 if temp[1] == '+' else 0
        # Any character other than a/t/c maps to 3, matching the original
        # if/elif/else fallback (where the final `else` covered 'g').
        self.fts = [self._CHAR_TO_CODE.get(c, 3) for c in temp[0]]

    def selfprint(self):
        print(self.label, self.fts, '\n')


class Stump:
    """One-level decision tree (decision stump).

    Attributes (set by learn):
        root:      index of the feature tested at the root node.
        leaflabel: leaflabel[v] is the predicted class (1 means '+') for
                   instances whose root feature has value v.
        default:   majority class of the training set; used for leaves that
                   received no training mass.
    """

    def learn(self, trainInstances, weights=None):
        """Fit the stump on trainInstances.

        Picks the feature whose majority-vote leaves give the lowest
        (weighted) training error. `weights` is optional and defaults to
        uniform, so the same learner can be reused as the weak learner
        inside AdaBoost (kept flexible for Part B as the comments ask).
        """
        n = len(trainInstances)
        if weights is None:
            weights = [1.0 / n] * n
        numfts = len(trainInstances[0].fts)

        total = sum(weights)
        pos_total = sum(w for w, ins in zip(weights, trainInstances)
                        if ins.label == 1)
        # Majority class overall; ties break toward '+'.
        self.default = 1 if 2 * pos_total >= total else 0

        best_err = None
        for f in range(numfts):
            # Weighted counts per feature value: all mass and positive mass.
            tot = [0.0] * Instance.FTSVALUERANGE
            pos = [0.0] * Instance.FTSVALUERANGE
            for w, ins in zip(weights, trainInstances):
                v = ins.fts[f]
                tot[v] += w
                if ins.label == 1:
                    pos[v] += w
            leaves = []
            err = 0.0
            for v in range(Instance.FTSVALUERANGE):
                if tot[v] == 0.0:
                    # No training mass reached this leaf: fall back to the
                    # global majority so classify() never sees None.
                    leaves.append(self.default)
                elif 2 * pos[v] >= tot[v]:
                    leaves.append(1)
                    err += tot[v] - pos[v]  # negatives misclassified
                else:
                    leaves.append(0)
                    err += pos[v]           # positives misclassified
            if best_err is None or err < best_err:
                best_err = err
                self.root = f
                self.leaflabel = leaves
        return self

    def classify(self, testInstances):
        """Apply the stump; returns one 0/1 prediction per instance.

        The returned list always has the same length as testInstances.
        """
        return [self.leaflabel[ins.fts[self.root]] for ins in testInstances]


def sampleByUniform(instances, L):
    """Resample L instances uniformly with replacement.

    Every instance of `instances`, including head and tail, has equal
    odds of being chosen on each draw; the result has length L.
    """
    return [random.choice(instances) for _ in range(L)]


def sampleByWeight(instances, p, L):
    """Resample L instances with replacement following distribution p.

    Callers must ensure sum(p) == 1 (p is a probability distribution
    over `instances`, aligned by index).
    """
    return random.choices(instances, weights=p, k=L)


def baggingClassify(trainInstances, testInstances, L, M):
    """Bagging classifier built from M decision stumps.

    (1) draw M bootstrap training sets of L uniform samples each,
    (2) train one stump per set, (3) majority-vote the M predictions
    for every test instance (ties break toward class 1).
    """
    votes = [0] * len(testInstances)
    for _ in range(M):
        stump = Stump()
        stump.learn(sampleByUniform(trainInstances, L))
        for i, pred in enumerate(stump.classify(testInstances)):
            votes[i] += pred
    return [1 if 2 * v >= M else 0 for v in votes]


def computeAccuracy(predictions, testInstances):
    """Fraction of predictions matching the true labels.

    Returns 0 when the lengths disagree (signals a broken classifier).
    """
    if len(predictions) != len(testInstances):
        return 0
    right = sum(1 for p, ins in zip(predictions, testInstances)
                if p == ins.label)
    return right * 1.0 / len(predictions)


def adaboostClassify(trainInstances, testInstances, L, K):
    """AdaBoost (by resampling) with up to K stump rounds.

    Each round samples L training instances from the current weight
    distribution, fits a stump, measures its weighted error on the full
    training set, and reweights: misclassified instances gain weight,
    correct ones lose it. The final prediction is the sign of the
    alpha-weighted vote of all kept stumps. The base learner is isolated
    behind Stump(), so Part B can swap in ID3 with the same interface.
    """
    n = len(trainInstances)
    w = [1.0 / n] * n
    eps = 1e-10  # guards log() when a stump is perfect (err == 0)
    ensemble = []  # list of (alpha, stump)

    for _ in range(K):
        stump = Stump()
        stump.learn(sampleByWeight(trainInstances, w, L))
        train_preds = stump.classify(trainInstances)
        err = sum(wi for wi, ins, p in zip(w, trainInstances, train_preds)
                  if p != ins.label)
        if err > 0.5:
            # Worse than chance: discard this round and restart the
            # distribution from uniform, per the standard resampling variant.
            w = [1.0 / n] * n
            continue
        err = min(max(err, eps), 1.0 - eps)
        alpha = 0.5 * math.log((1.0 - err) / err)
        ensemble.append((alpha, stump))
        w = [wi * math.exp(-alpha if p == ins.label else alpha)
             for wi, ins, p in zip(w, trainInstances, train_preds)]
        z = sum(w)
        w = [wi / z for wi in w]

    predictions = []
    for ins in testInstances:
        # Map 0/1 labels to -1/+1 so the weighted vote is a signed score.
        score = sum(a * (1 if s.classify([ins])[0] == 1 else -1)
                    for a, s in ensemble)
        predictions.append(1 if score >= 0 else 0)
    return predictions


def load(trainfile, testfile, trainInstances, testInstances):
    """Read the two data files, appending Instance objects in place.

    Ids are unique across both lists. Files are closed via `with`
    (the original rebound `f` without closing the first handle).
    """
    uniqId = 0
    for fname, dest in ((trainfile, trainInstances), (testfile, testInstances)):
        with open(fname) as f:
            for line in f:
                if line.strip():  # skip blank lines defensively
                    dest.append(Instance(line, uniqId))
                    uniqId += 1


def main():
    trainInstances = []
    testInstances = []
    load('training.txt', 'test.txt', trainInstances, testInstances)
    print('Number of Train/test instances',
          len(trainInstances), len(testInstances))

    s = Stump()
    s.learn(trainInstances)
    print('Stump accuracy on training',
          computeAccuracy(s.classify(trainInstances), trainInstances))
    print('Stump accuracy on testing',
          computeAccuracy(s.classify(testInstances), testInstances))

    print('Bagging accuracy on training',
          computeAccuracy(baggingClassify(trainInstances, trainInstances, 71, 100),
                          trainInstances))
    print('Bagging accuracy on testing',
          computeAccuracy(baggingClassify(trainInstances, testInstances, 71, 100),
                          testInstances))

    print('Adaboost accuracy on training',
          computeAccuracy(adaboostClassify(trainInstances, trainInstances, 71, 100),
                          trainInstances))
    print('Adaboost accuracy on testing',
          computeAccuracy(adaboostClassify(trainInstances, testInstances, 71, 100),
                          testInstances))


# Usage: put training.txt & test.txt in the same directory, then:
#   python ensemble.py
if __name__ == '__main__':
    main()