#coding:utf-8
"""Naive Bayes and logistic regression classifiers for tokenized documents.

Both classifiers handle binary labels (0 or 1). Each document is a list of
word tokens and is encoded as a set-of-words vector: one 0/1 entry per
vocabulary word recording whether that word occurs in the document
(occurrence counts are not used).
"""
import numpy as np


def _buildWordIndex(vocabList):
    """Map each vocabulary word to the index of its first occurrence."""
    wordIndex = {}
    for position, word in enumerate(vocabList):
        # setdefault keeps the first position, matching list.index().
        wordIndex.setdefault(word, position)
    return wordIndex


def _checkTrainingSet(trainMatrix, trainLabel):
    """Raise ValueError unless there is exactly one label per document."""
    if trainMatrix.shape[0] != len(trainLabel):
        raise ValueError(
            'train_x has %d documents but train_y has %d labels'
            % (trainMatrix.shape[0], len(trainLabel)))


def _reportAccuracy(test_y, predictLabel):
    """Print the fraction of predictions that equal the true labels."""
    testLabel = np.asarray(test_y)
    total = len(predictLabel)
    if total:
        accuracy = float(np.sum(testLabel == predictLabel)) / total
    else:
        # No test documents: accuracy is undefined rather than a crash.
        accuracy = float('nan')
    # Single-argument call form prints the same text on Python 2 and 3.
    print('accuracy: %s' % accuracy)


class _SetOfWordsModel(object):
    """Vocabulary and vectorization helpers shared by both classifiers."""

    def createVocabList(self, train_x):
        """Return the distinct words of all documents in first-seen order.

        Args:
            train_x: iterable of documents, each an iterable of hashable
                word tokens.

        Returns:
            A list with every distinct word exactly once. The order is
            deterministic (order of first appearance) so the feature
            columns are reproducible from run to run.
        """
        seen = set()
        vocabList = []
        for wordList in train_x:
            for word in wordList:
                if word not in seen:
                    seen.add(word)
                    vocabList.append(word)
        return vocabList

    def listOfWords2Vec(self, vocabList, wordList):
        """Encode one document as a 0/1 list aligned with vocabList.

        Words that are not in the vocabulary are ignored.

        Args:
            vocabList: list of vocabulary words.
            wordList: iterable of word tokens of one document.

        Returns:
            A list of ints with len(vocabList) entries; entry i is 1 when
            vocabList[i] occurs in wordList, otherwise 0.
        """
        wordIndex = _buildWordIndex(vocabList)
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            position = wordIndex.get(word)
            if position is not None:
                wordsVec[position] = 1
        return wordsVec

    def _docsToMatrix(self, vocabList, docs):
        """Encode documents as a float array, one set-of-words row each."""
        docs = list(docs)
        # The index is built once for the whole batch instead of per row.
        wordIndex = _buildWordIndex(vocabList)
        matrix = np.zeros((len(docs), len(vocabList)))
        for row, wordList in enumerate(docs):
            for word in wordList:
                column = wordIndex.get(word)
                if column is not None:
                    matrix[row, column] = 1.0
        return matrix


class NaiveBayes(_SetOfWordsModel):
    """Naive Bayes classifier for labels 0 and 1, computed in log space."""

    def fit(self, train_x, train_y):
        """Estimate class priors and per-word log conditional probabilities.

        Word counts start at 1 and the per-class totals start at 2.0, so
        a word never seen in a class still gets a non-zero probability.

        Args:
            train_x: list of documents (lists of word tokens).
            train_y: labels, one per document; a label equal to 1 is
                class 1 and any other label is treated as class 0.

        Returns:
            (vocabList, param) where param is the tuple
            (p0, p1, p0Vec, p1Vec): the two class priors and the arrays
            of log p(word | class) aligned with vocabList.

        Raises:
            ValueError: if train_x is empty or the number of labels does
                not match the number of documents.
        """
        vocabList = self.createVocabList(train_x)
        trainMatrix = self._docsToMatrix(vocabList, train_x)
        trainLabel = np.asarray(train_y)
        numTrainDocs = trainMatrix.shape[0]
        if numTrainDocs == 0:
            raise ValueError('cannot fit NaiveBayes on an empty training set')
        _checkTrainingSet(trainMatrix, trainLabel)

        # Count class 1 with the same test used for the word counts below,
        # so the prior and the conditionals always agree.
        isClass1 = trainLabel == 1
        p1 = np.sum(isClass1) / float(numTrainDocs)
        p0 = 1 - p1

        class1Docs = trainMatrix[isClass1]
        class0Docs = trainMatrix[~isClass1]
        p1Num = 1.0 + class1Docs.sum(axis=0)
        p0Num = 1.0 + class0Docs.sum(axis=0)
        p1InAll = 2.0 + class1Docs.sum()
        p0InAll = 2.0 + class0Docs.sum()

        p0Vec = np.log(p0Num / p0InAll)
        p1Vec = np.log(p1Num / p1InAll)
        param = p0, p1, p0Vec, p1Vec
        return vocabList, param

    def predict(self, test_X, vocabList, param):
        """Classify documents with parameters returned by fit().

        Args:
            test_X: list of documents (lists of word tokens).
            vocabList: vocabulary returned by fit().
            param: (p0, p1, p0Vec, p1Vec) tuple returned by fit().

        Returns:
            Integer array with one 0/1 label per document; 1 is chosen
            only when the class-1 log score is strictly larger.
        """
        p0, p1, p0Vec, p1Vec = param
        testMatrix = self._docsToMatrix(vocabList, test_X)
        # log(a * b) = log(a) + log(b): add the log conditionals of the
        # words present in each document to the log prior of the class.
        with np.errstate(divide='ignore'):
            # A zero prior legitimately yields a score of -inf.
            score0 = testMatrix.dot(np.asarray(p0Vec)) + np.log(p0)
            score1 = testMatrix.dot(np.asarray(p1Vec)) + np.log(p1)
        return np.where(score0 < score1, 1, 0)

    def predict1(self, test_X, test_y, vocabList, param):
        """Classify like predict() and print the accuracy against test_y.

        Returns:
            The same label array predict() returns.
        """
        predictLabel = self.predict(test_X, vocabList, param)
        _reportAccuracy(test_y, predictLabel)
        return predictLabel


class LogisticRegression(_SetOfWordsModel):
    """Logistic regression classifier for labels 0 and 1 (no bias term)."""

    def sigmoid(self, inX):
        """Return the logistic function 1 / (1 + exp(-inX)) elementwise."""
        return 1.0/(1 + np.exp(-inX))

    def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
        """Learn the weights with a fixed number of batch gradient steps.

        Args:
            train_x: list of documents (lists of word tokens).
            train_y: 0/1 labels, one per document.
            alpha: step size (learning rate).
            maxCycles: number of gradient steps to run.

        Returns:
            (vocabList, weigh) where weigh is an n x 1 np.matrix of
            weights aligned with vocabList; all weights start at 1.

        Raises:
            ValueError: if the number of labels does not match the number
                of documents.
        """
        vocabList = self.createVocabList(train_x)
        trainMatrix = self._docsToMatrix(vocabList, train_x)           # m x n
        trainLabel = np.asarray(train_y, dtype=float).reshape(-1, 1)   # m x 1
        _checkTrainingSet(trainMatrix, trainLabel)

        weights = np.ones((trainMatrix.shape[1], 1))                   # n x 1
        for _ in range(maxCycles):
            hx = self.sigmoid(trainMatrix.dot(weights))   # predicted p(y = 1)
            error = trainLabel - hx
            weights += alpha * trainMatrix.T.dot(error)
        # Returned as np.matrix so existing callers see the same type.
        return vocabList, np.matrix(weights)

    def predict(self, test_X, vocabList, weigh):
        """Classify documents with weights returned by fit().

        Args:
            test_X: list of documents (lists of word tokens).
            vocabList: vocabulary returned by fit().
            weigh: n x 1 weights returned by fit().

        Returns:
            Integer array with one 0/1 label per document; 1 is chosen
            when the predicted probability is strictly above 0.5.
        """
        testMatrix = self._docsToMatrix(vocabList, test_X)
        scores = testMatrix.dot(np.asarray(weigh))
        hx = np.asarray(self.sigmoid(scores)).ravel()
        return np.where(hx > 0.5, 1, 0)

    def predict1(self, test_X, test_y, vocabList, weigh):
        """Classify like predict() and print the accuracy against test_y.

        Returns:
            The same label array predict() returns.
        """
        predictLabel = self.predict(test_X, vocabList, weigh)
        _reportAccuracy(test_y, predictLabel)
        return predictLabel


def loadTrainDataSet():
    """Return the built-in training documents and their 0/1 labels."""
    # NOTE: the token ' and' (with a leading space) is kept exactly as it
    # appears in the original data set.
    train_x = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    train_y = [0, 1, 0, 1, 0, 1]  # 0: good; 1: bad
    return train_x, train_y


def loadTestDataSet():
    """Return the built-in test documents and their 0/1 labels."""
    test_X = [
        ['love', 'my', 'girl', 'friend'],
        ['stupid', 'garbage'],
        ['Haha', 'I', 'really', "Love", "You"],
        ['This', 'is', "my", "dog"],
    ]
    test_y = [0, 1, 0, 0]  # 0: good; 1: bad
    return test_X, test_y


def main():
    """Train both classifiers on the built-in data and print predictions."""
    train_X, train_y = loadTrainDataSet()
    test_X, test_y = loadTestDataSet()

    clf = NaiveBayes()
    vocabList, param = clf.fit(train_X, train_y)
    results = clf.predict(test_X, vocabList, param)
    print(results)
    results1 = clf.predict1(test_X, test_y, vocabList, param)
    print(results1)

    clf = LogisticRegression()
    vocabList, weigh = clf.fit(train_X, train_y)
    results = clf.predict(test_X, vocabList, weigh)
    print(results)
    results1 = clf.predict1(test_X, test_y, vocabList, weigh)
    print(results1)


if __name__ == '__main__':
    main()
# 转载请注明:宁哥的小站 » 朴素贝叶斯与逻辑回归算法实现