Implementing Naive Bayes and Logistic Regression

Data Mining and Machine Learning · fireling
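This post walks through from-scratch NumPy implementations of two classic text classifiers: a binary Naive Bayes classifier over set-of-words features, and logistic regression trained with full-batch gradient updates. For reference, the decision rule that the NaiveBayes class below encodes is the standard add-one (Laplace) smoothed form, which is what the np.ones(...) and 2.0 initializations in fit implement:

    \hat{c} = \arg\max_{c \in \{0,1\}} \Big( \log p(c) + \sum_i x_i \log p(w_i \mid c) \Big),
    \qquad
    p(w_i \mid c) = \frac{N_{ic} + 1}{N_c + 2}

where x_i ∈ {0,1} marks whether word w_i occurs in the document, N_{ic} is the number of training occurrences of w_i in class c, and N_c is the total word count in class c. Working in log space turns a product of many small probabilities into a sum, which is exactly what predict computes with sum(vec * p0Vec) + np.log(p0).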
#coding:utf-8
import numpy as np

class NaiveBayes:
    def __init__(self):
        pass

    def createVocabList(self, train_x):
        # Build the vocabulary as the union of all words in the training docs
        vocabSet = set([])
        for wordList in train_x:
            vocabSet = vocabSet | set(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        # Convert a document (word list) into a 0/1 vector over the vocabulary
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            if word in vocabList:
                wordsVec[vocabList.index(word)] = 1   # set-of-words model
                # wordsVec[vocabList.index(word)] += 1  # bag-of-words model
            # else:
            #     print("the word: %s is not in my vocabulary!" % word)
        return wordsVec

    def fit(self, train_x, train_y):
        vocabList = self.createVocabList(train_x)
        trainMat = []
        for wordList in train_x:
            trainMat.append(self.listOfWords2Vec(vocabList, wordList))
        trainMatrix = np.array(trainMat)
        trainLabel = np.array(train_y)
        numTrainDocs = len(trainMatrix)  # number of training documents
        numWords = len(trainMatrix[0])   # number of features; in principle, the vocabulary size
        ## Estimate the class priors p(c0) and p(c1)
        p1 = sum(trainLabel) / float(numTrainDocs)  # p(c1)
        p0 = 1 - p1                                 # p(c0)
        ## Estimate the conditionals p(wi|c0) and p(wi|c1)
        p0Num = np.ones(numWords)  # start counts at 1 (Laplace smoothing) so no conditional probability is zero
        p1Num = np.ones(numWords)
        p0InAll = 2.0  # there are only two classes, so the smoothed denominator starts at 2.0
        p1InAll = 2.0
        for i in range(numTrainDocs):
            if trainLabel[i] == 1:
                p1Num += trainMatrix[i]
                p1InAll += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0InAll += sum(trainMatrix[i])
        p0Vec = np.log(p0Num / p0InAll)  # log p(wi|c0)
        p1Vec = np.log(p1Num / p1InAll)  # log p(wi|c1)
        ## Pack the parameters
        param = p0, p1, p0Vec, p1Vec
        return vocabList, param

    def predict(self, test_X, vocabList, param):
        p0, p1, p0Vec, p1Vec = param
        testMat = []
        for wordList in test_X:
            testMat.append(self.listOfWords2Vec(vocabList, wordList))
        testMatrix = np.array(testMat)
        predict_y = []
        for vec in testMatrix:
            # corresponds to p(w1|c)*p(w2|c)*...*p(c); log(a*b) = log(a) + log(b)
            prob_y0 = sum(vec * p0Vec) + np.log(p0)
            prob_y1 = sum(vec * p1Vec) + np.log(p1)
            if prob_y0 < prob_y1:  # binary 0/1 decision; NaiveBayes extends naturally to multi-class
                predict_y.append(1)
            else:
                predict_y.append(0)
        predictLabel = np.array(predict_y)
        return predictLabel

    def predict1(self, test_X, test_y, vocabList, param):
        # Same as predict, but also reports accuracy against the given labels
        p0, p1, p0Vec, p1Vec = param
        testMat = []
        for wordList in test_X:
            testMat.append(self.listOfWords2Vec(vocabList, wordList))
        testMatrix = np.array(testMat)
        m = testMatrix.shape[0]
        predict_y = []
        for vec in testMatrix:
            prob_y0 = sum(vec * p0Vec) + np.log(p0)
            prob_y1 = sum(vec * p1Vec) + np.log(p1)
            if prob_y0 < prob_y1:
                predict_y.append(1)
            else:
                predict_y.append(0)
        testLabel = np.array(test_y)
        predictLabel = np.array(predict_y)
        print('accuracy:', sum(testLabel == predictLabel) / float(m))
        return predictLabel

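# For reference before the next class: LogisticRegression models
#     p(y = 1 | x) = sigmoid(w·x) = 1 / (1 + exp(-w·x)),
# and the training update in fit,
#     w <- w + alpha * X^T (y - sigmoid(X w)),
# is a full-batch gradient step on the log-likelihood. Note the "+=" in the
# code: the gradient of the log-likelihood is X^T (y - h), so adding it
# ascends the likelihood (equivalently, descends the cross-entropy loss),
# even though such loops are often loosely labeled "gradient descent".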
class LogisticRegression:  # binary 0/1 classification
    def __init__(self):
        pass

    def createVocabList(self, train_x):
        vocabSet = set([])
        for wordList in train_x:
            vocabSet = vocabSet | set(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            if word in vocabList:
                wordsVec[vocabList.index(word)] = 1   # set-of-words model
                # wordsVec[vocabList.index(word)] += 1  # bag-of-words model
            # else:
            #     print("the word: %s is not in my vocabulary!" % word)
        return wordsVec

    def sigmoid(self, inX):
        return 1.0 / (1 + np.exp(-inX))

    # Train with gradient updates; alpha is the step size (learning rate), maxCycles the iteration cap
    def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
        vocabList = self.createVocabList(train_x)
        trainMat = []
        for wordList in train_x:
            trainMat.append(self.listOfWords2Vec(vocabList, wordList))
        trainMatrix = np.matrix(trainMat)   # np.matrix is 2-D  # size: m*n
        trainLabel = np.matrix(train_y).T   # size: m*1
        m, n = trainMatrix.shape
        weigh = np.matrix(np.ones((n, 1)))  # size: n*1
        for i in range(maxCycles):
            hx = self.sigmoid(trainMatrix * weigh)  # size: m*1; sigmoid squashes the linear score into [0,1], read as a probability
            error = trainLabel - hx                 # size: m*1
            weigh += alpha * trainMatrix.T * error  # size: n*1; gradient step on the log-likelihood
        return vocabList, weigh

    # Classify with the learned parameters
    def predict(self, test_X, vocabList, weigh):
        testMat = []
        for wordList in test_X:
            testMat.append(self.listOfWords2Vec(vocabList, wordList))
        testMatrix = np.matrix(testMat)
        m = testMatrix.shape[0]
        hx = self.sigmoid(testMatrix * weigh)  # size: m*1
        predict_y = []
        for i in range(m):  # 0/1 decision at the 0.5 threshold
            if hx[i, 0] > 0.5:
                predict_y.append(1)
            else:
                predict_y.append(0)
        predictLabel = np.array(predict_y)
        # predictLabel = np.matrix(predict_y).T  # matrix form, if preferred
        return predictLabel

    # Classify with the learned parameters and report accuracy
    def predict1(self, test_X, test_y, vocabList, weigh):
        testMat = []
        for wordList in test_X:
            testMat.append(self.listOfWords2Vec(vocabList, wordList))
        testMatrix = np.matrix(testMat)
        m = testMatrix.shape[0]
        hx = self.sigmoid(testMatrix * weigh)  # size: m*1
        predict_y = []
        for i in range(m):
            if hx[i, 0] > 0.5:
                predict_y.append(1)
            else:
                predict_y.append(0)
        testLabel = np.array(test_y)
        predictLabel = np.array(predict_y)
        print('accuracy:', sum(testLabel == predictLabel) / float(m))
        return predictLabel

def loadTrainDataSet():
    train_x = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
               ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
               ['my', 'dalmation', 'is', 'so', 'cute', 'and', 'I', 'love', 'him'],
               ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
               ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
               ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    train_y = [0, 1, 0, 1, 0, 1]  # 0: good; 1: bad
    return train_x, train_y

def loadTestDataSet():
    test_X = [['love', 'my', 'girl', 'friend'],
              ['stupid', 'garbage'],
              ['Haha', 'I', 'really', 'Love', 'You'],
              ['This', 'is', 'my', 'dog']]
    test_y = [0, 1, 0, 0]  # 0: good; 1: bad
    return test_X, test_y

if __name__ == '__main__':
    train_X, train_y = loadTrainDataSet()
    test_X, test_y = loadTestDataSet()
    clf = NaiveBayes()
    vocabList, param = clf.fit(train_X, train_y)
    results = clf.predict(test_X, vocabList, param)
    print(results)
    results1 = clf.predict1(test_X, test_y, vocabList, param)
    print(results1)
    clf = LogisticRegression()
    vocabList, weigh = clf.fit(train_X, train_y)
    results = clf.predict(test_X, vocabList, weigh)
    print(results)
    results1 = clf.predict1(test_X, test_y, vocabList, weigh)
    print(results1)
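As a cross-check, the same data can be run through scikit-learn, whose MultinomialNB and LogisticRegression implement the same two models. A minimal sketch, assuming scikit-learn is installed (it is not used in the original code above):

# Cross-check with scikit-learn (assumption: scikit-learn is installed;
# this block is not part of the original post).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

train_X, train_y = loadTrainDataSet()
test_X, test_y = loadTestDataSet()

# CountVectorizer expects raw strings, so join the token lists back together;
# tokenizer=str.split keeps single-character tokens like 'I' that the
# default token pattern would drop.
vec = CountVectorizer(tokenizer=str.split, token_pattern=None, lowercase=False)
Xtr = vec.fit_transform([' '.join(doc) for doc in train_X])
Xte = vec.transform([' '.join(doc) for doc in test_X])

for clf in (MultinomialNB(), SkLogisticRegression()):
    clf.fit(Xtr, train_y)
    print(type(clf).__name__, clf.predict(Xte))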

Please credit when reposting: 宁哥的小站 » Implementing Naive Bayes and Logistic Regression
