Apriori算法的实现

数据挖掘与机器学习 fireling 5905℃
  1.  
  2. #coding:utf-8
  3.  
  4. class Apriori():
  5. def __init__(self):
  6. pass
  7.  
  8. '''
  9. 关联分析的目标包括两项:发现频繁项集和发现关联规则
  10. '''
  11.  
  12. '''
  13. 频繁项集:{}
  14. 对于包含N种物品的数据集共有2^N-1种项集组合。
  15. 支持度(support)
  16. 一个项集的支持度被定义为数据集中包含该项集的记录所占的比例。
  17. Apriori算法:如果某个项集是频繁的,那么它的所有子集也是频繁的。
  18. 如果一个项集是非频繁集,那么它的所有超集也是非频繁集。
  19. '''
  20.  
  21. def _createC1(self, dataSet):
  22. C1 = []
  23. for transaction in dataSet:
  24. for item in transaction:
  25. if [item] not in C1:
  26. C1.append([item])
  27. C1.sort()
  28. return map(frozenset, C1) # use frozen set so we can use it as a key in a dict
  29.  
  30. def _scanD(self, D, Ck, minSupport=0.5):
  31. ssCnt = {}
  32. for tid in D:
  33. for can in Ck:
  34. if can.issubset(tid):
  35. if can in ssCnt:
  36. ssCnt[can] += 1
  37. else:
  38. ssCnt[can] = 1
  39. # if can not in ssCnt:
  40. # ssCnt[can] = 0
  41. # ssCnt[can] += 1
  42. # print ssCnt
  43. numItems = len(D)
  44. retList = []
  45. supportK = {}
  46. for key in ssCnt:
  47. support = ssCnt[key]/float(numItems) # 计算支持度
  48. if support >= minSupport:
  49. retList.append(key)
  50. supportK[key] = support
  51. return retList, supportK
  52.  
  53. def aprioriGen(self, Lk, k): # k>=2
  54. retList = []
  55. lenLk = len(Lk)
  56. for i in range(lenLk):
  57. for j in range(i+1, lenLk):
  58. L1 = list(Lk[i])[:k-2]
  59. L2 = list(Lk[j])[:k-2]
  60. L1.sort()
  61. L2.sort()
  62. if L1 == L2: # if first k-2 elements are equal. when k is 3, {0,1},{0,2},{1,2}→{0,1}U{0,2}→{0,1,2}
  63. retList.append(Lk[i] | Lk[j])
  64. return retList
  65.  
  66. def apriori(self, dataSet, minSupport=0.5): # minSupport 最小支持度
  67. D = map(set, dataSet) # 转换为集合set
  68. C1 = self._createC1(dataSet) # 创建C1,转换为集合frozenset
  69. L1, supp1 = self._scanD(D, C1, minSupport) # 基于C1和minSupport创建L1
  70. L = []
  71. supportData = {}
  72. L.append(L1)
  73. supportData.update(supp1)
  74. k = 2
  75. while len(L[k-2]) > 1:
  76. Ck = self.aprioriGen(L[k-2], k) # 创建Ck
  77. Lk, suppK = self._scanD(D, Ck, minSupport) # 基于Ck和minSupport创建Lk
  78. L.append(Lk)
  79. supportData.update(suppK)
  80. k += 1
  81. return L, supportData
  82.  
  83. '''
  84. 关联规则:→
  85. 可信度(confidence):也称置信度
  86. 可信度(尿布→葡萄酒) = 支持度({尿布,葡萄酒})/支持度({尿布})
  87. 如果某条规则并不满足最小可信度要求,那么该规则的所有子集也不会满足最小可信度要求。
  88. '''
  89.  
  90. def _calcConf(self, freqSet, H, supportData, brl, minConf=0.7): # H为出现在右部的规则列表,如{0},{1}
  91. prunedH = []
  92. for conseq in H:
  93. conf = supportData[freqSet]/supportData[freqSet-conseq] # 计算可信度
  94. if conf >= minConf:
  95. print freqSet-conseq, '-->', conseq, 'conf:', conf
  96. brl.append((freqSet-conseq, conseq, conf))
  97. prunedH.append(conseq)
  98. return prunedH
  99.  
  100. def _rulesFromConseq(self, freqSet, H, supportData, brl, minConf=0.7): # H为出现在右部的规则列表,如{0},{1}
  101. m = len(H[0])
  102. if len(freqSet) > (m+1):
  103. Hmp1 = self.aprioriGen(H, m+1) # 合并规则
  104. Hmp = self._calcConf(freqSet, Hmp1, supportData, brl, minConf) # Hmp为出现在右部的合并规则列表,如{0,1}
  105. if len(Hmp) > 1: # 如果规则列表长度大于1,则进一步合并
  106. self._rulesFromConseq(freqSet, Hmp, supportData, brl, minConf)
  107.  
  108. def generateRules(self, L, supportData, minConf=0.7): # minConf 最小可信度
  109. bigRuleList = []
  110. for i in range(1, len(L)): # 从包含两个或者更多元素的项集开始规则构建过程
  111. for freqSet in L[i]:
  112. H1 = [frozenset([item]) for item in freqSet] # 构建只包含单个元素的列表,即出现在规则右部的规则列表,如{0},{1}
  113. if i > 1:
  114. self._rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) # 生成候选规则
  115. else:
  116. self._calcConf(freqSet, H1, supportData, bigRuleList, minConf) # 对规则进行评估
  117. return bigRuleList
  118.  
  119. def loadDataSet():
  120. return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
  121.  
  122. if __name__ == '__main__':
  123. dataSet = loadDataSet()
  124. ap = Apriori()
  125. L, suppData = ap.apriori(dataSet, minSupport=0.5)
  126. print L
  127. print suppData
  128. rules = ap.generateRules(L, suppData, minConf=0.6)
  129. print rules
  130.  

转载请注明:宁哥的小站 » Apriori算法的实现

喜欢 (2)