def __init__(self, inputfile):
    """Load transactions from a comma-separated file.

    Each non-blank line of ``inputfile`` becomes one transaction (a set of
    item strings); empty entries produced by stray commas are dropped.
    Also builds ``itemSet``, the universe of all items seen.
    """
    self.transactions = []   # list of sets, one per transaction
    self.itemSet = set()     # union of all items across transactions
    # Text mode (not 'rb'): we split on the str ',' — the original binary
    # mode only worked under Python 2.  The context manager guarantees the
    # file is closed even if parsing raises.
    with open(inputfile, 'r') as inf:
        for line in inf:
            elements = set(entry for entry in line.strip().split(',') if entry)
            if elements:
                self.transactions.append(elements)
                self.itemSet.update(elements)
    self.toRetItems = {}          # frozenset(itemset) -> support, filled by runApriori
    self.associationRules = []    # [antecedent, consequent, confidence] triples
def getSupport(self, itemcomb):
    """Return the fraction of transactions containing every item in itemcomb.

    ``itemcomb`` may be a single item or a frozenset of items.
    """
    if not isinstance(itemcomb, frozenset):
        itemcomb = frozenset([itemcomb])
    # Count transactions containing the whole combination.  sum/all replaces
    # the Python-2-only len(filter(...)) and reduce(and_, ...) — neither
    # reduce nor operator.and_ was imported, and filter() has no len() in
    # Python 3.  all() is also correct (vacuously True) for an empty combo.
    count = sum(1 for transaction in self.transactions
                if all(item in transaction for item in itemcomb))
    return float(count) / float(len(self.transactions))
def runApriori(self, minSupport=0.15, minConfidence=0.6):
    """Find all itemsets with support >= minSupport, then derive rules.

    Results go into ``self.toRetItems`` (itemset -> support); rule
    generation is delegated to ``calculateAssociationRules``.
    """
    # Frequent 1-itemsets.  List comprehensions replace filter()/map():
    # in Python 3 the original called .extend() on a filter object, which
    # raises AttributeError.
    itemCombSupports = [(frozenset([item]), self.getSupport(item))
                        for item in self.itemSet]
    itemCombSupports = [pair for pair in itemCombSupports
                        if pair[1] >= minSupport]
    currentLset = set(pair[0] for pair in itemCombSupports)
    k = 2
    while currentLset:
        # Candidate k-itemsets: unions of two frequent (k-1)-itemsets
        # that have exactly k elements.
        currentCset = set(i | j for i in currentLset for j in currentLset
                          if len(i | j) == k)
        currentItemCombSupports = [(itemcomb, self.getSupport(itemcomb))
                                   for itemcomb in currentCset]
        currentItemCombSupports = [pair for pair in currentItemCombSupports
                                   if pair[1] >= minSupport]
        currentLset = set(pair[0] for pair in currentItemCombSupports)
        itemCombSupports.extend(currentItemCombSupports)
        k += 1
    for key, supportVal in itemCombSupports:
        self.toRetItems[key] = supportVal
    self.calculateAssociationRules(minConfidence=minConfidence)
def calculateAssociationRules(self, minConfidence=0.6):
    """Derive association rules with confidence > minConfidence.

    For every frequent itemset and every non-empty proper subset S,
    appends [S, itemset - S, confidence] to ``self.associationRules``,
    where confidence = support(itemset) / support(S).
    """
    for key in self.toRetItems:
        # All non-empty proper subsets of the frequent itemset.
        subsets = [frozenset(item) for k in range(1, len(key))
                   for item in combinations(key, k)]
        for subset in subsets:
            confidence = self.toRetItems[key] / self.toRetItems[subset]
            if confidence > minConfidence:
                self.associationRules.append([subset, key - subset, confidence])

# 用Scala也大概六十多行: ("the Scala version is likewise only about sixty lines:")
import scala.io.Source
import scala.collection.immutable.List
import scala.collection.immutable.Set
import java.io.File
import scala.collection.mutable.Map

/**
 * Apriori frequent-itemset and association-rule mining.
 * Transactions are read from `inputFile`, one comma-separated transaction
 * per line; empty entries and blank lines are skipped.
 */
class AprioriAlgorithm(inputFile: File) {

  var transactions: List[Set[String]] = List()
  var itemSet: Set[String] = Set()

  // Build the transaction list and the universe of items.
  for (line <- Source.fromFile(inputFile).getLines()) {
    // filter(_.nonEmpty) drops empty entries, so a blank line yields an
    // empty set instead of Set("") — matching the Python implementation.
    val elementSet = line.trim.split(',').filter(_.nonEmpty).toSet
    if (elementSet.nonEmpty) {
      transactions = transactions :+ elementSet
      itemSet = itemSet ++ elementSet
    }
  }

  var toRetItems: Map[Set[String], Double] = Map()
  var associationRules: List[(Set[String], Set[String], Double)] = List()

  /** Fraction of transactions that contain every item of `itemComb`. */
  def getSupport(itemComb: Set[String]): Double = {
    // forall is total (true for an empty combination), whereas the original
    // map(...).reduceRight(_ && _) threw on an empty itemComb.
    def withinTransaction(transaction: Set[String]): Boolean =
      itemComb.forall(transaction.contains)
    val count = transactions.count(withinTransaction)
    count.toDouble / transactions.size.toDouble
  }

  /**
   * Compute all itemsets with support >= minSupport, store them in
   * `toRetItems`, then derive rules with confidence > minConfidence.
   */
  def runApriori(minSupport: Double = 0.15, minConfidence: Double = 0.6) = {
    // Frequent 1-itemsets.  `>=` matches the Python implementation; the
    // original Scala used a strict `>` here, silently dropping borderline sets.
    var itemCombs = itemSet.map(word => (Set(word), getSupport(Set(word))))
      .filter(wordSupportPair => wordSupportPair._2 >= minSupport)
    var currentLSet: Set[Set[String]] = itemCombs.map(_._1).toSet
    var k: Int = 2
    while (currentLSet.nonEmpty) {
      // Candidate k-itemsets: unions of two frequent (k-1)-itemsets that
      // have exactly k elements.
      val currentCSet: Set[Set[String]] =
        (for (s1 <- currentLSet; s2 <- currentLSet) yield s1 | s2)
          .filter(wordSet => wordSet.size == k)
      val currentItemCombs = currentCSet.map(wordSet => (wordSet, getSupport(wordSet)))
        .filter(wordSupportPair => wordSupportPair._2 >= minSupport)
      currentLSet = currentItemCombs.map(_._1).toSet
      itemCombs = itemCombs | currentItemCombs
      k += 1
    }
    for (itemComb <- itemCombs) {
      toRetItems += (itemComb._1 -> itemComb._2)
    }
    calculateAssociationRule(minConfidence)
  }

  /** Derive rules subset -> (itemset -- subset) with confidence > minConfidence. */
  def calculateAssociationRule(minConfidence: Double = 0.6) = {
    toRetItems.keys.foreach(item =>
      // Every non-empty proper subset of the frequent itemset.
      // `&&` replaces the original non-short-circuiting `&`.
      item.subsets.filter(wordSet => wordSet.size < item.size && wordSet.size > 0)
        .foreach { subset =>
          associationRules = associationRules :+ (
            (subset, item diff subset, toRetItems(item) / toRetItems(subset)))
        }
    )
    associationRules = associationRules.filter(rule => rule._3 > minConfidence)
  }
}
我不建议用Java,应改用Python或Scala一类的语言。如果用Python,代码大概50行左右,但可以想像用Java便看起来复杂得多。看如下:
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
/**
* <B>关联规则挖掘:Apriori算法</B>
*
* <P>按照Apriori算法的基本思想来实现
*
* @author king
* @since 2013/06/27
*
*/
public class Apriori {
private Map<Integer, Set<String>> txDatabase // 事务数据库
private Float minSup // 最小支持度
private Float minConf // 最小置信度
private Integer txDatabaseCount // 事务数据库中的事务数
private Map<Integer, Set<Set<String>>> freqItemSet // 频繁项集集合
private Map<Set<String>, Set<Set<String>>> assiciationRules // 频繁关联规则集合
/**
 * @param txDatabase transaction database: transaction id -> set of items
 * @param minSup     minimum support threshold
 * @param minConf    minimum confidence threshold
 */
public Apriori(
        Map<Integer, Set<String>> txDatabase,
        Float minSup,
        Float minConf) {
    this.txDatabase = txDatabase;
    this.minSup = minSup;
    this.minConf = minConf;
    this.txDatabaseCount = this.txDatabase.size();
    freqItemSet = new TreeMap<Integer, Set<Set<String>>>();
    assiciationRules = new HashMap<Set<String>, Set<Set<String>>>();
}
/**
* 扫描事务数据库,计算频繁1-项集
* @return
*/
public Map<Set<String>, Float> getFreq1ItemSet() {
Map<Set<String>, Float> freq1ItemSetMap = new HashMap<Set<String>, Float>()
Map<Set<String>, Integer> candFreq1ItemSet = this.getCandFreq1ItemSet()
Iterator<Map.Entry<Set<String>, Integer>> it = candFreq1ItemSet.entrySet().iterator()
while(it.hasNext()) {
Map.Entry<Set<String>, Integer> entry = it.next()
// 计算支持度
Float supported = new Float(entry.getValue().toString())/new Float(txDatabaseCount)
if(supported>=minSup) {
freq1ItemSetMap.put(entry.getKey(), supported)
}
}
return freq1ItemSetMap
}
/**
* 计算候选频繁1-项集
* @return
*/
public Map<Set<String>, Integer> getCandFreq1ItemSet() {
Map<Set<String>, Integer> candFreq1ItemSetMap = new HashMap<Set<String>, Integer>()
Iterator<Map.Entry<Integer, Set<String>>> it = txDatabase.entrySet().iterator()
// 统计支持数,生成候选频繁1-项集
while(it.hasNext()) {
Map.Entry<Integer, Set<String>> entry = it.next()
Set<String> itemSet = entry.getValue()
for(String item : itemSet) {
Set<String> key = new HashSet<String>()
key.add(item.trim())
if(!candFreq1ItemSetMap.containsKey(key)) {
Integer value = 1
candFreq1ItemSetMap.put(key, value)
}
else {
Integer value = 1+candFreq1ItemSetMap.get(key)
candFreq1ItemSetMap.put(key, value)
}
}
}
return candFreq1ItemSetMap
}
/**
* 根据频繁(k-1)-项集计算候选频繁k-项集
*
* @param m 其中m=k-1
* @param freqMItemSet 频繁(k-1)-项集
* @return
*/
public Set<Set<String>> aprioriGen(int m, Set<Set<String>> freqMItemSet) {
Set<Set<String>> candFreqKItemSet = new HashSet<Set<String>>()
Iterator<Set<String>> it = freqMItemSet.iterator()
Set<String> originalItemSet = null
while(it.hasNext()) {
originalItemSet = it.next()
Iterator<Set<String>> itr = this.getIterator(originalItemSet, freqMItemSet)
while(itr.hasNext()) {
Set<String> identicalSet = new HashSet<String>() // 两个项集相同元素的集合(集合的交运算)
identicalSet.addAll(originalItemSet)
Set<String> set = itr.next()
identicalSet.retainAll(set) // identicalSet中剩下的元素是identicalSet与set集合中公有的元素
if(identicalSet.size() == m-1) { // (k-1)-项集中k-2个相同
Set<String> differentSet = new HashSet<String>() // 两个项集不同元素的集合(集合的差运算)
differentSet.addAll(originalItemSet)
differentSet.removeAll(set) // 因为有k-2个相同,则differentSet中一定剩下一个元素,即differentSet大小为1
differentSet.addAll(set) // 构造候选k-项集的一个元素(set大小为k-1,differentSet大小为k)
if(!this.has_infrequent_subset(differentSet, freqMItemSet))
candFreqKItemSet.add(differentSet) // 加入候选k-项集集合
}
}
}
return candFreqKItemSet
}
/**
* 使用先验知识,剪枝。若候选k项集中存在k-1项子集不是频繁k-1项集,则删除该候选k项集
* @param candKItemSet
* @param freqMItemSet
* @return
*/
private boolean has_infrequent_subset(Set<String> candKItemSet, Set<Set<String>> freqMItemSet) {
Set<String> tempSet = new HashSet<String>()
tempSet.addAll(candKItemSet)
Iterator<String> itItem = candKItemSet.iterator()
while(itItem.hasNext()) {
String item = itItem.next()
tempSet.remove(item)// 该候选去掉一项后变为k-1项集
if(!freqMItemSet.contains(tempSet))// 判断k-1项集是否是频繁项集
return true
tempSet.add(item)// 恢复
}
return false
}
/**
* 根据一个频繁k-项集的元素(集合),获取到频繁k-项集的从该元素开始的迭代器实例
* @param itemSet
* @param freqKItemSet 频繁k-项集
* @return
*/
private Iterator<Set<String>> getIterator(Set<String> itemSet, Set<Set<String>> freqKItemSet) {
Iterator<Set<String>> it = freqKItemSet.iterator()
while(it.hasNext()) {
if(itemSet.equals(it.next())) {
break
}
}
return it
}
/**
 * Compute the frequent k-itemsets from the frequent (k-1)-itemsets by
 * building candidates via aprioriGen and counting their supports against
 * the transaction database.
 *
 * NOTE(review): this method is truncated in this chunk — the support
 * filtering and the return statement (plus closing braces) are not visible
 * here, so the code is left untouched.
 *
 * @param k
 * @param freqMItemSet the frequent (k-1)-itemsets
 * @return
 */
public Map<Set<String>, Float> getFreqKItemSet(int k, Set<Set<String>> freqMItemSet) {
Map<Set<String>, Integer> candFreqKItemSetMap = new HashMap<Set<String>, Integer>()
// obtain the candidate frequent k-itemsets via aprioriGen
Set<Set<String>> candFreqKItemSet = this.aprioriGen(k-1, freqMItemSet)
// scan the transaction database
Iterator<Map.Entry<Integer, Set<String>>> it = txDatabase.entrySet().iterator()
// count supports
while(it.hasNext()) {
Map.Entry<Integer, Set<String>> entry = it.next()
Iterator<Set<String>> kit = candFreqKItemSet.iterator()
while(kit.hasNext()) {
Set<String> kSet = kit.next()
Set<String> set = new HashSet<String>()
set.addAll(kSet)
set.removeAll(entry.getValue()) // set difference: candidate minus transaction items
if(set.isEmpty()) { // empty difference => transaction contains the candidate; count it
if(candFreqKItemSetMap.get(kSet) == null) {
Integer value = 1
candFreqKItemSetMap.put(kSet, value)
}
else {
Integer value = 1+candFreqKItemSetMap.get(kSet)
candFreqKItemSetMap.put(kSet, value)
}
}
}
}
要比较好的实现的话去WEKA源码里面找,或者http://www.helsinki.fi/~holler/datamining/algorithms.html也有~不过其实要把人家写的读懂也挺烦的,Apriori是很基本的,Java也有很多好用的集合类,加把劲一天就能写个能用的出来~