朴素贝叶斯分类

《Machine Learning In Action: Naive Bayes Classification》

1. 加载数据

  • bayes.py
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    # 模拟加载数据
    def loadData():
    postingList = [['my', 'dogs', 'has', 'flea', 'problem', 'help', 'please'],
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


    # 将所有文档去重、排序,得到我的字典
    def createVocaList(dataset):
    vc_set = set([])
    for docu in dataset:
    vc_set = vc_set | set(docu) # put all words in every document into the vocalSet
    vc_list = list(vc_set)
    list.sort(vc_list)
    return vc_list


    # 根据以上字典, 将一句话转换为vector
    def sentence2Vec(vocaList, input):
    print("Input sentence:%s" % input)
    vec = [0] * len(vocaList)
    for word in input:
    if word in vocaList:
    vec[vocaList.index(word)] = 1
    else:
    print("The word:%s is not in my Vocabulary!" % word)
    print("Returned vector:%s" % vec)
    return vec

2. 训练模型

  • train.py
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    from numpy import *

    def trainNaiveBayes(trainX,trainY):
    totalDoc = len(trainX)
    totalWord = len(trainX[0])

    # 计算侮辱性文档的概率: p(c=1)
    p_class_1 = sum(trainY)/float(totalDoc)

    # 为了解决有的单词在某类文档没有出现导致对应的概率为0, 所以把每个单词出现的初始次数设置为1
    # p_0_num = zeros(totalWord)
    # p_1_num = zeros(totalWord)

    p_0_num = ones(totalWord)
    p_1_num = ones(totalWord)

    p_0_denom = 2.0
    p_1_denom = 2.0

    for i in range(totalDoc):
    if trainY[i] == 1:
    p_1_num +=trainX[i] #计算每个词汇在侮辱性文档中的总数
    p_1_denom += sum(trainX[i]) #计算所有侮辱性文档的词汇总数
    else:
    p_0_num += trainX[i]
    p_0_denom += sum(trainX[i])

    p_1_vector = p_1_num/p_1_denom
    p_0_vector = p_0_num/p_0_denom

    print("p(W|c=1) : ",p_1_vector)
    print("p(W|c=0) : ",p_0_vector)
    print("p(c=1) : ",p_class_1)
    # 返回p(w0|c=1),p(w1|c=1),p(w2|c=1),....
    # 返回p(w0|c=0),p(w1|c=0),p(w2|c=0),....
    # 返回p(c=1)
    return p_1_vector,p_0_vector,p_class_1

3. 测试

  • Test.py
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    import bayes
    import train
    from math import log


    # 这里的概率转换成对数log
    def naiveBayesClassfication(testX, p_1_vector, p_0_vector, p_class_1):
    p1 = log(sum(testX * p_1_vector)) + log(p_class_1)
    p0 = log(sum(testX * p_0_vector)) + log(1 - p_class_1)

    print("p1 = ", p1, ", while p0 = ", p0)
    if p1 > p0:
    return 1
    else:
    return 0

    if __name__ == "__main__":
    document, trainY = bayes.loadData()
    lis = bayes.createVocaList(document)
    print("My dictionary:\n", lis, "\n")

    trainX = []
    for sentence in document:
    print("************************" * 5)
    vec = bayes.sentence2Vec(lis, sentence)
    trainX.append(vec)
    print("")

    print("\n\n")
    print("************************" * 5)
    print("trainX = ", trainX)
    print("trainY = ", trainY)

    print("\n\n")
    print("************************" * 5)
    p_1_vector, p_0_vector, p_class_1 = train.trainNaiveBayes(trainX, trainY)

    print("/****************************Training Work Completed************************************/")


    entry1 = ["love", "my", "dalmation"]
    vect1 = bayes.sentence2Vec(lis,entry1)
    print(entry1,"classified as :",naiveBayesClassfication(vect1,p_1_vector,p_0_vector,p_class_1))

    entry2 = ["stupid","garbage"]
    vect2 = bayes.sentence2Vec(lis,entry2)
    print(entry2,"classified as :",naiveBayesClassfication(vect2,p_1_vector,p_0_vector,p_class_1))