码迷,mamicode.com
首页 > 其他好文 > 详细

bayes

时间:2015-01-26 06:30:47      阅读:235      评论:0      收藏:0      [点我收藏+]

标签:

from numpy import *

import time
# Wall-clock timestamp taken when the module starts executing.
# NOTE(review): nothing in the visible code reads this value back.
starttime = time.time()


def loadDataSet():
    """Return a small hard-coded, hand-labelled toy corpus.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` holds six
        tokenised posts (lists of words) and ``classVec`` holds one integer
        label per post, in the same order.
        NOTE(review): presumably 1 marks the abusive posts and 0 the
        harmless ones -- confirm against the data's origin.
    """
    # The words must be string literals; as bare names they are undefined
    # (and `not` / `is` are keywords, which makes the list a syntax error).
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    """Collect the distinct tokens that occur anywhere in *dataSet*.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding every distinct token exactly once. The order is
        whatever set iteration yields, so callers must not rely on it.
    """
    vocabulary = set()
    for tokens in dataSet:
        # Fold each document's tokens into the running vocabulary.
        vocabulary.update(tokens)
    return list(vocabulary)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words vector with add-one smoothing.

    Args:
        vocabList: sequence of vocabulary words; fixes the vector layout.
        inputSet: iterable of the words that occur in the document.

    Returns:
        A list as long as ``vocabList`` holding 2.0 at the position of every
        vocabulary word present in ``inputSet`` and 1.0 everywhere else.

    Side effects:
        Prints one warning for each word of ``inputSet`` that the vocabulary
        does not contain.
    """
    present = set(inputSet)
    known = set(vocabList)

    # Warn about *input* words missing from the vocabulary. The message used
    # to be printed for vocabulary words absent from the input, which is the
    # opposite of what it says.
    for word in inputSet:
        if word not in known:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("the word: %s is not in my Vocabulary!" % word)

    # Build the vector positionally rather than via vocabList.index(), which
    # is quadratic and only ever filled the first slot of a duplicated word.
    return [2.0 if word in present else 1.0 for word in vocabList]




def txt2trainxy(filename1, filename2):
    """Load two labelled folders of text files and vectorise them.

    Every file under ``email/<filename1>`` and ``email/<filename2>`` becomes
    one training document labelled with its folder name. Only the first line
    of each file is read, lower-cased and split on non-word characters;
    tokens of one or two characters are dropped.
    NOTE(review): reading just the first line mirrors the original code --
    confirm that the remaining lines are meant to be ignored.

    Args:
        filename1: name of the first class folder inside ``email/``.
        filename2: name of the second class folder inside ``email/``.

    Returns:
        A 5-tuple ``(trainxws, trainxwb, trainy, trainx, fulltext)``:
        trainxws -- per document, 2.0 where the vocabulary word occurs and
                    1.0 where it does not (set of words, add-one smoothed);
        trainxwb -- per document, occurrence count + 1.0 (bag of words);
        trainy   -- folder-name label of each document;
        trainx   -- token list of each document;
        fulltext -- sorted list of the distinct tokens (the vocabulary).

    Raises:
        OSError: if a folder or file cannot be listed or opened.
    """
    import os
    import re
    from collections import Counter

    # One-or-more non-word characters. The former zero-or-more pattern can
    # match the empty string, and since Python 3.7 re.split() splits on empty
    # matches too, which would cut the text between every single character.
    splitter = re.compile(r'\W+')

    # step 1: loading data...
    print("step 1: loading data...")
    ld1 = os.listdir(os.path.join('email', filename1))
    ld2 = os.listdir(os.path.join('email', filename2))

    # One label per file; list repetition stays correct even when a folder
    # name contains whitespace (joining and re-splitting a string did not).
    trainy = [filename1] * len(ld1) + [filename2] * len(ld2)

    trainx = []
    for folder, names in ((filename1, ld1), (filename2, ld2)):
        for name in names:
            # `with` closes the handle; readline() yields '' for an empty
            # file instead of raising IndexError like readlines()[0].
            with open(os.path.join('email', folder, name)) as handle:
                first_line = handle.readline()
            pieces = splitter.split(first_line.lower())
            trainx.append([piece for piece in pieces if len(piece) > 2])

    # Sorted so that the feature layout is reproducible between runs.
    fulltext = sorted(set(token for doc in trainx for token in doc))

    # set of words
    trainxws = []
    # bag of words
    trainxwb = []
    for doc in trainx:
        tally = Counter(doc)  # a Counter reports 0 for absent tokens
        trainxws.append([2.0 if token in tally else 1.0 for token in fulltext])
        trainxwb.append([tally[token] + 1.0 for token in fulltext])

    return trainxws, trainxwb, trainy, trainx, fulltext

def testx2vec(testx, fulltext):
    # set of words
    testxws = [list(set(testx)).count(strg) + 1.0 for strg in fulltext] #
    # bag of words 
    testxwb = [testx.count(strg) + 1.0 for strg in fulltext] #
    for word in testx:
        if word not in fulltext:
            print "the word: %s is not in my fulltext!" % word
    return testxws, testxwb

def bayes(testx, trainx, trainy, fulltext):
    """Classify one feature vector with a naive-Bayes style log score.

    For every distinct label in ``trainy`` the per-word weights of its
    training rows are summed and divided by (the sum of all weights of those
    rows + 2.0). The score of a class is the sum over words of
    ``log(word probability) * testx[word]`` plus the log of the class prior;
    the label with the highest score wins.

    Args:
        testx: feature vector of the document to classify, laid out like a
            row of ``trainx``.
        trainx: list of equally long training vectors.
            NOTE(review): main() passes the 1.0/2.0 set-of-words encoding;
            zero entries would make the logarithm diverge.
        trainy: label of each training vector, parallel to ``trainx``.
        fulltext: vocabulary list; only used to name the top words printed.

    Returns:
        The predicted label (one of the values found in ``trainy``).

    Side effects:
        Prints progress lines, the five most probable words and the label of
        every class, the per-class log scores and the winning label.
    """
    print("---Getting Prob...")
    # Distinct labels in order of first appearance, so class order (and with
    # it tie-breaking and the print order) is reproducible between runs.
    classes = []
    for label in trainy:
        if label not in classes:
            classes.append(label)

    n_docs = len(trainy)
    weights = array(trainx, dtype=float)
    rows_by_class = [
        [row for row, label in enumerate(trainy) if label == cls]
        for cls in classes
    ]

    # Log prior of each class: share of the training documents it labels.
    logproby = array([log(len(rows) / float(n_docs)) for rows in rows_by_class])
    # Per-word weight totals of each class (one vector per class). Array
    # methods are used so nothing depends on numpy's sum() shadowing the
    # builtin through the file's star import.
    numbxv = [weights[rows].sum(axis=0) for rows in rows_by_class]
    # Grand total of all weights of each class, plus 2.0.
    numbx = [weights[rows].sum() + 2.0 for rows in rows_by_class]
    probx = array([vec / denom for vec, denom in zip(numbxv, numbx)])
    logprobx = log(probx)

    print("---Printing Prob...")
    # Five most probable words of every class, followed by the class label.
    # Looping over all classes also works when there is just one class.
    for cls, prob in zip(classes, probx):
        print([fulltext[i] for i in (-prob).argsort()[:5]])  # argsort: small to big
        print(cls)

    # step 4: showing the result...
    print("---Showing the result...")
    features = array(testx, dtype=float)
    sumlogpxws = (logprobx * features).sum(axis=1)
    sumlogpxyws = sumlogpxws + logproby
    print(sumlogpxws)
    print((probx * features).sum(axis=1))
    bestyws = classes[int(sumlogpxyws.argmax())]
    # Same text a Python 2 `print "...: ", value` statement produced.
    print("---From set of words:  %s" % (bestyws,))
    return bestyws
    

def main():
    """Run the demo: load the e-mails, estimate the error rate, classify a sample.

    Loads ``email/spam`` and ``email/ham`` through txt2trainxy(), holds out a
    random subset of the documents, classifies each held-out document with
    bayes() trained on the rest, prints the error count and rate, and finally
    classifies the fixed word list ``['love', 'my', 'dalmation']``.

    Returns:
        None. All results are printed.
    """
    import random

    # step 1: loading data...
    trainxws, trainxwb, trainy, trainx, fulltext = txt2trainxy('spam', 'ham')
    print(fulltext)
    if not trainy:
        # Without any document there is nothing to train or test on.
        print("no training documents found")
        return

    # step 2: training...
    print("step 2: training...")
    # Nothing to do here: bayes() derives its probabilities on every call.

    # step 3: testing...
    print("step 3: testing...")
    print("---Preparing testdata...")
    l = len(trainy)
    # Hold out 20 documents, but never more than half of a small corpus:
    # random.sample() raises ValueError when asked for more items than exist,
    # and the classifier needs training documents left over.
    testid = random.sample(range(l), min(20, l // 2))
    held_out = set(testid)  # constant-time membership for the split below
    testxxx = [trainxws[i] for i in testid]
    testyyy = [trainy[i] for i in testid]
    testtrainxws = [trainxws[i] for i in range(l) if i not in held_out]
    testtrainy = [trainy[i] for i in range(l) if i not in held_out]

    print("---Testing now...")
    errorcount = 0
    p = len(testid)
    for vector, expected in zip(testxxx, testyyy):
        if bayes(vector, testtrainxws, testtrainy, fulltext) != expected:
            errorcount += 1
    print(errorcount)
    print(p)
    if p:
        # Guarded: with nothing held out the rate would divide by zero.
        print("---Errorrate is:  %s" % (errorcount / float(p)))

    # step 4: showing the result
    print("step 4: using...")
    testx = ['love', 'my', 'dalmation']
    print("the testx is:  %s" % (testx,))
    print("---Changing testx into vector...")
    testxws, testxwb = testx2vec(testx, fulltext)
    bayes(testxws, testtrainxws, testtrainy, fulltext)

# Script entry point: the whole pipeline runs as soon as this module is
# executed; the call is unguarded, so it also runs when the module is imported.
main()


"""
trainx, trainy = loadDataSet()
fulltext = createVocabList(trainx)
print fulltext
print setOfWords2Vec(fulltext, trainx[0])
trainxws = []
for t in trainx:
    trainxws.append(setOfWords2Vec(fulltext, t))
testEntry1 = ['love', 'my', 'dalmation']
testEntry2 = ['stupid', 'garbage']
bayes(testEntry1, trainxws, trainy, fulltext)

"""

 

bayes

标签:

原文地址:http://www.cnblogs.com/monne/p/4249324.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!