码迷,mamicode.com
首页 > 编程语言 > 详细

python numpy 把数据向量化

时间:2015-08-21 17:07:04      阅读:184      评论:0      收藏:0      [点我收藏+]

标签:

某公司的数据处理。

比如有数据:

0 1 20100304 51.14 5 62.2 0 0.0 1 6.82 4.08666666667 4 3
0 0 20100307 2.47333333333 3 75.9333333333 0 0.0 114 13.5666666667 2.86666666667 0 0
0 2 20100318 2.49333333333 3 58.6666666667 0 0.0 1 5.22666666667 1.61333333333 0 0
0 2 20100329 9.14666666667 14 70.6428571429 0 0.0 1 31.84 7.51333333333 11 7
0 0 20100401 3.36666666667 5 71.1 1 75.0 1 1.86 1.29333333333 0 0
0 1 20100401 7.44666666667 11 58.1909090909 0 0.0 51 40.8066666667 12.5066666667 2 1
0 0 20100401 1.0 1 138.0 0 0.0 1 1.0 1.0 0 0
1 0 20100428 1.0 1 85.0 0 0.0 1 0.0 0.0 0 0
1 0 20100429 1.0 1 128.0 0 0.0 1 24.54 3.59333333333 0 0
0 0 20100506 1.92 2 67.0 0 0.0 44 6.65333333333 1.91333333333 4 1

要求:

1、除去第1列和第2列。
2、对剩下的日期列提取特征,以月份为特征,形式:201003

3、对剩下的数字列,用最大值和最小值的区间,划分十份,每一份就是一个特征,形式:[1,5)、[5,9)...[21,25],注意最后一个区间闭合了。

4、用上面获取的特征,把原始数据向量化,形式:

1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 


开写:

先下载安装pyExcelerator:http://sourceforge.net/projects/pyexcelerator/,解压后setup.py install

# -*- coding: cp936 -*-

import numpy as np
from numpy import * 
import pprint 
from pyExcelerator import *


#all_col_feature存放特征们
all_col_feature = []

for i in range(2,13):
    #获取[日期]列特征:201003,201004...201408...
    if i == 2 :
        FILENAME = "lost5"
        date = [e[0:6] for e in loadtxt(FILENAME , delimiter = " " , usecols=(2,) , dtype=str)]
        date = list(set(date)) #去重
        date.sort()
        all_col_feature.append(date)
    #获取[数字]列特征:取最大值,最小值,划分10份,每一份就是一个特征,[1,5),[5,9)..[21,25]
    else:
        digit_col = filter(lambda e:e!=0,loadtxt(FILENAME , delimiter = " " , usecols=(i,) , dtype=float))#获取数据后去0
        digit_col.sort()
        max_num = digit_col[-1]
        min_num = digit_col[0]
        block_size = (max_num - min_num) /10
        col_feature = []
        for i in range(11):
            col_feature.append(min_num+i*block_size)
        all_col_feature.append(col_feature)



w = Workbook() #创建一个工作簿    
ws = w.add_sheet('Zero_One_Data') #创建一个工作表
row_index = 0
col_index = 0
for i in range(len(all_col_feature)):
    if i == 0:
        for j in all_col_feature[i]:
            ws.write(row_index,col_index,str(j))
            col_index += 1
    else:
        for j in range(len(all_col_feature[i])-1):
            ws.write(row_index,col_index,str(all_col_feature[i][j])+"~"+str(all_col_feature[i][j+1]))
            col_index += 1
row_index += 1

out = file('out.txt','w')


res = []
for row in loadtxt(FILENAME , delimiter = " " , usecols=(2,3,4,5,6,7,8,9,10,11,12) , dtype=str):
    #数据整理成这个形式:every_row_feature_index = [0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1]
    #比如:第一个0表示在[日期]列的特征下标为0,-1表示NONE值
    #比如:第一个1表示在第二列的特征下标为1,心好累,我这都要注释?
    every_row_feature_index = []
    a = [i for i in range(len(all_col_feature[0])) if row[0][0:6] <= all_col_feature[0][i]]
    every_row_feature_index.append(a[0])
    
    for col in range(1,11):
        b = [i for i in range(len(all_col_feature[col])) if float(row[col]) < float(all_col_feature[col][i])]
        if len(b) == 0:#max_num
            every_row_feature_index.append(len(all_col_feature)-2)
        else:
            every_row_feature_index.append(b[0]-1) #-1表NONE
        
    
    #生成向量
    #就是把every_row_feature_index = [0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1]转成下面的形式:
    #[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    zero_one_every_row_res = []
    col_index = 0
    for i in range(len(every_row_feature_index)):
        if i == 0: #处理[日期]列
            for j in range(every_row_feature_index[i]):
                out.write('0'+' ')
                ws.write(row_index,col_index,'0')
                col_index += 1
            out.write('1'+' ')
            ws.write(row_index,col_index,'1')
            col_index += 1
            for j in range(every_row_feature_index[i]+1,len(all_col_feature[i])):
                out.write('0'+' ')
                ws.write(row_index,col_index,'0')
                col_index += 1
        else:#处理其他列
            if every_row_feature_index[i] == -1: #-1表NONE值 向量全为0
                for j in range(len(all_col_feature[i])-1):
                    out.write('0'+' ')
                    ws.write(row_index,col_index,'0')
                    col_index += 1
            else:
                for j in range(every_row_feature_index[i]):
                    out.write('0'+' ')
                    ws.write(row_index,col_index,'0')
                    col_index += 1
                out.write('1'+' ')
                ws.write(row_index,col_index,'1')
                col_index += 1
                for j in range(every_row_feature_index[i]+1,len(all_col_feature[i])-1):
                    out.write('0'+' ')
                    ws.write(row_index,col_index,'0')
                    col_index += 1
    out.write('\n')
    row_index += 1
    if row_index == 100:#为了更快看到结果,只设置100行,去掉可得到全部
        break

out.close()
w.save('out.xls')


看看结果:

技术分享

技术分享

版权声明:本文为博主原创文章,未经博主允许不得转载。

python numpy 把数据向量化

标签:

原文地址:http://blog.csdn.net/emaste_r/article/details/47836823

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!