标签:
问题导读:
1.合并数据集
2.重塑和轴向旋转
3.数据转换(待续)
解决方案:
In [3]: df1 = pd.DataFrame(
   ...: {'key':['b','b','a','c','a','a','b'],
   ...: 'data1':range(7)}
   ...: )
In [4]: df1
Out[4]: 
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
[7 rows x 2 columns]
In [5]: df2 = pd.DataFrame(
   ...: {'key':['a','b','d'],
   ...: 'data2':range(3)}
   ...: )
In [6]: df2
Out[6]: 
   data2 key
0      0   a
1      1   b
2      2   d
[3 rows x 2 columns]
In [7]: pd.merge(df1,df2)
Out[7]: 
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0
[6 rows x 3 columns]
In [8]: pd.merge(df1,df2, on='key')
Out[8]: 
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0
[6 rows x 3 columns]
In [10]: df3 = pd.DataFrame({'lkey':['b','b','a','c','a','a','b'],
   ....: 'data1':range(7)})
In [11]: df4 = pd.DataFrame({'rkey':['a','b','d'],'data2':range(3)})
In [12]: pd.merge(df3,df4,left_on='lkey',right_on='rkey')
Out[12]: 
   data1 lkey  data2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a
[6 rows x 4 columns]
In [13]: pd.merge(df1,df2,how='outer')
Out[13]: 
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0
6      3   c    NaN
7    NaN   d      2
[8 rows x 3 columns]
In [15]: df1 = pd.DataFrame({'key':['b','b','a','c','a','b'],
   ....: 'data1':range(6)})
In [16]: df2 = pd.DataFrame({'key':['a','b','a','b','d'],
   ....: 'data2':range(5)})
In [17]: pd.merge(df1,df2,on='key', how = 'left')
Out[17]: 
    data1 key  data2
0       0   b      1
1       0   b      3
2       1   b      1
3       1   b      3
4       5   b      1
5       5   b      3
6       2   a      0
7       2   a      2
8       4   a      0
9       4   a      2
10      3   c    NaN
[11 rows x 3 columns]
In [18]: pd.merge(df1,df2,on='key', how = 'right')
Out[18]: 
    data1 key  data2
0       0   b      1
1       1   b      1
2       5   b      1
3       0   b      3
4       1   b      3
5       5   b      3
6       2   a      0
7       4   a      0
8       2   a      2
9       4   a      2
10    NaN   d      4
[11 rows x 3 columns]
In [19]: pd.merge(df1,df2,on='key',how='inner')
Out[19]: 
   data1 key  data2
0      0   b      1
1      0   b      3
2      1   b      1
3      1   b      3
4      5   b      1
5      5   b      3
6      2   a      0
7      2   a      2
8      4   a      0
9      4   a      2
[10 rows x 3 columns]
In [21]: pd.merge(df1,df2,on='key',how='outer')
Out[21]: 
    data1 key  data2
0       0   b      1
1       0   b      3
2       1   b      1
3       1   b      3
4       5   b      1
5       5   b      3
6       2   a      0
7       2   a      2
8       4   a      0
9       4   a      2
10      3   c    NaN
11    NaN   d      4
[12 rows x 3 columns]
In [27]: left = pd.DataFrame({'key1':['foo','foo','bar'],
'key2':['one','two','one'],
'key3':[1,2,3]})
In [28]: right = pd.DataFrame({'key1':['foo','foo','foo','bar'],
'key2':['one','one','one','two'],
'rval':[4,5,6,7]})
In [29]: pd.merge(left,right,on=['key1','key2'],how='outer')
Out[29]: 
  key1 key2  key3  rval
0  foo  one     1     4
1  foo  one     1     5
2  foo  one     1     6
3  foo  two     2   NaN
4  bar  one     3   NaN
5  bar  two   NaN     7
[6 rows x 4 columns]
In [30]: pd.merge(left,right,on='key1')
Out[30]: 
  key1 key2_x  key3 key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    one     1    one     6
3  foo    two     2    one     4
4  foo    two     2    one     5
5  foo    two     2    one     6
6  bar    one     3    two     7
[7 rows x 5 columns]
In [31]: pd.merge(left,right,on='key1',suffixes=('_left','_right'))
Out[31]: 
  key1 key2_left  key3 key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       one     1        one     6
3  foo       two     2        one     4
4  foo       two     2        one     5
5  foo       two     2        one     6
6  bar       one     3        two     7
[7 rows x 5 columns]
In [37]: left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],
'value':range(6)})
In [38]: right1 = pd.DataFrame({'group_val':[3.5,7]},index = ['a','b'])
In [39]: pd.merge(left1,right1,left_on='key',right_index=True)
Out[39]: 
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
[5 rows x 3 columns]
In [40]: right1
Out[40]: 
   group_val
a        3.5
b        7.0
[2 rows x 1 columns]
In [41]: left1
Out[41]: 
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
[6 rows x 2 columns]
In [48]: lefth = pd.DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
'key2':[2000,2001,2002,2001,2002],
'data':np.arange(5.)})
In [49]: righth = pd.DataFrame(np.arange(12).reshape((6,2)),
index = [['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
[2001,2000,2000,2000,2001,2002]],
columns = ['event1','event2'])
In [50]: lefth
Out[50]: 
   data    key1  key2
0     0    Ohio  2000
1     1    Ohio  2001
2     2    Ohio  2002
3     3  Nevada  2001
4     4  Nevada  2002
[5 rows x 3 columns]
In [52]: righth
Out[52]: 
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
[6 rows x 2 columns]
In [53]: pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
Out[53]: 
   data    key1  key2  event1  event2
0     0    Ohio  2000       4       5
0     0    Ohio  2000       6       7
1     1    Ohio  2001       8       9
2     2    Ohio  2002      10      11
3     3  Nevada  2001       0       1
[5 rows x 5 columns]
In [54]: pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
Out[54]: 
   data    key1  key2  event1  event2
0     0    Ohio  2000       4       5
0     0    Ohio  2000       6       7
1     1    Ohio  2001       8       9
2     2    Ohio  2002      10      11
3     3  Nevada  2001       0       1
4     4  Nevada  2002     NaN     NaN
4   NaN  Nevada  2000       2       3
[7 rows x 5 columns]
In [55]: left2 = pd.DataFrame([[1.,2.],[3.,4.],[5.,6.]], index = ['a','c','e'], ....: columns= ['Ohio','Nevada']) In [56]: right2 = pd.DataFrame([[7.,8.],[9.,10.],[11.,12],[13,14]], ....: index = ['b','c','d','e'],columns=['Missouri','Alabama']) In [57]: left2 Out[57]: Ohio Nevada a 1 2 c 3 4 e 5 6 [3 rows x 2 columns] In [58]: right2 Out[58]: Missouri Alabama b 7 8 c 9 10 d 11 12 e 13 14 [4 rows x 2 columns] In [59]: pd.merge(left2,right2,how='outer',left_index=True,right_index=True) Out[59]: Ohio Nevada Missouri Alabama a 1 2 NaN NaN b NaN NaN 7 8 c 3 4 9 10 d NaN NaN 11 12 e 5 6 13 14 [5 rows x 4 columns]
In [60]: left2.join(right2,how='outer') Out[60]: Ohio Nevada Missouri Alabama a 1 2 NaN NaN b NaN NaN 7 8 c 3 4 9 10 d NaN NaN 11 12 e 5 6 13 14 [5 rows x 4 columns] In [61]: left2.join(right2) Out[61]: Ohio Nevada Missouri Alabama a 1 2 NaN NaN c 3 4 9 10 e 5 6 13 14 [3 rows x 4 columns] In [62]: left1 Out[62]: key value 0 a 0 1 b 1 2 a 2 3 a 3 4 b 4 5 c 5 [6 rows x 2 columns] In [63]: right1 Out[63]: group_val a 3.5 b 7.0 [2 rows x 1 columns] In [64]: left1.join(right1,on='key') Out[64]: key value group_val 0 a 0 3.5 1 b 1 7.0 2 a 2 3.5 3 a 3 3.5 4 b 4 7.0 5 c 5 NaN [6 rows x 3 columns]
In [65]: another = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17.]], ....: index=['a','c','e','f'],columns=['New York','Oregon']) In [66]: left2.join([right2,another]) Out[66]: Ohio Nevada Missouri Alabama New York Oregon a 1 2 NaN NaN 7 8 c 3 4 9 10 9 10 e 5 6 13 14 11 12 [3 rows x 6 columns]
# coding=utf-8
import numpy as np
import pandas as pd
"""
NumPy's concatenate function: merges raw NumPy arrays.
"""
arr = np.arange(12).reshape((3, 4))
print(arr)
'''
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
'''
print(np.concatenate([arr, arr], axis=1))
'''
[[ 0  1  2  3  0  1  2  3]
 [ 4  5  6  7  4  5  6  7]
 [ 8  9 10 11  8  9 10 11]]
'''
"""
For pandas objects (Series and DataFrame), the labeled axes let you
generalize array concatenation further.
"""
"""
(1) When the objects' other axes have different indexes, is the result
their intersection or their union?
Answer: the union, by default.
"""
# concat works along axis=0 by default
s1 = pd.Series([0, 1], index=['a', 'b'])
# s1 = pd.Series([1,2,2], index=['d','b','f'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
print(pd.concat([s1, s2, s3]))
'''
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
'''
# with axis=1 the result becomes a DataFrame
print(pd.concat([s1, s2, s3], axis=1))
'''
    0   1   2
a   0 NaN NaN
b   1 NaN NaN
c NaN   2 NaN
d NaN   3 NaN
e NaN   4 NaN
f NaN NaN   5
g NaN NaN   6
'''
# passing join='inner' yields the intersection instead
s4 = pd.concat([s1 * 5, s3])
print(pd.concat([s1, s4], axis=1))
'''
dtype: int64
    0  1
a   0  0
b   1  5
f NaN  5
g NaN  6
[4 rows x 2 columns]
'''
print(pd.concat([s1, s4], axis=1, join='inner'))
'''
   0  1
a  0  0
b  1  5
[2 rows x 2 columns]
'''
# choose the row labels of the result explicitly; a label missing from
# both Series yields NaN.  The original used join_axes=[...], which was
# removed in pandas 1.0 -- reindex the concatenated result instead.
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))
'''
    0   1
a   0   0
c NaN NaN
b   1   5
e NaN NaN
[4 rows x 2 columns]
'''
# when the pieces are indistinguishable in the result, pass keys to
# build a hierarchical index identifying each piece
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
print(result)
'''
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64
'''
"""
s1 = pd.Series([1,2,6], index=['a','b','f'])
s2 = pd.Series([3,4], index=['c','d'])
s3 = pd.Series([5,6], index=['e','f'])
result = pd.concat([s1,s2,s3], keys=['one','two','three'])
print result
'''
one    a    1
       b    2
       f    6
two    c    3
       d    4
three  e    5
       f    6
dtype: int64
'''
"""
# print(result.unstack())
# when combining along axis=1, the keys become the DataFrame's column headers
print(pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three']))
print(pd.concat([s1, s2, s3], axis=1))
'''
dtype: int64
   one  two  three
a    0  NaN    NaN
b    1  NaN    NaN
c  NaN    2    NaN
d  NaN    3    NaN
e  NaN    4    NaN
f  NaN  NaN      5
g  NaN  NaN      6
[7 rows x 3 columns]
    0   1   2
a   0 NaN NaN
b   1 NaN NaN
c NaN   2 NaN
d NaN   3 NaN
e NaN   4 NaN
f NaN NaN   5
g NaN NaN   6
[7 rows x 3 columns]
'''
# coding=utf-8
import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                   columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                   columns=['three', 'four'])
# keys label each piece of the concatenation along the column axis
# (typo 'levle2' in the original fixed to 'level2')
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2']))
'''
   level1       level2
      one  two   three  four
a       0    1       5     6
b       2    3     NaN   NaN
c       4    5       7     8
[3 rows x 4 columns]
'''
# if a dict is passed instead of a list, its keys are used as the keys option
dic = {'level1': df1, 'level2': df2}
print(pd.concat(dic, axis=1))
'''
   level1       level2
      one  two   three  four
a       0    1       5     6
b       2    3     NaN   NaN
c       4    5       7     8
[3 rows x 4 columns]
'''
# coding=utf-8
import pandas as pd
import numpy as np

df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(pd.concat([df1, df2]))
# ignore_index=True discards the source row labels and builds a fresh range index
print(pd.concat([df1, df2], ignore_index=True))
print(pd.concat([df1, df2], ignore_index=False))
# coding=utf-8
import pandas as pd
import numpy as np

a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])
# the original used b[-1]: positional __setitem__ on a label index was
# deprecated/removed -- use iloc for position-based assignment.
b.iloc[-1] = np.nan
print(np.where(pd.isnull(a), b, a))
print(b[:-2].combine_first(a[2:]))
'''
[ 0.   2.5  2.   3.5  4.5  nan]
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
'''
# For DataFrames, combine_first does the same thing column by column:
# think of it as patching the caller's missing data with the argument's data.
df1 = pd.DataFrame({
    'a': [1., np.nan, 5., np.nan],
    'b': [np.nan, 2., np.nan, 6.],
    'c': range(2, 18, 4)
})
df2 = pd.DataFrame({
    'a': [5., 4., np.nan, 3., 7.],
    'b': [np.nan, 3., 4., 6., 8.]
})
print(df1.combine_first(df2))
# coding=utf-8
import pandas as pd
import numpy as np
"""
Hierarchical indexing gives DataFrame reshaping a consistent interface:
(1) stack   rotates the columns into rows
(2) unstack rotates the rows into columns
"""
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'], name='number')
                    )
print(data)
# stack turns the DataFrame's columns into rows, yielding a Series
result = data.stack()
print(result)
# unstack turns a hierarchically indexed Series back into a DataFrame
print(result.unstack())
'''
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
[2 rows x 3 columns]
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
[2 rows x 3 columns]
'''
# stack/unstack operate on the innermost level by default;
# pass a level number or name to unstack a different level
print(result.unstack(0) == result.unstack('state'))
'''
state   Ohio Colorado
number
one     True     True
two     True     True
three   True     True
[3 rows x 2 columns]
'''
 # coding=utf-8
import pandas as pd
import numpy as np
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one','two'])
print data2
# 连接 -> 层次化索引的Series -> DataFrame(行:每个Series;列:Series的索引)
print data2.unstack()
'''
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
      a   b  c  d   e
one   0   1  2  3 NaN
two NaN NaN  4  5   6
[2 rows x 5 columns]
'''
print data2.unstack().stack()
print data2.unstack().stack(dropna=False)
'''
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: float64
one  a     0
     b     1
     c     2
     d     3
     e   NaN
two  a   NaN
     b   NaN
     c     4
     d     5
     e     6
dtype: float64
'''
# When unstacking a DataFrame, the rotated level becomes the LOWEST level
# of the resulting column index.
# NOTE: the original blog reused `result` from an earlier snippet; rebuild it
# here so this section runs standalone (state x number stacked into a Series).
result = pd.DataFrame(np.arange(6).reshape((2, 3)),
                      index=pd.Index(['Ohio', 'Colorado'], name='state'),
                      columns=pd.Index(['one', 'two', 'three'], name='number')
                      ).stack()
df = pd.DataFrame(
    {'left': result, 'right': result + 5},
    columns=pd.Index(['left', 'right'], name='side')
)
print(df.unstack('state'))
print(df.unstack('state').stack('side'))
# coding=utf-8
import pandas as pd
import numpy as np

# NOTE(review): all paths below are machine-specific; this section only runs
# where the pydata-book ch07 data exists.
data = pd.read_csv("/home/peerslee/py/pydata/pydata-book-master/ch07/macrodata.csv")
frame01 = pd.DataFrame(data, columns=['year', 'realgdp', 'infl', 'unemp'])
path01 = '/home/peerslee/py/pydata/pydata-book-master/ch07/macrodata01.csv'
frame01.to_csv(path01, index=False, header=False)  # drop both index and header
names02 = ['year', 'realgdp', 'infl', 'unemp']
# use the 'year' column as the row index (read_table is deprecated; use read_csv)
frame02 = pd.read_csv(path01, names=names02, index_col='year')
path02 = '/home/peerslee/py/pydata/pydata-book-master/ch07/macrodata02.csv'
# write the stacked (long-format) data; the 'year' label is forward-filled
frame02.stack().to_csv(path02)
names03 = ['date', 'item', 'value']
frame03 = pd.read_csv(path02, names=names03)
print(frame03)
'''
    date     item     value
0   1959  realgdp  2710.349
1   1959     infl     0.000
2   1959    unemp     5.800
3   1959  realgdp  2778.801
4   1959     infl     2.340
5   1959    unemp     5.100
6   1959  realgdp  2775.488
7   1959     infl     2.740
8   1959    unemp     5.300
9   1959  realgdp  2785.204
10  1959     infl     0.270
'''
result_path = '/home/peerslee/py/pydata/pydata-book-master/ch07/result_data.csv'
frame03.to_csv(result_path)  # persist the long-format result
但是我在进行数据pivot 的时候,出现了错误:
 raise ValueError('Index contains duplicate entries, '
ValueError: Index contains duplicate entries, cannot reshape
# coding=utf-8
import pandas as pd
import numpy as np
"""
DataFrame.pivot reshapes "long" format into "wide" format.
"""
# no real dataset at hand, so fabricate a small one
ldata = pd.DataFrame({'date': ['03-31', '03-31', '03-31', '06-30', '06-30', '06-30'],
                      'item': ['real', 'infl', 'unemp', 'real', 'infl', 'unemp'],
                      'value': ['2710.', '000.', '5.8', '2778.', '2.34', '5.1']
                      })
print(ldata)
# rows labeled by date, columns by item, cells filled from value.
# NOTE: positional arguments to pivot were removed in pandas 2.0 --
# index/columns/values must be passed as keywords.
pivoted = ldata.pivot(index='date', columns='item', values='value')
print(pivoted)
'''
item   infl   real unemp
date
03-31  000.  2710.   5.8
06-30  2.34  2778.   5.1
[2 rows x 3 columns]
'''
# add a second value column so two columns get reshaped
ldata['value2'] = np.random.randn(len(ldata))
print(ldata)
# omitting values yields hierarchical columns (one level per value column)
pivoted = ldata.pivot(index='date', columns='item')
print(pivoted)
'''
      value                 value2
item   infl   real unemp      infl      real     unemp
date
03-31  000.  2710.   5.8  1.059406  0.437246  0.106987
06-30  2.34  2778.   5.1 -1.087665 -0.811100 -0.579266
[2 rows x 6 columns]
'''
print(pivoted['value'][:5])
'''
item   infl   real unemp
date
03-31  000.  2710.   5.8
06-30  2.34  2778.   5.1
[2 rows x 3 columns]
'''
# pivot is equivalent to set_index followed by unstack
unstacked = ldata.set_index(['date', 'item']).unstack('item')
print(unstacked)
'''
      value                 value2
item   infl   real unemp      infl      real     unemp
date
03-31  000.  2710.   5.8 -1.018416 -1.476397  1.579151
06-30  2.34  2778.   5.1  0.863437  1.606538 -1.147549
[2 rows x 6 columns]
'''
# coding=utf-8
import pandas as pd
import numpy as np
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
print data
'''
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
[7 rows x 2 columns]
'''
"""
DataFrame 的duplicated 方法返回一个布尔型Series,表示各行是否是重复行
"""
print data.duplicated()
'''
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
'''
"""
DataFrame 的drop_duplicated 方法用于移除了重复行的DataFrame
"""
print data.drop_duplicates()
'''
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
[4 rows x 2 columns]
'''
"""
具体根据 某一列 来判断是否有重复值
"""
data['v1'] = range(7)
print data.drop_duplicates(['k1'])
'''
    k1  k2  v1
0  one   1   0
3  two   3   3
[2 rows x 3 columns]
'''
"""
默认保留的是第1个值,如果想保留最后一个值则传入 take_last=True
"""
print data.drop_duplicates(['k1','k2'], take_last=True)
'''
    k1  k2  v1
1  one   1   1
2  one   2   2
4  two   3   4
6  two   4   6
[4 rows x 3 columns]
'''# coding=utf-8
import pandas as pd
import numpy as np
# a table of foods and their weights
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                              'corned beef', 'Bacon', 'pastrami', 'honey ham',
                              'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
# mapping from food to source animal (original misspelled 'meat_to_ainmal')
# NOTE(review): the book maps 'corned beef' to 'cow'; kept as the original
# author wrote it so the shown output still matches.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'pig',
    'nova lox': 'salmon'
}
"""
Series.map accepts a function or a dict-like object holding a mapping.
"""
# lowercase the food names first, then map them to animals;
# foods missing from the mapping (e.g. 'honey ham') come out as NaN
data['animal'] = data['food'].str.lower().map(meat_to_animal)
print(data)
'''
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     pig
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     NaN
8     nova lox     6.0  salmon
[9 rows x 3 columns]
'''
"""
Alternatively, pass a single function that does all of the work:
data['food'].map(lambda x: meat_to_animal[x.lower()])
"""
# coding=utf-8
import pandas as pd
import numpy as np
data = pd.Series([1., -999., 2., -999., -1000., 3.])
"""
Use pandas replace to turn sentinel values such as -999 into NA.
(replace returns a new Series; these calls just demonstrate the API.)
"""
data.replace(-999, np.nan)
"""
Replace several values at once: pass a list of values and one replacement.
"""
data.replace([-999, -1000], np.nan)
"""
Different replacement per value: pass two parallel lists.
"""
data.replace([-999, -1000], [np.nan, 0])
"""
A dict mapping value -> replacement works as well.
"""
data.replace({-999: np.nan, -1000: 0})
# coding=utf-8
import pandas as pd
import numpy as np
"""
Axis labels can be transformed by a function or a mapping, producing a new
object; axes can also be modified in place.
"""
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
"""
Axis indexes have a map method too.
"""
print(data.index.map(str.upper))
"""
Assign the mapped result directly back to index.
"""
data.index = data.index.map(str.upper)
print(data)
"""
rename creates a transformed copy instead of modifying the original.
"""
data.rename(index=str.title, columns=str.upper)
"""
rename combined with dict-like objects updates a subset of the labels.
BUG FIX: the original passed a set {'three','peekaboo'} for columns;
rename needs a dict mapping old label -> new label.
"""
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})
"""
rename can also modify a DataFrame in place with inplace=True.
"""
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
标签:
原文地址:http://blog.csdn.net/peerslee/article/details/51498713