标签:
问题导读:
1.pandas的数据结构介绍
2.利用pandas对数据的基本操作
3.汇总和计算描述统计方法
4.处理缺失数据
5.层次化索引
解决方案:
In [6]: obj = pd.Series([1,2,3,4]) In [7]: obj Out[7]: 0 1 1 2 2 3 3 4 dtype: int64
In [8]: obj.values Out[8]: array([1, 2, 3, 4]) In [9]: obj.index Out[9]: Int64Index([0, 1, 2, 3], dtype='int64')
In [10]: obj2 = pd.Series([2,4,1,6],index=['a','b','c','d']) In [11]: obj2 Out[11]: a 2 b 4 c 1 d 6 dtype: int64 In [12]: obj2.c Out[12]: 1
In [18]: obj2[obj2 > 0] Out[18]: a 2 b 4 c 1 d 6 dtype: int64 In [19]: obj2 * 2 Out[19]: a 4 b 8 c 2 d 12 dtype: int64 In [20]: np.exp(obj2) Out[20]: a 7.389056 b 54.598150 c 2.718282 d 403.428793 dtype: float64
In [21]: sdata = {'Ohio':2000, 'Texas':3000, 'Utah':3425, 'Oregon':3908} In [22]: obj3 = pd.Series(sdata) In [23]: obj3 Out[23]: Ohio 2000 Oregon 3908 Texas 3000 Utah 3425 dtype: int64 In [24]: obj3.values Out[24]: array([2000, 3908, 3000, 3425]) In [25]: obj3.index Out[25]: Index([u'Ohio', u'Oregon', u'Texas', u'Utah'], dtype='object')
In [26]: states = ['california','Ohio','Oregon','Texas'] In [27]: obj4 = pd.Series(sdata,index = states) In [28]: obj4 Out[28]: california NaN Ohio 2000 Oregon 3908 Texas 3000 dtype: float64
In [34]: obj4.isnull() Out[34]: california True Ohio False Oregon False Texas False dtype: bool In [35]: obj4.notnull() Out[35]: california False Ohio True Oregon True Texas True dtype: bool
In [36]: obj3 Out[36]: Ohio 2000 Oregon 3908 Texas 3000 Utah 3425 dtype: int64 In [37]: obj4 Out[37]: california NaN Ohio 2000 Oregon 3908 Texas 3000 dtype: float64 In [38]: obj3 + obj4 Out[38]: Ohio 4000 Oregon 7816 Texas 6000 Utah NaN california NaN dtype: float64
In [40]: obj4.name = 'population' In [41]: obj4.index.name = 'state' In [42]: obj4 Out[42]: state california NaN Ohio 2000 Oregon 3908 Texas 3000 Name: population, dtype: float64 In [43]: obj Out[43]: 0 1 1 2 2 3 3 4 dtype: int64 In [45]: obj.index = ['a','b','c','d'] In [46]: obj Out[46]: a 1 b 2 c 3 d 4 dtype: int64
In [16]: data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year':[2000, 2001, 2002, 2003, 2004], 'pop':[1.5, 1.7, 3.6, 2.4, 2.9]} In [17]: data Out[17]: {'pop': [1.5, 1.7, 3.6, 2.4, 2.9], 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2003, 2004]} In [18]: frame01 = pd.DataFrame(data,columns = ['year','state','pop'],index = ['1','2','3','4','5']) In [19]: frame01 Out[19]: year state pop 1 2000 Ohio 1.5 2 2001 Ohio 1.7 3 2002 Ohio 3.6 4 2003 Nevada 2.4 5 2004 Nevada 2.9
In [31]: frame01['year'] Out[31]: 1 2000 2 2001 3 2002 4 2003 5 2004 Name: year, dtype: int64 In [32]: frame01['pop'] Out[32]: 1 1.5 2 1.7 3 3.6 4 2.4 5 2.9 Name: pop, dtype: float64 In [33]: frame01.state Out[33]: 1 Ohio 2 Ohio 3 Ohio 4 Nevada 5 Nevada Name: state, dtype: object
In [5]: frame01.ix[1] Out[5]: year 2001 state Ohio pop 1.7 Name: 2, dtype: object In [6]: frame01.ix['1'] Out[6]: year 2000 state Ohio pop 1.5 Name: 1, dtype: object
In [35]: frame02 Out[35]: state year pop one Ohio 2000 1.5 two Ohio 2001 1.7 three Ohio 2002 3.6 four Nevada 2003 2.4 five Nevada 2004 2.9 In [36]: frame02.state Out[36]: one Ohio two Ohio three Ohio four Nevada five Nevada Name: state, dtype: object In [37]: frame02.year Out[37]: one 2000 two 2001 three 2002 four 2003 five 2004 Name: year, dtype: int64
In [22]: frame Out[22]: state year pop one Ohio 1 1 two Ohio 1 1 three Ohio 1 1 four Nevada 1 1 five Nevada 1 1 In [23]: frame.year = np.arange(2011,2016) In [24]: frame.pop = np.random.randn(5) In [25]: frame Out[25]: state year pop one Ohio 2011 0.968678 two Ohio 2012 0.280024 three Ohio 2013 -0.396301 four Nevada 2014 -0.132369 five Nevada 2015 -0.181226 In [40]: frame Out[40]: state year pop debt one Ohio 1 1 NaN two Ohio 1 1 NaN three Ohio 1 1 NaN four Nevada 1 1 NaN five Nevada 1 1 NaN In [41]: val = pd.Series([-1.2,-1.5,-1.7], index = ['two', 'four', 'five']) In [42]: frame['dedt'] = val In [43]: frame Out[43]: state year pop debt dedt one Ohio 1 1 NaN NaN two Ohio 1 1 NaN -1.2 three Ohio 1 1 NaN NaN four Nevada 1 1 NaN -1.5 five Nevada 1 1 NaN -1.7
In [44]: frame['newline'] = frame.state != 'Ohio' In [45]: frame Out[45]: state year pop debt dedt newline one Ohio 1 1 NaN NaN False two Ohio 1 1 NaN -1.2 False three Ohio 1 1 NaN NaN False four Nevada 1 1 NaN -1.5 True five Nevada 1 1 NaN -1.7 True In [46]: del frame['newline'] In [47]: frame Out[47]: state year pop debt dedt one Ohio 1 1 NaN NaN two Ohio 1 1 NaN -1.2 three Ohio 1 1 NaN NaN four Nevada 1 1 NaN -1.5 five Nevada 1 1 NaN -1.7
In [69]: frame.columns.name = 'state' In [70]: frame.index.name = 'myname' In [71]: frame.columns.name = 'state' In [72]: frame Out[72]: state state year pop debt dedt myname one Ohio 1 1 NaN NaN two Ohio 1 1 NaN -1.2 three Ohio 1 1 NaN NaN four Nevada 1 1 NaN -1.5 five Nevada 1 1 NaN -1.7
In [73]: frame.values Out[73]: array([['Ohio', '1', '1', nan, nan], ['Ohio', '1', '1', nan, -1.2], ['Ohio', '1', '1', nan, nan], ['Nevada', '1', '1', nan, -1.5], ['Nevada', '1', '1', nan, -1.7]], dtype=object)
In [4]: obj = pd.Series(range(3),index=['a','b','c']) In [5]: obj Out[5]: a 0 b 1 c 2 dtype: int64 In [6]: index = obj.index In [7]: index Out[7]: Index([u'a', u'b', u'c'], dtype='object') In [8]: index[1:] Out[8]: Index([u'b', u'c'], dtype='object') In [9]: index[
In [4]: obj = pd.Series(range(3),index=['a','b','c']) In [5]: obj Out[5]: a 0 b 1 c 2 dtype: int64 In [6]: index = obj.index In [7]: index Out[7]: Index([u'a', u'b', u'c'], dtype='object') In [8]: index[1:] Out[8]: Index([u'b', u'c'], dtype='object')
In [42]: index Out[42]: Index([u'a', u'b', u'c'], dtype='object') In [43]: obj2 = pd.Series(range(3),index = index) In [44]: obj2.index is index Out[44]: True In [45]: index Out[45]: Index([u'a', u'b', u'c'], dtype='object') In [46]: obj2 = pd.Series(range(3),index = index) In [47]: obj2.index is index Out[47]: True
In [48]: index2=index In [49]: index2 Out[49]: Index([u'a', u'b', u'c'], dtype='object') In [50]: obj2.index is index2 Out[50]: True In [51]: index3 = ['a','b','c'] In [52]: index3 is index Out[52]: False In [53]: obj2.index is index3 Out[53]: False
In [28]: frame Out[28]: Nevada Ohio 2000 2.2 1.5 2001 2.1 1.7 2002 2.1 3.6 In [29]: 'Ohio' in frame.columns Out[29]: True In [30]: 2002 in frame.index Out[30]: True In [31]: 1999 in frame.index Out[31]: False
In [38]: obj = pd.Series([4.5,7.2,-5.3,3.8], index = ['d','b','a','c']) In [39]: obj Out[39]: d 4.5 b 7.2 a -5.3 c 3.8 dtype: float64 In [40]: obj2 = obj.reindex(['a','b','c','d','e']) In [41]: obj2 Out[41]: a -5.3 b 7.2 c 3.8 d 4.5 e NaN dtype: float64
In [42]: obj2 = obj.reindex(['a','b','c','d','e'],fill_value = 0) In [43]: obj2 Out[43]: a -5.3 b 7.2 c 3.8 d 4.5 e 0.0 dtype: float64
In [44]: obj3 = pd.Series(['blue','purple','yellow'],index=[0,2,4]) In [45]: obj3.reindex(range(6),method='ffill') Out[45]: 0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object
In [47]: frame = pd.DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'], ....: columns=['Ohio','Texas','California']) In [48]: frame Out[48]: Ohio Texas California a 0 1 2 c 3 4 5 d 6 7 8 In [49]: frame2 = frame.reindex(['a','b','c','d']) In [50]: frame2 Out[50]: Ohio Texas California a 0 1 2 b NaN NaN NaN c 3 4 5 d 6 7 8 In [51]: states = ['Texas','Utah','California'] In [52]: frame.reindex(columns=states) Out[52]: Texas Utah California a 1 NaN 2 c 4 NaN 5 d 7 NaN 8
In [54]: frame.reindex(index=['a','b','c','d'],method = 'ffill', ....: columns = states) Out[54]: Texas Utah California a 1 NaN 2 b 1 NaN 2 c 4 NaN 5 d 7 NaN 8
In [56]: frame.ix[['a','c'],'Texas'] Out[56]: a 1 c 4 Name: Texas, dtype: int64
In [58]: obj = pd.Series(np.arange(5.),index=['a','b','c','d','e']) In [59]: new_obj = obj.drop('c') In [60]: obj Out[60]: a 0 b 1 c 2 d 3 e 4 dtype: float64 In [61]: new_obj Out[61]: a 0 b 1 d 3 e 4 dtype: float64
In [68]: data = pd.DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'], columns=['one','two','three','four']) In [69]: data Out[69]: one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15 In [70]: data.drop(['Colorado','Ohio']) Out[70]: one two three four Utah 8 9 10 11 New York 12 13 14 15 In [71]: data.drop('two',axis=1) Out[71]: one three four Ohio 0 2 3 Colorado 4 6 7 Utah 8 10 11 New York 12 14 15 In [72]: data.drop(['one','four'],axis=1) Out[72]: two three Ohio 1 2 Colorado 5 6 Utah 9 10 New York 13 14
In [3]: obj = pd.Series(np.arange(4.),index=['a','b','c','d']) In [4]: obj Out[4]: a 0 b 1 c 2 d 3 dtype: float64 In [5]: obj['b'] Out[5]: 1.0 In [6]: obj[1] Out[6]: 1.0 In [7]: obj[2:4] Out[7]: c 2 d 3 dtype: float64 In [8]: obj[['b','d']] Out[8]: b 1 d 3 dtype: float64 In [9]: obj[['1','3']] Out[9]: 1 NaN 3 NaN dtype: float64 In [10]: obj[[1,3]] Out[10]: b 1 d 3 dtype: float64 In [11]: obj[obj<2] Out[11]: a 0 b 1 dtype: float64
In [18]: obj[0:3] Out[18]: a 0 b 1 c 2 dtype: float64 In [19]: obj['a':'d'] Out[19]: a 0 b 1 c 2 d 3 dtype: float64
In [21]: obj['a':'c'] = 1 In [22]: obj Out[22]: a 1 b 1 c 1 d 3 dtype: float64
In [24]: data = pd.DataFrame(np.arange(16).reshape((4,4)),index = ['Ohio','Colorado','Utah','New York'], ....: columns=['one','two','three','four']) In [25]: data Out[25]: one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15 [4 rows x 4 columns] In [26]: data[:2] Out[26]: one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 [2 rows x 4 columns] In [27]: data[data['three'] > 5] Out[27]: one two three four Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15 [3 rows x 4 columns]
In [38]: data < 5 Out[38]: one two three four Ohio True True True True Colorado True False False False Utah False False False False New York False False False False [4 rows x 4 columns] In [39]: data[data < 5] = 0 In [40]: data Out[40]: one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15 [4 rows x 4 columns]
In [44]: data.ix['Colorado',['tow','three']] Out[44]: tow NaN three 6 Name: Colorado, dtype: float64 In [45]: data.ix['Colorado',['two','three']] Out[45]: two 5 three 6 Name: Colorado, dtype: int64 In [46]: data.ix[['Colorado','Utah'],[3,1,0]] Out[46]: four two one Colorado 7 5 0 Utah 11 9 8 [2 rows x 3 columns] In [47]: data.ix[2] Out[47]: one 8 two 9 three 10 four 11 Name: Utah, dtype: int64 In [48]: data.ix[:'Utah','two'] Out[48]: Ohio 0 Colorado 5 Utah 9 Name: two, dtype: int64 In [49]: data.ix[data.three > 5,:3] Out[49]: one two three Colorado 0 5 6 Utah 8 9 10 New York 12 13 14 [3 rows x 3 columns]
In [51]: s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a','c','d','e']) In [52]: s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a','c','e','f','g']) In [53]: s1 Out[53]: a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64 In [54]: s2 Out[54]: a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64 In [55]: s1 + s2 Out[55]: a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
In [61]: df1 = pd.DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'), index = ['Ohio','Texas','Colorado']) In [62]: df2 = pd.DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'), index = ['Utah','Ohio','Texas','Oregon']) In [63]: df1 Out[63]: b c d Ohio 0 1 2 Texas 3 4 5 Colorado 6 7 8 [3 rows x 3 columns] In [64]: df2 Out[64]: b d e Utah 0 1 2 Ohio 3 4 5 Texas 6 7 8 Oregon 9 10 11 [4 rows x 3 columns] In [65]: df1 + df2 Out[65]: b c d e Colorado NaN NaN NaN NaN Ohio 3 NaN 6 NaN Oregon NaN NaN NaN NaN Texas 9 NaN 12 NaN Utah NaN NaN NaN NaN [5 rows x 4 columns]
In [66]: df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd')) In [67]: df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde')) In [68]: df1 + df2 Out[68]: a b c d e 0 0 2 4 6 NaN 1 9 11 13 15 NaN 2 18 20 22 24 NaN 3 NaN NaN NaN NaN NaN [4 rows x 5 columns] In [69]: df1.add(df2,fill_value=0) Out[69]: a b c d e 0 0 2 4 6 4 1 9 11 13 15 9 2 18 20 22 24 14 3 15 16 17 18 19 [4 rows x 5 columns]
In [72]: df1.reindex(columns=df2.columns,fill_value=0) Out[72]: a b c d e 0 0 1 2 3 0 1 4 5 6 7 0 2 8 9 10 11 0 [3 rows x 5 columns]
In [73]: arr = np.arange(12.).reshape((3,4)) In [74]: arr Out[74]: array([[ 0., 1., 2., 3.], [ 4., 5., 6., 7.], [ 8., 9., 10., 11.]]) In [75]: arr[0] Out[75]: array([ 0., 1., 2., 3.]) In [76]: array([0.,1.,2.,3.]) Out[76]: array([ 0., 1., 2., 3.]) In [77]: arr - arr[0] Out[77]: array([[ 0., 0., 0., 0.], [ 4., 4., 4., 4.], [ 8., 8., 8., 8.]])
In [78]: frame = pd.DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon']) In [79]: series = frame.ix[0] In [80]: frame Out[80]: b d e Utah 0 1 2 Ohio 3 4 5 Texas 6 7 8 Oregon 9 10 11 [4 rows x 3 columns] In [81]: series Out[81]: b 0 d 1 e 2 Name: Utah, dtype: float64 In [82]: frame - series Out[82]: b d e Utah 0 0 0 Ohio 3 3 3 Texas 6 6 6 Oregon 9 9 9 [4 rows x 3 columns]
In [89]: series2 = pd.Series(range(3),index=['b','e','f']) In [90]: frame + series2 Out[90]: b d e f Utah 0 NaN 3 NaN Ohio 3 NaN 6 NaN Texas 6 NaN 9 NaN Oregon 9 NaN 12 NaN [4 rows x 4 columns]
In [98]: series3 = frame['d'] In [99]: frame Out[99]: b d e Utah 0 1 2 Ohio 3 4 5 Texas 6 7 8 Oregon 9 10 11 [4 rows x 3 columns] In [100]: series3 Out[100]: Utah 1 Ohio 4 Texas 7 Oregon 10 Name: d, dtype: float64 In [101]: frame.sub(series3,axis=0) Out[101]: b d e Utah -1 0 1 Ohio -1 0 1 Texas -1 0 1 Oregon -1 0 1 [4 rows x 3 columns]
In [102]: frame = pd.DataFrame(np.random.randn(4,3),columns=list('bde'), .....: index=['Utah','Ohio','Texas','Oregon']) In [103]: frame Out[103]: b d e Utah 0.752307 -1.701850 0.231538 Ohio -1.156549 -0.116461 -0.224967 Texas -1.919040 0.287937 0.580048 Oregon -1.817225 1.545293 -0.606694 [4 rows x 3 columns] In [104]: np.abs(frame) Out[104]: b d e Utah 0.752307 1.701850 0.231538 Ohio 1.156549 0.116461 0.224967 Texas 1.919040 0.287937 0.580048 Oregon 1.817225 1.545293 0.606694 [4 rows x 3 columns]
lambda只是一个表达式,函数体比def简单很多。
lambda的主体是一个表达式,而不是一个代码块。仅仅能在lambda表达式中封装有限的逻辑进去。
lambda表达式是起到一个函数速写的作用。允许在代码内嵌入一个函数的定义。
In [106]: f = lambda x:x.max() - x.min() In [107]: frame.apply(f) Out[107]: b 2.671347 d 3.247143 e 1.186742 dtype: float64 In [108]: frame.apply(f,axis=1) Out[108]: Utah 2.454157 Ohio 1.040089 Texas 2.499088 Oregon 3.362518 dtype: float64
In [120]: def f(x): return pd.Series([x.min(),x.max()],index=['min','max']) .....: In [121]: frame.apply(f) Out[121]: b d e min -1.919040 -1.701850 -0.606694 max 0.752307 1.545293 0.580048 [2 rows x 3 columns]
In [123]: format = lambda x:'%.2f' % x In [124]: frame.applymap(format) Out[124]: b d e Utah 0.75 -1.70 0.23 Ohio -1.16 -0.12 -0.22 Texas -1.92 0.29 0.58 Oregon -1.82 1.55 -0.61 [4 rows x 3 columns]
In [125]: frame['e'].map(format) Out[125]: Utah 0.23 Ohio -0.22 Texas 0.58 Oregon -0.61 Name: e, dtype: object
In [127]: obj = pd.Series(range(4),index = ['b','a','d','c']) In [128]: obj.sort_index() Out[128]: a 1 b 0 c 3 d 2 dtype: int64
In [132]: frame Out[132]: d a b c three 0 1 2 3 one 4 5 6 7 [2 rows x 4 columns] In [133]: frame.sort_index() Out[133]: d a b c one 4 5 6 7 three 0 1 2 3 [2 rows x 4 columns] In [134]: frame.sort_index(axis=1) Out[134]: a b c d three 1 2 3 0 one 5 6 7 4 [2 rows x 4 columns]
In [135]: frame.sort_index(axis=1,ascending=False) Out[135]: d c b a three 0 3 2 1 one 4 7 6 5 [2 rows x 4 columns]
In [137]: obj = pd.Series([4,7,-3,2]) In [138]: obj.order() Out[138]: 2 -3 3 2 0 4 1 7 dtype: int64
In [139]: obj = pd.Series([4,np.nan,7,np.nan,-3,2]) In [140]: obj.order() Out[140]: 4 -3 5 2 0 4 2 7 1 NaN 3 NaN dtype: float64
In [141]: frame = pd.DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]}) In [142]: frame Out[142]: a b 0 0 4 1 1 7 2 0 -3 3 1 2 [4 rows x 2 columns] In [143]: frame.sort_index(by='b') Out[143]: a b 2 0 -3 3 1 2 0 0 4 1 1 7 [4 rows x 2 columns] In [144]: frame.sort_index(by=['a','b']) Out[144]: a b 2 0 -3 0 0 4 3 1 2 1 1 7 [4 rows x 2 columns]
In [153]: obj = pd.Series([7,-5,7,4,2,0,4]) In [154]: obj.rank() Out[154]: 0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
In [155]: obj.rank(method='first') Out[155]: 0 6 1 1 2 7 3 4 4 3 5 2 6 5 dtype: float64 In [156]: obj.rank(ascending=False,method='max') Out[156]: 0 2 1 7 2 2 3 4 4 5 5 6 6 4 dtype: float64
In [157]: obj = pd.Series(range(5),index = ['a','a','b','b','c']) In [158]: obj Out[158]: a 0 a 1 b 2 b 3 c 4 dtype: int64 In [159]: obj.index.is_unique Out[159]: False
In [161]: obj['a'] Out[161]: a 0 a 1 dtype: int64 In [162]: obj['b'] Out[162]: b 2 b 3 dtype: int64
In [163]: df = pd.DataFrame(np.random.randn(4,3),index=['a','a','b','b']) In [164]: df Out[164]: 0 1 2 a 0.788165 -0.046659 0.781164 a -1.329601 0.245623 0.074344 b 0.126223 0.141285 -2.280768 b 1.605712 -0.450426 -0.083235 [4 rows x 3 columns] In [165]: df.ix['b'] Out[165]: 0 1 2 b 0.126223 0.141285 -2.280768 b 1.605712 -0.450426 -0.083235 [2 rows x 3 columns]
In [54]: df = pd.DataFrame([[1.4,np.nan],[7.1,-4.5], [np.nan,np.nan],[0.75,-1.3]],index = ['a','b','c','d'], ....: columns=['one','two']) In [55]: df Out[55]: one two a 1.40 NaN b 7.10 -4.5 c NaN NaN d 0.75 -1.3 [4 rows x 2 columns] In [56]: df.sum() Out[56]: one 9.25 two -5.80 dtype: float64 In [57]: df.sum(1) Out[57]: a 1.40 b 2.60 c NaN d -0.55 dtype: float64 In [58]: df.mean(axis=1,skipna=False) Out[58]: a NaN b 1.300 c NaN d -0.275 dtype: float64
In [64]: df.idxmax() Out[64]: one b two d dtype: object In [65]: df.cumsum() Out[65]: one two a 1.40 NaN b 8.50 -4.5 c NaN NaN d 9.25 -5.8 [4 rows x 2 columns]
In [67]: df.describe() Out[67]: one two count 3.000000 2.000000 mean 3.083333 -2.900000 std 3.493685 2.262742 min 0.750000 -4.500000 25% 1.075000 -3.700000 50% 1.400000 -2.900000 75% 4.250000 -2.100000 max 7.100000 -1.300000
In [69]: obj = pd.Series(['a','a','b','c']*4) In [70]: obj Out[70]: 0 a 1 a 2 b 3 c 4 a 5 a 6 b 7 c 8 a 9 a 10 b 11 c 12 a 13 a 14 b 15 c dtype: object In [71]: obj.describe() Out[71]: count 16 unique 3 top a freq 8 dtype: object
#!/usr/bin/env python
# coding=utf-8
#
# Fetch daily Yahoo! Finance quotes for four tickers over the range
# 1/1/2000 - 1/1/2010 and assemble two module-level DataFrames with one
# column per ticker:
#   price  -- the 'Adj Close' column of each download
#   volume -- the 'Volume' column of each download
import pandas as pd
# NOTE(review): pandas.io.data only exists in old pandas releases; it appears
# to have been moved to the separate pandas-datareader package. Confirm the
# pandas version in use before running this script.
import pandas.io.data as web

# Maps ticker symbol -> whatever web.get_data_yahoo returns for that symbol
# (indexed below by column name, so presumably a DataFrame).
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

# dict.items() is used instead of the Python-2-only dict.iteritems() so the
# same code runs on both Python 2 and Python 3.
price = pd.DataFrame({tic: data['Adj Close']
                      for tic, data in all_data.items()})
volume = pd.DataFrame({tic: data['Volume']
                       for tic, data in all_data.items()})
In [33]: returns = price.pct_change() In [34]: returns.tail() Out[34]: AAPL GOOG IBM MSFT Date 2009-12-24 0.034339 0.011117 0.004385 0.002587 2009-12-28 0.012294 0.007098 0.013326 0.005484 2009-12-29 -0.011861 -0.005571 -0.003477 0.007058 2009-12-30 0.012147 0.005376 0.005461 -0.013699 2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504 [5 rows x 4 columns] In [35]: returns.MSFT.corr(returns.IBM) Out[35]: 0.49597969625135241 In [36]: returns.MSFT.cov(returns.IBM) Out[36]: 0.00021595764700046635
In [38]: returns.corr() Out[38]: AAPL GOOG IBM MSFT AAPL 1.000000 0.470676 0.410011 0.424305 GOOG 0.470676 1.000000 0.390689 0.443587 IBM 0.410011 0.390689 1.000000 0.495980 MSFT 0.424305 0.443587 0.495980 1.000000 [4 rows x 4 columns] In [39]: returns.cov() Out[39]: AAPL GOOG IBM MSFT AAPL 0.001027 0.000303 0.000252 0.000309 GOOG 0.000303 0.000580 0.000142 0.000205 IBM 0.000252 0.000142 0.000367 0.000216 MSFT 0.000309 0.000205 0.000216 0.000516 [4 rows x 4 columns]
In [40]: returns.corrwith(returns.IBM) Out[40]: AAPL 0.410011 GOOG 0.390689 IBM 1.000000 MSFT 0.495980 dtype: float64
In [43]: returns.corrwith(volume) Out[43]: AAPL -0.057549 GOOG 0.062647 IBM -0.007892 MSFT -0.014245 dtype: float64
In [44]: obj = pd.Series(['c','a','d','a','a','b','b','c','c']) In [47]: uniques = obj.unique() In [48]: uniques Out[48]: array(['c', 'a', 'd', 'b'], dtype=object)
In [50]: uniques.sort() In [51]: uniques Out[51]: array(['a', 'b', 'c', 'd'], dtype=object) In [52]: obj.value_counts() Out[52]: c 3 a 3 b 2 d 1 dtype: int64
In [53]: pd.value_counts(obj.values,sort=False) Out[53]: a 3 c 3 b 2 d 1 dtype: int64
In [55]: mask = obj.isin(['a','c']) In [56]: mask Out[56]: 0 True 1 True 2 False 3 True 4 True 5 False 6 False 7 True 8 True dtype: bool In [57]: obj[mask] Out[57]: 0 c 1 a 3 a 4 a 7 c 8 c dtype: object
In [58]: data = pd.DataFrame({'Qu1':[1,3,4,3,4], ....: 'Qu2':[2,3,1,2,3], ....: 'Qu3':[1,5,2,4,4]}) In [59]: data Out[59]: Qu1 Qu2 Qu3 0 1 2 1 1 3 3 5 2 4 1 2 3 3 2 4 4 4 3 4 [5 rows x 3 columns] In [60]: result = data.apply(pd.value_counts).fillna(0) In [61]: result Out[61]: Qu1 Qu2 Qu3 1 1 1 1 2 0 2 1 3 2 2 0 4 2 0 2 5 0 0 1 [5 rows x 3 columns]
In [66]: string_data = pd.Series(['aardvark','artichoke',np.nan,'avocado']) In [67]: string_data Out[67]: 0 aardvark 1 artichoke 2 NaN 3 avocado dtype: object In [69]: string_data.isnull() Out[69]: 0 False 1 False 2 True 3 False dtype: bool In [70]: string_data[0] = None In [71]: string_data.isnull() Out[71]: 0 True 1 False 2 True 3 False dtype: bool
In [72]: from numpy import nan as NA In [73]: data = pd.Series([1,NA,3.5,NA,7]) In [74]: data.dropna() Out[74]: 0 1.0 2 3.5 4 7.0 dtype: float64 In [75]: data[data.notnull()] Out[75]: 0 1.0 2 3.5 4 7.0 dtype: float64
In [76]: data = pd.DataFrame([[1.,6.5,3.],[1.,NA,NA], ....: [NA,NA,NA],[NA,6.5,3.]]) In [77]: cleaned = data.dropna() In [78]: data Out[78]: 0 1 2 0 1 6.5 3 1 1 NaN NaN 2 NaN NaN NaN 3 NaN 6.5 3 [4 rows x 3 columns] In [79]: cleaned Out[79]: 0 1 2 0 1 6.5 3 [1 rows x 3 columns] In [80]: data.dropna(how='all') Out[80]: 0 1 2 0 1 6.5 3 1 1 NaN NaN 3 NaN 6.5 3 [3 rows x 3 columns] In [81]: data.dropna(axis=1,how='all') Out[81]: 0 1 2 0 1 6.5 3 1 1 NaN NaN 2 NaN NaN NaN 3 NaN 6.5 3 [4 rows x 3 columns]
In [90]: df.ix[:4,1] = NA;df.ix[:3,2] = NA In [91]: df Out[91]: 0 1 2 0 0.739899 NaN NaN 1 0.124309 NaN NaN 2 0.960898 NaN NaN 3 -0.059859 NaN NaN 4 -1.917175 NaN -1.122671 5 -0.073825 1.384257 -1.266332 6 0.957087 0.185031 -1.241783 [7 rows x 3 columns] In [92]: df.dropna(thresh=4) Out[92]: Empty DataFrame Columns: [0, 1, 2] Index: [] [0 rows x 3 columns] In [93]: df.dropna(thresh=3) Out[93]: 0 1 2 5 -0.073825 1.384257 -1.266332 6 0.957087 0.185031 -1.241783 [2 rows x 3 columns]
In [97]: df.fillna(0) Out[97]: 0 1 2 0 0.739899 0.000000 0.000000 1 0.124309 0.000000 0.000000 2 0.960898 0.000000 0.000000 3 -0.059859 0.000000 0.000000 4 -1.917175 0.000000 -1.122671 5 -0.073825 1.384257 -1.266332 6 0.957087 0.185031 -1.241783 [7 rows x 3 columns] In [99]: df.fillna({1:0.5,2:-1}) Out[99]: 0 1 2 0 0.739899 0.500000 -1.000000 1 0.124309 0.500000 -1.000000 2 0.960898 0.500000 -1.000000 3 -0.059859 0.500000 -1.000000 4 -1.917175 0.500000 -1.122671 5 -0.073825 1.384257 -1.266332 6 0.957087 0.185031 -1.241783 [7 rows x 3 columns]
In [100]: _ = df.fillna(0,inplace=True) In [101]: df Out[101]: 0 1 2 0 0.739899 0.000000 0.000000 1 0.124309 0.000000 0.000000 2 0.960898 0.000000 0.000000 3 -0.059859 0.000000 0.000000 4 -1.917175 0.000000 -1.122671 5 -0.073825 1.384257 -1.266332 6 0.957087 0.185031 -1.241783 [7 rows x 3 columns]
In [105]: data = pd.Series(np.random.randn(10), .....: index = [['a','a','a','b','b','b','c','c','d','d'], .....: [1,2,3,1,2,3,1,2,2,3]]) In [106]: data Out[106]: a 1 -0.237206 2 -0.992311 3 1.685961 b 1 0.987261 2 -1.166006 3 -0.962065 c 1 -1.071484 2 0.393728 d 2 0.793652 3 -0.223266 dtype: float64 In [107]: data.index Out[107]: MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]]) In [108]: data['b'] Out[108]: 1 0.987261 2 -1.166006 3 -0.962065 dtype: float64
In [109]: data[:,2] Out[109]: a -0.992311 b -1.166006 c 0.393728 d 0.793652 dtype: float64
In [110]: data.unstack() Out[110]: 1 2 3 a -0.237206 -0.992311 1.685961 b 0.987261 -1.166006 -0.962065 c -1.071484 0.393728 NaN d NaN 0.793652 -0.223266 [4 rows x 3 columns] In [111]: data.unstack().stack() Out[111]: a 1 -0.237206 2 -0.992311 3 1.685961 b 1 0.987261 2 -1.166006 3 -0.962065 c 1 -1.071484 2 0.393728 d 2 0.793652 3 -0.223266 dtype: float64
In [113]: frame = pd.DataFrame(np.arange(12).reshape((4,3)), index = [['a','a','b','b'],[1,2,1,2]], columns = [['Ohio',"Ohio",'Colorado'], ['Green','Red','Green']]) In [114]: frame Out[114]: Ohio Colorado Green Red Green a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 [4 rows x 3 columns] In [115]: frame.index.names = ['key1','key2'] In [116]: frame.columns.names = ['state','color'] In [117]: frame Out[117]: state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 [4 rows x 3 columns]
In [125]: col = pd.MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color']) In [126]: frame2 = pd.DataFrame(np.arange(12).reshape((4,3)), index = [['a','a','b','b'],[1,2,1,2]], columns = col ) In [127]: frame2 Out[127]: state Ohio Colorado color Green Red Green a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 [4 rows x 3 columns]
In [129]: frame.swaplevel('key1','key2') Out[129]: state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 2 a 3 4 5 1 b 6 7 8 2 b 9 10 11 [4 rows x 3 columns]
In [130]: frame.sortlevel(1) Out[130]: state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 b 1 6 7 8 a 2 3 4 5 b 2 9 10 11 [4 rows x 3 columns] In [131]: frame Out[131]: state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 [4 rows x 3 columns] In [132]: frame.sortlevel(0) Out[132]: state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 [4 rows x 3 columns]
In [134]: frame.swaplevel(0,1) Out[134]: state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 2 a 3 4 5 1 b 6 7 8 2 b 9 10 11 [4 rows x 3 columns] In [135]: frame Out[135]: state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 [4 rows x 3 columns] In [136]: frame.swaplevel(0,1).sortlevel(0) Out[136]: state Ohio Colorado color Green Red Green key2 key1 1 a 0 1 2 b 6 7 8 2 a 3 4 5 b 9 10 11 [4 rows x 3 columns]
In [137]: frame.sum(level='key2') Out[137]: state Ohio Colorado color Green Red Green key2 1 6 8 10 2 12 14 16 [2 rows x 3 columns] In [138]: frame.sum(level='color',axis=1) Out[138]: color Green Red key1 key2 a 1 2 1 2 8 4 b 1 14 7 2 20 10 [4 rows x 2 columns]
In [139]: frame = pd.DataFrame({'a':range(7),'b':range(7,0,-1), .....: 'c':['one','one','one','two','two','two','two'], .....: 'd':[0,1,2,0,1,2,3]}) In [140]: frame Out[140]: a b c d 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 3 3 4 two 0 4 4 3 two 1 5 5 2 two 2 6 6 1 two 3 [7 rows x 4 columns] In [141]: frame2 = frame.set_index(['c','d']) In [142]: frame2 Out[142]: a b c d one 0 0 7 1 1 6 2 2 5 two 0 3 4 1 4 3 2 5 2 3 6 1 [7 rows x 2 columns]
In [143]: frame.set_index(['c','d'],drop=False) Out[143]: a b c d c d one 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 two 0 3 4 two 0 1 4 3 two 1 2 5 2 two 2 3 6 1 two 3 [7 rows x 4 columns] In [144]: frame2.reset_index() Out[144]: c d a b 0 one 0 0 7 1 one 1 1 6 2 one 2 2 5 3 two 0 3 4 4 two 1 4 3 5 two 2 5 2 6 two 3 6 1 [7 rows x 4 columns]
标签:
原文地址:http://blog.csdn.net/peerslee/article/details/51345369