Python练手，pandas

时间：2018-01-19 11:45:23 阅读：205 评论：0 收藏：0 [点我收藏+]

"""Hands-on tour of pandas, following the official "10 minutes to pandas" guide.

Reference: http://pandas.pydata.org/pandas-docs/stable/10min.html

numpy's main data structure is the ndarray; pandas' main data structures are
Series and DataFrame.

The script only prints results and (at the end) shows two plots.  Comments
next to a call describe the result; exact console formatting varies between
pandas versions.  Calls removed in pandas 2.0 (``DataFrame.append``, assigning
to ``Series.cat.categories``, ``DataFrame.mean`` over non-numeric columns)
have been replaced by equivalents that also work on older releases.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Printed between sections to keep the console output readable.
SEPARATOR = '------------------------------------------------------------------------------------'

# ---------------------------------------------------------------------------
# Creating DataFrames
# ---------------------------------------------------------------------------
df1 = pd.DataFrame(np.arange(101, 125).reshape(6, 4),
                   index=range(6),
                   columns=list('ABCD'))
print(df1)
#      A    B    C    D
# 0  101  102  103  104
# 1  105  106  107  108
# 2  109  110  111  112
# 3  113  114  115  116
# 4  117  118  119  120
# 5  121  122  123  124

# NOTE: when a column is given as a Series, pandas aligns it with the
# DataFrame on index *labels*.  If you pass a custom DataFrame index, every
# Series column must use the same labels, otherwise the values do not line up
# and the non-matching rows become NaN.
#
# `columns=` pins the column order.  Old pandas sorted dict keys
# alphabetically while current pandas keeps insertion order; the positional
# (iloc/iat) examples below rely on the alphabetical order, so it is made
# explicit here.
df2 = pd.DataFrame(
    {
        'custID': ['C0001', 'C0002', 'C0004', 'C0004', 'C0004', 'C0003'],
        'accountID': pd.Series(['6214C000101', '6214C000201', '6214C000401',
                                '6214C000403', '6214C000402', '6214C000301'],
                               index=range(6), dtype='str'),
        'tradeDate': pd.Series(['2018-01-18 14:00:00', '2018-01-18 14:00:00',
                                '2018-01-18 14:00:01', '2018-01-18 14:00:03',
                                '2018-01-18 14:00:02', '2018-01-18 14:00:00'],
                               index=range(6), dtype='str'),
        'tradeAmt': pd.Series([100.0, 100.0, 101.0, 103.0, 102.0, 100.0],
                              index=range(6), dtype='float'),
        'tradeDesc': 'xxxxxx',  # a scalar is broadcast to every row
        'mark': pd.Categorical(["row1", "row2", "row3", "row4", "row5", "row6"]),
    },
    index=range(6),
    columns=['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'],
)
print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

# ---------------------------------------------------------------------------
# Inspecting data
# ---------------------------------------------------------------------------
print(df2.dtypes)    # per-column dtypes: mark is category, tradeAmt is float64
print(df2.index)     # RangeIndex(start=0, stop=6, step=1)
print(df2.columns)   # the six column labels
print(df2.values)    # the underlying data as a 2-D array, one row per record
print(df2.head(2))   # first two rows (labels 0 and 1)
print(df2.tail(2))   # last two rows (labels 4 and 5)

# Summary statistics; only numeric columns are included by default.
print(df2.describe())
#          tradeAmt
# count    6.000000
# mean   101.000000
# std      1.264911
# min    100.000000
# 25%    100.000000
# 50%    100.500000
# 75%    101.750000
# max    103.000000

print(df2.T)  # transpose: column labels become the row labels and vice versa
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Sorting
# ---------------------------------------------------------------------------
# Sort by one column, descending: rows 3, 4, 2 come first, followed by the
# three rows that share the 14:00:00 timestamp.
print(df2.sort_values(by='tradeDate', ascending=False))

# Sort by several columns with a direction per column: rows 0, 1, 5, 3, 4, 2.
print(df2.sort_values(by=['custID', 'tradeDate'], ascending=[True, False]))

# Sort by row labels, descending: rows 5, 4, 3, 2, 1, 0.
print(df2.sort_index(axis=0, ascending=False))

# Sort by column labels, ascending.
print(df2.sort_index(axis=1, ascending=True))
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Selection
#   iloc / iat select by integer position; loc / at select by label.
# ---------------------------------------------------------------------------
print(df2['custID'])   # a single column, returned as a Series
print(df2[0:4])        # row slice: rows 0..3
print(df2[1:4])        # row slice: rows 1..3

print(df2.loc[1, 'accountID'])   # by row/column *label* -> 6214C000201
print(df2.iloc[3])               # 4th row as a Series (Name: 3)
print(df2.iloc[3, 4])            # 4th row, 5th column -> 2018-01-18 14:00:03
print(df2.iloc[3:4])             # positional slice, end excluded -> row 3 only
print(df2.iloc[3:5, 1:3])        # rows 3-4, columns custID and mark
print(df2.iloc[[3, 4], [1, 2]])  # same selection using explicit position lists
print(df2.iloc[3:5, :])          # rows 3-4, every column
print(df2.iloc[:, 1:3])          # every row, columns custID and mark

# Boolean filtering: keeps rows 3 and 4 (tradeAmt 103.0 and 102.0).
print(df2[df2.tradeAmt > 101.0])
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Updating values (on a copy, so df2 stays untouched)
# ---------------------------------------------------------------------------
df3 = df2.copy()
df3["custID"] = ["NEW", "NEW", "NEW", "NEW", "NEW", "NEW"]  # replace a whole column
df3.loc[:, 'tradeAmt'] = range(len(df3))  # update through label-based selection
df3.at[1, 'accountID'] = '==========='    # single cell, by label
df3.iat[0, 0] = '+++++++++++'             # single cell, by position
# NOTE: negating the rows that match a condition
# (df3[df3.tradeDate == '2018-01-18 14:00:03'] = -df3) only works when every
# column is numeric, so it is not done here.
print(df3)
# accountID is '+++++++++++' in row 0 and '===========' in row 1,
# custID is 'NEW' everywhere and tradeAmt runs from 0 to 5.
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Reindexing and missing data
# ---------------------------------------------------------------------------
# reindex picks (and reorders) the given row and column labels.
df4 = df2.reindex(index=range(4), columns=['custID', 'accountID', 'tradeAmt'])
df4.loc[0:1, 'tradeAmt'] = 200   # existing column: the values are updated
df4.loc[0:1, 'newColumn'] = 1    # missing column: it is created, other rows are NaN
print(df4)
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0        NaN
# 3  C0004  6214C000403     103.0        NaN

print(df4.dropna(how='any'))    # drop rows containing any NaN -> rows 0 and 1 remain
print(df4.fillna(value=999))    # replace NaN -> newColumn is 999.0 in rows 2 and 3
print(pd.isnull(df4))           # boolean mask; True only for newColumn in rows 2 and 3
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Statistics and shifting
# ---------------------------------------------------------------------------
print(df2)

# numeric_only=True is required on pandas >= 2.0, where taking the mean of the
# string columns raises instead of silently skipping them.
print(df2.mean(numeric_only=True))
# tradeAmt    101.0
# dtype: float64

# shift(2) moves the values down two rows; the vacated leading rows become NaN
# and the last two values (6 and 8) fall off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=range(6)).shift(2)
print(s)
# 0    NaN
# 1    NaN
# 2    1.0
# 3    3.0
# 4    5.0
# 5    NaN
# dtype: float64

print(df2.shift(2))  # same for a DataFrame: rows 0-1 are empty, rows 2-5 hold old rows 0-3
print(SEPARATOR)

# ---------------------------------------------------------------------------
# apply / value_counts / string methods
# ---------------------------------------------------------------------------
# apply calls the function once per column (a lambda or a named function both
# work).  The builtin max is wrapped in a lambda so that it is applied to the
# column's values as-is.
print(df2.apply(lambda col: max(col)))
# accountID            6214C000403
# custID                     C0004
# mark                        row6
# tradeAmt                   103.0
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
print(SEPARATOR)

# Frequency of each value, similar to SQL "GROUP BY ... COUNT(*)":
# C0004 occurs 3 times, every other customer once.
print(df2["custID"].value_counts())
print(SEPARATOR)

# Vectorised string methods via .str -> ROW1 ... ROW6.
print(df2["mark"].str.upper())
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Concatenating
# ---------------------------------------------------------------------------
df5 = pd.DataFrame(np.random.randn(9, 3))  # random data, differs on every run
print(df5)

# Cut out a head, a middle and a tail piece, then glue them back together.
pieces = [df5[:2], df5[5:6], df5[7:]]
print(pieces)             # a list of three DataFrames, each printed with its own header
print(pd.concat(pieces))  # one DataFrame with rows 0, 1, 5, 7, 8
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Merging (SQL-style joins)
# ---------------------------------------------------------------------------
df_left = pd.DataFrame({'key': ['001', '002', '007'],
                        'val': ['999', '1', '2']})
df_right = pd.DataFrame({'key': ['001', '002', '009'],
                         'key2': ['001', '002', '009'],
                         'val': ['999', '3', '4']})
print(df_left)
print(df_right)

# Inner join on one shared column; the clashing 'val' columns get _x/_y suffixes.
print(pd.merge(df_left, df_right, how='inner', on='key'))
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3

# Inner join on differently named columns.
print(pd.merge(df_left, df_right, how='inner', left_on='key', right_on='key2'))
#   key_x val_x key_y key2 val_y
# 0   001   999   001  001   999
# 1   002     1   002  002     3

# Inner join on several columns: only ('001', '999') matches on both.
print(pd.merge(df_left, df_right, how='inner', on=['key', 'val']))
#    key  val key2
# 0  001  999  001

# Left outer join: key 007 has no match, so key2/val_y are NaN.
print(pd.merge(df_left, df_right, how='left', on='key'))

# Right outer join: key 009 has no match, so val_x is NaN.
print(pd.merge(df_left, df_right, how='right', on='key'))
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Appending rows
#   DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ---------------------------------------------------------------------------
# Append the first three rows to the table and renumber the index 0..8.
print(pd.concat([df2, df2[:3]], ignore_index=True))

# Keep the original labels instead: the result's index is 0..5 followed by
# 0..2 again -- index labels are allowed to repeat.
print(pd.concat([df2, df2[:3]], ignore_index=False))
print(SEPARATOR)

# ---------------------------------------------------------------------------
# MultiIndex, stack and unstack
# ---------------------------------------------------------------------------
# zip pairs up the elements of the two lists into (first, second) tuples.
tuples = list(zip(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])  # two-level row index
df6 = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df6)

# stack turns the column labels into an extra, innermost row-index level, so
# the table becomes a Series indexed by (first, second, column).
stacked = df6.stack()
print(stacked)

# The stacked form can be addressed level by level, like nested arrays.
print(stacked["bar"]["one"]["A"])

# unstack reverses stack: the innermost row level becomes the columns again.
unstacked = stacked.unstack()
print(unstacked)

# unstack can also move any other row level into the columns.
unstacked_unstacked_0 = unstacked.unstack(0)  # level 'first' becomes a column level
print(unstacked_unstacked_0)

unstacked_unstacked_1 = unstacked.unstack(1)  # level 'second' becomes a column level
print(unstacked_unstacked_1)
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Pivot tables
# ---------------------------------------------------------------------------
df7 = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                    'B': ['A', 'B', 'C'] * 4,
                    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                    'D': np.random.randn(12),
                    'E': np.random.randn(12)})
print(df7)

# Rows are keyed by (A, B), columns by the values of C, cells hold D.
# Combinations that do not occur in df7 show up as NaN.
print(pd.pivot_table(df7, values='D', index=['A', 'B'], columns=['C']))
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Time series
# ---------------------------------------------------------------------------
rng = pd.date_range('1/1/2012', periods=10, freq='min')  # a DatetimeIndex, one entry per minute
print(rng)

ts = pd.Series(range(10), index=rng)  # values 0..9 at 00:00 .. 00:09
print(ts)

# resample regroups a time series into new time buckets.
print(ts.resample('5min').sum())
# 2012-01-01 00:00:00    10
# 2012-01-01 00:05:00    35

# tz_localize attaches a time zone to the naive timestamps (no clock shift).
ts_utc = ts.tz_localize('UTC')
print(ts_utc)  # 2012-01-01 00:00:00+00:00 ...

# tz_convert expresses the same instants in another time zone.
print(ts_utc.tz_convert('US/Eastern'))  # 2011-12-31 19:00:00-05:00 ...

# to_period shows each timestamp as a period of the index's frequency.
print(ts.to_period())  # 2012-01-01 00:00 ...

# to_timestamp converts the periods back into full timestamps.
print(ts.to_period().to_timestamp())  # 2012-01-01 00:00:00 ...
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Categoricals
# ---------------------------------------------------------------------------
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")  # new column holding category labels
print(df["grade"])  # values a, b, b, a, a, e with categories a, b, e

# Rename the existing categories a, b, e in order.  Assigning to
# `.cat.categories` was removed in pandas 2.0; rename_categories replaces it.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])

# Redefine the full category set and its order; unused categories are allowed.
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"])
print(df["grade"])
# 0    very good
# 1         good
# 2         good
# 3    very good
# 4    very good
# 5     very bad

# Count per category.  observed=False keeps the empty categories in the
# result regardless of the pandas version's default.
print(df.groupby("grade", observed=False).size())
# very bad     1
# bad          0
# medium       0
# good         2
# very good    3
print(SEPARATOR)

# ---------------------------------------------------------------------------
# Plotting
# ---------------------------------------------------------------------------
# 1000 daily random values, accumulated into a running total.
ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
print(ts)
ts.plot()   # some environments render the figure at this point already
plt.show()  # others need an explicit show(); one curve, x = date, y = running total

# Four columns sharing the same date index, each accumulated separately.
df = pd.DataFrame(np.random.randn(1000, 4),
                  index=ts.index, columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
df.plot()
plt.show()  # four curves, x = date, y = running total of each column

Python练手，pandas

标签：大小 tar 模块类型 pivot int tab ica cat

原文地址：http://blog.51cto.com/hadoooo/2062678

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行