标签:scope   Fix   条件   out   list   json   超过   array   end   
9.2 Pandas-数据结构
- 一维数据:序列(Series)
- 二维数据:数据框(DataFrame)
- 三维数据:面板(MultiIndex/Panel(后面版本可能放弃))
从数据结构角度,一般实现“增删改查”操作,官方接口提供了如下操作:
9.2.1 Series
接口文档
pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
1. 创建
# 通过列表
import numpy as np
import pandas as pd
s1 = pd.Series([1,3,‘555‘,np.nan,‘6.66‘,8.8],index=list(‘abcdef‘),name=‘value‘)
s1
a       1
b       3
c     555
d     NaN
e    6.66
f     8.8
Name: value, dtype: object
# 通过字典
import numpy as np
import pandas as pd
d = {‘b‘: 1, ‘a‘: 0, ‘c‘: 2}
s2 = pd.Series(d,name=‘value‘)
s2
b    1
a    0
c    2
Name: value, dtype: int64
2. 查找
获取元素
s1.get(‘c‘)
‘555‘
s1[[‘a‘,‘c‘,‘d‘]]
a      1
c    555
d    NaN
Name: value, dtype: object
s1[[1,2,4]]
b       3
c     555
e    6.66
Name: value, dtype: object
索引、列名、值
# 索引
s1.index
Index([‘a‘, ‘b‘, ‘c‘, ‘d‘, ‘e‘, ‘f‘], dtype=‘object‘)
# 列名
s1.name
‘value‘
# 值
type(s1.values) #返回ndarray类型
type(s1.items())#返回tuples类型
zip
条件查询
# 查找空值数据
s1[s1.isna()]
d    NaN
Name: value, dtype: object
# 条件查找
d = {‘b‘: 1, ‘a‘: 0, ‘c‘: 2}
s2 = pd.Series(d,name=‘value‘)
s2[s2.values>0]
b    1
c    2
Name: value, dtype: int64
切片
# 切片
s1[‘b‘:‘e‘]
b       3
c     555
d     NaN
e    6.66
Name: value, dtype: object
# 切片-前5行
s1.head()
a       1
b       3
c     555
d     NaN
e    6.66
Name: value, dtype: object
# 切片-后3行
s1.tail(3)
d     NaN
e    6.66
f     8.8
Name: value, dtype: object
3. 修改
排序
# 索引排序
s2.sort_index()
a    0
b    1
c    2
Name: value, dtype: int64
# 值排序,要求类型相同
s2.sort_values()
a    0
b    1
c    2
Name: value, dtype: int64
运算
# 算术运算
s1*2
a           2
b           6
c      555555
d         NaN
e    6.666.66
f        17.6
Name: value, dtype: object
# 统计运算
s2.sum()
3
类型转换、输出
# 类型转换
s1 = s1.astype("float64")
# 导出到csv
s1.to_csv(".\data\\666.csv")
# 导出到json
s1.to_json(".\data\\666.json")
7.2.2 数据框(dataframe)
接口链接
DataFrame([data, index, columns, dtype, copy])
1. 创建
# 通过Series
import numpy as np
import pandas as pd
d = {‘col1‘: pd.Series([1., 2., 3.], index=[‘a‘, ‘b‘, ‘c‘]),     ‘col2‘: pd.Series([1., 2., 3., 4.], index=list(‘abcd‘))}
df1 = pd.DataFrame(d)
df1
  
    
      |  | col1 | col2 | 
  
  
    
      | a | 1.0 | 1.0 | 
    
      | b | 2.0 | 2.0 | 
    
      | c | 3.0 | 3.0 | 
    
      | d | NaN | 4.0 | 
  
 
# 通过列表
import numpy as np
import pandas as pd
dates = pd.date_range(‘20200801‘,periods=5)
df2 = pd.DataFrame(np.random.randn(5,3),index=dates,columns=list(‘ABC‘))
df2
  
    
      |  | A | B | C | 
  
  
    
      | 2020-08-01 | 0.781995 | 1.349165 | -2.274934 | 
    
      | 2020-08-02 | -1.671108 | 1.352948 | -0.700606 | 
    
      | 2020-08-03 | -0.693292 | 1.014148 | 0.599076 | 
    
      | 2020-08-04 | 0.264373 | 0.620617 | -0.235280 | 
    
      | 2020-08-05 | 0.109606 | 0.452433 | 0.615102 | 
  
 
# 通过字典
import numpy as np
import pandas as pd
df3 = pd.DataFrame({‘A‘:1.,
                   ‘B‘:pd.Timestamp(‘20200827‘),
                   ‘C‘:pd.Series(1,index=list(range(4)),dtype=‘float64‘),
                   ‘D‘:np.array([3]*4,dtype=‘int64‘),
                   ‘E‘:pd.Categorical([‘test‘,‘train‘,‘test‘,‘train‘]),
                   ‘F‘:‘foo‘})
df3
  
    
      |  | A | B | C | D | E | F | 
  
  
    
      | 0 | 1.0 | 2020-08-27 | 1.0 | 3 | test | foo | 
    
      | 1 | 1.0 | 2020-08-27 | 1.0 | 3 | train | foo | 
    
      | 2 | 1.0 | 2020-08-27 | 1.0 | 3 | test | foo | 
    
      | 3 | 1.0 | 2020-08-27 | 1.0 | 3 | train | foo | 
  
 
2. 属性
# index
df3.index
Int64Index([0, 1, 2, 3], dtype=‘int64‘)
# columns
df3.columns
Index([‘A‘, ‘B‘, ‘C‘, ‘D‘, ‘E‘, ‘F‘], dtype=‘object‘)
# type
df3.dtypes
A           float64
B    datetime64[ns]
C           float64
D             int64
E          category
F            object
dtype: object
# values
df3.values
array([[1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘test‘, ‘foo‘],
       [1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘train‘, ‘foo‘],
       [1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘test‘, ‘foo‘],
       [1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘train‘, ‘foo‘]],
      dtype=object)
# 转置
df1.T
  
    
      |  | a | b | c | d | 
  
  
    
      | col1 | 1.0 | 2.0 | 3.0 | NaN | 
    
      | col2 | 1.0 | 2.0 | 3.0 | 4.0 | 
  
 
# 统计量
df1.describe()
  
    
      |  | col1 | col2 | 
  
  
    
      | count | 3.0 | 4.000000 | 
    
      | mean | 2.0 | 2.500000 | 
    
      | std | 1.0 | 1.290994 | 
    
      | min | 1.0 | 1.000000 | 
    
      | 25% | 1.5 | 1.750000 | 
    
      | 50% | 2.0 | 2.500000 | 
    
      | 75% | 2.5 | 3.250000 | 
    
      | max | 3.0 | 4.000000 | 
  
 
3. 查询与赋值
获取单元
# 直接索引(先列后行)--不推荐
df1[‘col2‘][‘b‘]
2.0
# select by label
# 标签索引(先行后列)
df1.loc[‘b‘,‘col2‘]
2.0
# select by position
# 位置索引
df1.iloc[1,1]
2.0
获取行
df1.loc[‘c‘:]
  
    
      |  | col1 | col2 | 
  
  
    
      | c | 3.0 | 3.0 | 
    
      | d | NaN | 4.0 | 
  
 
df1.iloc[2:]
  
    
      |  | col1 | col2 | 
  
  
    
      | c | 3.0 | 3.0 | 
    
      | d | NaN | 4.0 | 
  
 
获取列
df1[‘col2‘]
a    1.0
b    2.0
c    3.0
d    4.0
Name: col2, dtype: float64
df1.col2
a    1.0
b    2.0
c    3.0
d    4.0
Name: col2, dtype: float64
df1.loc[:,‘col2‘]
a    1.0
b    2.0
c    3.0
d    4.0
Name: col2, dtype: float64
df1.iloc[:,1]
a    1.0
b    2.0
c    3.0
d    4.0
Name: col2, dtype: float64
条件查询
# 条件查询--不推荐
df2[df2[‘B‘].values>0]
  
    
      |  | A | B | C | 
  
  
    
      | 2020-08-01 | 0.781995 | 1.349165 | -2.274934 | 
    
      | 2020-08-02 | -1.671108 | 1.352948 | -0.700606 | 
    
      | 2020-08-03 | -0.693292 | 1.014148 | 0.599076 | 
    
      | 2020-08-04 | 0.264373 | 0.620617 | -0.235280 | 
    
      | 2020-08-05 | 0.109606 | 0.452433 | 0.615102 | 
  
 
# 条件查询--推荐
df2.query("B > 0")
  
    
      |  | A | B | C | 
  
  
    
      | 2020-08-01 | 0.781995 | 1.349165 | -2.274934 | 
    
      | 2020-08-02 | -1.671108 | 1.352948 | -0.700606 | 
    
      | 2020-08-03 | -0.693292 | 1.014148 | 0.599076 | 
    
      | 2020-08-04 | 0.264373 | 0.620617 | -0.235280 | 
    
      | 2020-08-05 | 0.109606 | 0.452433 | 0.615102 | 
  
 
# 条件查询
df3[‘E‘].isin([‘test‘])
0     True
1    False
2     True
3    False
Name: E, dtype: bool
赋值
# 单元赋值
df1.loc[‘d‘,‘col1‘]=666
df1
  
    
      |  | col1 | col2 | 
  
  
    
      | a | 1.0 | 1.0 | 
    
      | b | 2.0 | 2.0 | 
    
      | c | 3.0 | 3.0 | 
    
      | d | 666.0 | 4.0 | 
  
 
# 列赋值
df1.col1=2
df1
  
    
      |  | col1 | col2 | 
  
  
    
      | a | 2 | 1.0 | 
    
      | b | 2 | 2.0 | 
    
      | c | 2 | 3.0 | 
    
      | d | 2 | 4.0 | 
  
 
# 行赋值
df1.loc[‘d‘]=888
df1
  
    
      |  | col1 | col2 | 
  
  
    
      | a | 2 | 1.0 | 
    
      | b | 2 | 2.0 | 
    
      | c | 2 | 3.0 | 
    
      | d | 888 | 888.0 | 
  
 
df1.loc[‘c‘:,‘col2‘]=444
df1
  
    
      |  | col1 | col2 | 
  
  
    
      | a | 2 | 1.0 | 
    
      | b | 2 | 2.0 | 
    
      | c | 2 | 444.0 | 
    
      | d | 888 | 444.0 | 
  
 
4. 操作
排序
# 值排序
df2=df2.sort_values(‘B‘,ascending=False) #降序
df2
  
    
      |  | A | B | C | 
  
  
    
      | 2020-08-02 | -1.671108 | 1.352948 | -0.700606 | 
    
      | 2020-08-01 | 0.781995 | 1.349165 | -2.274934 | 
    
      | 2020-08-03 | -0.693292 | 1.014148 | 0.599076 | 
    
      | 2020-08-04 | 0.264373 | 0.620617 | -0.235280 | 
    
      | 2020-08-05 | 0.109606 | 0.452433 | 0.615102 | 
  
 
# 索引排序
df2=df2.sort_index()
df2
  
    
      |  | A | B | C | 
  
  
    
      | 2020-08-01 | 0.781995 | 1.349165 | -2.274934 | 
    
      | 2020-08-02 | -1.671108 | 1.352948 | -0.700606 | 
    
      | 2020-08-03 | -0.693292 | 1.014148 | 0.599076 | 
    
      | 2020-08-04 | 0.264373 | 0.620617 | -0.235280 | 
    
      | 2020-08-05 | 0.109606 | 0.452433 | 0.615102 | 
  
 
7.3 Pandas-数据处理
7.3.1 缺失值处理
- 查询缺失值 df.isnull().any()
- 移除缺失值 df.dropna(axis=0,how=‘all‘) # how={‘any‘,‘all‘}
- 替换缺失值 df.fillna(inplace=True)
- 替换标记缺失值(非NaN) df.repalce(to_repalce=,value=)
# 通过字典
import numpy as np
import pandas as pd
df3 = pd.DataFrame({‘A‘:1.,
                   ‘B‘:pd.Timestamp(‘20200827‘),
                   ‘C‘:pd.Series(1,index=list(range(4)),dtype=‘float64‘),
                   ‘D‘:np.array(range(1,5),dtype=‘int64‘),
                   ‘E‘:pd.Categorical([‘test‘,‘train‘,np.nan,‘?‘]),
                   ‘F‘:[np.nan,np.nan,np.nan,np.nan]})
df3
  
    
      |  | A | B | C | D | E | F | 
  
  
    
      | 0 | 1.0 | 2020-08-27 | 1.0 | 1 | test | NaN | 
    
      | 1 | 1.0 | 2020-08-27 | 1.0 | 2 | train | NaN | 
    
      | 2 | 1.0 | 2020-08-27 | 1.0 | 3 | NaN | NaN | 
    
      | 3 | 1.0 | 2020-08-27 | 1.0 | 4 | ? | NaN | 
  
 
# 某列中含有NaN,则返回True
df3.isnull().any()
A    False
B    False
C    False
D    False
E     True
F     True
dtype: bool
# 某列中全部数据为NaN,则返回True
df3.isnull().all()
A    False
B    False
C    False
D    False
E    False
F     True
dtype: bool
# numpy查询整个dataframe
np.any(pd.isnull(df3))
True
data1=df3.dropna(axis=1,how=‘all‘) #删除整列为NaN的数据
data1
  
    
      |  | A | B | C | D | E | 
  
  
    
      | 0 | 1.0 | 2020-08-27 | 1.0 | 1 | test | 
    
      | 1 | 1.0 | 2020-08-27 | 1.0 | 2 | train | 
    
      | 2 | 1.0 | 2020-08-27 | 1.0 | 3 | NaN | 
    
      | 3 | 1.0 | 2020-08-27 | 1.0 | 4 | ? | 
  
 
df3.F.fillna(df3.D.mean(),inplace=True) # inplace表示在原有表修改
df3
  
    
      |  | A | B | C | D | E | F | 
  
  
    
      | 0 | 1.0 | 2020-08-27 | 1.0 | 1 | test | 2.5 | 
    
      | 1 | 1.0 | 2020-08-27 | 1.0 | 2 | train | 2.5 | 
    
      | 2 | 1.0 | 2020-08-27 | 1.0 | 3 | NaN | 2.5 | 
    
      | 3 | 1.0 | 2020-08-27 | 1.0 | 4 | ? | 2.5 | 
  
 
# 替换标记缺失值(非NaN)
data2=df3.replace(to_replace=‘?‘,value=df3.D.mean())
data2
  
    
      |  | A | B | C | D | E | F | 
  
  
    
      | 0 | 1.0 | 2020-08-27 | 1.0 | 1 | test | 2.5 | 
    
      | 1 | 1.0 | 2020-08-27 | 1.0 | 2 | train | 2.5 | 
    
      | 2 | 1.0 | 2020-08-27 | 1.0 | 3 | NaN | 2.5 | 
    
      | 3 | 1.0 | 2020-08-27 | 1.0 | 4 | 2.5 | 2.5 | 
  
 
7.3.2 离散化
- 分组
- sr = pd.qcut(data,bins) #自动分组
- sr = pd.cut(data,[区间]) #手动分组
 
- 将分组好的结果换成one-hot编码
- pd.get_dummies(sr,prefix=前缀标记)
 
# 1 创建数据
data = pd.Series([165,174,160,180,159,163,192,184],index=list(range(1,9)))
# 2 手动分组
bins = [150,165,180,195]
sr = pd.cut(data,bins)
print(sr.value_counts())
# 3 noe-hots编码
pd.get_dummies(sr,prefix=‘身高_‘)
(150, 165]    4
(180, 195]    2
(165, 180]    2
dtype: int64
  
    
      |  | 身高__(150, 165] | 身高__(165, 180] | 身高__(180, 195] | 
  
  
    
      | 1 | 1 | 0 | 0 | 
    
      | 2 | 0 | 1 | 0 | 
    
      | 3 | 1 | 0 | 0 | 
    
      | 4 | 0 | 1 | 0 | 
    
      | 5 | 1 | 0 | 0 | 
    
      | 6 | 1 | 0 | 0 | 
    
      | 7 | 0 | 0 | 1 | 
    
      | 8 | 0 | 0 | 1 | 
  
 
7.3.3 数据合并
1. 拼接concat
# 创建数据
df1 = pd.DataFrame(np.ones((3,4))*0,columns=list(‘abcd‘))
df2 = pd.DataFrame(np.ones((3,4))*1,columns=list(‘bcde‘))
df3 = pd.DataFrame(np.ones((3,4))*2,columns=list(‘abcd‘))
print(df1)
print(df2)
print(df3)
# 拼接
res1 = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # 纵向合并(axis=0),索引重新排序,缺失数据补NaN
res1
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
     b    c    d    e
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
  
    
      |  | a | b | c | d | e | 
  
  
    
      | 0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 
    
      | 1 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 
    
      | 2 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 
    
      | 3 | NaN | 1.0 | 1.0 | 1.0 | 1.0 | 
    
      | 4 | NaN | 1.0 | 1.0 | 1.0 | 1.0 | 
    
      | 5 | NaN | 1.0 | 1.0 | 1.0 | 1.0 | 
    
      | 6 | 2.0 | 2.0 | 2.0 | 2.0 | NaN | 
    
      | 7 | 2.0 | 2.0 | 2.0 | 2.0 | NaN | 
    
      | 8 | 2.0 | 2.0 | 2.0 | 2.0 | NaN | 
  
 
‘‘‘
join参数:默认outer
    outer:缺失数据补NaN
    inner:删除缺失数据列
‘‘‘
res2=pd.concat([df1,df2],join=‘inner‘,ignore_index=True)
res2
  
    
      |  | b | c | d | 
  
  
    
      | 0 | 0.0 | 0.0 | 0.0 | 
    
      | 1 | 0.0 | 0.0 | 0.0 | 
    
      | 2 | 0.0 | 0.0 | 0.0 | 
    
      | 3 | 1.0 | 1.0 | 1.0 | 
    
      | 4 | 1.0 | 1.0 | 1.0 | 
    
      | 5 | 1.0 | 1.0 | 1.0 | 
  
 
2.添加append
# 添加dataframe
res3 = df1.append(df2,ignore_index=True)
res3
  
    
      |  | a | b | c | d | e | 
  
  
    
      | 0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 
    
      | 1 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 
    
      | 2 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 
    
      | 3 | NaN | 1.0 | 1.0 | 1.0 | 1.0 | 
    
      | 4 | NaN | 1.0 | 1.0 | 1.0 | 1.0 | 
    
      | 5 | NaN | 1.0 | 1.0 | 1.0 | 1.0 | 
  
 
# 添加行
sr = pd.Series([1,2,3,4],index=list(‘abcd‘))
res4 = df1.append(sr,ignore_index=True)
res4
  
    
      |  | a | b | c | d | 
  
  
    
      | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 
    
      | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 
    
      | 2 | 0.0 | 0.0 | 0.0 | 0.0 | 
    
      | 3 | 1.0 | 2.0 | 3.0 | 4.0 | 
  
 
3. 融合Merge
#依据一组key合并
left = pd.DataFrame({‘key‘: [‘K0‘, ‘K1‘, ‘K2‘, ‘K3‘],
                             ‘A‘: [‘A0‘, ‘A1‘, ‘A2‘, ‘A3‘],
                             ‘B‘: [‘B0‘, ‘B1‘, ‘B2‘, ‘B3‘]})
right = pd.DataFrame({‘key‘: [‘K0‘, ‘K1‘, ‘K2‘, ‘K3‘],
                              ‘C‘: [‘C0‘, ‘C1‘, ‘C2‘, ‘C3‘],
                              ‘D‘: [‘D0‘, ‘D1‘, ‘D2‘, ‘D3‘]})
res = pd.merge(left, right, on=‘key‘)
print(left)
print(right)
res
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
  
    
      |  | key | A | B | C | D | 
  
  
    
      | 0 | K0 | A0 | B0 | C0 | D0 | 
    
      | 1 | K1 | A1 | B1 | C1 | D1 | 
    
      | 2 | K2 | A2 | B2 | C2 | D2 | 
    
      | 3 | K3 | A3 | B3 | C3 | D3 | 
  
 
# 依据两组key合并
left = pd.DataFrame({‘key1‘: [‘K0‘, ‘K0‘, ‘K1‘, ‘K2‘],
                      ‘key2‘: [‘K0‘, ‘K1‘, ‘K0‘, ‘K1‘],
                      ‘A‘: [‘A0‘, ‘A1‘, ‘A2‘, ‘A3‘],
                      ‘B‘: [‘B0‘, ‘B1‘, ‘B2‘, ‘B3‘]})
right = pd.DataFrame({‘key1‘: [‘K0‘, ‘K1‘, ‘K1‘, ‘K2‘],
                       ‘key2‘: [‘K0‘, ‘K0‘, ‘K0‘, ‘K0‘],
                       ‘C‘: [‘C0‘, ‘C1‘, ‘C2‘, ‘C3‘],
                       ‘D‘: [‘D0‘, ‘D1‘, ‘D2‘, ‘D3‘]})
print(‘left表:‘)
print(left)
print(‘\nright表:‘)
print(right)
#依据key1与key2 columns进行合并,并打印出四种结果[‘left‘, ‘right‘, ‘outer‘, ‘inner‘]
res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘inner‘) #只合并两张表key都有的数据
print(‘\ninner方式:‘)
print(res)
res = res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘outer‘)#没有的数据补NaN
print(‘\nouter方式:‘)
print(res)
res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘left‘) #以左表为基准,右表匹配
print(‘\nleft方式:‘)
print(res)
res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘right‘) #以右表为基准,左表匹配
print(‘\nright方式:‘)
print(res)
left表:
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
right表:
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
inner方式:
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
outer方式:
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3
left方式:
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
right方式:
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3
#依据index合并
left = pd.DataFrame({‘A‘: [‘A0‘, ‘A1‘, ‘A2‘],
                     ‘B‘: [‘B0‘, ‘B1‘, ‘B2‘]},
                     index=[‘K0‘, ‘K1‘, ‘K2‘])
right = pd.DataFrame({‘C‘: [‘C0‘, ‘C2‘, ‘C3‘],
                      ‘D‘: [‘D0‘, ‘D2‘, ‘D3‘]},
                     index=[‘K0‘, ‘K2‘, ‘K3‘])
print(left)
print(right)
# outer方式
res = pd.merge(left, right, left_index=True, right_index=True, how=‘outer‘)
print(res)
# inner方式
res = pd.merge(left, right, left_index=True, right_index=True, how=‘inner‘)
res
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
  
    
      |  | A | B | C | D | 
  
  
    
      | K0 | A0 | B0 | C0 | D0 | 
    
      | K2 | A2 | B2 | C2 | D2 | 
  
 
7.3.4 透视表(pivot table)
df = pd.DataFrame({‘A‘: [‘one‘, ‘one‘, ‘two‘, ‘three‘] * 3,
                   ‘B‘: [‘A‘, ‘B‘, ‘C‘] * 4,
                   ‘C‘: [‘foo‘, ‘foo‘, ‘foo‘, ‘bar‘, ‘bar‘, ‘bar‘] * 2,
                   ‘D‘: np.random.randn(12),
                   ‘E‘: np.random.randn(12)})
print(df)
pivot=pd.pivot_table(df,values=‘D‘,index=[‘A‘,‘B‘],columns=‘C‘)
print(pivot.columns)
pivot
        A  B    C         D         E
0     one  A  foo  2.847002 -0.341067
1     one  B  foo -0.764842  1.078190
2     two  C  foo  0.002059  0.414781
3   three  A  bar -0.174984  0.084828
4     one  B  bar -2.018801 -1.122346
5     one  C  bar  1.576535  0.551934
6     two  A  foo -0.427333 -0.990089
7   three  B  foo -0.907410 -0.541668
8     one  C  foo -0.988257  2.493991
9     one  A  bar -0.560151 -1.124036
10    two  B  bar  1.333048 -0.620632
11  three  C  bar  0.735043 -0.102446
Index([‘bar‘, ‘foo‘], dtype=‘object‘, name=‘C‘)
  
    
      |  | C | bar | foo | 
    
      | A | B |  |  | 
  
  
    
      | one | A | -0.560151 | 2.847002 | 
    
      | B | -2.018801 | -0.764842 | 
    
      | C | 1.576535 | -0.988257 | 
    
      | three | A | -0.174984 | NaN | 
    
      | B | NaN | -0.907410 | 
    
      | C | 0.735043 | NaN | 
    
      | two | A | NaN | -0.427333 | 
    
      | B | 1.333048 | NaN | 
    
      | C | NaN | 0.002059 | 
  
 
bill = pd.read_csv(‘./data/bill.csv‘,encoding=‘gb2312‘)
bill.时间 = pd.to_datetime(bill.时间).dt.normalize() #去除时间保留日期
# 按‘分类’分组
pivot = pd.pivot_table(bill,index=‘分类‘,columns=‘时间‘,values=‘支出‘,aggfunc=‘sum‘).reset_index()
# 查询单天支出超过1笔的日期
pivot.loc[:,pivot.count(axis=0)>1]
  
    
      | 时间 | 分类 | 2020-03-13 00:00:00 | 2020-06-17 00:00:00 | 2020-07-12 00:00:00 | 2020-08-20 00:00:00 | 
  
  
    
      | 0 | App购买 | NaN | 74.0 | NaN | NaN | 
    
      | 1 | 交通 | NaN | NaN | NaN | NaN | 
    
      | 2 | 其它 | NaN | NaN | NaN | NaN | 
    
      | 3 | 医疗 | NaN | NaN | NaN | 280.0 | 
    
      | 4 | 发红包 | NaN | NaN | NaN | NaN | 
    
      | 5 | 学习 | NaN | NaN | NaN | 3900.0 | 
    
      | 6 | 就诊 | NaN | NaN | NaN | NaN | 
    
      | 7 | 旅行 | NaN | NaN | 399.0 | NaN | 
    
      | 8 | 电器 | NaN | NaN | NaN | NaN | 
    
      | 9 | 电子产品 | NaN | NaN | NaN | NaN | 
    
      | 10 | 租金 | NaN | NaN | NaN | 1000.0 | 
    
      | 11 | 衣服 | 158.0 | 79.0 | NaN | NaN | 
    
      | 12 | 话费网费 | 400.0 | NaN | 114.0 | NaN | 
    
      | 13 | 请客送礼 | NaN | NaN | NaN | NaN | 
  
 
Python数据处理-v1.0
标签:scope   Fix   条件   out   list   json   超过   array   end   
原文地址:https://www.cnblogs.com/liuwenzhen/p/13589395.html