标签:scope Fix 条件 out list json 超过 array end
7.2 Pandas-数据结构
- 一维数据:序列(Series)
- 二维数据:数据框(DataFrame)
- 三维数据:面板(MultiIndex/Panel(Panel 已在 pandas 0.25 中移除,请使用 MultiIndex))
从数据结构角度,一般实现“增删改查”操作,官方接口提供了如下操作:
7.2.1 Series
接口文档
pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
1. 创建
# 通过列表
import numpy as np
import pandas as pd
s1 = pd.Series([1,3,'555',np.nan,'6.66',8.8],index=list('abcdef'),name='value')
s1
a 1
b 3
c 555
d NaN
e 6.66
f 8.8
Name: value, dtype: object
# 通过字典
import numpy as np
import pandas as pd
d = {‘b‘: 1, ‘a‘: 0, ‘c‘: 2}
s2 = pd.Series(d,name=‘value‘)
s2
b 1
a 0
c 2
Name: value, dtype: int64
2. 查找
获取元素
s1.get(‘c‘)
‘555‘
s1[[‘a‘,‘c‘,‘d‘]]
a 1
c 555
d NaN
Name: value, dtype: object
s1[[1,2,4]]
b 3
c 555
e 6.66
Name: value, dtype: object
索引、列名、值
# 索引
s1.index
Index([‘a‘, ‘b‘, ‘c‘, ‘d‘, ‘e‘, ‘f‘], dtype=‘object‘)
# 列名
s1.name
‘value‘
# 值
type(s1.values) #返回ndarray类型
type(s1.items())#返回zip对象(迭代时产生(index, value)元组)
zip
条件查询
# 查找空值数据
s1[s1.isna()]
d NaN
Name: value, dtype: object
# 条件查找
d = {‘b‘: 1, ‘a‘: 0, ‘c‘: 2}
s2 = pd.Series(d,name=‘value‘)
s2[s2.values>0]
b 1
c 2
Name: value, dtype: int64
切片
# 切片
s1[‘b‘:‘e‘]
b 3
c 555
d NaN
e 6.66
Name: value, dtype: object
# 切片-前5行
s1.head()
a 1
b 3
c 555
d NaN
e 6.66
Name: value, dtype: object
# 切片-后3行
s1.tail(3)
d NaN
e 6.66
f 8.8
Name: value, dtype: object
3. 修改
排序
# 索引排序
s2.sort_index()
a 0
b 1
c 2
Name: value, dtype: int64
# 值排序,要求类型相同
s2.sort_values()
a 0
b 1
c 2
Name: value, dtype: int64
运算
# 算术运算
s1*2
a 2
b 6
c 555555
d NaN
e 6.666.66
f 17.6
Name: value, dtype: object
# 统计运算
s2.sum()
3
类型转换、输出
# 类型转换
s1 = s1.astype("float64")
# 导出到csv
s1.to_csv(".\data\\666.csv")
# 导出到json
s1.to_json(".\data\\666.json")
7.2.2 数据框(DataFrame)
接口链接
DataFrame([data, index, columns, dtype, copy])
1. 创建
# 通过Series
import numpy as np
import pandas as pd
d = {‘col1‘: pd.Series([1., 2., 3.], index=[‘a‘, ‘b‘, ‘c‘]), ‘col2‘: pd.Series([1., 2., 3., 4.], index=list(‘abcd‘))}
df1 = pd.DataFrame(d)
df1
|
col1 |
col2 |
a |
1.0 |
1.0 |
b |
2.0 |
2.0 |
c |
3.0 |
3.0 |
d |
NaN |
4.0 |
# 通过列表
import numpy as np
import pandas as pd
dates = pd.date_range(‘20200801‘,periods=5)
df2 = pd.DataFrame(np.random.randn(5,3),index=dates,columns=list(‘ABC‘))
df2
|
A |
B |
C |
2020-08-01 |
0.781995 |
1.349165 |
-2.274934 |
2020-08-02 |
-1.671108 |
1.352948 |
-0.700606 |
2020-08-03 |
-0.693292 |
1.014148 |
0.599076 |
2020-08-04 |
0.264373 |
0.620617 |
-0.235280 |
2020-08-05 |
0.109606 |
0.452433 |
0.615102 |
# 通过字典
import numpy as np
import pandas as pd
df3 = pd.DataFrame({‘A‘:1.,
‘B‘:pd.Timestamp(‘20200827‘),
‘C‘:pd.Series(1,index=list(range(4)),dtype=‘float64‘),
‘D‘:np.array([3]*4,dtype=‘int64‘),
‘E‘:pd.Categorical([‘test‘,‘train‘,‘test‘,‘train‘]),
‘F‘:‘foo‘})
df3
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2020-08-27 |
1.0 |
3 |
test |
foo |
1 |
1.0 |
2020-08-27 |
1.0 |
3 |
train |
foo |
2 |
1.0 |
2020-08-27 |
1.0 |
3 |
test |
foo |
3 |
1.0 |
2020-08-27 |
1.0 |
3 |
train |
foo |
2. 属性
# index
df3.index
Int64Index([0, 1, 2, 3], dtype=‘int64‘)
# columns
df3.columns
Index([‘A‘, ‘B‘, ‘C‘, ‘D‘, ‘E‘, ‘F‘], dtype=‘object‘)
# type
df3.dtypes
A float64
B datetime64[ns]
C float64
D int64
E category
F object
dtype: object
# values
df3.values
array([[1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘test‘, ‘foo‘],
[1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘train‘, ‘foo‘],
[1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘test‘, ‘foo‘],
[1.0, Timestamp(‘2020-08-27 00:00:00‘), 1.0, 3, ‘train‘, ‘foo‘]],
dtype=object)
# 转置
df1.T
|
a |
b |
c |
d |
col1 |
1.0 |
2.0 |
3.0 |
NaN |
col2 |
1.0 |
2.0 |
3.0 |
4.0 |
# 统计量
df1.describe()
|
col1 |
col2 |
count |
3.0 |
4.000000 |
mean |
2.0 |
2.500000 |
std |
1.0 |
1.290994 |
min |
1.0 |
1.000000 |
25% |
1.5 |
1.750000 |
50% |
2.0 |
2.500000 |
75% |
2.5 |
3.250000 |
max |
3.0 |
4.000000 |
3. 查询与赋值
获取单元
# 直接索引(先列后行)--不推荐
df1[‘col2‘][‘b‘]
2.0
# select by label
# 标签索引(先行后列)
df1.loc[‘b‘,‘col2‘]
2.0
# select by position
# 位置索引
df1.iloc[1,1]
2.0
获取行
df1.loc[‘c‘:]
|
col1 |
col2 |
c |
3.0 |
3.0 |
d |
NaN |
4.0 |
df1.iloc[2:]
|
col1 |
col2 |
c |
3.0 |
3.0 |
d |
NaN |
4.0 |
获取列
df1[‘col2‘]
a 1.0
b 2.0
c 3.0
d 4.0
Name: col2, dtype: float64
df1.col2
a 1.0
b 2.0
c 3.0
d 4.0
Name: col2, dtype: float64
df1.loc[:,‘col2‘]
a 1.0
b 2.0
c 3.0
d 4.0
Name: col2, dtype: float64
df1.iloc[:,1]
a 1.0
b 2.0
c 3.0
d 4.0
Name: col2, dtype: float64
条件查询
# 条件查询--不推荐
df2[df2[‘B‘].values>0]
|
A |
B |
C |
2020-08-01 |
0.781995 |
1.349165 |
-2.274934 |
2020-08-02 |
-1.671108 |
1.352948 |
-0.700606 |
2020-08-03 |
-0.693292 |
1.014148 |
0.599076 |
2020-08-04 |
0.264373 |
0.620617 |
-0.235280 |
2020-08-05 |
0.109606 |
0.452433 |
0.615102 |
# 条件查询--推荐
df2.query("B > 0")
|
A |
B |
C |
2020-08-01 |
0.781995 |
1.349165 |
-2.274934 |
2020-08-02 |
-1.671108 |
1.352948 |
-0.700606 |
2020-08-03 |
-0.693292 |
1.014148 |
0.599076 |
2020-08-04 |
0.264373 |
0.620617 |
-0.235280 |
2020-08-05 |
0.109606 |
0.452433 |
0.615102 |
# 条件查询
df3[‘E‘].isin([‘test‘])
0 True
1 False
2 True
3 False
Name: E, dtype: bool
赋值
# 单元赋值
df1.loc[‘d‘,‘col1‘]=666
df1
|
col1 |
col2 |
a |
1.0 |
1.0 |
b |
2.0 |
2.0 |
c |
3.0 |
3.0 |
d |
666.0 |
4.0 |
# 列赋值
df1.col1=2
df1
|
col1 |
col2 |
a |
2 |
1.0 |
b |
2 |
2.0 |
c |
2 |
3.0 |
d |
2 |
4.0 |
# 行赋值
df1.loc[‘d‘]=888
df1
|
col1 |
col2 |
a |
2 |
1.0 |
b |
2 |
2.0 |
c |
2 |
3.0 |
d |
888 |
888.0 |
df1.loc[‘c‘:,‘col2‘]=444
df1
|
col1 |
col2 |
a |
2 |
1.0 |
b |
2 |
2.0 |
c |
2 |
444.0 |
d |
888 |
444.0 |
4. 操作
排序
# 值排序
df2=df2.sort_values(‘B‘,ascending=False) #降序
df2
|
A |
B |
C |
2020-08-02 |
-1.671108 |
1.352948 |
-0.700606 |
2020-08-01 |
0.781995 |
1.349165 |
-2.274934 |
2020-08-03 |
-0.693292 |
1.014148 |
0.599076 |
2020-08-04 |
0.264373 |
0.620617 |
-0.235280 |
2020-08-05 |
0.109606 |
0.452433 |
0.615102 |
# 索引排序
df2=df2.sort_index()
df2
|
A |
B |
C |
2020-08-01 |
0.781995 |
1.349165 |
-2.274934 |
2020-08-02 |
-1.671108 |
1.352948 |
-0.700606 |
2020-08-03 |
-0.693292 |
1.014148 |
0.599076 |
2020-08-04 |
0.264373 |
0.620617 |
-0.235280 |
2020-08-05 |
0.109606 |
0.452433 |
0.615102 |
7.3 Pandas-数据处理
7.3.1 缺失值处理
- 查询缺失值 df.isnull().any()
- 移除缺失值 df.dropna(axis=0,how='all') # how={'any','all'}
- 替换缺失值 df.fillna(value,inplace=True) # value为用于填充的值
- 替换标记缺失值(非NaN) df.replace(to_replace=,value=)
# 通过字典
import numpy as np
import pandas as pd
df3 = pd.DataFrame({‘A‘:1.,
‘B‘:pd.Timestamp(‘20200827‘),
‘C‘:pd.Series(1,index=list(range(4)),dtype=‘float64‘),
‘D‘:np.array(range(1,5),dtype=‘int64‘),
‘E‘:pd.Categorical([‘test‘,‘train‘,np.nan,‘?‘]),
‘F‘:[np.nan,np.nan,np.nan,np.nan]})
df3
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2020-08-27 |
1.0 |
1 |
test |
NaN |
1 |
1.0 |
2020-08-27 |
1.0 |
2 |
train |
NaN |
2 |
1.0 |
2020-08-27 |
1.0 |
3 |
NaN |
NaN |
3 |
1.0 |
2020-08-27 |
1.0 |
4 |
? |
NaN |
# 某列中含有NaN,则返回True
df3.isnull().any()
A False
B False
C False
D False
E True
F True
dtype: bool
# 某列中全部数据为NaN,则返回True
df3.isnull().all()
A False
B False
C False
D False
E False
F True
dtype: bool
# numpy查询整个dataframe
np.any(pd.isnull(df3))
True
data1=df3.dropna(axis=1,how=‘all‘) #删除整列为NaN的数据
data1
|
A |
B |
C |
D |
E |
0 |
1.0 |
2020-08-27 |
1.0 |
1 |
test |
1 |
1.0 |
2020-08-27 |
1.0 |
2 |
train |
2 |
1.0 |
2020-08-27 |
1.0 |
3 |
NaN |
3 |
1.0 |
2020-08-27 |
1.0 |
4 |
? |
df3.F.fillna(df3.D.mean(),inplace=True) # inplace表示在原有表修改
df3
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2020-08-27 |
1.0 |
1 |
test |
2.5 |
1 |
1.0 |
2020-08-27 |
1.0 |
2 |
train |
2.5 |
2 |
1.0 |
2020-08-27 |
1.0 |
3 |
NaN |
2.5 |
3 |
1.0 |
2020-08-27 |
1.0 |
4 |
? |
2.5 |
# 替换标记缺失值(非NaN)
data2=df3.replace(to_replace=‘?‘,value=df3.D.mean())
data2
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2020-08-27 |
1.0 |
1 |
test |
2.5 |
1 |
1.0 |
2020-08-27 |
1.0 |
2 |
train |
2.5 |
2 |
1.0 |
2020-08-27 |
1.0 |
3 |
NaN |
2.5 |
3 |
1.0 |
2020-08-27 |
1.0 |
4 |
2.5 |
2.5 |
7.3.2 离散化
- 分组
- sr = pd.qcut(data,q) #自动分组(按分位数等频划分,q为分组数)
- sr = pd.cut(data,[区间]) #手动分组
- 将分组好的结果换成one-hot编码
- pd.get_dummies(sr,prefix=前缀标记)
# 1 创建数据
data = pd.Series([165,174,160,180,159,163,192,184],index=list(range(1,9)))
# 2 手动分组
bins = [150,165,180,195]
sr = pd.cut(data,bins)
print(sr.value_counts())
# 3 one-hot编码
pd.get_dummies(sr,prefix=‘身高_‘)
(150, 165] 4
(180, 195] 2
(165, 180] 2
dtype: int64
|
身高__(150, 165] |
身高__(165, 180] |
身高__(180, 195] |
1 |
1 |
0 |
0 |
2 |
0 |
1 |
0 |
3 |
1 |
0 |
0 |
4 |
0 |
1 |
0 |
5 |
1 |
0 |
0 |
6 |
1 |
0 |
0 |
7 |
0 |
0 |
1 |
8 |
0 |
0 |
1 |
7.3.3 数据合并
1. 拼接concat
# 创建数据
df1 = pd.DataFrame(np.ones((3,4))*0,columns=list(‘abcd‘))
df2 = pd.DataFrame(np.ones((3,4))*1,columns=list(‘bcde‘))
df3 = pd.DataFrame(np.ones((3,4))*2,columns=list(‘abcd‘))
print(df1)
print(df2)
print(df3)
# 拼接
res1 = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # 纵向合并(axis=0),索引重新排序,缺失数据补NaN
res1
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
b c d e
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
|
a |
b |
c |
d |
e |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
3 |
NaN |
1.0 |
1.0 |
1.0 |
1.0 |
4 |
NaN |
1.0 |
1.0 |
1.0 |
1.0 |
5 |
NaN |
1.0 |
1.0 |
1.0 |
1.0 |
6 |
2.0 |
2.0 |
2.0 |
2.0 |
NaN |
7 |
2.0 |
2.0 |
2.0 |
2.0 |
NaN |
8 |
2.0 |
2.0 |
2.0 |
2.0 |
NaN |
'''
join参数:默认outer
outer:缺失数据补NaN
inner:删除缺失数据列
'''
res2=pd.concat([df1,df2],join=‘inner‘,ignore_index=True)
res2
|
b |
c |
d |
0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
3 |
1.0 |
1.0 |
1.0 |
4 |
1.0 |
1.0 |
1.0 |
5 |
1.0 |
1.0 |
1.0 |
2.添加append
# 添加dataframe
res3 = df1.append(df2,ignore_index=True)
res3
|
a |
b |
c |
d |
e |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
NaN |
3 |
NaN |
1.0 |
1.0 |
1.0 |
1.0 |
4 |
NaN |
1.0 |
1.0 |
1.0 |
1.0 |
5 |
NaN |
1.0 |
1.0 |
1.0 |
1.0 |
# 添加行
sr = pd.Series([1,2,3,4],index=list(‘abcd‘))
res4 = df1.append(sr,ignore_index=True)
res4
|
a |
b |
c |
d |
0 |
0.0 |
0.0 |
0.0 |
0.0 |
1 |
0.0 |
0.0 |
0.0 |
0.0 |
2 |
0.0 |
0.0 |
0.0 |
0.0 |
3 |
1.0 |
2.0 |
3.0 |
4.0 |
3. 融合Merge
#依据一组key合并
left = pd.DataFrame({‘key‘: [‘K0‘, ‘K1‘, ‘K2‘, ‘K3‘],
‘A‘: [‘A0‘, ‘A1‘, ‘A2‘, ‘A3‘],
‘B‘: [‘B0‘, ‘B1‘, ‘B2‘, ‘B3‘]})
right = pd.DataFrame({‘key‘: [‘K0‘, ‘K1‘, ‘K2‘, ‘K3‘],
‘C‘: [‘C0‘, ‘C1‘, ‘C2‘, ‘C3‘],
‘D‘: [‘D0‘, ‘D1‘, ‘D2‘, ‘D3‘]})
res = pd.merge(left, right, on=‘key‘)
print(left)
print(right)
res
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
|
key |
A |
B |
C |
D |
0 |
K0 |
A0 |
B0 |
C0 |
D0 |
1 |
K1 |
A1 |
B1 |
C1 |
D1 |
2 |
K2 |
A2 |
B2 |
C2 |
D2 |
3 |
K3 |
A3 |
B3 |
C3 |
D3 |
# 依据两组key合并
left = pd.DataFrame({‘key1‘: [‘K0‘, ‘K0‘, ‘K1‘, ‘K2‘],
‘key2‘: [‘K0‘, ‘K1‘, ‘K0‘, ‘K1‘],
‘A‘: [‘A0‘, ‘A1‘, ‘A2‘, ‘A3‘],
‘B‘: [‘B0‘, ‘B1‘, ‘B2‘, ‘B3‘]})
right = pd.DataFrame({‘key1‘: [‘K0‘, ‘K1‘, ‘K1‘, ‘K2‘],
‘key2‘: [‘K0‘, ‘K0‘, ‘K0‘, ‘K0‘],
‘C‘: [‘C0‘, ‘C1‘, ‘C2‘, ‘C3‘],
‘D‘: [‘D0‘, ‘D1‘, ‘D2‘, ‘D3‘]})
print(‘left表:‘)
print(left)
print(‘\nright表:‘)
print(right)
#依据key1与key2 columns进行合并,并打印出四种结果[‘left‘, ‘right‘, ‘outer‘, ‘inner‘]
res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘inner‘) #只合并两张表key都有的数据
print(‘\ninner方式:‘)
print(res)
res = res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘outer‘)#没有的数据补NaN
print(‘\nouter方式:‘)
print(res)
res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘left‘) #以左表为基准,右表匹配
print(‘\nleft方式:‘)
print(res)
res = pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘right‘) #以右表为基准,左表匹配
print(‘\nright方式:‘)
print(res)
left表:
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
right表:
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
inner方式:
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
outer方式:
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3
left方式:
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
right方式:
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
#依据index合并
left = pd.DataFrame({‘A‘: [‘A0‘, ‘A1‘, ‘A2‘],
‘B‘: [‘B0‘, ‘B1‘, ‘B2‘]},
index=[‘K0‘, ‘K1‘, ‘K2‘])
right = pd.DataFrame({‘C‘: [‘C0‘, ‘C2‘, ‘C3‘],
‘D‘: [‘D0‘, ‘D2‘, ‘D3‘]},
index=[‘K0‘, ‘K2‘, ‘K3‘])
print(left)
print(right)
# outer方式
res = pd.merge(left, right, left_index=True, right_index=True, how=‘outer‘)
print(res)
# inner方式
res = pd.merge(left, right, left_index=True, right_index=True, how=‘inner‘)
res
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
|
A |
B |
C |
D |
K0 |
A0 |
B0 |
C0 |
D0 |
K2 |
A2 |
B2 |
C2 |
D2 |
7.3.4 透视表(pivot table)
df = pd.DataFrame({‘A‘: [‘one‘, ‘one‘, ‘two‘, ‘three‘] * 3,
‘B‘: [‘A‘, ‘B‘, ‘C‘] * 4,
‘C‘: [‘foo‘, ‘foo‘, ‘foo‘, ‘bar‘, ‘bar‘, ‘bar‘] * 2,
‘D‘: np.random.randn(12),
‘E‘: np.random.randn(12)})
print(df)
pivot=pd.pivot_table(df,values=‘D‘,index=[‘A‘,‘B‘],columns=‘C‘)
print(pivot.columns)
pivot
A B C D E
0 one A foo 2.847002 -0.341067
1 one B foo -0.764842 1.078190
2 two C foo 0.002059 0.414781
3 three A bar -0.174984 0.084828
4 one B bar -2.018801 -1.122346
5 one C bar 1.576535 0.551934
6 two A foo -0.427333 -0.990089
7 three B foo -0.907410 -0.541668
8 one C foo -0.988257 2.493991
9 one A bar -0.560151 -1.124036
10 two B bar 1.333048 -0.620632
11 three C bar 0.735043 -0.102446
Index([‘bar‘, ‘foo‘], dtype=‘object‘, name=‘C‘)
|
C |
bar |
foo |
A |
B |
|
|
one |
A |
-0.560151 |
2.847002 |
B |
-2.018801 |
-0.764842 |
C |
1.576535 |
-0.988257 |
three |
A |
-0.174984 |
NaN |
B |
NaN |
-0.907410 |
C |
0.735043 |
NaN |
two |
A |
NaN |
-0.427333 |
B |
1.333048 |
NaN |
C |
NaN |
0.002059 |
bill = pd.read_csv(‘./data/bill.csv‘,encoding=‘gb2312‘)
bill.时间 = pd.to_datetime(bill.时间).dt.normalize() #去除时间保留日期
# 按‘分类’分组
pivot = pd.pivot_table(bill,index=‘分类‘,columns=‘时间‘,values=‘支出‘,aggfunc=‘sum‘).reset_index()
# 查询单天有超过1个分类产生支出的日期
pivot.loc[:,pivot.count(axis=0)>1]
时间 |
分类 |
2020-03-13 00:00:00 |
2020-06-17 00:00:00 |
2020-07-12 00:00:00 |
2020-08-20 00:00:00 |
0 |
App购买 |
NaN |
74.0 |
NaN |
NaN |
1 |
交通 |
NaN |
NaN |
NaN |
NaN |
2 |
其它 |
NaN |
NaN |
NaN |
NaN |
3 |
医疗 |
NaN |
NaN |
NaN |
280.0 |
4 |
发红包 |
NaN |
NaN |
NaN |
NaN |
5 |
学习 |
NaN |
NaN |
NaN |
3900.0 |
6 |
就诊 |
NaN |
NaN |
NaN |
NaN |
7 |
旅行 |
NaN |
NaN |
399.0 |
NaN |
8 |
电器 |
NaN |
NaN |
NaN |
NaN |
9 |
电子产品 |
NaN |
NaN |
NaN |
NaN |
10 |
租金 |
NaN |
NaN |
NaN |
1000.0 |
11 |
衣服 |
158.0 |
79.0 |
NaN |
NaN |
12 |
话费网费 |
400.0 |
NaN |
114.0 |
NaN |
13 |
请客送礼 |
NaN |
NaN |
NaN |
NaN |
Python数据处理-v1.0
标签:scope Fix 条件 out list json 超过 array end
原文地址:https://www.cnblogs.com/liuwenzhen/p/13589395.html