pandas Indexing
In [1]:
# from http://pandas.pydata.org/pandas-docs/stable/indexing.html
# pandas 0.22.0
In [2]:
import pandas as pd
import numpy as np
In [3]:
"""
尽量利用pandas提供的专用索引方式,而不是python通用的切片方式。
三种主要的索引方式:indexers
.loc label based 基于标签, 可以是标签名,可以是布尔值,可以是一元函数
.iloc integer psition based 基于整数位置(from 0 to length-1 of the axis),和python切片类似
[]
"""
"""
Object Type Selection Return Value Type
------------------------------------------------------------
Series series[label] scalar value
DataFrame frame[colname] Series corresponding to colname
Panel panel[itemname] DataFrame corresponding to the itemname
"""
Out[3]:
Out[3]:
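# A minimal added sketch (not from the original notes) contrasting the three indexers
# on a throwaway frame; the variable name `tiny` is purely illustrative:
tiny = pd.DataFrame({'A': [1, 2, 3]}, index=['x', 'y', 'z'])
tiny.loc['y', 'A']   # label based        -> 2
tiny.iloc[1, 0]      # position based     -> 2
tiny['A']            # [] column selection -> the 'A' column as a Series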
In [5]:
# Here we construct a simple time series data set to use for illustrating the indexing functionality
dates = pd.date_range('1/1/2000', periods=8)
dates
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
Out[5]:
Out[5]:
In [8]:
panel = pd.Panel({'one' : df, 'two' : df - df.mean()})  # a 3-D data container (a dict of DataFrames)
panel
panel['one']
panel['two']
Out[8]:
Out[8]:
Out[8]:
In [9]:
# Thus, as per above, we have the most basic indexing using []:
s = df['A']
s
dates[5]  # note that positions count from 0
s[dates[5]]
Out[9]:
Out[9]:
Out[9]:
In [12]:
# Pass a list of column names inside [], e.g. ['A', 'B']
columns_l = ['A', 'B']
df[columns_l]
df[['A', 'B']]  # equivalent to the above; note the double brackets
df
Out[12]:
Out[12]:
Out[12]:
In [17]:
df[['A', 'B']] = df[['B', 'A']]  # swap the values of the two columns
df
df.loc[:,['B', 'A']]
# The following does NOT swap the two columns:
# .loc aligns on column labels before assigning values, so column 'A' on the right
# is written back into column 'A' on the left (and 'B' into 'B'), leaving df unchanged.
df.loc[:,['B', 'A']] = df[['A', 'B']]
df
# df.loc[:,['B', 'A']] = df[:, ['A', 'B']]  # error: [] cannot be indexed with a (row, column) tuple
# df
# The correct way: assign the raw values, which bypasses column alignment
df.loc[:,['B', 'A']] = df[['A', 'B']].values
df
Out[17]:
Out[17]:
Out[17]:
Out[17]:
In [18]:
# Attribute Access
# You may access an index on a Series, column on a DataFrame,
# and an item on a Panel directly as an attribute:
sa = pd.Series([1, 2, 3], index=list('abc'))
sa
dfa = df.copy()
dfa
Out[18]:
Out[18]:
In [20]:
# You can also assign to an item/column via attribute access, with caveats:
# 1. the name must be a valid Python identifier (letters, digits, underscores)
# 2. it must not collide with an existing method name, e.g. min
# 3. it must not collide with pandas "reserved" attributes, e.g. index, axis, items, labels
# In those cases fall back to ["..."] indexing.
sa.a = 5
sa
dfa.index
dfa.A = list(range(len(dfa.index)))
dfa
Out[20]:
Out[20]:
Out[20]:
In [21]:
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})  # the dict keys become column names
x
Out[21]:
In [24]:
dict(x=9, y=99)
x.iloc[1]
x.iloc[1] = dict(x=9, y=99)
x
Out[24]:
Out[24]:
Out[24]:
In [29]:
df = pd.DataFrame({'one': [1., 2., 3.]})
df
df.two = [4, 5, 6]  # caution: attribute assignment cannot create a new column
df.two              # it only attaches a plain Python attribute, which you can still read back
df
df['two'] = [4, 5, 6]  # this does add a column
df
Out[29]:
Out[29]:
Out[29]:
Out[29]:
In [31]:
# Slicing ranges
# The .iloc method is the most robust and consistent way to slice;
# here we look at the [] operator used for slicing.
# On a Series
# getting values
s
s[:5]
s[::2]
s[::-1]
Out[31]:
Out[31]:
Out[31]:
Out[31]:
In [33]:
# setting values
s2 = s.copy()
s2
s2[:5] = 0
s2
Out[33]:
Out[33]:
In [35]:
# On a DataFrame
# with [], slicing selects rows
df
df[:3]
df[::-1]
Out[35]:
Out[35]:
Out[35]:
In [49]:
# Selection By Label
# Label-based selection can be context dependent and can trigger chained assignment,
# which should be avoided; see "Returning a view versus a copy" below.
# When setting values in a pandas object,
# care must be taken to avoid what is called chained indexing.
# Here is an example.
[list('abcd'), list('efgh'), list('ijkl'), list('mnop')]
[['one','two'], ['first','second']]
dfmi = pd.DataFrame([list('abcd'), list('efgh'), list('ijkl'), list('mnop')],  # four rows of data
                    columns=pd.MultiIndex.from_product(
                        [['one','two'], ['first','second']]))  # MultiIndex columns: the first list is level 0, the second is level 1
dfmi
Out[49]:
Out[49]:
Out[49]:
In [50]:
# Compare these two access methods:
dfmi['one']['second']  # chained
dfmi.loc[:,('one','second')]
Out[50]:
Out[50]:
In [40]:
# Comparing the two access methods
# Method 1: chained, using two []
dfmi['one']            # the first [] produces an intermediate DataFrame
dfmi['one']['second']  # pandas treats the two [] as separate events: two separate __getitem__ calls
Out[40]:
In [53]:
# Method 2: .loc
# a nested tuple of slices/labels
df_s = (slice(None), ('one', 'second'))
dfmi.loc[df_s]  # a single __getitem__ call
Out[53]:
In [62]:
# Selection By Label
# .loc requires labels that match the dtype of the index;
# e.g. on a DatetimeIndex, an integer slice such as .loc[2:3] raises a TypeError
dfl = pd.DataFrame(np.random.randn(5,4), columns=list('ABCD'),
                   index=pd.date_range('20130101', periods=5))
dfl
# dfl.loc[2:3]  # TypeError: integer slicing on a DatetimeIndex
dfl.loc['20130102':'20130104']  # strings that can be converted to datetime work
dfl.loc['20130202':'20130204']  # no error; returns an empty DataFrame
# dfl.loc['20130202']  # KeyError: that label is not in the index
dfl.loc['20130104':'20130114']  # returns only the rows that actually exist
Out[62]:
Out[62]:
Out[62]:
Out[62]:
In [64]:
# .loc accepts integers, but they are interpreted as labels, not positions
# .loc is the primary label-based access method
# Valid inputs to .loc (inside the []) are:
# 1. a single label, e.g. 5 or 'a'
# 2. a list of labels, e.g. ['a', 'b', 'c']
# 3. a slice object, e.g. 'a':'f'  (note: unlike Python slicing, BOTH endpoints are included)
# 4. a boolean array
# 5. a callable, e.g. [lambda df: df.A > 0, :]
# Series
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
s1
s1.loc['c':]
s1.loc['b']
s1.loc['c':] = 0
s1
Out[64]:
Out[64]:
Out[64]:
Out[64]:
In [65]:
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4),   # a 6x4 array
                   index=list('abcdef'),   # row labels
                   columns=list('ABCD'))   # column labels
df1
Out[65]:
In [72]:
df1.loc[['a', 'b', 'd'], :]  # row labels first, then column labels
df1.loc['d':, 'A':'C']
# Getting a cross section with a single label returns a Series
# The following three are equivalent
df1.loc['a']
df1.loc['a', :]
df1.xs('a')
type(df1.loc['a'])
Out[72]:
Out[72]:
Out[72]:
Out[72]:
Out[72]:
Out[72]:
In [73]:
# Selecting with a boolean array
df1.loc['a'] > 0
df1.loc[:, df1.loc['a'] > 0]
Out[73]:
Out[73]:
In [74]:
# Getting a single value
# this is also equivalent to ``df1.at['a','A']``
df1.loc['a', 'A']
df1.at['a','A']
Out[74]:
Out[74]:
In [75]:
# Slicing with labels
s = pd.Series(list('abcde'), index=[0,3,2,5,4])
s
s.loc[3:5]  # includes 5; this slices from label 3 to label 5, not positions 3..5
Out[75]:
Out[75]:
In [80]:
# s.loc[3:6]  # raises (ValueError: index must be monotonic increasing or decreasing / KeyError: 6)
# After sorting the index, slice endpoints may fall outside the existing labels
s.sort_index()            # does not modify s itself
s
s.sort_index().loc[1:6]   # endpoints outside the index are allowed
s.sort_index().loc[6:8]   # even if nothing falls in the range
# s.sort_index().loc[8]   # but a single missing label still raises; only slices are forgiving
Out[80]:
Out[80]:
Out[80]:
Out[80]:
In [82]:
# Selection By Position
# Purely integer-position based, like Python and NumPy: 0-based, end point excluded
# Valid inputs to .iloc:
# 1. an integer
# 2. a list of integers
# 3. an integer slice
# 4. a boolean array
# 5. a callable
# Series
s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2)))
s1
s1.iloc[:3]
s1.iloc[3]
s1.iloc[:3] = 0
s1
Out[82]:
Out[82]:
Out[82]:
Out[82]:
In [83]:
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4),
index=list(range(0,12,2)),
columns=list(range(0,8,2)))
df1
Out[83]:
In [87]:
df1.iloc[:3]
df1.iloc[1:5, 2:4]
df1.iloc[[1, 3, 5], [1, 3]]
df1.iloc[1:3, :]
df1.iloc[:, 1:3]
Out[87]:
Out[87]:
Out[87]:
Out[87]:
Out[87]:
In [89]:
# this is also equivalent to ``df1.iat[1,1]``
# getting a single value
df1.iloc[1, 1]
df1.iat[1, 1]
Out[89]:
Out[89]:
In [92]:
# For getting a cross section using an integer position (equiv to df.xs(1))
# get a cross section, returning a Series
df1.iloc[1]
df1.iloc[:,1]
Out[92]:
Out[92]:
In [93]:
# Out of range slice indexes are handled gracefully just as in Python/Numpy.
# Out-of-range slices are handled gracefully, as in Python/NumPy
# Note: a single out-of-range position, or a list containing one, still raises;
# only slices (with a colon) are handled gracefully
x = list('abcdef')
x[4:10]
x[8:10]
s = pd.Series(x)
s.iloc[4:10]
s.iloc[8:10]  # entirely out of bounds: returns an empty Series
Out[93]:
Out[93]:
Out[93]:
Out[93]:
In [94]:
dfl = pd.DataFrame(np.random.randn(5,2), columns=list('AB'))
dfl
dfl.iloc[:, 2:3]
dfl.iloc[:, 1:3]
dfl.iloc[4:6]
Out[94]:
Out[94]:
Out[94]:
Out[94]:
In [99]:
# Selection By Callable
# selection by passing a callable
df1 = pd.DataFrame(np.random.randn(6,4),
index=list("abcdef"),
columns=list("ABCD"))
df1
df1.A
Out[99]:
Out[99]:
In [100]:
df1.loc[lambda df: df.A > 0, :]
df1.loc[:, lambda df: ['A', 'B']]
df1.iloc[:, lambda df: [0, 1]]
df1[lambda df: df.columns[0]]
Out[100]:
Out[100]:
Out[100]:
Out[100]:
In [101]:
df1.A
df1.A.loc[lambda s: s > 0]
df1.A.loc[df1.A > 0]
Out[101]:
Out[101]:
Out[101]:
In [ ]:
# Using callable indexers you can chain data selection operations without an
# intermediate temporary variable (method chaining, not chained [])
bb = pd.read_csv('data/baseball.csv', index_col='id')
bb.groupby(['year', 'team']).sum().loc[lambda df: df.r > 100]
In [102]:
# IX Indexer is Deprecated
# in favor of the more strict .iloc and .loc indexers
dfd = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=list('abc'))
dfd
Out[102]:
In [103]:
dfd.ix[[0, 2], 'A']
Out[103]:
In [105]:
dfd.index[[0, 2]]  # get the index labels at those positions
dfd.loc[dfd.index[[0, 2]], 'A']
dfd.columns.get_loc('A')  # get the integer position of the column
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]
Out[105]:
Out[105]:
Out[105]:
Out[105]:
In [106]:
# Indexing with a list with missing labels is Deprecated
# Using .loc or [] with a list containing one or more missing labels is deprecated, in favor of .reindex
s = pd.Series([1, 2, 3])
s
Out[106]:
In [107]:
s.loc[[1, 2]]  # all keys in the list exist, so nothing changes
Out[107]:
In [108]:
s.loc[[1, 2, 3]]  # missing labels get NaN
Out[108]:
In [109]:
# Reindexing
s.reindex([1, 2, 3])
Out[109]:
In [110]:
# To select only the keys that are actually present
labels = [1, 2, 3]
s.index.intersection(labels)  # intersection of the index and the labels
s.loc[s.index.intersection(labels)]
Out[110]:
Out[110]:
In [112]:
# reindex does not work when the index contains duplicate labels
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s
labels = ['c', 'd']
# s.reindex(labels)  # raises: cannot reindex from a duplicate axis
Out[112]:
In [113]:
# Slice out the intersection first, then reindex
# (this only works if the intersection itself has no duplicate labels)
s.index.intersection(labels)
s.loc[s.index.intersection(labels)]
s.loc[s.index.intersection(labels)].reindex(labels)
Out[113]:
Out[113]:
Out[113]:
In [114]:
# Selecting Random Samples
# sample() draws rows by default
s = pd.Series([0,1,2,3,4,5])
s
Out[114]:
In [129]:
s.sample()          # one row by default
s.sample(n=3)       # three rows
s.sample(frac=0.5)  # a fraction of the rows
s.sample(frac=0.8)
s
Out[129]:
Out[129]:
Out[129]:
Out[129]:
Out[129]:
In [132]:
s.sample(n=6, replace=False)  # the default: sample without replacement
s.sample(n=6, replace=True)   # with replacement; s itself is unchanged
s
Out[132]:
Out[132]:
Out[132]:
In [166]:
# By default every row is equally likely to be drawn; you can also weight the rows
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)
example_weights2 = [0.5, 0, 0, 0, 0, 0]  # weights are renormalized automatically
s.sample(n=1, weights=example_weights2)
Out[166]:
Out[166]:
In [175]:
# A DataFrame column can be used as the sampling weights
df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]})
df2.sample(n=3, weights='weight_column')
Out[175]:
In [196]:
# Sampling columns
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4], 'col3': [3, 4, 5]})
df3
df3.sample(n=2, axis=1)  # axis=1 samples columns instead of rows
df3.sample(n=2, axis=0)
Out[196]:
Out[196]:
Out[196]:
In [220]:
# A seed (or a numpy RandomState object) can be passed via random_state;
# with the same seed, sample() returns the same rows
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=200)
df3.sample(n=2, random_state=200)
Out[220]:
Out[220]:
Out[220]:
Out[220]:
In [222]:
# Setting With Enlargement
# Assigning to a key that does not exist via .loc or [] enlarges the Series/DataFrame
# Series
se = pd.Series([1,2,3])
se
se[5] = 5 # append
se
Out[222]:
Out[222]:
In [223]:
# DataFrame
dfi = pd.DataFrame(np.arange(6).reshape(3,2), columns=['A','B'])
dfi
Out[223]:
In [224]:
dfi.loc[:,'C'] = dfi.loc[:,'A']  # enlarge: adds a column
dfi
Out[224]:
In [225]:
dfi.loc[3] = 5  # enlarge: appends a row
dfi
Out[225]:
In [231]:
# Fast scalar value getting and setting
# [] has to do a fair bit of work to figure out what you are asking for, so it carries some overhead;
# the fastest way to access a scalar is the .at / .iat methods, available on all of the data structures
# .at  is label based, like .loc
# .iat is integer based, like .iloc
s = pd.Series([0,1,2,3,4,5])
s
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
Out[231]:
Out[231]:
In [233]:
# get value
s.iat[5]
df.at[dates[5], 'A']
df.iat[3, 0]
Out[233]:
Out[233]:
Out[233]:
In [234]:
# set value
df.at[dates[5], 'E'] = 7  # a new column is created if needed; unset cells default to NaN
df
Out[234]:
In [235]:
df.iat[3, 0] = 7
df
Out[235]:
In [236]:
df.at[dates[-1]+1, 0] = 7  # enlarges both the rows and the columns
df
Out[236]:
In [ ]:
# Boolean indexing
# Filter data with boolean vectors
# The operators are | for or, & for and, and ~ for not; they must be grouped with parentheses
In [237]:
s = pd.Series(range(-3, 4))  # range can be passed directly, no need to wrap it in list()
s
Out[237]:
In [238]:
s[s >= 0]  # the boolean Series can be used directly
s[~(s < 0)]
s[(s < -1) | (s > 0.5)]
Out[238]:
Out[238]:
Out[238]:
In [239]:
df[df['A'] > 0]
Out[239]:
In [240]:
# List comprehensions and map method of Series can also be used to produce more complex criteria:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                    'c': np.random.randn(7)})
df2
Out[240]:
In [241]:
# The criterion
criterion = df2['a'].map(lambda x: x.startswith('t'))  # selects 'two' and 'three'
criterion
Out[241]:
In [242]:
df2[criterion]  # select rows by the criterion on column a; all columns are returned
Out[242]:
In [243]:
# Equivalent, but somewhat slower
df2[[x.startswith('t') for x in df2['a']]]  # a list comprehension instead of map
Out[243]:
In [244]:
# Combined conditions
df2[criterion & (df2['b'] == 'x')]  # both column a and column b must satisfy their conditions
Out[244]:
In [245]:
# Boolean vectors can be combined with other indexing expressions
df2.loc[criterion & (df2['b'] == 'x'), 'b':'c']  # only columns b through c; column a is not selected
Out[245]:
In [ ]:
# Indexing with isin
In [246]:
# Series
# test whether each value is in a given set
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s
Out[246]:
In [248]:
s.isin([2, 4, 6])     # returns a boolean Series: is each value in the list?
s[s.isin([2, 4, 6])]  # the boolean result can be used for selection
Out[248]:
Out[248]:
In [249]:
# Index objects also have an isin() method
s[s.index.isin([2, 4, 6])]
s.reindex([2, 4, 6])  # reindex is different: missing labels become NaN and int64 is upcast to float64
Out[249]:
Out[249]:
In [250]:
# For a MultiIndex you can target a single index level
s_mi = pd.Series(np.arange(6), index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
Out[250]:
In [252]:
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]  # level=1 matches against the second index level
Out[252]:
Out[252]:
In [253]:
# DataFrame
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], 'ids2': ['a', 'n', 'c', 'n']})
df
Out[253]:
In [254]:
values = ['a', 'b', 1, 3]
df.isin(values)  # match every cell against the list
Out[254]:
In [262]:
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
row_mask = df.isin(values)  # a dict matches different values per column
row_mask
# row_mask.all: return whether all elements are True over the requested axis
row_mask = row_mask.all(1)
row_mask
df[row_mask]  # keep only the rows where every column matched
Out[262]:
Out[262]:
Out[262]:
In [264]:
# The where() Method and Masking
# where() guarantees that the selection output has the same shape as the original data
# Series
s
s[s > 0]        # returns only the items that satisfy the condition
s.where(s > 0)  # returns the same shape; failing items become NaN
Out[264]:
Out[264]:
Out[264]:
In [267]:
# Selecting from a DataFrame with a boolean DataFrame already preserves the original shape
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
Out[267]:
In [268]:
df[df < 0]
df.where(df < 0)       # equivalent to the above
df.where(df < 0, -df)  # the 'other' argument replaces values where the condition is False; returns a copy, df is unchanged
df
Out[268]:
Out[268]:
Out[268]:
Out[268]:
In [270]:
# You may wish to set values based on some boolean criteria. This can be done intuitively like so:
s2 = s.copy()
s2
s2[s2 < 3] = 0
s2
df2 = df.copy()
df2
df2[df2 < 0] = 0
df2
Out[270]:
Out[270]:
Out[270]:
Out[270]:
In [271]:
# By default where returns a modified copy; with inplace=True it modifies the data in place instead
df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig
Out[271]:
In [272]:
# Note: pandas where() and numpy where() have different signatures.
# Roughly, df1.where(m, df2) is equivalent to np.where(m, df1, df2)
df.where(df < 0, -df) == np.where(df < 0, df, -df)
Out[272]:
In [276]:
# Alignment
# where() aligns its input, so a partial (sub-region) boolean condition can be used.
# This is analogous to partial setting via .loc (but on the contents rather than the axis labels)
df2 = df.copy()
df2
df2[1:4]                    # a row slice
df2.where(df2[1:4] > 0, 3)  # replaces values where the condition is False or not covered
df2[df2[1:4] > 0] = 3       # setting assigns only where the partial condition is True
df2
Out[276]:
Out[276]:
Out[276]:
Out[276]:
In [279]:
# Where can also accept axis and level parameters to align the input when performing the where.
df2 = df.copy()
df2
df2.where(df2 > 0, df2['A'], axis='index')  # values <= 0 are replaced by column A's value in the same row
df2
df2.apply(lambda x, y: x.where(x > 0, y), y=df['A'])  # equivalent to the above, but slower; also returns a copy
df2
Out[279]:
Out[279]:
Out[279]:
Out[279]:
Out[279]:
In [280]:
# where also accepts a callable of one argument that returns a valid boolean condition (and so can 'other')
df3 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
df3
df3.where(lambda x: x > 4, lambda x: x + 10)  # values <= 4 are replaced by x + 10
Out[280]:
Out[280]:
In [282]:
# mask() is the inverse boolean operation of where()
s
s.mask(s >= 0)  # keep the values < 0; everything else becomes NaN
df
df.mask(df >= 0)
Out[282]:
Out[282]:
Out[282]:
Out[282]:
In [288]:
# The query() Method (Experimental): select from a DataFrame using an expression string
# e.g. select the rows where column b falls between columns a and c: a < b < c
n = 10
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
Out[288]:
In [289]:
# pure python
df[(df.a < df.b) & (df.b < df.c)]
Out[289]:
In [290]:
# query() takes a string expression
df.query('(a < b) & (b < c)')  # measured slower than the pure-Python form on this small frame
Out[290]:
In [302]:
# A named index can be used in the expression just like a column;
# if the index name overlaps with a column name, the column name takes precedence
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
# df
df.index.name = "a"
df
Out[302]:
In [303]:
df.query('a <= b and b <= c')
Out[303]:
In [305]:
# Instead of the index's name you can use the special name 'index', which also avoids clashes with column names
df.query('index <= b <= c')
Out[305]:
In [307]:
# MultiIndex query() Syntax
# using query() with a DataFrame that has a MultiIndex
n = 10
colors = np.random.choice(['red', 'green'], size=n)
colors
foods = np.random.choice(['eggs', 'ham'], size=n)
foods
Out[307]:
Out[307]:
In [308]:
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
index
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df
Out[308]:
Out[308]:
In [309]:
df.query('color == "red"')
Out[309]:
In [310]:
# If the levels of the MultiIndex are unnamed, refer to them with special names
df.index.names = [None, None]
df
df.query('ilevel_0 == "red"')  # the convention is ilevel_0, which means "index level 0" for the 0th level of the index
Out[310]:
Out[310]:
In [ ]:
# query() Use Cases
# A use case for query() is when you have a collection of DataFrame objects
# that have a subset of column names (or index levels/names) in common.
# You can pass the same query to all of the frames
# without having to specify which frame you're interested in querying.
In [311]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
df
df2
Out[311]:
Out[311]:
In [315]:
expr = '0.0 <= a <= c <= 0.5'
mp = map(lambda frame: frame.query(expr), [df, df2])  # the same expression applied to frames sharing column names
In [316]:
for i in mp:
    print(i)
In [317]:
# query() Python versus pandas Syntax Comparison
# Full numpy-like syntax
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
Out[317]:
In [318]:
df.query('(a < b) & (b < c)')
df[(df.a < df.b) & (df.b < df.c)]
Out[318]:
Out[318]:
In [319]:
# Several other ways of writing the same expression
df.query('a < b & b < c')    # parentheses dropped
df.query('a < b and b < c')  # using English 'and'
df.query('a < b < c')        # chained comparison, the most elegant form
Out[319]:
Out[319]:
Out[319]:
In [320]:
# The in and not in operators
# get all rows where the value in column "a" also appears in column "b"
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})
df
Out[320]:
In [323]:
df.query('a in b')  # first run ~113 ms here, later runs noticeably faster (~25 ms)
Out[323]:
In [324]:
df[df.a.isin(df.b)]  # how you'd do it in pure Python; still faster than query here (~35 ms)
Out[324]:
In [325]:
df[~df.a.isin(df.b)]
Out[325]:
In [326]:
df.query('a not in b')
Out[326]:
In [328]:
# rows where cols a and b have overlapping values and col c's values are less than col d's
df.query('a in b and c < d')
df[df.a.isin(df.b) & (df.c < df.d)]
Out[328]:
Out[328]:
In [ ]:
# Note:
# in and not in are evaluated in Python, since numexpr has no equivalent of this operation.
# However, only the in/not in expression itself is evaluated in vanilla Python.
# For example, in the expression
df.query('a in b + c + d')
# (b + c + d) is evaluated by numexpr and then the in operation is evaluated in plain Python.
# In general, any operations that can be evaluated using numexpr will be.
In [329]:
# Special use of the == operator with list objects
# Comparing a list of values to a column using ==/!= works similarly to in/not in
df
Out[329]:
In [330]:
df.query('b == ["a", "b", "c"]')  # rows where column b equals any value in the list
Out[330]:
In [332]:
df.query('c == [1, 2]')
df.query('[1, 2] in c')
df[df.c.isin([1, 2])]
Out[332]:
Out[332]:
Out[332]:
In [333]:
# Boolean Operators
# negate with ~ or not
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
df['bools'] = np.random.rand(len(df)) > 0.5
df
Out[333]:
Out[333]:
In [334]:
df.query('~bools')
df.query('not bools')
Out[334]:
Out[334]:
In [335]:
# Boolean expressions can be combined arbitrarily
shorter = df.query('a < b < c and (not bools) or bools > 2')               # the short query syntax
longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)]  # the equivalent in pure Python
shorter
longer
Out[335]:
Out[335]:
In [ ]:
# Performance of query()
# DataFrame.query() using numexpr is slightly faster than Python for large frames.
# Note: you will only see the performance benefit of the numexpr engine with DataFrame.query()
# if your frame has more than approximately 200,000 rows.
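# A rough added sketch of how one might check this claim (the frame size and repeat
# count are arbitrary choices, and numexpr must be installed for the fast path):
import timeit
big = pd.DataFrame(np.random.rand(300000, 3), columns=list('abc'))
timeit.timeit(lambda: big.query('(a < b) & (b < c)'), number=10)           # numexpr-backed
timeit.timeit(lambda: big[(big.a < big.b) & (big.b < big.c)], number=10)   # pure pandas/NumPy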
In [ ]:
# Duplicate Data
# Two methods for identifying and removing duplicate data, both taking column label(s) as argument:
# duplicated       returns a boolean vector marking the duplicate rows
# drop_duplicates  removes the duplicate rows
# By default the first occurrence is kept; the keep parameter controls this:
# keep='first' (default): mark / drop duplicates except for the first occurrence
# keep='last':            mark / drop duplicates except for the last occurrence
# keep=False:             mark / drop all duplicates
In [336]:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})
df2
Out[336]:
In [338]:
df2.duplicated('a')
df2.duplicated("a", keep="last")
df2.duplicated('a', keep=False)
Out[338]:
Out[338]:
Out[338]:
In [339]:
df2.drop_duplicates('a')
df2.drop_duplicates('a', keep="last")
df2.drop_duplicates('a', keep=False)
Out[339]:
Out[339]:
Out[339]:
In [340]:
# Pass a list of columns
df2.duplicated(['a', 'b'])  # columns a and b are considered together when marking duplicates
df2.drop_duplicates(['a', 'b'])
Out[340]:
Out[340]:
In [342]:
# Deduplicating on the index with Index.duplicated
df3 = pd.DataFrame({'a': np.arange(6), 'b': np.random.randn(6)}, index=['a', 'a', 'b', 'c', 'b', 'a'])
df3
Out[342]:
In [343]:
df3.index.duplicated()
df3[~df3.index.duplicated()]
df3[~df3.index.duplicated(keep='last')]
df3[~df3.index.duplicated(keep=False)]
Out[343]:
Out[343]:
Out[343]:
Out[343]:
In [344]:
# Dictionary-like get() method
# Each of Series, DataFrame, and Panel has a get method which can return a default value.
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s.get("a")              # equivalent to s["a"]
s.get("x", default=-1)  # a missing label returns the default instead of raising
Out[344]:
Out[344]:
In [345]:
# The lookup() Method
# extract a set of values given a sequence of row labels and a sequence of column labels
dflookup = pd.DataFrame(np.random.rand(20, 4), columns=['A', 'B', 'C', 'D'])
dflookup
Out[345]:
In [347]:
list(range(0,10,2))
dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D'])
Out[347]:
Out[347]:
In [348]:
# Index objects
# An Index can be created directly from a list or other sequence
index = pd.Index(['e', 'd', 'a', 'b'])
index
# An Index can also be given a name
index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index.name
index
Out[348]:
Out[348]:
Out[348]:
In [349]:
index = pd.Index(list(range(5)), name='rows')
index
columns = pd.Index(['A', 'B', 'C'], name='cols')
columns
Out[349]:
Out[349]:
In [350]:
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)  # the columns argument is itself an Index, so its name labels the columns
df
Out[350]:
In [351]:
# Setting metadata
# Indexes are "mostly immutable", but their metadata can be set or changed:
# the index name, and for a MultiIndex the levels and labels.
# Use rename, set_names, set_levels, and set_labels to set these attributes.
# They return a copy by default; pass inplace=True to modify the index in place.
ind = pd.Index([1, 2, 3])
ind
Out[351]:
In [359]:
ind.name = "ind"
ind
ind.rename("apple")
ind
ind.name = "apple"
ind
ind.set_names(["bob"], inplace=True)
ind
Out[359]:
Out[359]:
Out[359]:
Out[359]:
Out[359]:
In [361]:
# set_names, set_levels, and set_labels also take an optional level argument
# (compare df.query('ilevel_0 == "red"') above, which refers to an index level by position)
index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index
Out[361]:
In [362]:
index.levels[1]
index.levels[0]
index.set_levels(["a", "b"], level=1)
Out[362]:
Out[362]:
Out[362]:
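# A small added sketch of the optional level argument on set_names (the new name
# 'colour' is purely illustrative); like set_levels, it returns a modified copy by default:
index.set_names('colour', level=0)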
In [366]:
# Set operations on Index objects
# Note: the resulting index from a set operation is sorted in ascending order
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
a | b  # union
a & b  # intersection
# a - b  # not supported
a.difference(b)            # difference (not symmetric)
a.symmetric_difference(b)  # symmetric difference, i.e. a.difference(b).union(b.difference(a))
a ^ b                      # also the symmetric difference
Out[366]:
Out[366]:
Out[366]:
Out[366]:
Out[366]:
In [367]:
# Missing values
# Note: an Index can hold missing values, but this should be avoided;
# it may give unexpected results, since some operations implicitly exclude missing values.
idx1 = pd.Index([1, np.nan, 3, 4])
idx1
Out[367]:
In [368]:
idx1.fillna(2)  # fill the missing label with 2
Out[368]:
In [372]:
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')])  # the missing value for datetimes is pd.NaT, not np.nan
idx2
Out[372]:
In [373]:
idx2.fillna(pd.Timestamp('2011-01-02'))
Out[373]:
In [398]:
# Set / Reset Index
# DataFrame
data = pd.DataFrame({"a": ["bar", "bar", "foo", "foo"],
                     "b": ["one", "two", "one", "two"],
                     "c": ["z", "y", "x", "w"],
                     "d": range(1, 5)})  # range works directly, no list() needed
data
Out[398]:
In [377]:
indexed1 = data.set_index("c") # 把df的某列设置为index索引
indexed1
Out[377]:
In [378]:
indexed2 = data.set_index(['a', 'b'])  # make several columns a MultiIndex
indexed2
Out[378]:
In [379]:
frame = data.set_index('c', drop=False)  # drop=False keeps the column in the frame (the default is to drop it)
frame
frame = frame.set_index(['a', 'b'], append=True)  # append=True adds to the existing index, giving a MultiIndex
frame
Out[379]:
Out[379]:
In [380]:
data.set_index('c', drop=False)  # returns a new frame; data itself is unchanged
data
data.set_index(['a', 'b'], inplace=True)  # inplace=True modifies data instead of returning a copy
data
Out[380]:
Out[380]:
Out[380]:
In [381]:
# Reset the index
# Note: the reset_index method used to be called delevel, which is now deprecated
data
data.index.name   # shows nothing here (None); a MultiIndex keeps its level names in .names
data.index.names  # the names of the index levels
data.reset_index()  # the index levels become ordinary columns, named after the levels
Out[381]:
Out[381]:
Out[381]:
In [382]:
frame
frame.reset_index(level=1)  # for a MultiIndex, reset only the given level rather than all of them
Out[382]:
Out[382]:
In [385]:
# reset_index also takes a drop parameter: if True, the index level is discarded instead of becoming a data column
frame
frame.reset_index(level=2, drop=True)
Out[385]:
Out[385]:
In [399]:
# Adding an ad hoc index
# If you create an index yourself, you can just assign it to the index field
index = pd.MultiIndex.from_product([range(2), ['one', 'two']], names=['first', 'second'])
index
data
data.index = index  # note: the index length must match the length of the frame
data
Out[399]:
Out[399]:
Out[399]:
In [ ]:
# Returning a view versus a copy
# Avoid chained indexing, i.e. several consecutive [], as explained earlier
# Why does assignment fail when using chained indexing?
# It comes down to how the Python interpreter evaluates the two forms.
# Form 1: not chained
dfmi.loc[:, ('one', 'second')] = value
# becomes, as Python sees it:
dfmi.loc.__setitem__((slice(None), ('one', 'second')), value)  # a single, direct set
# Of course, dfmi.loc.__getitem__(idx) may be a view or a copy of dfmi.
# Form 2: chained
dfmi['one']['second'] = value  # pandas raises SettingWithCopyWarning: the outcome is unpredictable
# becomes, as Python sees it:
dfmi.__getitem__('one').__setitem__('second', value)  # a get, then a set; the get may have returned a copy
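# A runnable form of the recommended, non-chained assignment above (0 is an arbitrary
# value chosen for illustration); dfmi was built earlier in this notebook:
dfmi.loc[:, ('one', 'second')] = 0
dfmi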
In [ ]:
# SettingWithCopy can also appear when there is no obvious chained indexing, for example:
def do_something(df):
    foo = df[['bar', 'baz']]  # Is foo a view? A copy? Nobody knows!
    # ... many lines here ...
    foo['quux'] = value  # We don't know whether this will modify df or not! (an "implicit" chain)
    return foo
In [400]:
# Evaluation order matters
# The SettingWithCopyWarning raised by chained assignment can be controlled via the
# mode.chained_assignment option (None to silence it, 'warn' to warn, 'raise' to raise).
dfb = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c': np.arange(7)})
dfb
Out[400]:
In [401]:
# This will show the SettingWithCopyWarning
# but the frame values will be set
dfb['c'][dfb.a.str.startswith('o')] = 42
dfb
Out[401]:
In [ ]:
# In the linked docs, "this however is operating on a copy and will not work" refers to the
# reversed chained assignment (select rows first, then assign to a column), shown below.
pd.set_option('mode.chained_assignment', 'warn')  # 'warn' is the default level for this option
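# The corresponding example from the linked pandas docs: the first [] returns a copy,
# so the assignment never reaches dfb.
dfb[dfb.a.str.startswith('o')]['c'] = 42
dfb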
In [402]:
# A chained assignment problem can also crop up when setting in a mixed-dtype frame.
# Note: these setting rules apply to all of .loc/.iloc
# This is the correct access method
dfc = pd.DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
dfc
dfc.loc[0, 'A'] = 11
dfc
Out[402]:
Out[402]:
In [403]:
dfc = dfc.copy()
dfc
dfc['A'][0] = 111  # This can work at times, but is not guaranteed, and so should be avoided
dfc
Out[403]:
Out[403]:
In [ ]:
pd.set_option('mode.chained_assignment', 'raise')  # with 'raise', chained assignment raises an error instead of warning
In [ ]:
# http://pandas.pydata.org/pandas-docs/stable/indexing.html -- end of document
# 2018-02-19