# Operations / manipulation
# Series
# DataFrame
# Print column names
print(df.columns)
# Number of rows
print(len(df))
# Filter rows by a column condition (boolean mask)
cod = df["label"] == "无关"
df = df[cod]
# Iterate over rows
for idx, row in df.iterrows():
    # BUG FIX: the loop variable is `idx` (the original referenced an
    # undefined name `index`), and the print must be indented inside the loop.
    print(f"Row {idx}: Name={row['Name']}, Age={row['Age']}, Occupation={row['Occupation']}")
# NaN checks (element-wise boolean Series)
df["mark"].isna()
df["mark"].notna()
# Row-wise operation: build col3 from col1 and col2 of each row
df['col3'] = df.apply(lambda row: combine_cols(row['col1'], row['col2']), axis=1)
# Delete a column in place
del df['col2']
# Write CSV without the index column
df.to_csv('output.csv', index=False)
# 运算、操作
两个Series或者两个DataFrame进行算术运算时,返回的是两个对象的并集(索引),缺失值或者无法计算的位置将以NaN进行替代。
In [61]: s1 Out[61]: a 4.2 c 2.6 d 5.4 e -1.9 dtype: float64 In [62]: s2 Out[62]: a -2.3 c 1.2 e 5.6 f 7.2 g 3.4 dtype: float64 In [63]: s1+s2 Out[63]: a 1.9 c 3.8 d NaN e 3.7 f NaN g NaN dtype: float64在运算过程中,可以使用填充值-如果某一方有值,另一方没有的话,将没有的那方的值填充为指定的参数值。
In [71]: df1.add(df2, fill_value=0) Out[71]: b c d e five 6.0 NaN 7.0 8.0 one 0.0 1.0 2.0 NaN six 9.0 NaN 10.0 11.0 three 9.0 7.0 12.0 5.0 two 3.0 4.0 6.0 2.0 In [74]: df1.reindex(columns=df2.columns, fill_value=0) # 也可以这么干 Out[74]: b d e one 0 2 0 two 3 5 0 # 类似的还有 # add:加法 # sub:减法 # div:除法 # floordiv:整除 # mul:乘法 # pow:幂次方使用drop方法可以删除Series的一个元素或者DataFrame的一行(默认)或一列(axis=1),指定inplace则是原地修改
df.drop(2) # 删除索引标签为2的那一行(并非第二行) df.drop('two',axis = 1) # 指定删除列,而不是默认的行 df.drop(2,inplace=True) # 原地修改原数据。可以使用replace将pandas对象中的指定值替换为别的值
df.replace(4, NA) # 将4替换为缺失值 df.replace([3,4], NA) # 将3和4都替换为缺失值 df.replace([3,4], [NA,0]) # 3和4分别替换为缺失值和0 df.replace({3:NA,4:0}) # 参数的字典形式一些Numpy的通用函数对Pandas对象也有效
In [91]: df = pd.DataFrame(np.random.randn(4,3), columns=list('bde'),index = ['one','two','three','four']) In [92]: df Out[92]: b d e one -0.522310 0.636599 0.992393 two 0.572624 -0.451550 -1.935332 three 0.021926 0.056706 -0.267661 four -2.718122 -0.740140 -1.565448 In [93]: np.abs(df) Out[93]: b d e one 0.522310 0.636599 0.992393 two 0.572624 0.451550 1.935332 three 0.021926 0.056706 0.267661 four 2.718122 0.740140 1.565448可以自定义处理函数,然后使用apply方法对行列进行应用
apply默认对DataFrame的每一列进行操作(或者说DataFrame的核心思想就是对列进行操作),设置axis='columns'可以变成对每一行应用。
In [94]: f = lambda x: x.max() - x.min() In [95]: df.apply(f) Out[95]: b 3.290745 d 1.376740 e 2.927725 dtype: float64DataFrame的applymap以及Series的map。它们逐一对每个元素进行操作,而不是整行整列的操作。
In [99]: f3 = lambda x: '%.2f' % x In [100]: df.applymap(f3) Out[100]: b d e one -0.52 0.64 0.99 two 0.57 -0.45 -1.94 three 0.02 0.06 -0.27 four -2.72 -0.74 -1.57
使用sort_index方法根据索引排序
- DataFrame中默认按行索引排序,设置axis=1指定按列索引排序。
- 默认升序,设置ascending=False指定降序排序
In [104]: s Out[104]: d 0 a 1 b 2 c 3 dtype: int64 In [105]: s.sort_index() # 根据索引的字母序列排序 Out[105]: a 1 b 2 c 3 d 0 dtype: int64使用sort_values方法根据元素值进行排序
Nan值会自动排序到最后
DataFrame需要指定by参数说明根据哪一列(或哪几列)进行排序
df2.sort_values(by='b') df2.sort_values(by=['a','b'])
In [113]: s= pd.Series([4, 7,-3,2]) In [114]: s.sort_values() Out[114]: 2 -3 3 2 0 4 1 7 dtype: int64 In [120]: df2 Out[120]: b a 0 4 0 1 7 1 2 -3 0 3 2 1 In [121]: df2.sort_values(by='b') # 根据某一列里的元素值进行排序 Out[121]: b a 2 -3 0 3 2 1 0 4 0 1 7 1 In [122]: df2.sort_values(by=['a','b']) # 根据某些列进行排序 Out[122]: b a 2 -3 0 0 4 0 3 2 1 1 7 1排名
- 以升序排名为例
- 所有数中最小的数排为1.0
- 按数值大小依此类推,依次安排2.0、3.0、4.0等位次
- 如果有重复的数,则重复的排名相加除以重复的个数,得出一个排名
- 重复的数后面的排名,接着排
In [123]: s = pd.Series([7,-5,7,4,2,0,4]) In [124]: s.rank() Out[124]: 0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64 # 排序规则 # -5最小,给排名1.0 # 0其次,排2.0 # 2再次,排3.0 # 有2个4,于是4.0+5.0等于9,再除个数2,最终排名4.5。4.0和5.0两个排名并未使用。 # 又有2个7,于是6.0+7.0等于13,再除2,最后排名6.5可以按行或列进行排名
df.rank(axis='columns')
离散化和分箱
使用cut方法根据分箱区间将数据进行分组
设置right=False变成左闭右开,默认左开右闭
可以提供分箱的区间定义或者直接要求分隔成整数个等分区间
- 即分箱区间要么指定,要么均匀大小
In [93]: bins = [18,25,35,60,100] In [94]: cats = pd.cut(ages,bins) In [95]: cats Out[95]: [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]] Length: 12 Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]] In [96]: cats.codes Out[96]: array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8) In [97]: cats.categories Out[97]: IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]] closed='right', dtype='interval[int64]') In [98]: cats.describe Out[98]: <bound method Categorical.describe of [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]] Length: 12 Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]> In [99]: pd.value_counts(cats) # 各个箱子的数量 Out[99]: (18, 25] 5 (35, 60] 3 (25, 35] 3 (60, 100] 1 dtype: int64 # 均匀大小区间 In [101]: d =np.random.rand(20) In [102]: d Out[102]: array([0.83732945, 0.0850416 , 0.66540597, 0.90479238, 0.99222014, 0.39409122, 0.91896172, 0.87163655, 0.31374598, 0.27726111, 0.7716572 , 0.79131961, 0.42805445, 0.29934685, 0.19077374, 0.79701771, 0.93789892, 0.93536338, 0.32299602, 0.305671 ]) In [103]: pd.cut(d, 4, precision=2) # 精度限制在两位 Out[103]: [(0.77, 0.99], (0.084, 0.31], (0.54, 0.77], (0.77, 0.99], (0.77, 0.99], ..., (0.77, 0.99], (0.77, 0.99], (0.77, 0.99], (0.31, 0.54], (0.084, 0.31]] Length: 20 Categories (4, interval[float64]): [(0.084, 0.31] < (0.31, 0.54] < (0.54, 0.77] < (0.77, 0.99]]使用qcut方法指定每个箱子中的元素个数
- 可以指定0-1之间的分位数
In [104]: data = np.random.randn(1000) In [105]: cats = pd.qcut(data,4) In [106]: cats Out[106]: [(0.644, 2.83], (-0.0344, 0.644], (-0.0344, 0.644], (-0.734, -0.0344], (-0.734, -0.0344], ..., (-3.327, -0.734], (-0.734, -0.0344], (0.644, 2.83], (-0.734, -0.0344], (-0.0344, 0.644]] Length: 1000 Categories (4, interval[float64]): [(-3.327, -0.734] < (-0.734, -0.0344] < (-0.0344, 0.644] < (0.644, 2.83]] In [108]: pd.value_counts(cats) # 各箱子中的元素个数相同 Out[108]: (0.644, 2.83] 250 (-0.0344, 0.644] 250 (-0.734, -0.0344] 250 (-3.327, -0.734] 250 dtype: int64
随机和抽样
利用permutation生成一个随机序列
In [134]: order = np.random.permutation(5) # 5个数 In [135]: order Out[135]: array([3, 4, 1, 2, 0])利用take实现抽取操作-传入序列
In [136]: df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4))) In [138]: df.take(order) # Out[138]: 0 1 2 3 3 12 13 14 15 4 16 17 18 19 1 4 5 6 7 2 8 9 10 11 0 0 1 2 3利用sample方法实现抽样操作
- 默认不重复抽样,设置replace=True可变为重复抽样
In [140]: df.sample(n=3) Out[140]: 0 1 2 3 0 0 1 2 3 4 16 17 18 19 1 4 5 6 7 # 数据不够时会报错- 可以设置axis=1变为对列抽样
merge方法将两个pandas对象连接在一起,类似SQL的连接操作。
- 默认情况下,它执行的是内连接(交集),也就是两个对象的交集。通过参数how,还可以指定外连接(并集)、左连接(左边对象全部保留)和右连接(右边对象全部保留)。
- 参数on指定在哪个键上连接,参数
left_on和right_on分别指定左右对象的连接键。
In [23]: df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], ...: 'data1': range(7)}) ...: In [24]: df2 = pd.DataFrame({'key': ['a', 'b', 'd'], ...: 'data2': range(3)}) ...: In [25]: df1 Out[25]: key data1 0 b 0 1 b 1 2 a 2 3 c 3 4 a 4 5 a 5 6 b 6 In [26]: df2 Out[26]: key data2 0 a 0 1 b 1 2 d 2 In [27]: pd.merge(df1,df2) # 默认内链接,并智能地查找连接的键 Out[27]: key data1 data2 0 b 0 1 1 b 1 1 2 b 6 1 3 a 2 0 4 a 4 0 5 a 5 0 In [28]: pd.merge(df1,df2,on='key') # 最好是显式地指定连接的键 Out[28]: key data1 data2 0 b 0 1 1 b 1 1 2 b 6 1 3 a 2 0 4 a 4 0 5 a 5 0 In [30]: pd.merge(df1, df2, how='outer') # 外连接 Out[30]: key data1 data2 0 b 0.0 1.0 1 b 1.0 1.0 2 b 6.0 1.0 3 a 2.0 0.0 4 a 4.0 0.0 5 a 5.0 0.0 6 c 3.0 NaN 7 d NaN 2.0 In [31]: pd.merge(df1, df2, how='left') # 左连接 Out[31]: key data1 data2 0 b 0 1.0 1 b 1 1.0 2 a 2 0.0 3 c 3 NaN 4 a 4 0.0 5 a 5 0.0 6 b 6 1.0 In [32]: pd.merge(df1, df2, how='right') #右连接 Out[32]: key data1 data2 0 b 0.0 1 1 b 1.0 1 2 b 6.0 1 3 a 2.0 0 4 a 4.0 0 5 a 5.0 0 6 d NaN 2 In [33]: df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], ...: 'data1': range(7)}) ...: In [34]: df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], ...: 'data2': range(3)}) ...: In [35]: pd.merge(df3, df4, left_on='lkey', right_on='rkey') # 指定两边的键 Out[35]: lkey data1 rkey data2 0 b 0 b 1 1 b 1 b 1 2 b 6 b 1 3 a 2 a 0 4 a 4 a 0 5 a 5 a 0- 可以同时指定多个键进行链接
In [36]: left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'], ...: 'key2': ['one', 'two', 'one'], ...: 'lval': [1, 2, 3]}) In [37]: right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'], ...: 'key2': ['one', 'one', 'one', 'two'], ...: 'rval': [4, 5, 6, 7]}) In [38]: pd.merge(left, right, on=['key1', 'key2'], how='outer') Out[38]: key1 key2 lval rval 0 foo one 1.0 4.0 1 foo one 1.0 5.0 2 foo two 2.0 NaN 3 bar one 3.0 6.0 4 bar two NaN 7.0concat方法可以实现对象在轴向的的粘合或者堆叠。
In [55]: s1 = pd.Series([0, 1], index=['a', 'b']) In [56]: s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e']) In [57]: s3 = pd.Series([5, 6], index=['f', 'g']) In [58]: pd.concat([s1, s2, s3]) # 要以列表的方式提供参数 Out[58]: a 0 b 1 c 2 d 3 e 4 f 5 g 6 dtype: int64 In [59]: pd.concat([s1, s2, s3], axis=1) # 横向堆叠,但出现警告信息 C:\ProgramData\Anaconda3\Scripts\ipython:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. ...... Out[59]: 0 1 2 a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0 In [60]: pd.concat([s1, s2, s3], axis=1,sort=True) # 按人家的要求做 Out[60]: 0 1 2 a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0- DataFrame中默认按行向下进行合并,设置axis可以按列合并
In [66]: df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], ...: columns=['one', 'two']) ...: In [67]: df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], ...: columns=['three', 'four']) ...: In [68]: df1 Out[68]: one two a 0 1 b 2 3 c 4 5 In [69]: df2 Out[69]: three four a 5 6 c 7 8 In [71]: pd.concat([df1, df2], sort=True) Out[71]: four one three two a NaN 0.0 NaN 1.0 b NaN 2.0 NaN 3.0 c NaN 4.0 NaN 5.0 a 6.0 NaN 5.0 NaN c 8.0 NaN 7.0 NaN In [72]: pd.concat([df1, df2], axis=1, sort=True) Out[72]: one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0-
combine_first 根据传入的对象来修补调用对象的缺失值。
In [81]: df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan], ...: 'b': [np.nan, 2., np.nan, 6.], ...: 'c': range(2, 18, 4)}) ...: In [82]: df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.], ...: 'b': [np.nan, 3., 4., 6., 8.]}) ...: In [83]: df1 Out[83]: a b c 0 1.0 NaN 2 1 NaN 2.0 6 2 5.0 NaN 10 3 NaN 6.0 14 In [84]: df2 Out[84]: a b 0 5.0 NaN 1 4.0 3.0 2 NaN 4.0 3 3.0 6.0 4 7.0 8.0 In [85]: df1.combine_first(df2) Out[85]: a b c 0 1.0 NaN 2.0 1 4.0 2.0 6.0 2 5.0 4.0 10.0 3 3.0 6.0 14.0 4 7.0 8.0 NaNnp.where(pd.isnull(a), b, a)- 首先去
pd.isnull(a)中逐个判断元素,如果是True,就从b里取值,否则从a里取值,得到最终结果。