# Operations

# Series

# DataFrame

# print the column names
print(df.columns)
# number of rows
print(len(df))
# filter rows by a column condition
cod = df["label"] == "无关"
df = df[cod]
# iterate over rows
for idx, row in df.iterrows():
    print(f"Row {idx}: Name={row['Name']}, Age={row['Age']}, Occupation={row['Occupation']}")
# test for NaN / non-NaN values
df["mark"].isna()
df["mark"].notna()
# row-wise operation
df['col3'] = df.apply(lambda row: combine_cols(row['col1'], row['col2']), axis=1)
# delete a column
del df['col2']
# write CSV without the index column
df.to_csv('output.csv', index=False)
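
A minimal, self-contained version of the snippets above; the toy data and the body of combine_cols are made up for illustration, not fixed by these notes.

import pandas as pd

# toy data matching the column names used above
df = pd.DataFrame({
    "Name": ["Ann", "Bob"],
    "Age": [30, 25],
    "Occupation": ["dev", "ops"],
    "label": ["无关", "相关"],
    "mark": [1.0, None],
    "col1": ["a", "b"],
    "col2": ["x", "y"],
})

def combine_cols(v1, v2):
    # hypothetical helper used by the apply() call above
    return f"{v1}-{v2}"

df["col3"] = df.apply(lambda row: combine_cols(row["col1"], row["col2"]), axis=1)
print(df[df["label"] == "无关"])  # rows whose label equals "无关"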

# Operations

  • When two Series or two DataFrames take part in an arithmetic operation, the result is indexed by the union of the two indexes; positions that are missing on one side (or cannot be computed) are filled with NaN.

    In [61]: s1
    Out[61]:
    a    4.2
    c    2.6
    d    5.4
    e   -1.9
    dtype: float64
    
    In [62]: s2
    Out[62]:
    a   -2.3
    c    1.2
    e    5.6
    f    7.2
    g    3.4
    dtype: float64
    
    In [63]: s1+s2
    Out[63]:
    a    1.9
    c    3.8
    d    NaN
    e    3.7
    f    NaN
    g    NaN
    dtype: float64
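
    A quick way to reconstruct s1 and s2 and reproduce the alignment above (values copied from the outputs shown):

    import pandas as pd
    s1 = pd.Series([4.2, 2.6, 5.4, -1.9], index=list("acde"))
    s2 = pd.Series([-2.3, 1.2, 5.6, 7.2, 3.4], index=list("acefg"))
    print(s1 + s2)  # d, f and g appear on only one side, so they come out as NaN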
    
  • Arithmetic methods accept a fill value: when a label exists on one side but not the other, the missing side is treated as the given fill_value before computing the result.

    In [71]: df1.add(df2, fill_value=0)
    Out[71]:
             b    c     d     e
    five   6.0  NaN   7.0   8.0
    one    0.0  1.0   2.0   NaN
    six    9.0  NaN  10.0  11.0
    three  9.0  7.0  12.0   5.0
    two    3.0  4.0   6.0   2.0
    
    In [74]: df1.reindex(columns=df2.columns, fill_value=0) # another way to get the same effect
    Out[74]:
           b  d  e
    one    0  2  0
    two    3  5  0
    
    # Similar arithmetic methods:
    # add: addition
    # sub: subtraction
    # div: division
    # floordiv: floor division
    # mul: multiplication
    # pow: exponentiation
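
    A self-contained illustration of fill_value and the method-style operators (this df1/df2 pair is made up here, not the one shown in the transcript above):

    import numpy as np
    import pandas as pd
    df1 = pd.DataFrame(np.arange(6.).reshape(2, 3), columns=list("abc"))
    df2 = pd.DataFrame(np.arange(8.).reshape(2, 4), columns=list("abcd"))
    df1.add(df2, fill_value=0)  # column 'd' exists only in df2; the missing df1 side counts as 0
    df2.rsub(df1)               # every method also has an r-prefixed variant with flipped operands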
    
    
  • The drop method removes an element from a Series, or a row (the default) or a column (axis=1) from a DataFrame; passing inplace=True modifies the object in place instead of returning a copy. A runnable sketch follows the calls below.

    df.drop(2)  # drop the row whose index label is 2
    df.drop('two', axis=1)  # drop a column instead of the default row
    df.drop(2, inplace=True)  # modify the original object in place
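
    A runnable sketch of these calls (throwaway objects, since df is not defined in the snippet above):

    import pandas as pd
    s = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
    s.drop('b')                           # drop a single Series element by label
    df = pd.DataFrame({'one': [1, 2, 3], 'two': [4, 5, 6]})
    df.drop(1)                            # drop the row whose index label is 1
    df.drop('two', axis=1, inplace=True)  # drop a column, modifying df in place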
    
  • Use replace to substitute specified values in a pandas object with other values (runnable sketch below).

    df.replace(4, NA)  # replace 4 with a missing value (NA stands for np.nan here)
    df.replace([3,4], NA) # replace both 3 and 4 with missing values
    df.replace([3,4], [NA,0]) # replace 3 with a missing value and 4 with 0
    df.replace({3:NA,4:0})  # the same mapping, written as a dict
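
    The same calls on a concrete Series, writing NA out as np.nan:

    import numpy as np
    import pandas as pd
    s = pd.Series([1, 3, 4, 4, 7])
    s.replace(4, np.nan)             # one value -> NaN
    s.replace([3, 4], np.nan)        # several values -> NaN
    s.replace([3, 4], [np.nan, 0])   # element-wise: 3 -> NaN, 4 -> 0
    s.replace({3: np.nan, 4: 0})     # the same mapping in dict form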
    
  • Many NumPy universal functions (element-wise ufuncs) also work on pandas objects.

    In [91]: df = pd.DataFrame(np.random.randn(4,3), columns=list('bde'),index = ['one','two','three','four'])
    
    In [92]: df
    Out[92]:
                  b         d         e
    one   -0.522310  0.636599  0.992393
    two    0.572624 -0.451550 -1.935332
    three  0.021926  0.056706 -0.267661
    four  -2.718122 -0.740140 -1.565448
    
    In [93]: np.abs(df)
    Out[93]:
                  b         d         e
    one    0.522310  0.636599  0.992393
    two    0.572624  0.451550  1.935332
    three  0.021926  0.056706  0.267661
    four   2.718122  0.740140  1.565448
    
    • You can define your own function and apply it to rows or columns with the apply method.

      • By default apply works column by column (you could say that operating on columns is the core idiom of a DataFrame); set axis='columns' to apply the function to each row instead (see the one-liner after the example).

        In [94]: f = lambda x: x.max() - x.min()
        
        In [95]: df.apply(f)
        Out[95]:
        b    3.290745
        d    1.376740
        e    2.927725
        dtype: float64
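
        The same function applied across each row rather than each column (df and f as defined above):

        df.apply(f, axis='columns')   # max minus min, computed per row
        # df.apply(f, axis=1) is equivalent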
        
      • DataFrame.applymap and Series.map also take a function, but they work element by element rather than on whole rows or columns.

        In [99]: f3 = lambda x: '%.2f' % x
        
        In [100]: df.applymap(f3)
        Out[100]:
                   b      d      e
        one    -0.52   0.64   0.99
        two     0.57  -0.45  -1.94
        three   0.02   0.06  -0.27
        four   -2.72  -0.74  -1.57
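
        Series.map does the same element-wise job on a single Series (the extra Series below is made up; df and f3 are from the example above):

        df['e'].map(f3)                        # format one column of the df above
        pd.Series([3.14159, 2.71828]).map(f3)  # or any other Series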
        
  • Use the sort_index method to sort by index labels.

    • For a DataFrame the row index is sorted by default; set axis=1 to sort the column labels instead.
    • Sorting is ascending by default; set ascending=False for descending order. (A DataFrame sketch follows the Series example.)
    In [104]: s
    Out[104]:
    d    0
    a    1
    b    2
    c    3
    dtype: int64
    
    In [105]: s.sort_index() # sort by the alphabetical order of the index labels
    Out[105]:
    a    1
    b    2
    c    3
    d    0
    dtype: int64
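
    The same method on a DataFrame, covering the two bullet points above (data made up):

    import numpy as np
    import pandas as pd
    df = pd.DataFrame(np.arange(8).reshape(2, 4),
                      index=['three', 'one'],
                      columns=['d', 'a', 'b', 'c'])
    df.sort_index()                         # sort rows by their index labels
    df.sort_index(axis=1)                   # sort the column labels instead
    df.sort_index(axis=1, ascending=False)  # descending order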
    
  • Use the sort_values method to sort by the values themselves.

    • NaN values are automatically pushed to the end (see the short sketch after the examples below).

    • For a DataFrame, the by parameter says which column (or columns) to sort by.

      df2.sort_values(by='b')
      df2.sort_values(by=['a','b'])
      
    In [113]: s= pd.Series([4, 7,-3,2])
    
    In [114]: s.sort_values()
    Out[114]:
    2   -3
    3    2
    0    4
    1    7
    dtype: int64
    
    In [120]: df2
    Out[120]:
       b  a
    0  4  0
    1  7  1
    2 -3  0
    3  2  1
    
    In [121]: df2.sort_values(by='b')  # sort by the values in one column
    Out[121]:
       b  a
    2 -3  0
    3  2  1
    0  4  0
    1  7  1
    
    In [122]: df2.sort_values(by=['a','b']) # sort by several columns
    Out[122]:
       b  a
    2 -3  0
    0  4  0
    3  2  1
    1  7  1
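
    As noted above, missing values go to the end regardless of sort direction; a short sketch with made-up data:

    import numpy as np
    import pandas as pd
    s = pd.Series([4, np.nan, 7, np.nan, -3, 2])
    s.sort_values()                      # the NaN values end up last
    s.sort_values(na_position='first')   # or force them to the front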
    
  • Ranking

      1. Take ascending ranking as the example.
      2. The smallest value receives rank 1.0.
      3. Larger values receive 2.0, 3.0, 4.0 and so on.
      4. Tied values share a rank: the ranks they would occupy are summed and divided by the number of ties (the default method='average').
      5. Values after a tie continue from the ranks the tie consumed.
      In [123]: s = pd.Series([7,-5,7,4,2,0,4])
      
      In [124]: s.rank()
      Out[124]:
      0    6.5
      1    1.0
      2    6.5
      3    4.5
      4    3.0
      5    2.0
      6    4.5
      dtype: float64
      
      # How the ranks were assigned:
      # -5 is the smallest, so it gets rank 1.0
      # 0 is next: rank 2.0
      # 2 follows: rank 3.0
      # there are two 4s, so 4.0 + 5.0 = 9, divided by 2 gives 4.5 for both; the ranks 4.0 and 5.0 themselves are never used
      # likewise there are two 7s: 6.0 + 7.0 = 13, divided by 2 gives 6.5
      
    • Ranking can be done over rows or over columns (runnable sketch below).

      df.rank(axis='columns')
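
      A runnable sketch (the frame is made up; method='average' is the default that produces the tie behaviour described above):

      import pandas as pd
      df = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
      df.rank(axis='columns')   # rank the values within each row
      df.rank(method='min')     # ties all receive the lowest rank instead of the average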
      
  • Discretization and binning

    • The cut method groups data according to bin edges.

      • Bins are open on the left and closed on the right by default, i.e. (a, b]; set right=False for [a, b) instead (see the one-liners at the end of the examples).

      • You can either supply the bin edges yourself or simply ask for an integer number of equal-width bins.

        • That is, the bins are either explicitly specified or of uniform width.
       # ages: a list of 12 ages, e.g. [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32] (consistent with the codes output below)
       In [93]: bins = [18,25,35,60,100]
      In [94]: cats = pd.cut(ages,bins)
      In [95]: cats
      Out[95]:
      [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
      Length: 12
      Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
      In [96]: cats.codes
      Out[96]: array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
      
      In [97]: cats.categories
      Out[97]:
      IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
                    closed='right',
                    dtype='interval[int64]')
      
       In [98]: cats.describe   # note: without parentheses this only returns the bound method; call cats.describe() for the actual summary
      Out[98]:
      <bound method Categorical.describe of [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
      Length: 12
      Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]>
      
       In [99]: pd.value_counts(cats)  # counts per bin
      Out[99]: 
      (18, 25]     5
      (35, 60]     3
      (25, 35]     3
      (60, 100]    1
      dtype: int64
      
       # equal-width bins
      In [101]: d =np.random.rand(20)
      
      In [102]: d
      Out[102]:
      array([0.83732945, 0.0850416 , 0.66540597, 0.90479238, 0.99222014,
             0.39409122, 0.91896172, 0.87163655, 0.31374598, 0.27726111,
             0.7716572 , 0.79131961, 0.42805445, 0.29934685, 0.19077374,
             0.79701771, 0.93789892, 0.93536338, 0.32299602, 0.305671  ])
      
       In [103]: pd.cut(d, 4, precision=2) # limit bin-edge precision to two decimal places
      Out[103]:
      [(0.77, 0.99], (0.084, 0.31], (0.54, 0.77], (0.77, 0.99], (0.77, 0.99], ..., (0.77, 0.99], (0.77, 0.99], (0.77, 0.99], (0.31, 0.54], (0.084, 0.31]]
      Length: 20
      Categories (4, interval[float64]): [(0.084, 0.31] < (0.31, 0.54] < (0.54, 0.77] < (0.77, 0.99]]
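
       One-liners for the options mentioned above, reusing ages and bins from the example (the label names are just illustrative):

       pd.cut(ages, bins, right=False)   # intervals become [18, 25), [25, 35), ...
       pd.cut(ages, bins, labels=['youth', 'young', 'middle', 'senior'])   # attach names to the bins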
      
    • The qcut method cuts by sample quantiles, so each bin ends up holding (roughly) the same number of elements.

      • You can also supply your own quantiles between 0 and 1 (see the one-liner after the example).
      In [104]: data = np.random.randn(1000)
      
      In [105]: cats = pd.qcut(data,4)
      
      In [106]: cats
      Out[106]:
      [(0.644, 2.83], (-0.0344, 0.644], (-0.0344, 0.644], (-0.734, -0.0344], (-0.734, -0.0344], ..., (-3.327, -0.734], (-0.734, -0.0344], (0.644, 2.83], (-0.734, -0.0344], (-0.0344, 0.644]]
      Length: 1000
      Categories (4, interval[float64]): [(-3.327, -0.734] < (-0.734, -0.0344] < (-0.0344, 0.644] < (0.644, 2.83]]
      
       In [108]: pd.value_counts(cats) # every bin holds the same number of elements
      Out[108]:
      (0.644, 2.83]        250
      (-0.0344, 0.644]     250
      (-0.734, -0.0344]    250
      (-3.327, -0.734]     250
      dtype: int64
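
       Passing explicit quantiles between 0 and 1, as mentioned above (data as defined in In [104]):

       pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])   # custom quantile cut points instead of four equal-sized bins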
      
  • Randomization and sampling

    • Use np.random.permutation to generate a random ordering.

      In [134]: order = np.random.permutation(5) # a random permutation of 0..4
      
      In [135]: order
      Out[135]: array([3, 4, 1, 2, 0])
      
    • Use take with such a sequence to pick out (and reorder) rows.

      In [136]: df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
      
      In [138]: df.take(order)  # rows reordered according to the permutation
      Out[138]:
          0   1   2   3
      3  12  13  14  15
      4  16  17  18  19
      1   4   5   6   7
      2   8   9  10  11
      0   0   1   2   3
      
    • Use the sample method to draw a random sample.

      • Sampling is without replacement by default; set replace=True to sample with replacement.
      In [140]: df.sample(n=3)
      Out[140]:
          0   1   2   3
      0   0   1   2   3
      4  16  17  18  19
      1   4   5   6   7
      # raises an error if n exceeds the number of rows (unless replace=True)
      
      • Set axis=1 to sample columns instead of rows (sketch below).
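
        Both options in one place (df is the 5x4 frame from In [136]):

        df.sample(n=10, replace=True)   # with replacement, so n may exceed the number of rows
        df.sample(n=2, axis=1)          # sample columns rather than rows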
  • The merge method joins two pandas objects together, much like a SQL join.

    • By default it performs an inner join, i.e. the intersection of the keys. The how parameter also allows an outer join (union of the keys), a left join (every row of the left object is kept) or a right join (every row of the right object is kept).
    • The on parameter names the key to join on; left_on and right_on name the join keys of the left and right objects separately.
    In [23]: df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        ...:                     'data1': range(7)})
        ...:
    
    In [24]: df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
        ...:                     'data2': range(3)})
        ...:
    
    In [25]: df1
    Out[25]:
      key  data1
    0   b      0
    1   b      1
    2   a      2
    3   c      3
    4   a      4
    5   a      5
    6   b      6
    
    In [26]: df2
    Out[26]:
      key  data2
    0   a      0
    1   b      1
    2   d      2
    
    In [27]: pd.merge(df1,df2) # inner join by default; the overlapping column name is picked as the key
    Out[27]:
      key  data1  data2
    0   b      0      1
    1   b      1      1
    2   b      6      1
    3   a      2      0
    4   a      4      0
    5   a      5      0
    
    In [28]: pd.merge(df1,df2,on='key') # better to name the join key explicitly
    Out[28]:
      key  data1  data2
    0   b      0      1
    1   b      1      1
    2   b      6      1
    3   a      2      0
    4   a      4      0
    5   a      5      0
    
    In [30]: pd.merge(df1, df2, how='outer') # outer join
    Out[30]:
      key  data1  data2
    0   b    0.0    1.0
    1   b    1.0    1.0
    2   b    6.0    1.0
    3   a    2.0    0.0
    4   a    4.0    0.0
    5   a    5.0    0.0
    6   c    3.0    NaN
    7   d    NaN    2.0
    
    In [31]: pd.merge(df1, df2, how='left')  # left join
    Out[31]:
      key  data1  data2
    0   b      0    1.0
    1   b      1    1.0
    2   a      2    0.0
    3   c      3    NaN
    4   a      4    0.0
    5   a      5    0.0
    6   b      6    1.0
    
    In [32]: pd.merge(df1, df2, how='right') # right join
    Out[32]:
      key  data1  data2
    0   b    0.0      1
    1   b    1.0      1
    2   b    6.0      1
    3   a    2.0      0
    4   a    4.0      0
    5   a    5.0      0
    6   d    NaN      2
    
    In [33]: df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        ...:                     'data1': range(7)})
        ...:
    
    In [34]: df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
        ...:                     'data2': range(3)})
        ...:
    
    In [35]: pd.merge(df3, df4, left_on='lkey', right_on='rkey') # name the key on each side
    Out[35]:
      lkey  data1 rkey  data2
    0    b      0    b      1
    1    b      1    b      1
    2    b      6    b      1
    3    a      2    a      0
    4    a      4    a      0
    5    a      5    a      0
    
    • Several keys can be specified at once.
    In [36]: left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
        ...:                      'key2': ['one', 'two', 'one'],
        ...:                      'lval': [1, 2, 3]})
    
    In [37]: right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
        ...:                       'key2': ['one', 'one', 'one', 'two'],
        ...:                       'rval': [4, 5, 6, 7]})
    
    In [38]: pd.merge(left, right, on=['key1', 'key2'], how='outer')
    Out[38]:
      key1 key2  lval  rval
    0  foo  one   1.0   4.0
    1  foo  one   1.0   5.0
    2  foo  two   2.0   NaN
    3  bar  one   3.0   6.0
    4  bar  two   NaN   7.0
    
  • The concat method glues or stacks objects along an axis.

    In [55]: s1 = pd.Series([0, 1], index=['a', 'b'])
    
    In [56]: s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
    
    In [57]: s3 = pd.Series([5, 6], index=['f', 'g'])
    
    In [58]: pd.concat([s1, s2, s3]) # the objects must be passed as a list
    Out[58]:
    a    0
    b    1
    c    2
    d    3
    e    4
    f    5
    g    6
    dtype: int64
    
    In [59]: pd.concat([s1, s2, s3], axis=1) # stack side by side; note the warning below
    C:\ProgramData\Anaconda3\Scripts\ipython:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
    of pandas will change to not sort by default.
    ......
    
    Out[59]:
         0    1    2
    a  0.0  NaN  NaN
    b  1.0  NaN  NaN
    c  NaN  2.0  NaN
    d  NaN  3.0  NaN
    e  NaN  4.0  NaN
    f  NaN  NaN  5.0
    g  NaN  NaN  6.0
    
    In [60]: pd.concat([s1, s2, s3], axis=1,sort=True) # pass sort=True explicitly to silence the warning
    Out[60]:
         0    1    2
    a  0.0  NaN  NaN
    b  1.0  NaN  NaN
    c  NaN  2.0  NaN
    d  NaN  3.0  NaN
    e  NaN  4.0  NaN
    f  NaN  NaN  5.0
    g  NaN  NaN  6.0
    
    • For DataFrames, concat stacks rows downward by default; set axis=1 to put the objects side by side instead.
    In [66]: df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
        ...:                    columns=['one', 'two'])
        ...:
    
    In [67]: df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
        ...:                    columns=['three', 'four'])
        ...:
    
    In [68]: df1
    Out[68]:
       one  two
    a    0    1
    b    2    3
    c    4    5
    
    In [69]: df2
    Out[69]:
       three  four
    a      5     6
    c      7     8
    
    
    In [71]: pd.concat([df1, df2], sort=True)
    
    Out[71]:
       four  one  three  two
    a   NaN  0.0    NaN  1.0
    b   NaN  2.0    NaN  3.0
    c   NaN  4.0    NaN  5.0
    a   6.0  NaN    5.0  NaN
    c   8.0  NaN    7.0  NaN
    
    In [72]: pd.concat([df1, df2], axis=1, sort=True)
    Out[72]:
       one  two  three  four
    a    0    1    5.0   6.0
    b    2    3    NaN   NaN
    c    4    5    7.0   8.0
    
  • combine_first patches missing values in the calling object with values from the object passed in.

    In [81]: df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
        ...:                     'b': [np.nan, 2., np.nan, 6.],
        ...:                     'c': range(2, 18, 4)})
        ...:
    
    In [82]: df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
        ...:                     'b': [np.nan, 3., 4., 6., 8.]})
        ...:
    
    In [83]: df1
    Out[83]:
         a    b   c
    0  1.0  NaN   2
    1  NaN  2.0   6
    2  5.0  NaN  10
    3  NaN  6.0  14
    
    In [84]: df2
    Out[84]:
         a    b
    0  5.0  NaN
    1  4.0  3.0
    2  NaN  4.0
    3  3.0  6.0
    4  7.0  8.0
    
    In [85]: df1.combine_first(df2)
    Out[85]:
         a    b     c
    0  1.0  NaN   2.0
    1  4.0  2.0   6.0
    2  5.0  4.0  10.0
    3  3.0  6.0  14.0
    4  7.0  8.0   NaN
    
  • np.where(pd.isnull(a), b, a)

    • pd.isnull(a) tests each element; wherever it is True the result takes its value from b, otherwise from a (sketch below).
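
    A small sketch of this pattern (a and b are made up; a.combine_first(b) achieves the same thing with index alignment):

    import numpy as np
    import pandas as pd
    a = pd.Series([np.nan, 2.5, 0.0, 3.5, np.nan], index=list('fedcb'))
    b = pd.Series([0., 1., 2., 3., 4.], index=list('fedcb'))
    np.where(pd.isnull(a), b, a)   # -> array([0. , 2.5, 0. , 3.5, 4. ])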