# Operations

# Series

# DataFrame

# print the column names
print(df.columns)
# number of rows
print(len(df))
# filter rows by a column condition
cod = df["label"] == "无关"
df = df[cod]
# iterate over rows
for idx, row in df.iterrows():
    print(f"Row {idx}: Name={row['Name']}, Age={row['Age']}, Occupation={row['Occupation']}")
# test for NaN / non-NaN values
df["mark"].isna()
df["mark"].notna()
# row-wise operation
df['col3'] = df.apply(lambda row: combine_cols(row['col1'], row['col2']), axis=1)
# delete a column
del df['col2']
# write CSV without the index column
df.to_csv('output.csv', index=False)
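
A minimal, self-contained version of the snippets above; the toy data and the body of combine_cols are made up for illustration, not fixed by these notes.

import pandas as pd

# toy data matching the column names used above
df = pd.DataFrame({
    "Name": ["Ann", "Bob"],
    "Age": [30, 25],
    "Occupation": ["dev", "ops"],
    "label": ["无关", "相关"],
    "mark": [1.0, None],
    "col1": ["a", "b"],
    "col2": ["x", "y"],
})

def combine_cols(v1, v2):
    # hypothetical helper used by the apply() call above
    return f"{v1}-{v2}"

df["col3"] = df.apply(lambda row: combine_cols(row["col1"], row["col2"]), axis=1)
print(df[df["label"] == "无关"])  # rows whose label equals "无关"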

# Operations

  • When two Series or two DataFrames take part in an arithmetic operation, the result is indexed by the union of the two indexes; positions that are missing on one side (or cannot be computed) are filled with NaN.

    In [61]: s1
    Out[61]:
    a    4.2
    c    2.6
    d    5.4
    e   -1.9
    dtype: float64
    
    In [62]: s2
    Out[62]:
    a   -2.3
    c    1.2
    e    5.6
    f    7.2
    g    3.4
    dtype: float64
    
    In [63]: s1+s2
    Out[63]:
    a    1.9
    c    3.8
    d    NaN
    e    3.7
    f    NaN
    g    NaN
    dtype: float64
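
    A quick way to reconstruct s1 and s2 and reproduce the alignment above (values copied from the outputs shown):

    import pandas as pd
    s1 = pd.Series([4.2, 2.6, 5.4, -1.9], index=list("acde"))
    s2 = pd.Series([-2.3, 1.2, 5.6, 7.2, 3.4], index=list("acefg"))
    print(s1 + s2)  # d, f and g appear on only one side, so they come out as NaN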
    
  • Arithmetic methods accept a fill value: when a label exists on one side but not the other, the missing side is treated as the given fill_value before computing the result.

    In [71]: df1.add(df2, fill_value=0)
    Out[71]:
             b    c     d     e
    five   6.0  NaN   7.0   8.0
    one    0.0  1.0   2.0   NaN
    six    9.0  NaN  10.0  11.0
    three  9.0  7.0  12.0   5.0
    two    3.0  4.0   6.0   2.0
    
    In [74]: df1.reindex(columns=df2.columns, fill_value=0) # another way to get the same effect
    Out[74]:
           b  d  e
    one    0  2  0
    two    3  5  0
    
    # Similar arithmetic methods:
    # add: addition
    # sub: subtraction
    # div: division
    # floordiv: floor division
    # mul: multiplication
    # pow: exponentiation
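
    A self-contained illustration of fill_value and the method-style operators (this df1/df2 pair is made up here, not the one shown in the transcript above):

    import numpy as np
    import pandas as pd
    df1 = pd.DataFrame(np.arange(6.).reshape(2, 3), columns=list("abc"))
    df2 = pd.DataFrame(np.arange(8.).reshape(2, 4), columns=list("abcd"))
    df1.add(df2, fill_value=0)  # column 'd' exists only in df2; the missing df1 side counts as 0
    df2.rsub(df1)               # every method also has an r-prefixed variant with flipped operands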
    
    
  • The drop method removes an element from a Series, or a row (the default) or a column (axis=1) from a DataFrame; passing inplace=True modifies the object in place instead of returning a copy. A runnable sketch follows the calls below.

    df.drop(2)  # drop the row whose index label is 2
    df.drop('two', axis=1)  # drop a column instead of the default row
    df.drop(2, inplace=True)  # modify the original object in place
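
    A runnable sketch of these calls (throwaway objects, since df is not defined in the snippet above):

    import pandas as pd
    s = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
    s.drop('b')                           # drop a single Series element by label
    df = pd.DataFrame({'one': [1, 2, 3], 'two': [4, 5, 6]})
    df.drop(1)                            # drop the row whose index label is 1
    df.drop('two', axis=1, inplace=True)  # drop a column, modifying df in place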
    
  • Use replace to substitute specified values in a pandas object with other values (runnable sketch below).

    df.replace(4, NA)  # replace 4 with a missing value (NA stands for np.nan here)
    df.replace([3,4], NA) # replace both 3 and 4 with missing values
    df.replace([3,4], [NA,0]) # replace 3 with a missing value and 4 with 0
    df.replace({3:NA,4:0})  # the same mapping, written as a dict
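
    The same calls on a concrete Series, writing NA out as np.nan:

    import numpy as np
    import pandas as pd
    s = pd.Series([1, 3, 4, 4, 7])
    s.replace(4, np.nan)             # one value -> NaN
    s.replace([3, 4], np.nan)        # several values -> NaN
    s.replace([3, 4], [np.nan, 0])   # element-wise: 3 -> NaN, 4 -> 0
    s.replace({3: np.nan, 4: 0})     # the same mapping in dict form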
    
  • Many NumPy universal functions (element-wise ufuncs) also work on pandas objects.

    In [91]: df = pd.DataFrame(np.random.randn(4,3), columns=list('bde'),index = ['one','two','three','four'])
    
    In [92]: df
    Out[92]:
                  b         d         e
    one   -0.522310  0.636599  0.992393
    two    0.572624 -0.451550 -1.935332
    three  0.021926  0.056706 -0.267661
    four  -2.718122 -0.740140 -1.565448
    
    In [93]: np.abs(df)
    Out[93]:
                  b         d         e
    one    0.522310  0.636599  0.992393
    two    0.572624  0.451550  1.935332
    three  0.021926  0.056706  0.267661
    four   2.718122  0.740140  1.565448
    
    • You can define your own function and apply it to rows or columns with the apply method.

      • By default apply works column by column (you could say that operating on columns is the core idiom of a DataFrame); set axis='columns' to apply the function to each row instead (see the one-liner after the example).

        In [94]: f = lambda x: x.max() - x.min()
        
        In [95]: df.apply(f)
        Out[95]:
        b    3.290745
        d    1.376740
        e    2.927725
        dtype: float64
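
        The same function applied across each row rather than each column (df and f as defined above):

        df.apply(f, axis='columns')   # max minus min, computed per row
        # df.apply(f, axis=1) is equivalent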
        
      • DataFrame.applymap and Series.map also take a function, but they work element by element rather than on whole rows or columns.

        In [99]: f3 = lambda x: '%.2f' % x
        
        In [100]: df.applymap(f3)
        Out[100]:
                   b      d      e
        one    -0.52   0.64   0.99
        two     0.57  -0.45  -1.94
        three   0.02   0.06  -0.27
        four   -2.72  -0.74  -1.57
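
        Series.map does the same element-wise job on a single Series (the extra Series below is made up; df and f3 are from the example above):

        df['e'].map(f3)                        # format one column of the df above
        pd.Series([3.14159, 2.71828]).map(f3)  # or any other Series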
        
  • Use the sort_index method to sort by index labels.

    • For a DataFrame the row index is sorted by default; set axis=1 to sort the column labels instead.
    • Sorting is ascending by default; set ascending=False for descending order. (A DataFrame sketch follows the Series example.)
    In [104]: s
    Out[104]:
    d    0
    a    1
    b    2
    c    3
    dtype: int64
    
    In [105]: s.sort_index() # sort by the alphabetical order of the index labels
    Out[105]:
    a    1
    b    2
    c    3
    d    0
    dtype: int64
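
    The same method on a DataFrame, covering the two bullet points above (data made up):

    import numpy as np
    import pandas as pd
    df = pd.DataFrame(np.arange(8).reshape(2, 4),
                      index=['three', 'one'],
                      columns=['d', 'a', 'b', 'c'])
    df.sort_index()                         # sort rows by their index labels
    df.sort_index(axis=1)                   # sort the column labels instead
    df.sort_index(axis=1, ascending=False)  # descending order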
    
  • Use the sort_values method to sort by the values themselves.

    • NaN values are automatically pushed to the end (see the short sketch after the examples below).

    • For a DataFrame, the by parameter says which column (or columns) to sort by.

      df2.sort_values(by='b')
      df2.sort_values(by=['a','b'])
      
    In [113]: s= pd.Series([4, 7,-3,2])
    
    In [114]: s.sort_values()
    Out[114]:
    2   -3
    3    2
    0    4
    1    7
    dtype: int64
    
    In [120]: df2
    Out[120]:
       b  a
    0  4  0
    1  7  1
    2 -3  0
    3  2  1
    
    In [121]: df2.sort_values(by='b')  # sort by the values in one column
    Out[121]:
       b  a
    2 -3  0
    3  2  1
    0  4  0
    1  7  1
    
    In [122]: df2.sort_values(by=['a','b']) # sort by several columns
    Out[122]:
       b  a
    2 -3  0
    0  4  0
    3  2  1
    1  7  1
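
    As noted above, missing values go to the end regardless of sort direction; a short sketch with made-up data:

    import numpy as np
    import pandas as pd
    s = pd.Series([4, np.nan, 7, np.nan, -3, 2])
    s.sort_values()                      # the NaN values end up last
    s.sort_values(na_position='first')   # or force them to the front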
    
  • Ranking

      1. Take ascending ranking as the example.
      2. The smallest value receives rank 1.0.
      3. Larger values receive 2.0, 3.0, 4.0 and so on.
      4. Tied values share a rank: the ranks they would occupy are summed and divided by the number of ties (the default method='average').
      5. Values after a tie continue from the ranks the tie consumed.
      In [123]: s = pd.Series([7,-5,7,4,2,0,4])
      
      In [124]: s.rank()
      Out[124]:
      0    6.5
      1    1.0
      2    6.5
      3    4.5
      4    3.0
      5    2.0
      6    4.5
      dtype: float64
      
      # How the ranks were assigned:
      # -5 is the smallest, so it gets rank 1.0
      # 0 is next: rank 2.0
      # 2 follows: rank 3.0
      # there are two 4s, so 4.0 + 5.0 = 9, divided by 2 gives 4.5 for both; the ranks 4.0 and 5.0 themselves are never used
      # likewise there are two 7s: 6.0 + 7.0 = 13, divided by 2 gives 6.5
      
    • Ranking can be done over rows or over columns (runnable sketch below).

      df.rank(axis='columns')
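
      A runnable sketch (the frame is made up; method='average' is the default that produces the tie behaviour described above):

      import pandas as pd
      df = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
      df.rank(axis='columns')   # rank the values within each row
      df.rank(method='min')     # ties all receive the lowest rank instead of the average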
      
  • Discretization and binning

    • The cut method groups data according to bin edges.

      • Bins are open on the left and closed on the right by default, i.e. (a, b]; set right=False for [a, b) instead (see the one-liners at the end of the examples).

      • You can either supply the bin edges yourself or simply ask for an integer number of equal-width bins.

        • That is, the bins are either explicitly specified or of uniform width.
       # ages: a list of 12 ages, e.g. [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32] (consistent with the codes output below)
       In [93]: bins = [18,25,35,60,100]
      In [94]: cats = pd.cut(ages,bins)
      In [95]: cats
      Out[95]:
      [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
      Length: 12
      Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
      In [96]: cats.codes
      Out[96]: array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
      
      In [97]: cats.categories
      Out[97]:
      IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
                    closed='right',
                    dtype='interval[int64]')
      
       In [98]: cats.describe   # note: without parentheses this only returns the bound method; call cats.describe() for the actual summary
      Out[98]:
      <bound method Categorical.describe of [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
      Length: 12
      Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]>
      
       In [99]: pd.value_counts(cats)  # counts per bin
      Out[99]: 
      (18, 25]     5
      (35, 60]     3
      (25, 35]     3
      (60, 100]    1
      dtype: int64
      
       # equal-width bins
      In [101]: d =np.random.rand(20)
      
      In [102]: d
      Out[102]:
      array([0.83732945, 0.0850416 , 0.66540597, 0.90479238, 0.99222014,
             0.39409122, 0.91896172, 0.87163655, 0.31374598, 0.27726111,
             0.7716572 , 0.79131961, 0.42805445, 0.29934685, 0.19077374,
             0.79701771, 0.93789892, 0.93536338, 0.32299602, 0.305671  ])
      
       In [103]: pd.cut(d, 4, precision=2) # limit bin-edge precision to two decimal places
      Out[103]:
      [(0.77, 0.99], (0.084, 0.31], (0.54, 0.77], (0.77, 0.99], (0.77, 0.99], ..., (0.77, 0.99], (0.77, 0.99], (0.77, 0.99], (0.31, 0.54], (0.084, 0.31]]
      Length: 20
      Categories (4, interval[float64]): [(0.084, 0.31] < (0.31, 0.54] < (0.54, 0.77] < (0.77, 0.99]]
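
       One-liners for the options mentioned above, reusing ages and bins from the example (the label names are just illustrative):

       pd.cut(ages, bins, right=False)   # intervals become [18, 25), [25, 35), ...
       pd.cut(ages, bins, labels=['youth', 'young', 'middle', 'senior'])   # attach names to the bins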
      
    • The qcut method cuts by sample quantiles, so each bin ends up holding (roughly) the same number of elements.

      • You can also supply your own quantiles between 0 and 1 (see the one-liner after the example).
      In [104]: data = np.random.randn(1000)
      
      In [105]: cats = pd.qcut(data,4)
      
      In [106]: cats
      Out[106]:
      [(0.644, 2.83], (-0.0344, 0.644], (-0.0344, 0.644], (-0.734, -0.0344], (-0.734, -0.0344], ..., (-3.327, -0.734], (-0.734, -0.0344], (0.644, 2.83], (-0.734, -0.0344], (-0.0344, 0.644]]
      Length: 1000
      Categories (4, interval[float64]): [(-3.327, -0.734] < (-0.734, -0.0344] < (-0.0344, 0.644] < (0.644, 2.83]]
      
       In [108]: pd.value_counts(cats) # every bin holds the same number of elements
      Out[108]:
      (0.644, 2.83]        250
      (-0.0344, 0.644]     250
      (-0.734, -0.0344]    250
      (-3.327, -0.734]     250
      dtype: int64
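
       Passing explicit quantiles between 0 and 1, as mentioned above (data as defined in In [104]):

       pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])   # custom quantile cut points instead of four equal-sized bins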
      
  • Randomization and sampling

    • Use np.random.permutation to generate a random ordering.

      In [134]: order = np.random.permutation(5) # a random permutation of 0..4
      
      In [135]: order
      Out[135]: array([3, 4, 1, 2, 0])
      
    • Use take with such a sequence to pick out (and reorder) rows.

      In [136]: df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
      
      In [138]: df.take(order)  # rows reordered according to the permutation
      Out[138]:
          0   1   2   3
      3  12  13  14  15
      4  16  17  18  19
      1   4   5   6   7
      2   8   9  10  11
      0   0   1   2   3
      
    • Use the sample method to draw a random sample.

      • Sampling is without replacement by default; set replace=True to sample with replacement.
      In [140]: df.sample(n=3)
      Out[140]:
          0   1   2   3
      0   0   1   2   3
      4  16  17  18  19
      1   4   5   6   7
      # raises an error if n exceeds the number of rows (unless replace=True)
      
      • Set axis=1 to sample columns instead of rows (sketch below).
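
        Both options in one place (df is the 5x4 frame from In [136]):

        df.sample(n=10, replace=True)   # with replacement, so n may exceed the number of rows
        df.sample(n=2, axis=1)          # sample columns rather than rows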
  • The merge method joins two pandas objects together, much like a SQL join.

    • By default it performs an inner join, i.e. the intersection of the keys. The how parameter also allows an outer join (union of the keys), a left join (every row of the left object is kept) or a right join (every row of the right object is kept).
    • The on parameter names the key to join on; left_on and right_on name the join keys of the left and right objects separately.
    In [23]: df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        ...:                     'data1': range(7)})
        ...:
    
    In [24]: df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
        ...:                     'data2': range(3)})
        ...:
    
    In [25]: df1
    Out[25]:
      key  data1
    0   b      0
    1   b      1
    2   a      2
    3   c      3
    4   a      4
    5   a      5
    6   b      6
    
    In [26]: df2
    Out[26]:
      key  data2
    0   a      0
    1   b      1
    2   d      2
    
    In [27]: pd.merge(df1,df2) # inner join by default; the overlapping column name is picked as the key
    Out[27]:
      key  data1  data2
    0   b      0      1
    1   b      1      1
    2   b      6      1
    3   a      2      0
    4   a      4      0
    5   a      5      0
    
    In [28]: pd.merge(df1,df2,on='key') # better to name the join key explicitly
    Out[28]:
      key  data1  data2
    0   b      0      1
    1   b      1      1
    2   b      6      1
    3   a      2      0
    4   a      4      0
    5   a      5      0
    
    In [30]: pd.merge(df1, df2, how='outer') # outer join
    Out[30]:
      key  data1  data2
    0   b    0.0    1.0
    1   b    1.0    1.0
    2   b    6.0    1.0
    3   a    2.0    0.0
    4   a    4.0    0.0
    5   a    5.0    0.0
    6   c    3.0    NaN
    7   d    NaN    2.0
    
    In [31]: pd.merge(df1, df2, how='left')  # left join
    Out[31]:
      key  data1  data2
    0   b      0    1.0
    1   b      1    1.0
    2   a      2    0.0
    3   c      3    NaN
    4   a      4    0.0
    5   a      5    0.0
    6   b      6    1.0
    
    In [32]: pd.merge(df1, df2, how='right') # right join
    Out[32]:
      key  data1  data2
    0   b    0.0      1
    1   b    1.0      1
    2   b    6.0      1
    3   a    2.0      0
    4   a    4.0      0
    5   a    5.0      0
    6   d    NaN      2
    
    In [33]: df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        ...:                     'data1': range(7)})
        ...:
    
    In [34]: df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
        ...:                     'data2': range(3)})
        ...:
    
    In [35]: pd.merge(df3, df4, left_on='lkey', right_on='rkey') # name the key on each side
    Out[35]:
      lkey  data1 rkey  data2
    0    b      0    b      1
    1    b      1    b      1
    2    b      6    b      1
    3    a      2    a      0
    4    a      4    a      0
    5    a      5    a      0
    
    • Several keys can be specified at once.
    In [36]: left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
        ...:                      'key2': ['one', 'two', 'one'],
        ...:                      'lval': [1, 2, 3]})
    
    In [37]: right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
        ...:                       'key2': ['one', 'one', 'one', 'two'],
        ...:                       'rval': [4, 5, 6, 7]})
    
    In [38]: pd.merge(left, right, on=['key1', 'key2'], how='outer')
    Out[38]:
      key1 key2  lval  rval
    0  foo  one   1.0   4.0
    1  foo  one   1.0   5.0
    2  foo  two   2.0   NaN
    3  bar  one   3.0   6.0
    4  bar  two   NaN   7.0
    
  • The concat method glues or stacks objects along an axis.

    In [55]: s1 = pd.Series([0, 1], index=['a', 'b'])
    
    In [56]: s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
    
    In [57]: s3 = pd.Series([5, 6], index=['f', 'g'])
    
    In [58]: pd.concat([s1, s2, s3]) # the objects must be passed as a list
    Out[58]:
    a    0
    b    1
    c    2
    d    3
    e    4
    f    5
    g    6
    dtype: int64
    
    In [59]: pd.concat([s1, s2, s3], axis=1) # stack side by side; note the warning below
    C:\ProgramData\Anaconda3\Scripts\ipython:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
    of pandas will change to not sort by default.
    ......
    
    Out[59]:
         0    1    2
    a  0.0  NaN  NaN
    b  1.0  NaN  NaN
    c  NaN  2.0  NaN
    d  NaN  3.0  NaN
    e  NaN  4.0  NaN
    f  NaN  NaN  5.0
    g  NaN  NaN  6.0
    
    In [60]: pd.concat([s1, s2, s3], axis=1,sort=True) # pass sort=True explicitly to silence the warning
    Out[60]:
         0    1    2
    a  0.0  NaN  NaN
    b  1.0  NaN  NaN
    c  NaN  2.0  NaN
    d  NaN  3.0  NaN
    e  NaN  4.0  NaN
    f  NaN  NaN  5.0
    g  NaN  NaN  6.0
    
    • For DataFrames, concat stacks rows downward by default; set axis=1 to put the objects side by side instead.
    In [66]: df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
        ...:                    columns=['one', 'two'])
        ...:
    
    In [67]: df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
        ...:                    columns=['three', 'four'])
        ...:
    
    In [68]: df1
    Out[68]:
       one  two
    a    0    1
    b    2    3
    c    4    5
    
    In [69]: df2
    Out[69]:
       three  four
    a      5     6
    c      7     8
    
    
    In [71]: pd.concat([df1, df2], sort=True)
    
    Out[71]:
       four  one  three  two
    a   NaN  0.0    NaN  1.0
    b   NaN  2.0    NaN  3.0
    c   NaN  4.0    NaN  5.0
    a   6.0  NaN    5.0  NaN
    c   8.0  NaN    7.0  NaN
    
    In [72]: pd.concat([df1, df2], axis=1, sort=True)
    Out[72]:
       one  two  three  four
    a    0    1    5.0   6.0
    b    2    3    NaN   NaN
    c    4    5    7.0   8.0
    
  • combine_first patches missing values in the calling object with values from the object passed in.

    In [81]: df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
        ...:                     'b': [np.nan, 2., np.nan, 6.],
        ...:                     'c': range(2, 18, 4)})
        ...:
    
    In [82]: df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
        ...:                     'b': [np.nan, 3., 4., 6., 8.]})
        ...:
    
    In [83]: df1
    Out[83]:
         a    b   c
    0  1.0  NaN   2
    1  NaN  2.0   6
    2  5.0  NaN  10
    3  NaN  6.0  14
    
    In [84]: df2
    Out[84]:
         a    b
    0  5.0  NaN
    1  4.0  3.0
    2  NaN  4.0
    3  3.0  6.0
    4  7.0  8.0
    
    In [85]: df1.combine_first(df2)
    Out[85]:
         a    b     c
    0  1.0  NaN   2.0
    1  4.0  2.0   6.0
    2  5.0  4.0  10.0
    3  3.0  6.0  14.0
    4  7.0  8.0   NaN
    
  • np.where(pd.isnull(a), b, a)

    • pd.isnull(a) tests each element; wherever it is True the result takes its value from b, otherwise from a (sketch below).
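
    A small sketch of this pattern (a and b are made up; a.combine_first(b) achieves the same thing with index alignment):

    import numpy as np
    import pandas as pd
    a = pd.Series([np.nan, 2.5, 0.0, 3.5, np.nan], index=list('fedcb'))
    b = pd.Series([0., 1., 2., 3., 4.], index=list('fedcb'))
    np.where(pd.isnull(a), b, a)   # -> array([0. , 2.5, 0. , 3.5, 4. ])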