跳至主要內容

Pandas

程序员李某某大约 5 分钟

Pandas

概述

Pandas 库基于 NumPy 构建,为 Python 提供了易于使用的数据结构和数据分析工具。

Series

Series:一维数组,类似 Python 的列表。

# index 可选,默认为从 0 开始的整数索引
data = pd.Series([1, 2, 3, 4, 5],index = ['a', 'b', 'c', 'd', 'e'])
# 获取
print(data['a'])
# set
data['a'] = 10
# 删除
data.drop(['a','c'])
## 替换
data.replace(to_replace = [1,2],value = [0,0])
## 添加
pd.concat([data, pd.Series([6,7,8,9,10])], ignore_index = True) #  忽略索引,往后追加(Series.append 已在 pandas 2.0 移除,改用 pd.concat)
## 改索引
data.rename(index={'a': 'A'}, inplace=True)

DataFrame

DataFrame:二维数组,带列的数据结构,每一列是一个Series。包含三个组件

  • index:索引
  • columns:列名
  • values:数据,底层为 NumPy 二维数组
data = {'Country': ['Belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
    'Population': [11190846, 1303171035, 207847528]}
# columns:列名,默认展示全部
df = pd.DataFrame(data, columns=['Country', 'Capital'])
## 获取
# 按索引
print(df.iloc[[0], [1]], '-------')  # 某一个
print(df.iloc[:, [1]], '-------')  # 一列
print(df.iloc[[1]], '-------')  # 一行
# 按标签
print(df.loc[[0], 'Country'])  # 某一个
print(df.loc[:, 'Country'])  # 列
print(df.loc[[0],])  # 行
df['Country'] # 列
df.Country # 列
df.Population[0] # 某一个

# 按条件
df[df['Population'] > 1000]

## set,将获取到的赋值
df.Population[0] = 1
df.loc[0, 'Country'] = 'China'
df.loc[:, 'Country'] = ['A', 'B', 'C']

## 删除
df.drop('Country', axis='columns', inplace=True) # 删除列
df.drop(0, axis='index', inplace=True) # 删除行

## 添加,axis=1可添加列
new_df = pd.concat([df, pd.DataFrame([{'Country': 'China', 'Population': 1000}])])

## 设置索引
df = pd.DataFrame(data, index=data['Country'])
# 或者
df.set_index('Country', inplace=True)
# 获取时更方便
df['Population']['India']
# 同样适用于上述按索引方式获取,只是换成了用设置的索引

基础用法

基本信息

# 形状
df.shape
# index
df.index # RangeIndex(start=0, stop=3, step=1)
# 列
df.columns
# 数据类型
df.dtypes
# 描述性统计
df.info() # 索引、列、dtype、内存占用等信息(注意需加括号调用)
df.count() # 统计非空值

全局设置

pd.set_option('display.max_rows', 10) # 设置显示行数
pd.set_option('display.max_columns', 10) # 设置显示列数
pd.set_option('display.width', 1000) # 设置显示宽度
pd.set_option('display.max_colwidth', 100) # 设置列宽,字符串长度
pd.set_option('display.precision', 2) # 设置精度

运算

df.sum() # 字符串拼接,数值求和
df.cumsum() # 累计求和
df.Population.max()/df.Population.min() # 最大值除以最小值,仅限数值列
df.describe() # count, mean, std, min, 25%, 50%, 75%, max
df.mean() # 平均数
df.median() # 中位数
df.cov() # 协方差
df.corr() # 相关性系数

# 矩阵加法
s = pd.Series([111, 1303, 2078, 626], index=['a', 'b', 'c', 'd'])
s1 = pd.Series([-111, -1303, -2078], index=['a', 'b', 'c'])
print(s + s1) ## 对应上的数值求和,字符串拼接,没有对应的为 NaN
# 带填充加法
s  = pd.Series([1, 2, 3, 4, 5],index=['a', 'b', 'c', 'd', 'e'])
s1 = pd.Series([1, 2, 3, 4],index=['a', 'b', 'd', 'e'])
s.add(s1, fill_value=0) # NaN 填充为 0
#  减法
s.sub(s1, fill_value=1)
#  乘法
s.mul(s1, fill_value=1)
#  除法
s.div(s1, fill_value=1)
#  取模
s.mod(s1, fill_value=1)
#  整除(地板除)
s.floordiv(s1, fill_value=1)

排序

data = pd.DataFrame({
    'group': ['a', 'b', 'c'],
    'data': [2, 3, 4]
})
# ascending是否升序,与by中的元素对应,inplace 是否修改原数据
data.sort_values(by=['group', 'data'], ascending=[False, True], inplace=True)

分组

df = pd.DataFrame({
    'key': ['A', 'B', 'C', 'A', 'B', 'C'],
    'data': [1, 2, 3, 4, 5, 6],
})
## 分组
group_by_key = df.groupby('key')['data']
## 求和 -- 用pd的方法
df_sum = group_by_key.sum()
## 用aggregate方法和np函数
group_by_key.aggregate(np.sum)

去重

data = pd.DataFrame({
    'k1': ['a'] * 3 + ['b'] * 4,
    'k2': [1, 1, 2, 4, 3, 1, 5]
})
data.drop_duplicates(inplace=True)

联表

left = pd.DataFrame({
    'key1': ['a', 'b', 'c', 'd', 'e'],
    'key2': ['a', 'b', 'c', 'd', 'z'],
    'A': ['A0', 'A1', 'A2', 'A3', 'A4'],
    'B': ['B0', 'B1', 'B2', 'B3', 'B4'],
})
right = pd.DataFrame({
    'key1': ['a', 'b', 'c', 'd', 'e'],
    'key2': ['a', 'b', 'c', 'd', 'w'],
    'C': ['C0', 'C1', 'C2', 'C3', 'C4'],
    'D': ['D0', 'D1', 'D2', 'D3', 'D4'],
})

pd.merge(left, right, on='key1')
pd.merge(left, right, on=['key1', 'key2']) # 默认是内联 how='inner'
pd.merge(left, right, how='left') # 左外联
pd.merge(left, right, how='right') # 右外联
pd.merge(left, right, how='outer') # 外联, indicator=True可显示每行连接情况

映射

data = pd.DataFrame({
    'food': ['A1', 'B1', 'A2', 'B2', 'C1', 'C2', 'C3'],
    'price': [1, 2, 3, 4, 5, 6, 7]
})

## 用函数分组
def food_map(series):
    """Return the food group ('A', 'B' or 'C') for one DataFrame row.

    The group is the first letter of the row's 'food' value; a value
    starting with none of A/B/C maps to None (same as the original
    implicit fall-through).
    """
    # Look up the column once and use the idiomatic instance method
    # instead of the unbound str.startswith(value, prefix) form.
    food = series['food']
    for group in ('A', 'B', 'C'):
        if food.startswith(group):
            return group
    return None


data['food_func'] = data.apply(food_map, axis="columns")
## lambda
lambdaFunc = lambda x: 'A' if str.startswith(x, 'A') else (
        'B' if str.startswith(x, 'B') else 'C')

data['food_lambda'] = data['food'].map(lambdaFunc)
data['food_lambda'] = data['food'].apply(lambdaFunc)

## 用map分组
food_group = {
    'A1': 'A',
    'A2': 'A',
    'B1': 'B',
    'B2': 'B',
    'C1': 'C',
    'C2': 'C',
}
data['food_group'] = data['food'].map(food_group)

分配新的列

data = pd.DataFrame({
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
data_new = data.assign(data_sum=data['data1'] + data['data2'])

# 删除列
data_new.drop('data_sum', axis='columns', inplace=True)

替换

data = pd.Series([1, 2, 3, 4, 5])
# data.replace(5, np.nan, inplace=True)
data.replace(5,2, inplace=True)

统计


## value_counts统计
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
data.value_counts()
data.value_counts(normalize=True) # 频数比
data.value_counts(sort=False) # 不排序,默认降序
data.value_counts(bins=3) # 分组
data.value_counts(dropna=False) # 统计缺失值,默认不统计NaN
data.value_counts(ascending=True) # 升序


## 用cut区间统计
ages = [15, 18, 21, 53, 47, 23, 75]
bins = [10, 40, 80]
group_names = ['小孩', '大人']
bins_res = pd.cut(ages, bins=bins, labels=group_names)
print(bins_res.value_counts())

## 找缺失值
data = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)])
is_null = data.isnull()
is_null.any()  # 看列有没有
is_null.any(axis=1)  # 看行有没有
data.fillna(5, inplace=True) # 填充缺失值

I/O

## excel
pd.read_excel('data.xlsx')
df.to_excel('data/data.xlsx',sheet_name='Sheet1') # to_excel 是 DataFrame 的方法,不是 pd 的函数
# 多 sheet
xlsx = pd.ExcelFile('data.xls')
df = pd.read_excel(xlsx, 'Sheet1')
## csv
# 读取前5行
data = pd.read_csv('data.csv',header=None,nrows=5)
data.to_csv('data.csv')
## json
data = pd.read_json('data.json')
data.to_json('data.json')
## sql
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')
pd.read_sql_table('my_table', engine)
pd.read_sql_query("SELECT * FROM my_table;", engine)
# read_sql() 是 read_sql_table() 和 read_sql_query() 的包装
pd.read_sql("SELECT * FROM my_table;", engine)

df.to_sql('myDf', engine) # to_sql 是 DataFrame 的方法,不是 pd 的函数

高级用法

数据透视表

数据重塑 pivot

data = pd.DataFrame({
    'Month': ["January", "January", "January", "January",
              "February", "February", "February", "February",
              "March", "March", "March", "March"],
    'Category': ["Transportation", "Grocery", "Household", "Entertainment",
                 "Transportation", "Grocery", "Household", "Entertainment",
                 "Transportation", "Grocery", "Household", "Entertainment"],
    'Amount': [74., 235., 175., 100., 115, 240., 225., 125., 90., 260., 200., 120.]})

data_pivot = data.pivot(index='Category',columns='Month',values='Amount')

数据透视表 pivot_table

创建一个名为 ./data/basketball.csv 的文件

对手,胜负,主客场,命中,投篮数,投篮命中率,3分命中率,篮板,助攻,得分
勇士,,,10,23,0.435,0.444,6,11,27
国王,,,8,21,0.381,0.286,3,9,27
小牛,,,10,19,0.526,0.462,3,7,29
灰熊,,,8,20,0.4,0.25,5,8,22
76人,,,10,20,0.5,0.25,3,13,27
黄蜂,,,8,18,0.444,0.4,10,11,27
灰熊,,,6,19,0.316,0.222,4,8,20
76人,,,8,21,0.381,0.429,4,7,29
尼克斯,,,9,23,0.391,0.353,5,9,31
老鹰,,,8,15,0.533,0.545,3,11,29
爵士,,,19,25,0.76,0.875,2,13,56
骑士,,,8,21,0.381,0.429,11,13,35
灰熊,,,11,25,0.44,0.429,4,8,38
步行者,,,9,21,0.429,0.25,5,15,26
猛龙,,,8,25,0.32,0.273,6,11,38
太阳,,,12,22,0.545,0.545,2,7,48
灰熊,,,9,20,0.45,0.5,5,7,29
掘金,,,6,16,0.375,0.143,8,9,21
尼克斯,,,12,27,0.444,0.385,2,10,37
篮网,,,13,20,0.65,0.615,10,8,37
步行者,,,8,22,0.364,0.333,8,10,29
湖人,,,13,22,0.591,0.444,4,9,36
爵士,,,8,19,0.421,0.333,5,3,29
开拓者,,,16,29,0.552,0.571,8,3,48
鹈鹕,,,8,16,0.5,0.4,1,17,26

用法


上次编辑于:
贡献者: 李元昊