Pandas数据分析常用函数的使用

来源：jb51　　时间：2023/1/18 8:42:03　　对本文有异议

一、数据导入导出

pandas提供了一些用于将表格型数据读取为DataFrame对象的函数，如read_csv、read_table。输入pd.read后按Tab键，系统会把以read开头的函数和模块都列出来，可根据需要读取的文件类型选取。

#包的安装导入
import pandas as pd
 
#查询帮助文档
pd.read_csv?
 
#数据载入（仅罗列一部分常用参数）
df = pd.read_csv(
    filePath, #路径
    sep=',', #分隔符
    encoding='UTF-8', #用于unicode的文本编码格式，如GBK,UTF-8
    engine='python',
    header = None, #第一行不作为列名
    names= ['col1','col2'], #字段名设置
    index_col=None,
    skiprows=None, #要跳过的行，None表示不跳过
    error_bad_lines=False #错误行忽略（该参数自pandas 1.3起弃用、2.0起移除，新版本请改用 on_bad_lines='skip'）
)
# 数据导出
df.to_csv(filePath,
          sep = ',',
          index = False)

二、数据加工处理

1）重复值处理

# Pandas提供了duplicated、Index.duplicated、drop_duplicates函数来标记及删除重复记录
 
#找出重复行位置
dIndex = df.duplicated()
#根据某些列找出重复位置
dIndex = df.duplicated('id')
dIndex = df.duplicated(['id', 'key'])
#根据返回值提取重复数据
df[dIndex]
#删除重复行
newdf = df.drop_duplicates()
#删除所有出现重复的行（重复项一条都不保留）
newdf = df.drop_duplicates(keep = False)
#根据'key'字段去重，并保留重复key字段第一个
##subset:指定的标签或标签序列，仅删除这些列重复值，默认情况为所有列
##keep:确定要保留的重复值:first（保留第一次出现的重复值，默认）last(保留最后一次出现的重复值)False(删除所有重复值)
newdf = df.drop_duplicates(subset = ['key'],keep = 'first')

2）缺失值处理

# 输出某列是否有为空值
print(df.isnull().any(axis = 0))
# 获取空值所在的行
df[df.isnull().any(axis = 1)]
# 空值填充
df.fillna('未知')
# 删除空值
newDF = df.dropna(axis="columns",how="all",inplace=False) #axis="columns"表示按列删除；how可选有any和all,any表示只要有空值出现就删除，all表示全部为空值才删除；inplace表示是否替换掉原本数据

3）空格处理

newName = df['name'].str.lstrip()
newName = df['name'].str.rstrip()
newName = df['name'].str.strip()

4）字段拆分

newDF = df['name'].str.split(' ', n=1, expand=True) #n=1表示只拆分一次，expand=True表示将结果展开为多列（新版pandas中n、expand须以关键字参数传入）

5）筛选数据

#单条件
df[df.comments>10000]
#多条件
df[df.comments.between(1000, 10000)]
#筛选出title为空值的行
df[pd.isnull(df.title)]
#根据关键字过滤
df[df.title.str.contains('台电', na=False)]
#~为取反
df[~df.title.str.contains('台电', na=False)]
#组合逻辑条件
df[(df.comments>=1000) & (df.comments<=10000)]

6）随机抽样

#设置随机种子
numpy.random.seed(seed=2)
#按照个数抽样
data.sample(n=10)
#按照百分比抽样
data.sample(frac=0.02)
#是否可放回抽样，
#replace=True，可放回, 
#replace=False，不可放回
data.sample(n=10, replace=True)

7）数据匹配

items = pd.read_csv(
    'D:\\PDA\\4.12\\data1.csv', 
    sep='|', 
    names=['id', 'comments', 'title']
)
prices = pd.read_csv(
    'D:\\PDA\\4.12\\data2.csv', 
    sep='|', 
    names=['id', 'oldPrice', 'nowPrice']
)
#默认（how='inner'）只保留连接上的部分；这里使用how='left'，保留左表的全部行
itemPrices = pd.merge(
    items, 
    prices, 
    left_on='id', 
    right_on='id',
    how = 'left'
)
#how：连接方式，有inner、left、right、outer，默认为inner；

8）数据合并

data = pd.concat([data1, data2, data3])

9）时间处理

data['时间'] = pd.to_datetime(
    data.注册时间, 
    format='%Y/%m/%d'
)
data['格式化时间'] = data.时间.dt.strftime('%Y-%m-%d')
data['时间.年'] = data['时间'].dt.year
data['时间.月'] = data['时间'].dt.month
data['时间.周'] = data['时间'].dt.weekday
data['时间.日'] = data['时间'].dt.day
data['时间.时'] = data['时间'].dt.hour
data['时间.分'] = data['时间'].dt.minute
data['时间.秒'] = data['时间'].dt.second

10）数据标准化

data['scale'] = round(
    (
        data.score-data.score.min()
    )/(
        data.score.max()-data.score.min()
    )
    , 2
)

11）修改列名和索引

#将id列设为索引
df = df.set_index('id')

12）排序

#选定列排序
df.sort_values(by=['age', 'gender'], ascending=[False, True], inplace=True, ignore_index=True)

三、列表格式设置

pd.set_option('display.max_rows',xxx) # 最大行数
pd.set_option('display.min_rows',xxx) # 最小显示行数
pd.set_option('display.max_columns',xxx) # 最大显示列数
pd.set_option ('display.max_colwidth',xxx) #最大列字符数
pd.set_option( 'display.precision',2) # 浮点型精度
pd.set_option('display.float_format','{:,}'.format) #逗号分隔数字
pd.set_option('display.float_format', '{:,.2f}'.format) #设置浮点精度
pd.set_option('display.float_format', '{:.2f}%'.format) #百分号格式化
pd.set_option('plotting.backend', 'altair') # 更改后端绘图方式
pd.set_option('display.max_info_columns', 200) # info输出最大列数
pd.set_option('display.max_info_rows', 5) # info计数null时的阈值
pd.describe_option() #展示所有设置和描述
pd.reset_option('all') #重置所有设置选项