原文:40 Useful Pandas Snippets Pandas snippets that come in handy in data analysis work - 2022.04.20
相关:
import pandas as pd
#读取数据
pd.read_csv("data.csv")
1. 过滤列数据
Filter columns.
只需要数据集的几列,如:
pd.read_csv("data.csv", usecols=["date", "price"])
2. 读取时解析日期
Parse dates on read.
pd.read_csv("data.csv", parse_dates=["date"])
3. 指定数据类型
Specify Data Types.
读取数据时,设置数据类型分类,以节省 内存.
pd.read_csv("data.csv", dtype={"house_type": "category"})
4. 设置索引
Set index.
设置索引,尤其对于时间序列比较有用.
pd.read_csv("data.csv", index_col="date")
5. 读取的行数
No. of rows to read.
只读取数据集中的部分数据.
#100 行数据
pd.read_csv("data.csv", nrows=100)
6. 跳过行
Skip rows.
跳过某些行的数据,
#跳过第1行和第5行
pd.read_csv("data.csv", skiprows=[1, 5])
#跳过前100行
pd.read_csv("data.csv", skiprows=100)
#跳过 90% 的行
pd.read_csv("data.csv", skiprows=lambda x: x > 0 and np.random.rand() > 0.1)
7. 指定 NA 值
Specify NA Values.
如果数据的值有NA,即,如是 ?
等,将其设置为读取,以便以后不用再转换.
pd.read_csv("data.csv", na_values=["?"])
8. 设定布尔值
Setting boolean values.
如果某一列数据的格式是 Yes
和 No
的形式,
pd.read_csv("data.csv", true_values=["yes"], false_values=["no"])
9. 读取多个文件
Read from multiple files.
如果数据被存储在多个文件里,
import glob
import os
files = glob.glob("file_*.csv")
result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
10. 复制粘贴数据到DataFrames
Copy and Paste into Data Frames.
从剪切板复制粘贴数据,
df = pd.read_clipboard()
11. 从PDF读取表格数据
Read tables from PDF files.
# pip install tabula-py
from tabula import read_pdf
# Read pdf into list of DataFrame
df = read_pdf('test.pdf', pages='all')
12. 探索性数据分析
EDA,Exploratory Data Analysis,探索性数据分析
EDA cheat.
想要可视化数据,但不想写plot代码,可以采用 pandas-profiling,仅需要一行代码,
# pip install pandas-profiling
import pandas_profiling
df = pd.read_csv("data.csv")
profile = df.profile_report(title="Pandas Profiling Report")
profile.to_file(output_file="output.html")
13. 通过 dtype 过滤列
dtype,Data Types. (list of dtypes for pandas).
Filter columns by dtype.
# selecting
df.select_dtypes(include="number")
df.select_dtypes(include=["category", "datetime"])
# exluding
df.select_dtypes(exclude="object")
14. Infer dtype
df.infer_objects().dtypes
15. Downcasting
pd.to_numeric(df.numeric_col, downcast="integer") # smallest signed int dtype
pd.to_numeric(df.numeric_col, downcast="float") # smallest float dtype
16. 手工转换
Manual conversion.
如果数据中有 NaN 值,error="coerce"
能够避免报错.
同时,可以采用 .fillna
将 NA 值填充为合理的值.
# apply to whole data frame
df = df.apply(pd.to_numeric, errors="coerce")
# apply to specific columns
pd.to_numeric(df.numeric_column, errors="coerce")
# filling NA values with zero
pd.to_numeric(df.numeric_column, errors="coerce").fillna(0)
17. 一次转换全部
Convert all at once.
df = df.astype(
{
"date": "datetime64[ns]",
"price": "int",
"is_weekend": "bool",
"status": "category",
}
)
18. 列重命名
Renaming columns.
df = df.rename({"PRICE": "price", "Date (mm/dd/yyyy)": "date"}, axis=1)
19. 增加前缀和后缀
Add suffix and prefix.
df.add_prefix("pre_")
df.add_suffix("_suf")
20. 创建新列
Create new columns.
# create new column of Fahrenheit values from Celcius
df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
21. 指定位置插入列
Insert columns at specific positions.
random_col = np.random.randint(10, size=len(df))
df.insert(3, 'random_col', random_col) # inserts at third column
22. if-then-else
df["logic"] = np.where(df["price"] > 5, "high", "low")
23. Dropping columns
df.drop('col1', axis=1, inplace=True)
df = df.drop(['col1','col2'], axis=1)
s = df.pop('col')
del df['col']
df.drop(df.columns[0], inplace=True)
24. 字符串列名操作
Columns names
# on column names
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')
25. 字符串Contains
df['name'].str.contains("John")
df['phone_num'].str.contains('...-...-....', regex=True) # regex
df['email'].str.contains('gmail')
26. 字符串findall
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
df['email'].str.findall(pattern, flags=re.IGNORECASE)
27. 缺失值检查
Missing values Checking
def missing_vals(df):
"""prints out columns with perc of missing values"""
missing = [
(df.columns[idx], perc)
for idx, perc in enumerate(df.isna().mean() * 100)
if perc > 0
]
if len(missing) == 0:
return "no missing values"
# sort desc by perc
missing.sort(key=lambda x: x[1], reverse=True)
print(f"There are a total of {len(missing)} variables with missing values\n")
for tup in missing:
print(str.ljust(f"{tup[0]:<20} => {round(tup[1], 3)}%", 1))
#
missing_vals(df)
如:
There are a total of 16 variables with missing values
PoolQC => 100.0%
Alley => 94.0%
MiscFeature => 91.0%
Fence => 77.0%
FireplaceQu => 54.0%
LotFrontage => 14.0%
GarageType => 6.0%
GarageYrBlt => 6.0%
GarageFinish => 6.0%
GarageQual => 6.0%
GarageCond => 6.0%
BsmtQual => 3.0%
BsmtCond => 3.0%
BsmtExposure => 3.0%
BsmtFinType1 => 3.0%
BsmtFinType2 => 3.0%
28. 缺失值处理
Dealing with missing values.
# drop
df.dropna(axis=0)
df.dropna(axis=1)
# impute
df.fillna(0)
df.fillna(method="ffill")
df.fillna(method='bfill')
# replace
df.replace( -999, np.nan)
df.replace("?", np.nan)
# interpolate
ts.interpolate() # time series
df.interpolate() # fill all consecutive values forward
df.interpolate(limit=1) # fill one consecutive value forward
df.interpolate(limit=1, limit_direction="backward")
df.interpolate(limit_direction="both")
29. 日期操作之时间
# from today
date.today() + datetime.timedelta(hours=30)
date.today() + datetime.timedelta(days=30)
date.today() + datetime.timedelta(weeks=30)
# ago
date.today() - datetime.timedelta(days=365)
30. 日期操作之两个时间点之间过滤
Filter between two dates.
df[(df["Date"] > "2015-01-01") & (df["Date"] < "2017-01-01")]
31. 根据day/month/year 过滤
# filter by single day
df[df["Date"].dt.strftime("%Y-%m-%d") == "2017-03-01"]
# filter by single month
df[df["Date"].dt.strftime("%m") == "12"]
# filter by single year
df[df["Date"].dt.strftime("%Y") == "2017"]
32. 数据格式化
format_dict = {
"Date": "{:%d/%m/%y}",
"Open": "${:.2f}",
"Close": "${:.2f}",
"Volume": "{:,}",
}
#
df.style.format(format_dict)
33. 颜色填充
(
df.style.format(format_dict)
.hide_index()
.highlight_min(["Open"], color="red")
.highlight_max(["Open"], color="green")
.background_gradient(subset="Close", cmap="Greens")
.bar('Volume', color='lightblue', align='zero')
.set_caption('Tesla Stock Prices in 2017')
)
34. 获取列的最大最小值 id
df['col'].idxmin()
df['col'].idxmax()
35. dataframe 函数处理
df.applymap(lambda x: np.log(x))
36. 随机打乱数据
df.sample(frac=1, random_state=7).reset_index(drop=True)
37. 百分比变化
Percent change,百分比变化。对时间序列有用.
如: price of BTC over 3 days [30000, 33000, 31000] -> [NaN, 0.1, -0.06]
df['col_name'].pct_change()
38. 检查 dataframe 的内存使用量
df.memory_usage().sum() / (1024**2) #converting to MB
39. 将list值分解为多行
Explode list values to multiple rows.
df.explode("col_name").reset_index(drop=True)
40. 将较少的数据归类为 Others
Convert smaller categories to “Others”
subclass = df.MSSubClass
subclass.value_counts()