功能说明
输入:GBD 官网导出的多个 CSV 文件
输出:统一格式的合并数据
处理内容:
- 多文件合并(按行拼接)
- 字段重命名(统一命名规范)
- 数据筛选(按年份、地区、年龄、指标类型)
- 缺失值处理
代码实现
1. 读取与合并多个 CSV
R 代码:多文件合并
library(data.table)
library(dplyr)

# Method 1: merge every CSV of one type found in a single folder.
# `pattern` is a regular expression, not a shell glob, so match the literal
# ".csv" extension at the end of the name instead of using "*.csv".
file_list <- list.files(
  "examples/raw_data/",
  pattern = "\\.csv$",
  full.names = TRUE
)

# Read the files one by one, then stack them row-wise. Columns are matched
# by name; columns missing from some files are filled with NA.
dt_list <- lapply(file_list, fread)
combined_data <- rbindlist(dt_list, use.names = TRUE, fill = TRUE)

# Method 2: merge folder by folder (data for different regions).
# Assumes China/, Global/ and Region/ sub-folders; only China/ and Global/
# are merged below.
china_files <- list.files(
  "examples/raw_data/China/",
  pattern = "\\.csv$",
  full.names = TRUE
)
global_files <- list.files(
  "examples/raw_data/Global/",
  pattern = "\\.csv$",
  full.names = TRUE
)

# Naming each list element after its file makes the `source` id column hold
# the originating file name instead of a bare position index, which would be
# ambiguous once the China and Global tables are stacked together.
china_dt <- rbindlist(
  setNames(lapply(china_files, fread), basename(china_files)),
  use.names = TRUE,
  fill = TRUE,
  idcol = "source"
)
global_dt <- rbindlist(
  setNames(lapply(global_files, fread), basename(global_files)),
  use.names = TRUE,
  fill = TRUE,
  idcol = "source"
)

# Combine all folders into one table.
all_data <- rbindlist(
  list(china_dt, global_dt),
  use.names = TRUE,
  fill = TRUE
)

# Save the merged result.
fwrite(all_data, "examples/output_merged.csv")

2. 字段重命名与筛选
R 代码:字段标准化
# Load the merged data written by the merge step.
# Uses fread()/fwrite() from data.table and the pipe and verbs from dplyr,
# so both packages must already be attached.
df <- fread("examples/output_merged.csv")

# Rename fields (standardise the GBD export field names).
# Raw fields may be: location_name, year, val, upper, lower
# Unified as:        location, year, value, upper, lower
df <- df %>%
  rename(
    location = location_name,
    value = val,
    measure = measure_name,
    age = age_name,
    metric = metric_name,
    sex = sex_name
  ) %>%
  select(location, year, age, sex, measure, metric, value, upper, lower)

# Filter by condition.
# Example: keep China, years 2010-2021, age-standardised rows only.
df_filtered <- df %>%
  filter(
    location == "China",
    year >= 2010,
    year <= 2021,
    age == "Age-standardized"
  )

# Save the filtered data.
fwrite(df_filtered, "examples/output_filtered.csv")

3. 完整数据清洗流程
R 代码:完整清洗函数
# Complete data-cleaning function.

#' Merge, standardise, filter and save GBD CSV exports
#'
#' Reads every CSV file directly inside `input_dir`, stacks them row-wise,
#' renames the GBD `*_name` / `val` columns to short names, applies the
#' optional filters, drops rows whose `value` is NA, sorts the result and
#' writes it to `output_file`.
#'
#' @param input_dir Directory containing the exported CSV files.
#' @param output_file Path of the CSV file to write.
#' @param countries Optional vector of `location` values to keep.
#' @param years Optional vector of `year` values to keep.
#' @param ages Optional vector of `age` values to keep.
#' @param metrics Optional vector of `metric` values to keep.
#' @return The cleaned data.table (also written to `output_file`).
#'   Raises an error when `input_dir` contains no CSV file or when one of
#'   the expected source columns is missing.
clean_gbd_data <- function(input_dir, output_file, countries = NULL,
                           years = NULL, ages = NULL, metrics = NULL) {
  # Only data.table is needed here; nothing below uses dplyr.
  library(data.table)

  # 1. Read all CSV files. `pattern` is a regular expression, so anchor on
  #    the literal ".csv" extension rather than the glob "*.csv".
  files <- list.files(input_dir, pattern = "\\.csv$", full.names = TRUE)
  if (length(files) == 0L) {
    stop("No CSV files found in: ", input_dir, call. = FALSE)
  }
  # fill = TRUE lets files with slightly different column sets be stacked.
  dt <- rbindlist(lapply(files, fread), use.names = TRUE, fill = TRUE)

  # 2. Rename fields (errors if any of the old names is absent).
  setnames(
    dt,
    old = c("location_name", "measure_name", "age_name", "metric_name",
            "sex_name", "val"),
    new = c("location", "measure", "age", "metric", "sex", "value")
  )

  # 3. Apply each filter only when the caller supplied it.
  if (!is.null(countries)) {
    dt <- dt[location %in% countries]
  }
  if (!is.null(years)) {
    dt <- dt[year %in% years]
  }
  if (!is.null(ages)) {
    dt <- dt[age %in% ages]
  }
  if (!is.null(metrics)) {
    dt <- dt[metric %in% metrics]
  }

  # 4. Drop rows with a missing value.
  dt <- dt[!is.na(value)]

  # 5. Sort (by reference).
  setorder(dt, location, year, age, sex, measure, metric)

  # 6. Save, then hand the table back to the caller.
  fwrite(dt, output_file)
  dt
}

# Usage example
# Clean the raw exports for three countries over 1990-2021, keeping the
# age-standardised and all-ages rows for the Number and Rate metrics. The
# result is written to examples/output_clean.csv and also returned.
# NOTE(review): some GBD releases label the US "United States of America";
# confirm the spelling matches location_name in your files, because
# non-matching names are dropped silently.
clean_gbd_data(
  input_dir = "examples/raw_data/",
  output_file = "examples/output_clean.csv",
  countries = c("China", "United States", "Japan"),
  years = 1990:2021,
  ages = c("Age-standardized", "All ages"),
  metrics = c("Number", "Rate")
)

常见问题
Q1: 合并后出现重复行?
→ 使用 data.table 的 unique() 或 dplyr 的 distinct() 去重。
Q2: 字段名不一致?
→ 用 setnames 统一重命名,或用 janitor::clean_names 自动清理。
Q3: 内存不足?
→ 用 fread 的 select 参数只读取需要的列,或结合 skip 与 nrows 参数分块读取;也可以使用 chunked 包。