🧭 新手导读:为什么先看暴露数据?
暴露端格式错误是MR最常见失败原因。把暴露数据标准化好,后面流程会顺很多。
背景介绍
暴露数据决定了工具变量质量。OR、beta、SE、等位基因方向一旦混乱,会直接导致后续效应方向和显著性错误。
备注(统一三段)
适用人群:手头已有暴露GWAS原始文件,需整理成TwoSampleMR可读格式的初学者。
常见错误:OR未转beta或等位基因列映射错误,会导致后续harmonise大幅丢SNP。
论文写法:方法部分建议明确写出暴露数据来源、阈值和LD参数(kb/r²/p)。
案例总览
- 🧪 案例 1:二分类暴露(OR + P)转换为 TwoSampleMR 可用格式
- 📈 案例 2:连续性状(已有 beta/SE)标准化读取与筛选
- 🔁 案例 3:多个暴露文件批量导入 + LD Clumping
案例 1:OR 格式暴露数据读取(模板同款)
library(data.table)
library(dplyr)
library(TwoSampleMR)
library(plinkbinr)
library(ieugwasr)
library(tibble)
i <- "GCST90133000"
message(paste0("0. 当前分析的是:", i))
# 路径示例1(Windows网络盘):path/to/your/data
path_prefix <- "path/to/your/data"
exposure_file <- paste0(path_prefix, i, "_exposure_dat.tsv") # 已标准化后的缓存文件
raw_file <- paste0(path_prefix, i, "_buildGRCh37.tsv.gz") # 原始GWAS文件
# 文件检查逻辑:有缓存就直接读取;没有缓存才重新格式化
if (!file.exists(exposure_file)) {
if (!file.exists(raw_file)) stop(paste("原始文件不存在:", raw_file))
message("1. 读取原始暴露数据")
exposure_dat <- fread(file = raw_file, header = TRUE, nThread = 12)
message("2. 列名映射到TwoSampleMR标准字段")
formatted_exposure_dat <- data.table(
SNP = exposure_dat$variant_id, # snp_col
BETA = exposure_dat$beta, # beta_col
SE = exposure_dat$standard_error, # se_col
phenotype = i, # phenotype_col
effect_allele_col = exposure_dat$effect_allele, # effect_allele_col
other_allele_col = exposure_dat$other_allele, # other_allele_col
P = exposure_dat$p_value, # pval_col
pos_col = exposure_dat$base_pair_location, # pos_col
chr_col = exposure_dat$chromosome, # chr_col
eaf_col = exposure_dat$effect_allele_frequency # eaf_col
)
temp_exposure_file <- tempfile(fileext = ".csv")
fwrite(formatted_exposure_dat, temp_exposure_file, sep = ",", nThread = 12)
message("3. 完整参数映射读取为 exposure_dat")
exposure_dat <- read_exposure_data(
filename = temp_exposure_file,
sep = ",",
snp_col = "SNP",
beta_col = "BETA",
se_col = "SE",
phenotype_col = "phenotype",
effect_allele_col = "effect_allele_col",
other_allele_col = "other_allele_col",
pval_col = "P",
chr_col = "chr_col",
pos_col = "pos_col",
eaf_col = "eaf_col",
clump = FALSE
)
exposure_dat$id.exposure <- i
# 先按P值筛选,再进行LD clumping
filtered_exposure_dat <- exposure_dat %>% dplyr::filter(pval.exposure < 5e-8)
b <- ld_clump(
dplyr::tibble(
rsid = filtered_exposure_dat$SNP,
pval = filtered_exposure_dat$pval.exposure,
id = filtered_exposure_dat$id.exposure
),
clump_kb = 10000,
clump_r2 = 0.001,
clump_p = 1e-5,
plink_bin = "path/to/your/data",
bfile = "path/to/reference/EUR"
)
exposure_dat <- exposure_dat[which(exposure_dat$SNP %in% b$rsid), ]
exposure_dat$id.exposure <- i
fwrite(exposure_dat, exposure_file, sep = "\t", row.names = FALSE, nThread = 12)
unlink(temp_exposure_file)
}
message("4. 导入 exposure_dat_mediation 数据")
exposure_dat <- fread(file = exposure_file, header = TRUE, nThread = 12)
exposure_dat$id.exposure <- i
exposure_dat_mediation <- exposure_dat
rm(exposure_dat)
if (exists("formatted_exposure_dat")) rm(formatted_exposure_dat)
gc()案例 2:连续性状(已提供 beta/SE)
library(data.table)
library(dplyr)
library(TwoSampleMR)
library(plinkbinr)
library(ieugwasr)
library(tibble)
i <- "GCST90475984"
message(paste0("0. 当前分析的是:", i))
# 路径示例2(孟德尔随机化项目目录)
path_prefix <- "path/to/your/data"
exposure_file <- paste0(path_prefix, i, "_exposure_dat.tsv")
raw_file <- paste0(path_prefix, i, ".tsv.gz")
if (!file.exists(exposure_file)) {
if (!file.exists(raw_file)) stop(paste("原始文件不存在:", raw_file))
exposure_dat <- fread(file = raw_file, header = TRUE, nThread = 12)
# OR转beta,并按P值估算SE(适用于仅提供OR和P值的数据)
setDT(exposure_dat)
exposure_dat[, `:=`(
beta = log(odds_ratio),
SE = abs(log(odds_ratio)) / abs(qnorm(p_value / 2))
)]
formatted_exposure_dat <- data.table(
SNP = exposure_dat$rsid,
BETA = exposure_dat$beta,
SE = exposure_dat$SE,
phenotype = i,
effect_allele_col = exposure_dat$effect_allele,
other_allele_col = exposure_dat$other_allele,
P = exposure_dat$p_value,
pos_col = exposure_dat$base_pair_location,
chr_col = exposure_dat$chromosome,
eaf_col = exposure_dat$case_af
)
temp_exposure_file <- tempfile(fileext = ".csv")
fwrite(formatted_exposure_dat, temp_exposure_file, sep = ",", nThread = 12)
exposure_dat <- read_exposure_data(
filename = temp_exposure_file,
sep = ",",
snp_col = "SNP",
beta_col = "BETA",
se_col = "SE",
phenotype_col = "phenotype",
effect_allele_col = "effect_allele_col",
other_allele_col = "other_allele_col",
pval_col = "P",
chr_col = "chr_col",
pos_col = "pos_col",
eaf_col = "eaf_col",
clump = FALSE
)
exposure_dat$id.exposure <- i
filtered_exposure_dat <- exposure_dat %>% dplyr::filter(pval.exposure < 5e-8)
b <- ld_clump(
dplyr::tibble(
rsid = filtered_exposure_dat$SNP,
pval = filtered_exposure_dat$pval.exposure,
id = filtered_exposure_dat$id.exposure
),
clump_kb = 10000,
clump_r2 = 0.001,
clump_p = 1e-5,
plink_bin = "path/to/your/data",
bfile = "path/to/reference/EUR"
)
exposure_dat <- exposure_dat[which(exposure_dat$SNP %in% b$rsid), ]
exposure_dat$id.exposure <- i
fwrite(exposure_dat, exposure_file, sep = "\t", row.names = FALSE, nThread = 12)
unlink(temp_exposure_file)
}
exposure_dat <- fread(file = exposure_file, header = TRUE, nThread = 12)
exposure_dat$id.exposure <- i
exposure_dat_mediation <- exposure_dat
rm(exposure_dat)
if (exists("formatted_exposure_dat")) rm(formatted_exposure_dat)
gc()案例 3:多个暴露文件批量读取
library(data.table)
library(dplyr)
library(ieugwasr)
library(tibble)
path_prefix <- "./input/"
exposure_files <- list.files(path_prefix, pattern = "\\.tsv.gz$", full.names = TRUE)
all_exposure_dat <- list()
for (file_path in exposure_files) {
i <- tools::file_path_sans_ext(tools::file_path_sans_ext(basename(file_path)))
# 读取GWAS数据
exposure_dat <- fread(file = file_path, header = TRUE, nThread = 12)
# 格式化(支持OR转beta)
exposure_dat[, `:=`(beta = log(odds_ratio),
SE = abs(log(odds_ratio)) / abs(qnorm(p_value / 2)))]
# 格式化为TwoSampleMR格式
formatted_exposure_dat <- data.table(
SNP = exposure_dat$rsid,
BETA = exposure_dat$beta,
SE = exposure_dat$SE,
phenotype = i,
effect_allele_col = exposure_dat$effect_allele,
other_allele_col = exposure_dat$other_allele,
P = exposure_dat$p_value,
pos_col = exposure_dat$base_pair_location,
chr_col = exposure_dat$chromosome,
eaf_col = exposure_dat$case_af
)
temp_file <- tempfile(fileext = ".csv")
fwrite(formatted_exposure_dat, temp_file)
exposure_dat <- read_exposure_data(filename = temp_file, sep = ",")
exposure_dat <- exposure_dat %>% dplyr::filter(pval.exposure < 5e-8)
b <- ld_clump(tibble(rsid = exposure_dat$SNP, pval = exposure_dat$pval.exposure, id = exposure_dat$id.exposure),
clump_kb = 10000, clump_r2 = 0.001, clump_p = 1e-5)
exposure_dat <- exposure_dat[SNP %in% b$rsid]
all_exposure_dat[[i]] <- exposure_dat
}
exposure_union <- dplyr::bind_rows(all_exposure_dat)