暴露数据读取（3个案例）- 医研AI工具库

🧭 新手导读：为什么先看暴露数据？

暴露端格式错误是MR最常见失败原因。把暴露数据标准化好，后面流程会顺很多。

背景介绍

暴露数据决定了工具变量质量。OR、beta、SE、等位基因方向一旦混乱，会直接导致后续效应方向和显著性错误。

备注（统一三段）

适用人群：手头已有暴露GWAS原始文件，需整理成TwoSampleMR可读格式的初学者。

常见错误：OR未转beta或等位基因列映射错误，会导致后续harmonise大幅丢SNP。

论文写法：方法部分建议明确写出暴露数据来源、阈值和LD参数（kb/r²/p）。

案例总览

🧪 案例 1：二分类暴露（OR + P）转换为 TwoSampleMR 可用格式
📈 案例 2：连续性状（已有 beta/SE）标准化读取与筛选
🔁 案例 3：多个暴露文件批量导入 + LD Clumping

案例 1：OR 格式暴露数据读取（模板同款）

library(data.table)
library(dplyr)
library(TwoSampleMR)
library(plinkbinr)
library(ieugwasr)
library(tibble)

i <- "GCST90133000"
message(paste0("0. 当前分析的是：", i))

# 路径示例1（Windows网络盘）：path/to/your/data
path_prefix <- "path/to/your/data"
exposure_file <- paste0(path_prefix, i, "_exposure_dat.tsv")   # 已标准化后的缓存文件
raw_file <- paste0(path_prefix, i, "_buildGRCh37.tsv.gz")     # 原始GWAS文件

# 文件检查逻辑：有缓存就直接读取；没有缓存才重新格式化
if (!file.exists(exposure_file)) {
 if (!file.exists(raw_file)) stop(paste("原始文件不存在：", raw_file))

 message("1. 读取原始暴露数据")
 exposure_dat <- fread(file = raw_file, header = TRUE, nThread = 12)

 message("2. 列名映射到TwoSampleMR标准字段")
 formatted_exposure_dat <- data.table(
  SNP = exposure_dat$variant_id,          # snp_col
  BETA = exposure_dat$beta,             # beta_col
  SE = exposure_dat$standard_error,         # se_col
  phenotype = i,                  # phenotype_col
  effect_allele_col = exposure_dat$effect_allele,  # effect_allele_col
  other_allele_col = exposure_dat$other_allele,   # other_allele_col
  P = exposure_dat$p_value,             # pval_col
  pos_col = exposure_dat$base_pair_location,    # pos_col
  chr_col = exposure_dat$chromosome,        # chr_col
  eaf_col = exposure_dat$effect_allele_frequency  # eaf_col
 )

 temp_exposure_file <- tempfile(fileext = ".csv")
 fwrite(formatted_exposure_dat, temp_exposure_file, sep = ",", nThread = 12)

 message("3. 完整参数映射读取为 exposure_dat")
 exposure_dat <- read_exposure_data(
  filename = temp_exposure_file,
  sep = ",",
  snp_col = "SNP",
  beta_col = "BETA",
  se_col = "SE",
  phenotype_col = "phenotype",
  effect_allele_col = "effect_allele_col",
  other_allele_col = "other_allele_col",
  pval_col = "P",
  chr_col = "chr_col",
  pos_col = "pos_col",
  eaf_col = "eaf_col",
  clump = FALSE
 )
 exposure_dat$id.exposure <- i

 # 先按P值筛选，再进行LD clumping
 filtered_exposure_dat <- exposure_dat %>% dplyr::filter(pval.exposure < 5e-8)
 b <- ld_clump(
  dplyr::tibble(
   rsid = filtered_exposure_dat$SNP,
   pval = filtered_exposure_dat$pval.exposure,
   id = filtered_exposure_dat$id.exposure
  ),
  clump_kb = 10000,
  clump_r2 = 0.001,
  clump_p = 1e-5,
  plink_bin = "path/to/your/data",
  bfile = "path/to/reference/EUR"
 )

 exposure_dat <- exposure_dat[which(exposure_dat$SNP %in% b$rsid), ]
 exposure_dat$id.exposure <- i

 fwrite(exposure_dat, exposure_file, sep = "\t", row.names = FALSE, nThread = 12)
 unlink(temp_exposure_file)
}

message("4. 导入 exposure_dat_mediation 数据")
exposure_dat <- fread(file = exposure_file, header = TRUE, nThread = 12)
exposure_dat$id.exposure <- i
exposure_dat_mediation <- exposure_dat
rm(exposure_dat)
if (exists("formatted_exposure_dat")) rm(formatted_exposure_dat)
gc()

案例 2：连续性状（已提供 beta/SE）

library(data.table)
library(dplyr)
library(TwoSampleMR)
library(plinkbinr)
library(ieugwasr)
library(tibble)

i <- "GCST90475984"
message(paste0("0. 当前分析的是：", i))

# 路径示例2（孟德尔随机化项目目录）
path_prefix <- "path/to/your/data"
exposure_file <- paste0(path_prefix, i, "_exposure_dat.tsv")
raw_file <- paste0(path_prefix, i, ".tsv.gz")

if (!file.exists(exposure_file)) {
 if (!file.exists(raw_file)) stop(paste("原始文件不存在：", raw_file))
 exposure_dat <- fread(file = raw_file, header = TRUE, nThread = 12)

 # OR转beta，并按P值估算SE（适用于仅提供OR和P值的数据）
 setDT(exposure_dat)
 exposure_dat[, `:=`(
  beta = log(odds_ratio),
  SE = abs(log(odds_ratio)) / abs(qnorm(p_value / 2))
 )]

 formatted_exposure_dat <- data.table(
  SNP = exposure_dat$rsid,
  BETA = exposure_dat$beta,
  SE = exposure_dat$SE,
  phenotype = i,
  effect_allele_col = exposure_dat$effect_allele,
  other_allele_col = exposure_dat$other_allele,
  P = exposure_dat$p_value,
  pos_col = exposure_dat$base_pair_location,
  chr_col = exposure_dat$chromosome,
  eaf_col = exposure_dat$case_af
 )

 temp_exposure_file <- tempfile(fileext = ".csv")
 fwrite(formatted_exposure_dat, temp_exposure_file, sep = ",", nThread = 12)

 exposure_dat <- read_exposure_data(
  filename = temp_exposure_file,
  sep = ",",
  snp_col = "SNP",
  beta_col = "BETA",
  se_col = "SE",
  phenotype_col = "phenotype",
  effect_allele_col = "effect_allele_col",
  other_allele_col = "other_allele_col",
  pval_col = "P",
  chr_col = "chr_col",
  pos_col = "pos_col",
  eaf_col = "eaf_col",
  clump = FALSE
 )
 exposure_dat$id.exposure <- i

 filtered_exposure_dat <- exposure_dat %>% dplyr::filter(pval.exposure < 5e-8)
 b <- ld_clump(
  dplyr::tibble(
   rsid = filtered_exposure_dat$SNP,
   pval = filtered_exposure_dat$pval.exposure,
   id = filtered_exposure_dat$id.exposure
  ),
  clump_kb = 10000,
  clump_r2 = 0.001,
  clump_p = 1e-5,
  plink_bin = "path/to/your/data",
  bfile = "path/to/reference/EUR"
 )

 exposure_dat <- exposure_dat[which(exposure_dat$SNP %in% b$rsid), ]
 exposure_dat$id.exposure <- i
 fwrite(exposure_dat, exposure_file, sep = "\t", row.names = FALSE, nThread = 12)
 unlink(temp_exposure_file)
}

exposure_dat <- fread(file = exposure_file, header = TRUE, nThread = 12)
exposure_dat$id.exposure <- i
exposure_dat_mediation <- exposure_dat
rm(exposure_dat)
if (exists("formatted_exposure_dat")) rm(formatted_exposure_dat)
gc()

案例 3：多个暴露文件批量读取

library(data.table)
library(dplyr)
library(ieugwasr)
library(tibble)

path_prefix <- "./input/"
exposure_files <- list.files(path_prefix, pattern = "\\.tsv.gz$", full.names = TRUE)
all_exposure_dat <- list()

for (file_path in exposure_files) {
 i <- tools::file_path_sans_ext(tools::file_path_sans_ext(basename(file_path)))

 # 读取GWAS数据
 exposure_dat <- fread(file = file_path, header = TRUE, nThread = 12)

 # 格式化（支持OR转beta）
 exposure_dat[, `:=`(beta = log(odds_ratio),
           SE  = abs(log(odds_ratio)) / abs(qnorm(p_value / 2)))]

 # 格式化为TwoSampleMR格式
 formatted_exposure_dat <- data.table(
  SNP = exposure_dat$rsid,
  BETA = exposure_dat$beta,
  SE = exposure_dat$SE,
  phenotype = i,
  effect_allele_col = exposure_dat$effect_allele,
  other_allele_col = exposure_dat$other_allele,
  P = exposure_dat$p_value,
  pos_col = exposure_dat$base_pair_location,
  chr_col = exposure_dat$chromosome,
  eaf_col = exposure_dat$case_af
 )

 temp_file <- tempfile(fileext = ".csv")
 fwrite(formatted_exposure_dat, temp_file)

 exposure_dat <- read_exposure_data(filename = temp_file, sep = ",")
 exposure_dat <- exposure_dat %>% dplyr::filter(pval.exposure < 5e-8)

 b <- ld_clump(tibble(rsid = exposure_dat$SNP, pval = exposure_dat$pval.exposure, id = exposure_dat$id.exposure),
        clump_kb = 10000, clump_r2 = 0.001, clump_p = 1e-5)
 exposure_dat <- exposure_dat[SNP %in% b$rsid]

 all_exposure_dat[[i]] <- exposure_dat
}

exposure_union <- dplyr::bind_rows(all_exposure_dat)