🧾 新手导读:结局读取的核心目标
目标是把结局GWAS整理成可被 read_outcome_data() 正确识别的标准格式,并与暴露SNP成功匹配。
背景介绍
真实数据里结局文件经常没有标准rsID,或列名不统一。读取前做好映射与字段对齐,是避免“匹配不上SNP”的关键。
备注(统一三段)
适用人群:需要把chr:pos或非标准结局GWAS文件转成标准输入的人群。
常见错误:只做字段重命名不验证rsID映射率,常造成可用SNP数量异常偏低。
论文写法:在论文中建议单列“Outcome preprocessing”说明映射流程与保留比例。
模式总览
- 🧭 模式 1:原始文件是 chr:pos:ref:alt,先拆分并映射为 rsID
- ✅ 模式 2:原始文件已是标准 GWAS 列名,直接 read_outcome_data()
- 🔗 两种模式最终都输出 TwoSampleMR 可直接 harmonise 的 outcome_dat
模式 1:chr:pos 转换 + 读取
适用于结局文件没有 rsID,只有 variant=chr:pos:ref:alt 的情况。
library(data.table)
library(dplyr)
library(TwoSampleMR)
i <- "GCST013196"
path_prefix <- "path/to/your/data"
snp150_file <- "path/to/your/data"
# 输出与中间文件
outcome_file <- paste0(path_prefix, i, "_outcome_dat.tsv") # 最终标准化结果
raw_file <- paste0(path_prefix, i, ".tsv") # 原始结局文件
file_transformed <- paste0(path_prefix, i, "_transformed.tsv") # chr:pos映射后的中间文件
# 1) 首次运行时加载 snp150_hg19 映射表(chromosome:start -> rsID)
if (!exists("df_snp150_hg19")) {
df_snp150_hg19 <- fread(snp150_file, data.table = FALSE, nThread = 12)
}
# 2) 如果标准化结果不存在,执行“原始文件 -> 映射 -> read_outcome_data”
if (!file.exists(outcome_file)) {
if (!file.exists(raw_file)) stop(paste("原始文件不存在:", raw_file))
# 2.1 没有中间文件时,先创建 chromosome:start 并与 snp150_hg19 连接
if (!file.exists(file_transformed)) {
outcome_dat <- fread(file = raw_file, header = TRUE, nThread = 12)
outcome_dat$`chromosome:start` <- paste0(outcome_dat$chromosome, ":", outcome_dat$base_pair_location)
data_rs <- dplyr::left_join(outcome_dat, df_snp150_hg19, by = "chromosome:start")
fwrite(data_rs, file = file_transformed, row.names = FALSE, nThread = 12)
}
# 2.2 读取中间文件并整理为 TwoSampleMR 输入列
outcome_dat <- fread(file = file_transformed, header = TRUE, nThread = 12)
formatted_outcome_dat <- data.table(
SNP = outcome_dat$name,
BETA = outcome_dat$beta,
SE = outcome_dat$standard_error,
phenotype = i,
effect_allele_col = outcome_dat$effect_allele,
other_allele_col = outcome_dat$other_allele,
P = outcome_dat$p_value,
samplesize = outcome_dat$n,
pos_col = outcome_dat$base_pair_location,
chr_col = outcome_dat$chromosome,
eaf_col = outcome_dat$effect_allele_frequency
)
temp_outcome_file <- tempfile(fileext = ".csv")
fwrite(formatted_outcome_dat, temp_outcome_file, sep = ",", nThread = 12)
# 2.3 完整参数映射读取 outcome_dat
outcome_dat <- read_outcome_data(
filename = temp_outcome_file,
sep = ",",
snp_col = "SNP",
beta_col = "BETA",
se_col = "SE",
phenotype_col = "phenotype",
effect_allele_col = "effect_allele_col",
other_allele_col = "other_allele_col",
pval_col = "P",
chr_col = "chr_col",
pos_col = "pos_col",
eaf_col = "eaf_col",
samplesize_col = "samplesize"
)
outcome_dat$id.outcome <- i
fwrite(outcome_dat, file = outcome_file, sep = "\t", nThread = 12)
unlink(temp_outcome_file)
}
# 3) 读取最终结果供后续 harmonise_data 使用
outcome_dat <- fread(file = outcome_file, nThread = 12)
outcome_dat$id.outcome <- i
outcome_dat_mediation <- outcome_dat
rm(outcome_dat)
if (exists("formatted_outcome_dat")) rm(formatted_outcome_dat)
gc()模式 2:标准列名直接读取
适用于结局文件本身已有 rsID + beta/se/eaf/pval 等标准列。
library(TwoSampleMR)
library(data.table)
# 假设 outcome_standard.tsv.gz 包含:
# rsid, beta, se, effect_allele, other_allele, eaf, p, n, chr, pos
outcome_dat <- read_outcome_data(
snps = exposure_dat$SNP,
filename = "./input/outcome_standard.tsv.gz",
sep = "\t",
snp_col = "rsid",
beta_col = "beta",
se_col = "se",
effect_allele_col = "effect_allele",
other_allele_col = "other_allele",
eaf_col = "eaf",
pval_col = "p",
samplesize_col = "n",
chr_col = "chr",
pos_col = "pos"
)
fwrite(outcome_dat, "./output/outcome_dat_standard_mode.tsv", sep = "\t")实操提示
- 📌 如果结局是 OR,请先转为 beta = log(OR) 再读取。
- 📌 读取后优先检查 nrow(outcome_dat) 是否明显偏少(常见于等位基因方向不一致)。
- 📌 下一步直接进入 harmonise_data(exposure_dat, outcome_dat)。