scRNA-seq 拷贝数变异专题

1、简介 inferCNV是 TrinityCTAT 的一个组件，被用于在单细胞数据中鉴定体细胞大规模的染色体拷贝数变异，例如染色体部分区域的拷贝数增加与缺失。在软件执行过程中，infercnv会与参考的正常细胞之间进行肿瘤基因组中基因密度的评估，最终输出热图展示各染色体的基因相对密度，从而反映待测细胞类型的拷贝数变异情况。

InferCNV 提供了多个残差表达过滤器，以探索最小化噪音并进一步揭示支持拷贝数变异(copy number alteration, CNA)的信号。此外，InferCNV还包括预测CNA区域的方法，并根据异质性模式定义细胞聚类。

2、安装 if ( ! requireNamespace ( "BiocManager" , quietly = TRUE )) install.packages ( "BiocManager" ) if ( ! requireNamespace ( "infercnv" , quietly = TRUE )) BiocManager :: install ( "infercnv" ) 3、输入文件一共需要准备三个输入数据：raw count矩阵、细胞注释、基因注释，这里我们以 infercnv 内置的文件展示数据结构：# 清空环境变量 rm ( list = ls ) gc # used (Mb) gc trigger (Mb) max used (Mb) # Ncells 836583 44.7 1578222 84.3 1277907 68.3 # Vcells 1551361 11.9 8388608 64.0 2518594 19.3 # 展示系统时间 Sys.time # [1] "2025-08-29 17:53:28 CST" # 设置工作目录 setwd ( "/data/09_scRNA-seq_cnv/" ) # 如果你成功安装了infercnv，那么你的系统中自带以下文件，无需下载：# 获取测matrix的路径：matrix_path <- system.file ( "extdata" , "oligodendroglioma_expression_downsampled.counts.matrix.gz" , package = "infercnv" ) matrix_path # [1] "/usr/local/lib/R/site-library/infercnv/extdata/oligodendroglioma_expression_downsampled.counts.matrix.gz" # 比较诡异的是作者声称这里输入的是raw count(如果你是用Seurat对象生成的矩阵，GetAssayData(Seurat_obj_V5, assay="RNA", slot='counts'))，但实际上作者提供的是normalization后的矩阵(包含小数，如果你是用Seurat对象生成的矩阵，那么可以这么获取 GetAssayData(Seurat_obj_V5, assay="RNA", slot='data')) # 原文对counts的描述：InferCNV is compatible with both smart-seq2 and 10x single cell transcriptome data, and presumably other methods (not tested). The counts matrix can be generated using any conventional single cell transcriptome quantification pipeline, yielding a matrix of genes (rows) vs. cells (columns) containing assigned read counts. my_matrix <- read.table (matrix_path, header = T, row.names = 1 , sep = ' \t ' ) # 读取这个压缩文件 my_matrix[ 1 : 4 , 1 : 4 ] # 可以看到行名是基因，列名是细胞名 # MGH54_P16_F12 MGH54_P12_C10 MGH54_P11_C11 MGH54_P15_D06 # A2M 0 0.000 0.000 0.000 # A4GALT 0 0.000 0.000 0.000 # AAAS 0 37.008 30.935 21.011 # AACS 0 0.000 0.000 0.000 # 获取细胞注释的路径：anno_path <- system.file ( "extdata" , "oligodendroglioma_annotations_downsampled.txt" , package = "infercnv" ) anno_path # [1] "/usr/local/lib/R/site-library/infercnv/extdata/oligodendroglioma_annotations_downsampled.txt" # 其中包含的是细胞的注释信息，即Seurat中的meta.data anno_data <- read.table (anno_path, header = F, sep = ' \t ' ) # 读取txt文件 head (anno_data) # 查看表头，第一列包含的是细胞名/barcode名称，第二列包含的是细胞对应的注释 # V1 V2 # 1 MGH54_P2_C12 Microglia/Macrophage # 2 MGH36_P6_F03 Microglia/Macrophage # 3 MGH53_P4_H08 Microglia/Macrophage # 4 MGH53_P2_E09 Microglia/Macrophage # 5 MGH36_P5_E12 Microglia/Macrophage # 6 MGH54_P2_H07 Microglia/Macrophage print ( c ( '数据集中包含的细胞类型有：' , paste ( unique (anno_data[, 2 ])))) # [1] "数据集中包含的细胞类型有：" "Microglia/Macrophage" # [3] "Oligodendrocytes (non-malignant)" "malignant_MGH36" # [5] "malignant_MGH53" "malignant_97" # [7] "malignant_93" # 获取基因注释的路径：gene_ann_path <- system.file ( "extdata" , "gencode_downsampled.EXAMPLE_ONLY_DONT_REUSE.txt" , package = "infercnv" ) gene_ann <- read.table (gene_ann_path) # 包含的是基因在染色体中的坐标信息，通过这个信息才可以进行对应基因组区域的拷贝数计算 # 四列分别为基因名、染色体名、基因在染色体上的起始位点、基因在染色体上的终止位点 head (gene_ann) # V1 V2 V3 V4 # 1 WASH7P chr1 14363 29806 # 2 LINC00115 chr1 761586 762902 # 3 NOC2L chr1 879584 894689 # 4 MIR200A chr1 1103243 1103332 # 5 SDF4 chr1 1152288 1167411 # 6 UBE2J2 chr1 1189289 1209265 4、infercnv workflow 准备好以上测试文件后，两步法即可完成infercnv的分析流程：library (infercnv) library (Seurat) # 创建递归目录 dir.create ( "/data/09_scRNA-seq_cnv/result_infercnv/test1" , recursive = T) # （1）、构建infercnv对象 # 依次填入上述文件路径即可：infercnv_obj <- CreateInfercnvObject ( raw_counts_matrix= matrix_path, # 矩阵文件路径 annotations_file= anno_path, # 细胞注释文件路径 delim= " \t " , # 读取文件时的分隔符，如果是csv文件，则用',' gene_order_file= gene_ann_path, # 基因注释文件路径 ref_group_names= c ( "Microglia/Macrophage" , "Oligodendrocytes (non-malignant)" )) # 正常参考细胞类型的名称，需要根据大家的生物学知识来判断，设定一定未发生肿瘤变异的细胞类型 # 如果出现以下报错：# Error in `ScaleData`: # ! No layer matching pattern 'data' found. Please run NormalizeData and retry # Backtrace: # 1. infercnv::run(...) # 可以设置：options ( "Seurat.object.assay.version" = "v3" ) # options 函数是 R 语言中用于管理全局选项的函数 # 这行代码的主要功能是将 Seurat 对象的分析版本设置为版本V3，以便在后续的分析中使用该版本的功能和数据处理方式 # 参考https://github.com/broadinstitute/infercnv/issues/664 # 运行infercnv_obj <- infercnv :: run (infercnv_obj, # 创建的infercnv对象 cutoff= 0.1 , # 使用Smart-seq2这类全长数据时填cutoff=1 , 使用10x Genomics这类3'测序策略数据时填cutoff=0.1 out_dir= '/data/09_scRNA-seq_cnv/result_infercnv/test1' , # 输出路径 cluster_by_groups= TRUE , # 决定是否按提供的细胞分组进行聚类。

如果 TRUE，细胞会基于之前分配的组信息进行聚类，这有助于分析不同组间的 CNV 变化。如果 FALSE，则不使用组信息进行聚类。denoise= TRUE , # 决定是否对数据进行去噪处理 HMM= TRUE , # 是否启用隐马尔科夫模型（Hidden Markov Model, HMM）来进一步细化拷贝数变异的推断。HMM 会根据推断出的 CNV 模式，进行更精确的基因组段拷贝数变异的检测，帮助识别潜在的基因组异常区域。

num_threads= 4 , # 并行计算设置，以4线程为例 write_expr_matrix = TRUE ) 经过22个内置step的运行后程序运行完毕，得到如下结构的输出文件夹：system ( 'tree /data/09_scRNA-seq_cnv/result_infercnv/test1' , intern = T) # [1] "\033[01;34m/data/09_scRNA-seq_cnv/result_infercnv/test1\033[00m" # [2] "├── 01_incoming_data.infercnv_obj" # [3] "├── 02_reduced_by_cutoff.infercnv_obj" # [4] "├── 03_normalized_by_depthHMMi6.infercnv_obj" # [5] "├── 04_logtransformedHMMi6.infercnv_obj" # [6] "├── 08_remove_ref_avg_from_obs_logFCHMMi6.infercnv_obj" # [7] "├── 09_apply_max_centered_expr_thresholdHMMi6.infercnv_obj" # [8] "├── 10_smoothed_by_chrHMMi6.infercnv_obj" # [9] "├── 11_recentered_cells_by_chrHMMi6.infercnv_obj" # [10] "├── 12_remove_ref_avg_from_obs_adjustHMMi6.infercnv_obj" # [11] "├── 14_invert_log_transformHMMi6.infercnv_obj" # [12] "├── 15_tumor_subclustersHMMi6.leiden.infercnv_obj" # [13] "├── 17_HMM_predHMMi6.leiden.hmm_mode-subclusters.cell_groupings" # [14] "├── 17_HMM_predHMMi6.leiden.hmm_mode-subclusters.genes_used.dat" # [15] "├── 17_HMM_predHMMi6.leiden.hmm_mode-subclusters.infercnv_obj" # [16] "├── 17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat" # [17] "├── 17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_regions.dat" # [18] "├── 18_HMM_pred.Bayes_NetHMMi6.leiden.hmm_mode-subclusters.mcmc_obj" # [19] "├── 19_HMM_pred.Bayes_NetHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.infercnv_obj" # [20] "├── 20_HMM_pred.repr_intensitiesHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.infercnv_obj" # [21] "├── 22_denoiseHMMi6.leiden.NF_NA.SD_1.5.NL_FALSE.infercnv_obj" # [22] "├── \033[01;34mBayesNetOutput.HMMi6.leiden.hmm_mode-subclusters\033[00m" # [23] "│ ├── cellProbs.0.5.pdf" # [24] "│ ├── cnvProbs.0.5.pdf" # [25] "│ ├── CNV_State_Probabilities.dat" # [26] "│ ├── infercnv.NormalProbabilities.PostFiltering.heatmap_thresholds.txt" # [27] "│ ├── infercnv.NormalProbabilities.PostFiltering.observation_groupings.txt" # [28] "│ ├── infercnv.NormalProbabilities.PostFiltering.observations_dendrogram.txt" # [29] "│ ├── infercnv.NormalProbabilities.PostFiltering.png" # [30] "│ ├── infercnv.NormalProbabilities.PreFiltering.heatmap_thresholds.txt" # [31] "│ ├── infercnv.NormalProbabilities.PreFiltering.observation_groupings.txt" # [32] "│ ├── infercnv.NormalProbabilities.PreFiltering.observations_dendrogram.txt" # [33] "│ ├── infercnv.NormalProbabilities.PreFiltering.png" # [34] "│ └── MCMC_inferCNV_obj.rds" # [35] "├── expr.infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.dat" # [36] "├── expr.infercnv.19_HMM_pred.Bayes_Net.Pnorm_0.5.dat" # [37] "├── expr.infercnv.20_HMM_predHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.repr_intensities.dat" # [38] "├── expr.infercnv.dat" # [39] "├── expr.infercnv.preliminary.dat" # [40] "├── HMM_CNV_predictions.HMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.pred_cnv_genes.dat" # [41] "├── HMM_CNV_predictions.HMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.pred_cnv_regions.dat" # [42] "├── infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.heatmap_thresholds.txt" # [43] "├── infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.observation_groupings.txt" # [44] "├── infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.observations_dendrogram.txt" # [45] "├── infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.observations.txt" # [46] "├── infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.png" # [47] "├── infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.references.txt" # [48] "├── infercnv.19_HMM_pred.Bayes_Net.Pnorm_0.5.heatmap_thresholds.txt" # [49] "├── infercnv.19_HMM_pred.Bayes_Net.Pnorm_0.5.observation_groupings.txt" # [50] "├── infercnv.19_HMM_pred.Bayes_Net.Pnorm_0.5.observations_dendrogram.txt" # [51] "├── infercnv.19_HMM_pred.Bayes_Net.Pnorm_0.5.observations.txt" # [52] "├── infercnv.19_HMM_pred.Bayes_Net.Pnorm_0.5.png" # [53] "├── infercnv.19_HMM_pred.Bayes_Net.Pnorm_0.5.references.txt" # [54] "├── infercnv.20_HMM_predHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.repr_intensities.heatmap_thresholds.txt" # [55] "├── infercnv.20_HMM_predHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.repr_intensities.observation_groupings.txt" # [56] "├── infercnv.20_HMM_predHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.repr_intensities.observations_dendrogram.txt" # [57] "├── infercnv.20_HMM_predHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.repr_intensities.observations.txt" # [58] "├── infercnv.20_HMM_predHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.repr_intensities.png" # [59] "├── infercnv.20_HMM_predHMMi6.leiden.hmm_mode-subclusters.Pnorm_0.5.repr_intensities.references.txt" # [60] "├── infercnv.heatmap_thresholds.txt" # [61] "├── infercnv.observation_groupings.txt" # [62] "├── infercnv.observations_dendrogram.txt" # [63] "├── infercnv.observations.txt" # [64] "├── infercnv.png" # [65] "├── infercnv.preliminary.heatmap_thresholds.txt" # [66] "├── infercnv.preliminary.observation_groupings.txt" # [67] "├── infercnv.preliminary.observations_dendrogram.txt" # [68] "├── infercnv.preliminary.observations.txt" # [69] "├── infercnv.preliminary.png" # [70] "├── infercnv.preliminary.references.txt" # [71] "├── infercnv.references.txt" # [72] "├── infercnv_subclusters.heatmap_thresholds.txt" # [73] "├── infercnv_subclusters.observation_groupings.txt" # [74] "├── infercnv_subclusters.observations_dendrogram.txt" # [75] "├── infercnv_subclusters.png" # [76] "├── preliminary.infercnv_obj" # [77] "└── run.final.infercnv_obj" # [78] "" # [79] "1 directory, 75 files" 文件比较多，它们收录了分析过程中保存的信息：01_incoming_data.infercnv_obj：原始创建的infercnv对象，包含了三个输入文件的信息。

02_reduced_by_cutoff.infercnv_obj : 特定阈值过滤后的数据。03_normalized_by_depthHMMi6.infercnv_obj : 基于HMM深度归一化后的表达矩阵。04_logtransformedHMMi6.infercnv_obj : 对基于HMM深度归一化后的表达矩阵进行对数转化的结果。

08_remove_ref_avg_from_obs_logFCHMMi6.infercnv_obj : 从观测数据中去除参考细胞平均值后的表达数据，可以突出观测细胞的异常表达情况。

09_apply_max_centered_expr_thresholdHMMi6.infercnv_obj 中心化表达阈值后的数据，减少离群值的干扰 10_smoothed_by_chrHMMi6.infercnv_obj : 按染色体坐标平滑化后的数据，减少局部波动对整体结果的影响。

11_recentered_cells_by_chrHMMi6.infercnv_obj : 以染色体为单位重新中心化的细胞表达数据，减少染色体的影响，提升结果稳定性。12_remove_ref_avg_from_obs_adjustHMMi6.infercnv_obj : 移除参考平均值并做调整后的数据，可以提升细胞异常表达的观测效果。

14_invert_log_transformHMMi6.infercnv_obj : 对数变换的反向操作，将之前的对数转换数据还原到原来的表达值范围。15_no_subclusteringHMMi6.infercnv_obj : 未进行亚群聚类的对象。17-22_HMM_pred* 系列文件: 基于HMM（隐藏马尔科夫模型）方法对拷贝数变异（CNV）的预测结果。

包括细胞分组、基因使用、CNV预测的基因和区域、以及贝叶斯网络的MCMC（马尔科夫链蒙特卡罗）推断结果。这些文件是预测结果的核心，包含细胞的拷贝数状态，帮助识别潜在的基因组异常。BayesNetOutput.HMMi6.hmm_mode-samples 目录：贝叶斯网络方法的输出，包括细胞和CNV状态的概率、热图、分组、树状图和细胞的拷贝数状态等。用于展示拷贝数变异的概率估计，帮助可视化细胞群体中的变异分布。

expr.infercnv.* 系列文件：不同分析阶段下的表达数据，用于对拷贝数变异进行进一步分析，以及生成热图和分组信息。HMM_CNV_predictions.* 系列文件：HMM预测的CNV区域和基因信息，用于预测哪些基因或基因区域发生了拷贝数变异。

heatmap_thresholds , observation_groupings , observations_dendrogram：包含生成热图的阈值、细胞分组信息和层次聚类结果。MCMC_inferCNV_obj.rds : 包含MCMC推断结果的rds对象。preliminary.* 系列文件：初步分析阶段生成的数据文件，包括初步的热图、分组和树状图信息。最终分析前用于检查数据处理的效果和初步的拷贝数变异结果。

run.final.infercnv_obj : 最终的inferCNV对象，包含了所有分析步骤的最终结果。我们看一下几个比较重要的输出文件：infercnv.png : 以热图的形式展示 infercnv score，图中可以观察到，作为参考细胞的”Microglia/Macrophage”与”Oligodendrocytes (non-malignant)“，图中几乎没有蓝色(cnv loss)与红色(cnv gain)。

而受试组细胞类型”malignant_MGH36”、“malignant_MGH53”、“malignant_97”、“malignant_93”则存在大量密集的红色(CNV gain)、蓝色区域(CNV loss)。当然，这里作者已经注释出了这些恶性细胞群作为受试细胞，因此它们的CNV事件均比较明显，实际计算下，大家可以通过infercnv的结果来推断未命名亚群的恶性程度。

infercnv.references.txt : 参考细胞的CNV分值矩阵，行名是基因名，列名是细胞名：# 读取数据：ref_score <- read.table ( '/data/09_scRNA-seq_cnv/result_infercnv/test1/infercnv.references.txt' ) ref_score[ 1 : 4 , 1 : 4 ] # 可以看出大部分都在1附近 # MGH53_P2_G04 MGH54_P16_F12 MGH53_P1_D07 MGH53_P11_F08 # WASH7P 1.003744 1.003744 1.264298 1.003744 # LINC00115 1.003744 1.003744 1.261337 1.003744 # NOC2L 1.003744 1.003744 1.258332 1.003744 # SDF4 1.003744 1.003744 1.255518 1.003744 infercnv.observations.txt : 受试细胞的CNV分值矩阵，行名是基因名，列名是细胞名：# 读取数据：obs_score <- read.table ( '/data/09_scRNA-seq_cnv/result_infercnv/test1/infercnv.observations.txt' ) obs_score[ 1 : 4 , 1 : 4 ] # 可以看出相较于ref_score，obs_score与"1"偏离程度更大 # X93_P8_G11 X93_P8_C11 X93_P8_A12 X93_P8_B06 # WASH7P 1.003744 1.003744 0.7897687 0.8224919 # LINC00115 1.003744 1.003744 0.7923169 0.8239459 # NOC2L 1.003744 1.003744 0.7960422 0.8228446 # SDF4 1.003744 1.003744 0.8010698 0.8231211 5、infercnv实战 5.1 数据概况这里的演示数据来源于转移性肺腺癌的单细胞转录组谱，共包含从44例正常组织或早期到转移期癌症患者的208,506个细胞。

该数据集的原文发表在《Nature communication》中：https://pmc.ncbi.nlm.nih.gov/articles/PMC7210975/。数据可以从GEO数据库中直接下载 GSE131907，当然我也已经给大家下载到本地。

# 读取原始矩阵 mycount <- readRDS ( '/data/09_scRNA-seq_cnv/data/GSE131907_Lung_Cancer_raw_UMI_matrix.rds' ) # 读取注释信息 mymetada <- read.table ( '/data/09_scRNA-seq_cnv/data/GSE131907_Lung_Cancer_cell_annotation.txt' , header = T, row.names = 1 , sep = ' \t ' ) # 创建Seurat对象 scRNA <- CreateSeuratObject ( counts = mycount) # 添加注释信息到Seurat对象中 scRNA <- AddMetaData (scRNA, metadata = mymetada) # 创建文件夹 dir.create ( "/data/09_scRNA-seq_cnv/result_infercnv/test2" , recursive = T) # 保存rds文件 saveRDS (scRNA, "/data/09_scRNA-seq_cnv/result_infercnv/test2/GSE131907_Lung_Cancer_anno.rds" ) library (Seurat) # 看看细胞数：scRNA <- readRDS ( "/data/09_scRNA-seq_cnv/result_infercnv/test2/GSE131907_Lung_Cancer_anno.rds" ) ncol (scRNA) # [1] 208506 # 看看有哪些注释信息：names (scRNA @ meta.data) # [1] "orig.ident" "nCount_RNA" "nFeature_RNA" # [4] "Barcode" "Sample" "Sample_Origin" # [7] "Cell_type" "Cell_type.refined" "Cell_subtype" # 一共有这些样本：unique (scRNA $ Sample_Origin) # [1] "nLN" "mBrain" "nLung" "tLung" "PE" "mLN" "tL/B" # 一共有这些亚群：unique (scRNA $ Cell_type) # [1] "B lymphocytes" "Epithelial cells" "T lymphocytes" # [4] "Myeloid cells" "NK cells" "Undetermined" # [7] "MAST cells" "Fibroblasts" "Endothelial cells" # [10] "Oligodendrocytes" # 取出Epithelial cells用于infercnv分析：Idents (scRNA) <- 'Cell_type' Epi <- subset (scRNA, idents = 'Epithelial cells' ) # 查看亚群分布：table (Epi $ Cell_subtype) # # AT1 AT2 Ciliated Club Malignant cells # 530 2020 654 439 24784 # tS1 tS2 tS3 Undetermined # 3270 3018 64 60 # 取出特定亚群参与分析 Idents (Epi) <- 'Cell_subtype' Epi <- subset (Epi, idents = c ( 'AT1' , 'tS1' , 'Malignant cells' )) # transcriptional states 1(tS1)、tS2、tS3、alveolar types (AT1)、AT2是normal Epithelial的亚群 # 细胞数量还是太多，为了加快演示速度，每个细胞亚群抽500 Epi <- subset (Epi, downsample= 500 ) # 查看当前留存的细胞数：table ( Idents (Epi)) # # Malignant cells AT1 tS1 # 500 接下来就是标准的单细胞分析流程，这部分内容看不懂的同学可以参考scRNA-Seq学习手册Seurat V4修订版 # 标准化 Epi <- NormalizeData (Epi, normalization.method = "LogNormalize" , scale.factor = 10000 ) # 寻找高变基因 Epi <- FindVariableFeatures (Epi, selection.method = "vst" , nfeatures = 2000 ) # 归一化数据 Epi <- ScaleData (Epi, features = VariableFeatures (Epi)) # 计算线粒体含量 Epi[[ "percent.mt" ]] <- PercentageFeatureSet (Epi, pattern = "^MT-" ) Epi <- ScaleData (Epi, vars.to.regress = "percent.mt" ) Epi <- RunPCA (Epi, features = VariableFeatures ( object = Epi)) ElbowPlot (Epi) # 寻找邻近值 Epi <- FindNeighbors (Epi, dims = 1 : 20 ) Epi <- RunUMAP (Epi, dims = 1 : 20 ) Epi <- RunTSNE (Epi, dims = 1 : 20 ) # 展示细胞类型的降维图 DimPlot (Epi, reduction = "tsne" , label = TRUE ) 这里为了演示起来方便，我直接展示的是注释后的细胞标签，以显著的展示出正常细胞与癌症细胞的拷贝数变异事件差异。

实际的分析过程中，你手里拿到的可能是亚群分析的结果，需要通过infercnv的分析才能够完成亚群的注释，如果这部分内容大家理解不了，可以参考下面两个教程：如何分析细胞亚群 Seurat中如何让细胞听你指挥 5.2 infercnv流程这里我们再走一遍infercnv的基础流程 # 加载包 library (infercnv) library (tidyverse) library (ComplexHeatmap) library (circlize) library (RColorBrewer) library (infercnv) # 整理数据 # 获取count矩阵 exprMatrix <- as.matrix ( GetAssayData (Epi, slot= 'counts' )) # 取出分组注释信息 cellAnnota <- subset (Epi @ meta.data, select= 'Cell_subtype' ) # 把分组注释信息写到本地 write.table (cellAnnota, file = "/data/09_scRNA-seq_cnv/result_infercnv/test2/groupFiles.txt" , sep = ' \t ' , col.names = F) # 制作基因与染色体位置的文件 # 加载注释基因信息的包 if ( ! require (AnnoProbe)) install.packages ( 'AnnoProbe' ) # 获取基因注释信息，这里物种是人，如果是小鼠，把human换成mouse geneInfor = annoGene ( rownames (Epi), "SYMBOL" , 'human' ) # 包含以下信息 colnames (geneInfor) # [1] "SYMBOL" "biotypes" "ENSEMBL" "chr" "start" "end" # 选取基因的 geneInfor = geneInfor[ with (geneInfor, order (chr, start)), c ( 1 , 4 : 6 )] # 去重 geneInfor = geneInfor[ ! duplicated (geneInfor[, 1 ]),] # 查看一共有多少基因 length ( unique (geneInfor[, 1 ])) # [1] 20363 # 查看最终的注释信息 head (geneInfor) # SYMBOL chr start end # 23 AP006222.2 chr1 266855 268655 # 51 FAM87B chr1 817371 819837 # 53 LINC00115 chr1 826206 827522 # 55 FAM41C chr1 868071 876903 # 62 SAMD11 chr1 923928 944581 # 63 NOC2L chr1 944203 959309 # 注释信息需要写到本地供infercnv读取 write.table (geneInfor, '/data/09_scRNA-seq_cnv/result_infercnv/test2/geneLocate.txt' , row.names= F, col.names= F, sep= ' \t ' ) # 创建infercnv对象：infercnv_obj <- CreateInfercnvObject ( raw_counts_matrix= exprMatrix, # 矩阵文件路径 annotations_file= "/data/09_scRNA-seq_cnv/result_infercnv/test2/groupFiles.txt" , # 细胞注释文件路径 delim= " \t " , # 读取文件时的分隔符，如果是csv文件，则用',' gene_order_file= "/data/09_scRNA-seq_cnv/result_infercnv/test2/geneLocate.txt" , # 基因注释文件路径 ref_group_names= "AT1" ) # 正常参考细胞类型的名称，需要根据大家的生物学知识来判断，设定一定未发生肿瘤变异的细胞类型 # 运行infercnv：infercnv_obj <- infercnv :: run (infercnv_obj, # 创建的infercnv对象 cutoff= 0.1 , # 使用Smart-seq2这类全长数据时填cutoff=1, 使用10x Genomics这类3'测序策略数据时填cutoff=0.1 out_dir= '/data/09_scRNA-seq_cnv/result_infercnv/test2' , # 结果输出目录 hclust_method= "ward.D2" , # 指定层次聚类方法，ward.D2最常用的方法，试图最小化类间的方差（推荐用于 scRNA-seq） plot_steps= F, # 如果为真，则保存 infercnv 对象并在中间步骤绘制数据 denoise= TRUE , # 决定是否对数据进行去噪处理 HMM= T, Markov Model, HMM）来进一步细化拷贝数变异的推断。

HMM 会根据推断出的 CNV 模式，进行更精确的基因组段拷贝数变异的检测，帮助识别潜在的基因组异常区域。num_threads= 4 , # 并行计算设置，以4线程为例 cluster_by_groups= TRUE # 决定是否按提供的细胞分组进行聚类。如果 TRUE，细胞会基于之前分配的组信息进行聚类，这有助于分析不同组间的 CNV 变化。如果 FALSE，则不使用组信息进行聚类。

) # 如果出现Error in seq_len(max(obs_annotations_groups)) : argument must be coercible to non-negative integer # 可以参考https://github.com/broadinstitute/infercnv/issues/526进行调整,设置cluster_by_groups=TRUE 可以看到作为参考的 AT1 (上方热图)发生的拷贝数变异事件比较少。

而 Malignant cells 出现了大量且密集的拷贝数变异。tS1 似乎也存在着少量的拷贝数变异，所以单凭热图还不行，还得继续分析。

5.3 热图优化 # 继续可视化 infercnv_obj = readRDS ( "/data/09_scRNA-seq_cnv/result_infercnv/test2/run.final.infercnv_obj" ) # 读取运行好的infercnv对象 expr <- infercnv_obj @ expr.data # 取出infercnv评分数据 # 获取参考/待测细胞的索引 normal_loc <- infercnv_obj @ reference_grouped_cell_indices # 正常参考细胞的索引 normal_loc <- normal_loc $ AT1 test_loc <- infercnv_obj @ observation_grouped_cell_indices # 待测细胞的索引 test_loc <- c (test_loc $ tS1,test_loc $ ` Malignant cells ` ) test_loc[ 1 : 4 ] # [1] 3 6 8 10 # 把之前写到本地的分组信息读回来 anno.df = read.table ( '/data/09_scRNA-seq_cnv/result_infercnv/test2/groupFiles.txt' , header = F, sep = ' \t ' ) colnames (anno.df)[ 1 ] <- 'CB' # 读取并整理此前保存的基因坐标信息：geneFile <- read.table ( "/data/09_scRNA-seq_cnv/result_infercnv/test2/geneLocate.txt" , header = F, sep = " \t " , stringsAsFactors = F) rownames (geneFile) = geneFile $ V1 # 将基因名设置为行名 co_gene <- intersect ( rownames (expr),geneFile $ V1) # 表达矩阵和基因注释文件都存在的基因 sub_geneFile <- geneFile[co_gene,] # 取出基因注释文件的交集 sub_geneFile $ num_chr <- gsub ( 'chr' , '' ,sub_geneFile $ V2) %>% as.numeric # 以数值型进行排序：sub_geneFile <- sub_geneFile[ order (sub_geneFile $ num_chr),] expr = expr[co_gene,] # 取出表达矩阵的交集 set.seed ( 2024 ) kmeans.result <- kmeans ( t (expr), 7 ) # 完成kmeans聚类 kmeans_df <- data.frame ( kmeans_class= kmeans.result $ cluster) # 取出聚类的cluster信息并转化为数据框 kmeans_df $ CB = rownames (kmeans_df) # 设置行名为新的一列，即cell barcode kmeans_df = kmeans_df %>% inner_join (anno.df, by= "CB" ) # 按照barcode合并分组注释文件与聚类结果 kmeans_df_s = arrange (kmeans_df,kmeans_class) # 按照聚类结果进行排序 rownames (kmeans_df_s) = kmeans_df_s $ CB # 设置行名为cell barcode kmeans_df_s $ CB = NULL # 后面用不到cellbarcode了，将这行删除 kmeans_df_s $ kmeans_class = as.factor (kmeans_df_s $ kmeans_class) # 聚类结果是数字，需要转换为因子，否则后面热图注释会报错 colnames (kmeans_df_s)[ 2 ] <- 'Sub_type' # 改变第二列的列名 head (kmeans_df_s) # kmeans_class Sub_type # AAACGGGAGGCGCTCT_NS_12 1 Malignant cells # AATCGGTAGCGTAGTG_NS_12 1 Malignant cells # ACATACGTCGTGGGAA_NS_12 1 Malignant cells # ACCTTTAAGATCACGG_NS_12 1 Malignant cells # ACGAGGAGTAGGGACT_NS_12 1 Malignant cells # ACGAGGATCTGCCCTA_NS_12 1 Malignant cells # 绘制热图 # 热图这部分代码看不懂的同学请先行学习：相关链接见专题页 # 顶部注释 top_anno <- HeatmapAnnotation ( foo = anno_block ( gp = gpar ( fill = 1 : 22 , col = "NA" ), labels = paste0 ( 'Chr' , 1 : 22 ), labels_gp = gpar ( cex = 0.8 , rot = 90 )), annotation_height = unit ( 0.3 , "cm" )) # 调整注释的高度 color_v = RColorBrewer :: brewer.pal ( 8 , "Dark2" )[ 1 : 7 ] names (color_v) = as.character ( 1 : 7 ) # 提取两个子集 malignant_tS1_data <- expr[, rownames (kmeans_df_s[kmeans_df_s $ Sub_type %in% c ( "Malignant cells" , "tS1" ), ])] AT1_data <- expr[, rownames (kmeans_df_s[kmeans_df_s $ Sub_type == "AT1" , ]) ] # 配色方案：my_fn <- fivenum (expr) # 返回最低值、下四分位数、中位数，上四分位数、最大值 col_fun <- colorRamp2 ( c (my_fn[ 1 ], my_fn[ 3 ], my_fn[ 5 ]), c ( " , " , " )) # 确定最低值、中位数、最高值的颜色分配方案 # 为 Malignant cells 和 tS1 生成注释：malignant_tS1_anno <- rowAnnotation ( df = kmeans_df_s[kmeans_df_s $ Sub_type %in% c ( "Malignant cells" , "tS1" ), ], col = list ( Sub_type = c ( "tS1" = " , 'Malignant cells' = ' ), kmeans_class = color_v)) # 为 AT1 生成注释：AT1_anno <- rowAnnotation ( df = kmeans_df_s[kmeans_df_s $ Sub_type == "AT1" , ], col = list ( Sub_type = c ( "AT1" = " ), kmeans_class = color_v)) # 热图1：Malignant cells + tS1的cnvscore热图 ht1 <- Heatmap ( t (malignant_tS1_data)[,sub_geneFile $ V1], col = col_fun, cluster_rows = FALSE , cluster_columns = FALSE , show_column_names = FALSE , show_row_names = FALSE , top_annotation = top_anno, left_annotation = malignant_tS1_anno, column_split = factor (sub_geneFile $ num_chr), border = TRUE , # 添加边框 column_title = NULL ) # `use_raster` is automatically set to TRUE for a matrix with more than # 2000 columns You can control `use_raster` argument by explicitly # setting TRUE/FALSE to it. # # Set `ht_opt$message = FALSE` to turn off this message. # 热图2：AT1的cnv score热图 ht2 <- Heatmap ( t (AT1_data)[,sub_geneFile $ V1], col = col_fun, cluster_rows = FALSE , cluster_columns = FALSE , show_column_names = FALSE , show_row_names = FALSE , left_annotation = AT1_anno, column_split = factor (sub_geneFile $ num_chr), border = TRUE , # 添加边框 show_column_dend = FALSE # 关闭column_split标签 ) # `use_raster` is automatically set to TRUE for a matrix with more than # 2000 columns You can control `use_raster` argument by explicitly # setting TRUE/FALSE to it. # # Set `ht_opt$message = FALSE` to turn off this message. # 把两张热图拼接起来 draw (ht1 %v% ht2, heatmap_legend_side = "right" ) 这样还是可以明显看出Malignant cells与tS1在聚类中分离的比较开，并且发生了更多的拷贝数变异事件。

5.4 小提琴图 5.4.1 从细胞类群的角度看 # 转化cnvscore数据 # fivenum (expr) # 原始的cov_score是以1为中心分布的数据 # [1] 0.7623885 1.0011799 1.0011799 1.0011799 1.5786998 CNV_score = expr -1 # 转换为以0为中心分布的数据 CNV_score = CNV_score ^ 2 # 平方，防止累加的过程中cnv lose和cnv gain相互抵消，造成没有cnv事件发生的假象 CNV_score = as.data.frame ( colMeans (CNV_score)) # 以细胞为单位，获得cnv score colnames (CNV_score) = "CNV_score" # 更改列名 CNV_score $ CB = rownames (CNV_score) # 加入cell barcode列 kmeans_df_s $ CB = rownames (kmeans_df_s) # 把kmeans_df_s中的cell barcode也加上 CNV_score = CNV_score %>% inner_join (kmeans_df_s, by= "CB" ) # 合并cnvscore和聚类结果 library (ggplot2) library (ggsignif) ggplot (CNV_score, aes (Sub_type,CNV_score, fill= Sub_type)) + geom_violin ( alpha= 0.4 ) + stat_boxplot ( geom= "errorbar" , position = position_dodge ( width = 0.1 ), width= 0.1 ) + # 加上误差棒 geom_boxplot ( alpha= 0.5 , outlier.size= 0 , size= 0.3 , width= 0.3 ) + # 加上箱线图 scale_fill_manual ( values = ggsci :: pal_aaas ( 3 )) + # 填充颜色 theme_bw + # 主题为空白 geom_signif ( comparisons = list ( c ( "AT1" , "tS1" ), c ( "tS1" , "Malignant cells" ), c ( "AT1" , "Malignant cells" )), step_increase = 0.1 , map_signif_level = F) 可以看到结果非常完美，作为正常细胞的 AT1 与 tS1 与 Malignant cells之间的统计结果呈现出极显著的关系，而AT1与tS1之间则没有显著性差异(0.086) 5.4.2 从聚类的角度来看 ggplot (CNV_score, aes (kmeans_class,CNV_score, fill= kmeans_class)) + geom_violin ( alpha= 0.4 ) + stat_boxplot ( geom= "errorbar" , position = position_dodge ( width = 0.1 ), width= 0.1 ) + # 加上误差棒 geom_boxplot ( alpha= 0.5 , outlier.size= 0 , size= 0.3 , width= 0.3 ) + # 加上箱线图 scale_fill_manual ( values = ggsci :: pal_nejm ( 8 )) + # 填充颜色 theme_bw # 主题为空白可以看出1、3、5、6、7明显cnv score要高于其它几个亚群，我们可以通过cluster在Sub_type中的亚群来辅助判断它们的身份：cell.prop <- as.data.frame ( prop.table ( table (CNV_score $ kmeans_class, CNV_score $ Sub_type))) colnames (cell.prop) <- c ( "kmeans_class" , "Sub_type" , "proportion" ) ggplot (cell.prop, aes (kmeans_class,proportion, fill= Sub_type)) + geom_bar ( stat= "identity" , position= "fill" ) + ggtitle ( "" ) + theme_bw + theme ( axis.ticks.length= unit ( 0.5 , 'cm' )) + guides ( fill= guide_legend ( title= NULL )) + theme ( axis.text.x.bottom = element_text ( angle = 45 , hjust = 1 , vjust = 1 )) + scale_fill_manual ( values = ggsci :: pal_nejm ( 8 )) # 填充颜色果然不出所料，1、3、6、7中均存在很高比例的Malignant cells，而5含有一定比例的Malignant cell，非常的迷惑，假设这是一个未经注释的数据集，我一定会这么命名：CNV_score $ Type <- recode (CNV_score $ kmeans_class, '1' = 'Malignant cells' , '2' = 'Normal Epi' , '3' = 'Malignant cells' , '4' = 'Normal Epi' , '5' = 'Malignant-transition cells' , '6' = 'Malignant cells' , '7' = 'Malignant cells' ) unique (CNV_score $ Type) # [1] Normal Epi Malignant-transition cells # [3] Malignant cells # Levels: Malignant cells Normal Epi Malignant-transition cells # 看一下重命名过的cnv score怎么分布：ggplot (CNV_score, aes (Type,CNV_score, fill= Type)) + geom_violin ( alpha= 0.4 ) + stat_boxplot ( geom= "errorbar" , position = position_dodge ( width = 0.1 ), width= 0.1 ) + # 加上误差棒 geom_boxplot ( alpha= 0.5 , outlier.size= 0 , size= 0.3 , width= 0.3 ) + # 加上箱线图 scale_fill_manual ( values = ggsci :: pal_nejm ( 8 )) + # 填充颜色 theme_bw + # 主题为空白 geom_signif ( comparisons = list ( c ( 'Malignant cells' , 'Normal Epi' ), c ( 'Normal Epi' , 'Malignant-transition cells' ), c ( 'Malignant cells' , 'Malignant-transition cells' )), step_increase = 0.1 , map_signif_level = F) 猜的没错，正常组和另外两组cnv score间均有显著性差异，我们可以将这个注释导回原来的上皮细胞Seurat对象中：rownames (CNV_score) <- CNV_score $ CB # 设置Cell barcode行名，AddMeta会按照行名匹配 Epi <- AddMetaData (Epi, metadata = CNV_score) # 添加CNV score注释信息 Idents (Epi) <- 'Type' # 改变Seurat对象默认标签 DimPlot (Epi, reduction = "tsne" , cols = ggsci :: pal_nejm ( 8 )) # 查看不同type的降维情况 # 下面的工作就是生物学方面的意义啦，比如做个差异分析：DEG <- FindMarkers (Epi, ident.1 = 'Malignant cells' , ident.2 = 'Normal Epi' ) head (DEG) # p_val avg_log2FC pct.1 pct.2 p_val_adj # IGFBP3 1.443336e-146 6.379162 0.604 0.021 4.277182e-142 # GAPDH 1.958204e-116 1.945720 0.994 0.830 5.802941e-112 # LY6D 1.324926e-111 5.484267 0.457 0.011 3.926287e-107 # S100P 3.648096e-110 3.440433 0.569 0.047 1.081077e-105 # FAM83A 2.220502e-106 4.188541 0.569 0.055 6.580235e-102 # STEAP1 4.566941e-102 4.089125 0.466 0.023 1.353367e-97 DEG $ Gene <- row.names (DEG) # 画个火山图看一下：ggplot (DEG, aes (avg_log2FC , - log10 (p_val))) + # 横向水平参考线：geom_hline ( yintercept = - log10 ( 0.05 ), linetype = "dashed" , color = " ) + # 纵向垂直参考线：geom_vline ( xintercept = c ( - 1.2 , 1.2 ), linetype = "dashed" , color = " ) + # 散点图: geom_point ( aes ( size= - log10 (p_val), color= - log10 (p_val))) + # 指定颜色渐变模式：scale_color_gradientn ( values = seq ( 0 , 1 , 0.2 ), colors = c ( " , " , ' , " , " )) + # 指定散点大小渐变模式：scale_size_continuous ( range = c ( 0.5 , 4 )) + # 主题调整：theme_bw + # 调整主题和图例位置：# theme(panel.grid = element_blank, # legend.position = c(0.01,0.7), # legend.justification = c(0,1) # )+ # 设置部分图例不显示：guides ( col = guide_colourbar ( title = "-Log10_q-value" ), size = "none" ) + # 添加标签：geom_text ( aes ( label= Gene, color = - log10 (p_val)), size = 3 , vjust = 1.5 , hjust= 1 ) + # 修改坐标轴：xlab ( "Log2FC" ) + ylab ( "-Log10(FDR q-value)" ) 常规套路就是富集分析之流的，大家可以参考：富集分析 Sys.time # [1] "2025-08-29 17:57:39 CST" save.image ( "result_infercnv/infercnv.rdata" )