第 3/8 章

四、monocle2

1、简介 视频教程:请点击monocle2轨迹分析视频链接。测试文件:请在《拟时序分析》3.monocle2实操:完整版推送中获取。

2、安装monocle2 # 安装指定版本的R和monocle2 # conda search monocle # conda create -n monocle.2.18.0 # conda activate monocle.2.18.0 # conda install r-base==4.0.5 # conda install # conda install bioconductor-monocle==2.18.0 library (monocle) library (Seurat) if ( ! require (dplyr)) install.packages ( 'dplyr' ) 3、monocle2对象的格式 celldataset的创建、转换 Sys.time # [1] "2025-04-28 12:25:56 CST" # 设置工作目录 setwd ( "/data/06_scRNA-seq_psedutime/" ) # 官方给出的cds对象构建过程 # 根据自己数据实际情况修改表达矩阵、细胞文件、基因名称文件 # HSMM_expr_matrix <- read.table("fpkm_matrix.txt") # HSMM_sample_sheet <- read.delim("cell_sample_sheet.txt") # HSMM_gene_annotation <- read.delim("gene_annotations.txt") # pd <- new("AnnotatedDataFrame", data = HSMM_sample_sheet) # fd <- new("AnnotatedDataFrame", data = HSMM_gene_annotation) # HSMM <- newCellDataSet(as.matrix(HSMM_expr_matrix), # phenoData = pd, featureData = fd) # seurat对象转换为cds对象方法 sce.obj <- readRDS ( 'data/sce.obj.rds' ) class (sce.obj) # [1] "Seurat" # attr(,"package") # [1] "SeuratObject" # 准备细胞*基因表达矩阵 data <- GetAssayData (sce.obj, slot = "counts" ) data[ 1 : 4 , 1 : 4 ] # 4 x 4 sparse Matrix of class "dgCMatrix" # AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 # AL627309.1 . . . # AP006222.2 . . . # RP11-206L10.2 . . . # RP11-206L10.9 . . . # AAACCGTGCTTCCG-1 # AL627309.1 . # AP006222.2 . # RP11-206L10.2 . # RP11-206L10.9 . # 准备phenoData,描述细胞的各种元属性,相当于[email protected],行-细胞,列-属性 pd <- new ( 'AnnotatedDataFrame' , data = sce.obj @ meta.data) # 准备featureData,基因名,行-基因名 fData <- data.frame ( gene_short_name = row.names (data), row.names = row.names (data)) fd <- new ( 'AnnotatedDataFrame' , data = fData) # 创建monocle2的celldataset(缩写为cds)对象 mycds <- newCellDataSet (data, phenoData = pd, featureData = fd, expressionFamily = negbinomial.size ) 需要根据输入数据做出相应的选择 count的输入,而前者计算速度更快,后者运行的更加准确,适用于截断正态分布(truncated normal distributions)的数据,如FPKM、TPM,monocle处理这类数据会先取log class (mycds) # [1] "CellDataSet" # attr(,"package") # [1] "monocle" cds => seurat.obj as.data.frame (mycds @ phenoData @ data)[ 1 : 4 , 1 : 4 ] # orig.ident nCount_RNA nFeature_RNA percent.mt # AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759 # AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958 # AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363 # AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845 names (mycds @ phenoData @ data) # [1] "orig.ident" "nCount_RNA" "nFeature_RNA" "percent.mt" # [5] "RNA_snn_res.0.5" "seurat_clusters" "celltype" "Size_Factor" class (mycds @ assayData $ exprs) # [1] "dgCMatrix" # attr(,"package") # [1] "Matrix" m2seurat <- CreateSeuratObject ( counts = mycds @ assayData $ exprs, meta.data = mycds @ phenoData @ data) class (m2seurat) # [1] "Seurat" # attr(,"package") # [1] "SeuratObject" 4、初探monocle2 4.1 预处理、质控 mycds <- estimateSizeFactors (mycds) mycds <- estimateDispersions (mycds) # 质控(可选) 去除死细胞、空液滴、碎细胞液滴、双液滴,提高计算准确度、降低运算时间 mycds <- detectGenes (mycds, min_expr = 0.1 ) head (Biobase :: fData (mycds)) # gene_short_name num_cells_expressed # AL627309.1 AL627309.1 9 # AP006222.2 AP006222.2 3 # RP11-206L10.2 RP11-206L10.2 5 # RP11-206L10.9 RP11-206L10.9 3 # LINC00115 18 # NOC2L 254 # head(fData(mycds))如果提示Error: unable to find an inherited method for function ‘fData’ for signature ‘x = "CellDataSet"’,可能是monocle2和monocle3的冲突,请调整命令为Biobase::fData(mycds),参考https://github.com/cole-trapnell-lab/garnett/issues/20 mycds_expressed_genes <- row.names ( subset (Biobase :: fData (mycds), num_cells_expressed >= 10 )) mycds_expressed_genes[ 1 : 5 ] # [1] "LINC00115" "NOC2L" "HES4" "ISG15" "C1orf159" mycds <- mycds[mycds_expressed_genes,] length ( row.names (sce.obj)) # [1] 13714 length (mycds_expressed_genes) # [1] 11095 4.2 细胞降维、分群、鉴定 # 我们说过monocle是可以用来鉴定细胞类型的 # 如果你拥有一些先验知识,你可以通过设定一定的阈值来鉴定细胞 # 比如我们之前命名好的Seurat.obj # 计算差异基因 seu.marker <- FindAllMarkers (sce.obj) # 提取每个亚群top10差异基因 top10gene <- seu.marker %>% group_by (cluster) %>% top_n ( n = 10 , wt = avg_log2FC) # 提取指定亚群差异基因 top10gene %>% filter (cluster == 'B' ) # # A tibble: 10 × 7 # # Groups: cluster [1] # p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene # <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr> # 1 8.79e-62 7.77 0.125 0.001 1.21e-57 B AC079767.4 # 2 2.03e-42 7.75 0.087 0.001 2.78e-38 B TNFRSF13C # 3 4.03e-29 7.93 0.061 0.001 5.52e-25 B KCNG1 # 4 8.47e-28 8.11 0.055 0 1.16e-23 B MACROD2 # 5 3.82e-25 9.12 0.047 0 5.25e-21 B FAM177B # 6 7.88e-25 7.75 0.049 0 1.08e-20 B AL928768.3 # 7 1.15e-23 8.70 0.044 0 1.58e-19 B RP11-16E12.2 # 8 7.94e-12 8.15 0.02 0 1.09e- 7 B MARC2 # 9 7.94e-12 7.97 0.02 0 1.09e- 7 B PKHD1L1 # 10 2.44e-10 7.83 0.017 0 3.34e- 6 B HRK top10gene %>% filter (cluster == 'NK' ) # # A tibble: 10 × 7 # # Groups: cluster [1] # p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene # <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr> # 1 2.63e-33 9.47 0.058 0 3.60e-29 NK LIM2 # 2 1.69e-24 7.01 0.058 0.001 2.32e-20 NK RHOBTB3 # 3 2.46e-19 7.76 0.039 0 3.37e-15 NK YES1 # 4 6.57e-16 7.41 0.032 0 9.01e-12 NK NME8 # 5 1.16e-15 8.48 0.026 0 1.59e-11 NK MIR181A2HG # 6 1.16e-15 8.05 0.026 0 1.59e-11 NK GRIK4 # 7 1.72e-12 7.20 0.026 0 2.36e- 8 NK CCNJL # 8 4.09e-12 8.03 0.019 0 5.61e- 8 NK BOK # 9 4.09e-12 7.97 0.019 0 5.61e- 8 NK BNC2 # 10 4.09e-12 8.12 0.019 0 5.61e- 8 NK KRT86 # 从 mycds 对象里找出基因名称为 "CD79A" 和 "GZMB" 的基因在 fData 数据框中的行名 CD79A_id <- row.names ( subset (Biobase :: fData (mycds), gene_short_name == "CD79A" )) GZMB_id <- row.names ( subset (Biobase :: fData (mycds), gene_short_name == "GZMB" )) # newCellTypeHierarchy 是一个函数,用于创建一个新的细胞类型层次结构对象。

cth 就是这个新创建的对象,后续会在这个基础上添加不同的细胞类型。# classify_func 是一个自定义的分类函数,它接收一个矩阵 x 作为输入。这里的矩阵 x 通常代表细胞的基因表达矩阵,x[CD79A_id,] 表示提取基因 CD79A 在所有细胞中的表达值。

该函数的判断条件是当基因 CD79A 的表达值大于等于 1 时,将细胞分类为 B 细胞 # 当基因 CD79A 的表达值小于 1 且基因 GZMB 的表达值大于 1 时,将细胞分类为 NK 细胞 cth <- newCellTypeHierarchy cth <- addCellType (cth, "B" , classify_func = function (x) { x[CD79A_id,] >= 1 }) cth <- addCellType (cth, "NK" , classify_func = function (x) { x[CD79A_id,] < 1 & x[GZMB_id,] > 1 }) class (cth) # [1] "CellTypeHierarchy" # attr(,"package") # [1] "monocle" mycds <- classifyCells (mycds, cth, 0.1 ) table (Biobase :: pData (mycds) $ CellType) # # B NK Unknown # 417 198 2023 # 如果出现以下报错提示 # `nei` was deprecated in igraph 2.1.0 and is now defunct;

# 定位错误发生位置:rlang::last_trace # 解决方法:修改源码 # trace("cth_classifier_cell",edit = T, where = asNamespace("monocle")) 后 修改nei为.nei # pData(mycds)报错提示Error: unable to find an inherited method for function ‘pData’ for signature ‘x = "CellDataSet"’ # 将命令修改为 Biobase::pData(mycds) pm <- ggplot (Biobase :: pData (mycds), aes ( x = factor ( 1 ), fill = factor (CellType))) + geom_bar ( width = 1 ) + coord_polar ( theta = "y" ) + theme ( axis.title.x = element_blank , axis.title.y = element_blank ) ps <- ggplot (Biobase :: pData (mycds), aes ( x = factor ( 1 ), fill = factor (celltype))) + geom_bar ( width = 1 ) + coord_polar ( theta = "y" ) + theme ( axis.title.x = element_blank , axis.title.y = element_blank ) pm | ps # 当然,monocle也可以用非监督式的方法进行分群:# 找出一些表达量高得高变基因来,这些基因可能是会参与细胞命运得决定得 # 这步其实有点像 Seurat::FindVariableFeatures disp_table <- dispersionTable (mycds) unsup_clustering_genes <- subset (disp_table, mean_expression >= 0.1 ) # 大概两千多个“离散基因” length (unsup_clustering_genes $ gene_id) # [1] 2252 # setOrderingFilter该函数的作用是为 Monocle 的 cell ordering 设置一个基因的子集,作为排序的依据 mycds <- setOrderingFilter (mycds, unsup_clustering_genes $ gene_id) plot_ordering_genes (mycds) # 平均表达值与离散度(可变性较高的基因被标为黑点,而灰色的点会被下游分析弃用) # 这其实就是Seurat里的肘图 monocle :: plot_pc_variance_explained (mycds, return_all = F) # 如果提示plot_pc_variance_explained(mycds, return_all = F):Error in plot_pc_variance_explained(mycds, return_all = F),将命令调整为monocle::plot_pc_variance_explained(mycds, return_all = F) # 降维 mycds <- reduceDimension (mycds, max_components = 2 , num_dim = 6 , reduction_method = 'tSNE' , verbose = T) # monocle分群可以指定分群的数量 cluster.num <- length ( unique (sce.obj $ celltype)) cluster.num # [1] 9 mycds <- clusterCells (mycds, num_clusters = cluster.num) mono.redu <- plot_cell_clusters (mycds, 1 , 2 , color = "celltype" , show_cell_names = T, cell_name_size = 5 ) mono.redu library (patchwork) plot_cell_clusters (mycds, 1 , 2 , color = "celltype" , markers = c ( "CD79A" ), show_cell_names = T, cell_name_size = 5 ) | mono.redu plot_cell_clusters (mycds, 1 , 2 , color = "celltype" , markers = c ( "GZMB" ), show_cell_names = T, cell_name_size = 5 ) | mono.redu # 看一下与Seurat的差别,基本形状还是差不多的,但是Seurat的label是可以展示的 seu.tsne <- DimPlot (sce.obj, label = T, reduction = 'tsne' ) plot_cell_clusters (mycds, 1 , 2 , color = "celltype" , show_cell_names = T, cell_name_size = 10 ) | seu.tsne # 这里相当于Seurat regression,排除一些不想要的因素 names (pd @ data) # [1] "orig.ident" "nCount_RNA" "nFeature_RNA" "percent.mt" # [5] "RNA_snn_res.0.5" "seurat_clusters" "celltype" # 排除单个因素影响 mycds <- reduceDimension (mycds, max_components = 2 , num_dim = 2 , reduction_method = 'tSNE' , residualModelFormulaStr = "~percent.mt" , verbose = T) mycds <- clusterCells (mycds, num_clusters = 9 ) reg.tsne <- plot_cell_clusters (mycds, 1 , 2 , color = "celltype" ) mono.redu | reg.tsne # 两个变量的影响一起排除 mycds <- reduceDimension (mycds, max_components = 2 , num_dim = 2 , reduction_method = 'tSNE' , residualModelFormulaStr = "~nFeature_RNA + percent.mt" , verbose = T) mycds <- clusterCells (mycds, num_clusters = 9 ) reg2.tsne <- plot_cell_clusters (mycds, 1 , 2 , color = "celltype" ) mono.redu | reg.tsne | reg2.tsne plot_cell_clusters (mycds, 1 , 2 , color = "celltype" ) + facet_wrap ( ~ celltype) # 用monocle计算一下marker,看看降维效果如何 diff_test_res <- differentialGeneTest (mycds[mycds_expressed_genes,], fullModelFormulaStr = "~celltype" ) # 这一步就相当于Seurat里的findallmarker,约3.5min,不能多线程,狠麻烦,会算很久 diff_test_res[ 1 : 4 , 1 : 4 ] # status family pval qval # LINC00115 OK negbinomial.size 9.547089e-01 9.716102e-01 # NOC2L OK negbinomial.size 4.047651e-01 5.783723e-01 # HES4 OK negbinomial.size 1.118682e-45 6.430970e-44 # ISG15 OK negbinomial.size 2.504851e-22 6.148523e-21 ordering_genes <- row.names ( subset (diff_test_res, qval < 0.01 )) mycds <- setOrderingFilter (mycds, ordering_genes) plot_ordering_genes (mycds) mycds <- reduceDimension (mycds, max_components = 2 , num_dim = 3 , norm_method = 'log' , reduction_method = 'tSNE' , # residualModelFormulaStr = "~Media + num_genes_expressed", verbose = T) mycds <- clusterCells (mycds, num_clusters = 9 ) mono.gene.tsne <- plot_cell_clusters (mycds, 1 , 2 , color = "celltype" ) mono.gene.tsne | mono.redu | seu.tsne seu.var.gene <- Seurat :: VariableFeatures (sce.obj) length (seu.var.gene) # [1] 2000 mycds <- setOrderingFilter (mycds, seu.var.gene) mycds <- reduceDimension (mycds, max_components = 2 , num_dim = 3 , norm_method = 'log' , reduction_method = 'tSNE' , # residualModelFormulaStr = "~Media + num_genes_expressed", verbose = T) mycds <- clusterCells (mycds, num_clusters = 9 ) seu.var.plot <- plot_ordering_genes (mycds) mono.gene.tsne | seu.var.plot 5、 monocle2实战 有选择先验(半监督式)/高变基因(非监督式)两种机器学习供大家选择 5.1 半监督式拟时序 # 正式开始拟时序 # 第一步:,# 这一步主要作用是去噪、提取真正对拟时序有用的基因矩阵,这会影响拟时序的形状 # 半监督式:寻找自己感兴趣的,这里我们先用各celltype之间的marker gene尝试一下 diff_test_res <- differentialGeneTest (mycds[mycds_expressed_genes,], fullModelFormulaStr = "~celltype" ) diff_test_res[ 1 : 4 , 1 : 4 ] # status family pval qval # LINC00115 OK negbinomial.size 9.547089e-01 9.716102e-01 # NOC2L OK negbinomial.size 4.047651e-01 5.783723e-01 # HES4 OK negbinomial.size 1.118682e-45 6.430970e-44 # ISG15 OK negbinomial.size 2.504851e-22 6.148523e-21 ordering_genes <- row.names ( subset (diff_test_res, qval < 0.01 )) ordering_genes[ 1 : 5 ] # [1] "HES4" "ISG15" "TNFRSF18" "TNFRSF4" "SDF4" mycds <- setOrderingFilter (mycds, ordering_genes) plot_ordering_genes (mycds) Graph Embedding降维 mycds <- reduceDimension (mycds, max_components = 2 , method = 'DDRTree' ) mycds <- orderCells (mycds) # 如果报错出现的提示:# Error in if (class(projection) != "matrix") projection <- as.matrix(projection):the condition has length > 1 # 解决方法:修改源码 # trace("project2MST",edit = T, where = asNamespace("monocle")) 删除 “ if (class(projection) != "matrix") ”这部分的代码 plot_cell_trajectory (mycds, color_by = "celltype" ) | plot_cell_trajectory (mycds, color_by = "State" ) | plot_cell_trajectory (mycds, color_by = "Pseudotime" ) # 注意:拟时间可能会与实际的生物学方向相反,如果你具有一定的先验知识,那么你可以反转这一进程 ori.pse <- plot_cell_trajectory (mycds, color_by = "Pseudotime" ) mycds <- orderCells (mycds, reverse = T) revserse.pse <- plot_cell_trajectory (mycds, color_by = "Pseudotime" ) ori.pse | revserse.pse # 如果你觉得混在一起看不清楚 plot_cell_trajectory (mycds, color_by = "State" ) + facet_wrap ( ~ State, nrow = 1 ) plot_cell_trajectory (mycds, color_by = "State" ) + facet_wrap ( ~ celltype, nrow = 1 ) blast_genes <- diff_test_res $ gene_short_name[ 1 : 10 ] blast_genes # [1] "LINC00115" "NOC2L" "HES4" "ISG15" "C1orf159" "TNFRSF18" # [7] "TNFRSF4" "SDF4" "B3GALT6" "UBE2J2" plot_genes_in_pseudotime (mycds[ c ( "HES4" , "ISG15" , "SDF4" ),], color_by = "State" ) mycds_expressed_genes <- row.names ( subset (Biobase :: fData (mycds), num_cells_expressed >= 10 )) mycds_filtered <- mycds[mycds_expressed_genes,] top10gene # # A tibble: 90 × 7 # # Groups: cluster [9] # p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene # <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr> # 1 4.03e-12 4.79 0.034 0.003 0.0000000552 Naive CD4 T NOG # 2 8.87e-10 5.85 0.022 0.001 0.0000122 Naive CD4 T REG4 # 3 2.94e- 8 7.12 0.016 0 0.000403 Naive CD4 T GTSCR1 # 4 1.54e- 7 3.81 0.02 0.002 0.00212 Naive CD4 T C2orf40 # 5 2.83e- 7 4.33 0.016 0.001 0.00388 Naive CD4 T MMP28 # 6 6.11e- 7 3.58 0.019 0.002 0.00837 Naive CD4 T FAM153A # 7 1.87e- 6 3.51 0.016 0.001 0.0256 Naive CD4 T MYB # 8 8.67e- 5 4.42 0.01 0.001 1 Naive CD4 T AXIN2 # 9 8.71e- 5 4.43 0.01 0.001 1 Naive CD4 T ST6GALNAC1 # 10 1.17e- 4 3.86 0.011 0.001 1 Naive CD4 T RP11-119D9.1 # # ℹ 80 more rows cds_subset <- mycds_filtered[ c ( 'LDHB' , 'CCR7' , 'CD3D' ),] plot_genes_in_pseudotime (cds_subset, color_by = "celltype" ) 5.2 非监督式的 # 刚才是半监督式的,现在来个非监督式的 # 重来 # 1、按照cluster基因做拟时序 mycds <- newCellDataSet (data, phenoData = pd, featureData = fd, expressionFamily = negbinomial.size ) mycds <- estimateSizeFactors (mycds) mycds <- estimateDispersions (mycds, cores= 10 , relative_expr = TRUE ) mycds <- detectGenes (mycds, min_expr = 0.1 ) Biobase :: fData (mycds) $ use_for_ordering <- Biobase :: fData (mycds) $ num_cells_expressed > 0.05 * ncol (mycds) expr.genes <- Biobase :: fData (mycds) $ gene_short_name[Biobase :: fData (mycds) $ use_for_ordering] mycds <- setOrderingFilter (mycds, ordering_genes = expr.genes ) mycds <- reduceDimension (mycds, max_components = 2 , method = 'DDRTree' ) plot_ordering_genes (mycds) plot_pc_variance_explained (mycds, return_all = F) mycds <- reduceDimension (mycds, max_components = 2 , norm_method = 'log' , num_dim = 3 , reduction_method = 'tSNE' , verbose = T) mycds <- clusterCells (mycds, rho_threshold = 2 , delta_threshold = 4 , skip_rho_sigma = T, verbose = F) plot_cell_clusters (mycds, color_by = 'as.factor(Cluster)' ) | plot_cell_clusters (mycds, color_by = 'as.factor(celltype)' ) clustering_DEG_genes <- differentialGeneTest (mycds[mycds_expressed_genes,], fullModelFormulaStr = '~Cluster' , cores = 10 ) mycds_ordering_genes <- row.names (clustering_DEG_genes)[ order (clustering_DEG_genes $ qval)][ 1 : 1000 ] mycds <- setOrderingFilter (mycds, ordering_genes = mycds_ordering_genes) mycds <- reduceDimension (mycds, method = 'DDRTree' ) suppressWarnings (mycds <- orderCells (mycds)) plot_cell_trajectory (mycds, color_by = "celltype" ) | plot_cell_trajectory (mycds, color_by = "Pseudotime" ) suppressWarnings (mycds <- orderCells (mycds, reverse = T)) cluster.pse <- plot_cell_trajectory (mycds, color_by = "Pseudotime" ) ori.pse | cluster.pse data <- data <- as ( as.matrix (sce.obj @ assays $ RNA $ counts), 'sparseMatrix' ) mycds <- newCellDataSet (data, phenoData = pd, featureData = fd, expressionFamily = negbinomial.size ) mycds <- estimateSizeFactors (mycds) mycds <- estimateDispersions (mycds, cores= 10 , relative_expr = TRUE ) disp_table <- dispersionTable (mycds) disp.genes <- subset (disp_table, mean_expression >= 0.1 & dispersion_empirical >= 1 * dispersion_fit) $ gene_id mycds <- setOrderingFilter (mycds, disp.genes) plot_ordering_genes (mycds) mycds <- reduceDimension (mycds, max_components = 2 , method = 'DDRTree' ) mycds <- orderCells (mycds) disper.pse <- plot_cell_trajectory (mycds, color_by = "Pseudotime" ) ori.pse | cluster.pse | disper.pse cds_subset <- mycds[ row.names (clustering_DEG_genes)[ order (clustering_DEG_genes $ qval)][ 1 : 1000 ],] diff_test_res <- differentialGeneTest (cds_subset, fullModelFormulaStr = "~sm.ns(Pseudotime)" ) library (dplyr) diff_test_res[, c ( "gene_short_name" , "pval" , "qval" )] %>% head # gene_short_name pval qval # S100A9 1.316208e-213 1.316208e-210 # S100A8 7.284744e-209 3.642372e-206 # NKG7 1.552122e-173 3.104243e-171 # GNLY 4.006125e-191 1.335375e-188 # GZMB 2.736250e-186 6.840626e-184 # PPBP 1.324811e-01 1.503758e-01 sig.gene <- row.names ( subset (diff_test_res, qval < 0.05 )) # plot_genes_in_pseudotime(cds_subset[sig.gene], # color_by = 'celltype',ncol = 5) plot_pseudotime_heatmap (cds_subset[sig.gene[ 1 : 30 ],], num_clusters = 3 , cores = 1 , show_rownames = T) 6、下游探索 6.1 DimPlot和FeaturePlot mono.info <- pData (mycds) head (mono.info) # orig.ident nCount_RNA nFeature_RNA percent.mt RNA_snn_res.0.5 # AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759 1 # AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958 3 # AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363 1 # AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845 2 # AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898 6 # AAACGCACTGGTAC-1 pbmc3k 2163 781 1.6643551 1 # seurat_clusters celltype Size_Factor Pseudotime State # AAACATACAACCAC-1 1 CD14+ Mono 1.1076350 4.1751808 1 # AAACATTGAGCTAC-1 3 B 2.2450329 5.4834034 3 # AAACATTGATCAGC-1 1 CD14+ Mono 1.4409787 2.5338982 1 # AAACCGTGCTTCCG-1 2 Memory CD4 T 1.2083708 8.6265722 3 # AAACCGTGTATGCG-1 6 NK 0.4487318 0.5655187 1 # AAACGCACTGGTAC-1 1 CD14+ Mono 0.9904153 4.8913800 1 # 可以看出这是一个行名为细胞名,列名为各个变量信息的数据框,可以通过以下方式将这个数据框作为注释列表添加回Seurat colnames (mono.info) # [1] "orig.ident" "nCount_RNA" "nFeature_RNA" "percent.mt" # [5] "RNA_snn_res.0.5" "seurat_clusters" "celltype" "Size_Factor" # [9] "Pseudotime" "State" # 为了避免与Seurat原有的注释重合,我们给这些变量名加一个后缀 new.var <- paste0 ( colnames (mono.info), '_mono.info' ) colnames (mono.info) <- new.var colnames (mono.info) # [1] "orig.ident_mono.info" "nCount_RNA_mono.info" # [3] "nFeature_RNA_mono.info" "percent.mt_mono.info" # [5] "RNA_snn_res.0.5_mono.info" "seurat_clusters_mono.info" # [7] "celltype_mono.info" "Size_Factor_mono.info" # [9] "Pseudotime_mono.info" "State_mono.info" sce.obj <- AddMetaData (sce.obj, metadata = mono.info) # 可以利用Seurat原有的降维图观察细胞类型、拟时序中state、拟时间分布:library (ggsci) DimPlot (sce.obj, group.by = 'celltype' , cols = pal_npg ( 10 )) | DimPlot (sce.obj, group.by = 'State_mono.info' , cols = pal_nejm ( 10 )) | FeaturePlot (sce.obj, features = 'Pseudotime_mono.info' , cols = c ( 'white' , 'darkred' )) 6.2 VlnPlot # VlnPlot可以用于绘制Pseudotime这样的数值型变量 VlnPlot (sce.obj, features = 'Pseudotime_mono.info' , group.by = 'celltype' , cols = pal_npg ( 10 )) | VlnPlot (sce.obj, features = 'Pseudotime_mono.info' , group.by = 'State_mono.info' , cols = pal_nejm ( 10 )) 6.3 ggplot2绘图 cellname <- rownames (mono.info) data4plot <- cbind (sce.obj @ meta.data, as.data.frame (sce.obj @ reductions $ umap @ cell.embeddings)[cellname,]) head (data4plot) # orig.ident nCount_RNA nFeature_RNA percent.mt RNA_snn_res.0.5 # AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759 1 # AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958 3 # AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363 1 # AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845 2 # AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898 6 # AAACGCACTGGTAC-1 pbmc3k 2163 781 1.6643551 1 # seurat_clusters celltype orig.ident_mono.info # AAACATACAACCAC-1 1 CD14+ Mono pbmc3k # AAACATTGAGCTAC-1 3 B pbmc3k # AAACATTGATCAGC-1 1 CD14+ Mono pbmc3k # AAACCGTGCTTCCG-1 2 Memory CD4 T pbmc3k # AAACCGTGTATGCG-1 6 NK pbmc3k # AAACGCACTGGTAC-1 1 CD14+ Mono pbmc3k # nCount_RNA_mono.info nFeature_RNA_mono.info # AAACATACAACCAC-1 2419 779 # AAACATTGAGCTAC-1 4903 1352 # AAACATTGATCAGC-1 3147 1129 # AAACCGTGCTTCCG-1 2639 960 # AAACCGTGTATGCG-1 980 521 # AAACGCACTGGTAC-1 2163 781 # percent.mt_mono.info RNA_snn_res.0.5_mono.info # AAACATACAACCAC-1 3.0177759 1 # AAACATTGAGCTAC-1 3.7935958 3 # AAACATTGATCAGC-1 0.8897363 1 # AAACCGTGCTTCCG-1 1.7430845 2 # AAACCGTGTATGCG-1 1.2244898 6 # AAACGCACTGGTAC-1 1.6643551 1 # seurat_clusters_mono.info celltype_mono.info # AAACATACAACCAC-1 1 CD14+ Mono # AAACATTGAGCTAC-1 3 B # AAACATTGATCAGC-1 1 CD14+ Mono # AAACCGTGCTTCCG-1 2 Memory CD4 T # AAACCGTGTATGCG-1 6 NK # AAACGCACTGGTAC-1 1 CD14+ Mono # Size_Factor_mono.info Pseudotime_mono.info State_mono.info # AAACATACAACCAC-1 1.1076350 4.1751808 1 # AAACATTGAGCTAC-1 2.2450329 5.4834034 3 # AAACATTGATCAGC-1 1.4409787 2.5338982 1 # AAACCGTGCTTCCG-1 1.2083708 8.6265722 3 # AAACCGTGTATGCG-1 0.4487318 0.5655187 1 # AAACGCACTGGTAC-1 0.9904153 4.8913800 1 # UMAP_1 UMAP_2 # AAACATACAACCAC-1 3.001694 3.751682 # AAACATTGAGCTAC-1 5.673855 -12.254552 # AAACATTGATCAGC-1 5.428529 5.915791 # AAACCGTGCTTCCG-1 -8.592008 -3.628471 # AAACCGTGTATGCG-1 2.763551 -2.193586 # AAACGCACTGGTAC-1 2.877154 5.927556 library ( "ggplot2" ) ggplot (data4plot, aes ( x= UMAP_1, y= UMAP_2)) + geom_point ( aes ( x= UMAP_1, y= UMAP_2, shape= State_mono.info, size= Pseudotime_mono.info, colour = celltype), data = data4plot) + scale_color_manual ( values = pal_nejm ( 10 )) + scale_size_continuous ( range = c ( 0.3 , 1.5 )) + theme_classic + theme ( axis.line = element_line ( size= 0.6 , colour = "black" , arrow = ggplot2 :: arrow ( length = ggplot2 :: unit ( 0.1 , "inches" ), ends = "last" , type = 'open' ))) 6.4 细胞比例探索 这部分内容其实我们以前出过专门的教程,详情见沉浸式统计细胞比例。

在这里也一样,我们可以去统计拟时序的state和celltype之间的关系。

6.4.1 饼图 # 统计细胞类型 unique (mono.info $ celltype_mono.info) # [1] CD14+ Mono B Memory CD4 T NK CD8 T # [6] Naive CD4 T FCGR3A+ Mono DC Platelet # 9 Levels: Naive CD4 T CD14+ Mono Memory CD4 T B CD8 T FCGR3A+ Mono NK ... Platelet mynames <- table (mono.info $ celltype_mono.info) %>% names mynames # [1] "Naive CD4 T" "CD14+ Mono" "Memory CD4 T" "B" "CD8 T" # [6] "FCGR3A+ Mono" "NK" "DC" "Platelet" # 统计细胞数目 myratio <- table (mono.info $ celltype_mono.info) %>% as.numeric myratio # [1] 697 483 480 344 271 162 155 32 14 # 整理细胞标签 pielabel <- paste0 (mynames, " (" , round (myratio / sum (myratio) * 100 , 2 ), "%)" ) pielabel # [1] "Naive CD4 T (26.42%)" "CD14+ Mono (18.31%)" "Memory CD4 T (18.2%)" # [4] "B (13.04%)" "CD8 T (10.27%)" "FCGR3A+ Mono (6.14%)" # [7] "NK (5.88%)" "DC (1.21%)" "Platelet (0.53%)" cols <- pal_npg ( "nrc" )( 10 ) cols # [1] " " " " " " # [7] " " " " pie (myratio, labels= pielabel, radius = 1.0 , clockwise= F, main = "Celltype" , col = cols) mynames <- table (mono.info $ State_mono.info) %>% names myratio <- table (mono.info $ State_mono.info) %>% as.numeric pielabel <- paste0 (mynames, " (" , round (myratio / sum (myratio) * 100 , 2 ), "%)" ) pielabel # [1] "1 (44.28%)" "2 (27.1%)" "3 (28.62%)" pie (myratio, labels= pielabel, radius = 1.0 , clockwise= F, main = "State" , col = cols) 6.4.2 堆积柱形图 cellnum <- table (mono.info $ celltype_mono.info,mono.info $ State_mono.info) cellnum <- cellnum[ ! apply (cellnum, 1 , sum) == 0 ,] # 按列统计细胞比例 cell.prop <- as.data.frame ( prop.table (cellnum, margin= 2 )) colnames (cell.prop) <- c ( "Celltype" , "State" , "Proportion" ) head (cell.prop) # Celltype State Proportion # 1 Naive CD4 T 1 0.25256849 # 2 CD14+ Mono 1 0.36044521 # 3 Memory CD4 T 1 0.00000000 # 4 B 1 0.01626712 # 5 CD8 T 1 0.23116438 # 6 FCGR3A+ Mono 1 0.00000000 p.bar <- ggplot (cell.prop, aes (State,Proportion, fill= Celltype)) + geom_bar ( stat= "identity" , position= "fill" ) + scale_fill_manual ( values= cols) + ggtitle ( "cell proportion" ) + theme_bw + theme ( axis.ticks.length= unit ( 0.5 , 'cm' )) + guides ( fill= guide_legend ( title= NULL )) p.bar 6.4.3 累计密度分布图 densyplot <- ggplot (mono.info, aes ( x= Pseudotime_mono.info, group= celltype_mono.info, fill= celltype_mono.info)) + geom_density ( alpha= 0.3 , adjust= 1.5 ) + scale_fill_manual ( values = c ( pal_npg ( "nrc" )( 10 )) ) + theme_bw densyplot 6.5 基因表达量的探索 6.5.1 在轨迹图上绘制表达量 相信大家一定也很感兴趣如何把Seurat中的基因拿到monocle来绘制。

比如我想通过plot_cell_trajectory函数,像绘制psedotime一样把基因的表达量在拟时序轨迹图中绘制出来,如果直接在color_by中输入基因名是会报错的。

disp.genes[ 1 ] # [1] "CPSF3L" # try(plot_cell_trajectory(mycds, color_by = "CPSF3L")) # 这是因为plot_cell_trajectory函数会在pData(cds)中寻找color_by的变量,所以这里要做的就是把Seurat中基因的表达量加到pData中去,这个操作一行就可以搞定 pData (mycds) $ CPSF3L <- sce.obj @ assays $ RNA @ data[ "CPSF3L" ,][ rownames ( pData (mycds))] try ( plot_cell_trajectory (mycds, color_by = "CPSF3L" )) # 可能你还是觉得上面这张图有点丑,想要改一下颜色,plot_cell_trajectory中无法指定color的值,我阅读了一下源码,发现这个图片本质上还是用ggplot2生成的,那事情就变得简单了 library (circlize) mycolor <- colorRampPalette ( colors = c ( ' , 'white' , ' ))( 100 ) plot_cell_trajectory (mycds, color_by = "CPSF3L" , cell_size = 0.5 ) + scale_colour_gradientn ( colours = mycolor, guide = "colorbar" ) 6.5.2 热图 其实热图的教程我们做过很多:热图绘制天花板:ComplexHeatmap|

一、初探 热图绘制天花板| 二、两万字长文探索注释功能 ComplexHeatmap|

三、用富集分析结果做成词云注释热图 在理解了monocle对象的数据结构和绘制热图的语法后,大家完全可以自行拼凑出下面的图,流程有些长,有点耐心哦:首先整理一下注释信息和表达矩阵 library (ComplexHeatmap) expr4plot <- sce.obj @ assays $ RNA @ scale.data mono.info <- mono.info[ rownames (mono.info) %in% colnames (expr4plot),] mono.info <- mono.info[ order (mono.info $ Pseudotime_mono.info),] expr4plot <- expr4plot[, rownames (mono.info)] dim (mono.info) # [1] 2638 10 dim (expr4plot) # [1] 13714 2638 # 设置顶部的psedotime、celltype、State的注释 top_anno <- HeatmapAnnotation ( Pseudotime= mono.info $ Pseudotime, Celltype= mono.info $ celltype_mono.info, State= mono.info $ State_mono.info) # 可以先看一下注释的效果 draw (top_anno) # 找一些感兴趣的基因集,比如各种细胞的marker,这里以每种细胞的Top20为例,实际过程中大家可以选择更多的基因,便于富集到有意义的通路:# 先展示基因来源 t.sub.marker <- FindAllMarkers (sce.obj, only.pos = TRUE , min.pct = 0.25 , logfc.threshold = 0.25 ) top20gene <- t.sub.marker %>% group_by (cluster) %>% top_n ( n = 20 , wt = avg_log2FC) left_ann <- rowAnnotation ( cellmarker= top20gene $ cluster) draw (left_ann) 来做个富集分析用于词云注释,富集分析的教程可以看这里:手把手教你做单细胞测序数据分析

(七)——基因集富集分析 library (clusterProfiler) library (org.Hs.eg.db) pathway4wc <- data.frame for (runcell in unique (top20gene $ cluster)) { try ({pathway4wc}) gene4er <- filter (top20gene,cluster == runcell) %>% pull (gene) gene.df <- bitr (gene4er, fromType= "SYMBOL" , toType= c ( "ENTREZID" , "ENSEMBL" ), OrgDb = org.Hs.eg.db) gores <- enrichGO (gene.df $ SYMBOL, org.Hs.eg.db, keyType = "SYMBOL" , ont = "BP" , pvalueCutoff = 0.05 , pAdjustMethod = "BH" , qvalueCutoff = 0.2 , minGSSize = 10 , maxGSSize = 500 , readable = FALSE , pool = FALSE ) go.all.res <- as.data.frame (gores @ result) pathway.dt <- top_n (go.all.res, - 10 , p.adjust) pathway.dt <- cbind (pathway.dt, data.frame ( cluster= rep (runcell, nrow (pathway.dt)))) pathway4wc <- rbind (pathway4wc,pathway.dt) } head (pathway4wc) # ID Description GeneRatio BgRatio # GO:0030098 GO:0030098 lymphocyte differentiation 8/19 419/18903 # GO:1903131 GO:1903131 mononuclear cell differentiation 8/19 473/18903 # GO:0030217 GO:0030217 T cell differentiation 7/19 296/18903 # GO:0050863 GO:0050863 regulation of T cell activation 7/19 376/18903 # GO:0022407 GO:0022407 regulation of cell-cell adhesion 7/19 490/18903 # GO:0045061 GO:0045061 thymic T cell selection 3/19 22/18903 # pvalue p.adjust qvalue # GO:0030098 3.327775e-09 2.370731e-06 1.453285e-06 # GO:1903131 8.596498e-09 2.370731e-06 1.453285e-06 # GO:0030217 9.224635e-09 2.370731e-06 1.453285e-06 # GO:0050863 4.778696e-08 9.210937e-06 5.646407e-06 # GO:0022407 2.898002e-07 4.468719e-05 2.739375e-05 # GO:0045061 1.309881e-06 1.450935e-04 8.894393e-05 # geneID Count cluster # GO:0030098 CCR7/CD3D/CD3E/LEF1/TCF7/CD27/NDFIP1/RHOH 8 Naive CD4 T # GO:1903131 CCR7/CD3D/CD3E/LEF1/TCF7/CD27/NDFIP1/RHOH 8 Naive CD4 T # GO:0030217 CCR7/CD3D/CD3E/LEF1/TCF7/CD27/RHOH 7 Naive CD4 T # GO:0050863 CCR7/CD3E/LEF1/TCF7/CD27/NDFIP1/RHOH 7 Naive CD4 T # GO:0022407 CCR7/CD3E/LEF1/CD27/RGCC/NDFIP1/RHOH 7 Naive CD4 T # GO:0045061 CCR7/CD3D/CD3E 3 Naive CD4 T # 构建词云的text # 遍历每个唯一的聚类 mytext <- list for (i in 1 : length ( unique (pathway4wc $ cluster))) { # 选择p.adjust最小的前5个通路 temp.text <- filter (pathway4wc,cluster == unique (pathway4wc $ cluster)[i]) %>% top_n ( - 1 ,p.adjust) %>% pull (Description) # 将当前聚类的通路描述信息添加到列表中 mytext[[i]] <- temp.text } # 为列表元素命名,使用聚类的唯一值 names (mytext) <- unique (pathway4wc $ cluster) # 依据显著性设置词云大小,越显著的通路字体越大 # 创建一个空向量用于存储每个聚类的p.adjust值 myfontsize <- c # 遍历每个唯一的聚类 for (i in 1 : length ( unique (pathway4wc $ cluster))) { # 筛选出当前聚类且p.adjust小于0.05的数据 temfs <- filter (pathway4wc,cluster == unique (pathway4wc $ cluster)[i]) %>% filter (p.adjust < 0.05 ) %>% # 选择p.adjust最小的前5个通路 top_n ( - 1 ,p.adjust) %>% pull (p.adjust) # 将当前聚类的p.adjust值添加到向量中 myfontsize <- c (myfontsize,temfs) } # 加载scales包,用于数据缩放 library (scales) # 对p.adjust取负对数,然后进行缩放,使字体大小范围在0到10之间 myfontsize <- - log2 (myfontsize) %>% scales :: rescale (., to= c ( 0 : 1 )) * 10 # 创建行注释对象 # 使用anno_textbox函数绘制文本框注释 max_line_length <- 30 # 定义换行函数,保留 list 结构 wrap_list_text <- function (lst, width = max_line_length) { lapply (lst, function (vec) { sapply (vec, function (txt) { paste ( strwrap (txt, width = width), collapse = " \n " ) }) }) } # 应用函数 mytext_wrapped <- wrap_list_text (mytext) right_ann <- rowAnnotation ( textbox = anno_textbox (top20gene $ cluster, mytext_wrapped, # 允许文本换行 add_new_line = T, by = "anno_block" , # 设置文本框的样式 gp = gpar ( col = c ( pal_npg ( 10 ), pal_nejm ( 8 ), pal_aaas ( 4 )) , fontsize= myfontsize), # 设置文本框的圆角 round_corners = TRUE , # 设置文本框的最大宽度 max_width= unit ( 3 , 'mm' ))) col_fun <- colorRamp2 ( quantile (expr4plot) %>% as.numeric , c ( ' , ' , 'white' , ' , ' )) finalhmp <- Heatmap (expr4plot[top20gene $ gene,], show_column_names = F, show_row_names = F, col = col_fun, top_annotation = top_anno, cluster_columns = F, row_split = top20gene $ cluster, row_title = NULL , right_annotation = right_ann, border = T, left_annotation = left_ann ) draw (finalhmp) save.image ( "result_m2/m2.Rdata" ) Sys.time # [1] "2025-04-28 13:09:31 CST"

← 上一章 下一章 →