scRNA-seq 数据基础分析（2025修订）｜第十三讲：改造单细胞图形-DotPlot

1. 准备工作 suppressMessages ({ library (Seurat) library (ggplot2) library (dplyr) library (tidyr) library (aplot) library (ggtree) }) # 和第十二讲前期处理类似 # 同样读入第三讲-单样本分析保存的pbmc.rds sce_final <- readRDS ( '../result_seuratV5/pbmc.rds' ) Idents (sce_final) <- "seurat_clusters" DimPlot (sce_final) scRNA.markers <- FindAllMarkers (sce_final, only.pos = TRUE , min.pct = 0.25 , logfc.threshold = 0.25 ) # 选取每个亚群top3基因 marker <- scRNA.markers %>% group_by (cluster) %>% top_n ( n = 3 , wt = avg_log2FC) marker <- marker $ gene 2. DotPlot参数 DotPlot ( object, features, assay = NULL , 默认是active assay cols = c ( "lightgrey" , "blue" ), col.min = - 2.5 , col.max = 2.5 , dot.min = 0 , dot.scale = 6 , idents = NULL , group.by = NULL , split.by = NULL , cluster.idents = FALSE , scale = TRUE , scale.by = "radius" , scale.min = NA , scale.max = NA ) 3. DotPlot及其自带参数美化 # 默认参数效果 DotPlot (sce_final, features = marker) # 修改参数展示 # 一般比较常用的调整参数包括以下：idents / group.by / cols / features # （1）挑选部分细胞类型展示 DotPlot (sce_final, features = marker, idents = c ( "0" , "2" , "4" , "6" )) # （2）按照指定列作为分组信息展示，例如使用seurat_clusters分组 DotPlot (sce_final, features = marker, group.by = "celltype" ) # （3）修改渐变色 DotPlot (sce_final, features = marker, cols = c ( "white" , "red" )) # （4）不同类别的features，分类展示 marker_split <- list ( TCells= c ( "IL32" , "CD3D" , "CD3E" , "MALAT1" , "LDHB" ), Myeloid= c ( "LST1" , "TYROBP" , "FCER1G" , "CST3" , "AIF1" ), BCells= c ( "CD79A" , "MS4A1" , "CD79B" , "LINC00926" , "TCL1A" ), Platelet= c ( "GP9" , "AP001189.4" , "ITGA2B" , "TMEM40" , "LY6G6F" )) DotPlot (sce_final, features = marker_split ) 4、DotPlot联合ggplot2美化 # 在绘图过程中我们也可能遇到几十个基因同时绘制的情况，这时候避免出现字符重叠的情况，我们可以利用Seurat自带修饰主题或者ggplot2这个好帮手 # （1）Seurat自带修饰主题，调整坐标轴字符倾斜角度 DotPlot (sce_final, features = marker) + RotatedAxis # （2）ggplot2的theme函数，调整坐标轴字符倾斜角度 DotPlot (sce_final, features = marker) + theme ( axis.text.x= element_text ( angle= 45 , hjust = 1 )) # （3）ggplot2的coord_flip函数，坐标轴翻转 DotPlot (sce_final, features = marker) + coord_flip 5、ggplot2复现DotPlot函数源码 DotPlot <- function ( object, features, assay = NULL , cols = c ( "lightgrey" , "blue" ), col.min = - 2.5 , col.max = 2.5 , dot.min = 0 , dot.scale = 6 , idents = NULL , group.by = NULL , split.by = NULL , cluster.idents = FALSE , scale = TRUE , scale.by = 'radius' , scale.min = NA , scale.max = NA ) { assay <- assay %||% DefaultAssay ( object = object) DefaultAssay ( object = object) <- assay split.colors <- ! is.null ( x = split.by) && ! any (cols %in% rownames ( x = brewer.pal.info)) scale.func <- switch ( EXPR = scale.by, 'size' = scale_size, 'radius' = scale_radius, stop ( "'scale.by' must be either 'size' or 'radius'" ) ) feature.groups <- NULL if ( is.list (features) | any ( ! is.na ( names (features)))) { feature.groups <- unlist ( x = sapply ( X = 1 : length (features), FUN = function (x) { return ( rep ( x = names ( x = features)[x], each = length (features[[x]]))) } )) if ( any ( is.na ( x = feature.groups))) { warning ( "Some feature groups are unnamed." , call. = FALSE , immediate. = TRUE ) } features <- unlist ( x = features) names ( x = feature.groups) <- features } cells <- unlist ( x = CellsByIdentities ( object = object, cells = colnames (object[[assay]]), idents = idents)) data.features <- FetchData ( object = object, vars = features, cells = cells) data.features $ id <- if ( is.null ( x = group.by)) { Idents ( object = object)[cells, drop = TRUE ] } else { object[[group.by, drop = TRUE ]][cells, drop = TRUE ] } if ( ! is.factor ( x = data.features $ id)) { data.features $ id <- factor ( x = data.features $ id) } id.levels <- levels ( x = data.features $ id) data.features $ id <- as.vector ( x = data.features $ id) if ( ! is.null ( x = split.by)) { splits <- FetchData ( object = object, vars = split.by)[cells, split.by] if (split.colors) { if ( length ( x = unique ( x = splits)) > length ( x = cols)) { stop ( paste0 ( "Need to specify at least " , length ( x = unique ( x = splits)), " colors using the cols parameter" )) } cols <- cols[ 1 : length ( x = unique ( x = splits))] names ( x = cols) <- unique ( x = splits) } data.features $ id <- paste (data.features $ id, splits, sep = '_' ) unique.splits <- unique ( x = splits) id.levels <- paste0 ( rep ( x = id.levels, each = length ( x = unique.splits)), "_" , rep ( x = unique ( x = splits), times = length ( x = id.levels))) } data.plot <- lapply ( X = unique ( x = data.features $ id), FUN = function (ident) { data.use <- data.features[data.features $ id == ident, 1 : ( ncol ( x = data.features) - 1 ), drop = FALSE ] avg.exp <- apply ( X = data.use, MARGIN = 2 , FUN = function (x) { return ( mean ( x = expm1 ( x = x))) } ) pct.exp <- apply ( X = data.use, MARGIN = 2 , FUN = PercentAbove, threshold = 0 ) return ( list ( avg.exp = avg.exp, pct.exp = pct.exp)) } ) names ( x = data.plot) <- unique ( x = data.features $ id) if (cluster.idents) { mat <- do.call ( what = rbind, args = lapply ( X = data.plot, FUN = unlist) ) mat <- scale ( x = mat) id.levels <- id.levels[ hclust ( d = dist ( x = mat)) $ order] } data.plot <- lapply ( X = names ( x = data.plot), FUN = function (x) { data.use <- as.data.frame ( x = data.plot[[x]]) data.use $ features.plot <- rownames ( x = data.use) data.use $ id <- x return (data.use) } ) data.plot <- do.call ( what = 'rbind' , args = data.plot) if ( ! is.null ( x = id.levels)) { data.plot $ id <- factor ( x = data.plot $ id, levels = id.levels) } ngroup <- length ( x = levels ( x = data.plot $ id)) if (ngroup == 1 ) { scale <- FALSE warning ( "Only one identity present, the expression values will be not scaled" , call. = FALSE , immediate. = TRUE ) } else if (ngroup < 5 & scale) { warning ( "Scaling data with a low number of groups may produce misleading results" , call. = FALSE , immediate. = TRUE ) } avg.exp.scaled <- sapply ( X = unique ( x = data.plot $ features.plot), FUN = function (x) { data.use <- data.plot[data.plot $ features.plot == x, 'avg.exp' ] if (scale) { data.use <- scale ( x = log1p (data.use)) data.use <- MinMax ( data = data.use, min = col.min, max = col.max) } else { data.use <- log1p ( x = data.use) } return (data.use) } ) avg.exp.scaled <- as.vector ( x = t ( x = avg.exp.scaled)) if (split.colors) { avg.exp.scaled <- as.numeric ( x = cut ( x = avg.exp.scaled, breaks = 20 )) } data.plot $ avg.exp.scaled <- avg.exp.scaled data.plot $ features.plot <- factor ( x = data.plot $ features.plot, levels = features ) data.plot $ pct.exp[data.plot $ pct.exp < dot.min] <- NA data.plot $ pct.exp <- data.plot $ pct.exp * 100 if (split.colors) { splits.use <- unlist ( x = lapply ( X = data.plot $ id, FUN = function (x) sub ( paste0 ( ".*_(" , paste ( sort ( unique ( x = splits), decreasing = TRUE ), collapse = '|' ), ")$" ), " \\ 1" , x ) ) ) data.plot $ colors <- mapply ( FUN = function (color, value) { return ( colorRampPalette ( colors = c ( 'grey' , color))( 20 )[value]) }, color = cols[splits.use], value = avg.exp.scaled ) } color.by <- ifelse ( test = split.colors, yes = 'colors' , no = 'avg.exp.scaled' ) if ( ! is.na ( x = scale.min)) { data.plot[data.plot $ pct.exp < scale.min, 'pct.exp' ] <- scale.min } if ( ! is.na ( x = scale.max)) { data.plot[data.plot $ pct.exp > scale.max, 'pct.exp' ] <- scale.max } if ( ! is.null ( x = feature.groups)) { data.plot $ feature.groups <- factor ( x = feature.groups[data.plot $ features.plot], levels = unique ( x = feature.groups) ) } plot <- ggplot ( data = data.plot, mapping = aes_string ( x = 'features.plot' , y = 'id' )) + geom_point ( mapping = aes_string ( size = 'pct.exp' , color = color.by)) + scale.func ( range = c ( 0 , dot.scale), limits = c (scale.min, scale.max)) + theme ( axis.title.x = element_blank , axis.title.y = element_blank ) + guides ( size = guide_legend ( title = 'Percent Expressed' )) + labs ( x = 'Features' , y = ifelse ( test = is.null ( x = split.by), yes = 'Identity' , no = 'Split Identity' ) ) + theme_cowplot if ( ! is.null ( x = feature.groups)) { plot <- plot + facet_grid ( facets = ~ feature.groups, scales = "free_x" , space = "free_x" , switch = "y" ) + theme ( panel.spacing = unit ( x = 1 , units = "lines" ), strip.background = element_blank ) } if (split.colors) { plot <- plot + scale_color_identity } else if ( length ( x = cols) == 1 ) { plot <- plot + scale_color_distiller ( palette = cols) } else { plot <- plot + scale_color_gradient ( low = cols[ 1 ], high = cols[ 2 ]) } if ( ! split.colors) { plot <- plot + guides ( color = guide_colorbar ( title = 'Average Expression' )) } return (plot) } 从中可以看到，DotPlot函数主要依赖于下面这段代码绘图，使用到的主要绘图函数是ggplot2 接下来我们尝试用终极大法“ggplot2”还原DotPlot的结果 # DotPlot绘图数据获取 p1 <- DotPlot (sce_final, features = marker) p1 data <- p1 $ data # 添加cluster对应细胞类型名称 data $ celltype <- as.character (new.cluster.ids[data $ id]) # 数据查看 head (data) # avg.exp pct.exp features.plot id avg.exp.scaled celltype # CCR7 2.966887045 43.459916 CCR7 0 2.0782862 Naive CD4 T # LEF1 2.080254074 33.333333 LEF1 0 2.0344535 Naive CD4 T # PRKCQ-AS1 2.092014612 32.770745 PRKCQ-AS1 0 1.8243866 Naive CD4 T # S100A8 0.415138208 8.298172 S100A8 0 -0.5995877 Naive CD4 T # FOLR3 0.005816666 0.140647 FOLR3 0 -0.3653701 Naive CD4 T # S100A12 0.012959228 0.281294 S100A12 0 -0.4006327 Naive CD4 T # 设置展示顺序，将相同细胞类型的cluster放在一起展示 data $ id <- factor (data $ id, levels= c ( "0" , "2" , "4" , "6" , "1" , "5" , "7" , "3" , "8" )) # ggplot2绘图 p2 <- ggplot (data , aes ( x = features.plot, y = id, size = pct.exp, color = avg.exp.scaled)) + geom_point ( shape= 16 ) + = 16实心圆 theme_bw + scale_colour_gradient2 + xlab ( NULL ) + ylab ( NULL ) + theme ( axis.text.x = element_text ( angle = 45 , hjust = 1 )) + theme ( panel.grid.major= element_blank ) + geom_hline ( yintercept= c ( 4.5 , 7.5 , 8.5 ), linewidth= . 5 ) + theme ( panel.border = element_rect ( fill= NA , color= "black" , linewidth= 1.5 , linetype= "solid" )) + 线型为实心线 coord_flip p2 new.cluster.ids <- c ( "TCells" , "Myeloid" , "TCells" , "BCells" , "TCells" , "Myeloid" , "TCells" , "Myeloid" , "Platelet" ) names (new.cluster.ids) <- 0 : 8 anno1 <- data.frame (new.cluster.ids) anno1 <- anno1[ c ( "0" , "2" , "4" , "6" , "1" , "5" , "7" , "3" , "8" ),,drop = F] anno1 $ cluster <- rownames (anno1) anno1 $ cluster <- factor (anno1 $ cluster, levels= c ( "0" , "2" , "4" , "6" , "1" , "5" , "7" , "3" , "8" )) anno1 $ other <- "anno1" anno1 # new.cluster.ids cluster other # 0 TCells 0 anno1 # 2 TCells 2 anno1 # 4 TCells 4 anno1 # 6 TCells 6 anno1 # 1 Myeloid 1 anno1 # 5 Myeloid 5 anno1 # 7 Myeloid 7 anno1 # 3 BCells 3 anno1 # 8 Platelet 8 anno1 p2_anno <- ggplot (anno1, aes ( x= cluster, y= other, fill= new.cluster.ids)) + geom_tile + scale_fill_manual ( values = c ( "TCells" = " , "Myeloid" = " , "BCells" = " , "Platelet" = " )) + scale_y_discrete ( position = "right" ) + theme_bw + theme ( panel.grid.major= element_blank , panel.border = element_blank ) + theme ( axis.text.x = element_blank , axis.text.y = element_blank , axis.ticks.x= element_blank , axis.ticks.y = element_blank ) + labs ( x= "" , y= "" ) p2_anno set.seed ( 7 ) # 合并dotplot和细胞注释条 p2 %>% insert_top (p2_anno, height = . 05 ) expr <- as.data.frame (sce_final @ assays $ RNA $ data[marker,]) hc <- hclust ( dist (expr)) ggtree (hc) ggtree (hc) + geom_text ( aes ( label = node), hjust = - 0.3 , vjust = - 0.3 ) hcc1 <- ggtree (hc) + geom_nodepoint ( color= " , alpha= 1 / 4 , size= 10 ) + geom_tippoint ( color= " , shape= 8 , size= 3 ) hcc1 hcc2 <- ggtree (hc) + geom_point ( aes ( shape= isTip, color= isTip), size= 3 ) + geom_hilight ( node= 31 , fill= "steelblue" , alpha= . 1 ) + geom_hilight ( node= 34 , fill= "darkgreen" , alpha= . 1 ) + geom_hilight ( node= 37 , fill= " , alpha= . 1 ) + geom_hilight ( node= 40 , fill= " , alpha= . 1 ) + geom_hilight ( node= 41 , fill= " , alpha= . 1 ) hcc2 # 合并dotplot、细胞注释条和基因聚类树 p2 %>% insert_top (p2_anno, height = . 05 ) %>% insert_left (hcc1, width = 1 ) p2 %>% insert_top (p2_anno, height = . 05 ) %>% insert_left (hcc2, width = 1 )