Github全套代码文献复现之卵巢和子宫内膜肿瘤（二）|| 作者不进行 UMI count 回归的原因

昨天，我们给大家介绍了新专辑《Github带有全套代码分享的文献复现2025》，受到大家的热烈喜爱，里面学习的文章为：《A multi-omic single-cell landscape of human gynecologic malignancies》，详细介绍可以看上一篇帖子。今天继续来学习他的代码~

简单回顾

文章对应的数据为：https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE173682，作者针对每个样本进行单独的预处理，然后进行merge合并继续后面的分析。

上一篇文章 Github带有全套代码分享的文献复现2025 中我们学习了作者使用MAD方法对低质量细胞进行过滤，今天来看看数据标准化部分作者给出的不进行 UMI count 或者线粒体基因回归的原因，以及inferCNV 分析不走寻常路挑选正常参考 ref 的过程。

文章中是这样描述的，下面来看看代码部分！两个条件二选一

PC1 was not correlated to UMI counts per cell or
evidence of biological variation was found in PC1 based on the number of inferred CNVs and cell type gene signature enrichment

数据标准化

rna 是接上一篇稿子中进行MAD过滤后的 seurat 对象，首先接着进行了默认的标准化，高变基因鉴定，归一化，PCA分析：

# Default Seurat processing
rna "LogNormalize", scale.factor = 10000)
rna "vst", nfeatures = 2000)
rna rna

获取各种细胞的基因集进行打分

接着作者使用了两个数据库的基因集signatures得到 immune/stromal/fibroblast/endothelial 的打分：

ESTIMATE 打分：来源于文件scENDO_scOVAR_2020-main\scRNA-seq Processing Scripts\Individual_Samples\ESTIMATE_signatures.csv，可以得到基质细胞和免疫细胞打分
PanglaoDB数据库：下载自PanglaoDB数据库（https://panglaodb.se/），得到文件 PanglaoDB_markers_27_Mar_2020.tsv.gz，可以得到 fibroblast、endothelial、epithelial、smooth、plasma 这些细胞的打分




    
# Score cells for immune/stromal/fibroblast/endothelial signatures
############################################################################################
# ESTIMATE signatures 
ESTIMATE.signatures "./ESTIMATE_signatures.csv"
immune.stromal head(immune.stromal)
table(immune.stromal$V2)

stromal $V1[1:141]
stromal

immune $V1[142:282]
immune

# PanglaoDB：其他细胞signature
tsv "./PanglaoDB_markers_27_Mar_2020.tsv.gz")  
panglaodb "\t") 
panglaodb "Hs" | species == "Mm Hs") # Human subset 
panglaodb "Connective tissue" |
                             organ == "Epithelium" |
                             organ == "Immune system" |
                             organ == "Reproductive"|
                             organ == "Vasculature" |
                             organ == "Smooth muscle")
panglaodb $official.gene.symbol), panglaodb$cell.type)

fibroblast $Fibroblasts
fibroblast

endothelial endothelial

epithelial epithelial

smooth smooth

plasma plasma

# 生成一个list signature
feats 
# 打分
rna "stromal.","immune.","fibroblast.","endothelial.", "epithelial.","smooth.","plasma."),search = T)
head(rna@meta.data)

AddModuleScore函数得到的各个打分结果如下：

将 PC1 添加到metadata中并计算它与各种细胞的相关性

# Add PC1 to metadata
rna@meta.data$PC1 $pca@cell.embeddings[,1]

count_cor_PC1 $PC1,rna$nCount_RNA,method = "spearman");count_cor_PC1
stromal.cor $PC1,rna$stromal.1,method = "spearman");stromal.cor
immune.cor $PC1,rna$immune.2,method = "spearman");immune.cor
fibroblast.cor $PC1,rna$fibroblast.3,method = "spearman");fibroblast.cor
endothelial.cor $PC1,rna$endothelial.4,method = "spearman");endothelial.cor
epithelial.cor $PC1,rna$epithelial.5,method = "spearman");epithelial.cor
smooth.cor $PC1,rna$smooth.6,method = "spearman");smooth.cor
plasma.cor $PC1,rna$plasma.7,method = "spearman");plasma.cor

接下来，作者对 PC1 与 count值即read depth进行判断，如果这两者相关（即cor值大于0.5），就看PC1是否与生物学变异相关：

# If PC1 is correalted with read depth, check to see if biological variation is corralted to PC1
round(abs(count_cor_PC1),2) > 0.5
# [1] TRUE

test = 0.5 |
    round(abs(immune.cor),2) >= 0.5 |
    round(abs(fibroblast.cor),2) >= 0.5 |
    round(abs(endothelial.cor),2) >= 0.5 |
    round(abs(epithelial.cor),2) >= 0.5 |
    round(abs(smooth.cor),2) >= 0.5 |
    round(abs(plasma.cor),2) >= 0.5
test
# [1] TRUE

两个都为TRUE。说明PC1与上面的因素相关，继续下面判断PC1是否与CNV事件相关。

inferCNV判断：PC1 correlated with CNV events/Malignancy

1、cnv之前的降维聚类

dims我这里直接改成了20，作者用的50，这里问题不大：

rna rna rna Idents(rna) "RNA_snn_res.0.7"
table(Idents(rna))
DimPlot(rna, label = T)

鉴定到17个cluster：

2、inferCNV参考细胞选择

ESTIMATE immune打分鉴定immune细胞cluster，又学到了在没有做细胞注释的情况下怎么选择免疫细胞参考：

# Verify with inferCNV: is PC1 correlated with CNV events/Malignancy?
#########################################################################
# inferCNV: does PC1 also correlated with CNV/malignancy status?
library(infercnv)
library(stringr)
library(Seurat)
counts_matrix = GetAssayData(rna, slot="counts")

# Identify immune clusters
#######################################################
# Find immune cells by relative enrichment of ESTIMATE immune signature
library(psych)
test "immune.2")
test
data test$data$immune.2, test$data$ident, mat = TRUE)
data.immune  0.1)
data.immune

结合小提琴图以及作者卡的0.1打分阈值，cluster3/13/14被鉴定为免疫细胞cluster！

浆细胞cluster：

test "plasma.7")
test
data test$data$plasma.7, test$data$ident, mat = TRUE)
data.plasma  0.1)
data.plasma

cluster13：

将选出的三个亚群注释到metadata信息中：

immune.clusters $group1,levels(Idents(rna)))
plasma.clusters $group1,levels(Idents(rna)))

immune.clusters immune.clusters

for (i in 1:length(immune.clusters)){
  j which(levels(Idents(rna)) == immune.clusters[i])
  levels(Idents(rna))[j] "immune.",immune.clusters[i])
}
rna@meta.data$predoublet.idents head(rna@meta.data)
table(rna$predoublet.idents)
idents $predoublet.idents)
head(idents)
colnames(idents) "V1","V2")
saveRDS(rna,"./rna_predoublet_preinferCNV.rds")

现在的cluster信息如下：

3、制作inferCNV 输入文件

Homo_sapiens.GRCh38.86.txt：在目录scENDO_scOVAR_2020-main\scRNA-seq Processing Scripts\Individual_Samples

# Make inferCNV input files
rownames(idents) colnames(idents) write.table(idents,"./sample_annotation_file_inferCNV.txt",sep = "\t",row.names = FALSE)
idents "./sample_annotation_file_inferCNV.txt",header = F)
head(idents)

library(EnsDb.Hsapiens.v86)
GRCH38.annotations "./Homo_sapiens.GRCh38.86.txt"
gtf convert.symbol = function(Data){
  ensembls $V1
  ensembls "\\.[0-9]*$", "", ensembls)
  geneIDs1 "GENEID", columns = "SYMBOL")
  Data   Data   Data$feature $SYMBOL
  Data.new $SYMBOL,Data$V2,Data$V3,Data$V4)
  Data.new$Data.V2 "chr",Data.new$Data.V2,sep = "")
  Data.new$Data.SYMBOL $Data.SYMBOL)
  return(Data.new)
}
gtf head(gtf)
write.table(gtf,"./Homo_sapiens.GRCh38.86.symbol.txt",sep = "\t",row.names = FALSE,col.names = FALSE)

4、运行inferCNV

# 免疫细胞数
num.immune.clusters = length(immune.clusters)
# create the infercnv object
infercnv_obj = CreateInfercnvObject(raw_counts_matrix=as.matrix(counts_matrix),
                                    annotations_file="./sample_annotation_file_inferCNV.txt",
                                    gene_order_file="./Homo_sapiens.GRCh38.86.symbol.txt",
                                    ref_group_names=paste0("immune.",immune.clusters) )

# perform infercnv operations to reveal cnv signal
infercnv_obj = infercnv::run(infercnv_obj,
                             cutoff=0.1,  # use 1 for smart-seq, 0.1 for 10x-genomics
                             out_dir="./output_dir_CNV_predoublet",  # dir is auto-created for storing outputs
                             cluster_by_groups=T,   # cluster
                             denoise=T,scale_data = T,
                             HMM=T,HMM_type = "i6",
                             analysis_mode = "samples",
                             min_cells_per_gene = 10,
                             BayesMaxPNormal = 0.4, 
                             num_threads = 8
                             )

5、判断PC1是否与CNV事件相关

文件 HMM_CNV_predictions.HMMi6.hmm_mode-samples.Pnorm_0.4.pred_cnv_regions.dat 是使用 inferCNV 工具基于隐马尔可夫模型（HMM）进行拷贝数变异（CNV）预测的结果文件，该文件包含了预测的CNV区域的详细信息，每一行代表一个CNV区域，列出了该区域的染色体位置、起始和结束坐标、状态分配以及对应的细胞分组，通过该文件，可以快速定位和分析样本中不同细胞群体的CNV区域，了解基因组的拷贝数变异情况。
文件 BayesNetOutput.HMMi6.hmm_mode-samples/CNV_State_Probabilities.dat 是使用 inferCNV 工具进行拷贝数变异（CNV）分析时，基于贝叶斯网络（Bayesian Network）计算得到的 CNV 状态概率文件。通过查看每个 CNV 区域属于正常状态（1x）的概率，可以评估该 CNV 预测的可靠性。如果某个 CNV 区域的 P(Normal) 值较高（例如超过 0.5），可能需要谨慎对待该 CNV 的真实性。

# 读取inferCNV预测结果
regions "./output_dir_CNV_predoublet/HMM_CNV_predictions.HMMi6.hmm_mode-samples.Pnorm_0.4.pred_cnv_regions.dat")
head(regions)
# cell_group_name        cnv_name state   chr     start       end
# 1          0.0_s1  chr6-region_12     2  chr6  26055787  33299324
# 2          0.0_s1  chr7-region_15     4  chr7 103344254 135977353
# 3          0.0_s1  chr9-region_19     4  chr9  32552999  92764812
# 4          0.0_s1 chr11-region_22     4 chr11   9138825  35232402
# 5          0.0_s1 chr16-region_31     2 chr16     53010   1964860
# 6          1.1_s1  chr1-region_41     4  chr1  60865259  94927278

probs "./output_dir_CNV_predoublet/BayesNetOutput.HMMi6.hmm_mode-samples/CNV_State_Probabilities.dat")
head(probs)
probs colnames(probs) "Prob.Normal"
probs cnvs cnvs "\\.","-",cnvs)

regions $cnv_name %in% cnvs, ]
regions

cnv.groups "\\..*", "", regions$cell_group_name)
cnv.groups

length(which(rownames(rna@reductions$pca@cell.embeddings) == rownames(rna@meta.data)))
rna$PC1.loading $pca@cell.embeddings[,1]
rna$cell.barcode rna$CNV.Pos $predoublet.idents) %in% cnv.groups,1,0)

cnv.freq $cell_group_name))
cnv.freq$Var1 "\\..*", "", cnv.freq$Var1)

rna$Total_CNVs $predoublet.idents) %in% cnv.freq$Var1,cnv.freq$Freq,0)

boxplot.cnv   geom_boxplot() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
boxplot.cnv
ggsave(filename = "02-scRNA_Res/Predoublet_CNV_PC1_boxplot.png",width = 7, height = 4, plot = boxplot.cnv)

data $data$PC1.loading, boxplot.cnv$data$predoublet.idents, mat = TRUE)
data$CNV $group1 %in% cnv.groups,1,0)

wilcox wilcox

相关，则不进行 nCount_RNA 回归：

wilcox$p.value 
if (wilcox$p.value # 运行这里
  rna   library(stringr)
  levels(Idents(rna)) "immune.") # 这里去掉之前cluster编号上面注释的immune字符
  saveRDS(rna,"./rna_predoublet_PassedPC1Checks.rds")
}else{
  all.genes   rna "nCount_RNA")
  rna   rna   rna   Idents(rna) "RNA_snn_res.0.7"
  
  saveRDS(rna,"./rna_predoublet_FailedCNVTest.rds")
}