Annotating cell types on the basis of single-cell RNA-seq data is a prerequisite for research on disease progress and tumour microenvironments. Here we show that existing annotation methods typically suffer from a lack of curated marker gene lists, improper handling of batch effects and difficulty in leveraging the latent gene–gene interaction information, impairing their generalization and robustness. We developed a pretrained deep neural network-based model, single-cell bidirectional encoder representations from transformers (scBERT), to overcome the challenges. Following BERT’s approach to pretraining and fine-tuning, scBERT attains a general understanding of gene–gene interactions by being pretrained on huge amounts of unlabelled scRNA-seq data; it is then transferred to the cell type annotation task of unseen and user-specific scRNA-seq data for supervised fine-tuning. Extensive and rigorous benchmark studies validated the superior performance of scBERT on cell type annotation, novel cell type discovery, robustness to batch effects and model interpretability.
%0 Journal Article
%1 yang2022scbert
%A Yang, Fan
%A Wang, Wenchuan
%A Wang, Fang
%A Fang, Yuan
%A Tang, Duyu
%A Huang, Junzhou
%A Lu, Hui
%A Yao, Jianhua
%D 2022
%I Springer Nature
%J Nature Machine Intelligence
%K dmir-readinggroup machine-learning single_cell
%P 852-866
%R 10.1038/s42256-022-00534-z
%T scBERT as a Large-scale Pretrained Deep Language Model for Cell Type Annotation of Single-cell RNA-seq Data
%U https://www.nature.com/articles/s42256-022-00534-z
%V 4
%X Annotating cell types on the basis of single-cell RNA-seq data is a prerequisite for research on disease progress and tumour microenvironments. Here we show that existing annotation methods typically suffer from a lack of curated marker gene lists, improper handling of batch effects and difficulty in leveraging the latent gene–gene interaction information, impairing their generalization and robustness. We developed a pretrained deep neural network-based model, single-cell bidirectional encoder representations from transformers (scBERT), to overcome the challenges. Following BERT’s approach to pretraining and fine-tuning, scBERT attains a general understanding of gene–gene interactions by being pretrained on huge amounts of unlabelled scRNA-seq data; it is then transferred to the cell type annotation task of unseen and user-specific scRNA-seq data for supervised fine-tuning. Extensive and rigorous benchmark studies validated the superior performance of scBERT on cell type annotation, novel cell type discovery, robustness to batch effects and model interpretability.
@article{yang2022scbert,
  author        = {Yang, Fan and Wang, Wenchuan and Wang, Fang and Fang, Yuan and Tang, Duyu and Huang, Junzhou and Lu, Hui and Yao, Jianhua},
  title         = {{scBERT} as a Large-scale Pretrained Deep Language Model for Cell Type Annotation of Single-cell {RNA-seq} Data},
  journal       = {Nature Machine Intelligence},
  year          = {2022},
  month         = oct,
  volume        = {4},
  pages         = {852--866},
  doi           = {10.1038/s42256-022-00534-z},
  issn          = {2522-5839},
  url           = {https://www.nature.com/articles/s42256-022-00534-z},
  publisher     = {Springer Nature},
  language      = {en},
  abstract      = {Annotating cell types on the basis of single-cell RNA-seq data is a prerequisite for research on disease progress and tumour microenvironments. Here we show that existing annotation methods typically suffer from a lack of curated marker gene lists, improper handling of batch effects and difficulty in leveraging the latent gene–gene interaction information, impairing their generalization and robustness. We developed a pretrained deep neural network-based model, single-cell bidirectional encoder representations from transformers (scBERT), to overcome the challenges. Following BERT’s approach to pretraining and fine-tuning, scBERT attains a general understanding of gene–gene interactions by being pretrained on huge amounts of unlabelled scRNA-seq data; it is then transferred to the cell type annotation task of unseen and user-specific scRNA-seq data for supervised fine-tuning. Extensive and rigorous benchmark studies validated the superior performance of scBERT on cell type annotation, novel cell type discovery, robustness to batch effects and model interpretability.},
  keywords      = {dmir-readinggroup machine-learning single_cell},
  elocation-id  = {2021.12.05.471261},
  eprint        = {https://www.biorxiv.org/content/early/2022/08/01/2021.12.05.471261.full.pdf},
  added-at      = {2023-06-05T09:07:13.000+0200},
  timestamp     = {2023-06-05T09:15:47.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2f42324d95994db3273b64f2ca6951ef6/martinr},
  interhash     = {e321ca7a0b0510e11881fbdd82834aae},
  intrahash     = {f42324d95994db3273b64f2ca6951ef6},
}