The identification of rare diseases from clinical notes with Natural Language Processing (NLP) is challenging due to the few cases available for machine learning and the need of data annotation from clinical experts. We propose a method using ontologies and weak supervision. The approach includes two steps: (i) Text-to-UMLS, linking text mentions to concepts in Unified Medical Language System (UMLS), with a named entity linking tool (e.g. SemEHR) and weak supervision based on customised rules and Bidirectional Encoder Representations from Transformers (BERT) based contextual representations, and (ii) UMLS-to-ORDO, matching UMLS concepts to rare diseases in Orphanet Rare Disease Ontology (ORDO). Using MIMIC-III US intensive care discharge summaries as a case study, we show that the Text-to-UMLS process can be greatly improved with weak supervision, without any annotated data from domain experts. Our analysis shows that the overall pipeline processing discharge summaries can surface rare disease cases, which are mostly uncaptured in manual ICD codes of the hospital admissions.
Description
Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision | IEEE Conference Publication | IEEE Xplore
%0 Conference Paper
%1 9630043
%A Dong, Hang
%A Suárez-Paniagua, Víctor
%A Zhang, Huayu
%A Wang, Minhong
%A Whitfield, Emma
%A Wu, Honghan
%B 2021 43rd Annual International Conference of the IEEE Engineering in Medicine & Biology Society (EMBC)
%D 2021
%K myown ontology ontology_matching ordo phenotyping rare_disease text_phenotyping umls weak_supervision
%P 2294-2298
%R 10.1109/EMBC46164.2021.9630043
%T Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision
%U https://ieeexplore.ieee.org/document/9630043
%X The identification of rare diseases from clinical notes with Natural Language Processing (NLP) is challenging due to the few cases available for machine learning and the need of data annotation from clinical experts. We propose a method using ontologies and weak supervision. The approach includes two steps: (i) Text-to-UMLS, linking text mentions to concepts in Unified Medical Language System (UMLS), with a named entity linking tool (e.g. SemEHR) and weak supervision based on customised rules and Bidirectional Encoder Representations from Transformers (BERT) based contextual representations, and (ii) UMLS-to-ORDO, matching UMLS concepts to rare diseases in Orphanet Rare Disease Ontology (ORDO). Using MIMIC-III US intensive care discharge summaries as a case study, we show that the Text-to-UMLS process can be greatly improved with weak supervision, without any annotated data from domain experts. Our analysis shows that the overall pipeline processing discharge summaries can surface rare disease cases, which are mostly uncaptured in manual ICD codes of the hospital admissions.
@comment{EMBC 2021 conference paper. The citation key is the IEEE Xplore document number that also appears in the url and doi fields. Bibliographic fields come first; the trailing fields (added-at, timestamp, biburl, interhash, intrahash) are BibSonomy bookkeeping ignored by standard styles.}
@inproceedings{9630043,
  author      = {Dong, Hang and Suárez-Paniagua, Víctor and Zhang, Huayu and Wang, Minhong and Whitfield, Emma and Wu, Honghan},
  title       = {Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision},
  booktitle   = {2021 43rd Annual International Conference of the {IEEE} Engineering in Medicine \& Biology Society ({EMBC})},
  year        = 2021,
  month       = nov,
  pages       = {2294--2298},
  issn        = {2694-0604},
  doi         = {10.1109/EMBC46164.2021.9630043},
  url         = {https://ieeexplore.ieee.org/document/9630043},
  keywords    = {myown ontology ontology_matching ordo phenotyping rare_disease text_phenotyping umls weak_supervision},
  abstract    = {The identification of rare diseases from clinical notes with Natural Language Processing (NLP) is challenging due to the few cases available for machine learning and the need of data annotation from clinical experts. We propose a method using ontologies and weak supervision. The approach includes two steps: (i) Text-to-UMLS, linking text mentions to concepts in Unified Medical Language System (UMLS), with a named entity linking tool (e.g. SemEHR) and weak supervision based on customised rules and Bidirectional Encoder Representations from Transformers (BERT) based contextual representations, and (ii) UMLS-to-ORDO, matching UMLS concepts to rare diseases in Orphanet Rare Disease Ontology (ORDO). Using MIMIC-III US intensive care discharge summaries as a case study, we show that the Text-to-UMLS process can be greatly improved with weak supervision, without any annotated data from domain experts. Our analysis shows that the overall pipeline processing discharge summaries can surface rare disease cases, which are mostly uncaptured in manual ICD codes of the hospital admissions.},
  description = {Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision | IEEE Conference Publication | IEEE Xplore},
  added-at    = {2021-12-12T19:24:57.000+0100},
  timestamp   = {2021-12-12T19:25:19.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/285cd1a378656ec7b869b5e3b3c1a9679/hangdong},
  interhash   = {c5e70fb0013e57097a8eccb64994485f},
  intrahash   = {85cd1a378656ec7b869b5e3b3c1a9679},
}