The identification of rare diseases from clinical notes with Natural Language
Processing (NLP) is challenging due to the few cases available for machine
learning and the need of data annotation from clinical experts. We propose a
method using ontologies and weak supervision. The approach includes two steps:
(i) Text-to-UMLS, linking text mentions to concepts in Unified Medical Language
System (UMLS), with a named entity linking tool (e.g. SemEHR) and weak
supervision based on customised rules and Bidirectional Encoder Representations
from Transformers (BERT) based contextual representations, and (ii)
UMLS-to-ORDO, matching UMLS concepts to rare diseases in Orphanet Rare Disease
Ontology (ORDO). Using MIMIC-III discharge summaries as a case study, we show
that the Text-to-UMLS process can be greatly improved with weak supervision,
without any annotated data from domain experts. Our analysis shows that the
overall pipeline processing discharge summaries can surface rare disease cases,
which are mostly uncaptured in manual ICD codes of the hospital admissions.
Description
Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision
%0 Generic
%1 dong2021disease
%A Dong, Hang
%A Suárez-Paniagua, Víctor
%A Zhang, Huayu
%A Wang, Minhong
%A Whitfield, Emma
%A Wu, Honghan
%D 2021
%K bert contextual_embedding contextual_representation entity_enrichment entity_linking myown ontology ontology_mapping ontology_matching ordo rare_disease semehr weak_supervision
%T Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision
%U http://arxiv.org/abs/2105.01995
%X The identification of rare diseases from clinical notes with Natural Language
Processing (NLP) is challenging due to the few cases available for machine
learning and the need of data annotation from clinical experts. We propose a
method using ontologies and weak supervision. The approach includes two steps:
(i) Text-to-UMLS, linking text mentions to concepts in Unified Medical Language
System (UMLS), with a named entity linking tool (e.g. SemEHR) and weak
supervision based on customised rules and Bidirectional Encoder Representations
from Transformers (BERT) based contextual representations, and (ii)
UMLS-to-ORDO, matching UMLS concepts to rare diseases in Orphanet Rare Disease
Ontology (ORDO). Using MIMIC-III discharge summaries as a case study, we show
that the Text-to-UMLS process can be greatly improved with weak supervision,
without any annotated data from domain experts. Our analysis shows that the
overall pipeline processing discharge summaries can surface rare disease cases,
which are mostly uncaptured in manual ICD codes of the hospital admissions.
@comment{arXiv preprint; record exported from BibSonomy (see biburl). The arXiv identifier is stored in eprint/archiveprefix so that eprint-aware styles can format and link it, instead of being printed verbatim from a free-text note.}
@misc{dong2021disease,
  author        = {Dong, Hang and Suárez-Paniagua, Víctor and Zhang, Huayu and Wang, Minhong and Whitfield, Emma and Wu, Honghan},
  title         = {Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision},
  year          = {2021},
  eprint        = {2105.01995},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2105.01995},
  abstract      = {The identification of rare diseases from clinical notes with Natural Language
    Processing (NLP) is challenging due to the few cases available for machine
    learning and the need of data annotation from clinical experts. We propose a
    method using ontologies and weak supervision. The approach includes two steps:
    (i) Text-to-UMLS, linking text mentions to concepts in Unified Medical Language
    System (UMLS), with a named entity linking tool (e.g. SemEHR) and weak
    supervision based on customised rules and Bidirectional Encoder Representations
    from Transformers (BERT) based contextual representations, and (ii)
    UMLS-to-ORDO, matching UMLS concepts to rare diseases in Orphanet Rare Disease
    Ontology (ORDO). Using MIMIC-III discharge summaries as a case study, we show
    that the Text-to-UMLS process can be greatly improved with weak supervision,
    without any annotated data from domain experts. Our analysis shows that the
    overall pipeline processing discharge summaries can surface rare disease cases,
    which are mostly uncaptured in manual ICD codes of the hospital admissions.},
  keywords      = {bert contextual_embedding contextual_representation entity_enrichment entity_linking myown ontology ontology_mapping ontology_matching ordo rare_disease semehr weak_supervision},
  description   = {Rare Disease Identification from Clinical Notes with Ontologies and Weak Supervision},
  added-at      = {2021-05-06T06:33:36.000+0200},
  timestamp     = {2021-05-06T06:33:36.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/27670e310d61714e8bf365c981744c3fe/hangdong},
  interhash     = {c5e70fb0013e57097a8eccb64994485f},
  intrahash     = {7670e310d61714e8bf365c981744c3fe},
}