In the era of enormous information production human capabilities have reached their limits. The need for automatic information processing which would not be incommensurate to human sophistication seems to be more than imperative. Information scientists have focused on the development of techniques and processes that would assist human contribution while improve, or at least guarantee, information quality. Automatic indexing techniques may lay on various approaches offering different results in information retrieval. In this paper we introduce an automated methodology for subject analysis, including both the determination of the aboutness of the documents and the translation of the related concepts to the terms of a knowledge organization system. Focusing on a corpus consisting of articles related to the Digital Library Evaluation domain, topic modeling algorithms are utilized for the aboutness of the documents, while the context of the words in topics, as captured by Word Embeddings, are used for the assignment of the extracted topics to the concepts of the EuroVoc thesaurus.
%0 Conference Paper
%1 sfakakis_automated_2019
%A Sfakakis, Michalis
%A Papachristopoulos, Leonidas
%A Zoutsou, Kyriaki
%A Tsakonas, Giannis
%A Papatheodorou, Christos
%B Metadata and Semantic Research
%C Cham
%D 2019
%E Garoufallou, Emmanouel
%E Fallucchi, Francesca
%E De Luca, Ernesto William
%I Springer International Publishing
%K automatisches_indexieren
%P 103--114
%R 10.1007/978-3-030-36599-8_9
%T Automated subject indexing of domain specific collections using word embeddings and general purpose thesauri
%X In the era of enormous information production human capabilities have reached their limits. The need for automatic information processing which would not be incommensurate to human sophistication seems to be more than imperative. Information scientists have focused on the development of techniques and processes that would assist human contribution while improve, or at least guarantee, information quality. Automatic indexing techniques may lay on various approaches offering different results in information retrieval. In this paper we introduce an automated methodology for subject analysis, including both the determination of the aboutness of the documents and the translation of the related concepts to the terms of a knowledge organization system. Focusing on a corpus consisting of articles related to the Digital Library Evaluation domain, topic modeling algorithms are utilized for the aboutness of the documents, while the context of the words in topics, as captured by Word Embeddings, are used for the assignment of the extracted topics to the concepts of the EuroVoc thesaurus.
%@ 978-3-030-36599-8
@inproceedings{sfakakis_automated_2019,
  author     = {Sfakakis, Michalis and Papachristopoulos, Leonidas and Zoutsou, Kyriaki and Tsakonas, Giannis and Papatheodorou, Christos},
  title      = {Automated subject indexing of domain specific collections using word embeddings and general purpose thesauri},
  editor     = {Garoufallou, Emmanouel and Fallucchi, Francesca and De Luca, Ernesto William},
  booktitle  = {Metadata and {Semantic} {Research}},
  series     = {Communications in {Computer} and {Information} {Science}},
  publisher  = {Springer International Publishing},
  address    = {Cham},
  year       = {2019},
  pages      = {103--114},
  doi        = {10.1007/978-3-030-36599-8_9},
  isbn       = {978-3-030-36599-8},
  language   = {en},
  keywords   = {automatisches_indexieren},
  abstract   = {In the era of enormous information production human capabilities have reached their limits. The need for automatic information processing which would not be incommensurate to human sophistication seems to be more than imperative. Information scientists have focused on the development of techniques and processes that would assist human contribution while improve, or at least guarantee, information quality. Automatic indexing techniques may lay on various approaches offering different results in information retrieval. In this paper we introduce an automated methodology for subject analysis, including both the determination of the aboutness of the documents and the translation of the related concepts to the terms of a knowledge organization system. Focusing on a corpus consisting of articles related to the Digital Library Evaluation domain, topic modeling algorithms are utilized for the aboutness of the documents, while the context of the words in topics, as captured by Word Embeddings, are used for the assignment of the extracted topics to the concepts of the EuroVoc thesaurus.},
  added-at   = {2019-12-11T17:56:24.000+0100},
  timestamp  = {2019-12-11T17:58:34.000+0100},
  biburl     = {https://www.bibsonomy.org/bibtex/2993cbf8d93c937287dbf8a534a79fdcd/lepsky},
  interhash  = {0150cf2b1441572816218840314293de},
  intrahash  = {993cbf8d93c937287dbf8a534a79fdcd},
}