Malware is a constant threat and is continuously evolving. Security systems try to keep up with the constant change. One challenge that arises is the large amount of logs generated on an operating system and the need to clarify which information contributes to the detection of possible malware. This work aims at the detection of malware using neural networks based on Windows audit log events. Neural networks can only process continuous data, but Windows audit logs are sequential and textual data. To address these challenges, we extract features out of the audit log events and use LSTMs to capture sequential effects. We create different subsets of features and analyze the effects of additional information. Features describe for example the action-type of windows audit log events, process names or target files that are accessed. Textual features are represented either as one-hot encoding or embedding representation, for which we compare three different approaches for representation learning. Effects of different feature subsets and representations are evaluated on a publicly available data set. Results indicate that using additional information improves the performance of the LSTM-model. While different representations lead to similar classification results, analysis of the latent space shows differences more precisely where FastText seems to be the most promising representation.
%0 Journal Article
%1 RING2021102389
%A Ring, Markus
%A Schlör, Daniel
%A Wunderlich, Sarah
%A Landes, Dieter
%A Hotho, Andreas
%D 2021
%J Computers & Security
%K 2021 Embeddings LSTM Malware Windows app_security audit author:hotho author:schloer csf from:hotho logs myown representation_learning research_imbalanced_data research_sequential
%P 102389
%R 10.1016/j.cose.2021.102389
%T Malware detection on windows audit logs using LSTMs
%U https://www.sciencedirect.com/science/article/pii/S0167404821002133
%V 109
%X Malware is a constant threat and is continuously evolving. Security systems try to keep up with the constant change. One challenge that arises is the large amount of logs generated on an operating system and the need to clarify which information contributes to the detection of possible malware. This work aims at the detection of malware using neural networks based on Windows audit log events. Neural networks can only process continuous data, but Windows audit logs are sequential and textual data. To address these challenges, we extract features out of the audit log events and use LSTMs to capture sequential effects. We create different subsets of features and analyze the effects of additional information. Features describe for example the action-type of windows audit log events, process names or target files that are accessed. Textual features are represented either as one-hot encoding or embedding representation, for which we compare three different approaches for representation learning. Effects of different feature subsets and representations are evaluated on a publicly available data set. Results indicate that using additional information improves the performance of the LSTM-model. While different representations lead to similar classification results, analysis of the latent space shows differences more precisely where FastText seems to be the most promising representation.
@article{RING2021102389,
  abstract   = {Malware is a constant threat and is continuously evolving. Security systems try to keep up with the constant change. One challenge that arises is the large amount of logs generated on an operating system and the need to clarify which information contributes to the detection of possible malware. This work aims at the detection of malware using neural networks based on Windows audit log events. Neural networks can only process continuous data, but Windows audit logs are sequential and textual data. To address these challenges, we extract features out of the audit log events and use LSTMs to capture sequential effects. We create different subsets of features and analyze the effects of additional information. Features describe for example the action-type of windows audit log events, process names or target files that are accessed. Textual features are represented either as one-hot encoding or embedding representation, for which we compare three different approaches for representation learning. Effects of different feature subsets and representations are evaluated on a publicly available data set. Results indicate that using additional information improves the performance of the LSTM-model. While different representations lead to similar classification results, analysis of the latent space shows differences more precisely where FastText seems to be the most promising representation.},
  added-at   = {2022-02-04T03:28:25.000+0100},
  author     = {Ring, Markus and Schlör, Daniel and Wunderlich, Sarah and Landes, Dieter and Hotho, Andreas},
  biburl     = {https://www.bibsonomy.org/bibtex/23ddb12d38e6773b7a382a56780a39e37/dmir},
  doi        = {10.1016/j.cose.2021.102389},
  interhash  = {ec5a015286f1efc5262765d2c2b8636e},
  intrahash  = {3ddb12d38e6773b7a382a56780a39e37},
  issn       = {0167-4048},
  journal    = {Computers \& Security},
  keywords   = {2021 Embeddings LSTM Malware Windows app_security audit author:hotho author:schloer csf from:hotho logs myown representation_learning research_imbalanced_data research_sequential},
  pages      = {102389},
  timestamp  = {2024-04-09T14:04:11.000+0200},
  title      = {Malware Detection on {Windows} Audit Logs Using {LSTMs}},
  url        = {https://www.sciencedirect.com/science/article/pii/S0167404821002133},
  volume     = {109},
  year       = {2021}
}