This paper addresses the problem of supervised video summarization by
formulating it as a sequence-to-sequence learning problem, where the input is a
sequence of original video frames, the output is a keyshot sequence. Our key
idea is to learn a deep summarization network with attention mechanism to mimic
the way of selecting the keyshots of human. To this end, we propose a novel
video summarization framework named Attentive encoder-decoder networks for
Video Summarization (AVS), in which the encoder uses a Bidirectional Long
Short-Term Memory (BiLSTM) to encode the contextual information among the input
video frames. As for the decoder, two attention-based LSTM networks are
explored by using additive and multiplicative objective functions,
respectively. Extensive experiments are conducted on two video summarization
benchmark datasets, i.e., SumMe and TVSum. The results demonstrate the
superiority of the proposed AVS-based approaches against the state-of-the-art
approaches, with remarkable improvements from 0.8% to 3% on the two
datasets, respectively.
Description
[1708.09545] Video Summarization with Attention-Based Encoder-Decoder Networks
%0 Generic
%1 ji2017video
%A Ji, Zhong
%A Xiong, Kailin
%A Pang, Yanwei
%A Li, Xuelong
%D 2017
%K encoder-decoder summarization video
%T Video Summarization with Attention-Based Encoder-Decoder Networks
%U http://arxiv.org/abs/1708.09545
%X This paper addresses the problem of supervised video summarization by
formulating it as a sequence-to-sequence learning problem, where the input is a
sequence of original video frames, the output is a keyshot sequence. Our key
idea is to learn a deep summarization network with attention mechanism to mimic
the way of selecting the keyshots of human. To this end, we propose a novel
video summarization framework named Attentive encoder-decoder networks for
Video Summarization (AVS), in which the encoder uses a Bidirectional Long
Short-Term Memory (BiLSTM) to encode the contextual information among the input
video frames. As for the decoder, two attention-based LSTM networks are
explored by using additive and multiplicative objective functions,
respectively. Extensive experiments are conducted on two video summarization
benchmark datasets, i.e., SumMe and TVSum. The results demonstrate the
superiority of the proposed AVS-based approaches against the state-of-the-art
approaches, with remarkable improvements from 0.8% to 3% on the two
datasets, respectively.
@misc{ji2017video,
  abstract      = {This paper addresses the problem of supervised video summarization by
formulating it as a sequence-to-sequence learning problem, where the input is a
sequence of original video frames, the output is a keyshot sequence. Our key
idea is to learn a deep summarization network with attention mechanism to mimic
the way of selecting the keyshots of human. To this end, we propose a novel
video summarization framework named Attentive encoder-decoder networks for
Video Summarization (AVS), in which the encoder uses a Bidirectional Long
Short-Term Memory (BiLSTM) to encode the contextual information among the input
video frames. As for the decoder, two attention-based LSTM networks are
explored by using additive and multiplicative objective functions,
respectively. Extensive experiments are conducted on two video summarization
benchmark datasets, i.e., SumMe and TVSum. The results demonstrate the
superiority of the proposed AVS-based approaches against the state-of-the-art
approaches, with remarkable improvements from 0.8% to 3% on the two
datasets, respectively.},
  added-at      = {2018-04-18T14:53:39.000+0200},
  archiveprefix = {arXiv},
  author        = {Ji, Zhong and Xiong, Kailin and Pang, Yanwei and Li, Xuelong},
  biburl        = {https://www.bibsonomy.org/bibtex/26e2f2612153f73da2d17b4c5966db3e9/rcb},
  description   = {[1708.09545] Video Summarization with Attention-Based Encoder-Decoder Networks},
  eprint        = {1708.09545},
  interhash     = {90e9fae0625afa8641e1fc1b1646c031},
  intrahash     = {6e2f2612153f73da2d17b4c5966db3e9},
  keywords      = {encoder-decoder summarization video},
  note          = {9 pages, 7 figures},
  timestamp     = {2018-04-18T14:53:39.000+0200},
  title         = {Video Summarization with Attention-Based Encoder-Decoder Networks},
  url           = {http://arxiv.org/abs/1708.09545},
  year          = {2017},
}