Semantic event recognition based only on vision cues is a challenging problem. This problem is particularly acute when the application domain is unconstrained still images available on the Internet or in personal repositories. In recent years, it has been shown that metadata captured with pictures can provide valuable contextual cues complementary to the image content and can be used to improve classification performance. With the recent geotagging phenomenon, an important piece of metadata available with many geotagged pictures now on the World Wide Web is GPS information. In this study, we obtain satellite images corresponding to picture location data and investigate their novel use to recognize the picture-taking environment, as if through a third eye above the object. Additionally, we combine this inference with classical vision-based event detection methods and study the synergistic fusion of the two approaches. We employ both color- and structure-based visual vocabularies for characterizing ground and satellite images, respectively. Training of satellite image classifiers is done using a multiclass AdaBoost engine while the ground image classifiers are trained using SVMs. Modeling and prediction involve some of the most interesting semantic event-activity classes encountered in consumer pictures, including those that occur in residential areas, commercial areas, beaches, sports venues, and parks. The powerful fusion of the complementary views achieves significant performance improvement over the ground view baseline. With integrated GPS-capable cameras on the horizon, we believe that our line of research can revolutionize event recognition and media annotation in years to come.
%0 Conference Paper
%1 LuoYuEtAl08MM
%A Luo, Jiebo
%A Yu, Jie
%A Joshi, Dhiraj
%A Hao, Wei
%B MM '08: Proceedings of the 16th ACM International Conference on Multimedia, Vancouver, Canada
%D 2008
%K v1205 acm paper ai image recognition action location analysis
%P 1071-1080
%R 10.1145/1459359.1459574
%T Event Recognition: Viewing the World with a Third Eye
%X Semantic event recognition based only on vision cues is a challenging problem. This problem is particularly acute when the application domain is unconstrained still images available on the Internet or in personal repositories. In recent years, it has been shown that metadata captured with pictures can provide valuable contextual cues complementary to the image content and can be used to improve classification performance. With the recent geotagging phenomenon, an important piece of metadata available with many geotagged pictures now on the World Wide Web is GPS information. In this study, we obtain satellite images corresponding to picture location data and investigate their novel use to recognize the picture-taking environment, as if through a third eye above the object. Additionally, we combine this inference with classical vision-based event detection methods and study the synergistic fusion of the two approaches. We employ both color- and structure-based visual vocabularies for characterizing ground and satellite images, respectively. Training of satellite image classifiers is done using a multiclass AdaBoost engine while the ground image classifiers are trained using SVMs. Modeling and prediction involve some of the most interesting semantic event-activity classes encountered in consumer pictures, including those that occur in residential areas, commercial areas, beaches, sports venues, and parks. The powerful fusion of the complementary views achieves significant performance improvement over the ground view baseline. With integrated GPS-capable cameras on the horizon, we believe that our line of research can revolutionize event recognition and media annotation in years to come.
%@ 978-1-60558-303-7
@inproceedings{LuoYuEtAl08MM,
  abstract  = {Semantic event recognition based only on vision cues is a challenging problem. This problem is particularly acute when the application domain is unconstrained still images available on the Internet or in personal repositories. In recent years, it has been shown that metadata captured with pictures can provide valuable contextual cues complementary to the image content and can be used to improve classification performance. With the recent geotagging phenomenon, an important piece of metadata available with many geotagged pictures now on the World Wide Web is GPS information. In this study, we obtain satellite images corresponding to picture location data and investigate their novel use to recognize the picture-taking environment, as if through a third eye above the object. Additionally, we combine this inference with classical vision-based event detection methods and study the synergistic fusion of the two approaches. We employ both color- and structure-based visual vocabularies for characterizing ground and satellite images, respectively. Training of satellite image classifiers is done using a multiclass AdaBoost engine while the ground image classifiers are trained using SVMs. Modeling and prediction involve some of the most interesting semantic event-activity classes encountered in consumer pictures, including those that occur in residential areas, commercial areas, beaches, sports venues, and parks. The powerful fusion of the complementary views achieves significant performance improvement over the ground view baseline. With integrated GPS-capable cameras on the horizon, we believe that our line of research can revolutionize event recognition and media annotation in years to come.},
  added-at  = {2012-05-30T10:50:19.000+0200},
  author    = {Luo, Jiebo and Yu, Jie and Joshi, Dhiraj and Hao, Wei},
  biburl    = {https://www.bibsonomy.org/bibtex/2a402862f715c9b3c732035de54ad402b/flint63},
  booktitle = {MM '08: Proceedings of the 16th {ACM} International Conference on Multimedia, Vancouver, Canada},
  doi       = {10.1145/1459359.1459574},
  file      = {ACM Digital Library:2008/LuoYuEtAl08MM.pdf:PDF},
  groups    = {public},
  interhash = {b8abc5573299066e9f1bbaa27a298a6f},
  intrahash = {a402862f715c9b3c732035de54ad402b},
  isbn      = {978-1-60558-303-7},
  keywords  = {v1205 acm paper ai image recognition action location analysis},
  pages     = {1071--1080},
  timestamp = {2018-04-16T12:20:36.000+0200},
  title     = {Event Recognition: Viewing the World with a Third Eye},
  username  = {flint63},
  year      = {2008}
}