The growing amount of scientific data from sensors and field observations
is posing a challenge to “data valets” responsible for managing
them in data repositories. These repositories, built on commodity
clusters, need to reliably ingest data continuously and ensure its
availability to a wide user community. Workflows provide several
benefits for modeling data-intensive science applications, and many
of these benefits can also help manage data ingest pipelines.
But using workflows is not a panacea in itself, and data valets need
to consider several issues when designing workflows that behave reliably
on fault-prone hardware while retaining the consistency of the scientific
data. In this paper, we propose workflow designs for reliable data
ingest in a distributed environment and identify workflow framework
features to support resilience. We illustrate these using the data
pipeline for the Pan-STARRS repository, one of the largest digital
surveys, which accumulates 100 TB of data annually to support 300 astronomers.
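
The abstract describes workflow designs for reliable data ingest on fault-prone commodity clusters but does not spell out code. As a purely illustrative, hedged sketch (not the authors' design; the names ingest_file, repo_dir, and so on are hypothetical), the Python fragment below shows one common pattern for such resilience: an ingest step made idempotent by staging to a temporary file and committing with an atomic rename, with bounded retries for transient faults.

    # Hypothetical sketch of a resilient, idempotent ingest step; it only
    # illustrates the general pattern discussed in the abstract, not the
    # actual Pan-STARRS pipeline.
    import os
    import shutil
    import time

    def ingest_file(src_path, repo_dir, retries=3, backoff_s=2.0):
        """Copy a data file into the repository idempotently, retrying transient failures."""
        os.makedirs(repo_dir, exist_ok=True)
        final_path = os.path.join(repo_dir, os.path.basename(src_path))
        if os.path.exists(final_path):
            # Re-running the workflow after a failure is safe: the step becomes a no-op.
            return final_path
        tmp_path = final_path + ".partial"
        for attempt in range(1, retries + 1):
            try:
                shutil.copyfile(src_path, tmp_path)   # stage under a temporary name
                os.replace(tmp_path, final_path)      # atomic rename is the commit point
                return final_path
            except OSError:
                if attempt == retries:
                    raise                             # surface the fault to the workflow engine
                time.sleep(backoff_s * attempt)       # simple linear backoff before retrying
        return final_path

The actual pipeline described in the paper additionally involves database loading and validation across a distributed repository, which this sketch does not attempt to capture.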
@inproceedings{Simmhan:escience:2009,
abstract = {The growing amount of scientific data from sensors and field observations
is posing a challenge to “data valets” responsible for managing
them in data repositories. These repositories, built on commodity
clusters, need to reliably ingest data continuously and ensure its
availability to a wide user community. Workflows provide several
benefits for modeling data-intensive science applications, and many
of these benefits can also help manage data ingest pipelines.
But using workflows is not a panacea in itself, and data valets need
to consider several issues when designing workflows that behave reliably
on fault-prone hardware while retaining the consistency of the scientific
data. In this paper, we propose workflow designs for reliable data
ingest in a distributed environment and identify workflow framework
features to support resilience. We illustrate these using the data
pipeline for the Pan-STARRS repository, one of the largest digital
surveys, which accumulates 100 TB of data annually to support 300 astronomers.},
author = {Simmhan, Yogesh and van Ingen, Catharine and Szalay, Alex and Barga, Roger and Heasley, Jim},
biburl = {https://www.bibsonomy.org/bibtex/262b66882159e59ba448e4222a065f9f3/simmhan},
booktitle = {International Conference on eScience (eScience)},
doi = {10.1109/e-Science.2009.52},
keywords = {cloud, data, escience, management, msr, panstarrs, peer reviewed, workflows},
month = {December},
note = {[CORE A]},
owner = {Simmhan},
pages = {321-328},
publisher = {IEEE},
title = {Building Reliable Data Pipelines for Managing Community Data Using
Scientific Workflows},
year = 2009
}