Publications

@comment{TolLacRanWORKS11: month uses the built-in nov macro so styles can format or localise it; the day of month that was recorded with it is kept in the nonstandard day field, which standard styles ignore. The ISBN lives in its own isbn field rather than in note.}

@inproceedings{TolLacRanWORKS11,
  author    = {Tolosana-Calasanz, Rafael and Lackovic, Marco and Rana, Omer and Banares, Jose and Talia, Domenico},
  title     = {Characterizing quality of resilience in scientific workflows},
  booktitle = {Proceedings of the 6th workshop on Workflows in support of large-scale science},
  pages     = {117--126},
  year      = {2011},
  month     = nov,
  day       = {14},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-1100-7},
  url       = {http://dl.acm.org/citation.cfm?id=2110511},
  keywords  = {Fault Tolerance, Weka4WS, Workflow},
  abstract  = {The enactment of scientific workflows involves the distribution of tasks to distributed resources that exist in different administrative domains. Such resources can range in granularity from a single machine to one or more clusters and file systems. The use of such distributed resources during workflow enactment can be an error prone process and may lead to faults, which can range in type, frequency of occurrence and complexity. However, the level of fault tolerance available within many existing workflow engines varies significantly, ranging from no support available (requiring the user to intervene) or re-execution of a workflow automatically when a fault is detected. Many scientific workflows have to operate over heterogeneous infrastructure in the presence of failures -- therefore dealing with such failures in a more coherent way, so that a similar set of techniques can be applied across workflow engines is an important challenge. In this paper, we extend the concept of Quality of Service (QoS) -- where particular performance constraints need to be adhered to, with the concept of Quality of Resilience (QoR) -- a metric used to assess how resilient workflow enactment is likely to be in the presence of failures. We believe such a metric can guide: (i) the formulation of a workflow -- as a user annotated Directed Acyclic Graph (DAG); (ii) subsequent enactment of it over available resources.
               We identify how QoR must be considered at different levels -- from a user, workflow enactor and resource management perspectives. We identify the architectural elements that may be included within scientific workflows to support QoR and demonstrate the use of QoR in the Weka4WS data mining workflow system.},
}