% Conference paper (PDP 2018, pp. 267--272) by Katti and Lilja on epidemic-protocol
% failure detection with approximate consensus. The proceedings title is kept only in
% booktitle; day and language are non-standard fields that classic styles ignore.
@inproceedings{2048d38daaa8466f9f20ff59f5a3c2ee,
title = "Efficient and Fast Approximate Consensus with Epidemic Failure Detection at Extreme Scale",
abstract = "This paper proposes a memory efficient failure detection and consensus algorithm, for fail-stop type process failures, based on epidemic protocols. It is suitable for extreme scale systems with reliable networks (no message loss) and high failure frequency. Communication time dominates the execution time at scale. The redundant failure detections and non-uniform information dissemination speed of epidemic algorithms make approximate epidemic-based consensus detection a useful way to trade communication overhead for accuracy. An approximate technique to the consensus detection is also proposed in this paper for faster consensus detection. Results show that the algorithm detects consensus correctly on failed processes with logarithmic scalability. The algorithm is tolerant to process failures both before and during the execution and the number of failures (occurring both before and during execution) have virtually no effect on the consensus detection time at scale. Comparison with similar deterministic consensus detection technique shows that the algorithm detects consensus at the same time with high probability. Further, benefits of the proposed approximate technique increase as system size increases. Compared to the non-approximate version, for a system size of $2^{18}$ processes, the communication saved is 34\% with accuracy loss of the order of $10^{-4}$ in consensus detection.",
keywords = "Approximate consensus, Communication reduction, Consensus, Failure detection",
author = "Katti, Amogh and Lilja, David J.",
year = "2018",
month = jun,
day = "6",
doi = "10.1109/PDP2018.2018.00045",
language = "English (US)",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "267--272",
editor = "Kotenko, Igor and Merelli, Ivan and Lio, Pietro",
booktitle = "Proceedings - 26th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, PDP 2018",
note = "26th Euromicro International Conference on Parallel, Distributed, and Network-Based Processing, PDP 2018 ; Conference date: 21-03-2018 Through 23-03-2018",
}