% Conference paper in the CIARP 2013 proceedings (Lecture Notes in Computer
% Science). The citation key is kept exactly as exported so that existing
% citations of this entry keep resolving.
% NOTE(review): the export stored "PART 1" in both `number` and `edition`;
% neither field means "part of a multi-part proceedings", so the part is now
% carried in `booktitle`. The LNCS volume number is not recorded in this
% entry -- presumably 8258; confirm on the publisher page, then add `volume`.
@inproceedings{253c2ed477554d43948e0078a118f336,
  author    = {M{\'a}rques, Adri{\'a}n and Pardo, Alvaro},
  title     = {Implementation of Non Local Means Filter in {GPUs}},
  booktitle = {Progress in Pattern Recognition, Image Analysis, Computer Vision, and Applications: 18th Iberoamerican Congress, {CIARP} 2013, Proceedings, Part {I}},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  year      = 2013,
  pages     = {407--414},
  doi       = {10.1007/978-3-642-41822-8_51},
  isbn      = {978-3-642-41821-1},
  language  = {english},
  keywords  = {CUDA, GPU, Image denoising, Non-local means},
  abstract  = {In this paper, we review some alternatives to reduce the computational complexity of the Non-Local Means image filter and present a CUDA-based implementation of it for GPUs, comparing its performance on different GPUs and with respect to reference CPU implementations. Starting from a naive CUDA implementation, we describe different aspects of CUDA and the algorithm itself that can be leveraged to decrease the execution time. Our GPU implementation achieved speedups of up to 35.8x with respect to our reduced-complexity reference implementation on the CPU, and more than 700x over a plain CPU implementation.},
  note      = {18th Iberoamerican Congress on Pattern Recognition, CIARP 2013; conference dates: 20--23 November 2013},
}