From 69fc91077701337c24721d91f5632c910b5f8033 Mon Sep 17 00:00:00 2001 From: nyanmisaka Date: Sat, 4 Apr 2026 13:34:31 +0800 Subject: [PATCH] avfilter/scale_cuda: fix color bleeding in lanczos scaling Prior to this, the results were not saturated into the uchar/ushort range before being written. The characteristics of the Lanczos filter exposed this issue. In addition, the results were truncated rather than rounded, which resulted in checkerboard artifacts in solid color areas and were noticeable when using Lanczos with 8-bit input. Example: ffmpeg -init_hw_device cuda -f lavfi -i testsrc2=s=960x540,format=yuv420p \ -vf hwupload,scale_cuda=format=yuv420p:w=-2:h=720:interp_algo=lanczos \ -c:v h264_nvenc -qp:v 20 -t 1 Fix #20784 Signed-off-by: nyanmisaka --- compat/cuda/cuda_runtime.h | 1 + libavfilter/cuda/vector_helpers.cuh | 23 +++++++++++++++++++++++ libavfilter/vf_scale_cuda.cu | 15 +++++++++------ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h index 7614233eaf..d08511f72a 100644 --- a/compat/cuda/cuda_runtime.h +++ b/compat/cuda/cuda_runtime.h @@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } static inline __device__ double fabs(double a) { return __builtin_fabs(a); } static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } +static inline __device__ float rintf(float a) { return __builtin_rintf(a); } static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } diff --git a/libavfilter/cuda/vector_helpers.cuh b/libavfilter/cuda/vector_helpers.cuh index 67332ef030..ca8246e5a0 100644 --- a/libavfilter/cuda/vector_helpers.cuh +++ b/libavfilter/cuda/vector_helpers.cuh @@ -109,4 +109,27 @@ inline __device__ float4 lerp_scalar(float4 v0, float4 v1, 
float t) { ); } +template<typename T> +inline __device__ T saturate_rintf(T a, float factor) { + return rintf(__saturatef(a) * factor); +} + +template<> +inline __device__ float2 saturate_rintf(float2 a, float factor) { + return make_float2( + saturate_rintf(a.x, factor), + saturate_rintf(a.y, factor) + ); +} + +template<> +inline __device__ float4 saturate_rintf(float4 a, float factor) { + return make_float4( + saturate_rintf(a.x, factor), + saturate_rintf(a.y, factor), + saturate_rintf(a.z, factor), + saturate_rintf(a.w, factor) + ); +} + #endif diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu index d674c0885a..ca7955344b 100644 --- a/libavfilter/vf_scale_cuda.cu +++ b/libavfilter/vf_scale_cuda.cu @@ -1162,12 +1162,15 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex, #define PIX(x, y) tex2D(tex, (x), (y)) return from_floatN( - apply_coeffs(coeffsY, - apply_coeffs(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), - apply_coeffs(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )), - apply_coeffs(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), - apply_coeffs(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) - ) * factor + saturate_rintf( + apply_coeffs(coeffsY, + apply_coeffs(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), + apply_coeffs(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )), + apply_coeffs(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), + apply_coeffs(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) + ), + factor + ) ); #undef PIX