avfilter/scale_cuda: fix color bleeding in lanczos scaling

Prior to this, the results were not saturated into the uchar/ushort range before being written. The characteristics of the Lanczos filter exposed this issue.

In addition, the results were truncated rather than rounded, which resulted in checkerboard artifacts in solid color areas that were noticeable when using Lanczos with 8-bit input.

Example:
ffmpeg -init_hw_device cuda -f lavfi -i testsrc2=s=960x540,format=yuv420p \
-vf hwupload,scale_cuda=format=yuv420p:w=-2:h=720:interp_algo=lanczos \
-c:v h264_nvenc -qp:v 20 -t 1 <OUTPUT>

Fix #20784

Signed-off-by: nyanmisaka <nst799610810@gmail.com>
2026-04-20 21:00:41 +08:00 · 2026-04-04 13:34:31 +08:00
parent 2dff0156ba
commit 69fc910777
3 changed files with 33 additions and 6 deletions
--- a/compat/cuda/cuda_runtime.h
+++ b/compat/cuda/cuda_runtime.h
@@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
 // Device-side math wrappers: each one forwards its argument to a compiler
 // builtin or an NVVM intrinsic.
 // fabs overloads: the float version uses the single-precision builtin, the
 // double version uses the double-precision builtin.
 static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
 static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
 static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); }
 // rintf: forwards to __builtin_rintf, i.e. rounds to an integral value that is
 // still returned as a float.
 static inline __device__ float rintf(float a) { return __builtin_rintf(a); }
 // __saturatef: forwards to the NVVM saturate intrinsic (the CUDA math API
 // documents __saturatef as clamping to [0, 1]).
 static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); }
 // __sinf: forwards to the NVVM approximate-sine intrinsic.
 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
--- a/libavfilter/cuda/vector_helpers.cuh
+++ b/libavfilter/cuda/vector_helpers.cuh
@@ -109,4 +109,27 @@ inline __device__ float4 lerp_scalar<float4>(float4 v0, float4 v1, float t) {
    );
 }
 // Saturate, scale, then round: returns rintf(__saturatef(a) * factor).
 //
 // This primary template is the scalar path: `a` is handed directly to
 // __saturatef and the product to rintf, both of which the compat header
 // declares for float. float2 and float4 have explicit specializations.
 //
 //   a      - value to saturate (the CUDA math API documents __saturatef as
 //            clamping to [0, 1]).
 //   factor - multiplier applied after saturation and before rounding;
 //            presumably the maximum of the target integer range (e.g. 255 or
 //            65535) - confirm against callers.
 //
 // Saturating before the multiply keeps the product within [0, factor];
 // rounding replaces the truncation that a plain float-to-integer conversion
 // would perform.
 template<typename T>
 inline __device__ T saturate_rintf(T a, float factor) {
    return rintf(__saturatef(a) * factor);
 }
 // float2 specialization: applies the scalar saturate_rintf to the x and y
 // components independently, using the same `factor` for both, and repacks the
 // results with make_float2.
 template<>
 inline __device__ float2 saturate_rintf<float2>(float2 a, float factor) {
    return make_float2(
        saturate_rintf(a.x, factor),
        saturate_rintf(a.y, factor)
    );
 }
 // float4 specialization: applies the scalar saturate_rintf to the x, y, z and
 // w components independently, using the same `factor` for all four, and
 // repacks the results with make_float4.
 template<>
 inline __device__ float4 saturate_rintf<float4>(float4 a, float factor) {
    return make_float4(
        saturate_rintf(a.x, factor),
        saturate_rintf(a.y, factor),
        saturate_rintf(a.z, factor),
        saturate_rintf(a.w, factor)
    );
 }
 #endif
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -1162,12 +1162,15 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
 #define PIX(x, y) tex2D<floatT>(tex, (x), (y))
    return from_floatN<T, floatT>(
-        apply_coeffs<floatT>(coeffsY,
+        saturate_rintf(
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
+            apply_coeffs<floatT>(coeffsY,
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py    ), PIX(px, py    ), PIX(px + 1, py    ), PIX(px + 2, py    )),
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py    ), PIX(px, py    ), PIX(px + 1, py    ), PIX(px + 2, py    )),
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
-        ) * factor
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
            ),
            factor
        )
    );
 #undef PIX