From 69fc91077701337c24721d91f5632c910b5f8033 Mon Sep 17 00:00:00 2001
From: nyanmisaka <nst799610810@gmail.com>
Date: Sat, 4 Apr 2026 13:34:31 +0800
Subject: [PATCH] avfilter/scale_cuda: fix color bleeding in lanczos scaling

Prior to this, the results were not saturated to the uchar/ushort range before
being written. Lanczos kernels have negative lobes, so the filtered value can
overshoot or undershoot that range, which exposed this issue.

In addition, the results were truncated rather than rounded, which resulted
in checkerboard artifacts in solid color areas. These were noticeable when
using Lanczos with 8-bit input.

Example:
ffmpeg -init_hw_device cuda -f lavfi -i testsrc2=s=960x540,format=yuv420p \
-vf hwupload,scale_cuda=format=yuv420p:w=-2:h=720:interp_algo=lanczos \
-c:v h264_nvenc -qp:v 20 -t 1 <OUTPUT>

Fix #20784

Signed-off-by: nyanmisaka <nst799610810@gmail.com>
---
 compat/cuda/cuda_runtime.h          |  1 +
 libavfilter/cuda/vector_helpers.cuh | 23 +++++++++++++++++++++++
 libavfilter/vf_scale_cuda.cu        | 15 +++++++++------
 3 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h
index 7614233eaf..d08511f72a 100644
--- a/compat/cuda/cuda_runtime.h
+++ b/compat/cuda/cuda_runtime.h
@@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
 static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
 static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
 static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); }
+static inline __device__ float rintf(float a) { return __builtin_rintf(a); }
 
 static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); }
 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
diff --git a/libavfilter/cuda/vector_helpers.cuh b/libavfilter/cuda/vector_helpers.cuh
index 67332ef030..ca8246e5a0 100644
--- a/libavfilter/cuda/vector_helpers.cuh
+++ b/libavfilter/cuda/vector_helpers.cuh
@@ -109,4 +109,27 @@ inline __device__ float4 lerp_scalar<float4>(float4 v0, float4 v1, float t) {
     );
 }
 
+template<typename T> // generic form; the body applies float intrinsics directly to a
+inline __device__ T saturate_rintf(T a, float factor) { // a: value to clamp; factor: scale applied after clamping
+    return rintf(__saturatef(a) * factor); // clamp to [0, 1], scale by factor, then round to an integral value
+}
+
+template<> // float2 specialization: handles the vector one component at a time
+inline __device__ float2 saturate_rintf<float2>(float2 a, float factor) {
+    return make_float2(
+        saturate_rintf(a.x, factor), // each call passes a single float component
+        saturate_rintf(a.y, factor)  // the same factor is used for every component
+    );
+}
+
+template<> // float4 specialization: handles the vector one component at a time
+inline __device__ float4 saturate_rintf<float4>(float4 a, float factor) {
+    return make_float4(
+        saturate_rintf(a.x, factor), // each call passes a single float component
+        saturate_rintf(a.y, factor),
+        saturate_rintf(a.z, factor),
+        saturate_rintf(a.w, factor)  // the same factor is used for every component
+    );
+}
+
 #endif
diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index d674c0885a..ca7955344b 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -1162,12 +1162,15 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
 #define PIX(x, y) tex2D<floatT>(tex, (x), (y))
 
     return from_floatN<T, floatT>(
-        apply_coeffs<floatT>(coeffsY,
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py    ), PIX(px, py    ), PIX(px + 1, py    ), PIX(px + 2, py    )),
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
-            apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
-        ) * factor
+        saturate_rintf(
+            apply_coeffs<floatT>(coeffsY,
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py    ), PIX(px, py    ), PIX(px + 1, py    ), PIX(px + 2, py    )),
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
+                apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
+            ),
+            factor
+        )
     );
 
 #undef PIX