avfilter/scale_cuda: fix color bleeding in lanczos scaling

Prior to this, the results were not saturated into the uchar/ushort range before
being written. The characteristics of the Lanczos filter exposed this issue.

In addition, the results were truncated rather than rounded, which resulted
in checkerboard artifacts in solid color areas that were noticeable when
using Lanczos with 8-bit input.

Example:
ffmpeg -init_hw_device cuda -f lavfi -i testsrc2=s=960x540,format=yuv420p \
-vf hwupload,scale_cuda=format=yuv420p:w=-2:h=720:interp_algo=lanczos \
-c:v h264_nvenc -qp:v 20 -t 1 <OUTPUT>

Fix #20784

Signed-off-by: nyanmisaka <nst799610810@gmail.com>
This commit is contained in:
nyanmisaka
2026-04-04 13:34:31 +08:00
committed by Timo Rothenpieler
parent 2dff0156ba
commit 69fc910777
3 changed files with 33 additions and 6 deletions

View File

@@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
/* Device-side math wrappers: each one forwards its argument to a compiler builtin. */

/* fabs (float overload): forwards to the single-precision builtin. */
static inline __device__ float fabs(float a)
{
    return __builtin_fabsf(a);
}

/* fabs (double overload): forwards to the double-precision builtin. */
static inline __device__ double fabs(double a)
{
    return __builtin_fabs(a);
}

/* sqrtf: single-precision square root via the compiler builtin. */
static inline __device__ float sqrtf(float a)
{
    return __builtin_sqrtf(a);
}

/* rintf: round to an integral value in floating-point format via the compiler builtin. */
static inline __device__ float rintf(float a)
{
    return __builtin_rintf(a);
}

/*
 * __saturatef: forwards to the NVVM saturate builtin.
 * NOTE(review): CUDA documents __saturatef as clamping to [+0.0, 1.0] - confirm
 * that __nvvm_saturate_f matches on the toolchains in use.
 */
static inline __device__ float __saturatef(float a)
{
    return __nvvm_saturate_f(a);
}

/* __sinf: forwards to the NVVM approximate-sine builtin. */
static inline __device__ float __sinf(float a)
{
    return __nvvm_sin_approx_f(a);
}

View File

@@ -109,4 +109,27 @@ inline __device__ float4 lerp_scalar<float4>(float4 v0, float4 v1, float t) {
);
}
/**
 * Saturate a value with __saturatef(), multiply it by factor, and round the
 * product with rintf().
 *
 * This is the scalar form; float2 and float4 have explicit specializations
 * that apply it per component.
 *
 * @param a      value to saturate (passed to __saturatef(), which takes a float)
 * @param factor multiplier applied after saturation
 * @return       the rounded, scaled value, converted back to T
 */
template<typename T>
inline __device__ T saturate_rintf(T a, float factor) {
    const float saturated = __saturatef(a);
    const float scaled    = saturated * factor;
    return rintf(scaled);
}
/**
 * float2 specialization: applies the scalar saturate_rintf() to the x and y
 * components independently, using the same factor for both.
 */
template<>
inline __device__ float2 saturate_rintf<float2>(float2 a, float factor) {
    const float x = saturate_rintf(a.x, factor);
    const float y = saturate_rintf(a.y, factor);
    return make_float2(x, y);
}
/**
 * float4 specialization: applies the scalar saturate_rintf() to the x, y, z
 * and w components independently, using the same factor for all four.
 */
template<>
inline __device__ float4 saturate_rintf<float4>(float4 a, float factor) {
    const float x = saturate_rintf(a.x, factor);
    const float y = saturate_rintf(a.y, factor);
    const float z = saturate_rintf(a.z, factor);
    const float w = saturate_rintf(a.w, factor);
    return make_float4(x, y, z, w);
}
#endif

View File

@@ -1162,12 +1162,15 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
#define PIX(x, y) tex2D<floatT>(tex, (x), (y))
return from_floatN<T, floatT>(
apply_coeffs<floatT>(coeffsY,
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )),
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
) * factor
saturate_rintf(
apply_coeffs<floatT>(coeffsY,
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )),
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
),
factor
)
);
#undef PIX