diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h index 7614233eaf..d08511f72a 100644 --- a/compat/cuda/cuda_runtime.h +++ b/compat/cuda/cuda_runtime.h @@ -182,6 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } static inline __device__ double fabs(double a) { return __builtin_fabs(a); } static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } +static inline __device__ float rintf(float a) { return __builtin_rintf(a); } static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } diff --git a/libavfilter/cuda/vector_helpers.cuh b/libavfilter/cuda/vector_helpers.cuh index 67332ef030..ca8246e5a0 100644 --- a/libavfilter/cuda/vector_helpers.cuh +++ b/libavfilter/cuda/vector_helpers.cuh @@ -109,4 +109,27 @@ inline __device__ float4 lerp_scalar(float4 v0, float4 v1, float t) { ); } +template +inline __device__ T saturate_rintf(T a, float factor) { + return rintf(__saturatef(a) * factor); +} + +template<> +inline __device__ float2 saturate_rintf(float2 a, float factor) { + return make_float2( + saturate_rintf(a.x, factor), + saturate_rintf(a.y, factor) + ); +} + +template<> +inline __device__ float4 saturate_rintf(float4 a, float factor) { + return make_float4( + saturate_rintf(a.x, factor), + saturate_rintf(a.y, factor), + saturate_rintf(a.z, factor), + saturate_rintf(a.w, factor) + ); +} + #endif diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu index d674c0885a..ca7955344b 100644 --- a/libavfilter/vf_scale_cuda.cu +++ b/libavfilter/vf_scale_cuda.cu @@ -1162,12 +1162,15 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex, #define PIX(x, y) tex2D(tex, (x), (y)) return from_floatN( - apply_coeffs(coeffsY, - apply_coeffs(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), - apply_coeffs(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )), - apply_coeffs(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), - apply_coeffs(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) - ) * factor + saturate_rintf( + apply_coeffs(coeffsY, + apply_coeffs(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)), + apply_coeffs(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )), + apply_coeffs(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)), + apply_coeffs(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2)) + ), + factor + ) ); #undef PIX