;******************************************************************************
;* Copyright (c) 2025 Niklas Haas
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "ops_common.asm"

SECTION_RODATA

align 16
expand16_shuf:  db  0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10, 10, 12, 12, 14, 14
expand32_shuf:  db  0,  0,  0,  0,  4,  4,  4,  4,  8,  8,  8,  8, 12, 12, 12, 12

read8_unpack2:  db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
read8_unpack3:  db  0,  3,  6,  9,  1,  4,  7, 10,  2,  5,  8, 11, -1, -1, -1, -1
read8_unpack4:  db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
read16_unpack2: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
read16_unpack3: db  0,  1,  6,  7,  2,  3,  8,  9,  4,  5, 10, 11, -1, -1, -1, -1
read16_unpack4: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
write8_pack2:   db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
write8_pack3:   db  0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11, -1, -1, -1, -1
write16_pack3:  db  0,  1,  4,  5,  8,  9,  2,  3,  6,  7, 10, 11, -1, -1, -1, -1

%define write8_pack4  read8_unpack4
%define write16_pack4 read16_unpack2
%define write16_pack2 read16_unpack4
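; Illustrative note (a sketch, not code emitted by this file): `pshufb`
; computes dst[i] = src[mask[i]], with -1 entries zeroing the output byte.
; Applying read8_unpack4 to 16 packed bytes
; { X0 Y0 Z0 W0 X1 Y1 Z1 W1 ... X3 Y3 Z3 W3 } regroups them by component:
;   movu   m0, [srcq]           ; 4 packed 4-component pixels
;   pshufb m0, [read8_unpack4]  ; { X0 X1 X2 X3 Y0..Y3 Z0..Z3 W0..W3 }
; leaving one dword per component, ready for the dword transposes further down.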
align 32
bits_shuf:    db   0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1, \
                   2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3
bits_mask:    db 128, 64, 32, 16,  8,  4,  2,  1,128, 64, 32, 16,  8,  4,  2,  1
bits_reverse: db   7,  6,  5,  4,  3,  2,  1,  0, 15, 14, 13, 12, 11, 10,  9,  8

align 32
mask1: times 32 db 0x01
mask2: times 32 db 0x03
mask3: times 32 db 0x07
mask4: times 32 db 0x0F

const1b equ mask1
const1w: times 16 dw 0x01

SECTION .text

;---------------------------------------------------------
; Global entry point. See `ops_common.asm` for info.

%macro process_fn 1 ; number of planes
cglobal sws_process%1_x86, 6, 7 + 2 * %1, 16
    ; Args:
    ;   execq, implq, bxd, yd as defined in ops_common.asm
    ;   bx_end and y_end are initially in tmp0d / tmp1d
    ;   (see SwsOpFunc signature)
    ;
    ; Stack layout:
    ;   [rsp +  0] = [qword] impl->cont (address of first kernel)
    ;   [rsp +  8] = [qword] &impl[1] (restore implq after chain)
    ;   [rsp + 16] = [dword] bx start (restore after line finish)
    ;   [rsp + 20] = [dword] bx end (loop counter limit)
    ;   [rsp + 24] = [dword] y end (loop counter limit)
    sub rsp, 32
    mov [rsp + 16], bxd
    mov [rsp + 20], tmp0d ; bx_end
    mov [rsp + 24], tmp1d ; y_end
    mov tmp0q, [implq + SwsOpImpl.cont]
    add implq, SwsOpImpl.next
    mov [rsp + 0], tmp0q
    mov [rsp + 8], implq
    movsxdifnidn bxq, bxd
    movsxdifnidn yq, yd

    ; load plane pointers
    mov in0q, [execq + SwsOpExec.in0]
    IF %1 > 1, mov in1q, [execq + SwsOpExec.in1]
    IF %1 > 2, mov in2q, [execq + SwsOpExec.in2]
    IF %1 > 3, mov in3q, [execq + SwsOpExec.in3]
    mov out0q, [execq + SwsOpExec.out0]
    IF %1 > 1, mov out1q, [execq + SwsOpExec.out1]
    IF %1 > 2, mov out2q, [execq + SwsOpExec.out2]
    IF %1 > 3, mov out3q, [execq + SwsOpExec.out3]
.loop:
    call [rsp] ; call into op chain
    mov implq, [rsp + 8]
    inc bxd
    cmp bxd, [rsp + 20]
    jne .loop

    ; end of line
    inc yd
    cmp yd, [rsp + 24]
    je .end

    ; bump addresses to point to start of next line
    add in0q, [execq + SwsOpExec.in_bump0]
    IF %1 > 1, add in1q, [execq + SwsOpExec.in_bump1]
    IF %1 > 2, add in2q, [execq + SwsOpExec.in_bump2]
    IF %1 > 3, add in3q, [execq + SwsOpExec.in_bump3]
    add out0q, [execq + SwsOpExec.out_bump0]
    IF %1 > 1, add out1q, [execq + SwsOpExec.out_bump1]
    IF %1 > 2, add out2q, [execq + SwsOpExec.out_bump2]
    IF %1 > 3, add out3q, [execq + SwsOpExec.out_bump3]
    mov bxd, [rsp + 16]

    ; conditionally apply y bump (if non-NULL)
    mov tmp0q, [execq + SwsOpExec.in_bump_y]
    test tmp0q, tmp0q
    jz .loop
    movsxd tmp0q, [tmp0q + yq * 4 - 4] ; load (signed) y bump
%if %1 > 3
    mov tmp1q, tmp0q
    imul tmp1q, [execq + SwsOpExec.in_stride3]
    add in3q, tmp1q
%endif
%if %1 > 2
    mov tmp1q, tmp0q
    imul tmp1q, [execq + SwsOpExec.in_stride2]
    add in2q, tmp1q
%endif
%if %1 > 1
    mov tmp1q, tmp0q
    imul tmp1q, [execq + SwsOpExec.in_stride1]
    add in1q, tmp1q
%endif
    imul tmp0q, [execq + SwsOpExec.in_stride0]
    add in0q, tmp0q
    jmp .loop
.end:
    add rsp, 32
    RET
%endmacro

process_fn 1
process_fn 2
process_fn 3
process_fn 4

;---------------------------------------------------------
; Packed shuffle fast-path
;
; This is a special entry point for handling a subset of operation chains
; that can be reduced down to a single `pshufb` shuffle mask. For more details
; about when this works, refer to the documentation of `ff_sws_solve_shuffle`.
;
; We specialize this function for every possible combination of pixel strides.
; For example, gray -> gray16 is classified as an "8, 16" operation because it
; takes 8 bytes and expands them out to 16 bytes in each application of the
; 128-bit shuffle mask.
;
; Since pshufb can't shuffle across lanes, we only instantiate SSE4 versions
; of shuffles that are not a clean multiple of 128 bits (e.g. rgb24 -> rgb0).
; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
; versions that can handle a larger number of bytes at once.
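; For illustration only, a plausible mask for the gray -> gray16 ("8, 16")
; case would duplicate each of the 8 input bytes into both bytes of a 16-bit
; word (the actual masks are solved at runtime; see `ff_sws_solve_shuffle`):
;   db 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7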
%macro MOVSIZE 3 ; size, dst, src
%if %1 <= 4
    movd %2, %3
%elif %1 <= 8
    movq %2, %3
%else
    movu %2, %3
%endif
%endmacro

%macro packed_shuffle 2 ; size_in, size_out
cglobal packed_shuffle%1_%2, 6, 10, 2, \
        exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
    mov srcq, [execq + SwsOpExec.in0]
    mov dstq, [execq + SwsOpExec.out0]
    mov src_strideq, [execq + SwsOpExec.in_stride0]
    mov dst_strideq, [execq + SwsOpExec.out_stride0]
    VBROADCASTI128 m1, [shuffleq]
    sub bxendd, bxd
    sub yendd, yd
    ; reuse now-unneeded regs
    %define srcidxq execq
    imul srcidxq, bxendq, -%1
%if %1 = %2
    %define dstidxq srcidxq
%else
    %define dstidxq shuffleq ; no longer needed reg
    imul dstidxq, bxendq, -%2
%endif
    sub srcq, srcidxq
    sub dstq, dstidxq
.loop:
    MOVSIZE %1, m0, [srcq + srcidxq]
    pshufb m0, m1
    MOVSIZE %2, [dstq + dstidxq], m0
    add srcidxq, %1
    IF %1 != %2, add dstidxq, %2
    jnz .loop
    add srcq, src_strideq
    add dstq, dst_strideq
    imul srcidxq, bxendq, -%1
    IF %1 != %2, imul dstidxq, bxendq, -%2
    dec yendd
    jnz .loop
    RET
%endmacro

INIT_XMM sse4
packed_shuffle  5, 15 ; 8  -> 24
packed_shuffle  4, 16 ; 8  -> 32, 16 -> 64
packed_shuffle  2, 12 ; 8  -> 48
packed_shuffle 16,  8 ; 16 -> 8
packed_shuffle 10, 15 ; 16 -> 24
packed_shuffle  8, 16 ; 16 -> 32, 32 -> 64
packed_shuffle  4, 12 ; 16 -> 48
packed_shuffle 15,  5 ; 24 -> 8
packed_shuffle 15, 15 ; 24 -> 24
packed_shuffle 12, 16 ; 24 -> 32
packed_shuffle  6, 12 ; 24 -> 48
packed_shuffle 16,  4 ; 32 -> 8,  64 -> 16
packed_shuffle 16, 12 ; 32 -> 24, 64 -> 48
packed_shuffle 16, 16 ; 32 -> 32, 64 -> 64
packed_shuffle  8, 12 ; 32 -> 48
packed_shuffle 12, 12 ; 48 -> 48

INIT_YMM avx2
packed_shuffle 32, 32

INIT_ZMM avx512
packed_shuffle 64, 64

;---------------------------------------------------------
; Planar reads / writes

%macro read_planar 1 ; elems
op read_planar%1
    movu mx, [in0q]
    IF %1 > 1, movu my, [in1q]
    IF %1 > 2, movu mz, [in2q]
    IF %1 > 3, movu mw, [in3q]
%if V2
    movu mx2, [in0q + mmsize]
    IF %1 > 1, movu my2, [in1q + mmsize]
    IF %1 > 2, movu mz2, [in2q + mmsize]
    IF %1 > 3, movu mw2, [in3q + mmsize]
%endif
    LOAD_CONT tmp0q
    add in0q, mmsize * (1 + V2)
    IF %1 > 1, add in1q, mmsize * (1 + V2)
    IF %1 > 2, add in2q, mmsize * (1 + V2)
    IF %1 > 3, add in3q, mmsize * (1 + V2)
    CONTINUE tmp0q
%endmacro

%macro write_planar 1 ; elems
op write_planar%1
    movu [out0q], mx
    IF %1 > 1, movu [out1q], my
    IF %1 > 2, movu [out2q], mz
    IF %1 > 3, movu [out3q], mw
%if V2
    movu [out0q + mmsize], mx2
    IF %1 > 1, movu [out1q + mmsize], my2
    IF %1 > 2, movu [out2q + mmsize], mz2
    IF %1 > 3, movu [out3q + mmsize], mw2
%endif
    add out0q, mmsize * (1 + V2)
    IF %1 > 1, add out1q, mmsize * (1 + V2)
    IF %1 > 2, add out2q, mmsize * (1 + V2)
    IF %1 > 3, add out3q, mmsize * (1 + V2)
    RET
%endmacro

%macro read_packed2 1 ; depth
op read%1_packed2
    movu m8, [in0q + 0*mmsize]
    movu m9, [in0q + 1*mmsize]
    IF V2, movu m10, [in0q + 2*mmsize]
    IF V2, movu m11, [in0q + 3*mmsize]
    IF %1 < 32, VBROADCASTI128 m12, [read%1_unpack2]
    LOAD_CONT tmp0q
    add in0q, mmsize * (2 + V2 * 2)
%if %1 == 32
    shufps m8, m8, q3120
    shufps m9, m9, q3120
    IF V2, shufps m10, m10, q3120
    IF V2, shufps m11, m11, q3120
%else
    pshufb m8, m12 ; { X0 Y0 | X1 Y1 }
    pshufb m9, m12 ; { X2 Y2 | X3 Y3 }
    IF V2, pshufb m10, m12
    IF V2, pshufb m11, m12
%endif
    unpcklpd mx, m8, m9 ; { X0 X2 | X1 X3 }
    unpckhpd my, m8, m9 ; { Y0 Y2 | Y1 Y3 }
    IF V2, unpcklpd mx2, m10, m11
    IF V2, unpckhpd my2, m10, m11
%if cpuflag(avx2)
    vpermq mx, mx, q3120 ; { X0 X1 | X2 X3 }
    vpermq my, my, q3120 ; { Y0 Y1 | Y2 Y3 }
    IF V2, vpermq mx2, mx2, q3120
    IF V2, vpermq my2, my2, q3120
%endif
    CONTINUE tmp0q
%endmacro
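; Note on the lane fixup above: with 256-bit registers, unpcklpd/unpckhpd
; operate within each 128-bit lane, so the X values come out as
; { X0 X2 | X1 X3 }. vpermq with q3120 selects qwords 0, 2, 1, 3 and restores
; sequential order; on SSE4 there is only one lane, hence the cpuflag(avx2)
; guard.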
%macro write_packed2 1 ; depth
op write%1_packed2
    IF %1 < 32, VBROADCASTI128 m12, [write%1_pack2]
%if cpuflag(avx2)
    vpermq mx, mx, q3120 ; { X0 X2 | X1 X3 }
    vpermq my, my, q3120 ; { Y0 Y2 | Y1 Y3 }
    IF V2, vpermq mx2, mx2, q3120
    IF V2, vpermq my2, my2, q3120
%endif
    unpcklpd m8, mx, my ; { X0 Y0 | X1 Y1 }
    unpckhpd m9, mx, my ; { X2 Y2 | X3 Y3 }
    IF V2, unpcklpd m10, mx2, my2
    IF V2, unpckhpd m11, mx2, my2
%if %1 == 32
    shufps m8, m8, q3120
    shufps m9, m9, q3120
    IF V2, shufps m10, m10, q3120
    IF V2, shufps m11, m11, q3120
%else
    pshufb m8, m12
    pshufb m9, m12
    IF V2, pshufb m10, m12
    IF V2, pshufb m11, m12
%endif
    movu [out0q + 0*mmsize], m8
    movu [out0q + 1*mmsize], m9
    IF V2, movu [out0q + 2*mmsize], m10
    IF V2, movu [out0q + 3*mmsize], m11
    add out0q, mmsize * (2 + V2 * 2)
    RET
%endmacro

; helper macro reused for both 3 and 4 component packed reads
%macro read_packed_inner 7 ; x, y, z, w, addr, num, depth
    movu xm8,  [%5 +  0 * %6]
    movu xm9,  [%5 +  4 * %6]
    movu xm10, [%5 +  8 * %6]
    movu xm11, [%5 + 12 * %6]
%if cpuflag(avx2)
    vinserti128 m8,  m8,  [%5 + 16 * %6], 1
    vinserti128 m9,  m9,  [%5 + 20 * %6], 1
    vinserti128 m10, m10, [%5 + 24 * %6], 1
    vinserti128 m11, m11, [%5 + 28 * %6], 1
%endif
%if %7 == 32
    mova %1, m8
    mova %2, m9
    mova %3, m10
    mova %4, m11
%else
    pshufb %1, m8,  m12 ; { X0 Y0 Z0 W0 | X4 Y4 Z4 W4 }
    pshufb %2, m9,  m12 ; { X1 Y1 Z1 W1 | X5 Y5 Z5 W5 }
    pshufb %3, m10, m12 ; { X2 Y2 Z2 W2 | X6 Y6 Z6 W6 }
    pshufb %4, m11, m12 ; { X3 Y3 Z3 W3 | X7 Y7 Z7 W7 }
%endif
    punpckldq m8,  %1, %2 ; { X0 X1 Y0 Y1 | X4 X5 Y4 Y5 }
    punpckldq m9,  %3, %4 ; { X2 X3 Y2 Y3 | X6 X7 Y6 Y7 }
    punpckhdq m10, %1, %2 ; { Z0 Z1 W0 W1 | Z4 Z5 W4 W5 }
    punpckhdq m11, %3, %4 ; { Z2 Z3 W2 W3 | Z6 Z7 W6 W7 }
    punpcklqdq %1, m8, m9   ; { X0 X1 X2 X3 | X4 X5 X6 X7 }
    punpckhqdq %2, m8, m9   ; { Y0 Y1 Y2 Y3 | Y4 Y5 Y6 Y7 }
    punpcklqdq %3, m10, m11 ; { Z0 Z1 Z2 Z3 | Z4 Z5 Z6 Z7 }
    IF %6 > 3, punpckhqdq %4, m10, m11 ; { W0 W1 W2 W3 | W4 W5 W6 W7 }
%endmacro

%macro read_packed 2 ; num, depth
op read%2_packed%1
    IF %2 < 32, VBROADCASTI128 m12, [read%2_unpack%1]
    LOAD_CONT tmp0q
    read_packed_inner mx, my, mz, mw, in0q, %1, %2
    IF1 V2, read_packed_inner mx2, my2, mz2, mw2, in0q + %1 * mmsize, %1, %2
    add in0q, %1 * mmsize * (1 + V2)
    CONTINUE tmp0q
%endmacro

%macro write_packed_inner 7 ; x, y, z, w, addr, num, depth
    punpckldq m8,  %1, %2 ; { X0 Y0 X1 Y1 | X4 Y4 X5 Y5 }
    punpckldq m9,  %3, %4 ; { Z0 W0 Z1 W1 | Z4 W4 Z5 W5 }
    punpckhdq m10, %1, %2 ; { X2 Y2 X3 Y3 | X6 Y6 X7 Y7 }
    punpckhdq m11, %3, %4 ; { Z2 W2 Z3 W3 | Z6 W6 Z7 W7 }
    punpcklqdq %1, m8, m9   ; { X0 Y0 Z0 W0 | X4 Y4 Z4 W4 }
    punpckhqdq %2, m8, m9   ; { X1 Y1 Z1 W1 | X5 Y5 Z5 W5 }
    punpcklqdq %3, m10, m11 ; { X2 Y2 Z2 W2 | X6 Y6 Z6 W6 }
    punpckhqdq %4, m10, m11 ; { X3 Y3 Z3 W3 | X7 Y7 Z7 W7 }
%if %7 == 32
    mova m8,  %1
    mova m9,  %2
    mova m10, %3
    mova m11, %4
%else
    pshufb m8,  %1, m12
    pshufb m9,  %2, m12
    pshufb m10, %3, m12
    pshufb m11, %4, m12
%endif
    movu [%5 +  0*%6], xm8
    movu [%5 +  4*%6], xm9
    movu [%5 +  8*%6], xm10
    movu [%5 + 12*%6], xm11
%if cpuflag(avx2)
    vextracti128 [%5 + 16*%6], m8,  1
    vextracti128 [%5 + 20*%6], m9,  1
    vextracti128 [%5 + 24*%6], m10, 1
    vextracti128 [%5 + 28*%6], m11, 1
%endif
%endmacro

%macro write_packed 2 ; num, depth
op write%2_packed%1
    IF %2 < 32, VBROADCASTI128 m12, [write%2_pack%1]
    write_packed_inner mx, my, mz, mw, out0q, %1, %2
    IF1 V2, write_packed_inner mx2, my2, mz2, mw2, out0q + %1 * mmsize, %1, %2
    add out0q, %1 * mmsize * (1 + V2)
    RET
%endmacro

%macro rw_packed 1 ; depth
    read_packed2 %1
    read_packed 3, %1
    read_packed 4, %1
    write_packed2 %1
    write_packed 3, %1
    write_packed 4, %1
%endmacro

%macro read_nibbles 0
op read_nibbles1
%if cpuflag(avx2)
    movu xmx, [in0q]
    IF V2, movu xmx2, [in0q + 16]
%else
    movq xmx, [in0q]
    IF V2, movq xmx2, [in0q + 8]
%endif
    VBROADCASTI128 m8, [mask4]
    LOAD_CONT tmp0q
    add in0q, (mmsize >> 1) * (1 + V2)
    pmovzxbw mx, xmx
    IF V2, pmovzxbw mx2, xmx2
    psllw my, mx, 8
    IF V2, psllw my2, mx2, 8
    psrlw mx, 4
    IF V2, psrlw mx2, 4
    pand my, m8
    IF V2, pand my2, m8
    por mx, my
    IF V2, por mx2, my2
    CONTINUE tmp0q
%endmacro
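; Worked example for read_nibbles1 above: an input byte 0xAB zero-extends to
; the word 0x00AB; the left-shifted copy (0xAB00) masked with 0x0F0F keeps the
; low nibble (0x0B00), the right shift keeps the high nibble (0x000A), and
; OR-ing them gives 0x0B0A, i.e. the two u8 pixels 0x0A, 0x0B in memory order
; (high nibble first).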
%macro read_bits 0
op read_bits1
%if cpuflag(avx2)
    vpbroadcastd mx, [in0q]
    IF V2, vpbroadcastd mx2, [in0q + 4]
%else
    movd mx, [in0q]
    IF V2, movd mx2, [in0q + 2]
%endif
    mova m8, [bits_shuf]
    VBROADCASTI128 m9, [bits_mask]
    VBROADCASTI128 m10, [const1b]
    LOAD_CONT tmp0q
    add in0q, (mmsize >> 3) * (1 + V2)
    pshufb mx, m8
    IF V2, pshufb mx2, m8
    pand mx, m9
    IF V2, pand mx2, m9
    pcmpeqb mx, m9
    IF V2, pcmpeqb mx2, m9
    pand mx, m10
    IF V2, pand mx2, m10
    CONTINUE tmp0q
%endmacro

; TODO: write_nibbles

%macro write_bits 0
op write_bits1
    VBROADCASTI128 m8, [bits_reverse]
    psllw mx, 7
    IF V2, psllw mx2, 7
    pshufb mx, m8
    IF V2, pshufb mx2, m8
    pmovmskb tmp0d, mx
    IF V2, pmovmskb tmp1d, mx2
    mov [out0q], tmp0d
    IF V2, mov [out0q + (mmsize >> 3)], tmp1d
    add out0q, (mmsize >> 3) * (1 + V2)
    RET
%endmacro

;--------------------------
; Pixel packing / unpacking

%macro pack_generic 3-4 0 ; x, y, z, w
op pack_%1%2%3%4
    ; pslld works for all sizes because the input should not overflow
    IF %2, pslld mx, %4+%3+%2
    IF %3, pslld my, %4+%3
    IF %4, pslld mz, %4
    IF %2, por mx, my
    IF %3, por mx, mz
    IF %4, por mx, mw
%if V2
    IF %2, pslld mx2, %4+%3+%2
    IF %3, pslld my2, %4+%3
    IF %4, pslld mz2, %4
    IF %2, por mx2, my2
    IF %3, por mx2, mz2
    IF %4, por mx2, mw2
%endif
    CONTINUE
%endmacro

%macro unpack 5-6 0 ; type, bits, x, y, z, w
op unpack_%3%4%5%6
    ; clear high bits by shifting left
    IF %6, vpsll%1 mw, mx, %2 - (%6)
    IF %5, vpsll%1 mz, mx, %2 - (%6+%5)
    IF %4, vpsll%1 my, mx, %2 - (%6+%5+%4)
    psrl%1 mx, %4+%5+%6
    IF %4, psrl%1 my, %2 - %4
    IF %5, psrl%1 mz, %2 - %5
    IF %6, psrl%1 mw, %2 - %6
%if V2
    IF %6, vpsll%1 mw2, mx2, %2 - (%6)
    IF %5, vpsll%1 mz2, mx2, %2 - (%6+%5)
    IF %4, vpsll%1 my2, mx2, %2 - (%6+%5+%4)
    psrl%1 mx2, %4+%5+%6
    IF %4, psrl%1 my2, %2 - %4
    IF %5, psrl%1 mz2, %2 - %5
    IF %6, psrl%1 mw2, %2 - %6
%endif
    CONTINUE
%endmacro

%macro unpack8 3 ; x, y, z
op unpack_%1%2%3 %+ 0
    pand mz, mx, [mask%3]
    psrld my, mx, %3
    psrld mx, %3+%2
    pand my, [mask%2]
    pand mx, [mask%1]
%if V2
    pand mz2, mx2, [mask%3]
    psrld my2, mx2, %3
    psrld mx2, %3+%2
    pand my2, [mask%2]
    pand mx2, [mask%1]
%endif
    CONTINUE
%endmacro
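; Worked example: `pack_generic 3, 3, 2` (instantiated for 8-bit RGB332-style
; layouts below) computes (x << 5) | (y << 2) | z, so x=0b101, y=0b011, z=0b10
; packs to 0b10101110. `unpack8 3, 3, 2` inverts this using the maskN
; constants: z = v & 0x03, y = (v >> 2) & 0x07, x = (v >> 5) & 0x07.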
;---------------------------------------------------------
; Generic byte order shuffle (packed swizzle, endian, etc)

%macro shuffle 0
op shuffle
    VBROADCASTI128 m8, [implq + SwsOpImpl.priv]
    LOAD_CONT tmp0q
    IF X, pshufb mx, m8
    IF Y, pshufb my, m8
    IF Z, pshufb mz, m8
    IF W, pshufb mw, m8
%if V2
    IF X, pshufb mx2, m8
    IF Y, pshufb my2, m8
    IF Z, pshufb mz2, m8
    IF W, pshufb mw2, m8
%endif
    CONTINUE tmp0q
%endmacro

;---------------------------------------------------------
; Clearing

%macro clear_alpha 3 ; idx, vreg, vreg2
op clear_alpha%1
    LOAD_CONT tmp0q
    pcmpeqb %2, %2
    IF V2, mova %3, %2
    CONTINUE tmp0q
%endmacro

%macro clear_zero 3 ; idx, vreg, vreg2
op clear_zero%1
    LOAD_CONT tmp0q
    pxor %2, %2
    IF V2, mova %3, %2
    CONTINUE tmp0q
%endmacro

%macro clear_generic 0
op clear
    LOAD_CONT tmp0q
%if cpuflag(avx2)
    IF X, vpbroadcastd mx, [implq + SwsOpImpl.priv + 0]
    IF Y, vpbroadcastd my, [implq + SwsOpImpl.priv + 4]
    IF Z, vpbroadcastd mz, [implq + SwsOpImpl.priv + 8]
    IF W, vpbroadcastd mw, [implq + SwsOpImpl.priv + 12]
%else ; !cpuflag(avx2)
    IF X, movd mx, [implq + SwsOpImpl.priv + 0]
    IF Y, movd my, [implq + SwsOpImpl.priv + 4]
    IF Z, movd mz, [implq + SwsOpImpl.priv + 8]
    IF W, movd mw, [implq + SwsOpImpl.priv + 12]
    IF X, pshufd mx, mx, 0
    IF Y, pshufd my, my, 0
    IF Z, pshufd mz, mz, 0
    IF W, pshufd mw, mw, 0
%endif
%if V2
    IF X, mova mx2, mx
    IF Y, mova my2, my
    IF Z, mova mz2, mz
    IF W, mova mw2, mw
%endif
    CONTINUE tmp0q
%endmacro

%macro clear_funcs 0
    decl_pattern 0, 0, 0, 1, clear_generic
    decl_pattern 1, 0, 0, 0, clear_generic
    decl_pattern 1, 1, 0, 0, clear_generic
    decl_pattern 0, 1, 0, 0, clear_generic
    decl_pattern 0, 1, 1, 0, clear_generic
    decl_pattern 0, 0, 1, 1, clear_generic
    decl_pattern 1, 0, 1, 0, clear_generic
    decl_pattern 0, 1, 0, 1, clear_generic
    decl_pattern 0, 1, 1, 1, clear_generic
    decl_pattern 1, 0, 1, 1, clear_generic
    decl_pattern 1, 1, 0, 1, clear_generic
%endmacro

;---------------------------------------------------------
; Swizzling and duplicating

; mA := mB, mB := mC, ... mX := mA across both halves
%macro vrotate 2-* ; A, B, C, ...
%rep %0
    %assign rot_a %1 + 4
    %assign rot_b %2 + 4
    mova m%1, m%2
    IF V2, mova m%[rot_a], m%[rot_b]
%rotate 1
%endrep
%undef rot_a
%undef rot_b
%endmacro

%macro swizzle_funcs 0
op swizzle_3012
    LOAD_CONT tmp0q
    vrotate 8, 0, 3, 2, 1
    CONTINUE tmp0q

op swizzle_3021
    LOAD_CONT tmp0q
    vrotate 8, 0, 3, 1
    CONTINUE tmp0q

op swizzle_2103
    LOAD_CONT tmp0q
    vrotate 8, 0, 2
    CONTINUE tmp0q

op swizzle_3210
    LOAD_CONT tmp0q
    vrotate 8, 0, 3
    vrotate 8, 1, 2
    CONTINUE tmp0q

op swizzle_3102
    LOAD_CONT tmp0q
    vrotate 8, 0, 3, 2
    CONTINUE tmp0q

op swizzle_3201
    LOAD_CONT tmp0q
    vrotate 8, 0, 3, 1, 2
    CONTINUE tmp0q

op swizzle_1203
    LOAD_CONT tmp0q
    vrotate 8, 0, 1, 2
    CONTINUE tmp0q

op swizzle_1023
    LOAD_CONT tmp0q
    vrotate 8, 0, 1
    CONTINUE tmp0q

op swizzle_2013
    LOAD_CONT tmp0q
    vrotate 8, 0, 2, 1
    CONTINUE tmp0q

op swizzle_2310
    LOAD_CONT tmp0q
    vrotate 8, 0, 2, 1, 3
    CONTINUE tmp0q

op swizzle_2130
    LOAD_CONT tmp0q
    vrotate 8, 0, 2, 3
    CONTINUE tmp0q

op swizzle_1230
    LOAD_CONT tmp0q
    vrotate 8, 0, 1, 2, 3
    CONTINUE tmp0q

op swizzle_1320
    LOAD_CONT tmp0q
    vrotate 8, 0, 1, 3
    CONTINUE tmp0q

op swizzle_0213
    LOAD_CONT tmp0q
    vrotate 8, 1, 2
    CONTINUE tmp0q

op swizzle_0231
    LOAD_CONT tmp0q
    vrotate 8, 1, 2, 3
    CONTINUE tmp0q

op swizzle_0312
    LOAD_CONT tmp0q
    vrotate 8, 1, 3, 2
    CONTINUE tmp0q

op swizzle_3120
    LOAD_CONT tmp0q
    vrotate 8, 0, 3
    CONTINUE tmp0q

op swizzle_0321
    LOAD_CONT tmp0q
    vrotate 8, 1, 3
    CONTINUE tmp0q

op swizzle_0003
    LOAD_CONT tmp0q
    mova my, mx
    mova mz, mx
%if V2
    mova my2, mx2
    mova mz2, mx2
%endif
    CONTINUE tmp0q

op swizzle_0001
    LOAD_CONT tmp0q
    mova mw, my
    mova mz, mx
    mova my, mx
%if V2
    mova mw2, my2
    mova mz2, mx2
    mova my2, mx2
%endif
    CONTINUE tmp0q

op swizzle_3000
    LOAD_CONT tmp0q
    mova my, mx
    mova mz, mx
    mova mx, mw
    mova mw, my
%if V2
    mova my2, mx2
    mova mz2, mx2
    mova mx2, mw2
    mova mw2, my2
%endif
    CONTINUE tmp0q

op swizzle_1000
    LOAD_CONT tmp0q
    mova mz, mx
    mova mw, mx
    mova mx, my
    mova my, mz
%if V2
    mova mz2, mx2
    mova mw2, mx2
    mova mx2, my2
    mova my2, mz2
%endif
    CONTINUE tmp0q
%endmacro
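; Example of the rotation helper: `vrotate 8, 0, 3, 2, 1` (as used by
; swizzle_3012 above) expands to m8 := m0, m0 := m3, m3 := m2, m2 := m1,
; m1 := m8, with m8 acting as scratch; under V2 the same rotation also runs
; on the second-half registers m4-m7, with m12 as scratch.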
;---------------------------------------------------------
; Pixel type conversions

%macro conv8to16 1 ; type
op %1_U8_U16
    LOAD_CONT tmp0q
%if V2
%if cpuflag(avx2)
    IF X, vextracti128 xmx2, mx, 1
    IF Y, vextracti128 xmy2, my, 1
    IF Z, vextracti128 xmz2, mz, 1
    IF W, vextracti128 xmw2, mw, 1
%else
    IF X, psrldq xmx2, mx, 8
    IF Y, psrldq xmy2, my, 8
    IF Z, psrldq xmz2, mz, 8
    IF W, psrldq xmw2, mw, 8
%endif
    IF X, pmovzxbw mx2, xmx2
    IF Y, pmovzxbw my2, xmy2
    IF Z, pmovzxbw mz2, xmz2
    IF W, pmovzxbw mw2, xmw2
%endif ; V2
    IF X, pmovzxbw mx, xmx
    IF Y, pmovzxbw my, xmy
    IF Z, pmovzxbw mz, xmz
    IF W, pmovzxbw mw, xmw
%ifidn %1, expand
    VBROADCASTI128 m8, [expand16_shuf]
%if V2
    IF X, pshufb mx2, m8
    IF Y, pshufb my2, m8
    IF Z, pshufb mz2, m8
    IF W, pshufb mw2, m8
%endif
    IF X, pshufb mx, m8
    IF Y, pshufb my, m8
    IF Z, pshufb mz, m8
    IF W, pshufb mw, m8
%endif ; expand
    CONTINUE tmp0q
%endmacro

%macro conv16to8 0
op convert_U16_U8
    LOAD_CONT tmp0q
%if V2
    ; this code technically works for the !V2 case as well, but slower
    IF X, packuswb mx, mx2
    IF Y, packuswb my, my2
    IF Z, packuswb mz, mz2
    IF W, packuswb mw, mw2
    IF X, vpermq mx, mx, q3120
    IF Y, vpermq my, my, q3120
    IF Z, vpermq mz, mz, q3120
    IF W, vpermq mw, mw, q3120
%else
    IF X, vextracti128 xm8, mx, 1
    IF Y, vextracti128 xm9, my, 1
    IF Z, vextracti128 xm10, mz, 1
    IF W, vextracti128 xm11, mw, 1
    vzeroupper
    IF X, packuswb xmx, xm8
    IF Y, packuswb xmy, xm9
    IF Z, packuswb xmz, xm10
    IF W, packuswb xmw, xm11
%endif
    CONTINUE tmp0q
%endmacro

%macro conv8to32 1 ; type
op %1_U8_U32
    LOAD_CONT tmp0q
    IF X, psrldq xmx2, xmx, 8
    IF Y, psrldq xmy2, xmy, 8
    IF Z, psrldq xmz2, xmz, 8
    IF W, psrldq xmw2, xmw, 8
    IF X, pmovzxbd mx, xmx
    IF Y, pmovzxbd my, xmy
    IF Z, pmovzxbd mz, xmz
    IF W, pmovzxbd mw, xmw
    IF X, pmovzxbd mx2, xmx2
    IF Y, pmovzxbd my2, xmy2
    IF Z, pmovzxbd mz2, xmz2
    IF W, pmovzxbd mw2, xmw2
%ifidn %1, expand
    VBROADCASTI128 m8, [expand32_shuf]
    IF X, pshufb mx, m8
    IF Y, pshufb my, m8
    IF Z, pshufb mz, m8
    IF W, pshufb mw, m8
    IF X, pshufb mx2, m8
    IF Y, pshufb my2, m8
    IF Z, pshufb mz2, m8
    IF W, pshufb mw2, m8
%endif ; expand
    CONTINUE tmp0q
%endmacro

%macro conv32to8 0
op convert_U32_U8
    LOAD_CONT tmp0q
    IF X, packusdw mx, mx2
    IF Y, packusdw my, my2
    IF Z, packusdw mz, mz2
    IF W, packusdw mw, mw2
    IF X, vextracti128 xmx2, mx, 1
    IF Y, vextracti128 xmy2, my, 1
    IF Z, vextracti128 xmz2, mz, 1
    IF W, vextracti128 xmw2, mw, 1
    vzeroupper
    IF X, packuswb xmx, xmx2
    IF Y, packuswb xmy, xmy2
    IF Z, packuswb xmz, xmz2
    IF W, packuswb xmw, xmw2
    IF X, vpshufd xmx, xmx, q3120
    IF Y, vpshufd xmy, xmy, q3120
    IF Z, vpshufd xmz, xmz, q3120
    IF W, vpshufd xmw, xmw, q3120
    CONTINUE tmp0q
%endmacro

%macro conv16to32 0
op convert_U16_U32
    LOAD_CONT tmp0q
    IF X, vextracti128 xmx2, mx, 1
    IF Y, vextracti128 xmy2, my, 1
    IF Z, vextracti128 xmz2, mz, 1
    IF W, vextracti128 xmw2, mw, 1
    IF X, pmovzxwd mx, xmx
    IF Y, pmovzxwd my, xmy
    IF Z, pmovzxwd mz, xmz
    IF W, pmovzxwd mw, xmw
    IF X, pmovzxwd mx2, xmx2
    IF Y, pmovzxwd my2, xmy2
    IF Z, pmovzxwd mz2, xmz2
    IF W, pmovzxwd mw2, xmw2
    CONTINUE tmp0q
%endmacro

%macro conv32to16 0
op convert_U32_U16
    LOAD_CONT tmp0q
    IF X, packusdw mx, mx2
    IF Y, packusdw my, my2
    IF Z, packusdw mz, mz2
    IF W, packusdw mw, mw2
    IF X, vpermq mx, mx, q3120
    IF Y, vpermq my, my, q3120
    IF Z, vpermq mz, mz, q3120
    IF W, vpermq mw, mw, q3120
    CONTINUE tmp0q
%endmacro

;---------------------------------------------------------
; Shifting and scaling

%macro lshift16 0
op lshift16
    vmovq xm8, [implq + SwsOpImpl.priv]
    LOAD_CONT tmp0q
    IF X, psllw mx, xm8
    IF Y, psllw my, xm8
    IF Z, psllw mz, xm8
    IF W, psllw mw, xm8
%if V2
    IF X, psllw mx2, xm8
    IF Y, psllw my2, xm8
    IF Z, psllw mz2, xm8
    IF W, psllw mw2, xm8
%endif
    CONTINUE tmp0q
%endmacro

%macro rshift16 0
op rshift16
    vmovq xm8, [implq + SwsOpImpl.priv]
    LOAD_CONT tmp0q
    IF X, psrlw mx, xm8
    IF Y, psrlw my, xm8
    IF Z, psrlw mz, xm8
    IF W, psrlw mw, xm8
%if V2
    IF X, psrlw mx2, xm8
    IF Y, psrlw my2, xm8
    IF Z, psrlw mz2, xm8
    IF W, psrlw mw2, xm8
%endif
    CONTINUE tmp0q
%endmacro
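; Worked example for the expand_bits kernel below: read_bits1 leaves each
; element as 0 or 1, and comparing against const1 maps 1 to an all-ones
; pattern (0xFF bytes / 0xFFFF words) while 0 stays 0, i.e. a set bit expands
; to full-range white.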
; special cases for expanding bits to full range
%macro expand_bits 2 ; bits, suffix
op expand_bits%1
    mova m8, [const1%2]
    LOAD_CONT tmp0q
    pcmpeq%2 mx, m8
    IF V2, pcmpeq%2 mx2, m8
    CONTINUE tmp0q
%endmacro

;---------------------------------------------------------
; Macro instantiations for kernel functions

%macro funcs_u8 0
    read_planar 1
    read_planar 2
    read_planar 3
    read_planar 4
    write_planar 1
    write_planar 2
    write_planar 3
    write_planar 4
    rw_packed 8
    read_nibbles
    read_bits
    write_bits
    expand_bits 8, b
    pack_generic 1, 2, 1
    pack_generic 3, 3, 2
    pack_generic 2, 3, 3
    unpack8 1, 2, 1
    unpack8 3, 3, 2
    unpack8 2, 3, 3
    clear_alpha 0, mx, mx2
    clear_alpha 1, my, my2
    clear_alpha 3, mw, mw2
    clear_zero 0, mx, mx2
    clear_zero 1, my, my2
    clear_zero 3, mw, mw2
    clear_funcs
    swizzle_funcs
    decl_common_patterns shuffle
%endmacro

%macro funcs_u16 0
    rw_packed 16
    expand_bits 16, w
    pack_generic 4, 4, 4
    pack_generic 5, 5, 5
    pack_generic 5, 6, 5
    unpack w, 16, 4, 4, 4
    unpack w, 16, 5, 5, 5
    unpack w, 16, 5, 6, 5
    decl_common_patterns conv8to16 convert
    decl_common_patterns conv8to16 expand
    decl_common_patterns conv16to8
    decl_common_patterns lshift16
    decl_common_patterns rshift16
%endmacro

INIT_XMM sse4
decl_v2 0, funcs_u8
decl_v2 1, funcs_u8

INIT_YMM avx2
decl_v2 0, funcs_u8
decl_v2 1, funcs_u8
decl_v2 0, funcs_u16
decl_v2 1, funcs_u16

INIT_YMM avx2
decl_v2 1, rw_packed 32
decl_v2 1, pack_generic 10, 10, 10, 2
decl_v2 1, pack_generic 2, 10, 10, 10
decl_v2 1, unpack d, 32, 10, 10, 10, 2
decl_v2 1, unpack d, 32, 2, 10, 10, 10
decl_common_patterns conv8to32 convert
decl_common_patterns conv8to32 expand
decl_common_patterns conv32to8
decl_common_patterns conv16to32
decl_common_patterns conv32to16