FFmpeg/libswscale/aarch64/ops_asmgen.c
Niklas Haas 85bef2c2bc swscale/ops: split SwsConst up into op-specific structs
It was a bit clunky, lacked semantic contextual information, and made it
harder to reason about the effects of extending this struct. There should be
zero runtime overhead as a result of the fact that this is already a big
union.

I made the changes in this commit by hand, but due to the length and noise
level of the commit, I used Opus 4.6 to verify that I did not accidentally
introduce any bugs or typos.

Signed-off-by: Niklas Haas <git@haasn.dev>
2026-04-02 11:48:15 +00:00


/*
* Copyright (C) 2026 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <io.h>
#include <fcntl.h>
#endif
/**
* This file is compiled as a standalone build-time tool and must not depend
* on internal FFmpeg libraries. The necessary utils are redefined below using
* standard C equivalents.
*/
#define AVUTIL_AVASSERT_H
#define AVUTIL_LOG_H
#define AVUTIL_MACROS_H
#define AVUTIL_MEM_H
#define av_assert0(cond) assert(cond)
#define av_malloc(s) malloc(s)
#define av_mallocz(s) calloc(1, s)
#define av_realloc(p, s) realloc(p, s)
#define av_strdup(s) strdup(s)
#define av_free(p) free(p)
#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
static void av_freep(void *ptr)
{
void **pptr = (void **) ptr;
if (pptr) {
ptr = *pptr;
if (ptr)
free(ptr);
*pptr = NULL;
}
}
#include "libavutil/dynarray.h"
static void *av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size,
const uint8_t *elem_data)
{
uint8_t *tab_elem_data = NULL;
FF_DYNARRAY_ADD(INT_MAX, elem_size, *tab_ptr, *nb_ptr, {
tab_elem_data = (uint8_t *)*tab_ptr + (*nb_ptr) * elem_size;
if (elem_data)
memcpy(tab_elem_data, elem_data, elem_size);
}, {
av_freep(tab_ptr);
*nb_ptr = 0;
});
return tab_elem_data;
}
/*********************************************************************/
#include "rasm.c"
#include "rasm_print.c"
#include "ops_impl.c"
/**
* Implementation parameters for all exported functions. This list is
* compiled by performing a dummy run of all conversions in sws_ops and
* collecting all functions that need to be generated. This is achieved
* by running:
* make sws_ops_entries_aarch64
*/
static const SwsAArch64OpImplParams impl_params[] = {
#include "ops_entries.c"
{ .op = AARCH64_SWS_OP_NONE }
};
/*********************************************************************/
static size_t aarch64_pixel_size(SwsAArch64PixelType fmt)
{
switch (fmt) {
case AARCH64_PIXEL_U8: return 1;
case AARCH64_PIXEL_U16: return 2;
case AARCH64_PIXEL_U32: return 4;
case AARCH64_PIXEL_F32: return 4;
default:
av_assert0(!"Invalid pixel type!");
break;
}
return 0;
}
static void impl_func_name(char **buf, size_t *size, const SwsAArch64OpImplParams *params)
{
buf_appendf(buf, size, "ff_sws");
const ParamField **fields = op_fields[params->op];
for (int i = 0; fields[i]; i++) {
const ParamField *field = fields[i];
void *p = (void *) (((uintptr_t) params) + field->offset);
field->print_str(buf, size, p);
}
buf_appendf(buf, size, "_neon");
}
void aarch64_op_impl_func_name(char *buf, size_t size, const SwsAArch64OpImplParams *params)
{
impl_func_name(&buf, &size, params);
av_assert0(size && "string buffer exhausted");
}
/*********************************************************************/
typedef struct SwsAArch64Context {
RasmContext *rctx;
/* SwsOpFunc arguments. */
RasmOp exec;
RasmOp impl;
RasmOp bx_start;
RasmOp y_start;
RasmOp bx_end;
RasmOp y_end;
/* Loop iterator variables. */
RasmOp bx;
RasmOp y;
/* Scratch registers. */
RasmOp tmp0;
RasmOp tmp1;
/* CPS-related variables. */
RasmOp op0_func;
RasmOp op1_impl;
RasmOp cont;
/* Vector registers. Two banks (low and high) are used. */
RasmOp vl[ 4];
RasmOp vh[ 4];
RasmOp vt[12];
/* Read/Write data pointers and padding. */
RasmOp in[4];
RasmOp out[4];
RasmOp in_bump[4];
RasmOp out_bump[4];
/* Vector register dimensions. */
size_t el_size;
size_t el_count;
size_t vec_size;
bool use_vh;
} SwsAArch64Context;
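/**
* Each kernel processes one block of pixels per invocation. The low bank
* (vl) holds up to 128 bits of each component; whenever
* block_size * el_size exceeds 16 bytes, use_vh is set and the high bank
* (vh) holds the second half of the block (see asmgen_op_cps()).
*/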
/*********************************************************************/
/* Helper functions. */
/* Looping when s->use_vh is set. */
#define LOOP_VH(s, mask, idx) if (s->use_vh) LOOP(mask, idx)
#define LOOP_MASK_VH(s, p, idx) if (s->use_vh) LOOP_MASK(p, idx)
#define LOOP_MASK_BWD_VH(s, p, idx) if (s->use_vh) LOOP_MASK_BWD(p, idx)
/* Inline rasm comments. */
#define CMT(comment) rasm_annotate(r, comment)
#define CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__)
/* Reshape all vector registers for current SwsOp. */
static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int el_size)
{
s->vl[ 0] = a64op_make_vec( 0, el_count, el_size);
s->vl[ 1] = a64op_make_vec( 1, el_count, el_size);
s->vl[ 2] = a64op_make_vec( 2, el_count, el_size);
s->vl[ 3] = a64op_make_vec( 3, el_count, el_size);
s->vh[ 0] = a64op_make_vec( 4, el_count, el_size);
s->vh[ 1] = a64op_make_vec( 5, el_count, el_size);
s->vh[ 2] = a64op_make_vec( 6, el_count, el_size);
s->vh[ 3] = a64op_make_vec( 7, el_count, el_size);
s->vt[ 0] = a64op_make_vec(16, el_count, el_size);
s->vt[ 1] = a64op_make_vec(17, el_count, el_size);
s->vt[ 2] = a64op_make_vec(18, el_count, el_size);
s->vt[ 3] = a64op_make_vec(19, el_count, el_size);
s->vt[ 4] = a64op_make_vec(20, el_count, el_size);
s->vt[ 5] = a64op_make_vec(21, el_count, el_size);
s->vt[ 6] = a64op_make_vec(22, el_count, el_size);
s->vt[ 7] = a64op_make_vec(23, el_count, el_size);
s->vt[ 8] = a64op_make_vec(24, el_count, el_size);
s->vt[ 9] = a64op_make_vec(25, el_count, el_size);
s->vt[10] = a64op_make_vec(26, el_count, el_size);
s->vt[11] = a64op_make_vec(27, el_count, el_size);
}
/*********************************************************************/
/* Function frame */
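/**
* AAPCS64 requires the stack pointer to remain 16-byte aligned, so the
* frame is sized in whole pairs of 8-byte registers, e.g.
* clobbered_frame_size(3) == 32.
*/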
static unsigned clobbered_frame_size(unsigned n)
{
return ((n + 1) >> 1) * 16;
}
static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
{
RasmContext *r = s->rctx;
RasmOp sp = a64op_sp();
unsigned frame_size = clobbered_frame_size(n);
RasmOp sp_pre = a64op_pre(sp, -frame_size);
rasm_add_comment(r, "prologue");
if (n == 0) {
/* no-op */
} else if (n == 1) {
i_str(r, regs[0], sp_pre);
} else {
i_stp(r, regs[0], regs[1], sp_pre);
for (unsigned i = 2; i + 1 < n; i += 2)
i_stp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
if (n & 1)
i_str(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
}
}
static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned n)
{
RasmContext *r = s->rctx;
RasmOp sp = a64op_sp();
unsigned frame_size = clobbered_frame_size(n);
RasmOp sp_post = a64op_post(sp, frame_size);
rasm_add_comment(r, "epilogue");
if (n == 0) {
/* no-op */
} else if (n == 1) {
i_ldr(r, regs[0], sp_post);
} else {
if (n & 1)
i_ldr(r, regs[n - 1], a64op_off(sp, (n - 1) * sizeof(uint64_t)));
for (unsigned i = (n & ~1u) - 2; i >= 2; i -= 2)
i_ldp(r, regs[i], regs[i + 1], a64op_off(sp, i * sizeof(uint64_t)));
i_ldp(r, regs[0], regs[1], sp_post);
}
}
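/**
* For example, with three callee-saved registers x19-x21 to preserve,
* the pair above emits (roughly, modulo rasm's exact formatting):
*   prologue: stp x19, x20, [sp, #-32]!
*             str x21,      [sp, #16]
*   epilogue: ldr x21,      [sp, #16]
*             ldp x19, x20, [sp], #32
*/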
/*********************************************************************/
/* Callee-saved registers (r19-r28). */
#define MAX_SAVED_REGS 10
static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
RasmOp gpr)
{
const int n = a64op_gpr_n(gpr);
if (n >= 19 && n <= 28)
regs[(*count)++] = gpr;
}
static unsigned clobbered_gprs(const SwsAArch64Context *s,
const SwsAArch64OpImplParams *p,
RasmOp regs[MAX_SAVED_REGS])
{
unsigned count = 0;
LOOP_MASK(p, i) {
clobber_gpr(regs, &count, s->in[i]);
clobber_gpr(regs, &count, s->out[i]);
clobber_gpr(regs, &count, s->in_bump[i]);
clobber_gpr(regs, &count, s->out_bump[i]);
}
return count;
}
static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
char func_name[128];
char buf[64];
/**
* The process/process_return functions for aarch64 work similarly
* to the x86 backend. The description in x86/ops_common.asm largely
* applies here as well.
*/
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
rasm_func_begin(r, func_name, true, false);
/* Function prologue */
RasmOp saved_regs[MAX_SAVED_REGS];
unsigned nsaved = clobbered_gprs(s, p, saved_regs);
if (nsaved)
asmgen_prologue(s, saved_regs, nsaved);
/* Load values from impl. */
i_ldr(r, s->op0_func, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr op0_func = impl->cont;");
i_add(r, s->op1_impl, s->impl, IMM(sizeof_impl)); CMT("SwsOpImpl *op1_impl = impl + 1;");
/* Load values from exec. */
LOOP_MASK(p, i) {
rasm_annotate_nextf(r, buf, sizeof(buf), "in[%u] = exec->in[%u];", i, i);
i_ldr(r, s->in[i], a64op_off(s->exec, offsetof_exec_in + (i * sizeof(uint8_t *))));
}
LOOP_MASK(p, i) {
rasm_annotate_nextf(r, buf, sizeof(buf), "out[%u] = exec->out[%u];", i, i);
i_ldr(r, s->out[i], a64op_off(s->exec, offsetof_exec_out + (i * sizeof(uint8_t *))));
}
LOOP_MASK(p, i) {
rasm_annotate_nextf(r, buf, sizeof(buf), "in_bump[%u] = exec->in_bump[%u];", i, i);
i_ldr(r, s->in_bump[i], a64op_off(s->exec, offsetof_exec_in_bump + (i * sizeof(ptrdiff_t))));
}
LOOP_MASK(p, i) {
rasm_annotate_nextf(r, buf, sizeof(buf), "out_bump[%u] = exec->out_bump[%u];", i, i);
i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + (i * sizeof(ptrdiff_t))));
}
/* Reset x and jump to first kernel. */
i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
i_br (r, s->op0_func); CMT("jump to op0_func");
}
static void asmgen_process_return(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
char func_name[128];
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
rasm_func_begin(r, func_name, true, true);
/* Reset impl to first kernel. */
i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
/* Perform horizontal loop. */
int loop = rasm_new_label(r, NULL);
i_add(r, s->bx, s->bx, IMM(1)); CMT("bx += 1;");
i_cmp(r, s->bx, s->bx_end); CMT("if (bx != bx_end)");
i_bne(r, loop); CMT(" goto loop;");
/* Perform vertical loop. */
int end = rasm_new_label(r, NULL);
i_add(r, s->y, s->y, IMM(1)); CMT("y += 1;");
i_cmp(r, s->y, s->y_end); CMT("if (y == y_end)");
i_beq(r, end); CMT(" goto end;");
/* Perform padding and reset x, preparing for next row. */
LOOP_MASK(p, i) { i_add(r, s->in[i], s->in[i], s->in_bump[i]); CMTF("in[%u] += in_bump[%u];", i, i); }
LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); CMTF("out[%u] += out_bump[%u];", i, i); }
i_mov(r, s->bx, s->bx_start); CMT("bx = bx_start;");
/* Loop back or end of function. */
rasm_add_label(r, loop); CMT("loop:");
i_br (r, s->op0_func); CMT("jump to op0_func");
rasm_add_label(r, end); CMT("end:");
/* Function epilogue */
RasmOp saved_regs[MAX_SAVED_REGS];
unsigned nsaved = clobbered_gprs(s, p, saved_regs);
if (nsaved)
asmgen_epilogue(s, saved_regs, nsaved);
i_ret(r);
}
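/**
* Together, process and process_return above implement the two nested
* loops: process_return advances bx and re-enters the kernel chain until
* bx_end is reached, then bumps the plane pointers, resets bx and
* advances y until y_end, at which point it restores the saved registers
* and returns.
*/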
/*********************************************************************/
/* gather raw pixels from planes */
/* AARCH64_SWS_OP_READ_BIT */
/* AARCH64_SWS_OP_READ_NIBBLE */
/* AARCH64_SWS_OP_READ_PACKED */
/* AARCH64_SWS_OP_READ_PLANAR */
static void asmgen_op_read_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp bitmask_vec = s->vt[1];
RasmOp wtmp = a64op_w(s->tmp0);
AArch64VecViews vl[1];
AArch64VecViews vtmp;
AArch64VecViews shift_vec;
a64op_vec_views(s->vt[0], &shift_vec);
a64op_vec_views(s->vl[0], &vl[0]);
a64op_vec_views(s->vt[2], &vtmp);
/* Note that shift_vec has negative values, so that using it with
* ushl actually performs a right shift. */
rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
if (p->block_size == 16) {
i_ldrh(r, wtmp, a64op_post(s->in[0], 2)); CMT("uint16_t tmp = *in[0]++;");
i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 16 times>};");
i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
i_lsr (r, wtmp, wtmp, IMM(8)); CMT("tmp >>= 8;");
i_dup (r, vtmp.b8, wtmp); CMT("vtmp.lo = broadcast(tmp);");
i_ins (r, vl[0].de[1], vtmp.de[0]); CMT("vl[0].hi = vtmp.lo;");
i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
i_and (r, vl[0].b16, vl[0].b16, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
} else {
i_ldrb(r, wtmp, a64op_post(s->in[0], 1)); CMT("uint8_t tmp = *in[0]++;");
i_movi(r, bitmask_vec, IMM(1)); CMT("v128 bitmask_vec = {1 <repeats 8 times>, 0 <repeats 8 times>};");
i_dup (r, vl[0].b8, wtmp); CMT("vl[0].lo = broadcast(tmp);");
i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
i_and (r, vl[0].b8, vl[0].b8, bitmask_vec); CMT("vl[0] &= bitmask_vec;");
}
}
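/**
* Nibble reads split each input byte into two pixels, high nibble first:
* e.g. the input bytes { 0xAB, 0xCD } become { 0x0A, 0x0B, 0x0C, 0x0D }.
* The high nibbles are extracted with a right shift, the low nibbles with
* a mask, and zip1 re-interleaves them into pixel order.
*/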
static void asmgen_op_read_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp nibble_mask = v_8b(s->vt[0]);
AArch64VecViews vl[1];
AArch64VecViews vtmp;
a64op_vec_views(s->vl[0], &vl[0]);
a64op_vec_views(s->vt[1], &vtmp);
rasm_annotate_next(r, "v128 nibble_mask = {0xf <repeats 8 times>, 0x0 <repeats 8 times>};");
i_movi(r, nibble_mask, IMM(0x0f));
if (p->block_size == 8) {
i_ldr (r, vl[0].s, a64op_post(s->in[0], 4)); CMT("vl[0] = *in[0]++;");
i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
i_zip1(r, vl[0].b8, vtmp.b8, vl[0].b8); CMT("interleave");
} else {
i_ldr (r, vl[0].d, a64op_post(s->in[0], 8)); CMT("vl[0] = *in[0]++;");
i_ushr(r, vtmp.b8, vl[0].b8, IMM(4)); CMT("vtmp.lo = vl[0] >> 4;");
i_and (r, vl[0].b8, vl[0].b8, nibble_mask); CMT("vl[0].lo &= nibble_mask;");
i_zip1(r, vl[0].b16, vtmp.b16, vl[0].b16); CMT("interleave");
}
}
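/**
* For packed and planar reads (and the matching writes below), the
* switch key combines use_vh (0x100) with the per-bank vector size in
* bytes (8 or 16): a single bank is transferred with ldr/str, both banks
* at once with ldp/stp.
*/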
static void asmgen_op_read_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[1];
AArch64VecViews vh[1];
a64op_vec_views(s->vl[0], &vl[0]);
a64op_vec_views(s->vh[0], &vh[0]);
switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
case 0x008: i_ldr(r, vl[0].d, a64op_post(s->in[0], s->vec_size * 1)); break;
case 0x010: i_ldr(r, vl[0].q, a64op_post(s->in[0], s->vec_size * 1)); break;
case 0x108: i_ldp(r, vl[0].d, vh[0].d, a64op_post(s->in[0], s->vec_size * 2)); break;
case 0x110: i_ldp(r, vl[0].q, vh[0].q, a64op_post(s->in[0], s->vec_size * 2)); break;
}
}
static void asmgen_op_read_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
{
RasmContext *r = s->rctx;
switch (p->mask) {
case 0x0011: i_ld2(r, vv_2(vx[0], vx[1]), a64op_post(s->in[0], s->vec_size * 2)); break;
case 0x0111: i_ld3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->in[0], s->vec_size * 3)); break;
case 0x1111: i_ld4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->in[0], s->vec_size * 4)); break;
}
}
static void asmgen_op_read_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
if (p->mask == 0x0001) {
asmgen_op_read_packed_1(s, p);
} else {
asmgen_op_read_packed_n(s, p, s->vl);
if (s->use_vh)
asmgen_op_read_packed_n(s, p, s->vh);
}
}
static void asmgen_op_read_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[4];
AArch64VecViews vh[4];
for (int i = 0; i < 4; i++) {
a64op_vec_views(s->vl[i], &vl[i]);
a64op_vec_views(s->vh[i], &vh[i]);
}
LOOP_MASK(p, i) {
switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
case 0x008: i_ldr(r, vl[i].d, a64op_post(s->in[i], s->vec_size * 1)); break;
case 0x010: i_ldr(r, vl[i].q, a64op_post(s->in[i], s->vec_size * 1)); break;
case 0x108: i_ldp(r, vl[i].d, vh[i].d, a64op_post(s->in[i], s->vec_size * 2)); break;
case 0x110: i_ldp(r, vl[i].q, vh[i].q, a64op_post(s->in[i], s->vec_size * 2)); break;
}
}
}
/*********************************************************************/
/* write raw pixels to planes */
/* AARCH64_SWS_OP_WRITE_BIT */
/* AARCH64_SWS_OP_WRITE_NIBBLE */
/* AARCH64_SWS_OP_WRITE_PACKED */
/* AARCH64_SWS_OP_WRITE_PLANAR */
static void asmgen_op_write_bit(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[1];
AArch64VecViews shift_vec;
AArch64VecViews vtmp0;
AArch64VecViews vtmp1;
a64op_vec_views(s->vl[0], &vl[0]);
a64op_vec_views(s->vt[0], &shift_vec);
a64op_vec_views(s->vt[1], &vtmp0);
a64op_vec_views(s->vt[2], &vtmp1);
rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
if (p->block_size == 8) {
i_ushl(r, vl[0].b8, vl[0].b8, shift_vec.b8); CMT("vl[0] <<= shift_vec;");
i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
i_str (r, vtmp0.b, a64op_post(s->out[0], 1)); CMT("*out[0]++ = vtmp0;");
} else {
i_ushl(r, vl[0].b16, vl[0].b16, shift_vec.b16); CMT("vl[0] <<= shift_vec;");
i_addv(r, vtmp0.b, vl[0].b8); CMT("vtmp0[0] = add_across(vl[0].lo);");
i_ins (r, vtmp1.de[0], vl[0].de[1]); CMT("vtmp1.lo = vl[0].hi;");
i_addv(r, vtmp1.b, vtmp1.b8); CMT("vtmp1[0] = add_across(vtmp1);");
i_ins (r, vtmp0.be[1], vtmp1.be[0]); CMT("vtmp0[1] = vtmp1[0];");
i_str (r, vtmp0.h, a64op_post(s->out[0], 2)); CMT("*out[0]++ = vtmp0;");
}
}
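/**
* The inverse of asmgen_op_read_nibble(): within each 16-bit lane the
* even pixel is shifted into the high nibble and the odd pixel is ORed
* into the low nibble, then xtn narrows the lanes back to packed bytes.
*/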
static void asmgen_op_write_nibble(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[4];
AArch64VecViews vtmp0;
AArch64VecViews vtmp1;
for (int i = 0; i < 4; i++)
a64op_vec_views(s->vl[i], &vl[i]);
a64op_vec_views(s->vt[0], &vtmp0);
a64op_vec_views(s->vt[1], &vtmp1);
if (p->block_size == 8) {
i_shl (r, vtmp0.h4, vl[0].h4, IMM(4));
i_ushr(r, vtmp1.h4, vl[0].h4, IMM(8));
i_orr (r, vl[0].b8, vtmp0.b8, vtmp1.b8);
i_xtn (r, vtmp0.b8, vl[0].h8);
i_str (r, vtmp0.s, a64op_post(s->out[0], 4));
} else {
i_shl (r, vtmp0.h8, vl[0].h8, IMM(4));
i_ushr(r, vtmp1.h8, vl[0].h8, IMM(8));
i_orr (r, vl[0].b16, vtmp0.b16, vtmp1.b16);
i_xtn (r, vtmp0.b8, vl[0].h8);
i_str (r, vtmp0.d, a64op_post(s->out[0], 8));
}
}
static void asmgen_op_write_packed_1(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[1];
AArch64VecViews vh[1];
a64op_vec_views(s->vl[0], &vl[0]);
a64op_vec_views(s->vh[0], &vh[0]);
switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
case 0x008: i_str(r, vl[0].d, a64op_post(s->out[0], s->vec_size * 1)); break;
case 0x010: i_str(r, vl[0].q, a64op_post(s->out[0], s->vec_size * 1)); break;
case 0x108: i_stp(r, vl[0].d, vh[0].d, a64op_post(s->out[0], s->vec_size * 2)); break;
case 0x110: i_stp(r, vl[0].q, vh[0].q, a64op_post(s->out[0], s->vec_size * 2)); break;
}
}
static void asmgen_op_write_packed_n(SwsAArch64Context *s, const SwsAArch64OpImplParams *p, RasmOp *vx)
{
RasmContext *r = s->rctx;
switch (p->mask) {
case 0x0011: i_st2(r, vv_2(vx[0], vx[1]), a64op_post(s->out[0], s->vec_size * 2)); break;
case 0x0111: i_st3(r, vv_3(vx[0], vx[1], vx[2]), a64op_post(s->out[0], s->vec_size * 3)); break;
case 0x1111: i_st4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), a64op_post(s->out[0], s->vec_size * 4)); break;
}
}
static void asmgen_op_write_packed(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
if (p->mask == 0x0001) {
asmgen_op_write_packed_1(s, p);
} else {
asmgen_op_write_packed_n(s, p, s->vl);
if (s->use_vh)
asmgen_op_write_packed_n(s, p, s->vh);
}
}
static void asmgen_op_write_planar(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[4];
AArch64VecViews vh[4];
for (int i = 0; i < 4; i++) {
a64op_vec_views(s->vl[i], &vl[i]);
a64op_vec_views(s->vh[i], &vh[i]);
}
LOOP_MASK(p, i) {
switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
case 0x008: i_str(r, vl[i].d, a64op_post(s->out[i], s->vec_size * 1)); break;
case 0x010: i_str(r, vl[i].q, a64op_post(s->out[i], s->vec_size * 1)); break;
case 0x108: i_stp(r, vl[i].d, vh[i].d, a64op_post(s->out[i], s->vec_size * 2)); break;
case 0x110: i_stp(r, vl[i].q, vh[i].q, a64op_post(s->out[i], s->vec_size * 2)); break;
}
}
}
/*********************************************************************/
/* swap byte order (for differing endianness) */
/* AARCH64_SWS_OP_SWAP_BYTES */
static void asmgen_op_swap_bytes(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[4];
AArch64VecViews vh[4];
for (int i = 0; i < 4; i++) {
a64op_vec_views(s->vl[i], &vl[i]);
a64op_vec_views(s->vh[i], &vh[i]);
}
switch (aarch64_pixel_size(p->type)) {
case sizeof(uint16_t):
LOOP_MASK (p, i) i_rev16(r, vl[i].b16, vl[i].b16);
LOOP_MASK_VH(s, p, i) i_rev16(r, vh[i].b16, vh[i].b16);
break;
case sizeof(uint32_t):
LOOP_MASK (p, i) i_rev32(r, vl[i].b16, vl[i].b16);
LOOP_MASK_VH(s, p, i) i_rev32(r, vh[i].b16, vh[i].b16);
break;
}
}
/*********************************************************************/
/* rearrange channel order, or duplicate channels */
/* AARCH64_SWS_OP_SWIZZLE */
#define SWIZZLE_TMP 0xf
static const char *print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
{
if (n == SWIZZLE_TMP)
snprintf(buf, sizeof(char[8]), "vtmp%c", vh ? 'h' : 'l');
else
snprintf(buf, sizeof(char[8]), "v%c[%u]", vh ? 'h' : 'l', n);
return buf;
}
#define PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh)
static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
{
if (n == SWIZZLE_TMP)
return s->vt[vh];
return vh ? s->vh[n] : s->vl[n];
}
static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
{
RasmContext *r = s->rctx;
RasmOp src_op[2] = { swizzle_a64op(s, src, 0), swizzle_a64op(s, src, 1) };
RasmOp dst_op[2] = { swizzle_a64op(s, dst, 0), swizzle_a64op(s, dst, 1) };
i_mov (r, dst_op[0], src_op[0]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 0), PRINT_SWIZZLE_V(src, 0));
if (s->use_vh) {
i_mov(r, dst_op[1], src_op[1]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 1), PRINT_SWIZZLE_V(src, 1));
}
}
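/**
* The swizzle is emitted as a sequence of register moves: components
* whose current value is no longer needed as a source are copied first;
* whatever remains forms one or more cycles, which are rotated through
* the temporary. E.g. swapping components 0 and 1 cannot make progress
* in the first phase and is emitted as
*   vtmp = v[0]; v[0] = v[1]; v[1] = vtmp;
*/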
static void asmgen_op_swizzle(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
/* Compute used vectors (src and dst) */
uint8_t src_used[4] = { 0 };
bool done[4] = { true, true, true, true };
LOOP_MASK(p, dst) {
uint8_t src = MASK_GET(p->swizzle, dst);
src_used[src]++;
done[dst] = false;
}
/* First perform unobstructed copies. */
for (bool progress = true; progress; ) {
progress = false;
for (int dst = 0; dst < 4; dst++) {
if (done[dst] || src_used[dst])
continue;
uint8_t src = MASK_GET(p->swizzle, dst);
swizzle_emit(s, dst, src);
src_used[src]--;
done[dst] = true;
progress = true;
}
}
/* Then swap and rotate remaining operations. */
for (int dst = 0; dst < 4; dst++) {
if (done[dst])
continue;
swizzle_emit(s, SWIZZLE_TMP, dst);
uint8_t cur_dst = dst;
uint8_t src = MASK_GET(p->swizzle, cur_dst);
while (src != dst) {
swizzle_emit(s, cur_dst, src);
done[cur_dst] = true;
cur_dst = src;
src = MASK_GET(p->swizzle, cur_dst);
}
swizzle_emit(s, cur_dst, SWIZZLE_TMP);
done[cur_dst] = true;
}
}
#undef SWIZZLE_TMP
/*********************************************************************/
/* split tightly packed data into components */
/* AARCH64_SWS_OP_UNPACK */
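/**
* Component 0 occupies the most significant bits of the packed value.
* E.g. for a 5-6-5 packing (component widths 5, 6 and 5 bits), the
* shifts below come out as { 11, 5, 0 } and the masks as
* { 0x1f, 0x3f, 0x1f }.
*/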
static void asmgen_op_unpack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
RasmOp *vt = s->vt;
RasmOp mask_gpr = a64op_w(s->tmp0);
uint32_t mask_val[4] = { 0 };
uint8_t mask_idx[4] = { 0 };
uint8_t cur_vt = 0;
const int offsets[4] = {
MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
MASK_GET(p->pack, 3),
0
};
/* Generate masks. */
rasm_add_comment(r, "generate masks");
LOOP_MASK(p, i) {
uint32_t val = (1u << MASK_GET(p->pack, i)) - 1;
for (int j = 0; j < 4; j++) {
if (mask_val[j] == val) {
mask_val[i] = mask_val[j];
mask_idx[i] = mask_idx[j];
break;
}
}
if (!mask_val[i]) {
/**
* All-one values in movi only work up to 8-bit, and then
* at full 16- or 32-bit, but not for intermediate values
* like 10-bit. In those cases, we use mov + dup instead.
*/
if (val <= 0xff || val == 0xffff) {
i_movi(r, vt[cur_vt], IMM(val));
} else {
i_mov (r, mask_gpr, IMM(val));
i_dup (r, vt[cur_vt], mask_gpr);
}
mask_val[i] = val;
mask_idx[i] = cur_vt++;
}
}
/* Loop backwards to avoid clobbering component 0. */
LOOP_MASK_BWD (p, i) {
if (offsets[i]) {
i_ushr (r, vl[i], vl[0], IMM(offsets[i])); CMTF("vl[%u] >>= %u;", i, offsets[i]);
} else if (i) {
i_mov16b(r, vl[i], vl[0]); CMTF("vl[%u] = vl[0];", i);
}
}
LOOP_MASK_BWD_VH(s, p, i) {
if (offsets[i]) {
i_ushr (r, vh[i], vh[0], IMM(offsets[i])); CMTF("vh[%u] >>= %u;", i, offsets[i]);
} else if (i) {
i_mov16b(r, vh[i], vh[0]); CMTF("vh[%u] = vh[0];", i);
}
}
/* Apply masks. */
reshape_all_vectors(s, 16, 1);
LOOP_MASK_BWD (p, i) { i_and(r, vl[i], vl[i], vt[mask_idx[i]]); CMTF("vl[%u] &= 0x%x;", i, mask_val[i]); }
LOOP_MASK_BWD_VH(s, p, i) { i_and(r, vh[i], vh[i], vt[mask_idx[i]]); CMTF("vh[%u] &= 0x%x;", i, mask_val[i]); }
}
/*********************************************************************/
/* compress components into tightly packed data */
/* AARCH64_SWS_OP_PACK */
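/* The inverse of UNPACK: shift each component up to its field offset and
* OR all components together into component 0. */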
static void asmgen_op_pack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
const int offsets[4] = {
MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
MASK_GET(p->pack, 3),
0
};
uint16_t offset_mask = 0;
LOOP_MASK(p, i) {
if (offsets[i])
MASK_SET(offset_mask, i, 1);
}
/* Perform left shift. */
LOOP (offset_mask, i) { i_shl(r, vl[i], vl[i], IMM(offsets[i])); CMTF("vl[%u] <<= %u;", i, offsets[i]); }
LOOP_VH(s, offset_mask, i) { i_shl(r, vh[i], vh[i], IMM(offsets[i])); CMTF("vh[%u] <<= %u;", i, offsets[i]); }
/* Combine components. */
reshape_all_vectors(s, 16, 1);
LOOP_MASK (p, i) {
if (i != 0) {
i_orr (r, vl[0], vl[0], vl[i]); CMTF("vl[0] |= vl[%u];", i);
if (s->use_vh) {
i_orr(r, vh[0], vh[0], vh[i]); CMTF("vh[0] |= vh[%u];", i);
}
}
}
}
/*********************************************************************/
/* logical left shift of raw pixel values */
/* AARCH64_SWS_OP_LSHIFT */
static void asmgen_op_lshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
LOOP_MASK (p, i) { i_shl(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] <<= %u;", i, p->shift); }
LOOP_MASK_VH(s, p, i) { i_shl(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] <<= %u;", i, p->shift); }
}
/*********************************************************************/
/* right shift of raw pixel values */
/* AARCH64_SWS_OP_RSHIFT */
static void asmgen_op_rshift(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
LOOP_MASK (p, i) { i_ushr(r, vl[i], vl[i], IMM(p->shift)); CMTF("vl[%u] >>= %u;", i, p->shift); }
LOOP_MASK_VH(s, p, i) { i_ushr(r, vh[i], vh[i], IMM(p->shift)); CMTF("vh[%u] >>= %u;", i, p->shift); }
}
/*********************************************************************/
/* clear pixel values */
/* AARCH64_SWS_OP_CLEAR */
static void asmgen_op_clear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
RasmOp clear_vec = s->vt[0];
/**
* TODO
* - pack elements in impl->priv and perform smaller loads
* - if only 1 element and not vh, load directly with ld1r
*/
i_ldr(r, v_q(clear_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 clear_vec = impl->priv.v128;");
LOOP_MASK (p, i) { i_dup(r, vl[i], a64op_elem(clear_vec, i)); CMTF("vl[%u] = broadcast(clear_vec[%u])", i, i); }
LOOP_MASK_VH(s, p, i) { i_dup(r, vh[i], a64op_elem(clear_vec, i)); CMTF("vh[%u] = broadcast(clear_vec[%u])", i, i); }
}
/*********************************************************************/
/* convert (cast) between formats */
/* AARCH64_SWS_OP_CONVERT */
static void asmgen_op_convert(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
AArch64VecViews vl[4];
AArch64VecViews vh[4];
/**
* Since each instruction in the convert operation needs specific
* element types, it is simpler to use arrangement specifiers for
* each operand instead of reshaping all vectors.
*/
for (int i = 0; i < 4; i++) {
a64op_vec_views(s->vl[i], &vl[i]);
a64op_vec_views(s->vh[i], &vh[i]);
}
size_t src_el_size = s->el_size;
size_t dst_el_size = aarch64_pixel_size(p->to_type);
/**
* This function assumes block_size is either 8 or 16, and that
* we always use as many vector registers as possible.
* Therefore, u32 always uses the high vector bank.
*/
if (p->type == AARCH64_PIXEL_F32) {
rasm_add_comment(r, "f32 -> u32");
LOOP_MASK(p, i) i_fcvtzu(r, vl[i].s4, vl[i].s4);
LOOP_MASK(p, i) i_fcvtzu(r, vh[i].s4, vh[i].s4);
}
if (p->block_size == 8) {
if (src_el_size == 1 && dst_el_size > src_el_size) {
rasm_add_comment(r, "u8 -> u16");
LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
src_el_size = 2;
} else if (src_el_size == 4 && dst_el_size < src_el_size) {
rasm_add_comment(r, "u32 -> u16");
LOOP_MASK(p, i) i_xtn (r, vl[i].h4, vl[i].s4);
LOOP_MASK(p, i) i_xtn (r, vh[i].h4, vh[i].s4);
LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
src_el_size = 2;
}
if (src_el_size == 2 && dst_el_size == 4) {
rasm_add_comment(r, "u16 -> u32");
LOOP_MASK(p, i) i_uxtl2(r, vh[i].s4, vl[i].h8);
LOOP_MASK(p, i) i_uxtl (r, vl[i].s4, vl[i].h4);
src_el_size = 4;
} else if (src_el_size == 2 && dst_el_size == 1) {
rasm_add_comment(r, "u16 -> u8");
LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
src_el_size = 1;
}
} else /* if (p->block_size == 16) */ {
if (src_el_size == 1 && dst_el_size == 2) {
rasm_add_comment(r, "u8 -> u16");
LOOP_MASK(p, i) i_uxtl2(r, vh[i].h8, vl[i].b16);
LOOP_MASK(p, i) i_uxtl (r, vl[i].h8, vl[i].b8);
} else if (src_el_size == 2 && dst_el_size == 1) {
rasm_add_comment(r, "u16 -> u8");
LOOP_MASK(p, i) i_xtn (r, vl[i].b8, vl[i].h8);
LOOP_MASK(p, i) i_xtn (r, vh[i].b8, vh[i].h8);
LOOP_MASK(p, i) i_ins (r, vl[i].de[1], vh[i].de[0]);
}
}
/* See comment above for high vector bank usage for u32. */
if (p->to_type == AARCH64_PIXEL_F32) {
rasm_add_comment(r, "u32 -> f32");
LOOP_MASK(p, i) i_ucvtf(r, vl[i].s4, vl[i].s4);
LOOP_MASK(p, i) i_ucvtf(r, vh[i].s4, vh[i].s4);
}
}
/*********************************************************************/
/* expand integers to the full range */
/* AARCH64_SWS_OP_EXPAND */
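/**
* Range expansion duplicates each element next to itself with zip1/zip2
* and reinterprets the result at twice the element width, which
* replicates the bit pattern: e.g. the u8 value 0xAB becomes the u16
* value 0xABAB, so 0xFF expands to the full-range 0xFFFF.
*/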
static void asmgen_op_expand(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
size_t src_el_size = s->el_size;
size_t dst_el_size = aarch64_pixel_size(p->to_type);
size_t dst_total_size = p->block_size * dst_el_size;
size_t dst_vec_size = FFMIN(dst_total_size, 16);
if (!s->use_vh)
s->use_vh = (dst_vec_size != dst_total_size);
if (src_el_size == 1) {
rasm_add_comment(r, "u8 -> u16");
reshape_all_vectors(s, 16, 1);
LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
}
if (dst_el_size == 4) {
rasm_add_comment(r, "u16 -> u32");
reshape_all_vectors(s, 8, 2);
LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
LOOP_MASK (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
}
}
/*********************************************************************/
/* numeric minimum */
/* AARCH64_SWS_OP_MIN */
static void asmgen_op_min(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
RasmOp *vt = s->vt;
RasmOp min_vec = s->vt[4];
i_ldr(r, v_q(min_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 min_vec = impl->priv.v128;");
LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(min_vec, i)); CMTF("v128 vmin%u = min_vec[%u];", i, i); }
if (p->type == AARCH64_PIXEL_F32) {
LOOP_MASK (p, i) { i_fmin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
LOOP_MASK_VH(s, p, i) { i_fmin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
} else {
LOOP_MASK (p, i) { i_umin(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
LOOP_MASK_VH(s, p, i) { i_umin(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
}
}
/*********************************************************************/
/* numeric maximum */
/* AARCH64_SWS_OP_MAX */
static void asmgen_op_max(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
RasmOp *vt = s->vt;
RasmOp max_vec = s->vt[4];
i_ldr(r, v_q(max_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 max_vec = impl->priv.v128;");
LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(max_vec, i)); CMTF("v128 vmax%u = max_vec[%u];", i, i); }
if (p->type == AARCH64_PIXEL_F32) {
LOOP_MASK (p, i) { i_fmax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
LOOP_MASK_VH(s, p, i) { i_fmax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
} else {
LOOP_MASK (p, i) { i_umax(r, vl[i], vl[i], vt[i]); CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
LOOP_MASK_VH(s, p, i) { i_umax(r, vh[i], vh[i], vt[i]); CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
}
}
/*********************************************************************/
/* multiplication by scalar */
/* AARCH64_SWS_OP_SCALE */
static void asmgen_op_scale(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
RasmOp priv_ptr = s->tmp0;
RasmOp scale_vec = s->vt[0];
i_add (r, priv_ptr, s->impl, IMM(offsetof_impl_priv)); CMT("v128 *scale_vec_ptr = &impl->priv;");
i_ld1r(r, vv_1(scale_vec), a64op_base(priv_ptr)); CMT("v128 scale_vec = broadcast(*scale_vec_ptr);");
if (p->type == AARCH64_PIXEL_F32) {
LOOP_MASK (p, i) { i_fmul(r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
LOOP_MASK_VH(s, p, i) { i_fmul(r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
} else {
LOOP_MASK (p, i) { i_mul (r, vl[i], vl[i], scale_vec); CMTF("vl[%u] *= scale_vec;", i); }
LOOP_MASK_VH(s, p, i) { i_mul (r, vh[i], vh[i], scale_vec); CMTF("vh[%u] *= scale_vec;", i); }
}
}
/*********************************************************************/
/* generalized linear affine transform */
/* AARCH64_SWS_OP_LINEAR */
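/**
* For every output component i in the mask, the linear op computes
*   v[i] = sum_j (coeff[i][j] * v[j]) + offset[i]
* over the active inputs j; the per-row coefficient/offset layout is
* described by p->linear.mask and set up in aarch64_setup_linear().
*/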
/**
* Performs one pass of the linear transform over a single vector bank
* (low or high).
*/
static void linear_pass(SwsAArch64Context *s, const SwsAArch64OpImplParams *p,
RasmOp *vt, RasmOp *vc,
int save_mask, bool vh_pass)
{
RasmContext *r = s->rctx;
/**
* The intermediate registers for fmul+fadd (for when SWS_BITEXACT
* is set) start from temp vector 4.
*/
RasmOp *vtmp = &vt[4];
RasmOp *vx = vh_pass ? s->vh : s->vl;
char cvh = vh_pass ? 'h' : 'l';
if (vh_pass && !s->use_vh)
return;
/**
* Save rows that need to be used as input after they have been already
* written to.
*/
RasmOp src_vx[4] = { vx[0], vx[1], vx[2], vx[3] };
if (save_mask) {
for (int i = 0; i < 4; i++) {
if (MASK_GET(save_mask, i)) {
src_vx[i] = vt[i];
i_mov16b(r, vt[i], vx[i]); CMTF("vsrc[%u] = v%c[%u];", i, cvh, i);
}
}
}
/**
* The non-zero coefficients have been packed in aarch64_setup_linear()
* in sequential order into the individual lanes of the coefficient
* vector registers. We must follow the same order of execution here.
*/
int i_coeff = 0;
LOOP_MASK(p, i) {
bool first = true;
RasmNode *pre_mul = rasm_get_current_node(r);
for (int j = 0; j < 5; j++) {
if (!LINEAR_MASK_GET(p->linear.mask, i, j))
continue;
bool is_offset = linear_index_is_offset(j);
int src_j = linear_index_to_vx(j);
RasmOp vsrc = src_vx[src_j];
uint8_t vc_i = i_coeff / 4;
uint8_t vc_j = i_coeff & 3;
RasmOp vcoeff = a64op_elem(vc[vc_i], vc_j);
i_coeff++;
if (first && is_offset) {
i_dup (r, vx[i], vcoeff); CMTF("v%c[%u] = broadcast(vc[%u][%u]);", cvh, i, vc_i, vc_j);
} else if (first && !is_offset) {
if (LINEAR_MASK_GET(p->linear.mask, i, j) == LINEAR_MASK_1) {
i_mov16b(r, vx[i], vsrc); CMTF("v%c[%u] = vsrc[%u];", cvh, i, src_j);
} else {
i_fmul (r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] = vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
}
} else if (!p->linear.fmla) {
/**
* Split the multiply-accumulate into fmul+fadd. All
* multiplications are performed first into temporary
* registers, and only then added to the destination,
* to reduce the dependency chain.
* There is no need to perform multiplications by 1.
*/
if (LINEAR_MASK_GET(p->linear.mask, i, j) != LINEAR_MASK_1) {
pre_mul = rasm_set_current_node(r, pre_mul);
i_fmul(r, vtmp[vc_j], vsrc, vcoeff); CMTF("vtmp[%u] = vsrc[%u] * vc[%u][%u];", vc_j, src_j, vc_i, vc_j);
pre_mul = rasm_set_current_node(r, pre_mul);
i_fadd(r, vx[i], vx[i], vtmp[vc_j]); CMTF("v%c[%u] += vtmp[%u];", cvh, i, vc_j);
} else {
i_fadd(r, vx[i], vx[i], vsrc); CMTF("v%c[%u] += vsrc[%u];", cvh, i, src_j);
}
} else {
/**
* Most modern aarch64 cores have a fastpath for sequences
* of fmla instructions. This means that even if the coefficient
* is 1, it is still faster to use fmla by 1 instead of fadd.
*/
i_fmla(r, vx[i], vsrc, vcoeff); CMTF("v%c[%u] += vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
}
first = false;
}
}
}
static void asmgen_op_linear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vt = s->vt;
RasmOp *vc = &vt[8]; /* The coefficients are loaded starting from temp vector 8 */
RasmOp ptr = s->tmp0;
RasmOp coeff_veclist;
/* Preload coefficients from impl->priv. */
const int num_vregs = linear_num_vregs(p);
av_assert0(num_vregs <= 4);
switch (num_vregs) {
case 1: coeff_veclist = vv_1(vc[0]); break;
case 2: coeff_veclist = vv_2(vc[0], vc[1]); break;
case 3: coeff_veclist = vv_3(vc[0], vc[1], vc[2]); break;
case 4: coeff_veclist = vv_4(vc[0], vc[1], vc[2], vc[3]); break;
}
i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 *vcoeff_ptr = impl->priv.ptr;");
i_ld1(r, coeff_veclist, a64op_base(ptr)); CMT("coeff_veclist = *vcoeff_ptr;");
/* Compute mask for rows that must be saved before being overwritten. */
uint16_t save_mask = 0;
bool overwritten[4] = { false, false, false, false };
LOOP_MASK(p, i) {
for (int j = 0; j < 5; j++) {
if (!LINEAR_MASK_GET(p->linear.mask, i, j))
continue;
bool is_offset = linear_index_is_offset(j);
int src_j = linear_index_to_vx(j);
if (!is_offset && overwritten[src_j])
MASK_SET(save_mask, j - 1, 1);
overwritten[i] = true;
}
}
/* Perform linear passes for low and high vector banks. */
linear_pass(s, p, vt, vc, save_mask, false);
linear_pass(s, p, vt, vc, save_mask, true);
}
/*********************************************************************/
/* add dithering noise */
/* AARCH64_SWS_OP_DITHER */
static void asmgen_op_dither(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
RasmOp *vl = s->vl;
RasmOp *vh = s->vh;
RasmOp ptr = s->tmp0;
RasmOp tmp1 = s->tmp1;
RasmOp wtmp1 = a64op_w(tmp1);
RasmOp dither_vl = s->vt[0];
RasmOp dither_vh = s->vt[1];
RasmOp bx64 = a64op_x(s->bx);
RasmOp y64 = a64op_x(s->y);
/**
* For a description of the matrix buffer layout, read the comments
* in aarch64_setup_dither() in aarch64/ops.c.
*/
/**
* Sort components by y_offset value so that we can start dithering
* with the smallest value, and increment the pointer upwards for
* each new offset. The dither matrix is over-allocated and may be
* over-read at the top, but it cannot be over-read before the start
* of the buffer. Since we only mask the y offset once, this would
* be an issue if we tried to subtract a value larger than the
* initial y_offset.
*/
int sorted[4];
int n_comps = 0;
/* Very cheap bucket sort. */
int max_offset = 0;
LOOP_MASK(p, i)
max_offset = FFMAX(max_offset, MASK_GET(p->dither.y_offset, i));
for (int y_off = 0; y_off <= max_offset; y_off++) {
LOOP_MASK(p, i) {
if (MASK_GET(p->dither.y_offset, i) == y_off)
sorted[n_comps++] = i;
}
}
i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv)); CMT("void *ptr = impl->priv.ptr;");
/**
* We use ubfiz to mask and shift left in one single instruction:
* ubfiz <Wd>, <Wn>, #<lsb>, #<width>
* Wd = (Wn & ((1 << width) - 1)) << lsb;
*
* Given:
* block_size = 8, log2(block_size) = 3
* dither_size = 16, log2(dither_size) = 4, dither_mask = 0b1111
* sizeof(float) = 4, log2(sizeof(float)) = 2
*
* Suppose we have bx = 0bvvvv. To get x, we left shift by
* log2(block_size) and end up with 0bvvvv000. Then we mask against
* dither_mask, and end up with 0bv000. Finally we multiply by
* sizeof(float), which is the same as shifting left by
* log2(sizeof(float)). The result is 0bv00000.
*
* Therefore:
* width = log2(dither_size) - log2(block_size)
* lsb = log2(block_size) + log2(sizeof(float))
*/
const int block_size_log2 = (p->block_size == 16) ? 4 : 3;
const int dither_size_log2 = p->dither.size_log2;
const int sizeof_float_log2 = 2;
if (dither_size_log2 != block_size_log2) {
RasmOp lsb = IMM(block_size_log2 + sizeof_float_log2);
RasmOp width = IMM(dither_size_log2 - block_size_log2);
i_ubfiz(r, tmp1, bx64, lsb, width); CMT("tmp1 = (bx & ((dither_size / block_size) - 1)) * block_size * sizeof(float);");
i_add (r, ptr, ptr, tmp1); CMT("ptr += tmp1;");
}
int last_y_off = -1;
int prev_i = 0;
for (int sorted_i = 0; sorted_i < n_comps; sorted_i++) {
int i = sorted[sorted_i];
uint8_t y_off = MASK_GET(p->dither.y_offset, i);
bool do_load = (y_off != last_y_off);
if (last_y_off < 0) {
/* On the first run, calculate pointer inside dither_matrix. */
RasmOp lsb = IMM(dither_size_log2 + sizeof_float_log2);
RasmOp width = IMM(dither_size_log2);
/**
* The ubfiz instruction for the y offset performs masking
* by the dither matrix size and shifts by the stride.
*/
if (y_off == 0) {
i_ubfiz(r, tmp1, y64, lsb, width); CMT("tmp1 = (y & (dither_size - 1)) * dither_size * sizeof(float);");
} else {
i_add (r, wtmp1, s->y, IMM(y_off)); CMTF("tmp1 = y + y_off[%u];", i);
i_ubfiz(r, tmp1, tmp1, lsb, width); CMT("tmp1 = (tmp1 & (dither_size - 1)) * dither_size * sizeof(float);");
}
i_add(r, ptr, ptr, tmp1); CMT("ptr += tmp1;");
} else if (do_load) {
/**
* On subsequent runs, just increment the pointer.
* The matrix is over-allocated, so we don't risk
* overreading.
*/
int delta = (y_off - last_y_off) * (1 << dither_size_log2) * sizeof(float);
i_add(r, ptr, ptr, IMM(delta)); CMTF("ptr += (y_off[%u] - y_off[%u]) * dither_size * sizeof(float);", i, prev_i);
}
if (do_load) {
RasmOp dither_vlq = v_q(dither_vl);
RasmOp dither_vhq = v_q(dither_vh);
i_ldp (r, dither_vlq, dither_vhq, a64op_base(ptr)); CMT("{ ditherl, ditherh } = *ptr;");
}
i_fadd (r, vl[i], vl[i], dither_vl); CMTF("vl[%u] += vditherl;", i);
if (s->use_vh) {
i_fadd(r, vh[i], vh[i], dither_vh); CMTF("vh[%u] += vditherh;", i);
}
last_y_off = y_off;
prev_i = i;
}
}
/*********************************************************************/
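/**
* Every kernel other than process/process_return is emitted with the
* same shell: load the continuation pointer, emit the op body, advance
* impl to the next SwsOpImpl and tail-branch to the continuation. The
* whole op chain therefore executes as straight-line branches with no
* call/return overhead.
*/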
static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
RasmContext *r = s->rctx;
char func_name[128];
aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
rasm_func_begin(r, func_name, true, true);
/**
* Set up vector register dimensions and reshape all vectors
* accordingly.
*/
size_t el_size = aarch64_pixel_size(p->type);
size_t total_size = p->block_size * el_size;
s->vec_size = FFMIN(total_size, 16);
s->use_vh = (s->vec_size != total_size);
s->el_size = el_size;
s->el_count = s->vec_size / el_size;
reshape_all_vectors(s, s->el_count, el_size);
/* Common start for continuation-passing style (CPS) functions. */
i_ldr(r, s->cont, a64op_off(s->impl, offsetof_impl_cont)); CMT("SwsFuncPtr cont = impl->cont;");
switch (p->op) {
case AARCH64_SWS_OP_READ_BIT: asmgen_op_read_bit(s, p); break;
case AARCH64_SWS_OP_READ_NIBBLE: asmgen_op_read_nibble(s, p); break;
case AARCH64_SWS_OP_READ_PACKED: asmgen_op_read_packed(s, p); break;
case AARCH64_SWS_OP_READ_PLANAR: asmgen_op_read_planar(s, p); break;
case AARCH64_SWS_OP_WRITE_BIT: asmgen_op_write_bit(s, p); break;
case AARCH64_SWS_OP_WRITE_NIBBLE: asmgen_op_write_nibble(s, p); break;
case AARCH64_SWS_OP_WRITE_PACKED: asmgen_op_write_packed(s, p); break;
case AARCH64_SWS_OP_WRITE_PLANAR: asmgen_op_write_planar(s, p); break;
case AARCH64_SWS_OP_SWAP_BYTES: asmgen_op_swap_bytes(s, p); break;
case AARCH64_SWS_OP_SWIZZLE: asmgen_op_swizzle(s, p); break;
case AARCH64_SWS_OP_UNPACK: asmgen_op_unpack(s, p); break;
case AARCH64_SWS_OP_PACK: asmgen_op_pack(s, p); break;
case AARCH64_SWS_OP_LSHIFT: asmgen_op_lshift(s, p); break;
case AARCH64_SWS_OP_RSHIFT: asmgen_op_rshift(s, p); break;
case AARCH64_SWS_OP_CLEAR: asmgen_op_clear(s, p); break;
case AARCH64_SWS_OP_CONVERT: asmgen_op_convert(s, p); break;
case AARCH64_SWS_OP_EXPAND: asmgen_op_expand(s, p); break;
case AARCH64_SWS_OP_MIN: asmgen_op_min(s, p); break;
case AARCH64_SWS_OP_MAX: asmgen_op_max(s, p); break;
case AARCH64_SWS_OP_SCALE: asmgen_op_scale(s, p); break;
case AARCH64_SWS_OP_LINEAR: asmgen_op_linear(s, p); break;
case AARCH64_SWS_OP_DITHER: asmgen_op_dither(s, p); break;
/* TODO implement AARCH64_SWS_OP_SHUFFLE */
default:
break;
}
/* Common end for CPS functions. */
i_add(r, s->impl, s->impl, IMM(sizeof_impl)); CMT("impl += 1;");
i_br (r, s->cont); CMT("jump to cont");
}
static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
switch (p->op) {
case AARCH64_SWS_OP_PROCESS:
asmgen_process(s, p);
break;
case AARCH64_SWS_OP_PROCESS_RETURN:
asmgen_process_return(s, p);
break;
default:
asmgen_op_cps(s, p);
break;
}
}
/*********************************************************************/
static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params,
const SwsAArch64OpImplParams *prev, const char *p_str)
{
int first_diff = 0;
int prev_levels = 0;
int levels = 0;
/* Compute number of current levels. */
if (params) {
const ParamField **fields = op_fields[params->op];
while (fields[levels])
levels++;
}
/* Compute number of previous levels. */
if (prev) {
const ParamField **prev_fields = op_fields[prev->op];
while (prev_fields[prev_levels])
prev_levels++;
}
/* Walk up and check the conditions that match. */
if (params && prev) {
const ParamField **fields = op_fields[params->op];
first_diff = -1;
for (int i = 0; fields[i]; i++) {
const ParamField *field = fields[i];
if (first_diff < 0) {
int diff = field->cmp_val((void *) (((uintptr_t) params) + field->offset),
(void *) (((uintptr_t) prev) + field->offset));
if (diff)
first_diff = i;
}
}
}
/* Walk back closing conditions. */
if (prev) {
for (int i = prev_levels - 1; i > first_diff; i--) {
buf_appendf(&buf, &size, "%*sreturn NULL;\n", 4 * (i + 1), "");
buf_appendf(&buf, &size, "%*s}\n", 4 * i, "");
}
}
/* Walk up adding conditions to return current function. */
if (params) {
const ParamField **fields = op_fields[params->op];
for (int i = first_diff; i < levels; i++) {
const ParamField *field = fields[i];
void *p = (void *) (((uintptr_t) params) + field->offset);
buf_appendf(&buf, &size, "%*sif (%s%s == ", 4 * (i + 1), "", p_str, field->name);
field->print_val(&buf, &size, p);
buf_appendf(&buf, &size, ")");
if (i == (levels - 1)) {
buf_appendf(&buf, &size, " return ");
impl_func_name(&buf, &size, params);
buf_appendf(&buf, &size, ";\n");
} else {
buf_appendf(&buf, &size, " {\n");
}
}
}
av_assert0(size && "string buffer exhausted");
}
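/**
* The fragments emitted above nest one if-level per parameter field, so
* the generated lookup ends up with roughly this shape (field names and
* values illustrative only):
*   if (p-><field0> == ...) {
*       if (p-><field1> == ...) {
*           if (p-><field2> == ...) return ff_sws..._neon;
*           if (p-><field2> == ...) return ff_sws..._neon;
*           return NULL;
*       }
*       return NULL;
*   }
*/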
static int lookup_gen(void)
{
char buf[1024];
/**
* The lookup function matches the SwsAArch64OpImplParams from
* ops_entries.c to the exported functions generated by asmgen_op().
* Each call to aarch64_op_impl_lookup_str() generates a code
* fragment to uniquely detect the current function, opening and/or
* closing conditions depending on the parameters of the previous
* function.
*/
/* External function declarations. */
printf("#include \"libswscale/aarch64/ops_lookup.h\"\n");
printf("\n");
for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
aarch64_op_impl_func_name(buf, sizeof(buf), p);
printf("extern void %s(void);\n", buf);
}
printf("\n");
/* Lookup function. */
printf("SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p)\n");
printf("{\n");
const SwsAArch64OpImplParams *prev = NULL;
for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
aarch64_op_impl_lookup_str(buf, sizeof(buf), p, prev, "p->");
printf("%s", buf);
prev = p;
}
aarch64_op_impl_lookup_str(buf, sizeof(buf), NULL, prev, "p->");
printf("%s", buf);
printf(" return NULL;\n");
printf("}\n");
return 0;
}
/*********************************************************************/
/* Generate all functions described by ops_entries.c */
static int asmgen(void)
{
RasmContext *rctx = rasm_alloc();
if (!rctx)
return AVERROR(ENOMEM);
SwsAArch64Context s = { .rctx = rctx };
int ret;
/**
* The entry point of the SwsOpFunc is the `process` function. The
* kernel functions are chained by directly branching to the next
* operation, using a continuation-passing style design. The exit
* point of the SwsOpFunc is the `process_return` function.
*
* The GPRs used by the entire call-chain are listed below.
*
* Function arguments are passed in r0-r5. After the parameters
* from `exec` have been read, r0 is reused to branch to the
* continuation functions. After the original parameters from
* `impl` have been computed, r1 is reused as the `impl` pointer
* for each operation.
*
* Loop iterators are r6 for `bx` and r3 for `y`, reused from
* `y_start`, which doesn't need to be preserved.
*
* The intra-procedure-call temporary registers (r16 and r17) are
* used as scratch registers. They may be used by call veneers and
* PLT code inserted by the linker, so we cannot expect them to
* persist across branches between functions.
*
* The Platform Register (r18) is not used.
*
* The read/write data pointers and padding values first use up the
* remaining free caller-saved registers, and only then are the
* callee-saved registers (r19-r28) used.
*/
/* SwsOpFunc arguments. */
s.exec = a64op_gpx(0); // const SwsOpExec *exec
s.impl = a64op_gpx(1); // const void *priv
s.bx_start = a64op_gpw(2); // int bx_start
s.y_start = a64op_gpw(3); // int y_start
s.bx_end = a64op_gpw(4); // int bx_end
s.y_end = a64op_gpw(5); // int y_end
/* Loop iterator variables. */
s.bx = a64op_gpw(6);
s.y = s.y_start; /* Reused from SwsOpFunc argument. */
/* Scratch registers. */
s.tmp0 = a64op_gpx(16); /* IP0 */
s.tmp1 = a64op_gpx(17); /* IP1 */
/* CPS-related variables. */
s.op0_func = a64op_gpx(7);
s.op1_impl = a64op_gpx(8);
s.cont = s.exec; /* Reused from SwsOpFunc argument. */
/* Read/Write data pointers and padding. */
s.in [0] = a64op_gpx(9);
s.out [0] = a64op_gpx(10);
s.in_bump [0] = a64op_gpx(11);
s.out_bump[0] = a64op_gpx(12);
s.in [1] = a64op_gpx(13);
s.out [1] = a64op_gpx(14);
s.in_bump [1] = a64op_gpx(15);
s.out_bump[1] = a64op_gpx(19);
s.in [2] = a64op_gpx(20);
s.out [2] = a64op_gpx(21);
s.in_bump [2] = a64op_gpx(22);
s.out_bump[2] = a64op_gpx(23);
s.in [3] = a64op_gpx(24);
s.out [3] = a64op_gpx(25);
s.in_bump [3] = a64op_gpx(26);
s.out_bump[3] = a64op_gpx(27);
/* Generate all functions from ops_entries.c using rasm. */
const SwsAArch64OpImplParams *params = impl_params;
while (params->op) {
asmgen_op(&s, params++);
if (rctx->error) {
ret = rctx->error;
goto error;
}
}
/* Print all rasm functions to stdout. */
printf("#include \"libavutil/aarch64/asm.S\"\n");
printf("\n");
ret = rasm_print(s.rctx, stdout);
error:
rasm_free(&s.rctx);
return ret;
}
/*********************************************************************/
int main(int argc, char *argv[])
{
bool lookup = false;
bool ops = false;
#ifdef _WIN32
_setmode(_fileno(stdout), _O_BINARY);
#endif
for (int i = 1; i < argc; i++) {
if (!strcmp(argv[i], "-ops"))
ops = true;
else if (!strcmp(argv[i], "-lookup"))
lookup = true;
}
if ((lookup && ops) || (!lookup && !ops)) {
fprintf(stderr, "Exactly one of -ops or -lookup must be specified.\n");
return -1;
}
return lookup ? lookup_gen() : asmgen();
}