Compare commits

...

5 Commits

Author SHA1 Message Date
Andreas Rheinhardt a677b38298 avcodec/x86/vp3dsp: Remove remnants of MMX
Forgotten in eefec06634.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 12:01:52 +01:00
Andreas Rheinhardt d355749ca6 avcodec/x86/hevc/add_res: Avoid unnecessary modification
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 09:46:15 +01:00
Andreas Rheinhardt f4d9fb0bd0 avcodec/x86/hevc/add_res: Reduce number of registers used
This makes these functions use only volatile registers (even on Win64).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 09:46:15 +01:00
Andreas Rheinhardt 23efbb5e2e avcodec/x86/hevc/add_res: Remove AVX add_residual functions
The AVX and SSE2 functions are identical except for the VEX encodings
used since e9abef437f and
8b8492452d.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 09:46:15 +01:00
James Almer 7770c0bf0d avcodec/exr: don't remove Float2HalfTables tables alongside the deprecated gamma code
It's used by other parts of the module that will fail to build otherwise after
the aforementioned removal.

Signed-off-by: James Almer <jamrial@gmail.com>
2025-11-02 00:21:18 -03:00
5 changed files with 30 additions and 77 deletions

View File

@@ -196,9 +196,9 @@ typedef struct EXRContext {
 enum AVColorTransferCharacteristic apply_trc_type;
 float gamma;
 uint16_t gamma_table[65536];
-Float2HalfTables f2h_tables;
 #endif
+Float2HalfTables f2h_tables;
 Half2FloatTables h2f_tables;
 } EXRContext;
@@ -2300,9 +2300,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 union av_intfloat32 t;
 float one_gamma = 1.0f / s->gamma;
 av_csp_trc_function trc_func = NULL;
-ff_init_float2half_tables(&s->f2h_tables);
 #endif
+ff_init_float2half_tables(&s->f2h_tables);
 ff_init_half2float_tables(&s->h2f_tables);
 s->avctx = avctx;

View File

@@ -27,9 +27,9 @@ cextern pw_1023
 %define max_pixels_10 pw_1023
 ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
-%macro ADD_RES_MMX_4_8 0
-mova m0, [r1]
-mova m2, [r1+8]
+%macro ADD_RES_MMX_4_8 1
+mova m0, [r1+%1]
+mova m2, [r1+%1+8]
 movd m1, [r0]
 movd m3, [r0+r2]
@@ -50,31 +50,26 @@ INIT_MMX mmxext
 ; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 cglobal hevc_add_residual_4_8, 3, 3, 6
 pxor m4, m4
-ADD_RES_MMX_4_8
-add r1, 16
+ADD_RES_MMX_4_8 0
 lea r0, [r0+r2*2]
-ADD_RES_MMX_4_8
+ADD_RES_MMX_4_8 16
 RET
-%macro ADD_RES_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 1
 movq m0, [r0]
 movq m1, [r0+r2]
 punpcklbw m0, m4
 punpcklbw m1, m4
-mova m2, [r1]
-mova m3, [r1+16]
-paddsw m0, m2
-paddsw m1, m3
+paddsw m0, [r1+%1]
+paddsw m1, [r1+%1+16]
 packuswb m0, m1
 movq m2, [r0+r2*2]
 movq m3, [r0+r3]
 punpcklbw m2, m4
 punpcklbw m3, m4
-mova m6, [r1+32]
-mova m7, [r1+48]
-paddsw m2, m6
-paddsw m3, m7
+paddsw m2, [r1+%1+32]
+paddsw m3, [r1+%1+48]
 packuswb m2, m3
 movq [r0], m0
@@ -88,27 +83,33 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
 mova m2, m1
 punpcklbw m1, m0
 punpckhbw m2, m0
+%if cpuflag(avx2)
 mova xm5, [r1+%1]
 mova xm6, [r1+%1+16]
-%if cpuflag(avx2)
 vinserti128 m5, m5, [r1+%1+32], 1
 vinserti128 m6, m6, [r1+%1+48], 1
-%endif
 paddsw m1, m5
 paddsw m2, m6
+%else
+paddsw m1, [r1+%1]
+paddsw m2, [r1+%1+16]
+%endif
 mova m3, [%3]
 mova m4, m3
 punpcklbw m3, m0
 punpckhbw m4, m0
+%if cpuflag(avx2)
 mova xm5, [r1+%1+mmsize*2]
 mova xm6, [r1+%1+mmsize*2+16]
-%if cpuflag(avx2)
 vinserti128 m5, m5, [r1+%1+96], 1
 vinserti128 m6, m6, [r1+%1+112], 1
-%endif
 paddsw m3, m5
 paddsw m4, m6
+%else
+paddsw m3, [r1+%1+mmsize*2]
+paddsw m4, [r1+%1+mmsize*2+16]
+%endif
 packuswb m1, m2
 packuswb m3, m4
@@ -117,19 +118,18 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
 %endmacro
-%macro TRANSFORM_ADD_8 0
+INIT_XMM sse2
 ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_8_8, 3, 4, 8
+cglobal hevc_add_residual_8_8, 3, 4, 5
 pxor m4, m4
 lea r3, [r2*3]
-ADD_RES_SSE_8_8
-add r1, 64
+ADD_RES_SSE_8_8 0
 lea r0, [r0+r2*4]
-ADD_RES_SSE_8_8
+ADD_RES_SSE_8_8 64
 RET
 ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_16_8, 3, 5, 7
+cglobal hevc_add_residual_16_8, 3, 5, 5
 pxor m0, m0
 lea r3, [r2*3]
 mov r4d, 4
@@ -143,7 +143,7 @@ cglobal hevc_add_residual_16_8, 3, 5, 7
 RET
 ; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_32_8, 3, 5, 7
+cglobal hevc_add_residual_32_8, 3, 5, 5
 pxor m0, m0
 mov r4d, 16
 .loop:
@@ -154,12 +154,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7
 dec r4d
 jg .loop
 RET
-%endmacro
-INIT_XMM sse2
-TRANSFORM_ADD_8
-INIT_XMM avx
-TRANSFORM_ADD_8
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
@@ -295,9 +290,8 @@ cglobal hevc_add_residual_4_10, 3, 3, 6
 pxor m2, m2
 mova m3, [max_pixels_10]
 ADD_RES_MMX_4_10 r0, r2, r1
-add r1, 16
 lea r0, [r0+2*r2]
-ADD_RES_MMX_4_10 r0, r2, r1
+ADD_RES_MMX_4_10 r0, r2, r1+16
 RET
 INIT_XMM sse2
@@ -308,8 +302,7 @@ cglobal hevc_add_residual_8_10, 3, 4, 6
 ADD_RES_SSE_8_10 r0, r2, r3, r1
 lea r0, [r0+r2*4]
-add r1, 64
-ADD_RES_SSE_8_10 r0, r2, r3, r1
+ADD_RES_SSE_8_10 r0, r2, r3, r1+64
 RET
 cglobal hevc_add_residual_16_10, 3, 5, 6

View File

@@ -172,10 +172,6 @@ void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_8_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride);

View File

@@ -877,10 +877,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 c->idct[0] = ff_hevc_idct_4x4_8_avx;
 c->idct[1] = ff_hevc_idct_8x8_8_avx;
-c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
-c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
-c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
 }
 if (EXTERNAL_AVX2(cpu_flags)) {
 c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;

View File

@@ -520,7 +520,6 @@ cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
 %endmacro
 %macro VP3_IDCT 1
-%if mmsize == 16
 %define I(x) [%1+16*x]
 %define O(x) [%1+16*x]
 %define C(x) [vp3_idct_data+16*(x-1)]
@@ -538,37 +537,6 @@ cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
 %define ADD(x) paddsw x, [pw_8]
 VP3_1D_IDCT_SSE2
 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
-%else ; mmsize == 8
-; eax = quantized input
-; ebx = dequantizer matrix
-; ecx = IDCT constants
-; M(I) = ecx + MaskOffset(0) + I * 8
-; C(I) = ecx + CosineOffset(32) + (I-1) * 8
-; edx = output
-; r0..r7 = mm0..mm7
-%define OC_8 [pw_8]
-%define C(x) [vp3_idct_data+16*(x-1)]
-; at this point, function has completed dequantization + dezigzag +
-; partial transposition; now do the idct itself
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
-RowIDCT
-Transpose
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
-RowIDCT
-Transpose
-%define I(x) [%1+16* x]
-%define J(x) [%1+16*(x-4)+8]
-ColumnIDCT
-%define I(x) [%1+16* x +64]
-%define J(x) [%1+16*(x-4)+72]
-ColumnIDCT
-%endif ; mmsize == 16/8
 %endmacro
 %macro vp3_idct_funcs 0