Compare commits

...

5 Commits

Author SHA1 Message Date
Andreas Rheinhardt a677b38298 avcodec/x86/vp3dsp: Remove remnants of MMX
Forgotten in eefec06634.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 12:01:52 +01:00
Andreas Rheinhardt d355749ca6 avcodec/x86/hevc/add_res: Avoid unnecessary modification
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 09:46:15 +01:00
Andreas Rheinhardt f4d9fb0bd0 avcodec/x86/hevc/add_res: Reduce number of registers used
This makes these functions use only volatile registers (even on Win64).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 09:46:15 +01:00
Andreas Rheinhardt 23efbb5e2e avcodec/x86/hevc/add_res: Remove AVX add_residual functions
The AVX and SSE2 functions are identical except for the VEX encodings
used since e9abef437f and
8b8492452d.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-02 09:46:15 +01:00
James Almer 7770c0bf0d avcodec/exr: don't remove Float2HalfTables tables alongside the deprecated gamma code
The tables are used by other parts of the module, which would otherwise fail
to build after the aforementioned removal.

Signed-off-by: James Almer <jamrial@gmail.com>
2025-11-02 00:21:18 -03:00
5 changed files with 30 additions and 77 deletions

View File

@ -196,9 +196,9 @@ typedef struct EXRContext {
enum AVColorTransferCharacteristic apply_trc_type; enum AVColorTransferCharacteristic apply_trc_type;
float gamma; float gamma;
uint16_t gamma_table[65536]; uint16_t gamma_table[65536];
Float2HalfTables f2h_tables;
#endif #endif
Float2HalfTables f2h_tables;
Half2FloatTables h2f_tables; Half2FloatTables h2f_tables;
} EXRContext; } EXRContext;
@ -2300,9 +2300,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
union av_intfloat32 t; union av_intfloat32 t;
float one_gamma = 1.0f / s->gamma; float one_gamma = 1.0f / s->gamma;
av_csp_trc_function trc_func = NULL; av_csp_trc_function trc_func = NULL;
ff_init_float2half_tables(&s->f2h_tables);
#endif #endif
ff_init_float2half_tables(&s->f2h_tables);
ff_init_half2float_tables(&s->h2f_tables); ff_init_half2float_tables(&s->h2f_tables);
s->avctx = avctx; s->avctx = avctx;

View File

@ -27,9 +27,9 @@ cextern pw_1023
%define max_pixels_10 pw_1023 %define max_pixels_10 pw_1023
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
%macro ADD_RES_MMX_4_8 0 %macro ADD_RES_MMX_4_8 1
mova m0, [r1] mova m0, [r1+%1]
mova m2, [r1+8] mova m2, [r1+%1+8]
movd m1, [r0] movd m1, [r0]
movd m3, [r0+r2] movd m3, [r0+r2]
@ -50,31 +50,26 @@ INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride) ; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6 cglobal hevc_add_residual_4_8, 3, 3, 6
pxor m4, m4 pxor m4, m4
ADD_RES_MMX_4_8 ADD_RES_MMX_4_8 0
add r1, 16
lea r0, [r0+r2*2] lea r0, [r0+r2*2]
ADD_RES_MMX_4_8 ADD_RES_MMX_4_8 16
RET RET
%macro ADD_RES_SSE_8_8 0 %macro ADD_RES_SSE_8_8 1
movq m0, [r0] movq m0, [r0]
movq m1, [r0+r2] movq m1, [r0+r2]
punpcklbw m0, m4 punpcklbw m0, m4
punpcklbw m1, m4 punpcklbw m1, m4
mova m2, [r1] paddsw m0, [r1+%1]
mova m3, [r1+16] paddsw m1, [r1+%1+16]
paddsw m0, m2
paddsw m1, m3
packuswb m0, m1 packuswb m0, m1
movq m2, [r0+r2*2] movq m2, [r0+r2*2]
movq m3, [r0+r3] movq m3, [r0+r3]
punpcklbw m2, m4 punpcklbw m2, m4
punpcklbw m3, m4 punpcklbw m3, m4
mova m6, [r1+32] paddsw m2, [r1+%1+32]
mova m7, [r1+48] paddsw m3, [r1+%1+48]
paddsw m2, m6
paddsw m3, m7
packuswb m2, m3 packuswb m2, m3
movq [r0], m0 movq [r0], m0
@ -88,27 +83,33 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
mova m2, m1 mova m2, m1
punpcklbw m1, m0 punpcklbw m1, m0
punpckhbw m2, m0 punpckhbw m2, m0
%if cpuflag(avx2)
mova xm5, [r1+%1] mova xm5, [r1+%1]
mova xm6, [r1+%1+16] mova xm6, [r1+%1+16]
%if cpuflag(avx2)
vinserti128 m5, m5, [r1+%1+32], 1 vinserti128 m5, m5, [r1+%1+32], 1
vinserti128 m6, m6, [r1+%1+48], 1 vinserti128 m6, m6, [r1+%1+48], 1
%endif
paddsw m1, m5 paddsw m1, m5
paddsw m2, m6 paddsw m2, m6
%else
paddsw m1, [r1+%1]
paddsw m2, [r1+%1+16]
%endif
mova m3, [%3] mova m3, [%3]
mova m4, m3 mova m4, m3
punpcklbw m3, m0 punpcklbw m3, m0
punpckhbw m4, m0 punpckhbw m4, m0
%if cpuflag(avx2)
mova xm5, [r1+%1+mmsize*2] mova xm5, [r1+%1+mmsize*2]
mova xm6, [r1+%1+mmsize*2+16] mova xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
vinserti128 m5, m5, [r1+%1+96], 1 vinserti128 m5, m5, [r1+%1+96], 1
vinserti128 m6, m6, [r1+%1+112], 1 vinserti128 m6, m6, [r1+%1+112], 1
%endif
paddsw m3, m5 paddsw m3, m5
paddsw m4, m6 paddsw m4, m6
%else
paddsw m3, [r1+%1+mmsize*2]
paddsw m4, [r1+%1+mmsize*2+16]
%endif
packuswb m1, m2 packuswb m1, m2
packuswb m3, m4 packuswb m3, m4
@ -117,19 +118,18 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
%endmacro %endmacro
%macro TRANSFORM_ADD_8 0 INIT_XMM sse2
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8 cglobal hevc_add_residual_8_8, 3, 4, 5
pxor m4, m4 pxor m4, m4
lea r3, [r2*3] lea r3, [r2*3]
ADD_RES_SSE_8_8 ADD_RES_SSE_8_8 0
add r1, 64
lea r0, [r0+r2*4] lea r0, [r0+r2*4]
ADD_RES_SSE_8_8 ADD_RES_SSE_8_8 64
RET RET
; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7 cglobal hevc_add_residual_16_8, 3, 5, 5
pxor m0, m0 pxor m0, m0
lea r3, [r2*3] lea r3, [r2*3]
mov r4d, 4 mov r4d, 4
@ -143,7 +143,7 @@ cglobal hevc_add_residual_16_8, 3, 5, 7
RET RET
; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) ; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7 cglobal hevc_add_residual_32_8, 3, 5, 5
pxor m0, m0 pxor m0, m0
mov r4d, 16 mov r4d, 16
.loop: .loop:
@ -154,12 +154,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7
dec r4d dec r4d
jg .loop jg .loop
RET RET
%endmacro
INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8
%if HAVE_AVX2_EXTERNAL %if HAVE_AVX2_EXTERNAL
INIT_YMM avx2 INIT_YMM avx2
@ -295,9 +290,8 @@ cglobal hevc_add_residual_4_10, 3, 3, 6
pxor m2, m2 pxor m2, m2
mova m3, [max_pixels_10] mova m3, [max_pixels_10]
ADD_RES_MMX_4_10 r0, r2, r1 ADD_RES_MMX_4_10 r0, r2, r1
add r1, 16
lea r0, [r0+2*r2] lea r0, [r0+2*r2]
ADD_RES_MMX_4_10 r0, r2, r1 ADD_RES_MMX_4_10 r0, r2, r1+16
RET RET
INIT_XMM sse2 INIT_XMM sse2
@ -308,8 +302,7 @@ cglobal hevc_add_residual_8_10, 3, 4, 6
ADD_RES_SSE_8_10 r0, r2, r3, r1 ADD_RES_SSE_8_10 r0, r2, r3, r1
lea r0, [r0+r2*4] lea r0, [r0+r2*4]
add r1, 64 ADD_RES_SSE_8_10 r0, r2, r3, r1+64
ADD_RES_SSE_8_10 r0, r2, r3, r1
RET RET
cglobal hevc_add_residual_16_10, 3, 5, 6 cglobal hevc_add_residual_16_10, 3, 5, 6

View File

@ -172,10 +172,6 @@ void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t s
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride);

View File

@ -877,10 +877,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct[0] = ff_hevc_idct_4x4_8_avx; c->idct[0] = ff_hevc_idct_4x4_8_avx;
c->idct[1] = ff_hevc_idct_8x8_8_avx; c->idct[1] = ff_hevc_idct_8x8_8_avx;
c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
} }
if (EXTERNAL_AVX2(cpu_flags)) { if (EXTERNAL_AVX2(cpu_flags)) {
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2; c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;

View File

@ -520,7 +520,6 @@ cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
%endmacro %endmacro
%macro VP3_IDCT 1 %macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x] %define I(x) [%1+16*x]
%define O(x) [%1+16*x] %define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)] %define C(x) [vp3_idct_data+16*(x-1)]
@ -538,37 +537,6 @@ cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
%define ADD(x) paddsw x, [pw_8] %define ADD(x) paddsw x, [pw_8]
VP3_1D_IDCT_SSE2 VP3_1D_IDCT_SSE2
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
; eax = quantized input
; ebx = dequantizer matrix
; ecx = IDCT constants
; M(I) = ecx + MaskOffset(0) + I * 8
; C(I) = ecx + CosineOffset(32) + (I-1) * 8
; edx = output
; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]
; at this point, function has completed dequantization + dezigzag +
; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
RowIDCT
Transpose
%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
RowIDCT
Transpose
%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
ColumnIDCT
%define I(x) [%1+16* x +64]
%define J(x) [%1+16*(x-4)+72]
ColumnIDCT
%endif ; mmsize == 16/8
%endmacro %endmacro
%macro vp3_idct_funcs 0 %macro vp3_idct_funcs 0