avcodec/x86/vp3dsp: Remove remnants of MMX

Forgotten in eefec06634.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
avcodec/x86/hevc/add_res: Avoid unnecessary modification
2025-11-02 12:01:52 +01:00 · 2025-11-02 09:46:15 +01:00 · 2025-11-02 09:46:15 +01:00 · 2025-11-02 09:46:15 +01:00 · 2025-11-02 00:21:18 -03:00
5 changed files with 30 additions and 77 deletions
--- a/libavcodec/exr.c
+++ b/libavcodec/exr.c
@@ -196,9 +196,9 @@ typedef struct EXRContext {
    enum AVColorTransferCharacteristic apply_trc_type;
    float gamma;
    uint16_t gamma_table[65536];
-    Float2HalfTables f2h_tables;
 #endif

+    Float2HalfTables f2h_tables;
    Half2FloatTables h2f_tables;
 } EXRContext;

@@ -2300,9 +2300,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
    union av_intfloat32 t;
    float one_gamma = 1.0f / s->gamma;
    av_csp_trc_function trc_func = NULL;
-    ff_init_float2half_tables(&s->f2h_tables);
 #endif

+    ff_init_float2half_tables(&s->f2h_tables);
    ff_init_half2float_tables(&s->h2f_tables);

    s->avctx              = avctx;
--- a/libavcodec/x86/hevc/add_res.asm
+++ b/libavcodec/x86/hevc/add_res.asm
@@ -27,9 +27,9 @@ cextern pw_1023
 %define max_pixels_10 pw_1023

 ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
-%macro ADD_RES_MMX_4_8 0
-    mova              m0, [r1]
-    mova              m2, [r1+8]
+%macro ADD_RES_MMX_4_8 1
+    mova              m0, [r1+%1]
+    mova              m2, [r1+%1+8]

    movd              m1, [r0]
    movd              m3, [r0+r2]
@@ -50,31 +50,26 @@ INIT_MMX mmxext
 ; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 cglobal hevc_add_residual_4_8, 3, 3, 6
    pxor              m4, m4
-    ADD_RES_MMX_4_8
-    add               r1, 16
+    ADD_RES_MMX_4_8    0
    lea               r0, [r0+r2*2]
-    ADD_RES_MMX_4_8
+    ADD_RES_MMX_4_8   16
    RET

-%macro ADD_RES_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 1
    movq              m0, [r0]
    movq              m1, [r0+r2]
    punpcklbw         m0, m4
    punpcklbw         m1, m4
-    mova              m2, [r1]
-    mova              m3, [r1+16]
-    paddsw            m0, m2
-    paddsw            m1, m3
+    paddsw            m0, [r1+%1]
+    paddsw            m1, [r1+%1+16]
    packuswb          m0, m1

    movq              m2, [r0+r2*2]
    movq              m3, [r0+r3]
    punpcklbw         m2, m4
    punpcklbw         m3, m4
-    mova              m6, [r1+32]
-    mova              m7, [r1+48]
-    paddsw            m2, m6
-    paddsw            m3, m7
+    paddsw            m2, [r1+%1+32]
+    paddsw            m3, [r1+%1+48]
    packuswb          m2, m3

    movq            [r0], m0
@@ -88,27 +83,33 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
    mova              m2, m1
    punpcklbw         m1, m0
    punpckhbw         m2, m0
+%if cpuflag(avx2)
    mova             xm5, [r1+%1]
    mova             xm6, [r1+%1+16]
-%if cpuflag(avx2)
    vinserti128       m5, m5, [r1+%1+32], 1
    vinserti128       m6, m6, [r1+%1+48], 1
-%endif
    paddsw            m1, m5
    paddsw            m2, m6
+%else
+    paddsw            m1, [r1+%1]
+    paddsw            m2, [r1+%1+16]
+%endif

    mova              m3, [%3]
    mova              m4, m3
    punpcklbw         m3, m0
    punpckhbw         m4, m0
+%if cpuflag(avx2)
    mova             xm5, [r1+%1+mmsize*2]
    mova             xm6, [r1+%1+mmsize*2+16]
-%if cpuflag(avx2)
    vinserti128       m5, m5, [r1+%1+96], 1
    vinserti128       m6, m6, [r1+%1+112], 1
-%endif
    paddsw            m3, m5
    paddsw            m4, m6
+%else
+    paddsw            m3, [r1+%1+mmsize*2]
+    paddsw            m4, [r1+%1+mmsize*2+16]
+%endif

    packuswb          m1, m2
    packuswb          m3, m4
@@ -117,19 +118,18 @@ cglobal hevc_add_residual_4_8, 3, 3, 6
 %endmacro


-%macro TRANSFORM_ADD_8 0
+INIT_XMM sse2
 ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_8_8, 3, 4, 8
+cglobal hevc_add_residual_8_8, 3, 4, 5
    pxor              m4, m4
    lea               r3, [r2*3]
-    ADD_RES_SSE_8_8
-    add               r1, 64
+    ADD_RES_SSE_8_8    0
    lea               r0, [r0+r2*4]
-    ADD_RES_SSE_8_8
+    ADD_RES_SSE_8_8   64
    RET

 ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_16_8, 3, 5, 7
+cglobal hevc_add_residual_16_8, 3, 5, 5
    pxor                m0, m0
    lea                 r3, [r2*3]
    mov                r4d, 4
@@ -143,7 +143,7 @@ cglobal hevc_add_residual_16_8, 3, 5, 7
    RET

 ; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
-cglobal hevc_add_residual_32_8, 3, 5, 7
+cglobal hevc_add_residual_32_8, 3, 5, 5
    pxor                m0, m0
    mov                r4d, 16
 .loop:
@@ -154,12 +154,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7
    dec                r4d
    jg .loop
    RET
-%endmacro

-INIT_XMM sse2
-TRANSFORM_ADD_8
-INIT_XMM avx
-TRANSFORM_ADD_8

 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
@@ -295,9 +290,8 @@ cglobal hevc_add_residual_4_10, 3, 3, 6
    pxor              m2, m2
    mova              m3, [max_pixels_10]
    ADD_RES_MMX_4_10  r0, r2, r1
-    add               r1, 16
    lea               r0, [r0+2*r2]
-    ADD_RES_MMX_4_10  r0, r2, r1
+    ADD_RES_MMX_4_10  r0, r2, r1+16
    RET

 INIT_XMM sse2
@@ -308,8 +302,7 @@ cglobal hevc_add_residual_8_10, 3, 4, 6

    ADD_RES_SSE_8_10  r0, r2, r3, r1
    lea               r0, [r0+r2*4]
-    add               r1, 64
-    ADD_RES_SSE_8_10  r0, r2, r3, r1
+    ADD_RES_SSE_8_10  r0, r2, r3, r1+64
    RET

 cglobal hevc_add_residual_16_10, 3, 5, 6
--- a/libavcodec/x86/hevc/dsp.h
+++ b/libavcodec/x86/hevc/dsp.h
@@ -172,10 +172,6 @@ void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);

-void ff_hevc_add_residual_8_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_16_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-void ff_hevc_add_residual_32_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-
 void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride);

 void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
--- a/libavcodec/x86/hevc/dsp_init.c
+++ b/libavcodec/x86/hevc/dsp_init.c
@@ -877,10 +877,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)

            c->idct[0] = ff_hevc_idct_4x4_8_avx;
            c->idct[1] = ff_hevc_idct_8x8_8_avx;
-
-            c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
-            c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
-            c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
        }
        if (EXTERNAL_AVX2(cpu_flags)) {
            c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -520,7 +520,6 @@ cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
 %endmacro

 %macro VP3_IDCT 1
-%if mmsize == 16
 %define I(x) [%1+16*x]
 %define O(x) [%1+16*x]
 %define C(x) [vp3_idct_data+16*(x-1)]
@@ -538,37 +537,6 @@ cglobal put_vp_no_rnd_pixels8_l2, 5, 6, 0, dst, src1, src2, stride, h, stride3
 %define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
-%else ; mmsize == 8
-    ; eax = quantized input
-    ; ebx = dequantizer matrix
-    ; ecx = IDCT constants
-    ;  M(I) = ecx + MaskOffset(0) + I * 8
-    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
-    ; edx = output
-    ; r0..r7 = mm0..mm7
-%define OC_8 [pw_8]
-%define C(x) [vp3_idct_data+16*(x-1)]
-
-    ; at this point, function has completed dequantization + dezigzag +
-    ; partial transposition; now do the idct itself
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
-    RowIDCT
-    Transpose
-
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
-    RowIDCT
-    Transpose
-
-%define I(x) [%1+16* x]
-%define J(x) [%1+16*(x-4)+8]
-    ColumnIDCT
-
-%define I(x) [%1+16* x   +64]
-%define J(x) [%1+16*(x-4)+72]
-    ColumnIDCT
-%endif ; mmsize == 16/8
 %endmacro

 %macro vp3_idct_funcs 0
Author	SHA1	Message	Date
Andreas Rheinhardt	a677b38298	avcodec/x86/vp3dsp: Remove remnants of MMX Forgotten in `eefec06634`. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Reviewed-by: Lynne <dev@lynne.ee> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-02 12:01:52 +01:00
Andreas Rheinhardt	d355749ca6	avcodec/x86/hevc/add_res: Avoid unnecessary modification Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-02 09:46:15 +01:00
Andreas Rheinhardt	f4d9fb0bd0	avcodec/x86/hevc/add_res: Reduce number of registers used This makes these functions use only volatile registers (even on Win64). Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-02 09:46:15 +01:00
Andreas Rheinhardt	23efbb5e2e	avcodec/x86/hevc/add_res: Remove AVX add_residual functions The AVX and SSE2 functions are identical except for the VEX encodings used since `e9abef437f` and `8b8492452d`. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-02 09:46:15 +01:00
James Almer	7770c0bf0d	avcodec/exr: don't remove Float2HalfTables tables alongside the deprecated gamma code It's used by other parts of the module that will fail to build otherwise after the aforementioned removal. Signed-off-by: James Almer <jamrial@gmail.com>	2025-11-02 00:21:18 -03:00