Compare commits

...

4 Commits

Author SHA1 Message Date
Andreas Rheinhardt ed007ad427 avcodec/x86/fpel: Port ff_put_pixels8_mmx() to SSE2
This has the advantage of not violating the ABI by using
MMX registers without issuing emms; e.g. it allows
removing an emms_c from bink.c.

This commit uses GP registers on Unix64 (there are not
enough volatile registers to do likewise on Win64), which
reduces code size and is faster on some CPUs.
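
A minimal C sketch of this GP-register strategy (illustrative only; the
actual implementation is the ff_put_pixels8_sse2 assembly further down,
which unrolls four rows at a time through r5-r8): each 8-pixel row is
copied with a single 64-bit load and store, so no vector register is
touched and no emms is ever needed.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical C equivalent of the Unix64 GPR path of put_pixels8. */
static void put_pixels8_gpr(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        uint64_t row;
        memcpy(&row, pixels, 8); /* one 64-bit load  */
        memcpy(block, &row, 8);  /* one 64-bit store */
        pixels += line_size;
        block  += line_size;
    }
}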

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-10-17 13:27:56 +02:00
Andreas Rheinhardt d91b1559e0 avcodec/x86/me_cmp: Replace MMXEXT size 16 funcs by unaligned SSE2 funcs
Snow calls some of the me_cmp_funcs with insufficient alignment
for the first pointer (see get_block_rd() in snowenc.c);
therefore SSE2 functions that actually require this alignment
are not set for Snow, and commit 542765ce3e consequently
did not remove the MMXEXT functions that these SSE2 functions
override for normal codecs.

For reference, here is a command line which would segfault
if one simply used the ordinary SSE2 functions for Snow:

./ffmpeg -i mm-short.mpg -an -vcodec snow -t 0.2 -pix_fmt yuv444p \
-vstrict -2 -qscale 2 -flags +qpel -motion_est iter 444iter.avi

This commit adds unaligned SSE2 versions of these functions
and removes the MMXEXT ones. In particular, this implies that
sad 16x16 now never uses MMX, which allows removing an emms_c
from ac3enc.c.
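
As a rough illustration of what "unaligned version" means here (a sketch
with SSE2 intrinsics, not the actual asm; the real functions are
ff_sad16_sse2 and ff_sad16u_sse2 below), the only difference is whether
the pix1 rows are fetched with aligned or unaligned loads before psadbw.
The movu-based variant corresponds to the sad_0_sse2u row in the
benchmarks below.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical 16-wide SAD; 'unaligned' selects movu-style loads for pix1. */
static int sad16_sse2_sketch(const uint8_t *pix1, const uint8_t *pix2,
                             ptrdiff_t stride, int h, int unaligned)
{
    __m128i acc = _mm_setzero_si128();
    for (int y = 0; y < h; y++) {
        __m128i a = unaligned ? _mm_loadu_si128((const __m128i *)pix1)
                              : _mm_load_si128((const __m128i *)pix1);
        __m128i b = _mm_loadu_si128((const __m128i *)pix2);
        acc = _mm_add_epi64(acc, _mm_sad_epu8(a, b)); /* two partial sums */
        pix1 += stride;
        pix2 += stride;
    }
    /* Combine the low and high 64-bit partial sums. */
    return _mm_cvtsi128_si32(acc) + _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
}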

Benchmarks (u means unaligned version):
sad_0_c:                                                 8.2 ( 1.00x)
sad_0_mmxext:                                           10.8 ( 0.76x)
sad_0_sse2:                                              6.2 ( 1.33x)
sad_0_sse2u:                                             6.7 ( 1.23x)

vsad_0_c:                                               44.7 ( 1.00x)
vsad_0_mmxext (approx):                                 12.2 ( 3.68x)
vsad_0_sse2 (approx):                                    7.8 ( 5.75x)

vsad_4_c:                                               88.4 ( 1.00x)
vsad_4_mmxext:                                           7.1 (12.46x)
vsad_4_sse2:                                             4.2 (21.15x)
vsad_4_sse2u:                                            5.5 (15.96x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-10-17 13:05:07 +02:00
Andreas Rheinhardt 69a700043d avcodec/x86/me_cmp: Remove MMXEXT functions overridden by SSE2
The SSE2 functions overriding them are currently only set
if the SSE2SLOW flag is not set and if the codec is not Snow.
The former affects only outdated processors (AMDs from
before Barcelona, i.e. before 2007) and is therefore irrelevant.
Snow does not use the pix_abs function pointers at all,
so this is no obstacle either.
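
Purely to make the simplified gating concrete (hypothetical flag names,
not the real AV_CPU_FLAG_*/EXTERNAL_SSE2 macros): the SSE2SLOW test is
what gets dropped, and the Snow special case is irrelevant for pix_abs.

#include <stdbool.h>

#define FLAG_SSE2     (1 << 0)  /* stand-in for AV_CPU_FLAG_SSE2 */
#define FLAG_SSE2SLOW (1 << 1)  /* stand-in for AV_CPU_FLAG_SSE2SLOW */

/* Old gating: SSE2 overrides skipped on "slow SSE2" CPUs and for Snow. */
static bool set_sse2_pix_abs_old(int cpu_flags, bool is_snow)
{
    return (cpu_flags & FLAG_SSE2) && !(cpu_flags & FLAG_SSE2SLOW) && !is_snow;
}

/* New gating: SSE2 support alone suffices. */
static bool set_sse2_pix_abs_new(int cpu_flags)
{
    return (cpu_flags & FLAG_SSE2) != 0;
}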

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-10-17 13:05:07 +02:00
Andreas Rheinhardt 20c4608af8 avcodec/x86/me_cmp: Add SSE2 sad 8,16 xy2 functions
The new functions are faster than the existing exact
functions, yet are still beaten by the non-exact functions
(which can avoid unpacking to words and back).
The exact (slow) MMX functions have therefore been
removed, which was actually beneficial size-wise
(416B of new functions vs. 619B of functions removed).
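
For context, a scalar C sketch of the two rounding variants (names are
illustrative; the real code is the sad*_xy2 asm in the me_cmp.asm diff
below): the exact form widens to 16 bits and computes (a+b+c+d+2)>>2 per
pixel, while the approximate form stays in bytes by nesting rounding
averages (the pavgb pattern), which is why it is faster. The real
approximate asm additionally applies a rounding-bias correction (hence
the pb_1 constant) that this sketch omits.

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Exact half-pel (xy2) SAD: 2x2 neighbourhood average with rounding.
 * Note that it reads one extra column and row of pix2, as half-pel
 * interpolation requires. */
static int sad_xy2_exact(const uint8_t *pix1, const uint8_t *pix2,
                         ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int avg = (pix2[x] + pix2[x + 1] +
                       pix2[x + stride] + pix2[x + stride + 1] + 2) >> 2;
            sum += abs(pix1[x] - avg);
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}

/* Approximate variant: nested byte-wise rounding averages. */
static int sad_xy2_approx(const uint8_t *pix1, const uint8_t *pix2,
                          ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int a = (pix2[x] + pix2[x + 1] + 1) >> 1;                   /* pavgb */
            int b = (pix2[x + stride] + pix2[x + stride + 1] + 1) >> 1; /* pavgb */
            int avg = (a + b + 1) >> 1;                                 /* pavgb */
            sum += abs(pix1[x] - avg);
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}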

pix_abs_0_3_c:                                         216.8 ( 1.00x)
pix_abs_0_3_mmx:                                        71.8 ( 3.02x)
pix_abs_0_3_mmxext (approximative):                     17.6 (12.34x)
pix_abs_0_3_sse2:                                       23.5 ( 9.23x)
pix_abs_0_3_sse2 (approximative):                        9.9 (21.94x)

pix_abs_1_3_c:                                          98.4 ( 1.00x)
pix_abs_1_3_mmx:                                        36.9 ( 2.66x)
pix_abs_1_3_mmxext (approximative):                      9.2 (10.73x)
pix_abs_1_3_sse2:                                       14.8 ( 6.63x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-10-17 13:05:07 +02:00
10 changed files with 205 additions and 364 deletions

View File

@@ -32,7 +32,6 @@
#include "libavutil/avassert.h"
#include "libavutil/channel_layout.h"
#include "libavutil/crc.h"
#include "libavutil/emms.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
@@ -945,8 +944,6 @@ static void ac3_process_exponents(AC3EncodeContext *s)
compute_exp_strategy(s);
encode_exponents(s);
emms_c();
}

View File

@@ -21,7 +21,6 @@
*/
#include "libavutil/attributes.h"
#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
@@ -1297,7 +1296,6 @@ static int decode_frame(AVCodecContext *avctx, AVFrame *frame,
if (get_bits_count(&gb) >= bits_count)
break;
}
emms_c();
if (c->version > 'b') {
if ((ret = av_frame_replace(c->last, frame)) < 0)

View File

@@ -46,13 +46,6 @@ static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
#endif /* HAVE_SSE2_EXTERNAL */
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c)
{
#if HAVE_MMX_EXTERNAL
c->put_cavs_qpel_pixels_tab[1][0] = ff_put_pixels8x8_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
#if HAVE_SSE2_EXTERNAL
#define DEF_QPEL(OPNAME) \
void ff_ ## OPNAME ## _cavs_qpel8_mc20_sse2(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \
@@ -98,9 +91,6 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
if (X86_MMX(cpu_flags))
cavsdsp_init_mmx(c);
#if HAVE_MMX_EXTERNAL
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_cavs_qpel_pixels_tab[1][0] = ff_avg_pixels8x8_mmxext;
@@ -113,6 +103,7 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c)
c->put_cavs_qpel_pixels_tab[0][ 4] = put_cavs_qpel16_mc01_sse2;
c->put_cavs_qpel_pixels_tab[0][ 8] = put_cavs_qpel16_mc02_sse2;
c->put_cavs_qpel_pixels_tab[0][12] = put_cavs_qpel16_mc03_sse2;
c->put_cavs_qpel_pixels_tab[1][ 0] = ff_put_pixels8x8_sse2;
c->put_cavs_qpel_pixels_tab[1][ 2] = ff_put_cavs_qpel8_mc20_sse2;
c->put_cavs_qpel_pixels_tab[1][ 4] = put_cavs_qpel8_mc01_sse2;
c->put_cavs_qpel_pixels_tab[1][ 8] = ff_put_cavs_qpel8_mc02_sse2;

View File

@@ -27,7 +27,7 @@ SECTION .text
; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
; ptrdiff_t line_size, int h)
%macro OP_PIXELS 2
%macro OP_PIXELS 2-3 0
%if %2 == mmsize/2
%define LOAD movh
%define SAVE movh
@@ -35,14 +35,25 @@ SECTION .text
%define LOAD movu
%define SAVE mova
%endif
cglobal %1_pixels%2x%2, 3,5,4
cglobal %1_pixels%2x%2, 3,5+4*%3,%3 ? 4 : 0
mov r3d, %2
jmp %1_pixels%2_after_prologue
cglobal %1_pixels%2, 4,5,4
cglobal %1_pixels%2, 4,5+4*%3,%3 ? 4 : 0
%1_pixels%2_after_prologue:
lea r4, [r2*3]
.loop:
%if %3
; Use GPRs on UNIX64 for put8, but not on Win64 due to a lack of volatile GPRs
mov r5q, [r1]
mov r6q, [r1+r2]
mov r7q, [r1+r2*2]
mov r8q, [r1+r4]
mov [r0], r5q
mov [r0+r2], r6q
mov [r0+r2*2], r7q
mov [r0+r4], r8q
%else
LOAD m0, [r1]
LOAD m1, [r1+r2]
LOAD m2, [r1+r2*2]
@@ -57,6 +68,7 @@ cglobal %1_pixels%2, 4,5,4
SAVE [r0+r2], m1
SAVE [r0+r2*2], m2
SAVE [r0+r4], m3
%endif
sub r3d, 4
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
@@ -64,12 +76,10 @@ cglobal %1_pixels%2, 4,5,4
RET
%endmacro
INIT_MMX mmx
OP_PIXELS put, 8
INIT_MMX mmxext
OP_PIXELS avg, 8
INIT_XMM sse2
OP_PIXELS put, 8, UNIX64
OP_PIXELS put, 16
OP_PIXELS avg, 16

View File

@@ -30,10 +30,10 @@ void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16x16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8x8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels8_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8x8_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16x16_sse2(uint8_t *block, const uint8_t *pixels,

View File

@@ -74,14 +74,6 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
#if HAVE_MMX_EXTERNAL
c->put_no_rnd_pixels_tab[1][0] =
c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
#endif
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
@@ -115,6 +107,9 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_sse2;
c->put_no_rnd_pixels_tab[1][0] =
c->put_pixels_tab[1][0] = ff_put_pixels8_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
@@ -143,9 +138,6 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags))
hpeldsp_init_mmx(c, flags);
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags);

View File

@@ -23,10 +23,9 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_1
cextern pb_80
cextern pw_2
SECTION .text
@@ -468,22 +467,25 @@ HF_NOISE 16
;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
;%1 = 8/16, %2 = a/u (whether pix1 is aligned or not)
%macro SAD 1-2
%ifidn %2, u
cglobal sad%1u, 5, 5, 5, v, pix1, pix2, stride, h
%else
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
%endif
movu m2, [pix2q]
movu m1, [pix2q+strideq]
%ifidn %2, u
movu m0, [pix1q]
movu m3, [pix1q+strideq]
psadbw m2, m0
psadbw m1, m3
%else
psadbw m2, [pix1q]
psadbw m1, [pix1q+strideq]
paddw m2, m1
%if %1 != mmsize
movu m0, [pix2q+8]
movu m1, [pix2q+strideq+8]
psadbw m0, [pix1q+8]
psadbw m1, [pix1q+strideq+8]
paddw m2, m0
paddw m2, m1
%endif
paddw m2, m1
sub hd, 2
align 16
@@ -492,18 +494,17 @@ align 16
lea pix2q, [pix2q+strideq*2]
movu m0, [pix2q]
movu m1, [pix2q+strideq]
%ifidn %2, u
movu m3, [pix1q]
movu m4, [pix1q+strideq]
psadbw m0, m3
psadbw m1, m4
%else
psadbw m0, [pix1q]
psadbw m1, [pix1q+strideq]
paddw m2, m0
paddw m2, m1
%if %1 != mmsize
movu m0, [pix2q+8]
movu m1, [pix2q+strideq+8]
psadbw m0, [pix1q+8]
psadbw m1, [pix1q+strideq+8]
paddw m2, m0
paddw m2, m1
%endif
paddw m2, m0
paddw m2, m1
sub hd, 2
jg .loop
%if mmsize == 16
@@ -516,9 +517,9 @@ align 16
INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16
SAD 16, u
;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
@@ -540,16 +541,6 @@ cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
psadbw m0, [pix1q]
psadbw m2, [pix1q+strideq]
paddw m0, m2
%if %1 != mmsize
movu m1, [pix2q+8]
movu m2, [pix2q+strideq+8]
pavgb m1, [pix2q+9]
pavgb m2, [pix2q+strideq+9]
psadbw m1, [pix1q+8]
psadbw m2, [pix1q+strideq+8]
paddw m0, m1
paddw m0, m2
%endif
sub hd, 2
align 16
@@ -571,16 +562,6 @@ align 16
psadbw m2, [pix1q+strideq]
paddw m0, m1
paddw m0, m2
%if %1 != mmsize
movu m1, [pix2q+8]
movu m2, [pix2q+strideq+8]
pavgb m1, [pix2q+9]
pavgb m2, [pix2q+strideq+9]
psadbw m1, [pix1q+8]
psadbw m2, [pix1q+strideq+8]
paddw m0, m1
paddw m0, m2
%endif
sub hd, 2
jg .loop
%if mmsize == 16
@@ -593,7 +574,6 @@ align 16
INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16
@@ -612,18 +592,6 @@ cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
psadbw m0, [pix1q+strideq]
paddw m0, m1
mova m1, m3
%if %1 != mmsize
movu m4, [pix2q+8]
movu m5, [pix2q+strideq+8]
movu m6, [pix2q+2*strideq+8]
pavgb m4, m5
pavgb m5, m6
psadbw m4, [pix1q+8]
psadbw m5, [pix1q+strideq+8]
paddw m0, m4
paddw m0, m5
mova m4, m6
%endif
add pix2q, strideq
sub hd, 2
@@ -640,17 +608,6 @@ align 16
paddw m0, m1
paddw m0, m2
mova m1, m3
%if %1 != mmsize
movu m5, [pix2q+8]
movu m6, [pix2q+strideq+8]
pavgb m4, m5
pavgb m5, m6
psadbw m4, [pix1q+8]
psadbw m5, [pix1q+strideq+8]
paddw m0, m4
paddw m0, m5
mova m4, m6
%endif
sub hd, 2
jg .loop
%if mmsize == 16
@@ -663,10 +620,105 @@ align 16
INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16
;------------------------------------------------------------------------------------------
;int ff_sad_xy2_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16, %2 = aligned mov, %3 = unaligned mov
%macro SAD_XY2 3
cglobal sad%1_xy2, 5, 5, mmsize == 16 ? 8 + ARCH_X86_64 : 7, v, pix1, pix2, stride, h
mov%3 m2, [pix2q]
mov%3 m3, [pix2q+1]
%if %1 == mmsize
%if ARCH_X86_64
mova m8, [pw_2]
%define PW_2 m8
%else
%define PW_2 [pw_2]
%endif
%else ; %1 != mmsize
mova m6, [pw_2]
%define PW_2 m6
%endif
pxor m1, m1
add pix2q, strideq
%if %1 != mmsize/2
mova m6, m2
mova m7, m3
punpckhbw m6, m1
punpckhbw m7, m1
paddw m6, m7
%endif
punpcklbw m2, m1
punpcklbw m3, m1
paddw m2, m3
mova m0, m1
.loop:
mov%3 m3, [pix2q]
mov%3 m4, [pix2q+1]
%if %1 != mmsize/2
mova m5, m3
mova m7, m4
punpckhbw m5, m1
punpckhbw m7, m1
paddw m7, m5
paddw m7, PW_2
paddw m6, m7
psraw m6, 2
%endif
mov%2 m5, [pix1q]
punpcklbw m3, m1
punpcklbw m4, m1
paddw m3, m4
paddw m3, PW_2
paddw m2, m3
psraw m2, 2
packuswb m2, m6
psadbw m2, m5
paddw m0, m2
mov%3 m2, [pix2q+strideq]
mov%3 m4, [pix2q+strideq+1]
%if %1 != mmsize/2
mova m5, m2
mova m6, m4
punpckhbw m5, m1
punpckhbw m6, m1
paddw m6, m5
paddw m7, m6
psraw m7, 2
%endif
mov%2 m5, [pix1q+strideq]
punpcklbw m2, m1
punpcklbw m4, m1
paddw m2, m4
paddw m3, m2
psraw m3, 2
packuswb m3, m7
psadbw m3, m5
paddw m0, m3
sub hd, 2
lea pix1q, [pix1q+2*strideq]
lea pix2q, [pix2q+2*strideq]
jnz .loop
%if %1 == 16
movhlps m1, m0
paddw m0, m1
%endif
movd eax, m0
RET
%endmacro
INIT_XMM sse2
SAD_XY2 8, h, h
SAD_XY2 16, a, u
;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
@@ -696,22 +748,6 @@ cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
psadbw m0, [pix1q+strideq]
paddw m0, m1
mova m1, m3
%if %1 != mmsize
movu m5, [pix2q+8]
movu m6, [pix2q+strideq+8]
movu m7, [pix2q+2*strideq+8]
pavgb m5, [pix2q+1+8]
pavgb m6, [pix2q+strideq+1+8]
pavgb m7, [pix2q+2*strideq+1+8]
psubusb m6, m4
pavgb m5, m6
pavgb m6, m7
psadbw m5, [pix1q+8]
psadbw m6, [pix1q+strideq+8]
paddw m0, m5
paddw m0, m6
mova m5, m7
%endif
add pix2q, strideq
sub hd, 2
@@ -738,20 +774,6 @@ align 16
paddw m0, m1
paddw m0, m2
mova m1, m3
%if %1 != mmsize
movu m6, [pix2q+8]
movu m7, [pix2q+strideq+8]
pavgb m6, [pix2q+8+1]
pavgb m7, [pix2q+strideq+8+1]
psubusb m6, m4
pavgb m5, m6
pavgb m6, m7
psadbw m5, [pix1q+8]
psadbw m6, [pix1q+strideq+8]
paddw m0, m5
paddw m0, m6
mova m5, m7
%endif
sub hd, 2
jg .loop
%if mmsize == 16
@@ -764,7 +786,6 @@ align 16
INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16
@@ -772,46 +793,26 @@ SAD_APPROX_XY2 16
;int ff_vsad_intra(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
; ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
mova m0, [pix1q]
%if %1 == mmsize
mova m2, [pix1q+lsizeq]
psadbw m0, m2
; %1 = 8/16, %2 = a/u (whether pix1 is aligned or not)
%macro VSAD_INTRA 2
%ifidn %2, u
cglobal vsad_intra%1u, 5, 5, 3, v, pix1, pix2, lsize, h
%else
mova m2, [pix1q+lsizeq]
mova m3, [pix1q+8]
mova m4, [pix1q+lsizeq+8]
psadbw m0, m2
psadbw m3, m4
paddw m0, m3
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
%endif
mov%2 m0, [pix1q]
mov%2 m2, [pix1q+lsizeq]
psadbw m0, m2
sub hd, 2
.loop:
lea pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
mova m1, [pix1q]
mov%2 m1, [pix1q]
psadbw m2, m1
paddw m0, m2
mova m2, [pix1q+lsizeq]
mov%2 m2, [pix1q+lsizeq]
psadbw m1, m2
paddw m0, m1
%else
mova m1, [pix1q]
mova m3, [pix1q+8]
psadbw m2, m1
psadbw m4, m3
paddw m0, m2
paddw m0, m4
mova m2, [pix1q+lsizeq]
mova m4, [pix1q+lsizeq+8]
psadbw m1, m2
psadbw m3, m4
paddw m0, m1
paddw m0, m3
%endif
sub hd, 2
jg .loop
@@ -824,22 +825,25 @@ cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
%endmacro
INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
VSAD_INTRA 8, a
INIT_XMM sse2
VSAD_INTRA 16
VSAD_INTRA 16, a
VSAD_INTRA 16, u
;---------------------------------------------------------------------
;int ff_vsad_approx(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
; ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
; %1 = 8/16, %2 = a/u (whether pix1 is aligned or not)
%macro VSAD_APPROX 2
%ifidn %2, u
cglobal vsad%1u_approx, 5, 5, 5, v, pix1, pix2, lsize, h
%else
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
%endif
mova m1, [pb_80]
mova m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
mova m4, [pix1q+lsizeq]
mov%2 m0, [pix1q]
mov%2 m4, [pix1q+lsizeq]
%if mmsize == 16
movu m3, [pix2q]
movu m2, [pix2q+lsizeq]
@@ -852,29 +856,12 @@ cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
pxor m0, m1
pxor m4, m1
psadbw m0, m4
%else ; vsad16_mmxext
mova m3, [pix1q+8]
psubb m0, [pix2q]
psubb m3, [pix2q+8]
pxor m0, m1
pxor m3, m1
mova m4, [pix1q+lsizeq]
mova m5, [pix1q+lsizeq+8]
psubb m4, [pix2q+lsizeq]
psubb m5, [pix2q+lsizeq+8]
pxor m4, m1
pxor m5, m1
psadbw m0, m4
psadbw m3, m5
paddw m0, m3
%endif
sub hd, 2
.loop:
lea pix1q, [pix1q + 2*lsizeq]
lea pix2q, [pix2q + 2*lsizeq]
mova m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
mov%2 m2, [pix1q]
%if mmsize == 16
movu m3, [pix2q]
psubb m2, m3
@@ -884,33 +871,12 @@ cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
pxor m2, m1
psadbw m4, m2
paddw m0, m4
mova m4, [pix1q+lsizeq]
mov%2 m4, [pix1q+lsizeq]
movu m3, [pix2q+lsizeq]
psubb m4, m3
pxor m4, m1
psadbw m2, m4
paddw m0, m2
%else ; vsad16_mmxext
mova m3, [pix1q+8]
psubb m2, [pix2q]
psubb m3, [pix2q+8]
pxor m2, m1
pxor m3, m1
psadbw m4, m2
psadbw m5, m3
paddw m0, m4
paddw m0, m5
mova m4, [pix1q+lsizeq]
mova m5, [pix1q+lsizeq+8]
psubb m4, [pix2q+lsizeq]
psubb m5, [pix2q+lsizeq+8]
pxor m4, m1
pxor m5, m1
psadbw m2, m4
psadbw m3, m5
paddw m0, m2
paddw m0, m3
%endif
sub hd, 2
jg .loop
@@ -923,7 +889,7 @@ cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
%endmacro
INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
VSAD_APPROX 8, a
INIT_XMM sse2
VSAD_APPROX 16
VSAD_APPROX 16, a
VSAD_APPROX 16, u

View File

@@ -24,8 +24,6 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideoenc.h"
@@ -42,40 +40,38 @@ int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(const uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16u_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad8_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_sad16_xy2_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad_intra16u_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1, \
@@ -121,127 +117,10 @@ static int nsse8_mmx(MPVEncContext *c, const uint8_t *pix1, const uint8_t *pix2,
#endif /* HAVE_X86ASM */
#if HAVE_INLINE_ASM
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
static inline void sad8_4_mmx(const uint8_t *blk1, const uint8_t *blk2,
ptrdiff_t stride, int h)
{
x86_reg len = -stride * h;
__asm__ volatile (
"movq (%1, %%"FF_REG_a"), %%mm0\n\t"
"movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%2, %%"FF_REG_a"), %%mm2\n\t"
"movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"movq %5, %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
"movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"psubusb %%mm0, %%mm4 \n\t"
"psubusb %%mm5, %%mm0 \n\t"
"por %%mm4, %%mm0 \n\t"
"movq %%mm0, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm3, %%mm1 \n\t"
"add %4, %%"FF_REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
"r" (stride), "m" (round_tab[2]));
}
static inline int sum_mmx(void)
{
int ret;
__asm__ volatile (
"movq %%mm6, %%mm0 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
: "=r" (ret));
return ret & 0xFFFF;
}
#define PIX_SADXY(suf) \
static int sad8_xy2_ ## suf(MPVEncContext *v, const uint8_t *blk2, \
const uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
::); \
\
sad8_4_ ## suf(blk1, blk2, stride, h); \
\
return sum_ ## suf(); \
} \
\
static int sad16_xy2_ ## suf(MPVEncContext *v, const uint8_t *blk2, \
const uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"pxor %%mm6, %%mm6 \n\t" \
::); \
\
sad8_4_ ## suf(blk1, blk2, stride, h); \
sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
\
return sum_ ## suf(); \
} \
PIX_SADXY(mmx)
#endif /* HAVE_INLINE_ASM */
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
c->pix_abs[0][3] = sad16_xy2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;
}
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMX(cpu_flags)) {
c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
@@ -256,24 +135,17 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
#endif
c->sad[0] = ff_sad16_mmxext;
c->sad[1] = ff_sad8_mmxext;
c->pix_abs[0][0] = ff_sad16_mmxext;
c->pix_abs[0][1] = ff_sad16_x2_mmxext;
c->pix_abs[0][2] = ff_sad16_y2_mmxext;
c->pix_abs[1][0] = ff_sad8_mmxext;
c->pix_abs[1][1] = ff_sad8_x2_mmxext;
c->pix_abs[1][2] = ff_sad8_y2_mmxext;
c->vsad[4] = ff_vsad_intra16_mmxext;
c->vsad[5] = ff_vsad_intra8_mmxext;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
c->vsad[0] = ff_vsad16_approx_mmxext;
c->vsad[1] = ff_vsad8_approx_mmxext;
}
}
@@ -282,21 +154,36 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->sse[0] = ff_sse16_sse2;
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
c->pix_abs[0][0] = ff_sad16_sse2;
c->pix_abs[0][1] = ff_sad16_x2_sse2;
c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->pix_abs[0][3] = ff_sad16_xy2_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
if (avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0] = ff_sad16_sse2;
c->pix_abs[0][0] = ff_sad16_sse2;
c->pix_abs[0][1] = ff_sad16_x2_sse2;
c->pix_abs[0][2] = ff_sad16_y2_sse2;
c->vsad[4] = ff_vsad_intra16_sse2;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
c->vsad[0] = ff_vsad16_approx_sse2;
}
} else {
// Snow does not abide by the alignment requirements
// of blk1, so we use special versions without them for it.
c->sad[0] = ff_sad16u_sse2;
c->vsad[4] = ff_vsad_intra16u_sse2;
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
c->vsad[0] = ff_vsad16u_approx_sse2;
}
}
if (avctx->flags & AV_CODEC_FLAG_BITEXACT) {
c->pix_abs[1][3] = ff_sad8_xy2_sse2;
} else {
c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
}
}

View File

@@ -521,8 +521,6 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
c->put_no_rnd_qpel_pixels_tab[1][0] =
c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_mmx;
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
@@ -532,6 +530,8 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_no_rnd_qpel_pixels_tab[0][0] =
c->put_qpel_pixels_tab[0][0] = ff_put_pixels16x16_sse2;
c->put_no_rnd_qpel_pixels_tab[1][0] =
c->put_qpel_pixels_tab[1][0] = ff_put_pixels8x8_sse2;
c->avg_qpel_pixels_tab[0][0] = ff_avg_pixels16x16_sse2;
}
#endif

View File

@@ -73,7 +73,7 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, ptrdiff_t stride, int pq)
ff_ ## OP ## pixels ## DEPTH ## INSN(dst, src, stride, DEPTH); \
}
DECLARE_FUNCTION(put_, 8, _mmx)
DECLARE_FUNCTION(put_, 8, _sse2)
DECLARE_FUNCTION(avg_, 8, _mmxext)
DECLARE_FUNCTION(put_, 16, _sse2)
DECLARE_FUNCTION(avg_, 16, _sse2)
@@ -125,7 +125,6 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_LF4(mmxext);
@@ -142,6 +141,7 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
ASSIGN_LF816(sse2);
dsp->put_vc1_mspel_pixels_tab[0][0] = put_vc1_mspel_mc00_16_sse2;
dsp->put_vc1_mspel_pixels_tab[1][0] = put_vc1_mspel_mc00_8_sse2;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {