Compare commits

...

2 Commits

Author SHA1 Message Date
Zhao Zhili 222127418b avutil/internal: remove some compats for msvc
The features these workarounds covered (e.g. the "td" and "zu" printf length modifiers) are supported by MSVC with C11.
2025-11-06 02:16:27 +00:00
Andreas Rheinhardt 79080a547a avcodec/x86/h264_chromamc: Use xmm regs in chroma_mc4 SSSE3 functions
Doubling the register size makes it possible to avoid two pmaddubsw instructions.
The new version is also ABI compliant (the old version lacked an emms),
and the avg versions no longer rely on padding (the old versions
used pavgb with a memory operand reading eight bytes,
although only four are needed).

Old benchmarks (the latter four refer to RV40):
avg_h264_chroma_mc4_8_c:                               145.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3:                            32.3 ( 4.51x)
put_h264_chroma_mc4_8_c:                               136.1 ( 1.00x)
put_h264_chroma_mc4_8_ssse3:                            29.0 ( 4.70x)
avg_chroma_mc4_c:                                      162.1 ( 1.00x)
avg_chroma_mc4_ssse3:                                   31.1 ( 5.22x)
put_chroma_mc4_c:                                      137.5 ( 1.00x)
put_chroma_mc4_ssse3:                                   28.6 ( 4.81x)

New benchmarks:
avg_h264_chroma_mc4_8_c:                               146.7 ( 1.00x)
avg_h264_chroma_mc4_8_ssse3:                            26.5 ( 5.53x)
put_h264_chroma_mc4_8_c:                               136.8 ( 1.00x)
put_h264_chroma_mc4_8_ssse3:                            22.5 ( 6.09x)
avg_chroma_mc4_c:                                      165.5 ( 1.00x)
avg_chroma_mc4_ssse3:                                   27.2 ( 6.08x)
put_chroma_mc4_c:                                      138.1 ( 1.00x)
put_chroma_mc4_ssse3:                                   23.2 ( 5.96x)

Reviewed-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-06 02:16:28 +01:00
2 changed files with 46 additions and 54 deletions

View File

@ -276,51 +276,57 @@ cglobal %1_%2_chroma_mc8%3, 6, 7+UNIX64, 8
%endmacro %endmacro
%macro chroma_mc4_ssse3_func 2 %macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0 cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 8
movq m5, [pw_32] mova m5, [pw_32]
..@%1_%2_chroma_mc4_after_init_ %+ cpuname: ..@%1_%2_chroma_mc4_after_init_ %+ cpuname:
mov r6, r4 mov r6d, r4d
shl r4d, 8 shl r4d, 8
sub r4d, r6d movd m0, [r1]
mov r6, 8 sub r6d, 8
add r4d, 8 ; x*288+8 sub r4d, r6d ; x << 8 | (8-x)
sub r6d, r5d mov r6d, r5d
imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) shl r5d, 16
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) movd m1, [r1+1]
sub r6d, 8
sub r5d, r6d ; y << 16 | (8-y)
imul r4d, r5d ; xy << 24 | (8-x)y << 16 | x(8-y) << 8 | (8-x)(8-y)
add r1, r2
movd m7, r6d movd m6, r4d ; ABCD
movd m6, r4d punpcklwd m6, m6 ; ABABCDCD
movd m0, [r1 ] pshufd m7, m6, 0x55 ; CDCDCDCDCDCDCDCD
pshufw m7, m7, 0 punpcklbw m0, m1
punpcklbw m0, [r1+1] pshufd m6, m6, 0x0 ; ABABABABABABABAB
pshufw m6, m6, 0
.next2rows: .next2rows:
movd m1, [r1+r2*1 ] movd m1, [r1]
movd m3, [r1+r2*2 ] movd m2, [r1+1]
punpcklbw m1, [r1+r2*1+1] movd m3, [r1+r2]
punpcklbw m3, [r1+r2*2+1] movd m4, [r1+r2+1]
lea r1, [r1+r2*2] punpcklbw m1, m2
movq m2, m1 punpcklqdq m0, m1
movq m4, m3 pmaddubsw m0, m6
pmaddubsw m0, m7 punpcklbw m3, m4
pmaddubsw m1, m6 punpcklqdq m1, m3
pmaddubsw m2, m7 pmaddubsw m1, m7
pmaddubsw m3, m6 %ifidn %1, avg
movd m2, [r0]
movd m4, [r0+r2]
%endif
paddw m0, m5 paddw m0, m5
paddw m2, m5 lea r1, [r1+r2*2]
paddw m1, m0 paddw m0, m1
paddw m3, m2 psrlw m0, 6
psrlw m1, 6 packuswb m0, m0
movq m0, m4 pshufd m1, m0, 0x1
psrlw m3, 6 %ifidn %1, avg
packuswb m1, m1 pavgb m0, m2
packuswb m3, m3 pavgb m1, m4
CHROMAMC_AVG m1, [r0 ] %endif
CHROMAMC_AVG m3, [r0+r2]
movd [r0 ], m1
movd [r0+r2], m3
sub r3d, 2 sub r3d, 2
movd [r0], m0
movd [r0+r2], m1
mova m0, m3
lea r0, [r0+r2*2] lea r0, [r0+r2*2]
jg .next2rows jg .next2rows
RET RET
@ -379,26 +385,23 @@ cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
%macro rv40_chroma_mc4_func 1 ; put vs avg %macro rv40_chroma_mc4_func 1 ; put vs avg
%if CONFIG_RV40_DECODER %if CONFIG_RV40_DECODER
cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 0 cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 8
rv40_get_bias m5 rv40_get_bias m5
jmp ..@%1_h264_chroma_mc4_after_init_ %+ cpuname jmp ..@%1_h264_chroma_mc4_after_init_ %+ cpuname
%endif %endif
%endmacro %endmacro
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3 INIT_XMM ssse3
%define CHROMAMC_AVG NOTHING
chroma_mc8_ssse3_func put, h264, _rnd chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd chroma_mc8_ssse3_func put, vc1, _nornd
rv40_chroma_mc8_func put rv40_chroma_mc8_func put
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264 chroma_mc4_ssse3_func put, h264
rv40_chroma_mc4_func put rv40_chroma_mc4_func put
%define CHROMAMC_AVG DIRECT_AVG %define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd chroma_mc8_ssse3_func avg, vc1, _nornd
rv40_chroma_mc8_func avg rv40_chroma_mc8_func avg
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264 chroma_mc4_ssse3_func avg, h264
rv40_chroma_mc4_func avg rv40_chroma_mc4_func avg

View File

@ -115,19 +115,8 @@ void avpriv_report_missing_feature(void *avc,
void avpriv_request_sample(void *avc, void avpriv_request_sample(void *avc,
const char *msg, ...) av_printf_format(2, 3); const char *msg, ...) av_printf_format(2, 3);
#if HAVE_LIBC_MSVCRT
#include <crtversion.h>
#if defined(_VC_CRT_MAJOR_VERSION) && _VC_CRT_MAJOR_VERSION < 14
#pragma comment(linker, "/include:" EXTERN_PREFIX "avpriv_strtod")
#pragma comment(linker, "/include:" EXTERN_PREFIX "avpriv_snprintf")
#endif
#define PTRDIFF_SPECIFIER "Id"
#define SIZE_SPECIFIER "Iu"
#else
#define PTRDIFF_SPECIFIER "td" #define PTRDIFF_SPECIFIER "td"
#define SIZE_SPECIFIER "zu" #define SIZE_SPECIFIER "zu"
#endif
#ifdef DEBUG #ifdef DEBUG
# define ff_dlog(ctx, ...) av_log(ctx, AV_LOG_DEBUG, __VA_ARGS__) # define ff_dlog(ctx, ...) av_log(ctx, AV_LOG_DEBUG, __VA_ARGS__)