Compare commits

...

3 Commits

Author SHA1 Message Date
Bin Peng fef37af2e2 lavc/aarch64: Fix addp overflow in ff_pred16x16_plane_neon_10
The mismatch between neon and C functions can be reproduced
using the following bitstream and command line on aarch64 devices.

wget https://streams.videolan.org/ffmpeg/incoming/replay_intra_pred_16x16.h264
 ./ffmpeg -cpuflags 0  -threads 1 -i replay_intra_pred_16x16.h264  -f framemd5 -y md5_ref
 ./ffmpeg              -threads 1 -i replay_intra_pred_16x16.h264 -f framemd5 -y md5_neon

Signed-off-by: Bin Peng <pengbin@visionular.com>
(cherry picked from commit 3115c0c0e6)
2025-11-04 14:11:13 +02:00
Bin Peng 8c8cb15db4 lavc/aarch64: Fix ff_pred16x16_plane_neon_10
Fix test failure on aarch64:
./tests/checkasm/checkasm --test=h264pred 367840

Signed-off-by: Peng Bin <pengbin@visionular.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 72a3656e84)
2025-11-04 14:11:09 +02:00
Bin Peng 7ec86238a3 lavc/aarch64: Fix ff_pred8x8_plane_neon_10
Fix test failure on aarch64:
./tests/checkasm/checkasm --test=h264pred 479612

The mismatch between neon and C functions can also be reproduced using the following bitstream and command line.

wget https://streams.videolan.org/ffmpeg/incoming/intra8x8pred_10bit.264
 ./ffmpeg -cpuflags 0  -threads 1 -i intra8x8pred_10bit.264  -f framemd5 -y md5_ref
 ./ffmpeg              -threads 1 -i intra8x8pred_10bit.264  -f framemd5 -y md5_neon

Signed-off-by: Bin Peng <pengbin@visionular.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit decc9e643c)
2025-11-04 14:11:04 +02:00
1 changed files with 15 additions and 18 deletions

View File

@ -489,10 +489,10 @@ function ff_pred16x16_plane_neon_10, export=1
mul v2.8h, v2.8h, v0.8h
mul v3.8h, v3.8h, v0.8h
addp v2.8h, v2.8h, v3.8h
addp v2.8h, v2.8h, v2.8h
addp v2.4h, v2.4h, v2.4h
sshll v3.4s, v2.4h, #2
saddw v2.4s, v3.4s, v2.4h
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s
shl v3.4s, v2.4s, #2
add v2.4s, v3.4s, v2.4s
rshrn v4.4h, v2.4s, #6
trn2 v5.4h, v4.4h, v4.4h
add v2.4h, v4.4h, v5.4h
@ -502,28 +502,26 @@ function ff_pred16x16_plane_neon_10, export=1
add v7.4h, v7.4h, v0.4h
shl v2.4h, v7.4h, #4
ssubl v2.4s, v2.4h, v3.4h
shl v3.4h, v4.4h, #4
ext v0.16b, v0.16b, v0.16b, #14
ssubl v6.4s, v5.4h, v3.4h
sxtl v6.4s, v5.4h // c
mov v0.h[0], wzr
mul v0.8h, v0.8h, v4.h[0]
dup v16.4s, v2.s[0]
dup v17.4s, v2.s[0]
dup v2.8h, v4.h[0]
dup v3.4s, v6.s[0]
shl v2.8h, v2.8h, #3
saddw v16.4s, v16.4s, v0.4h
saddw2 v17.4s, v17.4s, v0.8h
saddw v3.4s, v3.4s, v2.4h
dup v2.8h, v4.h[0] // b
dup v3.4s, v6.s[0] // c
sshll v2.4s, v2.4h, #3 // b * 8
smlal v16.4s, v0.4h, v4.h[0]
smlal2 v17.4s, v0.8h, v4.h[0]
sub v3.4s, v3.4s, v2.4s
mov w3, #16
mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping
1:
sqshrun v0.4h, v16.4s, #5
sqshrun2 v0.8h, v17.4s, #5
saddw v16.4s, v16.4s, v2.4h
saddw v17.4s, v17.4s, v2.4h
add v16.4s, v16.4s, v2.4s
add v17.4s, v17.4s, v2.4s
sqshrun v1.4h, v16.4s, #5
sqshrun2 v1.8h, v17.4s, #5
add v16.4s, v16.4s, v3.4s
@ -595,12 +593,11 @@ function ff_pred8x8_plane_neon_10, export=1
ssubl v2.4s, v2.4h, v3.4h
ext v0.16b, v0.16b, v0.16b, #14
mov v0.h[0], wzr
mul v0.8h, v0.8h, v5.h[0]
dup v1.4s, v2.s[0]
dup v2.4s, v2.s[0]
dup v3.8h, v5.h[1]
saddw v1.4s, v1.4s, v0.4h
saddw2 v2.4s, v2.4s, v0.8h
smlal v1.4s, v0.4h, v5.h[0]
smlal2 v2.4s, v0.8h, v5.h[0]
mov w3, #8
mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping
1: