lavc/aarch64: Fix addp overflow in ff_pred16x16_plane_neon_10

The mismatch between the NEON and C functions can be reproduced using the
following bitstream and command lines on aarch64 devices:

    wget https://streams.videolan.org/ffmpeg/incoming/replay_intra_pred_16x16.h264
    ./ffmpeg -cpuflags 0 -threads 1 -i replay_intra_pred_16x16.h264 -f framemd5 -y md5_ref
    ./ffmpeg -threads 1 -i replay_intra_pred_16x16.h264 -f framemd5 -y md5_neon

Signed-off-by: Bin Peng <pengbin@visionular.com>
(cherry picked from commit 3115c0c0e6)
lavc/aarch64: Fix ff_pred16x16_plane_neon_10
2025-11-04 14:11:13 +02:00 · 2025-11-04 14:11:09 +02:00 · 2025-11-04 14:11:04 +02:00
1 changed file with 15 additions and 18 deletions
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -489,10 +489,10 @@ function ff_pred16x16_plane_neon_10, export=1
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
-        addp            v2.8h,  v2.8h,  v2.8h
-        addp            v2.4h,  v2.4h,  v2.4h
-        sshll           v3.4s,  v2.4h,  #2
-        saddw           v2.4s,  v3.4s,  v2.4h
+        saddlp          v2.4s,  v2.8h
+        addp            v2.4s,  v2.4s,  v2.4s
+        shl             v3.4s,  v2.4s,  #2
+        add             v2.4s,  v3.4s,  v2.4s
        rshrn           v4.4h,  v2.4s,  #6
        trn2            v5.4h,  v4.4h,  v4.4h
        add             v2.4h,  v4.4h,  v5.4h
@@ -502,28 +502,26 @@ function ff_pred16x16_plane_neon_10, export=1
        add             v7.4h,  v7.4h,  v0.4h
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h
-        shl             v3.4h,  v4.4h,  #4
        ext             v0.16b, v0.16b, v0.16b, #14
-        ssubl           v6.4s,  v5.4h,  v3.4h
+        sxtl            v6.4s,  v5.4h          // c

        mov             v0.h[0],  wzr
-        mul             v0.8h,  v0.8h,  v4.h[0]
        dup             v16.4s, v2.s[0]
        dup             v17.4s, v2.s[0]
-        dup             v2.8h,  v4.h[0]
-        dup             v3.4s,  v6.s[0]
-        shl             v2.8h,  v2.8h,  #3
-        saddw           v16.4s, v16.4s, v0.4h
-        saddw2          v17.4s, v17.4s, v0.8h
-        saddw           v3.4s,  v3.4s,  v2.4h
+        dup             v2.8h,  v4.h[0]        // b
+        dup             v3.4s,  v6.s[0]        // c
+        sshll           v2.4s,  v2.4h,  #3     // b * 8
+        smlal           v16.4s, v0.4h, v4.h[0]
+        smlal2          v17.4s, v0.8h, v4.h[0]
+        sub             v3.4s,  v3.4s,  v2.4s

        mov             w3,      #16
        mvni            v4.8h,   #0xFC, lsl #8 // 1023 for clipping
 1:
        sqshrun         v0.4h,  v16.4s, #5
        sqshrun2        v0.8h,  v17.4s, #5
-        saddw           v16.4s, v16.4s, v2.4h
-        saddw           v17.4s, v17.4s, v2.4h
+        add             v16.4s, v16.4s, v2.4s
+        add             v17.4s, v17.4s, v2.4s
        sqshrun         v1.4h,  v16.4s, #5
        sqshrun2        v1.8h,  v17.4s, #5
        add             v16.4s, v16.4s, v3.4s
@@ -595,12 +593,11 @@ function ff_pred8x8_plane_neon_10, export=1
        ssubl           v2.4s,  v2.4h,  v3.4h
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr
-        mul             v0.8h,  v0.8h,  v5.h[0]
        dup             v1.4s,  v2.s[0]
        dup             v2.4s,  v2.s[0]
        dup             v3.8h,  v5.h[1]
-        saddw           v1.4s,  v1.4s,  v0.4h
-        saddw2          v2.4s,  v2.4s,  v0.8h
+        smlal           v1.4s,  v0.4h,  v5.h[0]
+        smlal2          v2.4s,  v0.8h,  v5.h[0]
        mov             w3,  #8
        mvni            v4.8h,  #0xFC,  lsl #8 // 1023 for clipping
 1: