configure: filter out -guard:signret from armasm flags

While cl.exe supports -guard:signret, armasm64 complains about an unknown flag. Note that -guard:ehcont is accepted by armasm64. Fixes: error A2029: unknown command-line argument or argument value -guard:signret Signed-off-by: Kacper Michajłow <kasper93@gmail.com>
fate: add more configure flags to fate config
2025-11-17 20:41:34 +00:00 · 2025-11-17 20:25:24 +00:00 · 2025-11-17 12:18:12 +01:00 · 2025-11-17 12:18:12 +01:00 · 2025-11-17 12:18:12 +01:00 · 2025-11-17 12:18:12 +01:00
15 changed files with 943 additions and 730 deletions
--- a/1
+++ b/1
@@ -4968,6 +4968,7 @@ armasm_flags(){
            # Filter out MSVC cl.exe options from cflags that shouldn't
            # be passed to gas-preprocessor
            -M[TD]*)                                            ;;
+            -guard:signret)                                     ;;
            *)                  echo $flag                      ;;
        esac
   done
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER)              += vf_framestep.o
 OBJS-$(CONFIG_FREEZEDETECT_FILTER)           += vf_freezedetect.o
 OBJS-$(CONFIG_FREEZEFRAMES_FILTER)           += vf_freezeframes.o
 OBJS-$(CONFIG_FREI0R_FILTER)                 += vf_frei0r.o
-OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o qp_table.o
+OBJS-$(CONFIG_FSPP_FILTER)                   += vf_fspp.o vf_fsppdsp.o qp_table.o
 OBJS-$(CONFIG_FSYNC_FILTER)                  += vf_fsync.o
 OBJS-$(CONFIG_GBLUR_FILTER)                  += vf_gblur.o
 OBJS-$(CONFIG_GBLUR_VULKAN_FILTER)           += vf_gblur_vulkan.o vulkan.o vulkan_filter.o
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -41,12 +41,41 @@
 #include "libavutil/mem_internal.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/video_enc_params.h"

+#include "avfilter.h"
 #include "filters.h"
 #include "qp_table.h"
-#include "vf_fspp.h"
+#include "vf_fsppdsp.h"
 #include "video.h"

+#define BLOCKSZ  12
+#define MAX_LEVEL 5
+
+typedef struct FSPPContext { // private state of the fspp filter (fields are addressed by fspp_options[] via OFFSET())
+    const struct AVClass *class;
+
+    int log2_count;                 // store_slice*() are called with log2_scale = 5 - log2_count
+    int strength;                   // added to the base bias (1 << 4) when scaling custom_threshold[]
+    int hsub;                       // subtracted from the QP-index shift (4) for non-luma planes in filter()
+    int vsub;                       // vertical counterpart of hsub
+    int temp_stride;
+    int qp;                         // non-zero: constant quantizer, thresholds are derived from it once per frame
+    enum AVVideoEncParamsType qscale_type; // passed to ff_norm_qscale() for QP values read from the QP table
+    int prev_q;                     // quantizer that threshold_mtx was last computed for
+    uint8_t *src;                   // pixel buffer read by dsp.row_fdct(); checked for allocation failure in config_input()
+    int16_t *temp;                  // accumulation buffer: written by dsp.row_idct(), read by dsp.store_slice*()
+    int8_t  *non_b_qp_table;        // QP table saved from the last non-B-frame (see the comment in filter_frame())
+    int non_b_qp_stride;            // presumably the stride of non_b_qp_table -- TODO confirm
+    int use_bframe_qp;              // if unset and qp == 0, the saved non-B-frame QP table is needed (see filter_frame())
+
+    FSPPDSPContext dsp;             // DSP function pointers, set up by ff_fsppdsp_init() in config_input()
+
+    DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8]; // custom_threshold[] scaled by the strength bias, no quantizer applied
+    DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8];     // filled by dsp.mul_thrmat() from threshold_mtx_noq and a quantizer; input to dsp.column_fidct()
+} FSPPContext;
+
+
 #define OFFSET(x) offsetof(FSPPContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
 static const AVOption fspp_options[] = {
@@ -59,98 +88,22 @@ static const AVOption fspp_options[] = {

 AVFILTER_DEFINE_CLASS(fspp);

-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
-    {  0,  48,  12,  60,   3,  51,  15,  63, },
-    { 32,  16,  44,  28,  35,  19,  47,  31, },
-    {  8,  56,   4,  52,  11,  59,   7,  55, },
-    { 40,  24,  36,  20,  43,  27,  39,  23, },
-    {  2,  50,  14,  62,   1,  49,  13,  61, },
-    { 34,  18,  46,  30,  33,  17,  45,  29, },
-    { 10,  58,   6,  54,   9,  57,   5,  53, },
-    { 42,  26,  38,  22,  41,  25,  37,  21, },
-};
-
 static const short custom_threshold[64] = {
 // values (296) can't be too high
 // -it causes too big quant dependence
 // or maybe overflow(check), which results in some flashing
-     71, 296, 295, 237,  71,  40,  38,  19,
-    245, 193, 185, 121, 102,  73,  53,  27,
-    158, 129, 141, 107,  97,  73,  50,  26,
-    102, 116, 109,  98,  82,  66,  45,  23,
-     71,  94,  95,  81,  70,  56,  38,  20,
-     56,  77,  74,  66,  56,  44,  30,  15,
-     38,  53,  50,  45,  38,  30,  21,  11,
-     20,  27,  26,  23,  20,  15,  11,   5
+// reorder coefficients to the order in which columns are processed
+#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h
+    REORDER( 71, 296, 295, 237,  71,  40,  38,  19),
+    REORDER(245, 193, 185, 121, 102,  73,  53,  27),
+    REORDER(158, 129, 141, 107,  97,  73,  50,  26),
+    REORDER(102, 116, 109,  98,  82,  66,  45,  23),
+    REORDER( 71,  94,  95,  81,  70,  56,  38,  20),
+    REORDER( 56,  77,  74,  66,  56,  44,  30,  15),
+    REORDER( 38,  53,  50,  45,  38,  30,  21,  11),
+    REORDER( 20,  27,  26,  23,  20,  15,  11,   5)
 };

-//This func reads from 1 slice, 1 and clears 0 & 1
-static void store_slice_c(uint8_t *dst, int16_t *src,
-                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
-{
-    int y, x;
-#define STORE(pos)                                                             \
-    temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
-    src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
-    if (temp & 0x100) temp = ~(temp >> 31);                                    \
-    dst[x + pos] = temp;
-
-    for (y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
-        for (x = 0; x < width; x += 8) {
-            int temp;
-            STORE(0);
-            STORE(1);
-            STORE(2);
-            STORE(3);
-            STORE(4);
-            STORE(5);
-            STORE(6);
-            STORE(7);
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-//This func reads from 2 slices, 0 & 2  and clears 2-nd
-static void store_slice2_c(uint8_t *dst, int16_t *src,
-                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
-{
-    int y, x;
-#define STORE2(pos)                                                                                       \
-    temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
-    src[x + pos + 16 * src_stride] = 0;                                                                   \
-    if (temp & 0x100) temp = ~(temp >> 31);                                                               \
-    dst[x + pos] = temp;
-
-    for (y = 0; y < height; y++) {
-        const uint8_t *d = dither[y];
-        for (x = 0; x < width; x += 8) {
-            int temp;
-            STORE2(0);
-            STORE2(1);
-            STORE2(2);
-            STORE2(3);
-            STORE2(4);
-            STORE2(5);
-            STORE2(6);
-            STORE2(7);
-        }
-        src += src_stride;
-        dst += dst_stride;
-    }
-}
-
-static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
-{
-    int a;
-    for (a = 0; a < 64; a++)
-        thr_adr[a] = q * thr_adr_noq[a];
-}
-
 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
@@ -163,9 +116,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
    const int qpsh = 4 - p->hsub * !is_luma;
    const int qpsv = 4 - p->vsub * !is_luma;

-    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
-    int16_t *block  = (int16_t *)block_align;
-    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
+    DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 * BLOCKSZ];
+    int16_t *block  = block_align;
+    int16_t *block3 = block_align + 8 * 8 * BLOCKSZ;

    memset(block3, 0, 4 * 8 * BLOCKSZ);

@@ -197,13 +150,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
        if (qy < 0) qy = 0;

        qy = (qy >> qpsv) * qp_stride;
-        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
+        p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);

        for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
-            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
+            p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));

            if (p->qp)
-                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
+                p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
            else
                for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                    t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same
@@ -213,288 +166,45 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                    t = qp_store[qy + (t >> qpsh)];
                    t = ff_norm_qscale(t, p->qscale_type);

-                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
-                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
+                    if (t != p->prev_q) {
+                        p->prev_q = t;
+                        p->dsp.mul_thrmat(p->threshold_mtx_noq, p->threshold_mtx, t);
+                    }
+                    p->dsp.column_fidct(p->threshold_mtx, block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
-            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
+            p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
            memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
            memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        es = width + 8 - x0; //  8, ...
        if (es > 8)
-            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
+            p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);

-        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
+        p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1));
        if (es > 3)
-            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
+            p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);

        if (!(y1 & 7) && y1) {
            if (y1 & 8)
-                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
-                               dst_stride, stride, width, 8, 5 - p->log2_count);
+                p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
+                                   dst_stride, stride, width, 8, 5 - p->log2_count);
            else
-                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
-                                dst_stride, stride, width, 8, 5 - p->log2_count);
+                p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
+                                    dst_stride, stride, width, 8, 5 - p->log2_count);
        }
    }

    if (y & 7) {  // height % 8 != 0
        if (y & 8)
-            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
-                           dst_stride, stride, width, y&7, 5 - p->log2_count);
+            p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
+                               dst_stride, stride, width, y&7, 5 - p->log2_count);
        else
-            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
+            p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                            dst_stride, stride, width, y&7, 5 - p->log2_count);
    }
 }

-static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
-    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
-
-    int16_t *dataptr;
-    int16_t *wsptr;
-    int16_t *threshold;
-    int ctr;
-
-    dataptr = data;
-    wsptr = output;
-
-    for (; cnt > 0; cnt -= 2) { //start positions
-        threshold = (int16_t *)thr_adr;//threshold_mtx
-        for (ctr = DCTSIZE; ctr > 0; ctr--) {
-            // Process columns from input, add to output.
-            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
-            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
-
-            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
-            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
-
-            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
-            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
-
-            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
-            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
-
-            // Even part of FDCT
-
-            tmp10 = tmp0 + tmp3;
-            tmp13 = tmp0 - tmp3;
-            tmp11 = tmp1 + tmp2;
-            tmp12 = tmp1 - tmp2;
-
-            d0 = tmp10 + tmp11;
-            d4 = tmp10 - tmp11;
-
-            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
-            d2 = tmp13 + z1;
-            d6 = tmp13 - z1;
-
-            // Even part of IDCT
-
-            THRESHOLD(tmp0, d0, threshold[0 * 8]);
-            THRESHOLD(tmp1, d2, threshold[2 * 8]);
-            THRESHOLD(tmp2, d4, threshold[4 * 8]);
-            THRESHOLD(tmp3, d6, threshold[6 * 8]);
-            tmp0 += 2;
-            tmp10 = (tmp0 + tmp2) >> 2;
-            tmp11 = (tmp0 - tmp2) >> 2;
-
-            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
-            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
-
-            tmp0 = tmp10 + tmp13; //->temps
-            tmp3 = tmp10 - tmp13; //->temps
-            tmp1 = tmp11 + tmp12; //->temps
-            tmp2 = tmp11 - tmp12; //->temps
-
-            // Odd part of FDCT
-
-            tmp10 = tmp4 + tmp5;
-            tmp11 = tmp5 + tmp6;
-            tmp12 = tmp6 + tmp7;
-
-            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
-            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
-            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
-            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
-
-            z11 = tmp7 + z3;
-            z13 = tmp7 - z3;
-
-            d5 = z13 + z2;
-            d3 = z13 - z2;
-            d1 = z11 + z4;
-            d7 = z11 - z4;
-
-            // Odd part of IDCT
-
-            THRESHOLD(tmp4, d1, threshold[1 * 8]);
-            THRESHOLD(tmp5, d3, threshold[3 * 8]);
-            THRESHOLD(tmp6, d5, threshold[5 * 8]);
-            THRESHOLD(tmp7, d7, threshold[7 * 8]);
-
-            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
-            z13 = tmp6 + tmp5;
-            z10 = (tmp6 - tmp5) << 1;
-            z11 = tmp4 + tmp7;
-            z12 = (tmp4 - tmp7) << 1;
-
-            tmp7  = (z11 + z13) >> 2; //+2 !
-            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
-            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
-            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
-            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
-
-            tmp6 = tmp12 - tmp7;
-            tmp5 = tmp11 - tmp6;
-            tmp4 = tmp10 + tmp5;
-
-            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
-            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
-            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
-            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
-            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
-            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
-            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
-            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
-            //
-            dataptr++; //next column
-            wsptr++;
-            threshold++;
-        }
-        dataptr += 8; //skip each second start pos
-        wsptr   += 8;
-    }
-}
-
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z5, z10, z11, z12, z13;
-    int16_t *outptr;
-    int16_t *wsptr;
-
-    cnt *= 4;
-    wsptr = workspace;
-    outptr = output_adr;
-    for (; cnt > 0; cnt--) {
-        // Even part
-        //Simd version reads 4x4 block and transposes it
-        tmp10 = wsptr[2] +  wsptr[3];
-        tmp11 = wsptr[2] -  wsptr[3];
-
-        tmp13 = wsptr[0] +  wsptr[1];
-        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
-
-        tmp0 = tmp10 + tmp13; //->temps
-        tmp3 = tmp10 - tmp13; //->temps
-        tmp1 = tmp11 + tmp12;
-        tmp2 = tmp11 - tmp12;
-
-        // Odd part
-        //Also transpose, with previous:
-        // ---- ----      ||||
-        // ---- ---- idct ||||
-        // ---- ---- ---> ||||
-        // ---- ----      ||||
-        z13 = wsptr[4] + wsptr[5];
-        z10 = wsptr[4] - wsptr[5];
-        z11 = wsptr[6] + wsptr[7];
-        z12 = wsptr[6] - wsptr[7];
-
-        tmp7 = z11 + z13;
-        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
-
-        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
-        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
-        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
-
-        tmp6 = (tmp12 << 3) - tmp7;
-        tmp5 = (tmp11 << 3) - tmp6;
-        tmp4 = (tmp10 << 3) + tmp5;
-
-        // Final output stage: descale and write column
-        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
-        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
-        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
-        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
-        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
-        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
-        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
-        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
-        outptr++;
-
-        wsptr += DCTSIZE;       // advance pointer to next row
-    }
-}
-
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
-{
-    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    int_simd16_t tmp10, tmp11, tmp12, tmp13;
-    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
-    int16_t *dataptr;
-
-    cnt *= 4;
-    // Pass 1: process rows.
-
-    dataptr = data;
-    for (; cnt > 0; cnt--) {
-        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
-        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
-        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
-        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
-        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
-        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
-        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
-        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
-
-        // Even part
-
-        tmp10 = tmp0 + tmp3;
-        tmp13 = tmp0 - tmp3;
-        tmp11 = tmp1 + tmp2;
-        tmp12 = tmp1 - tmp2;
-        //Even columns are written first, this leads to different order of columns
-        //in column_fidct(), but they are processed independently, so all ok.
-        //Later in the row_idct() columns are read in the same order.
-        dataptr[2] = tmp10 + tmp11;
-        dataptr[3] = tmp10 - tmp11;
-
-        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
-        dataptr[0] = tmp13 + z1;
-        dataptr[1] = tmp13 - z1;
-
-        // Odd part
-
-        tmp10 = (tmp4 + tmp5) << 2;
-        tmp11 = (tmp5 + tmp6) << 2;
-        tmp12 = (tmp6 + tmp7) << 2;
-
-        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
-        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
-        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
-        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
-
-        z11 = tmp7 + z3;
-        z13 = tmp7 - z3;
-
-        dataptr[4] = z13 + z2;
-        dataptr[5] = z13 - z2;
-        dataptr[6] = z11 + z4;
-        dataptr[7] = z11 - z4;
-
-        pixels++;               // advance pointer to next column
-        dataptr += DCTSIZE;
-    }
-}
-
 static const enum AVPixelFormat pix_fmts[] = {
    AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
    AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
@@ -522,16 +232,7 @@ static int config_input(AVFilterLink *inlink)
    if (!fspp->temp || !fspp->src)
        return AVERROR(ENOMEM);

-    fspp->store_slice  = store_slice_c;
-    fspp->store_slice2 = store_slice2_c;
-    fspp->mul_thrmat   = mul_thrmat_c;
-    fspp->column_fidct = column_fidct_c;
-    fspp->row_idct     = row_idct_c;
-    fspp->row_fdct     = row_fdct_c;
-
-#if ARCH_X86
-    ff_fspp_init_x86(fspp);
-#endif
+    ff_fsppdsp_init(&fspp->dsp);

    return 0;
 }
@@ -545,30 +246,17 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)

    int qp_stride = 0;
    int8_t *qp_table = NULL;
-    int i, bias;
    int ret = 0;
-    int custom_threshold_m[64];

-    bias = (1 << 4) + fspp->strength;
+    //FIXME: tune custom_threshold[] and remove this !
+    for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i)
+        fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);

-    for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
-        custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
-
-    for (i = 0; i < 8; i++) {
-        fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
-                                      |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
-
-        fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
-                                          |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
-                                          |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
-                                          |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
+    if (fspp->qp) {
+        fspp->prev_q = fspp->qp;
+        fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx, fspp->qp);
    }

-    if (fspp->qp)
-        fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
-
    /* if we are not in a constant user quantizer mode and we don't want to use
     * the quantizers from the B-frames (B-frames often have a higher QP), we
     * need to save the qp table from the last non B-frame; this is what the
--- a/libavfilter/vf_fspp.h
+++ b/libavfilter/vf_fspp.h
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
- * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-#ifndef AVFILTER_FSPP_H
-#define AVFILTER_FSPP_H
-
-#include "libavutil/video_enc_params.h"
-#include "avfilter.h"
-
-#define BLOCKSZ 12
-#define MAX_LEVEL 5
-
-#define DCTSIZE 8
-#define DCTSIZE_S "8"
-
-#define FIX(x,s)  ((x) * (1 << s) + 0.5)
-
-#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)
-#define THRESHOLD(r,x,t)                         \
-    if(((unsigned)((x) + t)) > t * 2) r = (x);   \
-    else r = 0;
-#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n)
-
-typedef int32_t int_simd16_t;
-static const int16_t FIX_0_382683433   = FIX(0.382683433, 14);
-static const int16_t FIX_0_541196100   = FIX(0.541196100, 14);
-static const int16_t FIX_0_707106781   = FIX(M_SQRT1_2  , 14);
-static const int16_t FIX_1_306562965   = FIX(1.306562965, 14);
-static const int16_t FIX_1_414213562_A = FIX(M_SQRT2    , 14);
-static const int16_t FIX_1_847759065   = FIX(1.847759065, 13);
-static const int16_t FIX_2_613125930   = FIX(-2.613125930, 13);
-static const int16_t FIX_1_414213562   = FIX(M_SQRT2    , 13);
-static const int16_t FIX_1_082392200   = FIX(1.082392200, 13);
-
-typedef struct FSPPContext {
-    AVClass *class;
-    uint64_t threshold_mtx_noq[8 * 2];
-    uint64_t threshold_mtx[8 * 2];        //used in both C & MMX (& later SSE2) versions
-
-    int log2_count;
-    int strength;
-    int hsub;
-    int vsub;
-    int temp_stride;
-    int qp;
-    enum AVVideoEncParamsType qscale_type;
-    int prev_q;
-    uint8_t *src;
-    int16_t *temp;
-    int8_t  *non_b_qp_table;
-    int non_b_qp_stride;
-    int use_bframe_qp;
-
-    void (*store_slice)(uint8_t *dst, int16_t *src,
-                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-
-    void (*store_slice2)(uint8_t *dst, int16_t *src,
-                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-
-    void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-
-    void (*column_fidct)(int16_t *thr_adr, int16_t *data,
-                         int16_t *output, int cnt);
-
-    void (*row_idct)(int16_t *workspace, int16_t *output_adr,
-                     ptrdiff_t output_stride, int cnt);
-
-    void (*row_fdct)(int16_t *data, const uint8_t *pixels,
-                     ptrdiff_t line_size, int cnt);
-
-} FSPPContext;
-
-void ff_fspp_init_x86(FSPPContext *fspp);
-
-#endif /* AVFILTER_FSPP_H */
--- a/libavfilter/vf_fsppdsp.c
+++ b/libavfilter/vf_fsppdsp.c
@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+ * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdint.h>
+
+#include "vf_fsppdsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/mem_internal.h"
+
+#define DCTSIZE 8 // the transforms below work on groups of 8 samples
+
+#define FIX(x,s)  (int)((x) * (1 << s) + 0.5) // x scaled by 2^s, plus 0.5, converted (truncated) to int
+
+#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16) // product of x and k shifted right by 16
+#define THRESHOLD(r,x,t)                         \
+    if (((unsigned)((x) + t)) >= t * 2) r = (x); \
+    else r = 0; // for t > 0 this keeps x when x >= t or x < -t and zeroes it otherwise
+#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> n) // adds half of 2^n, then shifts right by n
+
+typedef int32_t int_simd16_t; // type used for the temporaries of the transform functions
+
+enum { // fixed-point constants: FIX(value, number of fractional bits)
+    FIX_0_382683433   = FIX(0.382683433, 14), // 6270  (0x187E)
+    FIX_0_541196100   = FIX(0.541196100, 14), // 8867  (0x22A3)
+    FIX_0_707106781   = FIX(M_SQRT1_2  , 14), // 11585 (0x2D41)
+    FIX_1_306562965   = FIX(1.306562965, 14), // 21407 (0x539F)
+    FIX_1_414213562_A = FIX(M_SQRT2    , 14), // 23170 (0x5A82)
+    FIX_1_847759065   = FIX(1.847759065, 13), // 15137 (0x3B21)
+    FIX_2_613125930   = FIX(-2.613125930, 13), // -21406 (0xAC62 as a 16-bit word); negative despite the name
+    FIX_1_414213562   = FIX(M_SQRT2    , 13), // 11585 (0x2D41)
+    FIX_1_082392200   = FIX(1.082392200, 13), // 8867  (0x22A3)
+};
+
+DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = { // dither offsets, one row per output line; also imported by x86/vf_fspp.asm (cextern fspp_dither)
+    {  0,  48,  12,  60,   3,  51,  15,  63, },
+    { 32,  16,  44,  28,  35,  19,  47,  31, },
+    {  8,  56,   4,  52,  11,  59,   7,  55, },
+    { 40,  24,  36,  20,  43,  27,  39,  23, },
+    {  2,  50,  14,  62,   1,  49,  13,  61, },
+    { 34,  18,  46,  30,  33,  17,  45,  29, },
+    { 10,  58,   6,  54,   9,  57,   5,  53, },
+    { 42,  26,  38,  22,  41,  25,  37,  21, },
+};
+
+// Writes one slice of 8-bit output: dst = clip_uint8((src + (dither >> log2_scale)) >> (6 - log2_scale)).
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{ // Every src sample that is read is zeroed afterwards, and so is the sample 8 * src_stride before it.
+#define STORE(pos)                                                             \
+    temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
+    src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
+    temp = av_clip_uint8(temp);                                                \
+    dst[x + pos] = temp;
+
+    for (int y = 0; y < height; y++) { // y also selects the dither row, so height may be at most 8
+        const uint8_t *d = ff_fspp_dither[y]; // the same 8 offsets are reused for every group in this row
+        for (int x = 0; x < width; x += 8) { // always a full group of 8, even when width is not a multiple of 8
+            int temp;
+            STORE(0);
+            STORE(1);
+            STORE(2);
+            STORE(3);
+            STORE(4);
+            STORE(5);
+            STORE(6);
+            STORE(7);
+        }
+        src += src_stride; // strides are counted in elements (int16_t for src, bytes for dst)
+        dst += dst_stride;
+    }
+}
+
+// Writes one slice of 8-bit output from the sum of two source samples, src[i] and src[i + 16 * src_stride], plus dither.
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
+                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+{ // Only the second sample (the one 16 * src_stride ahead) is zeroed; src[i] itself is left untouched.
+#define STORE2(pos)                                                                                       \
+    temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
+    src[x + pos + 16 * src_stride] = 0;                                                                   \
+    temp = av_clip_uint8(temp);                                                                           \
+    dst[x + pos] = temp;
+
+    for (int y = 0; y < height; y++) { // y also selects the dither row, so height may be at most 8
+        const uint8_t *d = ff_fspp_dither[y]; // the same 8 offsets are reused for every group in this row
+        for (int x = 0; x < width; x += 8) { // always a full group of 8, even when width is not a multiple of 8
+            int temp;
+            STORE2(0);
+            STORE2(1);
+            STORE2(2);
+            STORE2(3);
+            STORE2(4);
+            STORE2(5);
+            STORE2(6);
+            STORE2(7);
+        }
+        src += src_stride; // strides are counted in elements (int16_t for src, bytes for dst)
+        dst += dst_stride;
+    }
+}
+
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
+{ // thr_adr[i] = q * thr_adr_noq[i] for all 64 entries; each product is stored back as int16_t
+    for (int a = 0; a < 64; a++)
+        thr_adr[a] = q * thr_adr_noq[a];
+}
+
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
+                       int16_t *restrict output, int cnt)
+{ // Per column: 8-point forward DCT, thresholding of the coefficients, inverse DCT, result merged into output.
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
+    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
+
+    int16_t *wsptr;
+
+    wsptr = output;
+
+    for (; cnt > 0; cnt -= 2) { //start positions: each pass covers 8 columns and takes 2 off cnt
+        const int16_t *threshold = thr_adr;//threshold_mtx, restarted for every group of 8 columns
+        for (int ctr = DCTSIZE; ctr > 0; ctr--) {
+            // Process columns from input, add to output.
+            tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7]; // sums (tmp0..tmp3) feed the even part
+            tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7]; // differences (tmp4..tmp7) feed the odd part
+
+            tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
+            tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];
+
+            tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
+            tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];
+
+            tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
+            tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];
+
+            // Even part of FDCT
+
+            tmp10 = tmp0 + tmp3;
+            tmp13 = tmp0 - tmp3;
+            tmp11 = tmp1 + tmp2;
+            tmp12 = tmp1 - tmp2;
+
+            d0 = tmp10 + tmp11;
+            d4 = tmp10 - tmp11;
+
+            z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
+            d2 = tmp13 + z1;
+            d6 = tmp13 - z1;
+
+            // Even part of IDCT
+
+            THRESHOLD(tmp0, d0, threshold[0 * 8]); // coefficient dk is tested against threshold[k * 8]
+            THRESHOLD(tmp1, d2, threshold[2 * 8]);
+            THRESHOLD(tmp2, d4, threshold[4 * 8]);
+            THRESHOLD(tmp3, d6, threshold[6 * 8]);
+            tmp0 += 2; // bias of 2 ahead of the two >> 2 below
+            tmp10 = (tmp0 + tmp2) >> 2;
+            tmp11 = (tmp0 - tmp2) >> 2;
+
+            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
+            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
+
+            tmp0 = tmp10 + tmp13; //->temps
+            tmp3 = tmp10 - tmp13; //->temps
+            tmp1 = tmp11 + tmp12; //->temps
+            tmp2 = tmp11 - tmp12; //->temps
+
+            // Odd part of FDCT
+
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+            z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
+            z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
+            z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
+
+            z11 = tmp7 + z3;
+            z13 = tmp7 - z3;
+
+            d5 = z13 + z2;
+            d3 = z13 - z2;
+            d1 = z11 + z4;
+            d7 = z11 - z4;
+
+            // Odd part of IDCT
+
+            THRESHOLD(tmp4, d1, threshold[1 * 8]);
+            THRESHOLD(tmp5, d3, threshold[3 * 8]);
+            THRESHOLD(tmp6, d5, threshold[5 * 8]);
+            THRESHOLD(tmp7, d7, threshold[7 * 8]);
+
+            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
+            z13 = tmp6 + tmp5;
+            z10 = (tmp6 - tmp5) * 2;
+            z11 = tmp4 + tmp7;
+            z12 = (tmp4 - tmp7) * 2;
+
+            tmp7  = (z11 + z13) >> 2; //+2 !
+            tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
+            z5    = MULTIPLY16H(z10 + z12, FIX_1_847759065);
+            tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
+            tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - !!
+
+            tmp6 = tmp12 - tmp7;
+            tmp5 = tmp11 - tmp6;
+            tmp4 = tmp10 + tmp5;
+
+            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7); // rows 0..5 are accumulated into output
+            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
+            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
+            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
+            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
+            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
+            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6); // rows 6 and 7 are assigned, not accumulated
+            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
+            //
+            data++; //next column
+            wsptr++;
+            threshold++;
+        }
+        data  += 8; //skip each second start pos
+        wsptr   += 8;
+    }
+}
+
+void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
+                   ptrdiff_t output_stride, int cnt)
+{ // Inverse DCT of cnt * 4 rows of 8 coefficients; each row's 8 results are added down one column of output_adr.
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z5, z10, z11, z12, z13;
+    int16_t *outptr;
+
+    cnt *= 4; // 4 rows of 8 coefficients per unit of cnt
+    outptr = output_adr;
+    for (; cnt > 0; cnt--) {
+        // Even part
+        //Simd version reads 4x4 block and transposes it
+        tmp10 = wsptr[2] +  wsptr[3]; // index order follows what ff_row_fdct_c() stores (see its note on column order)
+        tmp11 = wsptr[2] -  wsptr[3];
+
+        tmp13 = wsptr[0] +  wsptr[1];
+        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//this shift order to avoid overflow
+
+        tmp0 = tmp10 + tmp13; //->temps
+        tmp3 = tmp10 - tmp13; //->temps
+        tmp1 = tmp11 + tmp12;
+        tmp2 = tmp11 - tmp12;
+
+        // Odd part
+        //Also transpose, with previous:
+        // ---- ----      ||||
+        // ---- ---- idct ||||
+        // ---- ---- ---> ||||
+        // ---- ----      ||||
+        z13 = wsptr[4] + wsptr[5];
+        z10 = wsptr[4] - wsptr[5];
+        z11 = wsptr[6] + wsptr[7];
+        z12 = wsptr[6] - wsptr[7];
+
+        tmp7 = z11 + z13;
+        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
+
+        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
+        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
+        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
+
+        tmp6 = tmp12 * 8 - tmp7; // products are scaled by 8 after the multiply
+        tmp5 = tmp11 * 8 - tmp6;
+        tmp4 = tmp10 * 8 + tmp5;
+
+        // Final output stage: descale and write column
+        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
+        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
+        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
+        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
+        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
+        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
+        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
+        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
+        outptr++; // next output column
+
+        wsptr += DCTSIZE;       // advance pointer to next row
+    }
+}
+
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+                   ptrdiff_t line_size, int cnt)
+{ // Forward DCT over 8 vertically adjacent pixels for each of cnt * 4 pixel columns; 8 coefficients are stored per column.
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
+    int16_t *dataptr;
+
+    cnt *= 4; // 4 pixel columns per unit of cnt
+    // Pass 1: process rows.
+
+    dataptr = data;
+    for (; cnt > 0; cnt--) {
+        tmp0 = pixels[line_size * 0] + pixels[line_size * 7]; // 8 pixels of one column, one per line
+        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
+        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
+        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
+        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
+        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
+        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
+        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
+
+        // Even part
+
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+        //Even columns are written first, this leads to different order of columns
+        //in column_fidct(), but they are processed independently, so all ok.
+        //Later in the row_idct() columns are read in the same order.
+        dataptr[2] = tmp10 + tmp11;
+        dataptr[3] = tmp10 - tmp11;
+
+        z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
+        dataptr[0] = tmp13 + z1;
+        dataptr[1] = tmp13 - z1;
+
+        // Odd part
+
+        tmp10 = tmp4 + tmp5;
+        tmp11 = tmp5 + tmp6;
+        tmp12 = tmp6 + tmp7;
+
+        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
+        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100 << 2) + z5;
+        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965 << 2) + z5;
+        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781 << 2);
+
+        z11 = tmp7 + z3;
+        z13 = tmp7 - z3;
+
+        dataptr[4] = z13 + z2; // odd-part results occupy indices 4..7
+        dataptr[5] = z13 - z2;
+        dataptr[6] = z11 + z4;
+        dataptr[7] = z11 - z4;
+
+        pixels++;               // advance pointer to next column
+        dataptr += DCTSIZE;
+    }
+}
--- a/libavfilter/vf_fsppdsp.h
+++ b/libavfilter/vf_fsppdsp.h
@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+ * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef AVFILTER_FSPPDSP_H
+#define AVFILTER_FSPPDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#include "libavutil/attributes_internal.h"
+
+typedef struct FSPPDSPContext { // function pointers filled in by ff_fsppdsp_init()
+    void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
+                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); // C version: ff_store_slice_c()
+
+    void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
+                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); // C version: ff_store_slice2_c()
+
+    void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */,
+                       int16_t *restrict thr_adr /* align 16 */, int q); // C version: ff_mul_thrmat_c()
+
+    void (*column_fidct)(const int16_t *restrict thr_adr, const int16_t *restrict data,
+                         int16_t *restrict output, int cnt); // C version: ff_column_fidct_c()
+
+    void (*row_idct)(const int16_t *restrict workspace, int16_t *restrict output_adr,
+                     ptrdiff_t output_stride, int cnt); // C version: ff_row_idct_c()
+
+    void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
+                     ptrdiff_t line_size, int cnt); // C version: ff_row_fdct_c()
+} FSPPDSPContext;
+
+FF_VISIBILITY_PUSH_HIDDEN
+extern const uint8_t ff_fspp_dither[8][8]; // defined in vf_fsppdsp.c
+
+void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
+                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
+void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
+                       int16_t *restrict output, int cnt);
+void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict output_adr,
+                   ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
+                   ptrdiff_t line_size, int cnt);
+
+void ff_fsppdsp_init_x86(FSPPDSPContext *fspp); // only called from ff_fsppdsp_init() when ARCH_X86 is set
+FF_VISIBILITY_POP_HIDDEN
+
+static inline void ff_fsppdsp_init(FSPPDSPContext *fspp)
+{ // Fills every pointer of *fspp with its C implementation, then hands the context to the x86 init on x86 builds.
+    fspp->store_slice  = ff_store_slice_c;
+    fspp->store_slice2 = ff_store_slice2_c;
+    fspp->mul_thrmat   = ff_mul_thrmat_c;
+    fspp->column_fidct = ff_column_fidct_c;
+    fspp->row_idct     = ff_row_idct_c;
+    fspp->row_fdct     = ff_row_fdct_c;
+
+#if ARCH_X86
+    ff_fsppdsp_init_x86(fspp); // presumably replaces pointers with SIMD versions where the CPU allows; its body is not part of this header
+#endif
+}
+
+#endif /* AVFILTER_FSPPDSP_H */
--- a/libavfilter/x86/vf_fspp.asm
+++ b/libavfilter/x86/vf_fspp.asm
@ -25,36 +25,33 @@

 SECTION_RODATA

-pb_dither: db 0,  48,  12,  60,   3,  51,  15,  63, 32,  16,  44,  28,  35,  19,  47,  31, \
-              8,  56,   4,  52,  11,  59,   7,  55, 40,  24,  36,  20,  43,  27,  39,  23, \
-              2,  50,  14,  62,   1,  49,  13,  61, 34,  18,  46,  30,  33,  17,  45,  29, \
-             10,  58,   6,  54,   9,  57,   5,  53, 42,  26,  38,  22,  41,  25,  37,  21
+cextern fspp_dither
+pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
+pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
+pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
+pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
+pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
+pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
+pw_2:    times 8 dw 2
 pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
 pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
 pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
-pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
-pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
 pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
-pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
-pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
-pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
-pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
 pw_4:    times 4 dw 4
-pw_2:    times 4 dw 2

 SECTION .text

 %define DCTSIZE 8

-INIT_MMX mmx
+INIT_XMM sse2

-;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
-;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
+;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 %if ARCH_X86_64
-cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
 %else
-cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
 %define dst_strideq r2m
 %define src_strideq r3m
    mov       widthq, r4m
@ -65,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
    mov       tmpq, src_strideq
    and       widthq, ~7
    sub       dst_strideq, widthq
-    movd      m5, ditherd ; log2_scale
+    movd      m4, ditherd ; log2_scale
    xor       ditherq, -1 ; log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7 ; log2_scale
@ -73,33 +70,25 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
    sub       tmp2q, widthq
    movd      m2, ditherd ; log2_scale
    add       tmp2q, tmp2q
-    lea       ditherq, [pb_dither]
+    lea       ditherq, [fspp_dither]
    mov       src_strideq, tmp2q
    shl       tmpq, 4
    lea       dither_heightq, [ditherq+dither_heightq*8]
-    pxor      m7, m7
+    pxor      m1, m1

 .loop_height:
    movq      m3, [ditherq]
-    movq      m4, m3
-    punpcklbw m3, m7
-    punpckhbw m4, m7
+    punpcklbw m3, m1
    mov       tmp2q, widthq
-    psraw     m3, m5
-    psraw     m4, m5
+    psraw     m3, m4

 .loop_width:
-    movq      [srcq+tmpq], m7
-    movq      m0, [srcq]
-    movq      m1, [srcq+8]
-    movq      [srcq+tmpq+8], m7
+    mova      m0, [srcq]
+    mova      [srcq+tmpq], m1
    paddw     m0, m3
-    paddw     m1, m4
-    movq      [srcq], m7
+    mova      [srcq], m1
    psraw     m0, m2
-    psraw     m1, m2
-    movq      [srcq+8], m7
-    packuswb  m0, m1
+    packuswb  m0, m0
    add       srcq, 16
    movq      [dstq], m0
    add       dstq, 8
@ -113,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
    jl .loop_height
    RET

-;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
-;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
-;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+;                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 %if ARCH_X86_64
-cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
 %else
-cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
 %define dst_strideq r2m
 %define src_strideq r3m
    mov       dstq, dstm
@ -132,41 +121,32 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
    mov       tmpq, src_strideq
    and       widthq, ~7
    sub       dst_strideq, widthq
-    movd      m5, ditherd ; log2_scale
+    movd      m4, ditherd ; log2_scale
    xor       ditherq, -1 ; log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7 ; log2_scale
    sub       tmp2q, widthq
    movd      m2, ditherd ; log2_scale
    add       tmp2q, tmp2q
-    lea       ditherq, [pb_dither]
+    lea       ditherq, [fspp_dither]
    mov       src_strideq, tmp2q
    shl       tmpq, 5
    lea       dither_heightq, [ditherq+dither_heightq*8]
-    pxor      m7, m7
+    pxor      m1, m1

 .loop_height:
    movq      m3, [ditherq]
-    movq      m4, m3
-    punpcklbw m3, m7
-    punpckhbw m4, m7
+    punpcklbw m3, m1
    mov       tmp2q,widthq
-    psraw     m3, m5
-    psraw     m4, m5
+    psraw     m3, m4

 .loop_width:
-    movq      m0, [srcq]
-    movq      m1, [srcq+8]
+    mova      m0, [srcq]
    paddw     m0, m3
    paddw     m0, [srcq+tmpq]
-    paddw     m1, m4
-    movq      m6, [srcq+tmpq+8]
-    movq      [srcq+tmpq], m7
+    mova      [srcq+tmpq], m1
    psraw     m0, m2
-    paddw     m1, m6
-    movq      [srcq+tmpq+8], m7
-    psraw     m1, m2
-    packuswb  m0, m1
+    packuswb  m0, m0
    movq      [dstq], m0
    add       srcq, 16
    add       dstq, 8
@ -180,164 +160,152 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
    jl .loop_height
    RET

-;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
-    movd      m7, qd
-    movq      m0, [thrnq]
-    punpcklwd m7, m7
-    movq      m1, [thrnq+8]
-    punpckldq m7, m7
-    pmullw    m0, m7
-    movq      m2, [thrnq+8*2]
-    pmullw    m1, m7
-    movq      m3, [thrnq+8*3]
-    pmullw    m2, m7
-    movq      [thrq], m0
-    movq      m4, [thrnq+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8], m1
-    movq      m5, [thrnq+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*2], m2
-    movq      m6, [thrnq+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*3], m3
-    movq      m0, [thrnq+8*7]
-    pmullw    m6, m7
-    movq      [thrq+8*4], m4
-    movq      m1, [thrnq+8*7+8]
-    pmullw    m0, m7
-    movq      [thrq+8*5], m5
-    movq      m2, [thrnq+8*7+8*2]
-    pmullw    m1, m7
-    movq      [thrq+8*6], m6
-    movq      m3, [thrnq+8*7+8*3]
-    pmullw    m2, m7
-    movq      [thrq+8*7], m0
-    movq      m4, [thrnq+8*7+8*4]
-    pmullw    m3, m7
-    movq      [thrq+8*7+8], m1
-    movq      m5, [thrnq+8*7+8*5]
-    pmullw    m4, m7
-    movq      [thrq+8*7+8*2], m2
-    movq      m6, [thrnq+8*7+8*6]
-    pmullw    m5, m7
-    movq      [thrq+8*7+8*3], m3
-    movq      m0, [thrnq+14*8]
-    pmullw    m6, m7
-    movq      [thrq+8*7+8*4], m4
-    movq      m1, [thrnq+14*8+8]
-    pmullw    m0, m7
-    movq      [thrq+8*7+8*5], m5
-    pmullw    m1, m7
-    movq      [thrq+8*7+8*6], m6
-    movq      [thrq+14*8], m0
-    movq      [thrq+14*8+8], m1
+;void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
+    movd      m4, qd
+    mova      m0, [thrnq] ; aligned loads/stores: both tables carry an "align 16" note in vf_fsppdsp.h
+    punpcklwd m4, m4
+    mova      m1, [thrnq+16]
+    pshufd    m4, m4, 0 ; the low word of q now sits in all 8 word lanes
+    pmullw    m0, m4 ; low 16 bits of each word product
+    mova      m2, [thrnq+16*2]
+    pmullw    m1, m4
+    mova      m3, [thrnq+16*3]
+    pmullw    m2, m4
+    mova      [thrq], m0
+    mova      m0, [thrnq+16*4] ; m0..m3 are reused for the second half of the table
+    pmullw    m3, m4
+    mova      [thrq+16], m1
+    mova      m1, [thrnq+16*5]
+    pmullw    m0, m4
+    mova      [thrq+16*2], m2
+    mova      m2, [thrnq+16*6]
+    pmullw    m1, m4
+    mova      [thrq+16*3], m3
+    mova      m3, [thrnq+16*7]
+    pmullw    m2, m4
+    mova      [thrq+16*4], m0
+    pmullw    m3, m4
+    mova      [thrq+16*5], m1
+    mova      [thrq+16*6], m2
+    mova      [thrq+16*7], m3 ; 8 vectors of 8 words: all 64 entries written
    RET

-%macro COLUMN_FDCT 1-3 0, 0
-    movq      m1, [srcq+DCTSIZE*0*2]
-    movq      m7, [srcq+DCTSIZE*3*2]
-    movq      m0, m1
+%macro COLUMN_FDCT 1
+    mova      m1, [srcq+DCTSIZE*0*2]
+    mova      m7, [srcq+DCTSIZE*3*2]
+    mova      m0, m1
    paddw     m1, [srcq+DCTSIZE*7*2]
-    movq      m3, m7
+    mova      m3, m7
    paddw     m7, [srcq+DCTSIZE*4*2]
-    movq      m5, m1
-    movq      m6, [srcq+DCTSIZE*1*2]
+    mova      m5, m1
+    mova      m6, [srcq+DCTSIZE*1*2]
    psubw     m1, m7
-    movq      m2, [srcq+DCTSIZE*2*2]
-    movq      m4, m6
+    mova      m2, [srcq+DCTSIZE*2*2]
+    mova      m4, m6
    paddw     m6, [srcq+DCTSIZE*6*2]
    paddw     m5, m7
    paddw     m2, [srcq+DCTSIZE*5*2]
-    movq      m7, m6
+    mova      m7, m6
    paddw     m6, m2
    psubw     m7, m2
-    movq      m2, m5
+    mova      m2, m5
+%if ARCH_X86_64
+    mova      m8, [thrq]
+%define THRQ m8
+%else
+%define THRQ [thrq]
+%endif
    paddw     m5, m6
    psubw     m2, m6
    paddw     m7, m1
-    movq      m6, [thrq+4*16+%2]
-    psllw     m7, 2
-    psubw     m5, [thrq+%2]
+    mova      m6, [thrq+4*16]
+    psllw     m7, 1
+    psubw     m5, THRQ
    psubw     m2, m6
-    paddusw   m5, [thrq+%2]
+    paddusw   m5, THRQ
    paddusw   m2, m6
-    pmulhw    m7, [pw_2D41]
-    paddw     m5, [thrq+%2]
+    pmulhw    m7, SQRT2
+    paddw     m5, THRQ
    paddw     m2, m6
-    psubusw   m5, [thrq+%2]
+    psubusw   m5, THRQ
    psubusw   m2, m6
    paddw     m5, [pw_2]
-    movq      m6, m2
+    mova      m6, m2
    paddw     m2, m5
+%if ARCH_X86_64
+    mova      m8, [thrq+2*16]
+%define THRQ m8
+%else
+%define THRQ [thrq+2*16]
+%endif
    psubw     m5, m6
-    movq      m6, m1
+    mova      m6, m1
    paddw     m1, m7
-    psubw     m1, [thrq+2*16+%2]
+    psubw     m1, THRQ
    psubw     m6, m7
-    movq      m7, [thrq+6*16+%2]
+    mova      m7, [thrq+6*16]
    psraw     m5, 2
-    paddusw   m1, [thrq+2*16+%2]
+    paddusw   m1, THRQ
    psubw     m6, m7
-    paddw     m1, [thrq+2*16+%2]
+    paddw     m1, THRQ
    paddusw   m6, m7
-    psubusw   m1, [thrq+2*16+%2]
+    psubusw   m1, THRQ
    paddw     m6, m7
    psubw     m3, [srcq+DCTSIZE*4*2]
    psubusw   m6, m7
-    movq      m7, m1
+    mova      m7, m1
    psraw     m2, 2
    psubw     m4, [srcq+DCTSIZE*6*2]
    psubw     m1, m6
    psubw     m0, [srcq+DCTSIZE*7*2]
    paddw     m6, m7
    psraw     m6, 2
-    movq      m7, m2
-    pmulhw    m1, [pw_5A82]
+    mova      m7, m2
+    pmulhw    m1, SQRT2
    paddw     m2, m6
-    movq      [rsp], m2
+    mova    tmp0, m2
    psubw     m7, m6
-    movq      m2, [srcq+DCTSIZE*2*2]
+    mova      m2, [srcq+DCTSIZE*2*2]
    psubw     m1, m6
    psubw     m2, [srcq+DCTSIZE*5*2]
-    movq      m6, m5
-    movq      [rsp+8*3], m7
+    mova      m6, m5
+    mova    tmp3, m7
    paddw     m3, m2
    paddw     m2, m4
    paddw     m4, m0
-    movq      m7, m3
+    mova      m7, m3
    psubw     m3, m4
-    psllw     m3, 2
-    psllw     m7, 2
-    pmulhw    m3, [pw_187E]
+    psllw     m7, 1
+    pmulhw    m3, [pw_61F8]
    psllw     m4, 2
-    pmulhw    m7, [pw_22A3]
-    psllw     m2, 2
+    add     srcq, 32
+    pmulhw    m7, [pw_4546]
+    psllw     m2, 1
    pmulhw    m4, [pw_539F]
    paddw     m5, m1
-    pmulhw    m2, [pw_2D41]
+    pmulhw    m2, SQRT2
    psubw     m6, m1
    paddw     m7, m3
-    movq      [rsp+8], m5
+    mova    tmp1, m5
    paddw     m4, m3
-    movq      m3, [thrq+3*16+%2]
-    movq      m1, m0
-    movq      [rsp+8*2], m6
+    mova      m3, [thrq+3*16]
+    mova      m1, m0
+    mova    tmp2, m6
    psubw     m1, m2
    paddw     m0, m2
-    movq      m5, m1
-    movq      m2, [thrq+5*16+%2]
+    mova      m5, m1
+    mova      m2, [thrq+5*16]
    psubw     m1, m7
    paddw     m5, m7
    psubw     m1, m3
-    movq      m7, [thrq+16+%2]
+    mova      m7, [thrq+16]
    psubw     m5, m2
-    movq      m6, m0
+    mova      m6, m0
    paddw     m0, m4
    paddusw   m1, m3
    psubw     m6, m4
-    movq      m4, [thrq+7*16+%2]
+    mova      m4, [thrq+7*16]
    psubw     m0, m7
    psubw     m6, m4
    paddusw   m5, m2
@ -348,139 +316,149 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
    psubusw   m1, m3
    psubusw   m5, m2
    psubusw   m6, m4
-    movq      m4, m1
+    mova      m4, m1
    por       m4, m5
    paddusw   m0, m7
    por       m4, m6
    paddw     m0, m7
    packssdw  m4, m4
    psubusw   m0, m7
-    movd      tmpd, m4
-    or        tmpd, tmpd
+%if ARCH_X86_64
+    movq    tmpq, m4
+%else
+    packssdw  m4, m4
+    movd    tmpd, m4
+%endif
+    or      tmpq, tmpq
    jnz %1
-    movq      m4, [rsp]
-    movq      m1, m0
-    pmulhw    m0, [pw_3642]
-    movq      m2, m1
-    movq      m5, [outq+DCTSIZE*0*2]
-    movq      m3, m2
-    pmulhw    m1, [pw_2441]
+    mova      m4, tmp0
+    psraw     m3, m0, 2
+    mova      m5, [outq+DCTSIZE*0*2]
+    pmulhw    m1, m0, [pw_7642]
+    pmulhw    m2, m0, [pw_4546]
+    pmulhw    m0, SQRT2
    paddw     m5, m4
-    movq      m6, [rsp+8]
-    psraw     m3, 2
-    pmulhw    m2, [pw_0CBB]
+    mova      m6, tmp1
+    psubw     m2, m1
    psubw     m4, m3
-    movq      m7, [outq+DCTSIZE*1*2]
+    mova      m7, [outq+DCTSIZE*1*2]
    paddw     m5, m3
-    movq      [outq+DCTSIZE*7*2], m4
+    psubw     m1, m3
+    mova      [outq+DCTSIZE*7*2], m4
+    psubw     m0, m1
+    paddw     m2, m0
+    mova      [outq+DCTSIZE*0*2], m5
    paddw     m7, m6
-    movq      m3, [rsp+8*2]
-    psubw     m6, m0
-    movq      m4, [outq+DCTSIZE*2*2]
-    paddw     m7, m0
-    movq      [outq], m5
+    mova      m3, tmp2
+    psubw     m6, m1
+    mova      m4, [outq+DCTSIZE*2*2]
+    paddw     m7, m1
+    mova  [outq], m5
    paddw     m4, m3
-    movq      [outq+DCTSIZE*6*2], m6
-    psubw     m3, m1
-    movq      m5, [outq+DCTSIZE*5*2]
-    paddw     m4, m1
-    movq      m6, [outq+DCTSIZE*3*2]
+    mova      [outq+DCTSIZE*6*2], m6
+    psubw     m3, m0
+    mova      m5, [outq+DCTSIZE*5*2]
+    paddw     m4, m0
+    mova      m6, [outq+DCTSIZE*3*2]
    paddw     m5, m3
-    movq      m0, [rsp+8*3]
-    add       srcq, 8+%3
-    movq      [outq+DCTSIZE*1*2], m7
+    mova      m0, tmp3
+    mova      [outq+DCTSIZE*1*2], m7
    paddw     m6, m0
-    movq      [outq+DCTSIZE*2*2], m4
-    psubw     m0, m2
-    movq      m7, [outq+DCTSIZE*4*2]
-    paddw     m6, m2
-    movq      [outq+DCTSIZE*5*2], m5
+    mova      [outq+DCTSIZE*2*2], m4
+    paddw     m0, m2
+    mova      m7, [outq+DCTSIZE*4*2]
+    psubw     m6, m2
+    mova      [outq+DCTSIZE*5*2], m5
    paddw     m7, m0
-    movq      [outq+DCTSIZE*3*2], m6
-    movq      [outq+DCTSIZE*4*2], m7
-    add       outq, 8+%3
+    mova      [outq+DCTSIZE*3*2], m6
+    mova      [outq+DCTSIZE*4*2], m7
+    add     outq, 32
 %endmacro

-%macro COLUMN_IDCT 0-1 0
-    movq      m3, m5
+%macro COLUMN_IDCT 0 ; no macro arguments; reads m0/m1/m5/m6 and tmp0-tmp3 as left by COLUMN_FDCT and updates the eight output rows at outq
+    mova      m3, m5
    psubw     m5, m1
-    psllw     m5, 1
    paddw     m3, m1
-    movq      m2, m0
+    mova      m2, m0
    psubw     m0, m6
-    movq      m1, m5
-    psllw     m0, 1
+    psllw     m1, m5, 1
    pmulhw    m1, [pw_AC62]
    paddw     m5, m0
-    pmulhw    m5, [pw_3B21]
+    pmulhw    m5, [pw_7642]
    paddw     m2, m6
-    pmulhw    m0, [pw_22A3]
-    movq      m7, m2
-    movq      m4, [rsp]
+    pmulhw    m0, [pw_4546]
+    mova      m7, m2
+    mova      m4, tmp0 ; first of the four temporaries saved by COLUMN_FDCT
    psubw     m2, m3
-    psllw     m2, 1
    paddw     m7, m3
-    pmulhw    m2, [pw_2D41]
-    movq      m6, m4
+    pmulhw    m2, SQRT2
+    mova      m6, m4
    psraw     m7, 2
    paddw     m4, [outq]
    psubw     m6, m7
-    movq      m3, [rsp+8]
+    mova      m3, tmp1
    paddw     m4, m7
-    movq      [outq+DCTSIZE*7*2], m6
+    mova      [outq+DCTSIZE*7*2], m6 ; row 7 is stored without adding the previous output contents
    paddw     m1, m5
-    movq      [outq], m4
+    mova  [outq], m4 ; row 0: the result was accumulated onto the previous contents of [outq]
    psubw     m1, m7
-    movq      m7, [rsp+8*2]
+    mova      m7, tmp2
    psubw     m0, m5
-    movq      m6, [rsp+8*3]
-    movq      m5, m3
+    mova      m6, tmp3
+    mova      m5, m3
    paddw     m3, [outq+DCTSIZE*1*2]
    psubw     m5, m1
    psubw     m2, m1
    paddw     m3, m1
-    movq      [outq+DCTSIZE*6*2], m5
-    movq      m4, m7
+    mova      [outq+DCTSIZE*6*2], m5 ; row 6 is likewise stored without adding the previous output contents
+    mova      m4, m7
    paddw     m7, [outq+DCTSIZE*2*2]
    psubw     m4, m2
    paddw     m4, [outq+DCTSIZE*5*2]
    paddw     m7, m2
-    movq      [outq+DCTSIZE*1*2], m3
+    mova      [outq+DCTSIZE*1*2], m3
    paddw     m0, m2
-    movq      [outq+DCTSIZE*2*2], m7
-    movq      m1, m6
+    mova      [outq+DCTSIZE*2*2], m7
+    mova      m1, m6
    paddw     m6, [outq+DCTSIZE*4*2]
    psubw     m1, m0
    paddw     m1, [outq+DCTSIZE*3*2]
    paddw     m6, m0
-    movq      [outq+DCTSIZE*5*2], m4
-    add       srcq, 8+%1
-    movq      [outq+DCTSIZE*4*2], m6
-    movq      [outq+DCTSIZE*3*2], m1
-    add       outq, 8+%1
+    mova      [outq+DCTSIZE*5*2], m4
+    mova      [outq+DCTSIZE*4*2], m6
+    mova      [outq+DCTSIZE*3*2], m1
+    add     outq, 32 ; advance the output by 32 bytes; srcq needs no update here because COLUMN_FDCT advances it before branching
 %endmacro

-;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
-.fdct1:
-    COLUMN_FDCT .idct1
-    jmp .fdct2
+;void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t *output, int cnt);
+cglobal column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out, cnt, tmp ; x86-64: 13 XMM registers and no stack; x86-32: 8 XMM registers and 64 bytes of stack
+%if ARCH_X86_64 ; enough registers: keep the four temporaries and the pw_5A82 constant in m8-m12
+    %define tmp0 m8
+    %define tmp1 m9
+    %define tmp2 m10
+    %define tmp3 m11
+    %define SQRT2 m12
+    mova     m12, [pw_5A82]
+%else ; otherwise the temporaries live in four 16-byte stack slots and pw_5A82 is used as a memory operand
+    %define tmp0 [rsp]
+    %define tmp1 [rsp+16]
+    %define tmp2 [rsp+2*16]
+    %define tmp3 [rsp+3*16]
+    %define SQRT2 [pw_5A82]
+%endif
+.fdct: ; main loop, one COLUMN_FDCT pass per iteration
+    COLUMN_FDCT .idct ; jumps to .idct through its jnz, otherwise finishes the pass itself and falls through
+    sub    cntd, 2 ; each pass accounts for 2 units of cnt
+    jg .fdct
+    RET

-.idct1:
+.idct: ; reached only from COLUMN_FDCT; carries its own copy of the loop tail
    COLUMN_IDCT
-
-.fdct2:
-    COLUMN_FDCT .idct2, 8, 16
    sub    cntd, 2
-    jg .fdct1
-    RET
-
-.idct2:
-    COLUMN_IDCT 16
-    sub    cntd, 2
-    jg .fdct1
+    jg .fdct
    RET

+INIT_MMX mmx
 ;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
    add       strideq, strideq
--- a/libavfilter/x86/vf_fspp_init.c
+++ b/libavfilter/x86/vf_fspp_init.c
@ -21,29 +21,31 @@

 #include "libavutil/attributes.h"
 #include "libavutil/x86/cpu.h"
-#include "libavfilter/vf_fspp.h"
+#include "libavfilter/vf_fsppdsp.h"

-void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
-                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
-                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
-void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
-void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
-void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
+                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
 void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);

-av_cold void ff_fspp_init_x86(FSPPContext *s)
+av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s) /* installs the x86 assembly routines into *s according to the CPU flags; other pointers are left untouched */
 {
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
-        s->store_slice  = ff_store_slice_mmx;
-        s->store_slice2 = ff_store_slice2_mmx;
-        s->mul_thrmat   = ff_mul_thrmat_mmx;
-        s->column_fidct = ff_column_fidct_mmx;
        s->row_idct     = ff_row_idct_mmx;
        s->row_fdct     = ff_row_fdct_mmx;
    }
+    if (EXTERNAL_SSE2(cpu_flags)) { /* these four are now SSE2 versions; only the row transforms above remain MMX */
+        s->store_slice  = ff_store_slice_sse2;
+        s->store_slice2 = ff_store_slice2_sse2;
+        s->mul_thrmat   = ff_mul_thrmat_sse2;
+        s->column_fidct = ff_column_fidct_sse2;
+    }
 }
--- a/libavfilter/x86/vf_spp.c
+++ b/libavfilter/x86/vf_spp.c
@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src,
    }
 }

-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */

 av_cold void ff_spp_init_x86(SPPContext *s)
 {
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER)      += vf_bwdif.o
 AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
 AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
 AVFILTEROBJS-$(CONFIG_EQ_FILTER)         += vf_eq.o
+AVFILTEROBJS-$(CONFIG_FSPP_FILTER)       += vf_fspp.o
 AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
 AVFILTEROBJS-$(CONFIG_HFLIP_FILTER)      += vf_hflip.o
 AVFILTEROBJS-$(CONFIG_IDET_FILTER)       += vf_idet.o
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@ -297,6 +297,9 @@ static const struct {
    #if CONFIG_EQ_FILTER
        { "vf_eq", checkasm_check_vf_eq },
    #endif
+    #if CONFIG_FSPP_FILTER
+        { "vf_fspp", checkasm_check_vf_fspp },
+    #endif
    #if CONFIG_GBLUR_FILTER
        { "vf_gblur", checkasm_check_vf_gblur },
    #endif
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@ -148,6 +148,7 @@ void checkasm_check_v210enc(void);
 void checkasm_check_vc1dsp(void);
 void checkasm_check_vf_bwdif(void);
 void checkasm_check_vf_eq(void);
+void checkasm_check_vf_fspp(void);
 void checkasm_check_vf_gblur(void);
 void checkasm_check_vf_hflip(void);
 void checkasm_check_vf_threshold(void);
--- a/tests/checkasm/vf_fspp.c
+++ b/tests/checkasm/vf_fspp.c
@ -0,0 +1,170 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "checkasm.h"
+#include "libavfilter/vf_fsppdsp.h"
+#include "libavcodec/mathops.h"
+#include "libavutil/mem_internal.h"
+
+#define randomize_buffers(buf)                           \
+    do {                                                 \
+        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j) \
+            buf[j] = rnd();                              \
+    } while (0) /* randomize_buffers: assigns rnd() to every element of buf; buf is expected to be an array since its length comes from FF_ARRAY_ELEMS */
+
+#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)\
+    do {                                                    \
+        for (size_t j = 0; j < nb_elems; ++j)               \
+            buf[j] = buf2[j] = sign_extend(rnd(), nb_bits); \
+    } while (0) /* randomize_mask_buffers: writes the same sign_extend(rnd(), nb_bits) value to buf[j] and buf2[j] for the first nb_elems elements */
+
+#define randomize_buffer_range(buf, min, max)               \
+    do {                                                    \
+        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j)    \
+            buf[j] = min + rnd() % (max - min + 1);         \
+    } while (0) /* randomize_buffer_range: fills the array buf with min + rnd() % (max - min + 1); min and max are pasted unparenthesized, so pass plain constants */
+
+static void check_store_slice(void) // compares store_slice and store_slice2 with the reference on random geometry, then benchmarks them
+{
+    enum {
+        MAX_WIDTH  = 256,
+        /// in elements, not in bytes; 32 is arbitrary
+        MAX_STRIDE = MAX_WIDTH + 32,
+        MAX_HEIGHT = 8,
+    };
+    FSPPDSPContext fspp;
+    ff_fsppdsp_init(&fspp);
+    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src,
+                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+
+    for (int i = 0; i < 2; ++i) { // i == 0: store_slice, i == 1: store_slice2
+        if (check_func(i ? fspp.store_slice2 : fspp.store_slice, "store_slice%s", i ? "2" : "")) {
+            // store_slice resets the row eight lines above the current one
+            DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            // store_slice2 resets the row 16 lines below the current one
+            DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
+            uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH], dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH];
+            uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
+            int16_t *src_ref, *src_new, *or_src_ref, *or_src_new;
+            ptrdiff_t      width = 1 + rnd() % MAX_WIDTH; // in [1, MAX_WIDTH]
+            ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8); // width plus random padding, passed through FFALIGN(..., 8)
+            ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
+            ptrdiff_t height = 1 + rnd() % 8; // in [1, 8]
+            size_t nb_elems;
+
+            if (i) { // store_slice2: the source pointer is the start of the buffer
+                src_ref      = src_ref2;
+                src_new      = src_new2;
+                or_src_ref   = src_ref2;
+                or_src_new   = src_new2;
+                nb_elems     = FF_ARRAY_ELEMS(src_ref2);
+            } else { // store_slice: the source pointer starts eight rows into the buffer
+                src_ref      = src_ref1 + 8 * src_stride;
+                src_new      = src_new1 + 8 * src_stride;
+                or_src_ref   = src_ref1;
+                or_src_new   = src_new1;
+                nb_elems     = FF_ARRAY_ELEMS(src_ref1);
+            }
+            if (rnd() & 1) { // randomly exercise a negative destination stride, starting from the last row
+                dst_ref    += dst_stride * (height - 1);
+                dst_new    += dst_stride * (height - 1);
+                dst_stride *= -1;
+            }
+            randomize_buffers(dstbuf_new);
+            memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref));
+            randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14); // identical source data in both copies, sign-extended from 14 bits
+
+            ptrdiff_t log2_scale = rnd() & 1; // 0 or 1
+            call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height, log2_scale);
+            call_new(dst_new, src_new, dst_stride, src_stride, width, height, log2_scale);
+            if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) ||
+                memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems)) // the whole source buffers are compared too, since the functions reset source rows
+                fail();
+            // don't use random parameters for benchmarks
+            src_ref = or_src_ref + !i * 8 * MAX_STRIDE; // for store_slice (i == 0) skip the first eight rows at the maximum stride
+            bench_new(dstbuf_new, src_ref,
+                      MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1);
+        }
+    }
+}
+
+static void check_mul_thrmat(void) // compares mul_thrmat with the reference on 64 random int16_t inputs and a random q
+{
+    FSPPDSPContext fspp;
+    DECLARE_ALIGNED(16, int16_t, src)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
+    DECLARE_ALIGNED(16, int16_t, dst_new)[64];
+    const int q = (uint8_t)rnd(); // truncated to [0, 255]
+    declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+
+    ff_fsppdsp_init(&fspp);
+
+    if (check_func(fspp.mul_thrmat, "mul_thrmat")) {
+        randomize_buffers(src);
+        call_ref(src, dst_ref, q);
+        call_new(src, dst_new, q);
+        if (memcmp(dst_ref, dst_new, sizeof(dst_ref))) // all 64 outputs must match exactly
+            fail();
+        bench_new(src, dst_new, q);
+    }
+}
+
+static void check_column_fidct(void) // compares column_fidct with the reference on random thresholds and input, requiring bit-exact output
+{
+    enum {
+        NB_BLOCKS = 8, ///< arbitrary
+    };
+    FSPPDSPContext fspp;
+    declare_func(void, const int16_t *thr_adr, const int16_t *data,
+                       int16_t *output, int cnt);
+
+    ff_fsppdsp_init(&fspp);
+
+    if (check_func(fspp.column_fidct, "column_fidct")) {
+        DECLARE_ALIGNED(16, int16_t, threshold)[64];
+        DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)]; // NOTE(review): presumably 8-element rows with 6 extra rows read past the last start position - confirm against the DSP code
+        DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)];
+        DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)];
+
+        randomize_buffer_range(threshold, 0, INT16_MAX); // thresholds in [0, INT16_MAX]
+        randomize_buffer_range(src, -1284, 1284); // input samples in [-1284, 1284]
+        randomize_buffers(dst_new);
+        memcpy(dst_ref, dst_new, sizeof(dst_ref)); // both outputs start from the same random contents
+
+        call_ref(threshold, src, dst_ref, NB_BLOCKS * 8);
+        call_new(threshold, src, dst_new, NB_BLOCKS * 8);
+
+        if (memcmp(dst_new, dst_ref, sizeof(dst_new))) // the entire output buffers must match
+            fail();
+
+        bench_new(threshold, src, dst_new, NB_BLOCKS * 8);
+    }
+}
+
+void checkasm_check_vf_fspp(void) // test entry registered as "vf_fspp" in checkasm.c; runs the three DSP checks in order
+{
+    check_store_slice();
+    check_mul_thrmat();
+    check_column_fidct();
+}
--- a/tests/fate.sh
+++ b/tests/fate.sh
@ -55,13 +55,17 @@ configure()(
        ${cross_prefix:+--cross-prefix="$cross_prefix"}                 \
        ${as:+--as="$as"}                                               \
        ${cc:+--cc="$cc"}                                               \
+        ${cxx:+--cxx="$cxx"}                                            \
        ${ld:+--ld="$ld"}                                               \
+        ${nm:+--nm="$nm"}                                               \
        ${target_os:+--target-os="$target_os"}                          \
        ${sysroot:+--sysroot="$sysroot"}                                \
        ${target_exec:+--target-exec="$target_exec"}                    \
        ${target_path:+--target-path="$target_path"}                    \
        ${target_samples:+--target-samples="$target_samples"}           \
        ${extra_cflags:+--extra-cflags="$extra_cflags"}                 \
+        ${extra_cxxflags:+--extra-cxxflags="$extra_cxxflags"}           \
+        ${extra_objcflags:+--extra-objcflags="$extra_objcflags"}        \
        ${extra_ldflags:+--extra-ldflags="$extra_ldflags"}              \
        ${extra_libs:+--extra-libs="$extra_libs"}                       \
        ${extra_conf}
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@ -67,6 +67,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
                fate-checkasm-vf_colordetect                            \
                fate-checkasm-vf_colorspace                             \
                fate-checkasm-vf_eq                                     \
+                fate-checkasm-vf_fspp                                   \
                fate-checkasm-vf_gblur                                  \
                fate-checkasm-vf_hflip                                  \
                fate-checkasm-vf_nlmeans                                \
Author	SHA1	Message	Date
Kacper Michajłow	9b2162275b	configure: filter out -guard:signret from armasm flags While cl.exe supports -guard:signret, armasm64 complains about unknown flag. Note that -guard:ehcont is accepted by armasm64. Fixes: error A2029: unknown command-line argument or argument value -guard:signret Signed-off-by: Kacper Michajłow <kasper93@gmail.com>	2025-11-17 20:41:34 +00:00
Kacper Michajłow	523d688c2b	fate: add more configure flags to fate config Signed-off-by: Kacper Michajłow <kasper93@gmail.com>	2025-11-17 20:25:24 +00:00
Andreas Rheinhardt	ddf443f1e9	avfilter/vf_fsppdsp: Fix left shifts of negative numbers They are undefined behavior and UBSan warns about them (in the checkasm test). Put the shifts in the constants instead. This even gives a tiny speedup here: Old benchmarks: column_fidct_c: 3369.9 ( 1.00x) column_fidct_sse2: 829.1 ( 4.06x) New benchmarks: column_fidct_c: 3304.2 ( 1.00x) column_fidct_sse2: 827.9 ( 3.99x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	f8bcea4946	avfilter/vf_fsppdsp: Remove pointless cast Also don't cast const away and use a smaller scope. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	0c556a6b09	avfilter/vf_fspp: Pre-reorder threshold table Avoids reordering at runtime. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	778ff97efa	avfilter/vf_fspp: Make output endian-independent Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	f442145729	avfilter/vf_fspp: Avoid casts, effective-type violations Maybe uint64_t has been used as a poor man's alignment specifier? Anyway, reading an uint64_t via an lvalue of type int16_t (as happens in the C versions of the dsp functions) is undefined behavior. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	c0648b2004	avfilter/x86/vf_spp: Fix comment Forgotten in `dcb28ed860`. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	06b0dae51b	avfilter/vf_fsppdsp: Constify Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	cc97f1e276	avfilter/vf_fspp: Fix effective type violation Also don't use unnecessarily large alignment; it avoids having to align the stack. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	3cd452cbf1	avfilter/x86/vf_fspp: Avoid stack on x64 Possible due to the amount of registers. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:12 +01:00
Andreas Rheinhardt	ddd74276f8	avfilter/x86/vf_fspp: Port ff_column_fidct_mmx() to SSE2 It gains a lot because it has to operate on eight words; it also saves 608B of .text here. Old benchmarks: column_fidct_c: 3365.7 ( 1.00x) column_fidct_mmx: 1784.6 ( 1.89x) New benchmarks: column_fidct_c: 3361.5 ( 1.00x) column_fidct_sse2: 801.1 ( 4.20x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:11 +01:00
Andreas Rheinhardt	68b11cde82	tests/checkasm/vf_fspp: Add test for column_fidct Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:11 +01:00
Andreas Rheinhardt	63493bf0e0	avfilter/x86/vf_fspp: Put shifts into constants This avoids some shift instructions and also gives us more headroom in the registers. In fact, I have proven to myself that everything that is supposed to fit into 16bits now actually does so. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:11 +01:00
Andreas Rheinhardt	66af18d06a	avfilter/x86/vf_fspp: Make ff_column_fidct_mmx() bitexact It currently is not, because the shortcut mode uses different rounding than the C code (as well as the non-shortcut code). Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 12:18:11 +01:00
Andreas Rheinhardt	1049a5fba8	avfilter/vf_fsppdsp: Reduce discrepancies between C code and x86 asm The x86 assembly uses the following pattern to zero all the values with abs<threshold: x -= threshold; x satu+= threshold (unsigned saturated addition) x += threshold x satu-= threshold (unsigned saturated subtraction) The reference C code meanwhile zeroed everything with abs <= threshold. This commit makes the C code behave like the x86 assembly to reduce discrepancies between the two. An alternative would be to require SSSE3, so that one can use pabsw, pcmpgtw for abs>threshold, followed by a pand with the original data. Or one could modify the thresholds to make both equal. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	d19050a1ae	avfilter/vf_fsppdsp: Use restrict It is possible because the requirements are fulfilled; it is also beneficial performance and code-size wise. For GCC 14 (with -O3), this reduced codesize by 26750B here; for Clang 20, it was 432B. Old benchmarks: mul_thrmat_c: 4.3 ( 1.00x) mul_thrmat_sse2: 4.3 ( 1.00x) store_slice_c: 2810.8 ( 1.00x) store_slice_sse2: 542.5 ( 5.18x) store_slice2_c: 3817.0 ( 1.00x) store_slice2_sse2: 410.4 ( 9.30x) New benchmarks: mul_thrmat_c: 4.3 ( 1.00x) mul_thrmat_sse2: 4.3 ( 1.00x) store_slice_c: 1510.1 ( 1.00x) store_slice_sse2: 545.2 ( 2.77x) store_slice2_c: 1763.5 ( 1.00x) store_slice2_sse2: 408.3 ( 4.32x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	ff85a20b7d	avfilter/x86/vf_fspp: Port store_slice to SSE2 Old benchmarks: store_slice_c: 2798.3 ( 1.00x) store_slice_mmx: 950.2 ( 2.94x) store_slice2_c: 3811.7 ( 1.00x) store_slice2_mmx: 682.3 ( 5.59x) New benchmarks: store_slice_c: 2797.2 ( 1.00x) store_slice_sse2: 543.5 ( 5.15x) store_slice2_c: 3817.0 ( 1.00x) store_slice2_sse2: 408.2 ( 9.35x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	570f8fc6c9	tests/checkasm/vf_fspp: Test store_slice Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	e042f17e99	avfilter/vf_fsppdsp: Use standard clamping This is obviously what is intended and what the MMX code does; yet I cannot rule out that it changes the output for some inputs: I have observed individual src values which would lead to temp values just above 512 if they came in pairs (i.e. if both inputs were simultaneously huge). Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	52ba2ac7bd	avfilter/x86/vf_fspp: Port mul_thrmat to SSE2 This fixes an ABI violation, as mul_thrmat did not issue emms. It seems that this ABI violation could reach the user, namely if ff_get_video_buffer() fails. Notice that ff_get_video_buffer() itself could fail because of this, namely if the allocator uses floating point registers. On x64 (where GCC already used SSE2 in the C version) mul_thrmat_c: 4.4 ( 1.00x) mul_thrmat_mmx: 8.6 ( 0.52x) mul_thrmat_sse2: 4.4 ( 1.00x) On 32bit (where SSE2 is not known to be available): mul_thrmat_c: 56.0 ( 1.00x) mul_thrmat_sse2: 6.0 ( 9.40x) Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	70eb8a76a9	tests/checkasm: Add vf_fspp mul_thrmat test Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	9f4d5d818d	avfilter/x86/vf_fspp: Don't duplicate dither table Reuse the one from vf_fsppdsp.c; also don't overalign said table too much. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	1699de0955	avfilter/vf_fsppdsp: Use enum for constants It means that the compiler does not have to optimize the static const object away. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00
Andreas Rheinhardt	9b34088c4d	avfilter/vf_fspp: Add DSPCtx, move DSP functions to file of their own This is in preparation for adding checkasm tests; without it, checkasm would pull all of libavfilter in. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-11-17 11:28:04 +01:00