Compare commits

..

No commits in common. "9b2162275b52e4d9558de18b0f58096e1ce0347c" and "57d6898730836ac2006d10bf18396752de092e49" have entirely different histories.

15 changed files with 733 additions and 946 deletions

1
configure vendored
View File

@ -4968,7 +4968,6 @@ armasm_flags(){
# Filter out MSVC cl.exe options from cflags that shouldn't
# be passed to gas-preprocessor
-M[TD]*) ;;
-guard:signret) ;;
*) echo $flag ;;
esac
done

View File

@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER) += vf_framestep.o
OBJS-$(CONFIG_FREEZEDETECT_FILTER) += vf_freezedetect.o
OBJS-$(CONFIG_FREEZEFRAMES_FILTER) += vf_freezeframes.o
OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o
OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o vf_fsppdsp.o qp_table.o
OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o qp_table.o
OBJS-$(CONFIG_FSYNC_FILTER) += vf_fsync.o
OBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
OBJS-$(CONFIG_GBLUR_VULKAN_FILTER) += vf_gblur_vulkan.o vulkan.o vulkan_filter.o

View File

@ -41,41 +41,12 @@
#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/video_enc_params.h"
#include "avfilter.h"
#include "filters.h"
#include "qp_table.h"
#include "vf_fsppdsp.h"
#include "vf_fspp.h"
#include "video.h"
#define BLOCKSZ 12
#define MAX_LEVEL 5
typedef struct FSPPContext {
const struct AVClass *class;
int log2_count;
int strength;
int hsub;
int vsub;
int temp_stride;
int qp;
enum AVVideoEncParamsType qscale_type;
int prev_q;
uint8_t *src;
int16_t *temp;
int8_t *non_b_qp_table;
int non_b_qp_stride;
int use_bframe_qp;
FSPPDSPContext dsp;
DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8];
DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8];
} FSPPContext;
#define OFFSET(x) offsetof(FSPPContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
static const AVOption fspp_options[] = {
@ -88,22 +59,98 @@ static const AVOption fspp_options[] = {
AVFILTER_DEFINE_CLASS(fspp);
/*
 * 8x8 table of dither offsets with entries in the range 0..63.
 * The store_slice helpers pick one row per output line (dither[y]) and add
 * the entry for the pixel's position within its group of eight, shifted
 * right by log2_scale, before the final downshift to 8 bits.
 */
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
{ 40, 24, 36, 20, 43, 27, 39, 23, },
{ 2, 50, 14, 62, 1, 49, 13, 61, },
{ 34, 18, 46, 30, 33, 17, 45, 29, },
{ 10, 58, 6, 54, 9, 57, 5, 53, },
{ 42, 26, 38, 22, 41, 25, 37, 21, },
};
static const short custom_threshold[64] = {
// values (296) can't be too high
// -it causes too big quant dependence
// or maybe overflow(check), which results in some flashing
// reorder coefficients to the order in which columns are processed
#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h
REORDER( 71, 296, 295, 237, 71, 40, 38, 19),
REORDER(245, 193, 185, 121, 102, 73, 53, 27),
REORDER(158, 129, 141, 107, 97, 73, 50, 26),
REORDER(102, 116, 109, 98, 82, 66, 45, 23),
REORDER( 71, 94, 95, 81, 70, 56, 38, 20),
REORDER( 56, 77, 74, 66, 56, 44, 30, 15),
REORDER( 38, 53, 50, 45, 38, 30, 21, 11),
REORDER( 20, 27, 26, 23, 20, 15, 11, 5)
71, 296, 295, 237, 71, 40, 38, 19,
245, 193, 185, 121, 102, 73, 53, 27,
158, 129, 141, 107, 97, 73, 50, 26,
102, 116, 109, 98, 82, 66, 45, 23,
71, 94, 95, 81, 70, 56, 38, 20,
56, 77, 74, 66, 56, 44, 30, 15,
38, 53, 50, 45, 38, 30, 21, 11,
20, 27, 26, 23, 20, 15, 11, 5
};
/*
 * Convert one slice of accumulated 16-bit samples into 8-bit pixels.
 *
 * For every sample the row's dither value (shifted right by log2_scale) is
 * added, the sum is shifted right by (6 - log2_scale), clipped to 0..255 and
 * written to dst.  The consumed sample in src and the sample eight src rows
 * above it are both reset to zero.
 *
 * dst, dst_stride  destination plane and its line stride in bytes
 * src, src_stride  accumulator slice and its line stride in int16_t units
 * width            samples per line; processed in groups of eight, so the
 *                  last group may extend past width
 * height           number of lines; also indexes dither[], so at most 8
 * log2_scale       dither/downshift adjustment
 */
static void store_slice_c(uint8_t *dst, int16_t *src,
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
    int y, x, pos;

    for (y = 0; y < height; y++) {
        const uint8_t *d = dither[y];

        for (x = 0; x < width; x += 8) {
            for (pos = 0; pos < 8; pos++) {
                int temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);

                src[x + pos] = src[x + pos - 8 * src_stride] = 0;
                /* Clip to 0..255.  Testing every bit above the low eight
                 * (not just bit 8) also catches values >= 512, which would
                 * otherwise wrap when stored as uint8_t.  For out-of-range
                 * values ~(temp >> 31) is 0 when negative and -1 (stored as
                 * 255) when positive. */
                if (temp & ~0xFF)
                    temp = ~(temp >> 31);
                dst[x + pos] = temp;
            }
        }
        src += src_stride;
        dst += dst_stride;
    }
}
/*
 * Convert the sum of two accumulator slices into 8-bit pixels.
 *
 * Each output sample is built from src[i] plus the sample sixteen src rows
 * below it plus the row's dither value (shifted right by log2_scale); the sum
 * is shifted right by (6 - log2_scale), clipped to 0..255 and written to dst.
 * Only the lower of the two source samples (16 rows down) is reset to zero.
 *
 * dst, dst_stride  destination plane and its line stride in bytes
 * src, src_stride  accumulator slice and its line stride in int16_t units
 * width            samples per line; processed in groups of eight, so the
 *                  last group may extend past width
 * height           number of lines; also indexes dither[], so at most 8
 * log2_scale       dither/downshift adjustment
 */
static void store_slice2_c(uint8_t *dst, int16_t *src,
                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
    int y, x, pos;

    for (y = 0; y < height; y++) {
        const uint8_t *d = dither[y];

        for (x = 0; x < width; x += 8) {
            for (pos = 0; pos < 8; pos++) {
                int temp = (src[x + pos] + src[x + pos + 16 * src_stride] +
                            (d[pos] >> log2_scale)) >> (6 - log2_scale);

                src[x + pos + 16 * src_stride] = 0;
                /* Clip to 0..255.  Testing every bit above the low eight
                 * (not just bit 8) also catches values >= 512, which would
                 * otherwise wrap when stored as uint8_t.  For out-of-range
                 * values ~(temp >> 31) is 0 when negative and -1 (stored as
                 * 255) when positive. */
                if (temp & ~0xFF)
                    temp = ~(temp >> 31);
                dst[x + pos] = temp;
            }
        }
        src += src_stride;
        dst += dst_stride;
    }
}
/*
 * Scale the 64-entry base threshold matrix by the quantiser q.
 *
 * thr_adr_noq  input: 64 unscaled thresholds
 * thr_adr      output: 64 thresholds, each q * the matching input entry
 *              (the int product is stored back into an int16_t)
 */
static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
{
    const int16_t *in = thr_adr_noq;
    const int16_t *const in_end = thr_adr_noq + 64;
    int16_t *out = thr_adr;

    while (in < in_end)
        *out++ = q * *in++;
}
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
int dst_stride, int src_stride,
int width, int height,
@ -116,9 +163,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
const int qpsh = 4 - p->hsub * !is_luma;
const int qpsv = 4 - p->vsub * !is_luma;
DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 * BLOCKSZ];
int16_t *block = block_align;
int16_t *block3 = block_align + 8 * 8 * BLOCKSZ;
DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
int16_t *block = (int16_t *)block_align;
int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
memset(block3, 0, 4 * 8 * BLOCKSZ);
@ -150,13 +197,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
if (qy < 0) qy = 0;
qy = (qy >> qpsv) * qp_stride;
p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
if (p->qp)
p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
else
for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same
@ -166,45 +213,288 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
t = qp_store[qy + (t >> qpsh)];
t = ff_norm_qscale(t, p->qscale_type);
if (t != p->prev_q) {
p->prev_q = t;
p->dsp.mul_thrmat(p->threshold_mtx_noq, p->threshold_mtx, t);
}
p->dsp.column_fidct(p->threshold_mtx, block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
}
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
}
es = width + 8 - x0; // 8, ...
if (es > 8)
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1));
p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
if (es > 3)
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
if (!(y1 & 7) && y1) {
if (y1 & 8)
p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
dst_stride, stride, width, 8, 5 - p->log2_count);
p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
dst_stride, stride, width, 8, 5 - p->log2_count);
else
p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
dst_stride, stride, width, 8, 5 - p->log2_count);
p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
dst_stride, stride, width, 8, 5 - p->log2_count);
}
}
if (y & 7) { // height % 8 != 0
if (y & 8)
p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
dst_stride, stride, width, y&7, 5 - p->log2_count);
p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
dst_stride, stride, width, y&7, 5 - p->log2_count);
else
p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
dst_stride, stride, width, y&7, 5 - p->log2_count);
}
}
/*
 * Column pass: forward DCT, coefficient thresholding and inverse DCT.
 *
 * For every column the eight input samples (DCTSIZE apart) are forward
 * transformed, each coefficient is zeroed or kept by THRESHOLD() against the
 * matching entry of the threshold matrix, and the inverse transform of the
 * surviving coefficients is written to the workspace.
 *
 * thr_adr  threshold matrix; coefficient k of the current column is compared
 *          with threshold[k * 8], and the pointer advances by one per column
 * data     input samples, DCTSIZE values per row
 * output   workspace; rows 0..5 are accumulated (+=) while rows 6 and 7 are
 *          overwritten (=)
 * cnt      loop counter; each outer iteration handles DCTSIZE columns,
 *          decrements cnt by 2 and then skips a further 8 entries in both
 *          data and output
 *
 * Scaling by powers of two is written as multiplication rather than "<<":
 * the operands may be negative and left-shifting a negative signed value is
 * undefined behaviour in C.
 */
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1, z2, z3, z4, z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    int16_t *dataptr;
    int16_t *wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt -= 2) { //start positions
        threshold = thr_adr; //threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];

            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];

            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];

            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];

            // Even part of FDCT
            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) * 4, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT
            THRESHOLD(tmp0, d0, threshold[0 * 8]);
            THRESHOLD(tmp1, d2, threshold[2 * 8]);
            THRESHOLD(tmp2, d4, threshold[4 * 8]);
            THRESHOLD(tmp3, d6, threshold[6 * 8]);
            tmp0 += 2;
            tmp10 = (tmp0 + tmp2) >> 2;
            tmp11 = (tmp0 - tmp2) >> 2;

            tmp13 = (tmp1 + tmp3) >> 2; //+2 ! (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT
            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12) * 4, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 * 4, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 * 4, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 * 4, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT
            THRESHOLD(tmp4, d1, threshold[1 * 8]);
            THRESHOLD(tmp5, d3, threshold[3 * 8]);
            THRESHOLD(tmp6, d5, threshold[5 * 8]);
            THRESHOLD(tmp7, d7, threshold[7 * 8]);

            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5) * 2;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7) * 2;

            tmp7 = (z11 + z13) >> 2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13) * 2, FIX_1_414213562);
            z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
            wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
            wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
            wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
            wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
            wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
            // Rows 6 and 7 are assigned, not accumulated.
            wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
            wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr += 8; //skip each second start pos
        wsptr += 8;
    }
}
/*
 * Row pass of the inverse transform.
 *
 * Each iteration reads DCTSIZE consecutive values from the workspace, runs
 * the inverse butterfly and adds the eight descaled results to one column of
 * the output, i.e. to outptr[k * output_stride] for k = 0..7.  The output
 * pointer then advances by one sample and the workspace by DCTSIZE.
 *
 * workspace      input produced by the column pass
 * output_adr     accumulation buffer (results are added, not stored)
 * output_stride  distance between output rows in int16_t units
 * cnt            number of groups of four iterations (cnt * 4 in total)
 *
 * Scaling by powers of two is written as multiplication rather than "<<":
 * the operands may be negative and left-shifting a negative signed value is
 * undefined behaviour in C.
 */
static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z5, z10, z11, z12, z13;
    int16_t *outptr;
    int16_t *wsptr;

    cnt *= 4;
    wsptr = workspace;
    outptr = output_adr;
    for (; cnt > 0; cnt--) {
        // Even part
        //Simd version reads 4x4 block and transposes it
        tmp10 = wsptr[2] + wsptr[3];
        tmp11 = wsptr[2] - wsptr[3];

        tmp13 = wsptr[0] + wsptr[1];
        // Multiply first, scale by 4 afterwards: this order avoids overflow.
        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;

        tmp0 = tmp10 + tmp13; //->temps
        tmp3 = tmp10 - tmp13; //->temps
        tmp1 = tmp11 + tmp12;
        tmp2 = tmp11 - tmp12;

        // Odd part
        //Also transpose, with previous:
        // ---- ----      ||||
        // ---- ---- idct ||||
        // ---- ---- ---> ||||
        // ---- ----      ||||
        z13 = wsptr[4] + wsptr[5];
        z10 = wsptr[4] - wsptr[5];
        z11 = wsptr[6] + wsptr[7];
        z12 = wsptr[6] - wsptr[7];

        tmp7 = z11 + z13;
        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);

        z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
        tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
        tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_

        tmp6 = tmp12 * 8 - tmp7;
        tmp5 = tmp11 * 8 - tmp6;
        tmp4 = tmp10 * 8 + tmp5;

        // Final output stage: descale and write column
        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
        outptr++;

        wsptr += DCTSIZE; // advance pointer to next row
    }
}
/*
 * Row pass of the forward transform.
 *
 * Each iteration reads eight pixels that lie line_size apart (one pixel
 * column spanning eight lines), runs the forward butterfly and stores eight
 * coefficients at dataptr[0..7].  The pixel pointer then advances by one and
 * the data pointer by DCTSIZE.
 *
 * data       destination for the coefficients, DCTSIZE per iteration
 * pixels     8-bit source samples
 * line_size  distance between source lines in bytes
 * cnt        number of groups of four iterations (cnt * 4 in total)
 *
 * Scaling by powers of two is written as multiplication rather than "<<":
 * the operands may be negative and left-shifting a negative signed value is
 * undefined behaviour in C.
 */
static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
    int16_t *dataptr;

    cnt *= 4;
    // Pass 1: process rows.

    dataptr = data;
    for (; cnt > 0; cnt--) {
        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];

        // Even part
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;
        //Even columns are written first, this leads to different order of columns
        //in column_fidct(), but they are processed independently, so all ok.
        //Later in the row_idct() columns are read in the same order.
        dataptr[2] = tmp10 + tmp11;
        dataptr[3] = tmp10 - tmp11;

        z1 = MULTIPLY16H((tmp12 + tmp13) * 4, FIX_0_707106781);
        dataptr[0] = tmp13 + z1;
        dataptr[1] = tmp13 - z1;

        // Odd part
        tmp10 = (tmp4 + tmp5) * 4;
        tmp11 = (tmp5 + tmp6) * 4;
        tmp12 = (tmp6 + tmp7) * 4;

        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
        z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
        z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
        z3 = MULTIPLY16H(tmp11, FIX_0_707106781);

        z11 = tmp7 + z3;
        z13 = tmp7 - z3;

        dataptr[4] = z13 + z2;
        dataptr[5] = z13 - z2;
        dataptr[6] = z11 + z4;
        dataptr[7] = z11 - z4;

        pixels++; // advance pointer to next column
        dataptr += DCTSIZE;
    }
}
static const enum AVPixelFormat pix_fmts[] = {
AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P,
AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV411P,
@ -232,7 +522,16 @@ static int config_input(AVFilterLink *inlink)
if (!fspp->temp || !fspp->src)
return AVERROR(ENOMEM);
ff_fsppdsp_init(&fspp->dsp);
fspp->store_slice = store_slice_c;
fspp->store_slice2 = store_slice2_c;
fspp->mul_thrmat = mul_thrmat_c;
fspp->column_fidct = column_fidct_c;
fspp->row_idct = row_idct_c;
fspp->row_fdct = row_fdct_c;
#if ARCH_X86
ff_fspp_init_x86(fspp);
#endif
return 0;
}
@ -246,17 +545,30 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
int qp_stride = 0;
int8_t *qp_table = NULL;
int i, bias;
int ret = 0;
int custom_threshold_m[64];
//FIXME: tune custom_threshold[] and remove this !
for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i)
fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
bias = (1 << 4) + fspp->strength;
if (fspp->qp) {
fspp->prev_q = fspp->qp;
fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx, fspp->qp);
for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
for (i = 0; i < 8; i++) {
fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
|(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
|(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
|(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
|(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
|(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
|(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
}
if (fspp->qp)
fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
/* if we are not in a constant user quantizer mode and we don't want to use
* the quantizers from the B-frames (B-frames often have a higher QP), we
* need to save the qp table from the last non B-frame; this is what the

96
libavfilter/vf_fspp.h Normal file
View File

@ -0,0 +1,96 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
* Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef AVFILTER_FSPP_H
#define AVFILTER_FSPP_H
#include "libavutil/video_enc_params.h"
#include "avfilter.h"
#define BLOCKSZ 12
#define MAX_LEVEL 5

#define DCTSIZE 8
#define DCTSIZE_S "8"

/* Real constant x scaled by 2^s, with 0.5 added; the result is a double. */
#define FIX(x,s) ((x) * (1 << (s)) + 0.5)
/* Product of x and k shifted right by 16 bits. */
#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
/*
 * r = x when (unsigned)(x + t) > 2 * t, otherwise r = 0.
 * Wrapped in do { } while (0) so it behaves as one statement (safe inside an
 * unbraced if/else); x and t are evaluated more than once.
 */
#define THRESHOLD(r,x,t) \
    do { \
        if ((unsigned)((x) + (t)) > (t) * 2) (r) = (x); \
        else (r) = 0; \
    } while (0)
/* x shifted right by n bits after adding half of 2^n. */
#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> (n))
/* Integer type used for the intermediate values of the C transform code. */
typedef int32_t int_simd16_t;
/*
 * Fixed-point multipliers built with FIX(value, bits): the real value is
 * scaled by 2^bits (14 or 13 here) and converted to int16_t on
 * initialisation.  The digits in each name spell the magnitude of the
 * constant; note that FIX_2_613125930 is initialised from the negative
 * value -2.613125930.
 * NOTE(review): M_SQRT1_2 and M_SQRT2 are not defined by anything visible in
 * this header - presumably every includer pulls them in first; confirm.
 */
static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
/*
 * Private state of one fspp filter instance: user options, per-link
 * geometry, work buffers and the table of DSP routines used by filter().
 */
typedef struct FSPPContext {
AVClass *class;
/* Threshold matrix before quantiser scaling.  filter_frame() fills it with
 * four 16-bit thresholds packed into each 64-bit word; the DSP routines
 * access it through an int16_t pointer. */
uint64_t threshold_mtx_noq[8 * 2];
/* Threshold matrix after mul_thrmat() has scaled it by the quantiser. */
uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions
/* store_slice()/store_slice2() are called with log2_scale = 5 - log2_count. */
int log2_count;
/* Added to (1 << 4) to form the bias that scales custom_threshold[]. */
int strength;
/* Subtracted from the qp-table coordinate shifts for non-luma planes. */
int hsub;
int vsub;
/* NOTE(review): presumably the line stride of temp; its use is not visible here. */
int temp_stride;
/* When non-zero, used as the quantiser for the whole frame instead of the
 * per-block qp table. */
int qp;
/* Passed to ff_norm_qscale() to normalise values read from the qp table. */
enum AVVideoEncParamsType qscale_type;
/* Quantiser for which threshold_mtx was last computed. */
int prev_q;
/* Source buffer read by row_fdct(). */
uint8_t *src;
/* Accumulation buffer written by row_idct() and read by the store_slice routines. */
int16_t *temp;
/* NOTE(review): appears to hold the qp table saved from the last non-B
 * frame when use_bframe_qp is off - confirm against filter_frame(). */
int8_t *non_b_qp_table;
int non_b_qp_stride;
int use_bframe_qp;
/* DSP entry points.  config_input() installs the C versions and, on x86,
 * lets ff_fspp_init_x86() replace them. */
void (*store_slice)(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void (*store_slice2)(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void (*column_fidct)(int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt);
void (*row_idct)(int16_t *workspace, int16_t *output_adr,
ptrdiff_t output_stride, int cnt);
void (*row_fdct)(int16_t *data, const uint8_t *pixels,
ptrdiff_t line_size, int cnt);
} FSPPContext;
/*
 * x86-specific initialiser, defined outside this header.  config_input()
 * calls it under ARCH_X86 after the C function pointers have been set.
 * NOTE(review): presumably it replaces some of those pointers with SIMD
 * versions - confirm in the x86 sources.
 */
void ff_fspp_init_x86(FSPPContext *fspp);
#endif /* AVFILTER_FSPP_H */

View File

@ -1,371 +0,0 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
* Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <stdint.h>
#include "vf_fsppdsp.h"
#include "libavutil/common.h"
#include "libavutil/mathematics.h"
#include "libavutil/mem_internal.h"
#define DCTSIZE 8

/* Real constant x scaled by 2^s and rounded by adding 0.5 before the int cast. */
#define FIX(x,s) (int)((x) * (1 << (s)) + 0.5)
/* Product of x and k shifted right by 16 bits. */
#define MULTIPLY16H(x,k) (((x) * (k)) >> 16)
/*
 * r = x when (unsigned)(x + t) >= 2 * t, otherwise r = 0.
 * Wrapped in do { } while (0) so it behaves as one statement (safe inside an
 * unbraced if/else); x and t are evaluated more than once.
 */
#define THRESHOLD(r,x,t) \
    do { \
        if ((unsigned)((x) + (t)) >= (t) * 2) (r) = (x); \
        else (r) = 0; \
    } while (0)
/* x shifted right by n bits after adding half of 2^n. */
#define DESCALE(x,n) (((x) + (1 << ((n) - 1))) >> (n))
/* Integer type used for the intermediate values of the C transform code. */
typedef int32_t int_simd16_t;
/*
 * Fixed-point multipliers built with FIX(value, bits): the real value is
 * scaled by 2^bits (14 or 13 here) and cast to int.  Being enum constants
 * they can be shifted in constant expressions at the call sites (e.g.
 * FIX_0_707106781 << 2).  The digits in each name spell the magnitude of the
 * constant; FIX_2_613125930 is built from the negative value -2.613125930.
 */
enum {
FIX_0_382683433 = FIX(0.382683433, 14),
FIX_0_541196100 = FIX(0.541196100, 14),
FIX_0_707106781 = FIX(M_SQRT1_2 , 14),
FIX_1_306562965 = FIX(1.306562965, 14),
FIX_1_414213562_A = FIX(M_SQRT2 , 14),
FIX_1_847759065 = FIX(1.847759065, 13),
FIX_2_613125930 = FIX(-2.613125930, 13),
FIX_1_414213562 = FIX(M_SQRT2 , 13),
FIX_1_082392200 = FIX(1.082392200, 13),
};
/*
 * 8x8 table of dither offsets with entries in the range 0..63, exported with
 * external linkage (declared in vf_fsppdsp.h).  The store_slice helpers pick
 * one row per output line and add the entry for the pixel's position within
 * its group of eight, shifted right by log2_scale.
 */
DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
{ 40, 24, 36, 20, 43, 27, 39, 23, },
{ 2, 50, 14, 62, 1, 49, 13, 61, },
{ 34, 18, 46, 30, 33, 17, 45, 29, },
{ 10, 58, 6, 54, 9, 57, 5, 53, },
{ 42, 26, 38, 22, 41, 25, 37, 21, },
};
/*
 * Turn one accumulator slice into 8-bit pixels.
 *
 * Every sample gets the row's dither value (shifted right by log2_scale)
 * added, is shifted right by (6 - log2_scale), clipped with av_clip_uint8()
 * and stored to dst.  The consumed sample and the one eight src rows above it
 * are cleared.  Samples are handled in groups of eight per line.
 */
void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
                      ptrdiff_t dst_stride, ptrdiff_t src_stride,
                      ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
    for (int row = 0; row < height; row++, src += src_stride, dst += dst_stride) {
        const uint8_t *const dither_row = ff_fspp_dither[row];

        for (int col = 0; col < width; col += 8) {
            for (int k = 0; k < 8; k++) {
                int16_t *const cur = &src[col + k];
                const int val = (*cur + (dither_row[k] >> log2_scale)) >> (6 - log2_scale);

                /* Reset both the sample just read and the one 8 rows up. */
                cur[-8 * src_stride] = 0;
                *cur = 0;
                dst[col + k] = av_clip_uint8(val);
            }
        }
    }
}
/*
 * Turn the sum of two accumulator slices into 8-bit pixels.
 *
 * Each output is src[i] plus the sample sixteen src rows below it plus the
 * row's dither value (shifted right by log2_scale); the sum is shifted right
 * by (6 - log2_scale), clipped with av_clip_uint8() and stored to dst.  Only
 * the sample sixteen rows down is cleared.  Samples are handled in groups of
 * eight per line.
 */
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
                       ptrdiff_t dst_stride, ptrdiff_t src_stride,
                       ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
    for (int row = 0; row < height; row++, src += src_stride, dst += dst_stride) {
        const uint8_t *const dither_row = ff_fspp_dither[row];

        for (int col = 0; col < width; col += 8) {
            for (int k = 0; k < 8; k++) {
                int16_t *const upper = &src[col + k];
                int16_t *const lower = &src[col + k + 16 * src_stride];
                const int val = (*upper + *lower + (dither_row[k] >> log2_scale)) >> (6 - log2_scale);

                *lower = 0;
                dst[col + k] = av_clip_uint8(val);
            }
        }
    }
}
/*
 * Scale the 64-entry base threshold matrix by the quantiser q.
 *
 * thr_adr_noq  input: 64 unscaled thresholds
 * thr_adr      output: 64 thresholds, each q * the matching input entry
 */
void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
{
    const int16_t *const stop = thr_adr_noq + 64;

    while (thr_adr_noq != stop)
        *thr_adr++ = q * *thr_adr_noq++;
}
/*
 * Column pass: forward DCT, coefficient thresholding and inverse DCT.
 *
 * thr_adr  threshold matrix; coefficient k of the current column is compared
 *          with threshold[k * 8], and the pointer advances by one per column
 * data     input samples, DCTSIZE values per row
 * output   workspace; rows 0..5 are accumulated (+=) while rows 6 and 7 are
 *          overwritten (=)
 * cnt      loop counter; each outer iteration handles DCTSIZE columns,
 *          decrements cnt by 2 and then skips a further 8 entries in both
 *          data and output
 */
void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
int16_t *restrict output, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
int16_t *wsptr;
wsptr = output;
for (; cnt > 0; cnt -= 2) { //start positions
// Restart at the top of the threshold matrix for each start position.
const int16_t *threshold = thr_adr;//threshold_mtx
for (int ctr = DCTSIZE; ctr > 0; ctr--) {
// Process columns from input, add to output.
tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7];
tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];
tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];
tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];
// Even part of FDCT
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
d0 = tmp10 + tmp11;
d4 = tmp10 - tmp11;
// The scale factor is applied to the constant, so no possibly negative
// value is ever left-shifted.
z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
d2 = tmp13 + z1;
d6 = tmp13 - z1;
// Even part of IDCT
THRESHOLD(tmp0, d0, threshold[0 * 8]);
THRESHOLD(tmp1, d2, threshold[2 * 8]);
THRESHOLD(tmp2, d4, threshold[4 * 8]);
THRESHOLD(tmp3, d6, threshold[6 * 8]);
tmp0 += 2;
tmp10 = (tmp0 + tmp2) >> 2;
tmp11 = (tmp0 - tmp2) >> 2;
tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
tmp0 = tmp10 + tmp13; //->temps
tmp3 = tmp10 - tmp13; //->temps
tmp1 = tmp11 + tmp12; //->temps
tmp2 = tmp11 - tmp12; //->temps
// Odd part of FDCT
tmp10 = tmp4 + tmp5;
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
z11 = tmp7 + z3;
z13 = tmp7 - z3;
d5 = z13 + z2;
d3 = z13 - z2;
d1 = z11 + z4;
d7 = z11 - z4;
// Odd part of IDCT
THRESHOLD(tmp4, d1, threshold[1 * 8]);
THRESHOLD(tmp5, d3, threshold[3 * 8]);
THRESHOLD(tmp6, d5, threshold[5 * 8]);
THRESHOLD(tmp7, d7, threshold[7 * 8]);
//Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
z13 = tmp6 + tmp5;
z10 = (tmp6 - tmp5) * 2;
z11 = tmp4 + tmp7;
z12 = (tmp4 - tmp7) * 2;
tmp7 = (z11 + z13) >> 2; //+2 !
tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 + tmp5;
wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
// Rows 6 and 7 are assigned, not accumulated.
wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
//
data++; //next column
wsptr++;
threshold++;
}
data += 8; //skip each second start pos
wsptr += 8;
}
}
/*
 * Row pass of the inverse transform.
 *
 * Each iteration reads DCTSIZE consecutive values from wsptr, runs the
 * inverse butterfly and adds the eight descaled results to one column of the
 * output, i.e. to outptr[k * output_stride] for k = 0..7.  The output pointer
 * then advances by one sample and wsptr by DCTSIZE.
 *
 * wsptr          input produced by the column pass
 * output_adr     accumulation buffer (results are added, not stored)
 * output_stride  distance between output rows in int16_t units
 * cnt            number of groups of four iterations (cnt * 4 in total)
 */
void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z5, z10, z11, z12, z13;
int16_t *outptr;
cnt *= 4;
outptr = output_adr;
for (; cnt > 0; cnt--) {
// Even part
//Simd version reads 4x4 block and transposes it
tmp10 = wsptr[2] + wsptr[3];
tmp11 = wsptr[2] - wsptr[3];
tmp13 = wsptr[0] + wsptr[1];
tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//this shift order to avoid overflow
tmp0 = tmp10 + tmp13; //->temps
tmp3 = tmp10 - tmp13; //->temps
tmp1 = tmp11 + tmp12;
tmp2 = tmp11 - tmp12;
// Odd part
//Also transpose, with previous:
// ---- ---- ||||
// ---- ---- idct ||||
// ---- ---- ---> ||||
// ---- ---- ||||
z13 = wsptr[4] + wsptr[5];
z10 = wsptr[4] - wsptr[5];
z11 = wsptr[6] + wsptr[7];
z12 = wsptr[6] - wsptr[7];
tmp7 = z11 + z13;
tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
// Scaling by 8 uses multiplication, which is well defined for negative
// intermediates.
tmp6 = tmp12 * 8 - tmp7;
tmp5 = tmp11 * 8 - tmp6;
tmp4 = tmp10 * 8 + tmp5;
// Final output stage: descale and write column
outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
outptr++;
wsptr += DCTSIZE; // advance pointer to next row
}
}
/*
 * Row pass of the forward transform.
 *
 * Each iteration reads eight pixels that lie line_size apart (one pixel
 * column spanning eight lines), runs the forward butterfly and stores eight
 * coefficients at dataptr[0..7].  The pixel pointer then advances by one and
 * the data pointer by DCTSIZE.
 *
 * data       destination for the coefficients, DCTSIZE per iteration
 * pixels     8-bit source samples
 * line_size  distance between source lines in bytes
 * cnt        number of groups of four iterations (cnt * 4 in total)
 */
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
int_simd16_t z1, z2, z3, z4, z5, z11, z13;
int16_t *dataptr;
cnt *= 4;
// Pass 1: process rows.
dataptr = data;
for (; cnt > 0; cnt--) {
tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
// Even part
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
//Even columns are written first, this leads to different order of columns
//in column_fidct(), but they are processed independently, so all ok.
//Later in the row_idct() columns are read in the same order.
dataptr[2] = tmp10 + tmp11;
dataptr[3] = tmp10 - tmp11;
// The scale factor is applied to the constant, so no possibly negative
// value is ever left-shifted.
z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
dataptr[0] = tmp13 + z1;
dataptr[1] = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
z11 = tmp7 + z3;
z13 = tmp7 - z3;
dataptr[4] = z13 + z2;
dataptr[5] = z13 - z2;
dataptr[6] = z11 + z4;
dataptr[7] = z11 - z4;
pixels++; // advance pointer to next column
dataptr += DCTSIZE;
}
}

View File

@ -1,89 +0,0 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
* Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef AVFILTER_FSPPDSP_H
#define AVFILTER_FSPPDSP_H
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes_internal.h"
/*
 * Table of the DSP routines used by the fspp filter.  ff_fsppdsp_init()
 * fills every entry with the C implementation and, on x86, hands the table
 * to ff_fsppdsp_init_x86().
 */
typedef struct FSPPDSPContext {
/* Convert one accumulator slice to 8-bit pixels, clearing what it consumed. */
void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
/* Convert the sum of two accumulator slices to 8-bit pixels. */
void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
/* thr_adr[i] = q * thr_adr_noq[i] for the 64 matrix entries. */
void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */,
int16_t *restrict thr_adr /* align 16 */, int q);
/* Column pass: forward DCT, thresholding and inverse DCT into output. */
void (*column_fidct)(const int16_t *restrict thr_adr, const int16_t *restrict data,
int16_t *restrict output, int cnt);
/* Row pass of the inverse transform; results are added to output_adr. */
void (*row_idct)(const int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
/* Row pass of the forward transform from 8-bit pixels into data. */
void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
} FSPPDSPContext;
/* Everything between the PUSH/POP pair is declared under
 * FF_VISIBILITY_PUSH_HIDDEN (from libavutil/attributes_internal.h);
 * NOTE(review): presumably this gives the symbols hidden ELF visibility. */
FF_VISIBILITY_PUSH_HIDDEN
/* 8x8 dither table shared between the C and assembly code. */
extern const uint8_t ff_fspp_dither[8][8];
/* C implementations; these are the defaults that ff_fsppdsp_init() installs
 * into the corresponding FSPPDSPContext members. */
void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
int16_t *restrict output, int cnt);
void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
/* x86 hook; only called by ff_fsppdsp_init() when ARCH_X86 is set. */
void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
FF_VISIBILITY_POP_HIDDEN
/**
 * Initialise an fspp DSP function table.
 *
 * All members are first pointed at the C implementations; on x86 builds the
 * table is then passed to ff_fsppdsp_init_x86().
 *
 * @param fspp table to initialise; its previous contents are overwritten
 */
static inline void ff_fsppdsp_init(FSPPDSPContext *fspp)
{
    /* Every member of FSPPDSPContext is named below, so assigning the whole
     * struct is equivalent to setting the six pointers one at a time. */
    *fspp = (FSPPDSPContext) {
        .store_slice  = ff_store_slice_c,
        .store_slice2 = ff_store_slice2_c,
        .mul_thrmat   = ff_mul_thrmat_c,
        .column_fidct = ff_column_fidct_c,
        .row_idct     = ff_row_idct_c,
        .row_fdct     = ff_row_fdct_c,
    };

#if ARCH_X86
    ff_fsppdsp_init_x86(fspp);
#endif
}
#endif /* AVFILTER_FSPPDSP_H */

View File

@ -25,33 +25,36 @@
SECTION_RODATA
cextern fspp_dither
pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
pw_2: times 8 dw 2
pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
pw_4: times 4 dw 4
pw_2: times 4 dw 2
SECTION .text
%define DCTSIZE 8
INIT_XMM sse2
INIT_MMX mmx
;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
%if ARCH_X86_64
cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov widthq, r4m
@ -62,7 +65,7 @@ cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
movd m4, ditherd ; log2_scale
movd m5, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
@ -70,25 +73,33 @@ cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
sub tmp2q, widthq
movd m2, ditherd ; log2_scale
add tmp2q, tmp2q
lea ditherq, [fspp_dither]
lea ditherq, [pb_dither]
mov src_strideq, tmp2q
shl tmpq, 4
lea dither_heightq, [ditherq+dither_heightq*8]
pxor m1, m1
pxor m7, m7
.loop_height:
movq m3, [ditherq]
punpcklbw m3, m1
movq m4, m3
punpcklbw m3, m7
punpckhbw m4, m7
mov tmp2q, widthq
psraw m3, m4
psraw m3, m5
psraw m4, m5
.loop_width:
mova m0, [srcq]
mova [srcq+tmpq], m1
movq [srcq+tmpq], m7
movq m0, [srcq]
movq m1, [srcq+8]
movq [srcq+tmpq+8], m7
paddw m0, m3
mova [srcq], m1
paddw m1, m4
movq [srcq], m7
psraw m0, m2
packuswb m0, m0
psraw m1, m2
movq [srcq+8], m7
packuswb m0, m1
add srcq, 16
movq [dstq], m0
add dstq, 8
@ -102,13 +113,13 @@ cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
jl .loop_height
RET
;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
%if ARCH_X86_64
cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
mov dstq, dstm
@ -121,32 +132,41 @@ cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
mov tmpq, src_strideq
and widthq, ~7
sub dst_strideq, widthq
movd m4, ditherd ; log2_scale
movd m5, ditherd ; log2_scale
xor ditherq, -1 ; log2_scale
mov tmp2q, tmpq
add ditherq, 7 ; log2_scale
sub tmp2q, widthq
movd m2, ditherd ; log2_scale
add tmp2q, tmp2q
lea ditherq, [fspp_dither]
lea ditherq, [pb_dither]
mov src_strideq, tmp2q
shl tmpq, 5
lea dither_heightq, [ditherq+dither_heightq*8]
pxor m1, m1
pxor m7, m7
.loop_height:
movq m3, [ditherq]
punpcklbw m3, m1
movq m4, m3
punpcklbw m3, m7
punpckhbw m4, m7
mov tmp2q,widthq
psraw m3, m4
psraw m3, m5
psraw m4, m5
.loop_width:
mova m0, [srcq]
movq m0, [srcq]
movq m1, [srcq+8]
paddw m0, m3
paddw m0, [srcq+tmpq]
mova [srcq+tmpq], m1
paddw m1, m4
movq m6, [srcq+tmpq+8]
movq [srcq+tmpq], m7
psraw m0, m2
packuswb m0, m0
paddw m1, m6
movq [srcq+tmpq+8], m7
psraw m1, m2
packuswb m0, m1
movq [dstq], m0
add srcq, 16
add dstq, 8
@ -160,152 +180,164 @@ cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
jl .loop_height
RET
;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
movd m4, qd
mova m0, [thrnq]
punpcklwd m4, m4
mova m1, [thrnq+16]
pshufd m4, m4, 0
pmullw m0, m4
mova m2, [thrnq+16*2]
pmullw m1, m4
mova m3, [thrnq+16*3]
pmullw m2, m4
mova [thrq], m0
mova m0, [thrnq+16*4]
pmullw m3, m4
mova [thrq+16], m1
mova m1, [thrnq+16*5]
pmullw m0, m4
mova [thrq+16*2], m2
mova m2, [thrnq+16*6]
pmullw m1, m4
mova [thrq+16*3], m3
mova m3, [thrnq+16*7]
pmullw m2, m4
mova [thrq+16*4], m0
pmullw m3, m4
mova [thrq+16*5], m1
mova [thrq+16*6], m2
mova [thrq+16*7], m3
;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
movd m7, qd
movq m0, [thrnq]
punpcklwd m7, m7
movq m1, [thrnq+8]
punpckldq m7, m7
pmullw m0, m7
movq m2, [thrnq+8*2]
pmullw m1, m7
movq m3, [thrnq+8*3]
pmullw m2, m7
movq [thrq], m0
movq m4, [thrnq+8*4]
pmullw m3, m7
movq [thrq+8], m1
movq m5, [thrnq+8*5]
pmullw m4, m7
movq [thrq+8*2], m2
movq m6, [thrnq+8*6]
pmullw m5, m7
movq [thrq+8*3], m3
movq m0, [thrnq+8*7]
pmullw m6, m7
movq [thrq+8*4], m4
movq m1, [thrnq+8*7+8]
pmullw m0, m7
movq [thrq+8*5], m5
movq m2, [thrnq+8*7+8*2]
pmullw m1, m7
movq [thrq+8*6], m6
movq m3, [thrnq+8*7+8*3]
pmullw m2, m7
movq [thrq+8*7], m0
movq m4, [thrnq+8*7+8*4]
pmullw m3, m7
movq [thrq+8*7+8], m1
movq m5, [thrnq+8*7+8*5]
pmullw m4, m7
movq [thrq+8*7+8*2], m2
movq m6, [thrnq+8*7+8*6]
pmullw m5, m7
movq [thrq+8*7+8*3], m3
movq m0, [thrnq+14*8]
pmullw m6, m7
movq [thrq+8*7+8*4], m4
movq m1, [thrnq+14*8+8]
pmullw m0, m7
movq [thrq+8*7+8*5], m5
pmullw m1, m7
movq [thrq+8*7+8*6], m6
movq [thrq+14*8], m0
movq [thrq+14*8+8], m1
RET
%macro COLUMN_FDCT 1
mova m1, [srcq+DCTSIZE*0*2]
mova m7, [srcq+DCTSIZE*3*2]
mova m0, m1
%macro COLUMN_FDCT 1-3 0, 0
movq m1, [srcq+DCTSIZE*0*2]
movq m7, [srcq+DCTSIZE*3*2]
movq m0, m1
paddw m1, [srcq+DCTSIZE*7*2]
mova m3, m7
movq m3, m7
paddw m7, [srcq+DCTSIZE*4*2]
mova m5, m1
mova m6, [srcq+DCTSIZE*1*2]
movq m5, m1
movq m6, [srcq+DCTSIZE*1*2]
psubw m1, m7
mova m2, [srcq+DCTSIZE*2*2]
mova m4, m6
movq m2, [srcq+DCTSIZE*2*2]
movq m4, m6
paddw m6, [srcq+DCTSIZE*6*2]
paddw m5, m7
paddw m2, [srcq+DCTSIZE*5*2]
mova m7, m6
movq m7, m6
paddw m6, m2
psubw m7, m2
mova m2, m5
%if ARCH_X86_64
mova m8, [thrq]
%define THRQ m8
%else
%define THRQ [thrq]
%endif
movq m2, m5
paddw m5, m6
psubw m2, m6
paddw m7, m1
mova m6, [thrq+4*16]
psllw m7, 1
psubw m5, THRQ
movq m6, [thrq+4*16+%2]
psllw m7, 2
psubw m5, [thrq+%2]
psubw m2, m6
paddusw m5, THRQ
paddusw m5, [thrq+%2]
paddusw m2, m6
pmulhw m7, SQRT2
paddw m5, THRQ
pmulhw m7, [pw_2D41]
paddw m5, [thrq+%2]
paddw m2, m6
psubusw m5, THRQ
psubusw m5, [thrq+%2]
psubusw m2, m6
paddw m5, [pw_2]
mova m6, m2
movq m6, m2
paddw m2, m5
%if ARCH_X86_64
mova m8, [thrq+2*16]
%define THRQ m8
%else
%define THRQ [thrq+2*16]
%endif
psubw m5, m6
mova m6, m1
movq m6, m1
paddw m1, m7
psubw m1, THRQ
psubw m1, [thrq+2*16+%2]
psubw m6, m7
mova m7, [thrq+6*16]
movq m7, [thrq+6*16+%2]
psraw m5, 2
paddusw m1, THRQ
paddusw m1, [thrq+2*16+%2]
psubw m6, m7
paddw m1, THRQ
paddw m1, [thrq+2*16+%2]
paddusw m6, m7
psubusw m1, THRQ
psubusw m1, [thrq+2*16+%2]
paddw m6, m7
psubw m3, [srcq+DCTSIZE*4*2]
psubusw m6, m7
mova m7, m1
movq m7, m1
psraw m2, 2
psubw m4, [srcq+DCTSIZE*6*2]
psubw m1, m6
psubw m0, [srcq+DCTSIZE*7*2]
paddw m6, m7
psraw m6, 2
mova m7, m2
pmulhw m1, SQRT2
movq m7, m2
pmulhw m1, [pw_5A82]
paddw m2, m6
mova tmp0, m2
movq [rsp], m2
psubw m7, m6
mova m2, [srcq+DCTSIZE*2*2]
movq m2, [srcq+DCTSIZE*2*2]
psubw m1, m6
psubw m2, [srcq+DCTSIZE*5*2]
mova m6, m5
mova tmp3, m7
movq m6, m5
movq [rsp+8*3], m7
paddw m3, m2
paddw m2, m4
paddw m4, m0
mova m7, m3
movq m7, m3
psubw m3, m4
psllw m7, 1
pmulhw m3, [pw_61F8]
psllw m3, 2
psllw m7, 2
pmulhw m3, [pw_187E]
psllw m4, 2
add srcq, 32
pmulhw m7, [pw_4546]
psllw m2, 1
pmulhw m7, [pw_22A3]
psllw m2, 2
pmulhw m4, [pw_539F]
paddw m5, m1
pmulhw m2, SQRT2
pmulhw m2, [pw_2D41]
psubw m6, m1
paddw m7, m3
mova tmp1, m5
movq [rsp+8], m5
paddw m4, m3
mova m3, [thrq+3*16]
mova m1, m0
mova tmp2, m6
movq m3, [thrq+3*16+%2]
movq m1, m0
movq [rsp+8*2], m6
psubw m1, m2
paddw m0, m2
mova m5, m1
mova m2, [thrq+5*16]
movq m5, m1
movq m2, [thrq+5*16+%2]
psubw m1, m7
paddw m5, m7
psubw m1, m3
mova m7, [thrq+16]
movq m7, [thrq+16+%2]
psubw m5, m2
mova m6, m0
movq m6, m0
paddw m0, m4
paddusw m1, m3
psubw m6, m4
mova m4, [thrq+7*16]
movq m4, [thrq+7*16+%2]
psubw m0, m7
psubw m6, m4
paddusw m5, m2
@ -316,149 +348,139 @@ cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
psubusw m1, m3
psubusw m5, m2
psubusw m6, m4
mova m4, m1
movq m4, m1
por m4, m5
paddusw m0, m7
por m4, m6
paddw m0, m7
packssdw m4, m4
psubusw m0, m7
%if ARCH_X86_64
movq tmpq, m4
%else
packssdw m4, m4
movd tmpd, m4
%endif
or tmpq, tmpq
movd tmpd, m4
or tmpd, tmpd
jnz %1
mova m4, tmp0
psraw m3, m0, 2
mova m5, [outq+DCTSIZE*0*2]
pmulhw m1, m0, [pw_7642]
pmulhw m2, m0, [pw_4546]
pmulhw m0, SQRT2
movq m4, [rsp]
movq m1, m0
pmulhw m0, [pw_3642]
movq m2, m1
movq m5, [outq+DCTSIZE*0*2]
movq m3, m2
pmulhw m1, [pw_2441]
paddw m5, m4
mova m6, tmp1
psubw m2, m1
movq m6, [rsp+8]
psraw m3, 2
pmulhw m2, [pw_0CBB]
psubw m4, m3
mova m7, [outq+DCTSIZE*1*2]
movq m7, [outq+DCTSIZE*1*2]
paddw m5, m3
psubw m1, m3
mova [outq+DCTSIZE*7*2], m4
psubw m0, m1
paddw m2, m0
mova [outq+DCTSIZE*0*2], m5
movq [outq+DCTSIZE*7*2], m4
paddw m7, m6
mova m3, tmp2
psubw m6, m1
mova m4, [outq+DCTSIZE*2*2]
paddw m7, m1
mova [outq], m5
paddw m4, m3
mova [outq+DCTSIZE*6*2], m6
psubw m3, m0
mova m5, [outq+DCTSIZE*5*2]
paddw m4, m0
mova m6, [outq+DCTSIZE*3*2]
paddw m5, m3
mova m0, tmp3
mova [outq+DCTSIZE*1*2], m7
paddw m6, m0
mova [outq+DCTSIZE*2*2], m4
paddw m0, m2
mova m7, [outq+DCTSIZE*4*2]
psubw m6, m2
mova [outq+DCTSIZE*5*2], m5
movq m3, [rsp+8*2]
psubw m6, m0
movq m4, [outq+DCTSIZE*2*2]
paddw m7, m0
mova [outq+DCTSIZE*3*2], m6
mova [outq+DCTSIZE*4*2], m7
add outq, 32
movq [outq], m5
paddw m4, m3
movq [outq+DCTSIZE*6*2], m6
psubw m3, m1
movq m5, [outq+DCTSIZE*5*2]
paddw m4, m1
movq m6, [outq+DCTSIZE*3*2]
paddw m5, m3
movq m0, [rsp+8*3]
add srcq, 8+%3
movq [outq+DCTSIZE*1*2], m7
paddw m6, m0
movq [outq+DCTSIZE*2*2], m4
psubw m0, m2
movq m7, [outq+DCTSIZE*4*2]
paddw m6, m2
movq [outq+DCTSIZE*5*2], m5
paddw m7, m0
movq [outq+DCTSIZE*3*2], m6
movq [outq+DCTSIZE*4*2], m7
add outq, 8+%3
%endmacro
%macro COLUMN_IDCT 0
mova m3, m5
%macro COLUMN_IDCT 0-1 0
movq m3, m5
psubw m5, m1
psllw m5, 1
paddw m3, m1
mova m2, m0
movq m2, m0
psubw m0, m6
psllw m1, m5, 1
movq m1, m5
psllw m0, 1
pmulhw m1, [pw_AC62]
paddw m5, m0
pmulhw m5, [pw_7642]
pmulhw m5, [pw_3B21]
paddw m2, m6
pmulhw m0, [pw_4546]
mova m7, m2
mova m4, tmp0
pmulhw m0, [pw_22A3]
movq m7, m2
movq m4, [rsp]
psubw m2, m3
psllw m2, 1
paddw m7, m3
pmulhw m2, SQRT2
mova m6, m4
pmulhw m2, [pw_2D41]
movq m6, m4
psraw m7, 2
paddw m4, [outq]
psubw m6, m7
mova m3, tmp1
movq m3, [rsp+8]
paddw m4, m7
mova [outq+DCTSIZE*7*2], m6
movq [outq+DCTSIZE*7*2], m6
paddw m1, m5
mova [outq], m4
movq [outq], m4
psubw m1, m7
mova m7, tmp2
movq m7, [rsp+8*2]
psubw m0, m5
mova m6, tmp3
mova m5, m3
movq m6, [rsp+8*3]
movq m5, m3
paddw m3, [outq+DCTSIZE*1*2]
psubw m5, m1
psubw m2, m1
paddw m3, m1
mova [outq+DCTSIZE*6*2], m5
mova m4, m7
movq [outq+DCTSIZE*6*2], m5
movq m4, m7
paddw m7, [outq+DCTSIZE*2*2]
psubw m4, m2
paddw m4, [outq+DCTSIZE*5*2]
paddw m7, m2
mova [outq+DCTSIZE*1*2], m3
movq [outq+DCTSIZE*1*2], m3
paddw m0, m2
mova [outq+DCTSIZE*2*2], m7
mova m1, m6
movq [outq+DCTSIZE*2*2], m7
movq m1, m6
paddw m6, [outq+DCTSIZE*4*2]
psubw m1, m0
paddw m1, [outq+DCTSIZE*3*2]
paddw m6, m0
mova [outq+DCTSIZE*5*2], m4
mova [outq+DCTSIZE*4*2], m6
mova [outq+DCTSIZE*3*2], m1
add outq, 32
movq [outq+DCTSIZE*5*2], m4
add srcq, 8+%1
movq [outq+DCTSIZE*4*2], m6
movq [outq+DCTSIZE*3*2], m1
add outq, 8+%1
%endmacro
;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
cglobal column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out, cnt, tmp
%if ARCH_X86_64
%define tmp0 m8
%define tmp1 m9
%define tmp2 m10
%define tmp3 m11
%define SQRT2 m12
mova m12, [pw_5A82]
%else
%define tmp0 [rsp]
%define tmp1 [rsp+16]
%define tmp2 [rsp+2*16]
%define tmp3 [rsp+3*16]
%define SQRT2 [pw_5A82]
%endif
.fdct:
COLUMN_FDCT .idct
sub cntd, 2
jg .fdct
RET
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
COLUMN_FDCT .idct1
jmp .fdct2
.idct:
.idct1:
COLUMN_IDCT
.fdct2:
COLUMN_FDCT .idct2, 8, 16
sub cntd, 2
jg .fdct
jg .fdct1
RET
.idct2:
COLUMN_IDCT 16
sub cntd, 2
jg .fdct1
RET
INIT_MMX mmx
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
add strideq, strideq

View File

@ -21,31 +21,29 @@
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_fsppdsp.h"
#include "libavfilter/vf_fspp.h"
void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
av_cold void ff_fspp_init_x86(FSPPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
s->store_slice = ff_store_slice_mmx;
s->store_slice2 = ff_store_slice2_mmx;
s->mul_thrmat = ff_mul_thrmat_mmx;
s->column_fidct = ff_column_fidct_mmx;
s->row_idct = ff_row_idct_mmx;
s->row_fdct = ff_row_fdct_mmx;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->store_slice = ff_store_slice_sse2;
s->store_slice2 = ff_store_slice2_sse2;
s->mul_thrmat = ff_mul_thrmat_sse2;
s->column_fidct = ff_column_fidct_sse2;
}
}

View File

@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src,
}
}
#endif /* HAVE_SSE2_INLINE */
#endif /* HAVE_MMX_INLINE */
av_cold void ff_spp_init_x86(SPPContext *s)
{

View File

@ -64,7 +64,6 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o
AVFILTEROBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o
AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o
AVFILTEROBJS-$(CONFIG_IDET_FILTER) += vf_idet.o

View File

@ -297,9 +297,6 @@ static const struct {
#if CONFIG_EQ_FILTER
{ "vf_eq", checkasm_check_vf_eq },
#endif
#if CONFIG_FSPP_FILTER
{ "vf_fspp", checkasm_check_vf_fspp },
#endif
#if CONFIG_GBLUR_FILTER
{ "vf_gblur", checkasm_check_vf_gblur },
#endif

View File

@ -148,7 +148,6 @@ void checkasm_check_v210enc(void);
void checkasm_check_vc1dsp(void);
void checkasm_check_vf_bwdif(void);
void checkasm_check_vf_eq(void);
void checkasm_check_vf_fspp(void);
void checkasm_check_vf_gblur(void);
void checkasm_check_vf_hflip(void);
void checkasm_check_vf_threshold(void);

View File

@ -1,170 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <stddef.h>
#include <stdint.h>
#include "checkasm.h"
#include "libavfilter/vf_fsppdsp.h"
#include "libavcodec/mathops.h"
#include "libavutil/mem_internal.h"
/*
 * Buffer-filling helpers for the tests below.
 *
 * All macro arguments are parenthesised so that arbitrary expressions
 * (e.g. a conditional expression passed as max) expand with the intended
 * precedence. The loop index j is internal to each macro; arguments must
 * not refer to a caller variable named j.
 */

/* Fill every element of the array buf with a fresh rnd() value.
 * buf must be a real array (FF_ARRAY_ELEMS is applied to it). */
#define randomize_buffers(buf)                                   \
    do {                                                         \
        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j)         \
            (buf)[j] = rnd();                                    \
    } while (0)

/* Fill the first nb_elems elements of buf and buf2 with identical values:
 * rnd() sign-extended from nb_bits bits. */
#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)     \
    do {                                                         \
        for (size_t j = 0; j < (nb_elems); ++j)                  \
            (buf)[j] = (buf2)[j] = sign_extend(rnd(), (nb_bits));\
    } while (0)

/* Fill every element of the array buf with a value in [min, max].
 * The remainder is converted to int before min is added, so a negative
 * min is handled with signed arithmetic instead of relying on unsigned
 * wrap-around followed by narrowing. */
#define randomize_buffer_range(buf, min, max)                    \
    do {                                                         \
        for (size_t j = 0; j < FF_ARRAY_ELEMS(buf); ++j)         \
            (buf)[j] = (min) + (int)(rnd() % ((max) - (min) + 1)); \
    } while (0)
/*
 * Test store_slice (i == 0) and store_slice2 (i == 1) against their
 * reference implementations with a random width, height, strides and
 * log2_scale. Both the 8-bit destination and the complete 16-bit source
 * buffers are compared afterwards, because these routines also write to src.
 */
static void check_store_slice(void)
{
enum {
MAX_WIDTH = 256,
/// in elements, not in bytes; 32 is arbitrary
MAX_STRIDE = MAX_WIDTH + 32,
MAX_HEIGHT = 8,
};
FSPPDSPContext fspp;
ff_fsppdsp_init(&fspp);
declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
for (int i = 0; i < 2; ++i) {
if (check_func(i ? fspp.store_slice2 : fspp.store_slice, "store_slice%s", i ? "2" : "")) {
// store slice resets the row eight lines above the current one
DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
// store_slice2 resets the row 16 lines below the current one
DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH], dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH];
uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
// src_* are the pointers handed to the functions, or_src_* the buffer starts
int16_t *src_ref, *src_new, *or_src_ref, *or_src_new;
ptrdiff_t width = 1 + rnd() % MAX_WIDTH;
// strides are at least width + 1, at most MAX_STRIDE, and a multiple of 8 elements
ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
ptrdiff_t height = 1 + rnd() % 8;
size_t nb_elems;
if (i) {
// store_slice2: start at the buffer start, the extra 16 rows lie after the slice
src_ref = src_ref2;
src_new = src_new2;
or_src_ref = src_ref2;
or_src_new = src_new2;
nb_elems = FF_ARRAY_ELEMS(src_ref2);
} else {
// store_slice: skip eight rows so that the rows above the slice are inside the buffer
src_ref = src_ref1 + 8 * src_stride;
src_new = src_new1 + 8 * src_stride;
or_src_ref = src_ref1;
or_src_new = src_new1;
nb_elems = FF_ARRAY_ELEMS(src_ref1);
}
// randomly exercise a negative destination stride (start at the last row)
if (rnd() & 1) {
dst_ref += dst_stride * (height - 1);
dst_new += dst_stride * (height - 1);
dst_stride *= -1;
}
// identical random contents in both destination buffers ...
randomize_buffers(dstbuf_new);
memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref));
// ... and identical 14-bit sign-extended values in both source buffers
randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14);
ptrdiff_t log2_scale = rnd() & 1;
call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height, log2_scale);
call_new(dst_new, src_new, dst_stride, src_stride, width, height, log2_scale);
// compare the whole buffers so that stray writes outside the slice are caught too
if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) ||
memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems))
fail();
// don't use random parameters for benchmarks
src_ref = or_src_ref + !i * 8 * MAX_STRIDE;
bench_new(dstbuf_new, src_ref,
MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1);
}
}
}
static void check_mul_thrmat(void)
{
FSPPDSPContext fspp;
DECLARE_ALIGNED(16, int16_t, src)[64];
DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
DECLARE_ALIGNED(16, int16_t, dst_new)[64];
const int q = (uint8_t)rnd();
declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
ff_fsppdsp_init(&fspp);
if (check_func(fspp.mul_thrmat, "mul_thrmat")) {
randomize_buffers(src);
call_ref(src, dst_ref, q);
call_new(src, dst_new, q);
if (memcmp(dst_ref, dst_new, sizeof(dst_ref)))
fail();
bench_new(src, dst_new, q);
}
}
static void check_column_fidct(void)
{
enum {
NB_BLOCKS = 8, ///< arbitrary
};
FSPPDSPContext fspp;
declare_func(void, const int16_t *thr_adr, const int16_t *data,
int16_t *output, int cnt);
ff_fsppdsp_init(&fspp);
if (check_func(fspp.column_fidct, "column_fidct")) {
DECLARE_ALIGNED(16, int16_t, threshold)[64];
DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)];
DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)];
DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)];
randomize_buffer_range(threshold, 0, INT16_MAX);
randomize_buffer_range(src, -1284, 1284);
randomize_buffers(dst_new);
memcpy(dst_ref, dst_new, sizeof(dst_ref));
call_ref(threshold, src, dst_ref, NB_BLOCKS * 8);
call_new(threshold, src, dst_new, NB_BLOCKS * 8);
if (memcmp(dst_new, dst_ref, sizeof(dst_new)))
fail();
bench_new(threshold, src, dst_new, NB_BLOCKS * 8);
}
}
void checkasm_check_vf_fspp(void)
{
check_store_slice();
check_mul_thrmat();
check_column_fidct();
}

View File

@ -55,17 +55,13 @@ configure()(
${cross_prefix:+--cross-prefix="$cross_prefix"} \
${as:+--as="$as"} \
${cc:+--cc="$cc"} \
${cxx:+--cxx="$cxx"} \
${ld:+--ld="$ld"} \
${nm:+--nm="$nm"} \
${target_os:+--target-os="$target_os"} \
${sysroot:+--sysroot="$sysroot"} \
${target_exec:+--target-exec="$target_exec"} \
${target_path:+--target-path="$target_path"} \
${target_samples:+--target-samples="$target_samples"} \
${extra_cflags:+--extra-cflags="$extra_cflags"} \
${extra_cxxflags:+--extra-cxxflags="$extra_cxxflags"} \
${extra_objcflags:+--extra-objcflags="$extra_objcflags"} \
${extra_ldflags:+--extra-ldflags="$extra_ldflags"} \
${extra_libs:+--extra-libs="$extra_libs"} \
${extra_conf}

View File

@ -67,7 +67,6 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \
fate-checkasm-vf_colordetect \
fate-checkasm-vf_colorspace \
fate-checkasm-vf_eq \
fate-checkasm-vf_fspp \
fate-checkasm-vf_gblur \
fate-checkasm-vf_hflip \
fate-checkasm-vf_nlmeans \