mirror of https://github.com/FFmpeg/FFmpeg.git
Compare commits
25 Commits
57d6898730
...
9b2162275b
| Author | SHA1 | Date |
|---|---|---|
|
|
9b2162275b | |
|
|
523d688c2b | |
|
|
ddf443f1e9 | |
|
|
f8bcea4946 | |
|
|
0c556a6b09 | |
|
|
778ff97efa | |
|
|
f442145729 | |
|
|
c0648b2004 | |
|
|
06b0dae51b | |
|
|
cc97f1e276 | |
|
|
3cd452cbf1 | |
|
|
ddd74276f8 | |
|
|
68b11cde82 | |
|
|
63493bf0e0 | |
|
|
66af18d06a | |
|
|
1049a5fba8 | |
|
|
d19050a1ae | |
|
|
ff85a20b7d | |
|
|
570f8fc6c9 | |
|
|
e042f17e99 | |
|
|
52ba2ac7bd | |
|
|
70eb8a76a9 | |
|
|
9f4d5d818d | |
|
|
1699de0955 | |
|
|
9b34088c4d |
|
|
@ -4968,6 +4968,7 @@ armasm_flags(){
|
|||
# Filter out MSVC cl.exe options from cflags that shouldn't
|
||||
# be passed to gas-preprocessor
|
||||
-M[TD]*) ;;
|
||||
-guard:signret) ;;
|
||||
*) echo $flag ;;
|
||||
esac
|
||||
done
|
||||
|
|
|
|||
|
|
@ -329,7 +329,7 @@ OBJS-$(CONFIG_FRAMESTEP_FILTER) += vf_framestep.o
|
|||
OBJS-$(CONFIG_FREEZEDETECT_FILTER) += vf_freezedetect.o
|
||||
OBJS-$(CONFIG_FREEZEFRAMES_FILTER) += vf_freezeframes.o
|
||||
OBJS-$(CONFIG_FREI0R_FILTER) += vf_frei0r.o
|
||||
OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o qp_table.o
|
||||
OBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o vf_fsppdsp.o qp_table.o
|
||||
OBJS-$(CONFIG_FSYNC_FILTER) += vf_fsync.o
|
||||
OBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
|
||||
OBJS-$(CONFIG_GBLUR_VULKAN_FILTER) += vf_gblur_vulkan.o vulkan.o vulkan_filter.o
|
||||
|
|
|
|||
|
|
@ -41,12 +41,41 @@
|
|||
#include "libavutil/mem_internal.h"
|
||||
#include "libavutil/opt.h"
|
||||
#include "libavutil/pixdesc.h"
|
||||
#include "libavutil/video_enc_params.h"
|
||||
|
||||
#include "avfilter.h"
|
||||
#include "filters.h"
|
||||
#include "qp_table.h"
|
||||
#include "vf_fspp.h"
|
||||
#include "vf_fsppdsp.h"
|
||||
#include "video.h"
|
||||
|
||||
#define BLOCKSZ 12
|
||||
#define MAX_LEVEL 5
|
||||
|
||||
/*
 * Private state of the fspp filter instance.
 * Option fields are addressed through OFFSET(x) = offsetof(FSPPContext, x).
 */
typedef struct FSPPContext {
|
||||
/* AVClass pointer for the class created by AVFILTER_DEFINE_CLASS(fspp) */
const struct AVClass *class;
|
||||
|
||||
/* filter() passes 5 - log2_count as log2_scale to the store_slice functions */
int log2_count;
|
||||
/* added to (1 << 4) to form the bias that scales custom_threshold[] */
int strength;
|
||||
/* multiplied by !is_luma and subtracted from 4 to get the horizontal QP-table shift */
int hsub;
|
||||
/* same as hsub, for the vertical QP-table shift */
int vsub;
|
||||
/* NOTE(review): presumably the row stride of src/temp -- confirm in config_input() */
int temp_stride;
|
||||
/* when non-zero, threshold_mtx is built once per frame from this value
 * and filter() skips the per-block QP lookup */
int qp;
|
||||
/* passed to ff_norm_qscale() together with each QP read from the table */
enum AVVideoEncParamsType qscale_type;
|
||||
/* quantizer that threshold_mtx was last built for; rebuilt only when it changes */
int prev_q;
|
||||
/* work buffers; config_input() returns ENOMEM when either is NULL */
uint8_t *src;
|
||||
int16_t *temp;
|
||||
/* NOTE(review): per the comment in filter_frame(), the QP table saved from the
 * last non-B frame and its stride -- confirm against the code using them */
int8_t *non_b_qp_table;
|
||||
int non_b_qp_stride;
|
||||
/* NOTE(review): presumably selects whether B-frame quantizers are used -- confirm */
int use_bframe_qp;
|
||||
|
||||
/* DSP function table filled in by ff_fsppdsp_init() */
FSPPDSPContext dsp;
|
||||
|
||||
/* custom_threshold[] scaled by bias / 71.0, computed in filter_frame() */
DECLARE_ALIGNED(16, int16_t, threshold_mtx_noq)[8 * 8];
|
||||
/* threshold_mtx_noq multiplied by the current quantizer via dsp.mul_thrmat() */
DECLARE_ALIGNED(16, int16_t, threshold_mtx)[8 * 8];
|
||||
} FSPPContext;
|
||||
|
||||
|
||||
#define OFFSET(x) offsetof(FSPPContext, x)
|
||||
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
|
||||
static const AVOption fspp_options[] = {
|
||||
|
|
@ -59,98 +88,22 @@ static const AVOption fspp_options[] = {
|
|||
|
||||
AVFILTER_DEFINE_CLASS(fspp);
|
||||
|
||||
/*
 * 8x8 dither matrix containing each value 0..63 exactly once.
 * store_slice_c() and store_slice2_c() pick row [y] and add
 * (entry >> log2_scale) to each sample before the final down-shift.
 */
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
|
||||
{ 0, 48, 12, 60, 3, 51, 15, 63, },
|
||||
{ 32, 16, 44, 28, 35, 19, 47, 31, },
|
||||
{ 8, 56, 4, 52, 11, 59, 7, 55, },
|
||||
{ 40, 24, 36, 20, 43, 27, 39, 23, },
|
||||
{ 2, 50, 14, 62, 1, 49, 13, 61, },
|
||||
{ 34, 18, 46, 30, 33, 17, 45, 29, },
|
||||
{ 10, 58, 6, 54, 9, 57, 5, 53, },
|
||||
{ 42, 26, 38, 22, 41, 25, 37, 21, },
|
||||
};
|
||||
|
||||
static const short custom_threshold[64] = {
|
||||
// values (296) can't be too high
|
||||
// -it causes too big quant dependence
|
||||
// or maybe overflow(check), which results in some flashing
|
||||
71, 296, 295, 237, 71, 40, 38, 19,
|
||||
245, 193, 185, 121, 102, 73, 53, 27,
|
||||
158, 129, 141, 107, 97, 73, 50, 26,
|
||||
102, 116, 109, 98, 82, 66, 45, 23,
|
||||
71, 94, 95, 81, 70, 56, 38, 20,
|
||||
56, 77, 74, 66, 56, 44, 30, 15,
|
||||
38, 53, 50, 45, 38, 30, 21, 11,
|
||||
20, 27, 26, 23, 20, 15, 11, 5
|
||||
// reorder coefficients to the order in which columns are processed
|
||||
#define REORDER(a,b,c,d,e,f,g,h) c, g, a, e, f, d, b, h
|
||||
REORDER( 71, 296, 295, 237, 71, 40, 38, 19),
|
||||
REORDER(245, 193, 185, 121, 102, 73, 53, 27),
|
||||
REORDER(158, 129, 141, 107, 97, 73, 50, 26),
|
||||
REORDER(102, 116, 109, 98, 82, 66, 45, 23),
|
||||
REORDER( 71, 94, 95, 81, 70, 56, 38, 20),
|
||||
REORDER( 56, 77, 74, 66, 56, 44, 30, 15),
|
||||
REORDER( 38, 53, 50, 45, 38, 30, 21, 11),
|
||||
REORDER( 20, 27, 26, 23, 20, 15, 11, 5)
|
||||
};
|
||||
|
||||
//This func reads from 1 slice, 1 and clears 0 & 1
|
||||
static void store_slice_c(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
{
|
||||
int y, x;
|
||||
#define STORE(pos) \
|
||||
temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
|
||||
src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
|
||||
if (temp & 0x100) temp = ~(temp >> 31); \
|
||||
dst[x + pos] = temp;
|
||||
|
||||
for (y = 0; y < height; y++) {
|
||||
const uint8_t *d = dither[y];
|
||||
for (x = 0; x < width; x += 8) {
|
||||
int temp;
|
||||
STORE(0);
|
||||
STORE(1);
|
||||
STORE(2);
|
||||
STORE(3);
|
||||
STORE(4);
|
||||
STORE(5);
|
||||
STORE(6);
|
||||
STORE(7);
|
||||
}
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
//This func reads from 2 slices, 0 & 2 and clears 2-nd
|
||||
static void store_slice2_c(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
{
|
||||
int y, x;
|
||||
#define STORE2(pos) \
|
||||
temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
|
||||
src[x + pos + 16 * src_stride] = 0; \
|
||||
if (temp & 0x100) temp = ~(temp >> 31); \
|
||||
dst[x + pos] = temp;
|
||||
|
||||
for (y = 0; y < height; y++) {
|
||||
const uint8_t *d = dither[y];
|
||||
for (x = 0; x < width; x += 8) {
|
||||
int temp;
|
||||
STORE2(0);
|
||||
STORE2(1);
|
||||
STORE2(2);
|
||||
STORE2(3);
|
||||
STORE2(4);
|
||||
STORE2(5);
|
||||
STORE2(6);
|
||||
STORE2(7);
|
||||
}
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Build the quantizer-dependent threshold matrix:
 * thr_adr[i] = q * thr_adr_noq[i] for all 64 entries.
 * The product is stored back into int16_t, so it is truncated to 16 bits.
 */
static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
{
    const int16_t *in  = thr_adr_noq;
    int16_t       *out = thr_adr;
    int remaining      = 64;

    while (remaining-- > 0)
        *out++ = q * *in++;
}
|
||||
|
||||
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
|
||||
int dst_stride, int src_stride,
|
||||
int width, int height,
|
||||
|
|
@ -163,9 +116,9 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
|
|||
const int qpsh = 4 - p->hsub * !is_luma;
|
||||
const int qpsv = 4 - p->vsub * !is_luma;
|
||||
|
||||
DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
|
||||
int16_t *block = (int16_t *)block_align;
|
||||
int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
|
||||
DECLARE_ALIGNED(16, int16_t, block_align)[8 * 8 * BLOCKSZ + 8 * 8 * BLOCKSZ];
|
||||
int16_t *block = block_align;
|
||||
int16_t *block3 = block_align + 8 * 8 * BLOCKSZ;
|
||||
|
||||
memset(block3, 0, 4 * 8 * BLOCKSZ);
|
||||
|
||||
|
|
@ -197,13 +150,13 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
|
|||
if (qy < 0) qy = 0;
|
||||
|
||||
qy = (qy >> qpsv) * qp_stride;
|
||||
p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
|
||||
p->dsp.row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
|
||||
|
||||
for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
|
||||
p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
|
||||
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
|
||||
|
||||
if (p->qp)
|
||||
p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
|
||||
p->dsp.column_fidct(p->threshold_mtx, block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
|
||||
else
|
||||
for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
|
||||
t = x + x0 - 2; //correct t=x+x0-2-(y&1), but its the same
|
||||
|
|
@ -213,288 +166,45 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
|
|||
t = qp_store[qy + (t >> qpsh)];
|
||||
t = ff_norm_qscale(t, p->qscale_type);
|
||||
|
||||
if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
|
||||
p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
|
||||
if (t != p->prev_q) {
|
||||
p->prev_q = t;
|
||||
p->dsp.mul_thrmat(p->threshold_mtx_noq, p->threshold_mtx, t);
|
||||
}
|
||||
p->dsp.column_fidct(p->threshold_mtx, block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
|
||||
}
|
||||
p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
|
||||
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
|
||||
memmove(block, block + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
|
||||
memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
|
||||
}
|
||||
|
||||
es = width + 8 - x0; // 8, ...
|
||||
if (es > 8)
|
||||
p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
|
||||
p->dsp.row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
|
||||
|
||||
p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
|
||||
p->dsp.column_fidct(p->threshold_mtx, block, block3, es&(~1));
|
||||
if (es > 3)
|
||||
p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
|
||||
p->dsp.row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
|
||||
|
||||
if (!(y1 & 7) && y1) {
|
||||
if (y1 & 8)
|
||||
p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
|
||||
dst_stride, stride, width, 8, 5 - p->log2_count);
|
||||
p->dsp.store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
|
||||
dst_stride, stride, width, 8, 5 - p->log2_count);
|
||||
else
|
||||
p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
|
||||
dst_stride, stride, width, 8, 5 - p->log2_count);
|
||||
p->dsp.store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
|
||||
dst_stride, stride, width, 8, 5 - p->log2_count);
|
||||
}
|
||||
}
|
||||
|
||||
if (y & 7) { // height % 8 != 0
|
||||
if (y & 8)
|
||||
p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
|
||||
dst_stride, stride, width, y&7, 5 - p->log2_count);
|
||||
p->dsp.store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
|
||||
dst_stride, stride, width, y&7, 5 - p->log2_count);
|
||||
else
|
||||
p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
|
||||
p->dsp.store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
|
||||
dst_stride, stride, width, y&7, 5 - p->log2_count);
|
||||
}
|
||||
}
|
||||
|
||||
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
|
||||
{
|
||||
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
int_simd16_t tmp10, tmp11, tmp12, tmp13;
|
||||
int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
|
||||
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
|
||||
|
||||
int16_t *dataptr;
|
||||
int16_t *wsptr;
|
||||
int16_t *threshold;
|
||||
int ctr;
|
||||
|
||||
dataptr = data;
|
||||
wsptr = output;
|
||||
|
||||
for (; cnt > 0; cnt -= 2) { //start positions
|
||||
threshold = (int16_t *)thr_adr;//threshold_mtx
|
||||
for (ctr = DCTSIZE; ctr > 0; ctr--) {
|
||||
// Process columns from input, add to output.
|
||||
tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
|
||||
tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
|
||||
|
||||
tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
|
||||
tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
|
||||
|
||||
tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
|
||||
tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
|
||||
|
||||
tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
|
||||
tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
|
||||
|
||||
// Even part of FDCT
|
||||
|
||||
tmp10 = tmp0 + tmp3;
|
||||
tmp13 = tmp0 - tmp3;
|
||||
tmp11 = tmp1 + tmp2;
|
||||
tmp12 = tmp1 - tmp2;
|
||||
|
||||
d0 = tmp10 + tmp11;
|
||||
d4 = tmp10 - tmp11;
|
||||
|
||||
z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
|
||||
d2 = tmp13 + z1;
|
||||
d6 = tmp13 - z1;
|
||||
|
||||
// Even part of IDCT
|
||||
|
||||
THRESHOLD(tmp0, d0, threshold[0 * 8]);
|
||||
THRESHOLD(tmp1, d2, threshold[2 * 8]);
|
||||
THRESHOLD(tmp2, d4, threshold[4 * 8]);
|
||||
THRESHOLD(tmp3, d6, threshold[6 * 8]);
|
||||
tmp0 += 2;
|
||||
tmp10 = (tmp0 + tmp2) >> 2;
|
||||
tmp11 = (tmp0 - tmp2) >> 2;
|
||||
|
||||
tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
|
||||
tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
|
||||
|
||||
tmp0 = tmp10 + tmp13; //->temps
|
||||
tmp3 = tmp10 - tmp13; //->temps
|
||||
tmp1 = tmp11 + tmp12; //->temps
|
||||
tmp2 = tmp11 - tmp12; //->temps
|
||||
|
||||
// Odd part of FDCT
|
||||
|
||||
tmp10 = tmp4 + tmp5;
|
||||
tmp11 = tmp5 + tmp6;
|
||||
tmp12 = tmp6 + tmp7;
|
||||
|
||||
z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
|
||||
z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
|
||||
z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
|
||||
z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
|
||||
|
||||
z11 = tmp7 + z3;
|
||||
z13 = tmp7 - z3;
|
||||
|
||||
d5 = z13 + z2;
|
||||
d3 = z13 - z2;
|
||||
d1 = z11 + z4;
|
||||
d7 = z11 - z4;
|
||||
|
||||
// Odd part of IDCT
|
||||
|
||||
THRESHOLD(tmp4, d1, threshold[1 * 8]);
|
||||
THRESHOLD(tmp5, d3, threshold[3 * 8]);
|
||||
THRESHOLD(tmp6, d5, threshold[5 * 8]);
|
||||
THRESHOLD(tmp7, d7, threshold[7 * 8]);
|
||||
|
||||
//Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
|
||||
z13 = tmp6 + tmp5;
|
||||
z10 = (tmp6 - tmp5) << 1;
|
||||
z11 = tmp4 + tmp7;
|
||||
z12 = (tmp4 - tmp7) << 1;
|
||||
|
||||
tmp7 = (z11 + z13) >> 2; //+2 !
|
||||
tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
|
||||
z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
|
||||
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
|
||||
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
|
||||
|
||||
tmp6 = tmp12 - tmp7;
|
||||
tmp5 = tmp11 - tmp6;
|
||||
tmp4 = tmp10 + tmp5;
|
||||
|
||||
wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
|
||||
wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
|
||||
wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
|
||||
wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
|
||||
wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
|
||||
wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
|
||||
wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
|
||||
wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
|
||||
//
|
||||
dataptr++; //next column
|
||||
wsptr++;
|
||||
threshold++;
|
||||
}
|
||||
dataptr += 8; //skip each second start pos
|
||||
wsptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
|
||||
{
|
||||
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
int_simd16_t tmp10, tmp11, tmp12, tmp13;
|
||||
int_simd16_t z5, z10, z11, z12, z13;
|
||||
int16_t *outptr;
|
||||
int16_t *wsptr;
|
||||
|
||||
cnt *= 4;
|
||||
wsptr = workspace;
|
||||
outptr = output_adr;
|
||||
for (; cnt > 0; cnt--) {
|
||||
// Even part
|
||||
//Simd version reads 4x4 block and transposes it
|
||||
tmp10 = wsptr[2] + wsptr[3];
|
||||
tmp11 = wsptr[2] - wsptr[3];
|
||||
|
||||
tmp13 = wsptr[0] + wsptr[1];
|
||||
tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
|
||||
|
||||
tmp0 = tmp10 + tmp13; //->temps
|
||||
tmp3 = tmp10 - tmp13; //->temps
|
||||
tmp1 = tmp11 + tmp12;
|
||||
tmp2 = tmp11 - tmp12;
|
||||
|
||||
// Odd part
|
||||
//Also transpose, with previous:
|
||||
// ---- ---- ||||
|
||||
// ---- ---- idct ||||
|
||||
// ---- ---- ---> ||||
|
||||
// ---- ---- ||||
|
||||
z13 = wsptr[4] + wsptr[5];
|
||||
z10 = wsptr[4] - wsptr[5];
|
||||
z11 = wsptr[6] + wsptr[7];
|
||||
z12 = wsptr[6] - wsptr[7];
|
||||
|
||||
tmp7 = z11 + z13;
|
||||
tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
|
||||
|
||||
z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
|
||||
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
|
||||
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
|
||||
|
||||
tmp6 = (tmp12 << 3) - tmp7;
|
||||
tmp5 = (tmp11 << 3) - tmp6;
|
||||
tmp4 = (tmp10 << 3) + tmp5;
|
||||
|
||||
// Final output stage: descale and write column
|
||||
outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
|
||||
outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
|
||||
outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
|
||||
outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
|
||||
outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
|
||||
outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
|
||||
outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
|
||||
outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
|
||||
outptr++;
|
||||
|
||||
wsptr += DCTSIZE; // advance pointer to next row
|
||||
}
|
||||
}
|
||||
|
||||
static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
|
||||
{
|
||||
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
int_simd16_t tmp10, tmp11, tmp12, tmp13;
|
||||
int_simd16_t z1, z2, z3, z4, z5, z11, z13;
|
||||
int16_t *dataptr;
|
||||
|
||||
cnt *= 4;
|
||||
// Pass 1: process rows.
|
||||
|
||||
dataptr = data;
|
||||
for (; cnt > 0; cnt--) {
|
||||
tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
|
||||
tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
|
||||
tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
|
||||
tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
|
||||
tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
|
||||
tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
|
||||
tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
|
||||
tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
|
||||
|
||||
// Even part
|
||||
|
||||
tmp10 = tmp0 + tmp3;
|
||||
tmp13 = tmp0 - tmp3;
|
||||
tmp11 = tmp1 + tmp2;
|
||||
tmp12 = tmp1 - tmp2;
|
||||
//Even columns are written first, this leads to different order of columns
|
||||
//in column_fidct(), but they are processed independently, so all ok.
|
||||
//Later in the row_idct() columns are read in the same order.
|
||||
dataptr[2] = tmp10 + tmp11;
|
||||
dataptr[3] = tmp10 - tmp11;
|
||||
|
||||
z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
|
||||
dataptr[0] = tmp13 + z1;
|
||||
dataptr[1] = tmp13 - z1;
|
||||
|
||||
// Odd part
|
||||
|
||||
tmp10 = (tmp4 + tmp5) << 2;
|
||||
tmp11 = (tmp5 + tmp6) << 2;
|
||||
tmp12 = (tmp6 + tmp7) << 2;
|
||||
|
||||
z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
|
||||
z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
|
||||
z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
|
||||
z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
|
||||
|
||||
z11 = tmp7 + z3;
|
||||
z13 = tmp7 - z3;
|
||||
|
||||
dataptr[4] = z13 + z2;
|
||||
dataptr[5] = z13 - z2;
|
||||
dataptr[6] = z11 + z4;
|
||||
dataptr[7] = z11 - z4;
|
||||
|
||||
pixels++; // advance pointer to next column
|
||||
dataptr += DCTSIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static const enum AVPixelFormat pix_fmts[] = {
|
||||
AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P,
|
||||
AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV411P,
|
||||
|
|
@ -522,16 +232,7 @@ static int config_input(AVFilterLink *inlink)
|
|||
if (!fspp->temp || !fspp->src)
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
fspp->store_slice = store_slice_c;
|
||||
fspp->store_slice2 = store_slice2_c;
|
||||
fspp->mul_thrmat = mul_thrmat_c;
|
||||
fspp->column_fidct = column_fidct_c;
|
||||
fspp->row_idct = row_idct_c;
|
||||
fspp->row_fdct = row_fdct_c;
|
||||
|
||||
#if ARCH_X86
|
||||
ff_fspp_init_x86(fspp);
|
||||
#endif
|
||||
ff_fsppdsp_init(&fspp->dsp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -545,30 +246,17 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
|
|||
|
||||
int qp_stride = 0;
|
||||
int8_t *qp_table = NULL;
|
||||
int i, bias;
|
||||
int ret = 0;
|
||||
int custom_threshold_m[64];
|
||||
|
||||
bias = (1 << 4) + fspp->strength;
|
||||
//FIXME: tune custom_threshold[] and remove this !
|
||||
for (int i = 0, bias = (1 << 4) + fspp->strength; i < 64; ++i)
|
||||
fspp->threshold_mtx_noq[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
|
||||
|
||||
for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
|
||||
custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
|
||||
|
||||
for (i = 0; i < 8; i++) {
|
||||
fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
|
||||
|(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
|
||||
|(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
|
||||
|(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
|
||||
|
||||
fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
|
||||
|(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
|
||||
|(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
|
||||
|(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
|
||||
if (fspp->qp) {
|
||||
fspp->prev_q = fspp->qp;
|
||||
fspp->dsp.mul_thrmat(fspp->threshold_mtx_noq, fspp->threshold_mtx, fspp->qp);
|
||||
}
|
||||
|
||||
if (fspp->qp)
|
||||
fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
|
||||
|
||||
/* if we are not in a constant user quantizer mode and we don't want to use
|
||||
* the quantizers from the B-frames (B-frames often have a higher QP), we
|
||||
* need to save the qp table from the last non B-frame; this is what the
|
||||
|
|
|
|||
|
|
@ -1,96 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
|
||||
* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
|
||||
* Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#ifndef AVFILTER_FSPP_H
|
||||
#define AVFILTER_FSPP_H
|
||||
|
||||
#include "libavutil/video_enc_params.h"
|
||||
#include "avfilter.h"
|
||||
|
||||
#define BLOCKSZ 12
|
||||
#define MAX_LEVEL 5
|
||||
|
||||
#define DCTSIZE 8
|
||||
#define DCTSIZE_S "8"
|
||||
|
||||
/* Fixed-point helpers for the DCT code; every parameter is parenthesized so
 * that arbitrary expressions can be passed safely. */

/* x scaled by 2^s with 0.5 added for rounding; the result is a double. */
#define FIX(x,s)  ((x) * (1 << (s)) + 0.5)

/* High part of a 16.16 product: (x * k) >> 16. */
#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)

/* r = x when x lies outside [-t, t], otherwise r = 0.
 * t is evaluated twice.  Wrapped in do/while so it acts as one statement. */
#define THRESHOLD(r,x,t)                              \
    do {                                              \
        if (((unsigned)((x) + (t))) > (t) * 2)        \
            (r) = (x);                                \
        else                                          \
            (r) = 0;                                  \
    } while (0)

/* Divide by 2^n, adding half the divisor first. */
#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> (n))
|
||||
|
||||
/* 32-bit integer type used for the intermediates of the C DCT routines. */
typedef int32_t int_simd16_t;
|
||||
/*
 * Fixed-point DCT multipliers.  FIX(v, s) evaluates to v * 2^s + 0.5 and the
 * result is converted to int16_t by the initialization; the constants use 14
 * or 13 fractional bits as given by the second argument.
 */
static const int16_t FIX_0_382683433 = FIX(0.382683433, 14);
|
||||
static const int16_t FIX_0_541196100 = FIX(0.541196100, 14);
|
||||
static const int16_t FIX_0_707106781 = FIX(M_SQRT1_2 , 14);
|
||||
static const int16_t FIX_1_306562965 = FIX(1.306562965, 14);
|
||||
/* sqrt(2) with 14 fractional bits; FIX_1_414213562 below uses 13 */
static const int16_t FIX_1_414213562_A = FIX(M_SQRT2 , 14);
|
||||
static const int16_t FIX_1_847759065 = FIX(1.847759065, 13);
|
||||
/* note: holds the negated value (-2.613125930) */
static const int16_t FIX_2_613125930 = FIX(-2.613125930, 13);
|
||||
static const int16_t FIX_1_414213562 = FIX(M_SQRT2 , 13);
|
||||
static const int16_t FIX_1_082392200 = FIX(1.082392200, 13);
|
||||
|
||||
/*
 * Private state of the fspp filter, including the table of DSP function
 * pointers that ff_fspp_init_x86() receives.
 */
typedef struct FSPPContext {
|
||||
AVClass *class;
|
||||
/* 64 thresholds stored as 16 x 64-bit words; filter_frame() packs four
 * 16-bit values into each word and callers access the arrays through
 * int16_t pointer casts */
uint64_t threshold_mtx_noq[8 * 2];
|
||||
uint64_t threshold_mtx[8 * 2]; //used in both C & MMX (& later SSE2) versions
|
||||
|
||||
/* filter() passes 5 - log2_count as log2_scale to the store_slice functions */
int log2_count;
|
||||
/* added to (1 << 4) to form the bias that scales custom_threshold[] */
int strength;
|
||||
/* chroma shifts: subtracted from 4 for non-luma planes to get the QP-table shifts */
int hsub;
|
||||
int vsub;
|
||||
/* NOTE(review): presumably the row stride of src/temp -- confirm in config_input() */
int temp_stride;
|
||||
/* when non-zero, used as the constant quantizer for the whole frame */
int qp;
|
||||
/* passed to ff_norm_qscale() together with each QP read from the table */
enum AVVideoEncParamsType qscale_type;
|
||||
/* quantizer that threshold_mtx was last built for */
int prev_q;
|
||||
/* work buffers; config_input() returns ENOMEM when either is NULL */
uint8_t *src;
|
||||
int16_t *temp;
|
||||
/* NOTE(review): per the comment in filter_frame(), the QP table saved from the
 * last non-B frame and its stride -- confirm against the code using them */
int8_t *non_b_qp_table;
|
||||
int non_b_qp_stride;
|
||||
/* NOTE(review): presumably selects whether B-frame quantizers are used -- confirm */
int use_bframe_qp;
|
||||
|
||||
/* DSP entry points: config_input() sets each to its *_c implementation
 * and then calls ff_fspp_init_x86() when ARCH_X86 is set */
void (*store_slice)(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
|
||||
void (*store_slice2)(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
|
||||
/* thr_adr[i] = q * thr_adr_noq[i] for 64 entries */
void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
|
||||
void (*column_fidct)(int16_t *thr_adr, int16_t *data,
|
||||
int16_t *output, int cnt);
|
||||
|
||||
void (*row_idct)(int16_t *workspace, int16_t *output_adr,
|
||||
ptrdiff_t output_stride, int cnt);
|
||||
|
||||
void (*row_fdct)(int16_t *data, const uint8_t *pixels,
|
||||
ptrdiff_t line_size, int cnt);
|
||||
|
||||
} FSPPContext;
|
||||
|
||||
void ff_fspp_init_x86(FSPPContext *fspp);
|
||||
|
||||
#endif /* AVFILTER_FSPP_H */
|
||||
|
|
@ -0,0 +1,371 @@
|
|||
/*
|
||||
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
|
||||
* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
|
||||
* Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "vf_fsppdsp.h"
|
||||
|
||||
#include "libavutil/common.h"
|
||||
#include "libavutil/mathematics.h"
|
||||
#include "libavutil/mem_internal.h"
|
||||
|
||||
#define DCTSIZE 8
|
||||
|
||||
/* Fixed-point helpers for the DCT code; every parameter is parenthesized so
 * that arbitrary expressions can be passed safely. */

/* x scaled by 2^s, with 0.5 added before conversion to int. */
#define FIX(x,s)  ((int)((x) * (1 << (s)) + 0.5))

/* High part of a 16.16 product: (x * k) >> 16. */
#define MULTIPLY16H(x,k)   (((x) * (k)) >> 16)

/* r = 0 when -t <= x < t, otherwise r = x.
 * t is evaluated twice.  Wrapped in do/while so it acts as one statement. */
#define THRESHOLD(r,x,t)                              \
    do {                                              \
        if (((unsigned)((x) + (t))) >= (t) * 2)       \
            (r) = (x);                                \
        else                                          \
            (r) = 0;                                  \
    } while (0)

/* Divide by 2^n, adding half the divisor first. */
#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> (n))
|
||||
|
||||
typedef int32_t int_simd16_t;
|
||||
|
||||
/*
 * Fixed-point DCT multipliers as integer constants.  The second FIX()
 * argument is the number of fractional bits (14 or 13).
 */
enum {
|
||||
FIX_0_382683433 = FIX(0.382683433, 14),
|
||||
FIX_0_541196100 = FIX(0.541196100, 14),
|
||||
FIX_0_707106781 = FIX(M_SQRT1_2 , 14),
|
||||
FIX_1_306562965 = FIX(1.306562965, 14),
|
||||
/* sqrt(2) with 14 fractional bits; FIX_1_414213562 below uses 13 */
FIX_1_414213562_A = FIX(M_SQRT2 , 14),
|
||||
FIX_1_847759065 = FIX(1.847759065, 13),
|
||||
/* note: holds the negated value (-2.613125930) */
FIX_2_613125930 = FIX(-2.613125930, 13),
|
||||
FIX_1_414213562 = FIX(M_SQRT2 , 13),
|
||||
FIX_1_082392200 = FIX(1.082392200, 13),
|
||||
};
|
||||
|
||||
/*
 * 8x8 dither matrix containing each value 0..63 exactly once.
 * ff_store_slice_c() and ff_store_slice2_c() pick row [y] and add
 * (entry >> log2_scale) to each sample before the final down-shift.
 */
DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
|
||||
{ 0, 48, 12, 60, 3, 51, 15, 63, },
|
||||
{ 32, 16, 44, 28, 35, 19, 47, 31, },
|
||||
{ 8, 56, 4, 52, 11, 59, 7, 55, },
|
||||
{ 40, 24, 36, 20, 43, 27, 39, 23, },
|
||||
{ 2, 50, 14, 62, 1, 49, 13, 61, },
|
||||
{ 34, 18, 46, 30, 33, 17, 45, 29, },
|
||||
{ 10, 58, 6, 54, 9, 57, 5, 53, },
|
||||
{ 42, 26, 38, 22, 41, 25, 37, 21, },
|
||||
};
|
||||
|
||||
//This func reads from 1 slice, 1 and clears 0 & 1
|
||||
void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
{
|
||||
#define STORE(pos) \
|
||||
temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
|
||||
src[x + pos] = src[x + pos - 8 * src_stride] = 0; \
|
||||
temp = av_clip_uint8(temp); \
|
||||
dst[x + pos] = temp;
|
||||
|
||||
for (int y = 0; y < height; y++) {
|
||||
const uint8_t *d = ff_fspp_dither[y];
|
||||
for (int x = 0; x < width; x += 8) {
|
||||
int temp;
|
||||
STORE(0);
|
||||
STORE(1);
|
||||
STORE(2);
|
||||
STORE(3);
|
||||
STORE(4);
|
||||
STORE(5);
|
||||
STORE(6);
|
||||
STORE(7);
|
||||
}
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
//This func reads from 2 slices, 0 & 2 and clears 2-nd
|
||||
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
{
|
||||
#define STORE2(pos) \
|
||||
temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale); \
|
||||
src[x + pos + 16 * src_stride] = 0; \
|
||||
temp = av_clip_uint8(temp); \
|
||||
dst[x + pos] = temp;
|
||||
|
||||
for (int y = 0; y < height; y++) {
|
||||
const uint8_t *d = ff_fspp_dither[y];
|
||||
for (int x = 0; x < width; x += 8) {
|
||||
int temp;
|
||||
STORE2(0);
|
||||
STORE2(1);
|
||||
STORE2(2);
|
||||
STORE2(3);
|
||||
STORE2(4);
|
||||
STORE2(5);
|
||||
STORE2(6);
|
||||
STORE2(7);
|
||||
}
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Build the quantizer-dependent threshold matrix:
 * thr_adr[i] = q * thr_adr_noq[i] for all 64 entries.
 * The product is stored back into int16_t, so it is truncated to 16 bits.
 */
void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
{
    const int16_t *in  = thr_adr_noq;
    int16_t       *out = thr_adr;
    int remaining      = 64;

    while (remaining-- > 0)
        *out++ = q * *in++;
}
|
||||
|
||||
void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
|
||||
int16_t *restrict output, int cnt)
|
||||
{
|
||||
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
int_simd16_t tmp10, tmp11, tmp12, tmp13;
|
||||
int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
|
||||
int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
|
||||
|
||||
int16_t *wsptr;
|
||||
|
||||
wsptr = output;
|
||||
|
||||
for (; cnt > 0; cnt -= 2) { //start positions
|
||||
const int16_t *threshold = thr_adr;//threshold_mtx
|
||||
for (int ctr = DCTSIZE; ctr > 0; ctr--) {
|
||||
// Process columns from input, add to output.
|
||||
tmp0 = data[DCTSIZE * 0] + data[DCTSIZE * 7];
|
||||
tmp7 = data[DCTSIZE * 0] - data[DCTSIZE * 7];
|
||||
|
||||
tmp1 = data[DCTSIZE * 1] + data[DCTSIZE * 6];
|
||||
tmp6 = data[DCTSIZE * 1] - data[DCTSIZE * 6];
|
||||
|
||||
tmp2 = data[DCTSIZE * 2] + data[DCTSIZE * 5];
|
||||
tmp5 = data[DCTSIZE * 2] - data[DCTSIZE * 5];
|
||||
|
||||
tmp3 = data[DCTSIZE * 3] + data[DCTSIZE * 4];
|
||||
tmp4 = data[DCTSIZE * 3] - data[DCTSIZE * 4];
|
||||
|
||||
// Even part of FDCT
|
||||
|
||||
tmp10 = tmp0 + tmp3;
|
||||
tmp13 = tmp0 - tmp3;
|
||||
tmp11 = tmp1 + tmp2;
|
||||
tmp12 = tmp1 - tmp2;
|
||||
|
||||
d0 = tmp10 + tmp11;
|
||||
d4 = tmp10 - tmp11;
|
||||
|
||||
z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
|
||||
d2 = tmp13 + z1;
|
||||
d6 = tmp13 - z1;
|
||||
|
||||
// Even part of IDCT
|
||||
|
||||
THRESHOLD(tmp0, d0, threshold[0 * 8]);
|
||||
THRESHOLD(tmp1, d2, threshold[2 * 8]);
|
||||
THRESHOLD(tmp2, d4, threshold[4 * 8]);
|
||||
THRESHOLD(tmp3, d6, threshold[6 * 8]);
|
||||
tmp0 += 2;
|
||||
tmp10 = (tmp0 + tmp2) >> 2;
|
||||
tmp11 = (tmp0 - tmp2) >> 2;
|
||||
|
||||
tmp13 = (tmp1 + tmp3) >>2; //+2 ! (psnr decides)
|
||||
tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
|
||||
|
||||
tmp0 = tmp10 + tmp13; //->temps
|
||||
tmp3 = tmp10 - tmp13; //->temps
|
||||
tmp1 = tmp11 + tmp12; //->temps
|
||||
tmp2 = tmp11 - tmp12; //->temps
|
||||
|
||||
// Odd part of FDCT
|
||||
|
||||
tmp10 = tmp4 + tmp5;
|
||||
tmp11 = tmp5 + tmp6;
|
||||
tmp12 = tmp6 + tmp7;
|
||||
|
||||
z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
|
||||
z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
|
||||
z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
|
||||
z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
|
||||
|
||||
z11 = tmp7 + z3;
|
||||
z13 = tmp7 - z3;
|
||||
|
||||
d5 = z13 + z2;
|
||||
d3 = z13 - z2;
|
||||
d1 = z11 + z4;
|
||||
d7 = z11 - z4;
|
||||
|
||||
// Odd part of IDCT
|
||||
|
||||
THRESHOLD(tmp4, d1, threshold[1 * 8]);
|
||||
THRESHOLD(tmp5, d3, threshold[3 * 8]);
|
||||
THRESHOLD(tmp6, d5, threshold[5 * 8]);
|
||||
THRESHOLD(tmp7, d7, threshold[7 * 8]);
|
||||
|
||||
//Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
|
||||
z13 = tmp6 + tmp5;
|
||||
z10 = (tmp6 - tmp5) * 2;
|
||||
z11 = tmp4 + tmp7;
|
||||
z12 = (tmp4 - tmp7) * 2;
|
||||
|
||||
tmp7 = (z11 + z13) >> 2; //+2 !
|
||||
tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562 << 1);
|
||||
z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
|
||||
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
|
||||
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
|
||||
|
||||
tmp6 = tmp12 - tmp7;
|
||||
tmp5 = tmp11 - tmp6;
|
||||
tmp4 = tmp10 + tmp5;
|
||||
|
||||
wsptr[DCTSIZE * 0] += (tmp0 + tmp7);
|
||||
wsptr[DCTSIZE * 1] += (tmp1 + tmp6);
|
||||
wsptr[DCTSIZE * 2] += (tmp2 + tmp5);
|
||||
wsptr[DCTSIZE * 3] += (tmp3 - tmp4);
|
||||
wsptr[DCTSIZE * 4] += (tmp3 + tmp4);
|
||||
wsptr[DCTSIZE * 5] += (tmp2 - tmp5);
|
||||
wsptr[DCTSIZE * 6] = (tmp1 - tmp6);
|
||||
wsptr[DCTSIZE * 7] = (tmp0 - tmp7);
|
||||
//
|
||||
data++; //next column
|
||||
wsptr++;
|
||||
threshold++;
|
||||
}
|
||||
data += 8; //skip each second start pos
|
||||
wsptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
void ff_row_idct_c(const int16_t *restrict wsptr, int16_t *restrict output_adr,
|
||||
ptrdiff_t output_stride, int cnt)
|
||||
{
|
||||
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
int_simd16_t tmp10, tmp11, tmp12, tmp13;
|
||||
int_simd16_t z5, z10, z11, z12, z13;
|
||||
int16_t *outptr;
|
||||
|
||||
cnt *= 4;
|
||||
outptr = output_adr;
|
||||
for (; cnt > 0; cnt--) {
|
||||
// Even part
|
||||
//Simd version reads 4x4 block and transposes it
|
||||
tmp10 = wsptr[2] + wsptr[3];
|
||||
tmp11 = wsptr[2] - wsptr[3];
|
||||
|
||||
tmp13 = wsptr[0] + wsptr[1];
|
||||
tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//this shift order to avoid overflow
|
||||
|
||||
tmp0 = tmp10 + tmp13; //->temps
|
||||
tmp3 = tmp10 - tmp13; //->temps
|
||||
tmp1 = tmp11 + tmp12;
|
||||
tmp2 = tmp11 - tmp12;
|
||||
|
||||
// Odd part
|
||||
//Also transpose, with previous:
|
||||
// ---- ---- ||||
|
||||
// ---- ---- idct ||||
|
||||
// ---- ---- ---> ||||
|
||||
// ---- ---- ||||
|
||||
z13 = wsptr[4] + wsptr[5];
|
||||
z10 = wsptr[4] - wsptr[5];
|
||||
z11 = wsptr[6] + wsptr[7];
|
||||
z12 = wsptr[6] - wsptr[7];
|
||||
|
||||
tmp7 = z11 + z13;
|
||||
tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
|
||||
|
||||
z5 = MULTIPLY16H(z10 + z12, FIX_1_847759065);
|
||||
tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
|
||||
tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
|
||||
|
||||
tmp6 = tmp12 * 8 - tmp7;
|
||||
tmp5 = tmp11 * 8 - tmp6;
|
||||
tmp4 = tmp10 * 8 + tmp5;
|
||||
|
||||
// Final output stage: descale and write column
|
||||
outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
|
||||
outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
|
||||
outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
|
||||
outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
|
||||
outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
|
||||
outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
|
||||
outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
|
||||
outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
|
||||
outptr++;
|
||||
|
||||
wsptr += DCTSIZE; // advance pointer to next row
|
||||
}
|
||||
}
|
||||
|
||||
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
|
||||
ptrdiff_t line_size, int cnt)
|
||||
{
|
||||
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
int_simd16_t tmp10, tmp11, tmp12, tmp13;
|
||||
int_simd16_t z1, z2, z3, z4, z5, z11, z13;
|
||||
int16_t *dataptr;
|
||||
|
||||
cnt *= 4;
|
||||
// Pass 1: process rows.
|
||||
|
||||
dataptr = data;
|
||||
for (; cnt > 0; cnt--) {
|
||||
tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
|
||||
tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
|
||||
tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
|
||||
tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
|
||||
tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
|
||||
tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
|
||||
tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
|
||||
tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
|
||||
|
||||
// Even part
|
||||
|
||||
tmp10 = tmp0 + tmp3;
|
||||
tmp13 = tmp0 - tmp3;
|
||||
tmp11 = tmp1 + tmp2;
|
||||
tmp12 = tmp1 - tmp2;
|
||||
//Even columns are written first, this leads to different order of columns
|
||||
//in column_fidct(), but they are processed independently, so all ok.
|
||||
//Later in the row_idct() columns are read in the same order.
|
||||
dataptr[2] = tmp10 + tmp11;
|
||||
dataptr[3] = tmp10 - tmp11;
|
||||
|
||||
z1 = MULTIPLY16H(tmp12 + tmp13, FIX_0_707106781 << 2);
|
||||
dataptr[0] = tmp13 + z1;
|
||||
dataptr[1] = tmp13 - z1;
|
||||
|
||||
// Odd part
|
||||
|
||||
tmp10 = tmp4 + tmp5;
|
||||
tmp11 = tmp5 + tmp6;
|
||||
tmp12 = tmp6 + tmp7;
|
||||
|
||||
z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433 << 2);
|
||||
z2 = MULTIPLY16H(tmp10, FIX_0_541196100 << 2) + z5;
|
||||
z4 = MULTIPLY16H(tmp12, FIX_1_306562965 << 2) + z5;
|
||||
z3 = MULTIPLY16H(tmp11, FIX_0_707106781 << 2);
|
||||
|
||||
z11 = tmp7 + z3;
|
||||
z13 = tmp7 - z3;
|
||||
|
||||
dataptr[4] = z13 + z2;
|
||||
dataptr[5] = z13 - z2;
|
||||
dataptr[6] = z11 + z4;
|
||||
dataptr[7] = z11 - z4;
|
||||
|
||||
pixels++; // advance pointer to next column
|
||||
dataptr += DCTSIZE;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
/*
|
||||
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
|
||||
* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
|
||||
* Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#ifndef AVFILTER_FSPPDSP_H
|
||||
#define AVFILTER_FSPPDSP_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "libavutil/attributes_internal.h"
|
||||
|
||||
/**
 * Table of the DSP entry points used by the fspp filter.
 *
 * ff_fsppdsp_init() fills every member with the matching ff_*_c function and,
 * on x86 builds, hands the table to ff_fsppdsp_init_x86() afterwards.
 */
typedef struct FSPPDSPContext {
    /** src is expected to be 16-byte aligned. */
    void (*store_slice)(uint8_t *restrict dst,
                        int16_t *restrict src /* align 16 */,
                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
                        ptrdiff_t width, ptrdiff_t height,
                        ptrdiff_t log2_scale);

    /** Same signature as store_slice; src is expected to be 16-byte aligned. */
    void (*store_slice2)(uint8_t *restrict dst,
                         int16_t *restrict src /* align 16 */,
                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
                         ptrdiff_t width, ptrdiff_t height,
                         ptrdiff_t log2_scale);

    /** Both matrices are expected to be 16-byte aligned. */
    void (*mul_thrmat)(const int16_t *restrict thr_adr_noq /* align 16 */,
                       int16_t *restrict thr_adr /* align 16 */,
                       int q);

    /** Column pass; C reference: ff_column_fidct_c(). */
    void (*column_fidct)(const int16_t *restrict thr_adr,
                         const int16_t *restrict data,
                         int16_t *restrict output, int cnt);

    /** Row inverse DCT; C reference: ff_row_idct_c(). */
    void (*row_idct)(const int16_t *restrict workspace,
                     int16_t *restrict output_adr,
                     ptrdiff_t output_stride, int cnt);

    /** Row forward DCT on 8-bit pixels; C reference: ff_row_fdct_c(). */
    void (*row_fdct)(int16_t *restrict data,
                     const uint8_t *restrict pixels,
                     ptrdiff_t line_size, int cnt);
} FSPPDSPContext;
|
||||
|
||||
FF_VISIBILITY_PUSH_HIDDEN
|
||||
extern const uint8_t ff_fspp_dither[8][8];
|
||||
|
||||
void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_mul_thrmat_c(const int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
|
||||
void ff_column_fidct_c(const int16_t *restrict thr_adr, const int16_t *restrict data,
|
||||
int16_t *restrict output, int cnt);
|
||||
void ff_row_idct_c(const int16_t *restrict workspace, int16_t *restrict output_adr,
|
||||
ptrdiff_t output_stride, int cnt);
|
||||
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
|
||||
ptrdiff_t line_size, int cnt);
|
||||
|
||||
void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
|
||||
FF_VISIBILITY_POP_HIDDEN
|
||||
|
||||
/**
 * Initialise an FSPPDSPContext.
 *
 * Every member is first set to its C implementation. When ARCH_X86 is
 * non-zero the table is then passed to ff_fsppdsp_init_x86().
 *
 * @param fspp table to fill; all six members are written
 */
static inline void ff_fsppdsp_init(FSPPDSPContext *fspp)
{
    *fspp = (FSPPDSPContext) {
        .store_slice  = ff_store_slice_c,
        .store_slice2 = ff_store_slice2_c,
        .mul_thrmat   = ff_mul_thrmat_c,
        .column_fidct = ff_column_fidct_c,
        .row_idct     = ff_row_idct_c,
        .row_fdct     = ff_row_fdct_c,
    };

#if ARCH_X86
    ff_fsppdsp_init_x86(fspp);
#endif
}
|
||||
|
||||
#endif /* AVFILTER_FSPPDSP_H */
|
||||
|
|
@ -25,36 +25,33 @@
|
|||
|
||||
SECTION_RODATA
|
||||
|
||||
pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
|
||||
8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
|
||||
2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
|
||||
10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21
|
||||
cextern fspp_dither
|
||||
pw_4546: times 8 dw 0x4546 ; FIX(1.082392200, 13)*2
|
||||
pw_61F8: times 8 dw 0x61F8 ; FIX(0.382683433, 14)*4
|
||||
pw_539F: times 8 dw 0x539F ; FIX(1.306562965, 14)
|
||||
pw_5A82: times 8 dw 0x5A82 ; FIX(1.414213562, 14)
|
||||
pw_7642: times 8 dw 0x7642 ; FIX(1.847759065, 13)*2
|
||||
pw_AC62: times 8 dw 0xAC62 ; FIX(-2.613125930, 13)
|
||||
pw_2: times 8 dw 2
|
||||
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
|
||||
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
|
||||
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
|
||||
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
|
||||
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
|
||||
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
|
||||
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
|
||||
pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
|
||||
pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
|
||||
pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
|
||||
pw_4: times 4 dw 4
|
||||
pw_2: times 4 dw 2
|
||||
|
||||
SECTION .text
|
||||
|
||||
%define DCTSIZE 8
|
||||
|
||||
INIT_MMX mmx
|
||||
INIT_XMM sse2
|
||||
|
||||
;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
;void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
%if ARCH_X86_64
|
||||
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
%else
|
||||
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice, 2, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
%define dst_strideq r2m
|
||||
%define src_strideq r3m
|
||||
mov widthq, r4m
|
||||
|
|
@ -65,7 +62,7 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
|||
mov tmpq, src_strideq
|
||||
and widthq, ~7
|
||||
sub dst_strideq, widthq
|
||||
movd m5, ditherd ; log2_scale
|
||||
movd m4, ditherd ; log2_scale
|
||||
xor ditherq, -1 ; log2_scale
|
||||
mov tmp2q, tmpq
|
||||
add ditherq, 7 ; log2_scale
|
||||
|
|
@ -73,33 +70,25 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
|||
sub tmp2q, widthq
|
||||
movd m2, ditherd ; log2_scale
|
||||
add tmp2q, tmp2q
|
||||
lea ditherq, [pb_dither]
|
||||
lea ditherq, [fspp_dither]
|
||||
mov src_strideq, tmp2q
|
||||
shl tmpq, 4
|
||||
lea dither_heightq, [ditherq+dither_heightq*8]
|
||||
pxor m7, m7
|
||||
pxor m1, m1
|
||||
|
||||
.loop_height:
|
||||
movq m3, [ditherq]
|
||||
movq m4, m3
|
||||
punpcklbw m3, m7
|
||||
punpckhbw m4, m7
|
||||
punpcklbw m3, m1
|
||||
mov tmp2q, widthq
|
||||
psraw m3, m5
|
||||
psraw m4, m5
|
||||
psraw m3, m4
|
||||
|
||||
.loop_width:
|
||||
movq [srcq+tmpq], m7
|
||||
movq m0, [srcq]
|
||||
movq m1, [srcq+8]
|
||||
movq [srcq+tmpq+8], m7
|
||||
mova m0, [srcq]
|
||||
mova [srcq+tmpq], m1
|
||||
paddw m0, m3
|
||||
paddw m1, m4
|
||||
movq [srcq], m7
|
||||
mova [srcq], m1
|
||||
psraw m0, m2
|
||||
psraw m1, m2
|
||||
movq [srcq+8], m7
|
||||
packuswb m0, m1
|
||||
packuswb m0, m0
|
||||
add srcq, 16
|
||||
movq [dstq], m0
|
||||
add dstq, 8
|
||||
|
|
@ -113,13 +102,13 @@ cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
|||
jl .loop_height
|
||||
RET
|
||||
|
||||
;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
;void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
|
||||
; ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
|
||||
%if ARCH_X86_64
|
||||
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice2, 7, 9, 5, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
|
||||
%else
|
||||
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
cglobal store_slice2, 0, 7, 5, dst, src, width, dither_height, dither, tmp, tmp2
|
||||
%define dst_strideq r2m
|
||||
%define src_strideq r3m
|
||||
mov dstq, dstm
|
||||
|
|
@ -132,41 +121,32 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
|||
mov tmpq, src_strideq
|
||||
and widthq, ~7
|
||||
sub dst_strideq, widthq
|
||||
movd m5, ditherd ; log2_scale
|
||||
movd m4, ditherd ; log2_scale
|
||||
xor ditherq, -1 ; log2_scale
|
||||
mov tmp2q, tmpq
|
||||
add ditherq, 7 ; log2_scale
|
||||
sub tmp2q, widthq
|
||||
movd m2, ditherd ; log2_scale
|
||||
add tmp2q, tmp2q
|
||||
lea ditherq, [pb_dither]
|
||||
lea ditherq, [fspp_dither]
|
||||
mov src_strideq, tmp2q
|
||||
shl tmpq, 5
|
||||
lea dither_heightq, [ditherq+dither_heightq*8]
|
||||
pxor m7, m7
|
||||
pxor m1, m1
|
||||
|
||||
.loop_height:
|
||||
movq m3, [ditherq]
|
||||
movq m4, m3
|
||||
punpcklbw m3, m7
|
||||
punpckhbw m4, m7
|
||||
punpcklbw m3, m1
|
||||
mov tmp2q,widthq
|
||||
psraw m3, m5
|
||||
psraw m4, m5
|
||||
psraw m3, m4
|
||||
|
||||
.loop_width:
|
||||
movq m0, [srcq]
|
||||
movq m1, [srcq+8]
|
||||
mova m0, [srcq]
|
||||
paddw m0, m3
|
||||
paddw m0, [srcq+tmpq]
|
||||
paddw m1, m4
|
||||
movq m6, [srcq+tmpq+8]
|
||||
movq [srcq+tmpq], m7
|
||||
mova [srcq+tmpq], m1
|
||||
psraw m0, m2
|
||||
paddw m1, m6
|
||||
movq [srcq+tmpq+8], m7
|
||||
psraw m1, m2
|
||||
packuswb m0, m1
|
||||
packuswb m0, m0
|
||||
movq [dstq], m0
|
||||
add srcq, 16
|
||||
add dstq, 8
|
||||
|
|
@ -180,164 +160,152 @@ cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
|
|||
jl .loop_height
|
||||
RET
|
||||
|
||||
;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
|
||||
movd m7, qd
|
||||
movq m0, [thrnq]
|
||||
punpcklwd m7, m7
|
||||
movq m1, [thrnq+8]
|
||||
punpckldq m7, m7
|
||||
pmullw m0, m7
|
||||
movq m2, [thrnq+8*2]
|
||||
pmullw m1, m7
|
||||
movq m3, [thrnq+8*3]
|
||||
pmullw m2, m7
|
||||
movq [thrq], m0
|
||||
movq m4, [thrnq+8*4]
|
||||
pmullw m3, m7
|
||||
movq [thrq+8], m1
|
||||
movq m5, [thrnq+8*5]
|
||||
pmullw m4, m7
|
||||
movq [thrq+8*2], m2
|
||||
movq m6, [thrnq+8*6]
|
||||
pmullw m5, m7
|
||||
movq [thrq+8*3], m3
|
||||
movq m0, [thrnq+8*7]
|
||||
pmullw m6, m7
|
||||
movq [thrq+8*4], m4
|
||||
movq m1, [thrnq+8*7+8]
|
||||
pmullw m0, m7
|
||||
movq [thrq+8*5], m5
|
||||
movq m2, [thrnq+8*7+8*2]
|
||||
pmullw m1, m7
|
||||
movq [thrq+8*6], m6
|
||||
movq m3, [thrnq+8*7+8*3]
|
||||
pmullw m2, m7
|
||||
movq [thrq+8*7], m0
|
||||
movq m4, [thrnq+8*7+8*4]
|
||||
pmullw m3, m7
|
||||
movq [thrq+8*7+8], m1
|
||||
movq m5, [thrnq+8*7+8*5]
|
||||
pmullw m4, m7
|
||||
movq [thrq+8*7+8*2], m2
|
||||
movq m6, [thrnq+8*7+8*6]
|
||||
pmullw m5, m7
|
||||
movq [thrq+8*7+8*3], m3
|
||||
movq m0, [thrnq+14*8]
|
||||
pmullw m6, m7
|
||||
movq [thrq+8*7+8*4], m4
|
||||
movq m1, [thrnq+14*8+8]
|
||||
pmullw m0, m7
|
||||
movq [thrq+8*7+8*5], m5
|
||||
pmullw m1, m7
|
||||
movq [thrq+8*7+8*6], m6
|
||||
movq [thrq+14*8], m0
|
||||
movq [thrq+14*8+8], m1
|
||||
;void ff_mul_thrmat_sse2(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
cglobal mul_thrmat, 3, 3, 5, thrn, thr, q
|
||||
movd m4, qd
|
||||
mova m0, [thrnq]
|
||||
punpcklwd m4, m4
|
||||
mova m1, [thrnq+16]
|
||||
pshufd m4, m4, 0
|
||||
pmullw m0, m4
|
||||
mova m2, [thrnq+16*2]
|
||||
pmullw m1, m4
|
||||
mova m3, [thrnq+16*3]
|
||||
pmullw m2, m4
|
||||
mova [thrq], m0
|
||||
mova m0, [thrnq+16*4]
|
||||
pmullw m3, m4
|
||||
mova [thrq+16], m1
|
||||
mova m1, [thrnq+16*5]
|
||||
pmullw m0, m4
|
||||
mova [thrq+16*2], m2
|
||||
mova m2, [thrnq+16*6]
|
||||
pmullw m1, m4
|
||||
mova [thrq+16*3], m3
|
||||
mova m3, [thrnq+16*7]
|
||||
pmullw m2, m4
|
||||
mova [thrq+16*4], m0
|
||||
pmullw m3, m4
|
||||
mova [thrq+16*5], m1
|
||||
mova [thrq+16*6], m2
|
||||
mova [thrq+16*7], m3
|
||||
RET
|
||||
|
||||
%macro COLUMN_FDCT 1-3 0, 0
|
||||
movq m1, [srcq+DCTSIZE*0*2]
|
||||
movq m7, [srcq+DCTSIZE*3*2]
|
||||
movq m0, m1
|
||||
%macro COLUMN_FDCT 1
|
||||
mova m1, [srcq+DCTSIZE*0*2]
|
||||
mova m7, [srcq+DCTSIZE*3*2]
|
||||
mova m0, m1
|
||||
paddw m1, [srcq+DCTSIZE*7*2]
|
||||
movq m3, m7
|
||||
mova m3, m7
|
||||
paddw m7, [srcq+DCTSIZE*4*2]
|
||||
movq m5, m1
|
||||
movq m6, [srcq+DCTSIZE*1*2]
|
||||
mova m5, m1
|
||||
mova m6, [srcq+DCTSIZE*1*2]
|
||||
psubw m1, m7
|
||||
movq m2, [srcq+DCTSIZE*2*2]
|
||||
movq m4, m6
|
||||
mova m2, [srcq+DCTSIZE*2*2]
|
||||
mova m4, m6
|
||||
paddw m6, [srcq+DCTSIZE*6*2]
|
||||
paddw m5, m7
|
||||
paddw m2, [srcq+DCTSIZE*5*2]
|
||||
movq m7, m6
|
||||
mova m7, m6
|
||||
paddw m6, m2
|
||||
psubw m7, m2
|
||||
movq m2, m5
|
||||
mova m2, m5
|
||||
%if ARCH_X86_64
|
||||
mova m8, [thrq]
|
||||
%define THRQ m8
|
||||
%else
|
||||
%define THRQ [thrq]
|
||||
%endif
|
||||
paddw m5, m6
|
||||
psubw m2, m6
|
||||
paddw m7, m1
|
||||
movq m6, [thrq+4*16+%2]
|
||||
psllw m7, 2
|
||||
psubw m5, [thrq+%2]
|
||||
mova m6, [thrq+4*16]
|
||||
psllw m7, 1
|
||||
psubw m5, THRQ
|
||||
psubw m2, m6
|
||||
paddusw m5, [thrq+%2]
|
||||
paddusw m5, THRQ
|
||||
paddusw m2, m6
|
||||
pmulhw m7, [pw_2D41]
|
||||
paddw m5, [thrq+%2]
|
||||
pmulhw m7, SQRT2
|
||||
paddw m5, THRQ
|
||||
paddw m2, m6
|
||||
psubusw m5, [thrq+%2]
|
||||
psubusw m5, THRQ
|
||||
psubusw m2, m6
|
||||
paddw m5, [pw_2]
|
||||
movq m6, m2
|
||||
mova m6, m2
|
||||
paddw m2, m5
|
||||
%if ARCH_X86_64
|
||||
mova m8, [thrq+2*16]
|
||||
%define THRQ m8
|
||||
%else
|
||||
%define THRQ [thrq+2*16]
|
||||
%endif
|
||||
psubw m5, m6
|
||||
movq m6, m1
|
||||
mova m6, m1
|
||||
paddw m1, m7
|
||||
psubw m1, [thrq+2*16+%2]
|
||||
psubw m1, THRQ
|
||||
psubw m6, m7
|
||||
movq m7, [thrq+6*16+%2]
|
||||
mova m7, [thrq+6*16]
|
||||
psraw m5, 2
|
||||
paddusw m1, [thrq+2*16+%2]
|
||||
paddusw m1, THRQ
|
||||
psubw m6, m7
|
||||
paddw m1, [thrq+2*16+%2]
|
||||
paddw m1, THRQ
|
||||
paddusw m6, m7
|
||||
psubusw m1, [thrq+2*16+%2]
|
||||
psubusw m1, THRQ
|
||||
paddw m6, m7
|
||||
psubw m3, [srcq+DCTSIZE*4*2]
|
||||
psubusw m6, m7
|
||||
movq m7, m1
|
||||
mova m7, m1
|
||||
psraw m2, 2
|
||||
psubw m4, [srcq+DCTSIZE*6*2]
|
||||
psubw m1, m6
|
||||
psubw m0, [srcq+DCTSIZE*7*2]
|
||||
paddw m6, m7
|
||||
psraw m6, 2
|
||||
movq m7, m2
|
||||
pmulhw m1, [pw_5A82]
|
||||
mova m7, m2
|
||||
pmulhw m1, SQRT2
|
||||
paddw m2, m6
|
||||
movq [rsp], m2
|
||||
mova tmp0, m2
|
||||
psubw m7, m6
|
||||
movq m2, [srcq+DCTSIZE*2*2]
|
||||
mova m2, [srcq+DCTSIZE*2*2]
|
||||
psubw m1, m6
|
||||
psubw m2, [srcq+DCTSIZE*5*2]
|
||||
movq m6, m5
|
||||
movq [rsp+8*3], m7
|
||||
mova m6, m5
|
||||
mova tmp3, m7
|
||||
paddw m3, m2
|
||||
paddw m2, m4
|
||||
paddw m4, m0
|
||||
movq m7, m3
|
||||
mova m7, m3
|
||||
psubw m3, m4
|
||||
psllw m3, 2
|
||||
psllw m7, 2
|
||||
pmulhw m3, [pw_187E]
|
||||
psllw m7, 1
|
||||
pmulhw m3, [pw_61F8]
|
||||
psllw m4, 2
|
||||
pmulhw m7, [pw_22A3]
|
||||
psllw m2, 2
|
||||
add srcq, 32
|
||||
pmulhw m7, [pw_4546]
|
||||
psllw m2, 1
|
||||
pmulhw m4, [pw_539F]
|
||||
paddw m5, m1
|
||||
pmulhw m2, [pw_2D41]
|
||||
pmulhw m2, SQRT2
|
||||
psubw m6, m1
|
||||
paddw m7, m3
|
||||
movq [rsp+8], m5
|
||||
mova tmp1, m5
|
||||
paddw m4, m3
|
||||
movq m3, [thrq+3*16+%2]
|
||||
movq m1, m0
|
||||
movq [rsp+8*2], m6
|
||||
mova m3, [thrq+3*16]
|
||||
mova m1, m0
|
||||
mova tmp2, m6
|
||||
psubw m1, m2
|
||||
paddw m0, m2
|
||||
movq m5, m1
|
||||
movq m2, [thrq+5*16+%2]
|
||||
mova m5, m1
|
||||
mova m2, [thrq+5*16]
|
||||
psubw m1, m7
|
||||
paddw m5, m7
|
||||
psubw m1, m3
|
||||
movq m7, [thrq+16+%2]
|
||||
mova m7, [thrq+16]
|
||||
psubw m5, m2
|
||||
movq m6, m0
|
||||
mova m6, m0
|
||||
paddw m0, m4
|
||||
paddusw m1, m3
|
||||
psubw m6, m4
|
||||
movq m4, [thrq+7*16+%2]
|
||||
mova m4, [thrq+7*16]
|
||||
psubw m0, m7
|
||||
psubw m6, m4
|
||||
paddusw m5, m2
|
||||
|
|
@ -348,139 +316,149 @@ cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
|
|||
psubusw m1, m3
|
||||
psubusw m5, m2
|
||||
psubusw m6, m4
|
||||
movq m4, m1
|
||||
mova m4, m1
|
||||
por m4, m5
|
||||
paddusw m0, m7
|
||||
por m4, m6
|
||||
paddw m0, m7
|
||||
packssdw m4, m4
|
||||
psubusw m0, m7
|
||||
movd tmpd, m4
|
||||
or tmpd, tmpd
|
||||
%if ARCH_X86_64
|
||||
movq tmpq, m4
|
||||
%else
|
||||
packssdw m4, m4
|
||||
movd tmpd, m4
|
||||
%endif
|
||||
or tmpq, tmpq
|
||||
jnz %1
|
||||
movq m4, [rsp]
|
||||
movq m1, m0
|
||||
pmulhw m0, [pw_3642]
|
||||
movq m2, m1
|
||||
movq m5, [outq+DCTSIZE*0*2]
|
||||
movq m3, m2
|
||||
pmulhw m1, [pw_2441]
|
||||
mova m4, tmp0
|
||||
psraw m3, m0, 2
|
||||
mova m5, [outq+DCTSIZE*0*2]
|
||||
pmulhw m1, m0, [pw_7642]
|
||||
pmulhw m2, m0, [pw_4546]
|
||||
pmulhw m0, SQRT2
|
||||
paddw m5, m4
|
||||
movq m6, [rsp+8]
|
||||
psraw m3, 2
|
||||
pmulhw m2, [pw_0CBB]
|
||||
mova m6, tmp1
|
||||
psubw m2, m1
|
||||
psubw m4, m3
|
||||
movq m7, [outq+DCTSIZE*1*2]
|
||||
mova m7, [outq+DCTSIZE*1*2]
|
||||
paddw m5, m3
|
||||
movq [outq+DCTSIZE*7*2], m4
|
||||
psubw m1, m3
|
||||
mova [outq+DCTSIZE*7*2], m4
|
||||
psubw m0, m1
|
||||
paddw m2, m0
|
||||
mova [outq+DCTSIZE*0*2], m5
|
||||
paddw m7, m6
|
||||
movq m3, [rsp+8*2]
|
||||
psubw m6, m0
|
||||
movq m4, [outq+DCTSIZE*2*2]
|
||||
paddw m7, m0
|
||||
movq [outq], m5
|
||||
mova m3, tmp2
|
||||
psubw m6, m1
|
||||
mova m4, [outq+DCTSIZE*2*2]
|
||||
paddw m7, m1
|
||||
mova [outq], m5
|
||||
paddw m4, m3
|
||||
movq [outq+DCTSIZE*6*2], m6
|
||||
psubw m3, m1
|
||||
movq m5, [outq+DCTSIZE*5*2]
|
||||
paddw m4, m1
|
||||
movq m6, [outq+DCTSIZE*3*2]
|
||||
mova [outq+DCTSIZE*6*2], m6
|
||||
psubw m3, m0
|
||||
mova m5, [outq+DCTSIZE*5*2]
|
||||
paddw m4, m0
|
||||
mova m6, [outq+DCTSIZE*3*2]
|
||||
paddw m5, m3
|
||||
movq m0, [rsp+8*3]
|
||||
add srcq, 8+%3
|
||||
movq [outq+DCTSIZE*1*2], m7
|
||||
mova m0, tmp3
|
||||
mova [outq+DCTSIZE*1*2], m7
|
||||
paddw m6, m0
|
||||
movq [outq+DCTSIZE*2*2], m4
|
||||
psubw m0, m2
|
||||
movq m7, [outq+DCTSIZE*4*2]
|
||||
paddw m6, m2
|
||||
movq [outq+DCTSIZE*5*2], m5
|
||||
mova [outq+DCTSIZE*2*2], m4
|
||||
paddw m0, m2
|
||||
mova m7, [outq+DCTSIZE*4*2]
|
||||
psubw m6, m2
|
||||
mova [outq+DCTSIZE*5*2], m5
|
||||
paddw m7, m0
|
||||
movq [outq+DCTSIZE*3*2], m6
|
||||
movq [outq+DCTSIZE*4*2], m7
|
||||
add outq, 8+%3
|
||||
mova [outq+DCTSIZE*3*2], m6
|
||||
mova [outq+DCTSIZE*4*2], m7
|
||||
add outq, 32
|
||||
%endmacro
|
||||
|
||||
%macro COLUMN_IDCT 0-1 0
|
||||
movq m3, m5
|
||||
%macro COLUMN_IDCT 0
|
||||
mova m3, m5
|
||||
psubw m5, m1
|
||||
psllw m5, 1
|
||||
paddw m3, m1
|
||||
movq m2, m0
|
||||
mova m2, m0
|
||||
psubw m0, m6
|
||||
movq m1, m5
|
||||
psllw m0, 1
|
||||
psllw m1, m5, 1
|
||||
pmulhw m1, [pw_AC62]
|
||||
paddw m5, m0
|
||||
pmulhw m5, [pw_3B21]
|
||||
pmulhw m5, [pw_7642]
|
||||
paddw m2, m6
|
||||
pmulhw m0, [pw_22A3]
|
||||
movq m7, m2
|
||||
movq m4, [rsp]
|
||||
pmulhw m0, [pw_4546]
|
||||
mova m7, m2
|
||||
mova m4, tmp0
|
||||
psubw m2, m3
|
||||
psllw m2, 1
|
||||
paddw m7, m3
|
||||
pmulhw m2, [pw_2D41]
|
||||
movq m6, m4
|
||||
pmulhw m2, SQRT2
|
||||
mova m6, m4
|
||||
psraw m7, 2
|
||||
paddw m4, [outq]
|
||||
psubw m6, m7
|
||||
movq m3, [rsp+8]
|
||||
mova m3, tmp1
|
||||
paddw m4, m7
|
||||
movq [outq+DCTSIZE*7*2], m6
|
||||
mova [outq+DCTSIZE*7*2], m6
|
||||
paddw m1, m5
|
||||
movq [outq], m4
|
||||
mova [outq], m4
|
||||
psubw m1, m7
|
||||
movq m7, [rsp+8*2]
|
||||
mova m7, tmp2
|
||||
psubw m0, m5
|
||||
movq m6, [rsp+8*3]
|
||||
movq m5, m3
|
||||
mova m6, tmp3
|
||||
mova m5, m3
|
||||
paddw m3, [outq+DCTSIZE*1*2]
|
||||
psubw m5, m1
|
||||
psubw m2, m1
|
||||
paddw m3, m1
|
||||
movq [outq+DCTSIZE*6*2], m5
|
||||
movq m4, m7
|
||||
mova [outq+DCTSIZE*6*2], m5
|
||||
mova m4, m7
|
||||
paddw m7, [outq+DCTSIZE*2*2]
|
||||
psubw m4, m2
|
||||
paddw m4, [outq+DCTSIZE*5*2]
|
||||
paddw m7, m2
|
||||
movq [outq+DCTSIZE*1*2], m3
|
||||
mova [outq+DCTSIZE*1*2], m3
|
||||
paddw m0, m2
|
||||
movq [outq+DCTSIZE*2*2], m7
|
||||
movq m1, m6
|
||||
mova [outq+DCTSIZE*2*2], m7
|
||||
mova m1, m6
|
||||
paddw m6, [outq+DCTSIZE*4*2]
|
||||
psubw m1, m0
|
||||
paddw m1, [outq+DCTSIZE*3*2]
|
||||
paddw m6, m0
|
||||
movq [outq+DCTSIZE*5*2], m4
|
||||
add srcq, 8+%1
|
||||
movq [outq+DCTSIZE*4*2], m6
|
||||
movq [outq+DCTSIZE*3*2], m1
|
||||
add outq, 8+%1
|
||||
mova [outq+DCTSIZE*5*2], m4
|
||||
mova [outq+DCTSIZE*4*2], m6
|
||||
mova [outq+DCTSIZE*3*2], m1
|
||||
add outq, 32
|
||||
%endmacro
|
||||
|
||||
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
|
||||
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
|
||||
.fdct1:
|
||||
COLUMN_FDCT .idct1
|
||||
jmp .fdct2
|
||||
;void ff_column_fidct_sse2(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
|
||||
cglobal column_fidct, 4, 5, 8+5*ARCH_X86_64, 64*!ARCH_X86_64, thr, src, out, cnt, tmp
|
||||
%if ARCH_X86_64
|
||||
%define tmp0 m8
|
||||
%define tmp1 m9
|
||||
%define tmp2 m10
|
||||
%define tmp3 m11
|
||||
%define SQRT2 m12
|
||||
mova m12, [pw_5A82]
|
||||
%else
|
||||
%define tmp0 [rsp]
|
||||
%define tmp1 [rsp+16]
|
||||
%define tmp2 [rsp+2*16]
|
||||
%define tmp3 [rsp+3*16]
|
||||
%define SQRT2 [pw_5A82]
|
||||
%endif
|
||||
.fdct:
|
||||
COLUMN_FDCT .idct
|
||||
sub cntd, 2
|
||||
jg .fdct
|
||||
RET
|
||||
|
||||
.idct1:
|
||||
.idct:
|
||||
COLUMN_IDCT
|
||||
|
||||
.fdct2:
|
||||
COLUMN_FDCT .idct2, 8, 16
|
||||
sub cntd, 2
|
||||
jg .fdct1
|
||||
RET
|
||||
|
||||
.idct2:
|
||||
COLUMN_IDCT 16
|
||||
sub cntd, 2
|
||||
jg .fdct1
|
||||
jg .fdct
|
||||
RET
|
||||
|
||||
INIT_MMX mmx
|
||||
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
|
||||
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
|
||||
add strideq, strideq
|
||||
|
|
|
|||
|
|
@ -21,29 +21,31 @@
|
|||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavfilter/vf_fspp.h"
|
||||
#include "libavfilter/vf_fsppdsp.h"
|
||||
|
||||
void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
|
||||
void ff_store_slice_sse2(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
|
||||
void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
|
||||
void ff_store_slice2_sse2(uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
void ff_mul_thrmat_sse2(const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
void ff_column_fidct_sse2(const int16_t *thr_adr, const int16_t *data, int16_t *output, int cnt);
|
||||
void ff_row_idct_mmx(const int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
|
||||
void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
|
||||
|
||||
av_cold void ff_fspp_init_x86(FSPPContext *s)
|
||||
av_cold void ff_fsppdsp_init_x86(FSPPDSPContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
s->store_slice = ff_store_slice_mmx;
|
||||
s->store_slice2 = ff_store_slice2_mmx;
|
||||
s->mul_thrmat = ff_mul_thrmat_mmx;
|
||||
s->column_fidct = ff_column_fidct_mmx;
|
||||
s->row_idct = ff_row_idct_mmx;
|
||||
s->row_fdct = ff_row_fdct_mmx;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
s->store_slice = ff_store_slice_sse2;
|
||||
s->store_slice2 = ff_store_slice2_sse2;
|
||||
s->mul_thrmat = ff_mul_thrmat_sse2;
|
||||
s->column_fidct = ff_column_fidct_sse2;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ static void store_slice_sse2(uint8_t *dst, const int16_t *src,
|
|||
}
|
||||
}
|
||||
|
||||
#endif /* HAVE_MMX_INLINE */
|
||||
#endif /* HAVE_SSE2_INLINE */
|
||||
|
||||
av_cold void ff_spp_init_x86(SPPContext *s)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -64,6 +64,7 @@ AVFILTEROBJS-$(CONFIG_BWDIF_FILTER) += vf_bwdif.o
|
|||
AVFILTEROBJS-$(CONFIG_COLORDETECT_FILTER)+= vf_colordetect.o
|
||||
AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
|
||||
AVFILTEROBJS-$(CONFIG_EQ_FILTER) += vf_eq.o
|
||||
AVFILTEROBJS-$(CONFIG_FSPP_FILTER) += vf_fspp.o
|
||||
AVFILTEROBJS-$(CONFIG_GBLUR_FILTER) += vf_gblur.o
|
||||
AVFILTEROBJS-$(CONFIG_HFLIP_FILTER) += vf_hflip.o
|
||||
AVFILTEROBJS-$(CONFIG_IDET_FILTER) += vf_idet.o
|
||||
|
|
|
|||
|
|
@ -297,6 +297,9 @@ static const struct {
|
|||
#if CONFIG_EQ_FILTER
|
||||
{ "vf_eq", checkasm_check_vf_eq },
|
||||
#endif
|
||||
#if CONFIG_FSPP_FILTER
|
||||
{ "vf_fspp", checkasm_check_vf_fspp },
|
||||
#endif
|
||||
#if CONFIG_GBLUR_FILTER
|
||||
{ "vf_gblur", checkasm_check_vf_gblur },
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -148,6 +148,7 @@ void checkasm_check_v210enc(void);
|
|||
void checkasm_check_vc1dsp(void);
|
||||
void checkasm_check_vf_bwdif(void);
|
||||
void checkasm_check_vf_eq(void);
|
||||
void checkasm_check_vf_fspp(void);
|
||||
void checkasm_check_vf_gblur(void);
|
||||
void checkasm_check_vf_hflip(void);
|
||||
void checkasm_check_vf_threshold(void);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "checkasm.h"
|
||||
#include "libavfilter/vf_fsppdsp.h"
|
||||
#include "libavcodec/mathops.h"
|
||||
#include "libavutil/mem_internal.h"
|
||||
|
||||
/**
 * Fill every element of the array @p buf with a fresh rnd() value.
 *
 * @p buf must be an actual array (not a pointer), because its element count
 * is taken with FF_ARRAY_ELEMS(). The argument is parenthesized wherever it
 * is indexed so that expressions such as `*ptr_to_array` expand correctly,
 * and the loop variable carries a trailing underscore to avoid capturing a
 * caller's identifier.
 */
#define randomize_buffers(buf)                                   \
    do {                                                         \
        for (size_t idx_ = 0; idx_ < FF_ARRAY_ELEMS(buf); ++idx_) \
            (buf)[idx_] = rnd();                                 \
    } while (0)
|
||||
|
||||
/**
 * Fill the first @p nb_elems elements of @p buf and @p buf2 with identical
 * values, each obtained as sign_extend(rnd(), nb_bits).
 *
 * Both buffers receive exactly the same sequence, so they can serve as the
 * reference and the test input of one comparison. All arguments are
 * parenthesized and @p nb_elems is evaluated once, so arbitrary expressions
 * may be passed.
 * NOTE(review): sign_extend() presumably limits the values to the range of a
 * signed nb_bits-bit integer -- confirm against its definition.
 */
#define randomize_mask_buffers(buf, buf2, nb_elems, nb_bits)                \
    do {                                                                    \
        const size_t count_ = (nb_elems);                                   \
        for (size_t idx_ = 0; idx_ < count_; ++idx_)                        \
            (buf)[idx_] = (buf2)[idx_] = sign_extend(rnd(), (nb_bits));     \
    } while (0)
|
||||
|
||||
/**
 * Fill every element of the array @p buf with a value in [min, max]
 * (both bounds inclusive) derived from rnd().
 *
 * @p buf must be an actual array, since its size is taken with
 * FF_ARRAY_ELEMS(). The bounds are parenthesized so that expressions can be
 * passed, and the random offset is converted to int before it is added to
 * @p min, so a negative lower bound does not depend on unsigned wrap-around.
 * Requires min <= max and max - min + 1 representable as a positive int.
 */
#define randomize_buffer_range(buf, min, max)                                \
    do {                                                                     \
        for (size_t idx_ = 0; idx_ < FF_ARRAY_ELEMS(buf); ++idx_)            \
            (buf)[idx_] = (min) + (int)(rnd() % ((max) - (min) + 1));        \
    } while (0)
|
||||
|
||||
static void check_store_slice(void)
|
||||
{
|
||||
enum {
|
||||
MAX_WIDTH = 256,
|
||||
/// in elements, not in bytes; 32 is arbitrary
|
||||
MAX_STRIDE = MAX_WIDTH + 32,
|
||||
MAX_HEIGHT = 8,
|
||||
};
|
||||
FSPPDSPContext fspp;
|
||||
ff_fsppdsp_init(&fspp);
|
||||
declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *src,
|
||||
ptrdiff_t dst_stride, ptrdiff_t src_stride,
|
||||
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
if (check_func(i ? fspp.store_slice2 : fspp.store_slice, "store_slice%s", i ? "2" : "")) {
|
||||
// store slice resets the row eight lines above the current one
|
||||
DECLARE_ALIGNED(16, int16_t, src_ref1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
|
||||
DECLARE_ALIGNED(16, int16_t, src_new1)[MAX_STRIDE * ( 8 + MAX_HEIGHT - 1) + MAX_WIDTH];
|
||||
// store_slice2 resets the row 16 lines below the current one
|
||||
DECLARE_ALIGNED(16, int16_t, src_ref2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
|
||||
DECLARE_ALIGNED(16, int16_t, src_new2)[MAX_STRIDE * (16 + MAX_HEIGHT - 1) + MAX_WIDTH];
|
||||
uint8_t dstbuf_new[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH], dstbuf_ref[MAX_STRIDE * (MAX_HEIGHT - 1) + MAX_WIDTH];
|
||||
uint8_t *dst_new = dstbuf_new, *dst_ref = dstbuf_ref;
|
||||
int16_t *src_ref, *src_new, *or_src_ref, *or_src_new;
|
||||
ptrdiff_t width = 1 + rnd() % MAX_WIDTH;
|
||||
ptrdiff_t src_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
|
||||
ptrdiff_t dst_stride = FFALIGN(width + 1 + rnd() % (MAX_STRIDE - MAX_WIDTH), 8);
|
||||
ptrdiff_t height = 1 + rnd() % 8;
|
||||
size_t nb_elems;
|
||||
|
||||
if (i) {
|
||||
src_ref = src_ref2;
|
||||
src_new = src_new2;
|
||||
or_src_ref = src_ref2;
|
||||
or_src_new = src_new2;
|
||||
nb_elems = FF_ARRAY_ELEMS(src_ref2);
|
||||
} else {
|
||||
src_ref = src_ref1 + 8 * src_stride;
|
||||
src_new = src_new1 + 8 * src_stride;
|
||||
or_src_ref = src_ref1;
|
||||
or_src_new = src_new1;
|
||||
nb_elems = FF_ARRAY_ELEMS(src_ref1);
|
||||
}
|
||||
if (rnd() & 1) {
|
||||
dst_ref += dst_stride * (height - 1);
|
||||
dst_new += dst_stride * (height - 1);
|
||||
dst_stride *= -1;
|
||||
}
|
||||
randomize_buffers(dstbuf_new);
|
||||
memcpy(dstbuf_ref, dstbuf_new, sizeof(dstbuf_ref));
|
||||
randomize_mask_buffers(or_src_ref, or_src_new, nb_elems, 14);
|
||||
|
||||
ptrdiff_t log2_scale = rnd() & 1;
|
||||
call_ref(dst_ref, src_ref, dst_stride, src_stride, width, height, log2_scale);
|
||||
call_new(dst_new, src_new, dst_stride, src_stride, width, height, log2_scale);
|
||||
if (memcmp(dstbuf_new, dstbuf_ref, sizeof(dstbuf_ref)) ||
|
||||
memcmp(or_src_ref, or_src_new, sizeof(*or_src_new) * nb_elems))
|
||||
fail();
|
||||
// don't use random parameters for benchmarks
|
||||
src_ref = or_src_ref + !i * 8 * MAX_STRIDE;
|
||||
bench_new(dstbuf_new, src_ref,
|
||||
MAX_STRIDE, MAX_STRIDE, MAX_WIDTH, 8, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void check_mul_thrmat(void)
|
||||
{
|
||||
FSPPDSPContext fspp;
|
||||
DECLARE_ALIGNED(16, int16_t, src)[64];
|
||||
DECLARE_ALIGNED(16, int16_t, dst_ref)[64];
|
||||
DECLARE_ALIGNED(16, int16_t, dst_new)[64];
|
||||
const int q = (uint8_t)rnd();
|
||||
declare_func(void, const int16_t *thr_adr_noq, int16_t *thr_adr, int q);
|
||||
|
||||
ff_fsppdsp_init(&fspp);
|
||||
|
||||
if (check_func(fspp.mul_thrmat, "mul_thrmat")) {
|
||||
randomize_buffers(src);
|
||||
call_ref(src, dst_ref, q);
|
||||
call_new(src, dst_new, q);
|
||||
if (memcmp(dst_ref, dst_new, sizeof(dst_ref)))
|
||||
fail();
|
||||
bench_new(src, dst_new, q);
|
||||
}
|
||||
}
|
||||
|
||||
static void check_column_fidct(void)
|
||||
{
|
||||
enum {
|
||||
NB_BLOCKS = 8, ///< arbitrary
|
||||
};
|
||||
FSPPDSPContext fspp;
|
||||
declare_func(void, const int16_t *thr_adr, const int16_t *data,
|
||||
int16_t *output, int cnt);
|
||||
|
||||
ff_fsppdsp_init(&fspp);
|
||||
|
||||
if (check_func(fspp.column_fidct, "column_fidct")) {
|
||||
DECLARE_ALIGNED(16, int16_t, threshold)[64];
|
||||
DECLARE_ALIGNED(16, int16_t, src)[8*(8*NB_BLOCKS + 6)];
|
||||
DECLARE_ALIGNED(16, int16_t, dst_new)[8*(8*NB_BLOCKS + 6)];
|
||||
DECLARE_ALIGNED(16, int16_t, dst_ref)[8*(8*NB_BLOCKS + 6)];
|
||||
|
||||
randomize_buffer_range(threshold, 0, INT16_MAX);
|
||||
randomize_buffer_range(src, -1284, 1284);
|
||||
randomize_buffers(dst_new);
|
||||
memcpy(dst_ref, dst_new, sizeof(dst_ref));
|
||||
|
||||
call_ref(threshold, src, dst_ref, NB_BLOCKS * 8);
|
||||
call_new(threshold, src, dst_new, NB_BLOCKS * 8);
|
||||
|
||||
if (memcmp(dst_new, dst_ref, sizeof(dst_new)))
|
||||
fail();
|
||||
|
||||
bench_new(threshold, src, dst_new, NB_BLOCKS * 8);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * checkasm entry point for the fspp filter DSP functions.
 *
 * Runs every check of this file and calls report() after each one, so that
 * checkasm prints one result line per group of tested functions.
 */
void checkasm_check_vf_fspp(void)
{
    check_store_slice();
    report("store_slice");

    check_mul_thrmat();
    report("mul_thrmat");

    check_column_fidct();
    report("column_fidct");
}
|
||||
|
|
@ -55,13 +55,17 @@ configure()(
|
|||
${cross_prefix:+--cross-prefix="$cross_prefix"} \
|
||||
${as:+--as="$as"} \
|
||||
${cc:+--cc="$cc"} \
|
||||
${cxx:+--cxx="$cxx"} \
|
||||
${ld:+--ld="$ld"} \
|
||||
${nm:+--nm="$nm"} \
|
||||
${target_os:+--target-os="$target_os"} \
|
||||
${sysroot:+--sysroot="$sysroot"} \
|
||||
${target_exec:+--target-exec="$target_exec"} \
|
||||
${target_path:+--target-path="$target_path"} \
|
||||
${target_samples:+--target-samples="$target_samples"} \
|
||||
${extra_cflags:+--extra-cflags="$extra_cflags"} \
|
||||
${extra_cxxflags:+--extra-cxxflags="$extra_cxxflags"} \
|
||||
${extra_objcflags:+--extra-objcflags="$extra_objcflags"} \
|
||||
${extra_ldflags:+--extra-ldflags="$extra_ldflags"} \
|
||||
${extra_libs:+--extra-libs="$extra_libs"} \
|
||||
${extra_conf}
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \
|
|||
fate-checkasm-vf_colordetect \
|
||||
fate-checkasm-vf_colorspace \
|
||||
fate-checkasm-vf_eq \
|
||||
fate-checkasm-vf_fspp \
|
||||
fate-checkasm-vf_gblur \
|
||||
fate-checkasm-vf_hflip \
|
||||
fate-checkasm-vf_nlmeans \
|
||||
|
|
|
|||
Loading…
Reference in New Issue