45#include "../codestream/ojph_params_local.h"
57 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
59 __m128 a = _mm_load_ps(sp);
60 __m128 b = _mm_load_ps(sp + 4);
61 __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
62 __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
72 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
74 __m128 a = _mm_load_ps(spl);
75 __m128 b = _mm_load_ps(sph);
76 __m128 c = _mm_unpacklo_ps(a, b);
77 __m128 d = _mm_unpackhi_ps(a, b);
79 _mm_store_ps(dp + 4, d);
86 __m128 factor = _mm_set1_ps(f);
87 for (; width > 0; width -= 4, p += 4)
89 __m128 s = _mm_load_ps(p);
90 _mm_store_ps(p, _mm_mul_ps(factor, s));
97 ui32 repeat,
bool synthesis)
103 __m128 factor = _mm_set1_ps(a);
105 float* dst = aug->
f32;
106 const float* src1 = sig->
f32, * src2 = other->
f32;
108 for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
110 __m128 s1 = _mm_load_ps(src1);
111 __m128 s2 = _mm_load_ps(src2);
112 __m128 d = _mm_load_ps(dst);
113 d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
114 _mm_store_ps(dst, d);
127 ui32 width,
bool even)
133 float* dpl = even ? ldst->
f32 : hdst->
f32;
134 float* dph = even ? hdst->
f32 : ldst->
f32;
135 float* sp = src->
f32;
141 float* hp = hdst->
f32, * lp = ldst->
f32;
142 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
143 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
145 for (
ui32 j = num_steps; j > 0; --j)
152 lp[l_width] = lp[l_width - 1];
154 const float* sp = lp;
156 int i = (int)h_width;
157 __m128 f = _mm_set1_ps(a);
160 for (; i > 0; i -= 4, sp += 4, dp += 4)
162 __m128 m = _mm_load_ps(sp);
163 __m128 n = _mm_loadu_ps(sp + 1);
164 __m128 p = _mm_load_ps(dp);
165 p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
171 for (; i > 0; i -= 4, sp += 4, dp += 4)
173 __m128 m = _mm_load_ps(sp);
174 __m128 n = _mm_loadu_ps(sp - 1);
175 __m128 p = _mm_load_ps(dp);
176 p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
182 float* t = lp; lp = hp; hp = t;
184 ui32 w = l_width; l_width = h_width; h_width = w;
188 float K = atk->
get_K();
189 float K_inv = 1.0f / K;
196 ldst->
f32[0] = src->
f32[0];
198 hdst->
f32[0] = src->
f32[0] * 2.0f;
205 ui32 width,
bool even)
210 float* oth = hsrc->
f32, * aug = lsrc->
f32;
211 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
212 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
215 float K = atk->
get_K();
216 float K_inv = 1.0f / K;
223 for (
ui32 j = 0; j < num_steps; ++j)
230 oth[oth_width] = oth[oth_width - 1];
232 const float* sp = oth;
234 int i = (int)aug_width;
235 __m128 f = _mm_set1_ps(a);
238 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
240 __m128 m = _mm_load_ps(sp);
241 __m128 n = _mm_loadu_ps(sp - 1);
242 __m128 p = _mm_load_ps(dp);
243 p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
249 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
251 __m128 m = _mm_load_ps(sp);
252 __m128 n = _mm_loadu_ps(sp + 1);
253 __m128 p = _mm_load_ps(dp);
254 p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
260 float* t = aug; aug = oth; oth = t;
262 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
267 float* dp = dst->
f32;
268 float* spl = even ? lsrc->
f32 : hsrc->
f32;
269 float* sph = even ? hsrc->
f32 : lsrc->
f32;
276 dst->
f32[0] = lsrc->
f32[0];
278 dst->
f32[0] = hsrc->
f32[0] * 0.5f;
void sse_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
static void sse_multiply_const(float *p, float f, int width)
void sse_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void sse_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
static void sse_deinterleave32(float *dpl, float *dph, float *sp, int width)
void sse_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void sse_interleave32(float *dp, float *spl, float *sph, int width)
ui32 get_num_steps() const
const lifting_step * get_step(ui32 s) const