#include "../codestream/ojph_params_local.h"
#include <emmintrin.h>  // SSE2 intrinsics used throughout this file
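
// The fragment below is the body of sse2_mm_srai_epi64(__m128i a, int amt,
// __m128i m), declared near the end of this file.  SSE2 has no 64-bit
// arithmetic right shift, so it is emulated: shift logically, then use the
// mask m = _mm_set1_epi64x(1LL << (63 - amt)) to re-extend the sign bit.
// The callers below prepare the mask once per lifting step, e.g.
//   __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
//   __m128i w  = sse2_mm_srai_epi64(v, e, ve);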
__m128i x = _mm_srli_epi64(a, amt);   // logical shift right
x = _mm_xor_si128(x, m);              // toggle the bit at the old sign position
__m128i result = _mm_sub_epi64(x, m); // subtract the mask to propagate the sign
return result;
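
// The next four loops move data between the interleaved representation of a
// line and the two half-length subband buffers: the de-interleave versions
// split even/odd samples into the low-pass (dpl) and high-pass (dph)
// outputs, and the interleave versions merge them back into dp.  The 32-bit
// variants process four samples per register, the 64-bit variants two.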
for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
{
  __m128 a = _mm_load_ps(sp);
  __m128 b = _mm_load_ps(sp + 4);
  __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // even samples
  __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); // odd samples
  _mm_store_ps(dpl, c);
  _mm_store_ps(dph, d);
}
for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
{
  __m128 a = _mm_load_ps(spl);
  __m128 b = _mm_load_ps(sph);
  __m128 c = _mm_unpacklo_ps(a, b);
  __m128 d = _mm_unpackhi_ps(a, b);
  _mm_store_ps(dp, c);
  _mm_store_ps(dp + 4, d);
}
for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
{
  __m128d a = _mm_load_pd(sp);
  __m128d b = _mm_load_pd(sp + 2);
  __m128d c = _mm_shuffle_pd(a, b, 0);
  __m128d d = _mm_shuffle_pd(a, b, 3);
  _mm_store_pd(dpl, c);
  _mm_store_pd(dph, d);
}
for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
{
  __m128d a = _mm_load_pd(spl);
  __m128d b = _mm_load_pd(sph);
  __m128d c = _mm_unpacklo_pd(a, b);
  __m128d d = _mm_unpackhi_pd(a, b);
  _mm_store_pd(dp, c);
  _mm_store_pd(dp + 2, d);
}
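
// sse2_rev_vert_step32: one reversible vertical lifting step on 32-bit
// samples.  With a, b, e taken from the lifting step s, the scalar operation
// is dst[i] -= (b + a * (src1[i] + src2[i])) >> e during synthesis and
// dst[i] += ... during analysis; the common 5/3 cases (a == 1, and a == -1
// with b == 1, e == 1) get dedicated SIMD branches, and any other step
// falls back to the scalar loop at the end.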
ui32 repeat, bool synthesis)
__m128i vb = _mm_set1_epi32(b);   // the step's additive offset b in all lanes
si32* dst = aug->i32;             // the band being updated
const si32* src1 = sig->i32, * src2 = other->i32;
// a == 1 (e.g. the 5/3 update step)
int i = (int)repeat;
// synthesis: dst -= (b + (src1 + src2)) >> e
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_add_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// analysis: dst += (b + (src1 + src2)) >> e
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_add_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
else if (a == -1 && b == 1 && e == 1)
// the 5/3 predict step; since (1 - t) >> 1 == -(t >> 1), the shifted sum is
// added during synthesis and subtracted during analysis
int i = (int)repeat;
// synthesis: dst += (src1 + src2) >> 1
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i w = _mm_srai_epi32(t, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// analysis: dst -= (src1 + src2) >> 1
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i w = _mm_srai_epi32(t, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// a == -1 other than the 5/3 predict form: b + a * t == b - t
int i = (int)repeat;
// synthesis: dst -= (b - (src1 + src2)) >> e
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_sub_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// analysis: dst += (b - (src1 + src2)) >> e
for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_sub_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// general case, scalar fallback
if (synthesis)
  for (ui32 i = repeat; i > 0; --i)
    *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
else
  for (ui32 i = repeat; i > 0; --i)
    *dst++ += (b + a * (*src1++ + *src2++)) >> e;
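
// sse2_rev_vert_step64 mirrors the 32-bit step above for 64-bit samples;
// only two si64 values fit in a register, and the arithmetic shift has to
// go through sse2_mm_srai_epi64 with the precomputed mask ve.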
ui32 repeat, bool synthesis)
__m128i vb = _mm_set1_epi64x(b);
__m128i ve = _mm_set1_epi64x(1LL << (63 - e));  // mask for sse2_mm_srai_epi64
si64* dst = aug->i64;                           // the band being updated
const si64* src1 = sig->i64, * src2 = other->i64;
// a == 1 (e.g. the 5/3 update step)
int i = (int)repeat;
// synthesis: dst -= (b + (src1 + src2)) >> e
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_add_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// analysis: dst += (b + (src1 + src2)) >> e
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_add_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
else if (a == -1 && b == 1 && e == 1)
// the 5/3 predict step: the shifted sum is added during synthesis and
// subtracted during analysis
int i = (int)repeat;
// synthesis: dst += (src1 + src2) >> 1
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i w = sse2_mm_srai_epi64(t, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// analysis: dst -= (src1 + src2) >> 1
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i w = sse2_mm_srai_epi64(t, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// a == -1 other than the 5/3 predict form: b + a * t == b - t
int i = (int)repeat;
// synthesis: dst -= (b - (src1 + src2)) >> e
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_sub_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// analysis: dst += (b - (src1 + src2)) >> e
for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)src1);
  __m128i s2 = _mm_load_si128((__m128i*)src2);
  __m128i d = _mm_load_si128((__m128i*)dst);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_sub_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dst, d);
}
// general case, scalar fallback
if (synthesis)
  for (ui32 i = repeat; i > 0; --i)
    *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
else
  for (ui32 i = repeat; i > 0; --i)
    *dst++ += (b + a * (*src1++ + *src2++)) >> e;
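
// sse2_rev_vert_step is the public entry point (its parameter tail appears
// below); it presumably selects between the 32- and 64-bit implementations
// above according to the sample type carried by the line buffers.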
ui32 repeat, bool synthesis)
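
// sse2_rev_horz_ana32: reversible horizontal analysis of one line of 32-bit
// samples.  The source line is split into a low band of l_width samples and
// a high band of h_width samples (which of ldst/hdst receives the even
// samples depends on `even`); each of the num_steps lifting steps of the
// atk kernel then replicates the last lp sample as a boundary extension,
// updates the hp band from its lp neighbours, and swaps the two band
// pointers (and widths) before the next step.  A width of one sample is
// handled separately at the end.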
ui32 width, bool even)
float* dpl = even ? ldst->f32 : hdst->f32;
float* dph = even ? hdst->f32 : ldst->f32;
float* sp = src->f32;
ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass band width
ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass band width
for (ui32 j = num_steps; j > 0; --j)
__m128i vb = _mm_set1_epi32(b);   // b of the current lifting step
lp[l_width] = lp[l_width - 1];    // replicate the last low sample (boundary extension)
// a == 1
int i = (int)h_width;
// even case: *dp += (b + (sp[0] + sp[1])) >> e
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));  // unaligned right neighbour
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_add_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case: *dp += (b + (sp[-1] + sp[0])) >> e
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));  // unaligned left neighbour
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_add_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
else if (a == -1 && b == 1 && e == 1)
// the 5/3 predict step; (1 - t) >> 1 == -(t >> 1), so the shifted sum is subtracted
int i = (int)h_width;
// even case: neighbours sp[0], sp[1]
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i w = _mm_srai_epi32(t, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case: neighbours sp[-1], sp[0]
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i w = _mm_srai_epi32(t, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// a == -1 other than the 5/3 predict form: *dp += (b - (left + right)) >> e
int i = (int)h_width;
// even case
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_sub_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_sub_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// general case, scalar fallback
if (even)
  for (ui32 i = h_width; i > 0; --i, sp++, dp++)
    *dp += (b + a * (sp[0] + sp[1])) >> e;
else
  for (ui32 i = h_width; i > 0; --i, sp++, dp++)
    *dp += (b + a * (sp[-1] + sp[0])) >> e;
// swap the roles (and widths) of the two bands for the next lifting step
si32* t = lp; lp = hp; hp = t;
ui32 w = l_width; l_width = h_width; h_width = w;
ldst->i32[0] = src->i32[0];       // width == 1, even: the sample goes to the low band
hdst->i32[0] = src->i32[0] << 1;  // width == 1, odd: the high band holds the doubled sample
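
// sse2_rev_horz_ana64 repeats the same horizontal analysis for 64-bit
// samples: two si64 values per register, unaligned loads for the shifted
// neighbour, and sse2_mm_srai_epi64 (with the ve mask) in place of the
// native 32-bit arithmetic shift.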
ui32 width, bool even)
double* dpl = (double*)(even ? ldst->p : hdst->p);
double* dph = (double*)(even ? hdst->p : ldst->p);
double* sp = (double*)src->p;
ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass band width
ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass band width
for (ui32 j = num_steps; j > 0; --j)
__m128i vb = _mm_set1_epi64x(b);
__m128i ve = _mm_set1_epi64x(1LL << (63 - e));  // mask for sse2_mm_srai_epi64
lp[l_width] = lp[l_width - 1];                  // boundary extension
// a == 1
int i = (int)h_width;
// even case: *dp += (b + (sp[0] + sp[1])) >> e
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_add_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case: *dp += (b + (sp[-1] + sp[0])) >> e
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_add_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
else if (a == -1 && b == 1 && e == 1)
// the 5/3 predict step: the shifted sum is subtracted
int i = (int)h_width;
// even case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i w = sse2_mm_srai_epi64(t, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i w = sse2_mm_srai_epi64(t, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// a == -1 other than the 5/3 predict form: *dp += (b - (left + right)) >> e
int i = (int)h_width;
// even case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_sub_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_sub_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// general case, scalar fallback
if (even)
  for (ui32 i = h_width; i > 0; --i, sp++, dp++)
    *dp += (b + a * (sp[0] + sp[1])) >> e;
else
  for (ui32 i = h_width; i > 0; --i, sp++, dp++)
    *dp += (b + a * (sp[-1] + sp[0])) >> e;
// swap the roles (and widths) of the two bands for the next lifting step
si64* t = lp; lp = hp; hp = t;
ui32 w = l_width; l_width = h_width; h_width = w;
ldst->i64[0] = src->i64[0];       // width == 1, even: the sample goes to the low band
hdst->i64[0] = src->i64[0] << 1;  // width == 1, odd: the high band holds the doubled sample
// parameter tail of sse2_rev_horz_ana, the un-suffixed wrapper declared at
// the end of this file
ui32 width, bool even)
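
// sse2_rev_horz_syn32: reversible horizontal synthesis of one line of 32-bit
// samples; it undoes the analysis above.  The lifting steps are applied in
// forward order (j = 0 .. num_steps-1) with the update direction reversed
// (subtraction where analysis added), working on an "aug" band of aug_width
// samples against an "oth" band of oth_width samples; the two bands are
// finally interleaved back into dst.  A width of one sample is again
// handled separately.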
ui32 width, bool even)
ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // width of the band being updated (aug)
ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // width of the other band (oth)
for (ui32 j = 0; j < num_steps; ++j)
__m128i vb = _mm_set1_epi32(b);       // b of the current lifting step
oth[oth_width] = oth[oth_width - 1];  // boundary extension on the other band
const si32* sp = oth;                 // read from oth, update the aug band through dp
// a == 1
int i = (int)aug_width;
// even case: *dp -= (b + (sp[-1] + sp[0])) >> e
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));  // unaligned left neighbour
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_add_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case: *dp -= (b + (sp[0] + sp[1])) >> e
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));  // unaligned right neighbour
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_add_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
else if (a == -1 && b == 1 && e == 1)
// the 5/3 predict step; the shifted sum is added back during synthesis
int i = (int)aug_width;
// even case
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i w = _mm_srai_epi32(t, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i w = _mm_srai_epi32(t, e);
  d = _mm_add_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// a == -1 other than the 5/3 predict form: *dp -= (b - (left + right)) >> e
int i = (int)aug_width;
// even case
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_sub_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case
for (; i > 0; i -= 4, sp += 4, dp += 4)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi32(s1, s2);
  __m128i v = _mm_sub_epi32(vb, t);
  __m128i w = _mm_srai_epi32(v, e);
  d = _mm_sub_epi32(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// general case, scalar fallback
if (even)
  for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
    *dp -= (b + a * (sp[-1] + sp[0])) >> e;
else
  for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
    *dp -= (b + a * (sp[0] + sp[1])) >> e;
// swap the roles (and widths) of the two bands for the next lifting step
si32* t = aug; aug = oth; oth = t;
ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// interleave the two bands back into the destination line
float* dp = dst->f32;
float* spl = even ? lsrc->f32 : hsrc->f32;
float* sph = even ? hsrc->f32 : lsrc->f32;
dst->i32[0] = lsrc->i32[0];       // width == 1, even: take the low-band sample
dst->i32[0] = hsrc->i32[0] >> 1;  // width == 1, odd: halve the high-band sample
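
// sse2_rev_horz_syn64 is the 64-bit counterpart of the synthesis above,
// again using two si64 samples per register and sse2_mm_srai_epi64 for the
// arithmetic shift.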
ui32 width, bool even)
ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // width of the band being updated (aug)
ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // width of the other band (oth)
for (ui32 j = 0; j < num_steps; ++j)
__m128i vb = _mm_set1_epi64x(b);
__m128i ve = _mm_set1_epi64x(1LL << (63 - e));  // mask for sse2_mm_srai_epi64
oth[oth_width] = oth[oth_width - 1];            // boundary extension
const si64* sp = oth;
// a == 1
int i = (int)aug_width;
// even case: *dp -= (b + (sp[-1] + sp[0])) >> e
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_add_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case: *dp -= (b + (sp[0] + sp[1])) >> e
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_add_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
else if (a == -1 && b == 1 && e == 1)
// the 5/3 predict step; the shifted sum is added back during synthesis
int i = (int)aug_width;
// even case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i w = sse2_mm_srai_epi64(t, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i w = sse2_mm_srai_epi64(t, e, ve);
  d = _mm_add_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// a == -1 other than the 5/3 predict form: *dp -= (b - (left + right)) >> e
int i = (int)aug_width;
// even case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_sub_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// odd case
for (; i > 0; i -= 2, sp += 2, dp += 2)
{
  __m128i s1 = _mm_load_si128((__m128i*)sp);
  __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
  __m128i d = _mm_load_si128((__m128i*)dp);
  __m128i t = _mm_add_epi64(s1, s2);
  __m128i v = _mm_sub_epi64(vb, t);
  __m128i w = sse2_mm_srai_epi64(v, e, ve);
  d = _mm_sub_epi64(d, w);
  _mm_store_si128((__m128i*)dp, d);
}
// general case, scalar fallback
if (even)
  for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
    *dp -= (b + a * (sp[-1] + sp[0])) >> e;
else
  for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
    *dp -= (b + a * (sp[0] + sp[1])) >> e;
// swap the roles (and widths) of the two bands for the next lifting step
si64* t = aug; aug = oth; oth = t;
ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// interleave the two bands back into the destination line
double* dp = (double*)dst->p;
double* spl = (double*)(even ? lsrc->p : hsrc->p);
double* sph = (double*)(even ? hsrc->p : lsrc->p);
dst->i64[0] = lsrc->i64[0];       // width == 1, even: take the low-band sample
dst->i64[0] = hsrc->i64[0] >> 1;  // width == 1, odd: halve the high-band sample
// parameter tail of sse2_rev_horz_syn, the un-suffixed wrapper declared below
ui32 width, bool even)
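
// For reference, the functions involved in this translation unit are listed
// below; get_num_steps() and get_step() are param_atk members used to walk
// the lifting steps of the transformation kernel.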
static __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m);
static void sse2_deinterleave32(float *dpl, float *dph, float *sp, int width);
static void sse2_interleave32(float *dp, float *spl, float *sph, int width);
static void sse2_deinterleave64(double *dpl, double *dph, double *sp, int width);
static void sse2_interleave64(double *dp, double *spl, double *sph, int width);
static void sse2_rev_vert_step32(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis);
static void sse2_rev_vert_step64(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis);
void sse2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis);
static void sse2_rev_horz_ana32(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even);
static void sse2_rev_horz_ana64(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even);
void sse2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even);
void sse2_rev_horz_syn32(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even);
void sse2_rev_horz_syn64(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even);
void sse2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even);
ui32 get_num_steps() const;
const lifting_step * get_step(ui32 s) const;