#include <wasm_simd128.h>
#include "../codestream/ojph_params_local.h"
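
// This translation unit provides WASM 128-bit SIMD implementations of the
// wavelet lifting transforms: the irreversible (floating-point) and the
// reversible (integer) horizontal and vertical steps, each with 32-bit and
// 64-bit sample variants, plus the de/interleaving and scaling helpers they
// rely on.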

// Split an interleaved row (sp) into its even samples (dpl) and odd samples (dph).
static void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width)
{
  for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
  {
    v128_t a = wasm_v128_load(sp);
    v128_t b = wasm_v128_load(sp + 4);
    v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); // even lanes
    v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); // odd lanes
    wasm_v128_store(dpl, c);
    wasm_v128_store(dph, d);
  }
}

// Interleave the low-pass (spl) and high-pass (sph) samples back into one row (dp).
static void wasm_interleave32(float* dp, float* spl, float* sph, int width)
{
  for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
  {
    v128_t a = wasm_v128_load(spl);
    v128_t b = wasm_v128_load(sph);
    v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1);
    v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3);
    wasm_v128_store(dp, c);
    wasm_v128_store(dp + 4, d);
  }
}

// 64-bit variant of the deinterleave: two doubles per vector.
static void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width)
{
  for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
  {
    v128_t a = wasm_v128_load(sp);
    v128_t b = wasm_v128_load(sp + 2);
    v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0);
    v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1);
    wasm_v128_store(dpl, c);
    wasm_v128_store(dph, d);
  }
}

// 64-bit variant of the interleave: two doubles per vector.
static void wasm_interleave64(double* dp, double* spl, double* sph, int width)
{
  for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
  {
    v128_t a = wasm_v128_load(spl);
    v128_t b = wasm_v128_load(sph);
    v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0);
    v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1);
    wasm_v128_store(dp, c);
    wasm_v128_store(dp + 2, d);
  }
}

// Scale width floats at p by the constant f.
static void wasm_multiply_const(float* p, float f, int width)
{
  v128_t factor = wasm_f32x4_splat(f);
  for (; width > 0; width -= 4, p += 4)
  {
    v128_t s = wasm_v128_load(p);
    wasm_v128_store(p, wasm_f32x4_mul(factor, s));
  }
}
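
// The horizontal routines below use this helper to apply the irreversible
// transform's gain factors K and 1/K (obtained from param_atk::get_K()) to a
// subband line; wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
// performs the corresponding scaling in the vertical direction.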

// Irreversible (floating-point) vertical lifting step:
// aug[i] gets a multiple of (sig[i] + other[i]) added to it.
void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig,
                        const line_buf* other, const line_buf* aug,
                        ui32 repeat, bool synthesis)
{
  // ... a is the lifting coefficient of step s; its sign is flipped for synthesis
  v128_t factor = wasm_f32x4_splat(a);

  float* dst = aug->f32;
  const float* src1 = sig->f32, * src2 = other->f32;
  int i = (int)repeat;
  for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
  {
    v128_t s1 = wasm_v128_load(src1);
    v128_t s2 = wasm_v128_load(src2);
    v128_t d = wasm_v128_load(dst);
    d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)));
    wasm_v128_store(dst, d);
  }
}
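
// Scalar equivalent of the loop above (analysis direction):
//   for (ui32 i = 0; i < repeat; ++i)
//     aug->f32[i] += a * (sig->f32[i] + other->f32[i]);
// There is no separate subtraction loop; synthesis reuses the same code with
// the coefficient negated.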

// Irreversible horizontal analysis: deinterleave one row into a low-pass and a
// high-pass half, run the lifting steps over them, then scale by K and 1/K.
void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
                       const line_buf* hdst, const line_buf* src,
                       ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even samples (dpl) and odd samples (dph)
    float* dpl = even ? ldst->f32 : hdst->f32;
    float* dph = even ? hdst->f32 : ldst->f32;
    float* sp = src->f32;
    // ... wasm_deinterleave32 splits sp into dpl/dph

    // the actual horizontal transform
    float* hp = hdst->f32, * lp = ldst->f32;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass length
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass length
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = num_steps; j > 0; --j)
    {
      // ... fetch lifting step j - 1 from atk; a is its coefficient

      // ... symmetric extension of the left boundary
      lp[l_width] = lp[l_width - 1];          // extension of the right boundary

      // lifting: hp[i] += a * (lp[i] + neighbour)
      const float* sp = lp;
      float* dp = hp;
      int i = (int)h_width;
      v128_t f = wasm_f32x4_splat(a);
      if (even)
        for (; i > 0; i -= 4, sp += 4, dp += 4)
        {
          v128_t m = wasm_v128_load(sp);
          v128_t n = wasm_v128_load(sp + 1);
          v128_t p = wasm_v128_load(dp);
          p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
          wasm_v128_store(dp, p);
        }
      else
        for (; i > 0; i -= 4, sp += 4, dp += 4)
        {
          v128_t m = wasm_v128_load(sp);
          v128_t n = wasm_v128_load(sp - 1);
          v128_t p = wasm_v128_load(dp);
          p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
          wasm_v128_store(dp, p);
        }

      // swap the roles of the two buffers for the next step
      float* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }

    // scale one subband by K and the other by 1/K (wasm_multiply_const)
    float K = atk->get_K();
    float K_inv = 1.0f / K;
    // ...
  }
  else  // width == 1: a single sample goes into one of the subbands
  {
    if (even)
      ldst->f32[0] = src->f32[0];
    else
      hdst->f32[0] = src->f32[0] * 2.0f;
  }
}

// Irreversible horizontal synthesis: the inverse of wasm_irv_horz_ana.
void wasm_irv_horz_syn(const param_atk* atk, const line_buf* dst,
                       const line_buf* lsrc, const line_buf* hsrc,
                       ui32 width, bool even)
{
  if (width > 1)
  {
    float* oth = hsrc->f32, * aug = lsrc->f32;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass length
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass length

    // undo the K / 1/K scaling first (wasm_multiply_const)
    float K = atk->get_K();
    float K_inv = 1.0f / K;
    // ...

    // run the lifting steps in the opposite order to analysis
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = 0; j < num_steps; ++j)
    {
      // ... fetch lifting step j from atk; a is its coefficient

      // ... symmetric extension of the left boundary
      oth[oth_width] = oth[oth_width - 1];    // extension of the right boundary

      // lifting: aug[i] -= a * (oth[i] + neighbour)
      const float* sp = oth;
      float* dp = aug;
      int i = (int)aug_width;
      v128_t f = wasm_f32x4_splat(a);
      if (even)
        for ( ; i > 0; i -= 4, sp += 4, dp += 4)
        {
          v128_t m = wasm_v128_load(sp);
          v128_t n = wasm_v128_load(sp - 1);
          v128_t p = wasm_v128_load(dp);
          p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
          wasm_v128_store(dp, p);
        }
      else
        for ( ; i > 0; i -= 4, sp += 4, dp += 4)
        {
          v128_t m = wasm_v128_load(sp);
          v128_t n = wasm_v128_load(sp + 1);
          v128_t p = wasm_v128_load(dp);
          p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
          wasm_v128_store(dp, p);
        }

      // swap the roles of the two buffers for the next step
      float* t = aug; aug = oth; oth = t;
      even = !even;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // interleave the two subbands back into the output row (wasm_interleave32)
    float* dp = dst->f32;
    float* spl = even ? lsrc->f32 : hsrc->f32;
    float* sph = even ? hsrc->f32 : lsrc->f32;
    // ...
  }
  else  // width == 1
  {
    if (even)
      dst->f32[0] = lsrc->f32[0];
    else
      dst->f32[0] = hsrc->f32[0] * 0.5f;
  }
}
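
// Note how wasm_irv_horz_syn mirrors wasm_irv_horz_ana: it removes the K / 1/K
// scaling first, walks the lifting steps in the opposite order, subtracts each
// prediction/update where analysis added it, and finally re-interleaves the two
// subbands into the output line.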

// Reversible (integer) vertical lifting step on 32-bit samples:
//   analysis:  aug[i] += (b + a * (sig[i] + other[i])) >> e
//   synthesis: aug[i] -= (b + a * (sig[i] + other[i])) >> e
// Frequent (a, b, e) combinations are special-cased so that the multiplication
// (and sometimes the offset b) can be dropped.
void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig,
                          const line_buf* other, const line_buf* aug,
                          ui32 repeat, bool synthesis)
{
  // ... a, b and e are the integer parameters of lifting step s
  v128_t va = wasm_i32x4_splat(a);
  v128_t vb = wasm_i32x4_splat(b);

  si32* dst = aug->i32;
  const si32* src1 = sig->i32, * src2 = other->i32;

  if (a == 1)
  { // 5/3 update and any case with a == 1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t v = wasm_i32x4_add(vb, t);
        v128_t w = wasm_i32x4_shr(v, e);
        d = wasm_i32x4_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t v = wasm_i32x4_add(vb, t);
        v128_t w = wasm_i32x4_shr(v, e);
        d = wasm_i32x4_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
  else if (a == -1 && b == 1 && e == 1)
  { // 5/3 predict
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t w = wasm_i32x4_shr(t, e);
        d = wasm_i32x4_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t w = wasm_i32x4_shr(t, e);
        d = wasm_i32x4_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
  else if (a == -1)
  { // any other case with a == -1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t v = wasm_i32x4_sub(vb, t);
        v128_t w = wasm_i32x4_shr(v, e);
        d = wasm_i32x4_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t v = wasm_i32x4_sub(vb, t);
        v128_t w = wasm_i32x4_shr(v, e);
        d = wasm_i32x4_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
  else
  { // general case, keeping the multiplication by a
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t u = wasm_i32x4_mul(va, t);
        v128_t v = wasm_i32x4_add(vb, u);
        v128_t w = wasm_i32x4_shr(v, e);
        d = wasm_i32x4_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i32x4_add(s1, s2);
        v128_t u = wasm_i32x4_mul(va, t);
        v128_t v = wasm_i32x4_add(vb, u);
        v128_t w = wasm_i32x4_shr(v, e);
        d = wasm_i32x4_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
}
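
// Each specialization above computes the same scalar update,
//   aug[i] += (b + a * (sig[i] + other[i])) >> e    (analysis)
//   aug[i] -= (b + a * (sig[i] + other[i])) >> e    (synthesis)
// just with the multiplication removed when a is +1 or -1, and with the offset
// b absorbed by an equivalent rewrite in the 5/3 predict case.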

// 64-bit variant of the reversible vertical lifting step; identical structure,
// processing two si64 samples per vector.
void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig,
                          const line_buf* other, const line_buf* aug,
                          ui32 repeat, bool synthesis)
{
  // ... a, b and e are the integer parameters of lifting step s
  v128_t va = wasm_i64x2_splat(a);
  v128_t vb = wasm_i64x2_splat(b);

  si64* dst = aug->i64;
  const si64* src1 = sig->i64, * src2 = other->i64;

  if (a == 1)
  { // 5/3 update and any case with a == 1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t v = wasm_i64x2_add(vb, t);
        v128_t w = wasm_i64x2_shr(v, e);
        d = wasm_i64x2_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t v = wasm_i64x2_add(vb, t);
        v128_t w = wasm_i64x2_shr(v, e);
        d = wasm_i64x2_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
  else if (a == -1 && b == 1 && e == 1)
  { // 5/3 predict
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t w = wasm_i64x2_shr(t, e);
        d = wasm_i64x2_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t w = wasm_i64x2_shr(t, e);
        d = wasm_i64x2_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
  else if (a == -1)
  { // any other case with a == -1
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t v = wasm_i64x2_sub(vb, t);
        v128_t w = wasm_i64x2_shr(v, e);
        d = wasm_i64x2_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t v = wasm_i64x2_sub(vb, t);
        v128_t w = wasm_i64x2_shr(v, e);
        d = wasm_i64x2_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
  else
  { // general case, keeping the multiplication by a
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t u = wasm_i64x2_mul(va, t);
        v128_t v = wasm_i64x2_add(vb, u);
        v128_t w = wasm_i64x2_shr(v, e);
        d = wasm_i64x2_sub(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
    else
      for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
      {
        v128_t s1 = wasm_v128_load((v128_t*)src1);
        v128_t s2 = wasm_v128_load((v128_t*)src2);
        v128_t d = wasm_v128_load((v128_t*)dst);
        v128_t t = wasm_i64x2_add(s1, s2);
        v128_t u = wasm_i64x2_mul(va, t);
        v128_t v = wasm_i64x2_add(vb, u);
        v128_t w = wasm_i64x2_shr(v, e);
        d = wasm_i64x2_add(d, w);
        wasm_v128_store((v128_t*)dst, d);
      }
  }
}
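
// Note: wasm_i64x2_shr (like wasm_i32x4_shr above) is the arithmetic
// shift-right intrinsic, so negative intermediate values round the same way
// in the 32-bit and 64-bit paths.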

// Generic reversible vertical step: dispatches to the 32-bit or 64-bit
// implementation above.
void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
                        const line_buf* other, const line_buf* aug,
                        ui32 repeat, bool synthesis)
{
  // ... selects wasm_rev_vert_step32 or wasm_rev_vert_step64 according to the
  // sample width of the line buffers
}

// Reversible horizontal analysis on 32-bit samples: deinterleave one row, then
// apply the integer lifting steps in place, with the same (a, b, e) special
// cases as wasm_rev_vert_step32 and the neighbour chosen by the current parity.
static void wasm_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
                                const line_buf* hdst, const line_buf* src,
                                ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even and odd samples (wasm_deinterleave32 moves raw
    // 32-bit lanes, hence the f32 aliases of the integer buffers)
    float* dpl = even ? ldst->f32 : hdst->f32;
    float* dph = even ? hdst->f32 : ldst->f32;
    float* sp = src->f32;
    // ...

    // the actual horizontal transform
    si32* hp = hdst->i32, * lp = ldst->i32;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass length
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass length
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = num_steps; j > 0; --j)
    {
      // ... fetch lifting step j - 1 from atk; a, b and e are its parameters
      v128_t va = wasm_i32x4_splat(a);
      v128_t vb = wasm_i32x4_splat(b);

      // ... symmetric extension of the left boundary
      lp[l_width] = lp[l_width - 1];            // extension of the right boundary

      // lifting: hp[i] += (b + a * (lp[i] + neighbour)) >> e
      const si32* sp = lp;
      si32* dp = hp;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else
      { // general case, keeping the multiplication by a
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t u = wasm_i32x4_mul(va, t);
            v128_t v = wasm_i32x4_add(vb, u);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t u = wasm_i32x4_mul(va, t);
            v128_t v = wasm_i32x4_add(vb, u);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }

      // swap the roles of the two buffers for the next step
      si32* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }
  }
  else  // width == 1: a single sample goes into one of the subbands
  {
    if (even)
      ldst->i32[0] = src->i32[0];
    else
      hdst->i32[0] = src->i32[0] << 1;
  }
}
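
// In the horizontal routines the same update is applied in place between the
// two halves of the row:
//   hp[i] += (b + a * (lp[i] + lp[i +/- 1])) >> e
// where the left or right neighbour is picked by the current parity (even),
// and the roles of lp and hp swap after every lifting step.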

// 64-bit variant of the reversible horizontal analysis: doubles are moved for
// the deinterleave (raw 64-bit lanes) and si64 arithmetic is used for lifting.
static void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
                                const line_buf* hdst, const line_buf* src,
                                ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even and odd samples
    double* dpl = (double*)(even ? ldst->p : hdst->p);
    double* dph = (double*)(even ? hdst->p : ldst->p);
    double* sp = (double*)src->p;
    // ... wasm_deinterleave64 splits sp into dpl/dph

    // the actual horizontal transform
    si64* hp = hdst->i64, * lp = ldst->i64;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass length
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass length
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = num_steps; j > 0; --j)
    {
      // ... fetch lifting step j - 1 from atk; a, b and e are its parameters
      v128_t va = wasm_i64x2_splat(a);
      v128_t vb = wasm_i64x2_splat(b);

      // ... symmetric extension of the left boundary
      lp[l_width] = lp[l_width - 1];            // extension of the right boundary

      // lifting: hp[i] += (b + a * (lp[i] + neighbour)) >> e
      const si64* sp = lp;
      si64* dp = hp;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_add(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_add(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t w = wasm_i64x2_shr(t, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t w = wasm_i64x2_shr(t, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_sub(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_sub(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else
      { // general case, keeping the multiplication by a
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t u = wasm_i64x2_mul(va, t);
            v128_t v = wasm_i64x2_add(vb, u);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t u = wasm_i64x2_mul(va, t);
            v128_t v = wasm_i64x2_add(vb, u);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }

      // swap the roles of the two buffers for the next step
      si64* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }
  }
  else  // width == 1: a single sample goes into one of the subbands
  {
    if (even)
      ldst->i64[0] = src->i64[0];
    else
      hdst->i64[0] = src->i64[0] << 1;
  }
}

// Generic reversible horizontal analysis: dispatches to the 32-bit or 64-bit
// implementation above.
void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                       const line_buf* hdst, const line_buf* src,
                       ui32 width, bool even)
{
  // ... selects wasm_rev_horz_ana32 or wasm_rev_horz_ana64 according to the
  // sample width of the line buffers
}

// Reversible horizontal synthesis on 32-bit samples: run the lifting steps in
// the reverse direction, then interleave the two subbands into the output row.
void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
                         const line_buf* lsrc, const line_buf* hsrc,
                         ui32 width, bool even)
{
  if (width > 1)
  {
    si32* oth = hsrc->i32, * aug = lsrc->i32;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass length
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass length
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = 0; j < num_steps; ++j)
    {
      // ... fetch lifting step j from atk; a, b and e are its parameters
      v128_t va = wasm_i32x4_splat(a);
      v128_t vb = wasm_i32x4_splat(b);

      // ... symmetric extension of the left boundary
      oth[oth_width] = oth[oth_width - 1];      // extension of the right boundary

      // lifting: aug[i] -= (b + a * (oth[i] + neighbour)) >> e
      const si32* sp = oth;
      si32* dp = aug;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else
      { // general case, keeping the multiplication by a
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t u = wasm_i32x4_mul(va, t);
            v128_t v = wasm_i32x4_add(vb, u);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 4, sp += 4, dp += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t u = wasm_i32x4_mul(va, t);
            v128_t v = wasm_i32x4_add(vb, u);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }

      // swap the roles of the two buffers for the next step
      si32* t = aug; aug = oth; oth = t;
      even = !even;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // interleave the two subbands back into dst (wasm_interleave32 moves raw
    // 32-bit lanes, hence the f32 aliases of the integer buffers)
    float* dp = dst->f32;
    float* spl = even ? lsrc->f32 : hsrc->f32;
    float* sph = even ? hsrc->f32 : lsrc->f32;
    // ...
  }
  else  // width == 1
  {
    if (even)
      dst->i32[0] = lsrc->i32[0];
    else
      dst->i32[0] = hsrc->i32[0] >> 1;
  }
}
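
// wasm_rev_horz_syn32 undoes wasm_rev_horz_ana32: the lifting steps run in the
// opposite order (j counts up instead of down) and each update is applied with
// the opposite sign, after which the two subbands are re-interleaved.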

// 64-bit variant of the reversible horizontal synthesis.
void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
                         const line_buf* lsrc, const line_buf* hsrc,
                         ui32 width, bool even)
{
  if (width > 1)
  {
    si64* oth = hsrc->i64, * aug = lsrc->i64;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass length
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass length
    ui32 num_steps = atk->get_num_steps();
    for (ui32 j = 0; j < num_steps; ++j)
    {
      // ... fetch lifting step j from atk; a, b and e are its parameters
      v128_t va = wasm_i64x2_splat(a);
      v128_t vb = wasm_i64x2_splat(b);

      // ... symmetric extension of the left boundary
      oth[oth_width] = oth[oth_width - 1];      // extension of the right boundary

      // lifting: aug[i] -= (b + a * (oth[i] + neighbour)) >> e
      const si64* sp = oth;
      si64* dp = aug;
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_add(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_add(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t w = wasm_i64x2_shr(t, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t w = wasm_i64x2_shr(t, e);
            d = wasm_i64x2_add(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else if (a == -1)
      { // any other case with a == -1
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_sub(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t v = wasm_i64x2_sub(vb, t);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }
      else
      { // general case, keeping the multiplication by a
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t u = wasm_i64x2_mul(va, t);
            v128_t v = wasm_i64x2_add(vb, u);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
        else
          for (; i > 0; i -= 2, sp += 2, dp += 2)
          {
            v128_t s1 = wasm_v128_load((v128_t*)sp);
            v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
            v128_t d = wasm_v128_load((v128_t*)dp);
            v128_t t = wasm_i64x2_add(s1, s2);
            v128_t u = wasm_i64x2_mul(va, t);
            v128_t v = wasm_i64x2_add(vb, u);
            v128_t w = wasm_i64x2_shr(v, e);
            d = wasm_i64x2_sub(d, w);
            wasm_v128_store((v128_t*)dp, d);
          }
      }

      // swap the roles of the two buffers for the next step
      si64* t = aug; aug = oth; oth = t;
      even = !even;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // interleave the two subbands back into dst (wasm_interleave64 moves raw
    // 64-bit lanes, hence the double aliases of the integer buffers)
    double* dp = (double*)dst->p;
    double* spl = (double*)(even ? lsrc->p : hsrc->p);
    double* sph = (double*)(even ? hsrc->p : lsrc->p);
    // ...
  }
  else  // width == 1
  {
    if (even)
      dst->i64[0] = lsrc->i64[0];
    else
      dst->i64[0] = hsrc->i64[0] >> 1;
  }
}

// Generic reversible horizontal synthesis: dispatches to the 32-bit or 64-bit
// implementation above.
void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                       const line_buf* lsrc, const line_buf* hsrc,
                       ui32 width, bool even)
{
  // ... selects wasm_rev_horz_syn32 or wasm_rev_horz_syn64 according to the
  // sample width of the line buffers
}