45#include "../codestream/ojph_params_local.h"
61 const line_buf* aug,
ui32 repeat,
bool synthesis) = NULL;
80 const line_buf* aug,
ui32 repeat,
bool synthesis) = NULL;
105#if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
116 #ifndef OJPH_DISABLE_SIMD
118 #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
120 #ifndef OJPH_DISABLE_SSE
130 #ifndef OJPH_DISABLE_SSE2
139 #ifndef OJPH_DISABLE_AVX
149 #ifndef OJPH_DISABLE_AVX2
158 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
172 #elif defined(OJPH_ARCH_ARM)
194#if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
200 ui32 repeat,
bool synthesis)
207 const si32* src1 = sig->
i32, * src2 = other->
i32;
214 for (
ui32 i = repeat; i > 0; --i)
215 *dst++ -= (b + *src1++ + *src2++) >> e;
217 for (
ui32 i = repeat; i > 0; --i)
218 *dst++ += (b + *src1++ + *src2++) >> e;
220 else if (a == -1 && b == 1 && e == 1)
223 for (
ui32 i = repeat; i > 0; --i)
224 *dst++ += (*src1++ + *src2++) >> e;
226 for (
ui32 i = repeat; i > 0; --i)
227 *dst++ -= (*src1++ + *src2++) >> e;
232 for (
ui32 i = repeat; i > 0; --i)
233 *dst++ -= (b - (*src1++ + *src2++)) >> e;
235 for (
ui32 i = repeat; i > 0; --i)
236 *dst++ += (b - (*src1++ + *src2++)) >> e;
240 for (
ui32 i = repeat; i > 0; --i)
241 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
243 for (
ui32 i = repeat; i > 0; --i)
244 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
252 ui32 repeat,
bool synthesis)
259 const si64* src1 = sig->
i64, * src2 = other->
i64;
266 for (
ui32 i = repeat; i > 0; --i)
267 *dst++ -= (b + *src1++ + *src2++) >> e;
269 for (
ui32 i = repeat; i > 0; --i)
270 *dst++ += (b + *src1++ + *src2++) >> e;
272 else if (a == -1 && b == 1 && e == 1)
275 for (
ui32 i = repeat; i > 0; --i)
276 *dst++ += (*src1++ + *src2++) >> e;
278 for (
ui32 i = repeat; i > 0; --i)
279 *dst++ -= (*src1++ + *src2++) >> e;
284 for (
ui32 i = repeat; i > 0; --i)
285 *dst++ -= (b - (*src1++ + *src2++)) >> e;
287 for (
ui32 i = repeat; i > 0; --i)
288 *dst++ += (b - (*src1++ + *src2++)) >> e;
292 for (
ui32 i = repeat; i > 0; --i)
293 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
295 for (
ui32 i = repeat; i > 0; --i)
296 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
303 ui32 repeat,
bool synthesis)
327 ui32 width,
bool even)
340 for (; w > 1; w -= 2)
342 *dpl++ = *sp++; *dph++ = *sp++;
350 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
351 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
353 for (
ui32 j = num_steps; j > 0; --j)
363 lp[l_width] = lp[l_width - 1];
365 const si32* sp = lp + (even ? 1 : 0);
369 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
370 *dp += (b + (sp[-1] + sp[0])) >> e;
372 else if (a == -1 && b == 1 && e == 1)
374 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
375 *dp -= (sp[-1] + sp[0]) >> e;
379 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
380 *dp += (b - (sp[-1] + sp[0])) >> e;
384 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
385 *dp += (b + a * (sp[-1] + sp[0])) >> e;
389 si32* t = lp; lp = hp; hp = t;
391 ui32 w = l_width; l_width = h_width; h_width = w;
396 ldst->
i32[0] = src->
i32[0];
398 hdst->
i32[0] = src->
i32[0] << 1;
406 ui32 width,
bool even)
419 for (; w > 1; w -= 2)
421 *dpl++ = *sp++; *dph++ = *sp++;
429 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
430 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
432 for (
ui32 j = num_steps; j > 0; --j)
442 lp[l_width] = lp[l_width - 1];
444 const si64* sp = lp + (even ? 1 : 0);
448 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
449 *dp += (b + (sp[-1] + sp[0])) >> e;
451 else if (a == -1 && b == 1 && e == 1)
453 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
454 *dp -= (sp[-1] + sp[0]) >> e;
458 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
459 *dp += (b - (sp[-1] + sp[0])) >> e;
463 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
464 *dp += (b + a * (sp[-1] + sp[0])) >> e;
468 si64* t = lp; lp = hp; hp = t;
470 ui32 w = l_width; l_width = h_width; h_width = w;
475 ldst->
i64[0] = src->
i64[0];
477 hdst->
i64[0] = src->
i64[0] << 1;
484 ui32 width,
bool even)
505 ui32 width,
bool even)
511 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
512 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
514 for (
ui32 j = 0; j < num_steps; ++j)
523 oth[oth_width] = oth[oth_width - 1];
525 const si32* sp = oth + (ev ? 0 : 1);
529 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
530 *dp -= (b + (sp[-1] + sp[0])) >> e;
532 else if (a == -1 && b == 1 && e == 1)
534 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
535 *dp += (sp[-1] + sp[0]) >> e;
539 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
540 *dp -= (b - (sp[-1] + sp[0])) >> e;
544 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
545 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
549 si32* t = aug; aug = oth; oth = t;
551 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
563 for (; w > 1; w -= 2)
565 *dp++ = *spl++; *dp++ = *sph++;
574 dst->
i32[0] = lsrc->
i32[0];
576 dst->
i32[0] = hsrc->
i32[0] >> 1;
584 ui32 width,
bool even)
590 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
591 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
593 for (
ui32 j = 0; j < num_steps; ++j)
602 oth[oth_width] = oth[oth_width - 1];
604 const si64* sp = oth + (ev ? 0 : 1);
608 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
609 *dp -= (b + (sp[-1] + sp[0])) >> e;
611 else if (a == -1 && b == 1 && e == 1)
613 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
614 *dp += (sp[-1] + sp[0]) >> e;
618 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
619 *dp -= (b - (sp[-1] + sp[0])) >> e;
623 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
624 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
628 si64* t = aug; aug = oth; oth = t;
630 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
642 for (; w > 1; w -= 2)
644 *dp++ = *spl++; *dp++ = *sph++;
653 dst->
i64[0] = lsrc->
i64[0];
655 dst->
i64[0] = hsrc->
i64[0] >> 1;
662 ui32 width,
bool even)
682 ui32 repeat,
bool synthesis)
689 float* dst = aug->
f32;
690 const float* src1 = sig->
f32, * src2 = other->
f32;
691 for (
ui32 i = repeat; i > 0; --i)
692 *dst++ += a * (*src1++ + *src2++);
698 float* dst = aug->
f32;
699 for (
ui32 i = repeat; i > 0; --i)
706 ui32 width,
bool even)
711 float* dph = hdst->
f32;
712 float* dpl = ldst->
f32;
713 float* sp = src->
f32;
719 for (; w > 1; w -= 2)
721 *dpl++ = *sp++; *dph++ = *sp++;
728 float* hp = hdst->
f32, * lp = ldst->
f32;
729 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
730 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
732 for (
ui32 j = num_steps; j > 0; --j)
739 lp[l_width] = lp[l_width - 1];
741 const float* sp = lp + (even ? 1 : 0);
743 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
744 *dp += a * (sp[-1] + sp[0]);
747 float* t = lp; lp = hp; hp = t;
749 ui32 w = l_width; l_width = h_width; h_width = w;
753 float K = atk->
get_K();
754 float K_inv = 1.0f / K;
758 for (
ui32 i = l_width; i > 0; --i)
762 for (
ui32 i = h_width; i > 0; --i)
768 ldst->
f32[0] = src->
f32[0];
770 hdst->
f32[0] = src->
f32[0] * 2.0f;
777 ui32 width,
bool even)
782 float* oth = hsrc->
f32, * aug = lsrc->
f32;
783 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
784 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
787 float K = atk->
get_K();
788 float K_inv = 1.0f / K;
792 for (
ui32 i = aug_width; i > 0; --i)
796 for (
ui32 i = oth_width; i > 0; --i)
801 for (
ui32 j = 0; j < num_steps; ++j)
808 oth[oth_width] = oth[oth_width - 1];
810 const float* sp = oth + (ev ? 0 : 1);
812 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
813 *dp -= a * (sp[-1] + sp[0]);
816 float* t = aug; aug = oth; oth = t;
818 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
822 float* sph = hsrc->
f32;
823 float* spl = lsrc->
f32;
824 float* dp = dst->
f32;
827 { *dp++ = *sph++; --w; }
828 for (; w > 1; w -= 2)
829 { *dp++ = *spl++; *dp++ = *sph++; }
831 { *dp++ = *spl++; --w; }
835 dst->
f32[0] = lsrc->
f32[0];
837 dst->
f32[0] = hsrc->
f32[0] * 0.5f;
void(* rev_horz_ana)(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void gen_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void gen_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void gen_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void gen_rev_horz_syn32(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
static void gen_rev_vert_step64(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx512_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void gen_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void gen_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void gen_rev_vert_step32(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
static void gen_rev_horz_ana64(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void(* irv_vert_times_K)(float K, const line_buf *aug, ui32 repeat)
static bool wavelet_transform_functions_initialized
void gen_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void(* irv_vert_step)(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void avx2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void init_wavelet_transform_functions()
void wasm_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
static void gen_rev_horz_syn64(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx512_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void sse_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void sse_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void wasm_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void sse_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx512_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void avx512_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void(* rev_horz_syn)(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void wasm_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void(* irv_horz_ana)(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void(* rev_vert_step)(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void gen_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void(* irv_horz_syn)(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void wasm_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void gen_rev_horz_ana32(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
@ X86_CPU_EXT_LEVEL_AVX512
OJPH_EXPORT int get_cpu_ext_level()
ui32 get_num_steps() const
const lifting_step * get_step(ui32 s) const