// Assumed context for this section: the usual OpenJPH headers (ojph_defs.h,
// ojph_mem.h, ...) are included above and provide ui32/si32/si64/ui64 and
// line_buf; the intrinsics and limit macros used here need:
#include <emmintrin.h>
#include <cassert>
#include <climits>
#include <cstdint>

void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp,
                                    float mul, ui32 width)
{
  // _mm_cvtps_epi32 rounds according to MXCSR; force round-to-nearest and
  // restore the caller's mode on exit
  uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
  __m128 shift = _mm_set1_ps(0.5f);
  __m128 m = _mm_set1_ps(mul);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128 t = _mm_loadu_ps(sp);
    __m128 s = _mm_add_ps(t, shift);
    s = _mm_mul_ps(s, m);            // scale after the +0.5 level shift
    _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
  }
  _MM_SET_ROUNDING_MODE(rounding_mode);
}
void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp,
                              float mul, ui32 width)
{
  // same as above, but without the +0.5 level shift
  uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
  __m128 m = _mm_set1_ps(mul);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128 t = _mm_loadu_ps(sp);
    __m128 s = _mm_mul_ps(t, m);
    _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
  }
  _MM_SET_ROUNDING_MODE(rounding_mode);
}
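// Illustration (not part of the original file): with the rounding mode
// pinned to round-to-nearest, _mm_cvtps_epi32 behaves like lrintf() under
// default FP settings, so per sample the two loops above reduce to these
// scalar models.
#include <cmath>   // lrintf, used only by the illustrative models below
static inline si32 model_cnvrt_float_to_si32_shftd(float v, float mul)
{ return (si32)lrintf((v + 0.5f) * mul); }
static inline si32 model_cnvrt_float_to_si32(float v, float mul)
{ return (si32)lrintf(v * mul); }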
// bitwise select: returns a where x >= y, else b
static __m128i ojph_mm_max_ge_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
{
  __m128 ct = _mm_cmpge_ps(x, y);
  __m128i c = _mm_castps_si128(ct);    // 0xFFFFFFFF where x >= y
  __m128i d = _mm_and_si128(c, a);     // a where predicate holds
  __m128i e = _mm_andnot_si128(c, b);  // b elsewhere
  return _mm_or_si128(d, e);
}
// bitwise select: returns a where x < y, else b
static __m128i ojph_mm_min_lt_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
{
  __m128 ct = _mm_cmplt_ps(x, y);
  __m128i c = _mm_castps_si128(ct);
  __m128i d = _mm_and_si128(c, a);
  __m128i e = _mm_andnot_si128(c, b);
  return _mm_or_si128(d, e);
}
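// Illustration (not part of the original file): the two helpers above are
// the classic SSE2 bitwise-select idiom (and/andnot/or), since _mm_blendv
// only arrives with SSE4.1. Per 32-bit lane they compute:
static inline si32 model_max_ge(si32 a, si32 b, float x, float y)
{ return (x >= y) ? a : b; }   // one lane of ojph_mm_max_ge_epi32
static inline si32 model_min_lt(si32 a, si32 b, float x, float y)
{ return (x < y) ? a : b; }    // one lane of ojph_mm_min_lt_epi32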
template <bool NLT_TYPE3>
static void local_sse2_irv_convert_to_integer(const line_buf *src_line,
                                              line_buf *dst_line,
                                              ui32 dst_line_offset,
                                              ui32 bit_depth,
                                              bool is_signed, ui32 width)
{
  assert(bit_depth <= 32);
  uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

  const float* sp = src_line->f32;
  si32* dp = dst_line->i32 + dst_line_offset;

  // Scaling by 2^bit_depth can exceed the 32-bit integer range, so floats
  // outside the half-open interval [-0.5, 0.5) are clamped to the
  // bit_depth limits.
  si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
  __m128 mul = _mm_set1_ps((float)(1ull << bit_depth));
  __m128 fl_up_lim = _mm_set1_ps(-(float)neg_limit);  // val < up_lim
  __m128 fl_low_lim = _mm_set1_ps((float)neg_limit);  // val >= low_lim
  __m128i s32_up_lim = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
  __m128i s32_low_lim = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));

  if (is_signed)
  {
    __m128i zero = _mm_setzero_si128();
    __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128 t = _mm_loadu_ps(sp);
      t = _mm_mul_ps(t, mul);
      __m128i u = _mm_cvtps_epi32(t);
      u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);  // clamp low
      u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);    // clamp high
      if (NLT_TYPE3)
      {
        __m128i c = _mm_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve values
        __m128i neg = _mm_sub_epi32(bias, u); // bias - value
        neg = _mm_and_si128(c, neg);          // keep bias - value for -ve
        u = _mm_andnot_si128(c, u);           // keep +ve or 0 unchanged
        u = _mm_or_si128(neg, u);             // combine
      }
      _mm_storeu_si128((__m128i*)dp, u);
    }
  }
  else
  {
    __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128 t = _mm_loadu_ps(sp);
      t = _mm_mul_ps(t, mul);
      __m128i u = _mm_cvtps_epi32(t);
      u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);  // clamp low
      u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);    // clamp high
      u = _mm_add_epi32(u, half);  // unsigned output: add the half offset
      _mm_storeu_si128((__m128i*)dp, u);
    }
  }

  _MM_SET_ROUNDING_MODE(rounding_mode);
}
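// Illustration (my reading, not in the original): the NLT_TYPE3 block above
// maps each negative sample u to bias - u with bias = -(2^(bit_depth-1) + 1)
// and leaves non-negative samples untouched; one lane in scalar form:
static inline si32 model_nlt_type3_fwd(si32 u, ui32 bit_depth)
{
  si32 bias = -(si32)((1ULL << (bit_depth - 1)) + 1);
  return (u < 0) ? (si32)(bias - u) : u;
}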
void sse2_irv_convert_to_integer(const line_buf *src_line,
                                 line_buf *dst_line, ui32 dst_line_offset,
                                 ui32 bit_depth, bool is_signed, ui32 width)
{
  local_sse2_irv_convert_to_integer<false>(src_line, dst_line,
    dst_line_offset, bit_depth, is_signed, width);
}
void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
                                           line_buf *dst_line,
                                           ui32 dst_line_offset,
                                           ui32 bit_depth,
                                           bool is_signed, ui32 width)
{
  local_sse2_irv_convert_to_integer<true>(src_line, dst_line,
    dst_line_offset, bit_depth, is_signed, width);
}
// arithmetic right shift for 64-bit lanes; m must be 1ull << (63 - amt)
static __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
{
  __m128i x = _mm_srli_epi64(a, amt);    // logical shift
  x = _mm_xor_si128(x, m);               // flip the relocated sign bit
  __m128i result = _mm_sub_epi64(x, m);  // subtract to sign-extend
  return result;
}
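// Illustration (not part of the original file): SSE2 has no 64-bit
// arithmetic right shift, so the helper above synthesizes one from the
// logical shift via the identity
//   x >> n == ((x >>> n) ^ m) - m,  with m = 1ull << (63 - n),
// where the xor/subtract pair sign-extends the sign bit that the logical
// shift parked at position 63 - n. Scalar model of the same identity:
static inline si64 model_srai64(si64 x, int n)
{
  ui64 m = 1ULL << (63 - n);             // caller-supplied constant (cf. v2)
  return (si64)((((ui64)x >> n) ^ m) - m);
}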
// sign-extend the two low 32-bit lanes of a to 64 bits
static __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
{
  __m128i t = _mm_cmplt_epi32(a, zero);  // all-ones where a < 0
  t = _mm_unpacklo_epi32(a, t);          // interleave value with its sign
  return t;
}
// sign-extend the two high 32-bit lanes of a to 64 bits
static __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
{
  __m128i t = _mm_cmplt_epi32(a, zero);
  t = _mm_unpackhi_epi32(a, t);
  return t;
}
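// Illustration (not part of the original file): the cvtlo/cvthi pair
// sign-extends 32-bit lanes to 64 bits by interleaving each value with its
// _mm_cmplt_epi32 mask (all-ones when negative). One output lane equals:
static inline si64 model_cvt_epi32_epi64(si32 v)
{
  ui32 hi = (v < 0) ? 0xFFFFFFFFu : 0u;       // the comparison mask lane
  return (si64)(((ui64)hi << 32) | (ui32)v);  // mask:value interleave
}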
void sse2_rev_convert(const line_buf *src_line,
                      const ui32 src_line_offset,
                      line_buf *dst_line,
                      const ui32 dst_line_offset,
                      si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)
  {
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      // 32-bit source to 32-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      __m128i sh = _mm_set1_epi32((si32)shift);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128i s = _mm_loadu_si128((__m128i*)sp);
        s = _mm_add_epi32(s, sh);
        _mm_storeu_si128((__m128i*)dp, s);
      }
    }
    else
    {
      // 32-bit source widened to a 64-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      __m128i zero = _mm_setzero_si128();
      __m128i sh = _mm_set1_epi64x(shift);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128i s, t;
        s = _mm_loadu_si128((__m128i*)sp);

        t = sse2_cvtlo_epi32_epi64(s, zero);
        t = _mm_add_epi64(t, sh);
        _mm_storeu_si128((__m128i*)dp, t);

        t = sse2_cvthi_epi32_epi64(s, zero);
        t = _mm_add_epi64(t, sh);
        _mm_storeu_si128((__m128i*)dp + 1, t);
      }
    }
  }
  else
  {
    // 64-bit source narrowed to a 32-bit destination
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
    __m128i sh = _mm_set1_epi64x(shift);
    for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
    {
      __m128i s, t;
      s = _mm_loadu_si128((__m128i*)sp);
      s = _mm_add_epi64(s, sh);
      // low 32 bits of lanes 0 and 1 into words 0 and 1
      t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
      t = _mm_and_si128(low_bits, t);

      s = _mm_loadu_si128((__m128i*)sp + 1);
      s = _mm_add_epi64(s, sh);
      // low 32 bits of lanes 0 and 1 into words 2 and 3
      s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
      s = _mm_andnot_si128(low_bits, s);

      t = _mm_or_si128(s, t);
      _mm_storeu_si128((__m128i*)dp, t);
    }
  }
}
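// Illustration (my reading): per sample, all three branches of
// sse2_rev_convert just add the shift at the destination precision; the
// shuffle/mask dance in the 64-to-32 path only repacks lanes, i.e.
static inline si32 model_rev_convert_narrow(si64 v, si64 shift)
{ return (si32)(v + shift); }   // add shift, then truncate to 32 bits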
void sse2_rev_convert_nlt_type3(const line_buf *src_line,
                                const ui32 src_line_offset,
                                line_buf *dst_line,
                                const ui32 dst_line_offset,
                                si64 shift, ui32 width)
{
  if (src_line->flags & line_buf::LFT_32BIT)
  {
    if (dst_line->flags & line_buf::LFT_32BIT)
    {
      // 32-bit source to 32-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si32 *dp = dst_line->i32 + dst_line_offset;
      __m128i sh = _mm_set1_epi32((si32)(-shift));
      __m128i zero = _mm_setzero_si128();
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128i s = _mm_loadu_si128((__m128i*)sp);
        __m128i c = _mm_cmplt_epi32(s, zero);   // mask for -ve samples
        __m128i v_m_sh = _mm_sub_epi32(sh, s);  // - shift - value
        v_m_sh = _mm_and_si128(c, v_m_sh);      // keep for -ve samples
        s = _mm_andnot_si128(c, s);             // keep +ve or 0
        s = _mm_or_si128(s, v_m_sh);            // combine
        _mm_storeu_si128((__m128i*)dp, s);
      }
    }
    else
    {
      // 32-bit source widened to a 64-bit destination
      const si32 *sp = src_line->i32 + src_line_offset;
      si64 *dp = dst_line->i64 + dst_line_offset;
      __m128i sh = _mm_set1_epi64x(-shift);
      __m128i zero = _mm_setzero_si128();
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128i s, t, u, c, v_m_sh;
        s = _mm_loadu_si128((__m128i*)sp);

        t = _mm_cmplt_epi32(s, zero);       // sign masks, 32-bit lanes
        u = _mm_unpacklo_epi32(s, t);       // sign-extend lanes 0, 1
        c = _mm_unpacklo_epi32(t, t);       // widen masks to 64 bits

        v_m_sh = _mm_sub_epi64(sh, u);      // - shift - value
        v_m_sh = _mm_and_si128(c, v_m_sh);  // keep for -ve samples
        u = _mm_andnot_si128(c, u);         // keep +ve or 0
        u = _mm_or_si128(u, v_m_sh);        // combine

        _mm_storeu_si128((__m128i*)dp, u);

        u = _mm_unpackhi_epi32(s, t);       // sign-extend lanes 2, 3
        c = _mm_unpackhi_epi32(t, t);

        v_m_sh = _mm_sub_epi64(sh, u);
        v_m_sh = _mm_and_si128(c, v_m_sh);
        u = _mm_andnot_si128(c, u);
        u = _mm_or_si128(u, v_m_sh);

        _mm_storeu_si128((__m128i*)dp + 1, u);
      }
    }
  }
  else
  {
    // 64-bit source narrowed to a 32-bit destination
    const si64 *sp = src_line->i64 + src_line_offset;
    si32 *dp = dst_line->i32 + dst_line_offset;
    __m128i sh = _mm_set1_epi64x(-shift);
    __m128i zero = _mm_setzero_si128();
    __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
    for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
    {
      __m128i s, t, p, n, m, tm;
      s = _mm_loadu_si128((__m128i*)sp);

      tm = _mm_cmplt_epi32(s, zero);
      m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // 64-bit sign masks
      tm = _mm_sub_epi64(sh, s);           // - shift - value
      n = _mm_and_si128(m, tm);            // keep for -ve samples
      p = _mm_andnot_si128(m, s);          // keep +ve or 0
      tm = _mm_or_si128(n, p);             // combine
      tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
      t = _mm_and_si128(half_mask, tm);    // low words into words 0, 1

      s = _mm_loadu_si128((__m128i*)sp + 1);
      tm = _mm_cmplt_epi32(s, zero);
      m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1));
      tm = _mm_sub_epi64(sh, s);
      n = _mm_and_si128(m, tm);
      p = _mm_andnot_si128(m, s);
      tm = _mm_or_si128(n, p);
      tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
      tm = _mm_andnot_si128(half_mask, tm); // low words into words 2, 3

      t = _mm_or_si128(t, tm);
      _mm_storeu_si128((__m128i*)dp, t);
    }
  }
}
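// Illustration (my reading): ignoring the lane repacking, every branch of
// sse2_rev_convert_nlt_type3 applies the same inverse NLT type-3 mapping:
static inline si64 model_nlt_type3_rev(si64 v, si64 shift)
{ return (v >= 0) ? v : (-shift - v); }   // negatives become -shift - v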
template<bool NLT_TYPE3>
static void local_sse2_irv_convert_to_float(const line_buf *src_line,
                                            ui32 src_line_offset,
                                            line_buf *dst_line,
                                            ui32 bit_depth,
                                            bool is_signed, ui32 width)
{
  assert(bit_depth <= 32);
  __m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));

  const si32* sp = src_line->i32 + src_line_offset;
  float* dp = dst_line->f32;

  if (is_signed)
  {
    __m128i zero = _mm_setzero_si128();
    __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128i t = _mm_loadu_si128((__m128i*)sp);
      if (NLT_TYPE3)
      {
        __m128i c = _mm_cmplt_epi32(t, zero); // 0xFFFFFFFF for -ve values
        __m128i neg = _mm_sub_epi32(bias, t); // bias - value
        neg = _mm_and_si128(c, neg);          // keep bias - value for -ve
        c = _mm_andnot_si128(c, t);           // keep +ve or 0 unchanged
        t = _mm_or_si128(neg, c);             // combine
      }
      __m128 v = _mm_cvtepi32_ps(t);
      v = _mm_mul_ps(v, mul);
      _mm_storeu_ps(dp, v);
    }
  }
  else
  {
    __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128i t = _mm_loadu_si128((__m128i*)sp);
      t = _mm_sub_epi32(t, half);   // remove the unsigned half offset
      __m128 v = _mm_cvtepi32_ps(t);
      v = _mm_mul_ps(v, mul);
      _mm_storeu_ps(dp, v);
    }
  }
}
void sse2_irv_convert_to_float(const line_buf *src_line,
                               ui32 src_line_offset, line_buf *dst_line,
                               ui32 bit_depth, bool is_signed, ui32 width)
{
  local_sse2_irv_convert_to_float<false>(src_line, src_line_offset,
    dst_line, bit_depth, is_signed, width);
}
void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
                                         ui32 src_line_offset,
                                         line_buf *dst_line,
                                         ui32 bit_depth,
                                         bool is_signed, ui32 width)
{
  local_sse2_irv_convert_to_float<true>(src_line, src_line_offset,
    dst_line, bit_depth, is_signed, width);
}
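// Illustration (my reading, not in the original): scalar model of the
// signed integer-to-float path above; the optional NLT type-3 remap is
// applied before scaling the sample into [-0.5, 0.5).
static inline float model_irv_to_float_signed(si32 v, ui32 bit_depth,
                                              bool nlt_type3)
{
  if (nlt_type3 && v < 0)
    v = (si32)(-(si32)((1ULL << (bit_depth - 1)) + 1) - v);  // bias - v
  return (float)v * (float)(1.0 / (double)(1ULL << bit_depth));
}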
void sse2_rct_forward(const line_buf *r, const line_buf *g,
                      const line_buf *b, line_buf *y, line_buf *cb,
                      line_buf *cr, ui32 repeat)
{
  if (y->flags & line_buf::LFT_32BIT)
  {
    const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      __m128i mr = _mm_load_si128((__m128i*)rp);
      __m128i mg = _mm_load_si128((__m128i*)gp);
      __m128i mb = _mm_load_si128((__m128i*)bp);
      __m128i t = _mm_add_epi32(mr, mb);
      t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));          // r + 2g + b
      _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2));  // y
      t = _mm_sub_epi32(mb, mg);
      _mm_store_si128((__m128i*)cbp, t);                    // cb = b - g
      t = _mm_sub_epi32(mr, mg);
      _mm_store_si128((__m128i*)crp, t);                    // cr = r - g

      rp += 4; gp += 4; bp += 4;
      yp += 4; cbp += 4; crp += 4;
    }
  }
  else
  {
    // 32-bit colour samples, 64-bit transform output
    __m128i zero = _mm_setzero_si128();
    __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); // for sse2_mm_srai_epi64
    const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      __m128i mr32 = _mm_load_si128((__m128i*)rp);
      __m128i mg32 = _mm_load_si128((__m128i*)gp);
      __m128i mb32 = _mm_load_si128((__m128i*)bp);
      __m128i mr, mg, mb, t;

      // low two samples
      mr = sse2_cvtlo_epi32_epi64(mr32, zero);
      mg = sse2_cvtlo_epi32_epi64(mg32, zero);
      mb = sse2_cvtlo_epi32_epi64(mb32, zero);

      t = _mm_add_epi64(mr, mb);
      t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
      _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
      t = _mm_sub_epi64(mb, mg);
      _mm_store_si128((__m128i*)cbp, t);
      t = _mm_sub_epi64(mr, mg);
      _mm_store_si128((__m128i*)crp, t);

      yp += 2; cbp += 2; crp += 2;

      // high two samples
      mr = sse2_cvthi_epi32_epi64(mr32, zero);
      mg = sse2_cvthi_epi32_epi64(mg32, zero);
      mb = sse2_cvthi_epi32_epi64(mb32, zero);

      t = _mm_add_epi64(mr, mb);
      t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
      _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
      t = _mm_sub_epi64(mb, mg);
      _mm_store_si128((__m128i*)cbp, t);
      t = _mm_sub_epi64(mr, mg);
      _mm_store_si128((__m128i*)crp, t);

      rp += 4; gp += 4; bp += 4;
      yp += 2; cbp += 2; crp += 2;
    }
  }
}
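// Illustration: both branches above implement the JPEG 2000 reversible
// colour transform (RCT); per pixel, in scalar form:
static inline void model_rct_forward(si32 r, si32 g, si32 b,
                                     si32 &y, si32 &cb, si32 &cr)
{
  y  = (r + 2 * g + b) >> 2;   // t = r + b + (g << 1), then arithmetic >> 2
  cb = b - g;
  cr = r - g;
}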
void sse2_rct_backward(const line_buf *y, const line_buf *cb,
                       const line_buf *cr, line_buf *r, line_buf *g,
                       line_buf *b, ui32 repeat)
{
  if (y->flags & line_buf::LFT_32BIT)
  {
    const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
    si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      __m128i my = _mm_load_si128((__m128i*)yp);
      __m128i mcb = _mm_load_si128((__m128i*)cbp);
      __m128i mcr = _mm_load_si128((__m128i*)crp);

      __m128i t = _mm_add_epi32(mcb, mcr);
      t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); // g = y - ((cb+cr)>>2)
      _mm_store_si128((__m128i*)gp, t);
      __m128i u = _mm_add_epi32(mcb, t);           // b = cb + g
      _mm_store_si128((__m128i*)bp, u);
      u = _mm_add_epi32(mcr, t);                   // r = cr + g
      _mm_store_si128((__m128i*)rp, u);

      yp += 4; cbp += 4; crp += 4;
      rp += 4; gp += 4; bp += 4;
    }
  }
  else
  {
    // 64-bit transform samples, 32-bit colour output
    __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); // for sse2_mm_srai_epi64
    __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
    const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
    si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
    for (int i = (repeat + 3) >> 2; i > 0; --i)
    {
      __m128i my, mcb, mcr, tr, tg, tb;
      my = _mm_load_si128((__m128i*)yp);
      mcb = _mm_load_si128((__m128i*)cbp);
      mcr = _mm_load_si128((__m128i*)crp);

      tg = _mm_add_epi64(mcb, mcr);
      tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
      tb = _mm_add_epi64(mcb, tg);
      tr = _mm_add_epi64(mcr, tg);

      // pack the low 32 bits of the first two results into words 0, 1
      __m128i mr, mg, mb;
      mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
      mr = _mm_and_si128(low_bits, mr);
      mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
      mg = _mm_and_si128(low_bits, mg);
      mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
      mb = _mm_and_si128(low_bits, mb);

      yp += 2; cbp += 2; crp += 2;

      my = _mm_load_si128((__m128i*)yp);
      mcb = _mm_load_si128((__m128i*)cbp);
      mcr = _mm_load_si128((__m128i*)crp);

      tg = _mm_add_epi64(mcb, mcr);
      tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
      tb = _mm_add_epi64(mcb, tg);
      tr = _mm_add_epi64(mcr, tg);

      // pack the next two results into words 2, 3 and merge
      tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
      tr = _mm_andnot_si128(low_bits, tr);
      mr = _mm_or_si128(mr, tr);
      tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
      tg = _mm_andnot_si128(low_bits, tg);
      mg = _mm_or_si128(mg, tg);
      tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
      tb = _mm_andnot_si128(low_bits, tb);
      mb = _mm_or_si128(mb, tb);

      _mm_store_si128((__m128i*)rp, mr);
      _mm_store_si128((__m128i*)gp, mg);
      _mm_store_si128((__m128i*)bp, mb);

      yp += 2; cbp += 2; crp += 2;
      rp += 4; gp += 4; bp += 4;
    }
  }
}
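// Illustration: scalar form of the inverse RCT computed above; it exactly
// undoes model_rct_forward for integer inputs.
static inline void model_rct_backward(si32 y, si32 cb, si32 cr,
                                      si32 &r, si32 &g, si32 &b)
{
  g = y - ((cb + cr) >> 2);
  b = cb + g;
  r = cr + g;
}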