OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_colour_sse2.cpp
//***************************************************************************/
// This software is released under the 2-Clause BSD license, included
// below.
//
// Copyright (c) 2019, Aous Naman
// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
// Copyright (c) 2019, The University of New South Wales, Australia
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//***************************************************************************/
// This file is part of the OpenJPH software implementation.
// File: ojph_colour_sse2.cpp
// Author: Aous Naman
// Date: 11 October 2019
//***************************************************************************/

#include <climits>
#include <cmath>

#include "ojph_defs.h"
#include "ojph_arch.h"
#include "ojph_mem.h"
#include "ojph_colour.h"

#include <emmintrin.h>

namespace ojph {
  namespace local {

    void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                        ui32 width)
    {
      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
      __m128 shift = _mm_set1_ps(0.5f);
      __m128 m = _mm_set1_ps(mul);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128 t = _mm_loadu_ps(sp);
        __m128 s = _mm_add_ps(t, shift);
        s = _mm_mul_ps(s, m);
        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
      }
      _MM_SET_ROUNDING_MODE(rounding_mode);
    }
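
    // For reference, each lane above computes roughly the scalar expression
    //   dp[i] = round_to_nearest((sp[i] + 0.5f) * mul)
    // where rounding is controlled by the MXCSR mode set at function entry.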

    void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                                  ui32 width)
    {
      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
      __m128 m = _mm_set1_ps(mul);
      for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
      {
        __m128 t = _mm_loadu_ps(sp);
        __m128 s = _mm_mul_ps(t, m);
        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
      }
      _MM_SET_ROUNDING_MODE(rounding_mode);
    }

    static inline
    __m128i ojph_mm_max_ge_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
    {
      __m128 ct = _mm_cmpge_ps(x, y);     // 0xFFFFFFFF for x >= y
      __m128i c = _mm_castps_si128(ct);   // does not generate any code
      __m128i d = _mm_and_si128(c, a);    // keep only a, where x >= y
      __m128i e = _mm_andnot_si128(c, b); // keep only b, where x < y
      return _mm_or_si128(d, e);          // combine
    }

    static inline
    __m128i ojph_mm_min_lt_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
    {
      __m128 ct = _mm_cmplt_ps(x, y);     // 0xFFFFFFFF for x < y
      __m128i c = _mm_castps_si128(ct);   // does not generate any code
      __m128i d = _mm_and_si128(c, a);    // keep only a, where x < y
      __m128i e = _mm_andnot_si128(c, b); // keep only b, where x >= y
      return _mm_or_si128(d, e);          // combine
    }
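
    // Both helpers are branchless selects: per 32-bit lane,
    // ojph_mm_max_ge_epi32 returns (x >= y) ? a : b, and
    // ojph_mm_min_lt_epi32 returns (x < y) ? a : b. The float comparison
    // yields an all-ones or all-zeros mask that is reused, bit for bit,
    // on the integer operands.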

    template <bool NLT_TYPE3>
    static inline
    void local_sse2_irv_convert_to_integer(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER));

      assert(bit_depth <= 32);
      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

      const float* sp = src_line->f32;
      si32* dp = dst_line->i32 + dst_line_offset;
      // There is the possibility that converting to integer will
      // exceed the dynamic range of a 32-bit integer; therefore, care must
      // be exercised.
      // We check whether the floating-point number is outside the
      // half-closed interval [-0.5f, 0.5f). If so, we limit the resulting
      // integer to the maximum/minimum that bit_depth supports.
      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
      __m128 mul = _mm_set1_ps((float)(1ull << bit_depth));
      __m128 fl_up_lim = _mm_set1_ps(-(float)neg_limit); // val < upper
      __m128 fl_low_lim = _mm_set1_ps((float)neg_limit); // val >= lower
      __m128i s32_up_lim = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
      __m128i s32_low_lim = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));

      if (is_signed)
      {
        __m128i zero = _mm_setzero_si128();
        __m128i bias =
          _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128 t = _mm_loadu_ps(sp);
          t = _mm_mul_ps(t, mul);
          __m128i u = _mm_cvtps_epi32(t);
          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          if (NLT_TYPE3)
          {
            __m128i c = _mm_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve values
            __m128i neg = _mm_sub_epi32(bias, u); // -bias - value
            neg = _mm_and_si128(c, neg);          // keep only -bias - value
            u = _mm_andnot_si128(c, u);           // keep only +ve or 0
            u = _mm_or_si128(neg, u);             // combine
          }
          _mm_storeu_si128((__m128i*)dp, u);
        }
      }
      else
      {
        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128 t = _mm_loadu_ps(sp);
          t = _mm_mul_ps(t, mul);
          __m128i u = _mm_cvtps_epi32(t);
          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
          u = _mm_add_epi32(u, half);
          _mm_storeu_si128((__m128i*)dp, u);
        }
      }

      _MM_SET_ROUNDING_MODE(rounding_mode);
    }
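
    // Per sample, the loop above behaves roughly like this scalar sketch
    // (illustrative only; the clamp limits depend on bit_depth):
    //   si32 v = round_to_nearest(sp[i] * 2^bit_depth);
    //   v = clamp(v, INT_MIN >> (32 - bit_depth), INT_MAX >> (32 - bit_depth));
    //   if (is_signed) {
    //     if (NLT_TYPE3 && v < 0) v = bias - v; // bias = -(2^(bit_depth-1) + 1)
    //   } else {
    //     v += 2^(bit_depth - 1);
    //   }
    //   dp[i] = v;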

    void sse2_irv_convert_to_integer(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_integer<false>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }

    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
      line_buf *dst_line, ui32 dst_line_offset,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_integer<true>(src_line, dst_line,
        dst_line_offset, bit_depth, is_signed, width);
    }
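
    // The two entry points above differ only in the NLT_TYPE3 template
    // argument: the plain variant performs the usual float-to-integer
    // conversion, while the _nlt_type3 variant additionally applies the
    // type-3 non-linear mapping to negative samples.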

    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
    {
      // note that m must be obtained using
      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
      __m128i x = _mm_srli_epi64(a, amt);
      x = _mm_xor_si128(x, m);
      __m128i result = _mm_sub_epi64(x, m);
      return result;
    }
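
    // SSE2 has no 64-bit arithmetic right shift, so it is emulated here:
    // after the logical shift, m marks the position the sign bit moved to,
    // and (x ^ m) - m sign-extends the result. For example, with amt = 2,
    // the 64-bit value -8 becomes -2, matching an arithmetic shift.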

    static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
    {
      __m128i t;
      t = _mm_cmplt_epi32(a, zero); // get -ve
      t = _mm_unpacklo_epi32(a, t);
      return t;
    }

    static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
    {
      __m128i t;
      t = _mm_cmplt_epi32(a, zero); // get -ve
      t = _mm_unpackhi_epi32(a, t);
      return t;
    }
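
    // These two helpers sign-extend packed 32-bit integers to 64 bits
    // (the SSE4.1 _mm_cvtepi32_epi64 is not available in SSE2): the
    // comparison produces 0 or -1 per lane, which becomes the high half
    // when interleaved with the low/high pair of 32-bit values.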

    void sse2_rev_convert(const line_buf *src_line,
                          const ui32 src_line_offset,
                          line_buf *dst_line,
                          const ui32 dst_line_offset,
                          si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m128i sh = _mm_set1_epi32((si32)shift);
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s = _mm_loadu_si128((__m128i*)sp);
            s = _mm_add_epi32(s, sh);
            _mm_storeu_si128((__m128i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m128i zero = _mm_setzero_si128();
          __m128i sh = _mm_set1_epi64x(shift);
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s, t;
            s = _mm_loadu_si128((__m128i*)sp);

            t = sse2_cvtlo_epi32_epi64(s, zero);
            t = _mm_add_epi64(t, sh);
            _mm_storeu_si128((__m128i*)dp, t);

            t = sse2_cvthi_epi32_epi64(s, zero);
            t = _mm_add_epi64(t, sh);
            _mm_storeu_si128((__m128i*)dp + 1, t);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
        __m128i sh = _mm_set1_epi64x(shift);
        for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
        {
          __m128i s, t;
          s = _mm_loadu_si128((__m128i*)sp);
          s = _mm_add_epi64(s, sh);

          t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm_and_si128(low_bits, t);

          s = _mm_loadu_si128((__m128i*)sp + 1);
          s = _mm_add_epi64(s, sh);

          s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
          s = _mm_andnot_si128(low_bits, s);

          t = _mm_or_si128(s, t);
          _mm_storeu_si128((__m128i*)dp, t);
        }
      }
    }
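
    // sse2_rev_convert simply adds the offset "shift" to every sample,
    // widening or narrowing between 32-bit and 64-bit line buffers as
    // dictated by the line_buf flags; per sample it is just
    //   dp[i] = sp[i] + shift;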

    void sse2_rev_convert_nlt_type3(const line_buf *src_line,
                                    const ui32 src_line_offset,
                                    line_buf *dst_line,
                                    const ui32 dst_line_offset,
                                    si64 shift, ui32 width)
    {
      if (src_line->flags & line_buf::LFT_32BIT)
      {
        if (dst_line->flags & line_buf::LFT_32BIT)
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si32 *dp = dst_line->i32 + dst_line_offset;
          __m128i sh = _mm_set1_epi32((si32)(-shift));
          __m128i zero = _mm_setzero_si128();
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s = _mm_loadu_si128((__m128i*)sp);
            __m128i c = _mm_cmplt_epi32(s, zero);  // 0xFFFFFFFF for -ve values
            __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value
            v_m_sh = _mm_and_si128(c, v_m_sh);     // keep only - shift - value
            s = _mm_andnot_si128(c, s);            // keep only +ve or 0
            s = _mm_or_si128(s, v_m_sh);           // combine
            _mm_storeu_si128((__m128i*)dp, s);
          }
        }
        else
        {
          const si32 *sp = src_line->i32 + src_line_offset;
          si64 *dp = dst_line->i64 + dst_line_offset;
          __m128i sh = _mm_set1_epi64x(-shift);
          __m128i zero = _mm_setzero_si128();
          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
          {
            __m128i s, t, u, c, v_m_sh;
            s = _mm_loadu_si128((__m128i*)sp);

            t = _mm_cmplt_epi32(s, zero);      // -1 for -ve 32-bit samples
            u = _mm_unpacklo_epi32(s, t);      // sign-extended 64-bit data
            c = _mm_unpacklo_epi32(t, t);      // 64-bit -1 for -ve values

            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
            v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
            u = _mm_andnot_si128(c, u);        // keep only +ve or 0
            u = _mm_or_si128(u, v_m_sh);       // combine

            _mm_storeu_si128((__m128i*)dp, u);

            u = _mm_unpackhi_epi32(s, t);      // sign-extended 64-bit data
            c = _mm_unpackhi_epi32(t, t);      // 64-bit -1 for -ve values

            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
            v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
            u = _mm_andnot_si128(c, u);        // keep only +ve or 0
            u = _mm_or_si128(u, v_m_sh);       // combine

            _mm_storeu_si128((__m128i*)dp + 1, u);
          }
        }
      }
      else
      {
        assert(src_line->flags & line_buf::LFT_64BIT);
        assert(dst_line->flags & line_buf::LFT_32BIT);
        const si64 *sp = src_line->i64 + src_line_offset;
        si32 *dp = dst_line->i32 + dst_line_offset;
        __m128i sh = _mm_set1_epi64x(-shift);
        __m128i zero = _mm_setzero_si128();
        __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
        for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
        {
          // s for source, t for target, p for positive, n for negative,
          // m for mask, and tm for temp
          __m128i s, t, p, n, m, tm;
          s = _mm_loadu_si128((__m128i*)sp);

          tm = _mm_cmplt_epi32(s, zero);  // 32b -1 for -ve values
          m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
          tm = _mm_sub_epi64(sh, s);      // - shift - value
          n = _mm_and_si128(m, tm);       // -ve
          p = _mm_andnot_si128(m, s);     // +ve
          tm = _mm_or_si128(n, p);
          tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));
          t = _mm_and_si128(half_mask, tm);

          s = _mm_loadu_si128((__m128i*)sp + 1);
          tm = _mm_cmplt_epi32(s, zero);  // 32b -1 for -ve values
          m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
          tm = _mm_sub_epi64(sh, s);      // - shift - value
          n = _mm_and_si128(m, tm);       // -ve
          p = _mm_andnot_si128(m, s);     // +ve
          tm = _mm_or_si128(n, p);
          tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));
          tm = _mm_andnot_si128(half_mask, tm);

          t = _mm_or_si128(t, tm);
          _mm_storeu_si128((__m128i*)dp, t);
        }
      }
    }
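
    // NLT type 3 maps a negative sample to (-shift - value) and leaves
    // non-negative samples unchanged; per sample the code above computes
    //   dp[i] = (sp[i] < 0) ? (-shift - sp[i]) : sp[i];
    // implemented branchlessly with comparison masks, as in the select
    // helpers earlier in this file.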

    template<bool NLT_TYPE3>
    static inline
    void local_sse2_irv_convert_to_float(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      assert((src_line->flags & line_buf::LFT_32BIT) &&
             (src_line->flags & line_buf::LFT_INTEGER) &&
             (dst_line->flags & line_buf::LFT_32BIT) &&
             (dst_line->flags & line_buf::LFT_INTEGER) == 0);

      assert(bit_depth <= 32);
      __m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));

      const si32* sp = src_line->i32 + src_line_offset;
      float* dp = dst_line->f32;
      if (is_signed)
      {
        __m128i zero = _mm_setzero_si128();
        __m128i bias =
          _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128i t = _mm_loadu_si128((__m128i*)sp);
          if (NLT_TYPE3)
          {
            __m128i c = _mm_cmplt_epi32(t, zero); // 0xFFFFFFFF for -ve values
            __m128i neg = _mm_sub_epi32(bias, t); // - bias - value
            neg = _mm_and_si128(c, neg);          // keep only - bias - value
            c = _mm_andnot_si128(c, t);           // keep only +ve or 0
            t = _mm_or_si128(neg, c);             // combine
          }
          __m128 v = _mm_cvtepi32_ps(t);
          v = _mm_mul_ps(v, mul);
          _mm_storeu_ps(dp, v);
        }
      }
      else
      {
        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
          __m128i t = _mm_loadu_si128((__m128i*)sp);
          t = _mm_sub_epi32(t, half);
          __m128 v = _mm_cvtepi32_ps(t);
          v = _mm_mul_ps(v, mul);
          _mm_storeu_ps(dp, v);
        }
      }
    }
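
    // This is the inverse of local_sse2_irv_convert_to_integer: integers
    // are mapped back to floats in roughly [-0.5, 0.5). Per sample:
    //   if (is_signed) { if (NLT_TYPE3 && v < 0) v = bias - v; }
    //   else           { v -= 2^(bit_depth - 1); }
    //   dp[i] = (float)v / 2^bit_depth;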

    void sse2_irv_convert_to_float(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_float<false>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }

    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
      ui32 src_line_offset, line_buf *dst_line,
      ui32 bit_depth, bool is_signed, ui32 width)
    {
      local_sse2_irv_convert_to_float<true>(src_line, src_line_offset,
        dst_line, bit_depth, is_signed, width);
    }

    void sse2_rct_forward(const line_buf *r,
                          const line_buf *g,
                          const line_buf *b,
                          line_buf *y, line_buf *cb, line_buf *cr,
                          ui32 repeat)
    {
      assert((y->flags  & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags  & line_buf::LFT_INTEGER) &&
             (g->flags  & line_buf::LFT_INTEGER) &&
             (b->flags  & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags  & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags  & line_buf::LFT_32BIT) &&
               (g->flags  & line_buf::LFT_32BIT) &&
               (b->flags  & line_buf::LFT_32BIT));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i mr = _mm_load_si128((__m128i*)rp);
          __m128i mg = _mm_load_si128((__m128i*)gp);
          __m128i mb = _mm_load_si128((__m128i*)bp);
          __m128i t = _mm_add_epi32(mr, mb);
          t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
          _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2));
          t = _mm_sub_epi32(mb, mg);
          _mm_store_si128((__m128i*)cbp, t);
          t = _mm_sub_epi32(mr, mg);
          _mm_store_si128((__m128i*)crp, t);

          rp += 4; gp += 4; bp += 4;
          yp += 4; cbp += 4; crp += 4;
        }
      }
      else
      {
        assert((y->flags  & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags  & line_buf::LFT_32BIT) &&
               (g->flags  & line_buf::LFT_32BIT) &&
               (b->flags  & line_buf::LFT_32BIT));
        __m128i zero = _mm_setzero_si128();
        __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i mr32 = _mm_load_si128((__m128i*)rp);
          __m128i mg32 = _mm_load_si128((__m128i*)gp);
          __m128i mb32 = _mm_load_si128((__m128i*)bp);
          __m128i mr, mg, mb, t;
          mr = sse2_cvtlo_epi32_epi64(mr32, zero);
          mg = sse2_cvtlo_epi32_epi64(mg32, zero);
          mb = sse2_cvtlo_epi32_epi64(mb32, zero);

          t = _mm_add_epi64(mr, mb);
          t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
          _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
          t = _mm_sub_epi64(mb, mg);
          _mm_store_si128((__m128i*)cbp, t);
          t = _mm_sub_epi64(mr, mg);
          _mm_store_si128((__m128i*)crp, t);

          yp += 2; cbp += 2; crp += 2;

          mr = sse2_cvthi_epi32_epi64(mr32, zero);
          mg = sse2_cvthi_epi32_epi64(mg32, zero);
          mb = sse2_cvthi_epi32_epi64(mb32, zero);

          t = _mm_add_epi64(mr, mb);
          t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
          _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
          t = _mm_sub_epi64(mb, mg);
          _mm_store_si128((__m128i*)cbp, t);
          t = _mm_sub_epi64(mr, mg);
          _mm_store_si128((__m128i*)crp, t);

          rp += 4; gp += 4; bp += 4;
          yp += 2; cbp += 2; crp += 2;
        }
      }
    }
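
    // This is the forward reversible colour transform (RCT) of JPEG 2000:
    //   Y  = (R + 2G + B) >> 2,  Cb = B - G,  Cr = R - G
    // computed on 4 samples at a time; the 64-bit branch widens the data
    // first and uses the emulated 64-bit arithmetic shift defined above.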

    void sse2_rct_backward(const line_buf *y,
                           const line_buf *cb,
                           const line_buf *cr,
                           line_buf *r, line_buf *g, line_buf *b,
                           ui32 repeat)
    {
      assert((y->flags  & line_buf::LFT_INTEGER) &&
             (cb->flags & line_buf::LFT_INTEGER) &&
             (cr->flags & line_buf::LFT_INTEGER) &&
             (r->flags  & line_buf::LFT_INTEGER) &&
             (g->flags  & line_buf::LFT_INTEGER) &&
             (b->flags  & line_buf::LFT_INTEGER));

      if (y->flags & line_buf::LFT_32BIT)
      {
        assert((y->flags  & line_buf::LFT_32BIT) &&
               (cb->flags & line_buf::LFT_32BIT) &&
               (cr->flags & line_buf::LFT_32BIT) &&
               (r->flags  & line_buf::LFT_32BIT) &&
               (g->flags  & line_buf::LFT_32BIT) &&
               (b->flags  & line_buf::LFT_32BIT));
        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i my = _mm_load_si128((__m128i*)yp);
          __m128i mcb = _mm_load_si128((__m128i*)cbp);
          __m128i mcr = _mm_load_si128((__m128i*)crp);

          __m128i t = _mm_add_epi32(mcb, mcr);
          t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));
          _mm_store_si128((__m128i*)gp, t);
          __m128i u = _mm_add_epi32(mcb, t);
          _mm_store_si128((__m128i*)bp, u);
          u = _mm_add_epi32(mcr, t);
          _mm_store_si128((__m128i*)rp, u);

          yp += 4; cbp += 4; crp += 4;
          rp += 4; gp += 4; bp += 4;
        }
      }
      else
      {
        assert((y->flags  & line_buf::LFT_64BIT) &&
               (cb->flags & line_buf::LFT_64BIT) &&
               (cr->flags & line_buf::LFT_64BIT) &&
               (r->flags  & line_buf::LFT_32BIT) &&
               (g->flags  & line_buf::LFT_32BIT) &&
               (b->flags  & line_buf::LFT_32BIT));
        __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
        for (int i = (repeat + 3) >> 2; i > 0; --i)
        {
          __m128i my, mcb, mcr, tr, tg, tb;
          my = _mm_load_si128((__m128i*)yp);
          mcb = _mm_load_si128((__m128i*)cbp);
          mcr = _mm_load_si128((__m128i*)crp);

          tg = _mm_add_epi64(mcb, mcr);
          tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
          tb = _mm_add_epi64(mcb, tg);
          tr = _mm_add_epi64(mcr, tg);

          __m128i mr, mg, mb;
          mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));
          mr = _mm_and_si128(low_bits, mr);
          mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
          mg = _mm_and_si128(low_bits, mg);
          mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
          mb = _mm_and_si128(low_bits, mb);

          yp += 2; cbp += 2; crp += 2;

          my = _mm_load_si128((__m128i*)yp);
          mcb = _mm_load_si128((__m128i*)cbp);
          mcr = _mm_load_si128((__m128i*)crp);

          tg = _mm_add_epi64(mcb, mcr);
          tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2));
          tb = _mm_add_epi64(mcb, tg);
          tr = _mm_add_epi64(mcr, tg);

          tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));
          tr = _mm_andnot_si128(low_bits, tr);
          mr = _mm_or_si128(mr, tr);
          tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
          tg = _mm_andnot_si128(low_bits, tg);
          mg = _mm_or_si128(mg, tg);
          tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
          tb = _mm_andnot_si128(low_bits, tb);
          mb = _mm_or_si128(mb, tb);

          _mm_store_si128((__m128i*)rp, mr);
          _mm_store_si128((__m128i*)gp, mg);
          _mm_store_si128((__m128i*)bp, mb);

          yp += 2; cbp += 2; crp += 2;
          rp += 4; gp += 4; bp += 4;
        }
      }
    }
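
    // The inverse reversible colour transform recovers RGB exactly:
    //   G = Y - ((Cb + Cr) >> 2),  B = Cb + G,  R = Cr + G
    // The 64-bit branch packs the low 32 bits of each 64-bit result back
    // into 32-bit output lines using the low_bits mask and shuffles above.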
  } // namespace local
} // namespace ojph