OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_sse.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_sse.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include <cstdio>
39#include <xmmintrin.h>
40
41#include "ojph_defs.h"
42#include "ojph_arch.h"
43#include "ojph_mem.h"
44#include "ojph_params.h"
45#include "../codestream/ojph_params_local.h"
46
47#include "ojph_transform.h"
49
50namespace ojph {
51 namespace local {
52
54 static inline
55 void sse_deinterleave32(float* dpl, float* dph, float* sp, int width)
56 {
57 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
58 {
59 __m128 a = _mm_load_ps(sp);
60 __m128 b = _mm_load_ps(sp + 4);
61 __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
62 __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
63 _mm_store_ps(dpl, c);
64 _mm_store_ps(dph, d);
65 }
66 }
67
69 static inline
70 void sse_interleave32(float* dp, float* spl, float* sph, int width) \
71 {
72 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
73 {
74 __m128 a = _mm_load_ps(spl);
75 __m128 b = _mm_load_ps(sph);
76 __m128 c = _mm_unpacklo_ps(a, b);
77 __m128 d = _mm_unpackhi_ps(a, b);
78 _mm_store_ps(dp, c);
79 _mm_store_ps(dp + 4, d);
80 }
81 }
82
84 static inline void sse_multiply_const(float* p, float f, int width)
85 {
86 __m128 factor = _mm_set1_ps(f);
87 for (; width > 0; width -= 4, p += 4)
88 {
89 __m128 s = _mm_load_ps(p);
90 _mm_store_ps(p, _mm_mul_ps(factor, s));
91 }
92 }
93
95 void sse_irv_vert_step(const lifting_step* s, const line_buf* sig,
96 const line_buf* other, const line_buf* aug,
97 ui32 repeat, bool synthesis)
98 {
99 float a = s->irv.Aatk;
100 if (synthesis)
101 a = -a;
102
103 __m128 factor = _mm_set1_ps(a);
104
105 float* dst = aug->f32;
106 const float* src1 = sig->f32, * src2 = other->f32;
107 int i = (int)repeat;
108 for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
109 {
110 __m128 s1 = _mm_load_ps(src1);
111 __m128 s2 = _mm_load_ps(src2);
112 __m128 d = _mm_load_ps(dst);
113 d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
114 _mm_store_ps(dst, d);
115 }
116 }
117
119 void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
120 {
121 sse_multiply_const(aug->f32, K, (int)repeat);
122 }
123
125 void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
126 const line_buf* hdst, const line_buf* src,
127 ui32 width, bool even)
128 {
129 if (width > 1)
130 {
131 // split src into ldst and hdst
132 {
133 float* dpl = even ? ldst->f32 : hdst->f32;
134 float* dph = even ? hdst->f32 : ldst->f32;
135 float* sp = src->f32;
136 int w = (int)width;
137 sse_deinterleave32(dpl, dph, sp, w);
138 }
139
140 // the actual horizontal transform
141 float* hp = hdst->f32, * lp = ldst->f32;
142 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
143 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
144 ui32 num_steps = atk->get_num_steps();
145 for (ui32 j = num_steps; j > 0; --j)
146 {
147 const lifting_step* s = atk->get_step(j - 1);
148 const float a = s->irv.Aatk;
149
150 // extension
151 lp[-1] = lp[0];
152 lp[l_width] = lp[l_width - 1];
153 // lifting step
154 const float* sp = lp;
155 float* dp = hp;
156 int i = (int)h_width;
157 __m128 f = _mm_set1_ps(a);
158 if (even)
159 {
160 for (; i > 0; i -= 4, sp += 4, dp += 4)
161 {
162 __m128 m = _mm_load_ps(sp);
163 __m128 n = _mm_loadu_ps(sp + 1);
164 __m128 p = _mm_load_ps(dp);
165 p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
166 _mm_store_ps(dp, p);
167 }
168 }
169 else
170 {
171 for (; i > 0; i -= 4, sp += 4, dp += 4)
172 {
173 __m128 m = _mm_load_ps(sp);
174 __m128 n = _mm_loadu_ps(sp - 1);
175 __m128 p = _mm_load_ps(dp);
176 p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
177 _mm_store_ps(dp, p);
178 }
179 }
180
181 // swap buffers
182 float* t = lp; lp = hp; hp = t;
183 even = !even;
184 ui32 w = l_width; l_width = h_width; h_width = w;
185 }
186
187 { // multiply by K or 1/K
188 float K = atk->get_K();
189 float K_inv = 1.0f / K;
190 sse_multiply_const(lp, K_inv, (int)l_width);
191 sse_multiply_const(hp, K, (int)h_width);
192 }
193 }
194 else {
195 if (even)
196 ldst->f32[0] = src->f32[0];
197 else
198 hdst->f32[0] = src->f32[0] * 2.0f;
199 }
200 }
201
203 void sse_irv_horz_syn(const param_atk* atk, const line_buf* dst,
204 const line_buf* lsrc, const line_buf* hsrc,
205 ui32 width, bool even)
206 {
207 if (width > 1)
208 {
209 bool ev = even;
210 float* oth = hsrc->f32, * aug = lsrc->f32;
211 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
212 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
213
214 { // multiply by K or 1/K
215 float K = atk->get_K();
216 float K_inv = 1.0f / K;
217 sse_multiply_const(aug, K, (int)aug_width);
218 sse_multiply_const(oth, K_inv, (int)oth_width);
219 }
220
221 // the actual horizontal transform
222 ui32 num_steps = atk->get_num_steps();
223 for (ui32 j = 0; j < num_steps; ++j)
224 {
225 const lifting_step* s = atk->get_step(j);
226 const float a = s->irv.Aatk;
227
228 // extension
229 oth[-1] = oth[0];
230 oth[oth_width] = oth[oth_width - 1];
231 // lifting step
232 const float* sp = oth;
233 float* dp = aug;
234 int i = (int)aug_width;
235 __m128 f = _mm_set1_ps(a);
236 if (ev)
237 {
238 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
239 {
240 __m128 m = _mm_load_ps(sp);
241 __m128 n = _mm_loadu_ps(sp - 1);
242 __m128 p = _mm_load_ps(dp);
243 p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
244 _mm_store_ps(dp, p);
245 }
246 }
247 else
248 {
249 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
250 {
251 __m128 m = _mm_load_ps(sp);
252 __m128 n = _mm_loadu_ps(sp + 1);
253 __m128 p = _mm_load_ps(dp);
254 p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
255 _mm_store_ps(dp, p);
256 }
257 }
258
259 // swap buffers
260 float* t = aug; aug = oth; oth = t;
261 ev = !ev;
262 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
263 }
264
265 // combine both lsrc and hsrc into dst
266 {
267 float* dp = dst->f32;
268 float* spl = even ? lsrc->f32 : hsrc->f32;
269 float* sph = even ? hsrc->f32 : lsrc->f32;
270 int w = (int)width;
271 sse_interleave32(dp, spl, sph, w);
272 }
273 }
274 else {
275 if (even)
276 dst->f32[0] = lsrc->f32[0];
277 else
278 dst->f32[0] = hsrc->f32[0] * 0.5f;
279 }
280 }
281
282 } // !local
283} // !ojph
float * f32
Definition: ojph_mem.h:162
void sse_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
static void sse_multiply_const(float *p, float f, int width)
void sse_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void sse_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
static void sse_deinterleave32(float *dpl, float *dph, float *sp, int width)
void sse_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void sse_interleave32(float *dp, float *spl, float *sph, int width)
uint32_t ui32
Definition: ojph_defs.h:54
const lifting_step * get_step(ui32 s) const