                                 ui32 bit_depth, bool is_signed, ui32 width)
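  // Float-to-integer conversion: each float sample is scaled by 2^bit_depth
  // and rounded to the nearest integer; the SSE rounding mode is forced to
  // round-to-nearest for the duration of the routine and restored at the end.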
  assert(bit_depth <= 32);
  uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
  _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

  const float* sp = src_line->f32;
  si32* dp = dst_line->i32 + dst_line_offset;
  // scaling factor and saturation limits for a bit_depth-bit output range
  si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
  __m128 mul = _mm_set1_ps((float)(1ull << bit_depth));
  __m128 fl_up_lim = _mm_set1_ps(-(float)neg_limit);
  __m128 fl_low_lim = _mm_set1_ps((float)neg_limit);
  __m128i s32_up_lim = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
  __m128i s32_low_lim = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
  if (is_signed)
  {
    __m128i zero = _mm_setzero_si128();
    __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128 t = _mm_loadu_ps(sp);
      t = _mm_mul_ps(t, mul);
      __m128i u = _mm_cvtps_epi32(t);

      __m128i c = _mm_cmpgt_epi32(zero, u);  // mask of lanes where u < 0
      __m128i neg = _mm_sub_epi32(bias, u);  // bias - u
      neg = _mm_and_si128(c, neg);           // keep (bias - u) in the negative lanes
      u = _mm_andnot_si128(c, u);            // keep u in the non-negative lanes
      u = _mm_or_si128(neg, u);              // merge: u >= 0 ? u : bias - u

      _mm_storeu_si128((__m128i*)dp, u);
    }
  }
  else
  {
    __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128 t = _mm_loadu_ps(sp);
      t = _mm_mul_ps(t, mul);
      __m128i u = _mm_cvtps_epi32(t);
      u = _mm_add_epi32(u, half);            // offset unsigned samples by 2^(bit_depth-1)
      _mm_storeu_si128((__m128i*)dp, u);
    }
  }
  _MM_SET_ROUNDING_MODE(rounding_mode);
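
  // The converters below copy a line of integer samples while adding a
  // constant offset (shift); there is one loop per combination of 32-bit and
  // 64-bit source/destination samples.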
                          const ui32 src_line_offset,
                          const ui32 dst_line_offset,

  const si32 *sp = src_line->i32 + src_line_offset;
  si32 *dp = dst_line->i32 + dst_line_offset;
  __m128i sh = _mm_set1_epi32((si32)shift);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128i s = _mm_loadu_si128((__m128i*)sp);
    s = _mm_add_epi32(s, sh);                 // dst = src + shift
    _mm_storeu_si128((__m128i*)dp, s);
  }
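
  // 32-bit source, 64-bit destination: each group of four samples is
  // sign-extended to 64 bits in two halves before the shift is added.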
  const si32 *sp = src_line->i32 + src_line_offset;
  si64 *dp = dst_line->i64 + dst_line_offset;
  __m128i zero = _mm_setzero_si128();
  __m128i sh = _mm_set1_epi64x(shift);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128i s, t;
    s = _mm_loadu_si128((__m128i*)sp);

    t = _mm_cmplt_epi32(s, zero);             // per-sample sign mask
    t = _mm_unpacklo_epi32(s, t);             // sign-extend the two low samples to 64 bits
    t = _mm_add_epi64(t, sh);
    _mm_storeu_si128((__m128i*)dp, t);

    t = _mm_cmplt_epi32(s, zero);
    t = _mm_unpackhi_epi32(s, t);             // sign-extend the two high samples to 64 bits
    t = _mm_add_epi64(t, sh);
    _mm_storeu_si128((__m128i*)dp + 1, t);
  }
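
  // 64-bit source, 32-bit destination: the shift is added in 64 bits, then
  // the low 32 bits of each result are packed into a single register.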
  const si64 *sp = src_line->i64 + src_line_offset;
  si32 *dp = dst_line->i32 + dst_line_offset;
  __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
  __m128i sh = _mm_set1_epi64x(shift);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128i s, t;
    s = _mm_loadu_si128((__m128i*)sp);
    s = _mm_add_epi64(s, sh);

    t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); // gather the low 32 bits of each lane
    t = _mm_and_si128(low_bits, t);                    // keep them in the low half

    s = _mm_loadu_si128((__m128i*)sp + 1);
    s = _mm_add_epi64(s, sh);

    s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); // move the low 32 bits to the high half
    s = _mm_andnot_si128(low_bits, s);

    t = _mm_or_si128(s, t);                            // pack four 32-bit results
    _mm_storeu_si128((__m128i*)dp, t);
  }
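
  // The converters below remap negative samples: a negative sample x becomes
  // (-shift - x), while non-negative samples are copied unchanged. As above,
  // there is one loop per source/destination sample width.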
                          const ui32 src_line_offset,
                          const ui32 dst_line_offset,

  const si32 *sp = src_line->i32 + src_line_offset;
  si32 *dp = dst_line->i32 + dst_line_offset;
  __m128i sh = _mm_set1_epi32((si32)(-shift));
  __m128i zero = _mm_setzero_si128();
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128i s = _mm_loadu_si128((__m128i*)sp);
    __m128i c = _mm_cmplt_epi32(s, zero);     // mask of negative samples
    __m128i v_m_sh = _mm_sub_epi32(sh, s);    // -shift - s
    v_m_sh = _mm_and_si128(c, v_m_sh);        // keep (-shift - s) in the negative lanes
    s = _mm_andnot_si128(c, s);               // keep s in the non-negative lanes
    s = _mm_or_si128(s, v_m_sh);              // merge
    _mm_storeu_si128((__m128i*)dp, s);
  }
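
  // 32-bit source, 64-bit destination: the sign masks are widened together
  // with the samples so the remapping can be applied with 64-bit arithmetic.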
  const si32 *sp = src_line->i32 + src_line_offset;
  si64 *dp = dst_line->i64 + dst_line_offset;
  __m128i sh = _mm_set1_epi64x(-shift);
  __m128i zero = _mm_setzero_si128();
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128i s, t, u, c, v_m_sh;
    s = _mm_loadu_si128((__m128i*)sp);

    t = _mm_cmplt_epi32(s, zero);             // per-sample sign mask
    u = _mm_unpacklo_epi32(s, t);             // sign-extend the two low samples to 64 bits
    c = _mm_unpacklo_epi32(t, t);             // widen their masks to 64 bits

    v_m_sh = _mm_sub_epi64(sh, u);            // -shift - u
    v_m_sh = _mm_and_si128(c, v_m_sh);        // keep (-shift - u) in the negative lanes
    u = _mm_andnot_si128(c, u);               // keep u in the non-negative lanes
    u = _mm_or_si128(u, v_m_sh);              // merge

    _mm_storeu_si128((__m128i*)dp, u);

    u = _mm_unpackhi_epi32(s, t);             // sign-extend the two high samples
    c = _mm_unpackhi_epi32(t, t);

    v_m_sh = _mm_sub_epi64(sh, u);
    v_m_sh = _mm_and_si128(c, v_m_sh);
    u = _mm_andnot_si128(c, u);
    u = _mm_or_si128(u, v_m_sh);

    _mm_storeu_si128((__m128i*)dp + 1, u);
  }
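
  // 64-bit source, 32-bit destination: the sign mask of each 64-bit sample is
  // built by replicating its high word, the remapping is applied, and the low
  // 32 bits of the results are packed for storing.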
  const si64 *sp = src_line->i64 + src_line_offset;
  si32 *dp = dst_line->i32 + dst_line_offset;
  __m128i sh = _mm_set1_epi64x(-shift);
  __m128i zero = _mm_setzero_si128();
  __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX);
  for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
  {
    __m128i s, t, p, n, m, tm;
    s = _mm_loadu_si128((__m128i*)sp);

    tm = _mm_cmplt_epi32(s, zero);                        // 32-bit sign comparison
    m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1));   // replicate the high words -> 64-bit masks
    tm = _mm_sub_epi64(sh, s);                            // -shift - s
    n = _mm_and_si128(m, tm);                             // keep (-shift - s) in the negative lanes
    p = _mm_andnot_si128(m, s);                           // keep s in the non-negative lanes
    tm = _mm_or_si128(n, p);                              // merge
    tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0));  // gather the low 32 bits of each lane
    t = _mm_and_si128(half_mask, tm);                     // keep them in the low half

    s = _mm_loadu_si128((__m128i*)sp + 1);
    tm = _mm_cmplt_epi32(s, zero);
    m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1));
    tm = _mm_sub_epi64(sh, s);
    n = _mm_and_si128(m, tm);
    p = _mm_andnot_si128(m, s);
    tm = _mm_or_si128(n, p);
    tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0));  // move results to the high half
    tm = _mm_andnot_si128(half_mask, tm);

    t = _mm_or_si128(t, tm);                              // pack four 32-bit results
    _mm_storeu_si128((__m128i*)dp, t);
  }
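
  // Integer-to-float conversion: samples are mapped back to a range centered
  // on zero and scaled by 1 / 2^bit_depth.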
                                 ui32 bit_depth, bool is_signed, ui32 width)
  assert(bit_depth <= 32);
  __m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));

  const si32* sp = src_line->i32 + src_line_offset;
  float* dp = dst_line->f32;
  if (is_signed)
  {
    __m128i zero = _mm_setzero_si128();
    __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128i t = _mm_loadu_si128((__m128i*)sp);

      __m128i c = _mm_cmplt_epi32(t, zero);  // mask of negative samples
      __m128i neg = _mm_sub_epi32(bias, t);  // bias - t
      neg = _mm_and_si128(c, neg);           // keep (bias - t) in the negative lanes
      c = _mm_andnot_si128(c, t);            // keep t in the non-negative lanes
      t = _mm_or_si128(neg, c);              // merge

      __m128 v = _mm_cvtepi32_ps(t);
      v = _mm_mul_ps(v, mul);
      _mm_storeu_ps(dp, v);
    }
  }
  else
  {
    __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
    for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
      __m128i t = _mm_loadu_si128((__m128i*)sp);
      t = _mm_sub_epi32(t, half);            // remove the 2^(bit_depth-1) offset
      __m128 v = _mm_cvtepi32_ps(t);
      v = _mm_mul_ps(v, mul);
      _mm_storeu_ps(dp, v);
    }
  }
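
  // Forward reversible color transform (RCT), 32-bit samples:
  //   Y = (R + 2G + B) >> 2,  Cb = B - G,  Cr = R - G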
  for (int i = (repeat + 3) >> 2; i > 0; --i)
  {
    __m128i mr = _mm_load_si128((__m128i*)rp);
    __m128i mg = _mm_load_si128((__m128i*)gp);
    __m128i mb = _mm_load_si128((__m128i*)bp);
    __m128i t = _mm_add_epi32(mr, mb);
    t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1));
    _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2));  // Y = (R + 2G + B) >> 2
    t = _mm_sub_epi32(mb, mg);
    _mm_store_si128((__m128i*)cbp, t);                    // Cb = B - G
    t = _mm_sub_epi32(mr, mg);
    _mm_store_si128((__m128i*)crp, t);                    // Cr = R - G

    rp += 4; gp += 4; bp += 4;
    yp += 4; cbp += 4; crp += 4;
  }
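
  // Forward RCT with 64-bit intermediates: each group of four 32-bit RGB
  // samples is processed as two pairs, producing 64-bit Y/Cb/Cr outputs.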
  __m128i zero = _mm_setzero_si128();
  __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
  for (int i = (repeat + 3) >> 2; i > 0; --i)
  {
    __m128i mr32 = _mm_load_si128((__m128i*)rp);
    __m128i mg32 = _mm_load_si128((__m128i*)gp);
    __m128i mb32 = _mm_load_si128((__m128i*)bp);
    __m128i mr, mg, mb, t;

    // first pair of samples; mr, mg, mb carry the channel values widened to 64 bits
    t = _mm_add_epi64(mr, mb);
    t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));           // R + 2G + B (used to form Y)

    t = _mm_sub_epi64(mb, mg);
    _mm_store_si128((__m128i*)cbp, t);                     // Cb = B - G
    t = _mm_sub_epi64(mr, mg);
    _mm_store_si128((__m128i*)crp, t);                     // Cr = R - G

    yp += 2; cbp += 2; crp += 2;

    // second pair of samples
    t = _mm_add_epi64(mr, mb);
    t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));

    t = _mm_sub_epi64(mb, mg);
    _mm_store_si128((__m128i*)cbp, t);
    t = _mm_sub_epi64(mr, mg);
    _mm_store_si128((__m128i*)crp, t);

    rp += 4; gp += 4; bp += 4;
    yp += 2; cbp += 2; crp += 2;
  }
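
  // Inverse RCT, 32-bit samples:
  //   G = Y - ((Cb + Cr) >> 2),  B = Cb + G,  R = Cr + G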
  for (int i = (repeat + 3) >> 2; i > 0; --i)
  {
    __m128i my = _mm_load_si128((__m128i*)yp);
    __m128i mcb = _mm_load_si128((__m128i*)cbp);
    __m128i mcr = _mm_load_si128((__m128i*)crp);

    __m128i t = _mm_add_epi32(mcb, mcr);
    t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2));   // G = Y - ((Cb + Cr) >> 2)
    _mm_store_si128((__m128i*)gp, t);
    __m128i u = _mm_add_epi32(mcb, t);             // B = Cb + G
    _mm_store_si128((__m128i*)bp, u);
    u = _mm_add_epi32(mcr, t);                     // R = Cr + G
    _mm_store_si128((__m128i*)rp, u);

    yp += 4; cbp += 4; crp += 4;
    rp += 4; gp += 4; bp += 4;
  }
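
  // Inverse RCT from 64-bit Y/Cb/Cr to 32-bit RGB: two pairs of 64-bit samples
  // are transformed, then the low 32 bits of the results are packed into one
  // register per channel before storing.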
  __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
  __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
  for (int i = (repeat + 3) >> 2; i > 0; --i)
  {
    __m128i my, mcb, mcr, tr, tg, tb;
    __m128i mr, mg, mb;

    // first pair of 64-bit samples
    my = _mm_load_si128((__m128i*)yp);
    mcb = _mm_load_si128((__m128i*)cbp);
    mcr = _mm_load_si128((__m128i*)crp);

    tg = _mm_add_epi64(mcb, mcr);                          // Cb + Cr

    tb = _mm_add_epi64(mcb, tg);
    tr = _mm_add_epi64(mcr, tg);

    mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0));   // gather the low 32 bits of each lane
    mr = _mm_and_si128(low_bits, mr);
    mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0));
    mg = _mm_and_si128(low_bits, mg);
    mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0));
    mb = _mm_and_si128(low_bits, mb);

    yp += 2; cbp += 2; crp += 2;

    // second pair of 64-bit samples
    my = _mm_load_si128((__m128i*)yp);
    mcb = _mm_load_si128((__m128i*)cbp);
    mcr = _mm_load_si128((__m128i*)crp);

    tg = _mm_add_epi64(mcb, mcr);

    tb = _mm_add_epi64(mcb, tg);
    tr = _mm_add_epi64(mcr, tg);

    tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0));   // move results to the high half
    tr = _mm_andnot_si128(low_bits, tr);
    mr = _mm_or_si128(mr, tr);
    tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0));
    tg = _mm_andnot_si128(low_bits, tg);
    mg = _mm_or_si128(mg, tg);
    tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0));
    tb = _mm_andnot_si128(low_bits, tb);
    mb = _mm_or_si128(mb, tb);

    _mm_store_si128((__m128i*)rp, mr);                     // store four 32-bit samples per channel
    _mm_store_si128((__m128i*)gp, mg);
    _mm_store_si128((__m128i*)bp, mb);

    yp += 2; cbp += 2; crp += 2;
    rp += 4; gp += 4; bp += 4;
  }