1b8e80941Smrg/*
2b8e80941Smrg * Copyright © 2014 Timothy Arceri
3b8e80941Smrg *
4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
6b8e80941Smrg * to deal in the Software without restriction, including without limitation
7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
9b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
10b8e80941Smrg *
11b8e80941Smrg * The above copyright notice and this permission notice (including the next
12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
13b8e80941Smrg * Software.
14b8e80941Smrg *
15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21b8e80941Smrg * IN THE SOFTWARE.
22b8e80941Smrg *
23b8e80941Smrg * Author:
24b8e80941Smrg *    Timothy Arceri <t_arceri@yahoo.com.au>
25b8e80941Smrg *
26b8e80941Smrg */
27b8e80941Smrg
28b8e80941Smrg#include "main/sse_minmax.h"
29b8e80941Smrg#include <smmintrin.h>
30b8e80941Smrg#include <stdint.h>
31b8e80941Smrg
/**
 * Scan an array of unsigned indices and report its smallest and largest
 * element, using SSE4.1 unsigned min/max on the 16-byte aligned middle part.
 *
 * \param ui_indices  array of \p count unsigned values to scan
 * \param min_index   receives the smallest value found
 * \param max_index   receives the largest value found
 * \param count       number of elements in \p ui_indices
 *
 * When \p count is 0 nothing is read and the initial accumulators are
 * returned unchanged: *min_index == ~0U and *max_index == 0.
 */
void
_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
                         unsigned *max_index, const unsigned count)
{
   unsigned lowest = ~0U;
   unsigned highest = 0;
   unsigned remaining = count;
   unsigned pos = 0;

   /* Scalar prologue: consume one element at a time until the pointer sits
    * on a 16-byte boundary (required by the aligned loads below) or the
    * input runs out.
    */
   for (; remaining != 0 && ((uintptr_t)ui_indices & 15) != 0;
        ui_indices++, remaining--) {
      const unsigned value = *ui_indices;

      if (value > highest)
         highest = value;
      if (value < lowest)
         lowest = value;
   }

   /* TODO: The real break-even point for the SSE path may be above 8
    * elements; careful microbenchmarking is needed to find it.
    */
   if (remaining >= 8) {
      unsigned highest_lanes[4] __attribute__ ((aligned (16)));
      unsigned lowest_lanes[4] __attribute__ ((aligned (16)));
      const __m128i *vec_src = (const __m128i *)ui_indices;
      const unsigned num_vecs = remaining / 4;
      __m128i highest4 = _mm_setzero_si128();
      __m128i lowest4 = _mm_set1_epi32(~0U);
      unsigned v;
      unsigned lane;

      /* Four values per iteration; each lane tracks its own min and max. */
      for (v = 0; v < num_vecs; v++) {
         const __m128i chunk = _mm_load_si128(vec_src + v);

         highest4 = _mm_max_epu32(chunk, highest4);
         lowest4 = _mm_min_epu32(chunk, lowest4);
      }

      /* Spill the per-lane results and fold them into the scalar totals. */
      _mm_store_si128((__m128i *)highest_lanes, highest4);
      _mm_store_si128((__m128i *)lowest_lanes, lowest4);

      for (lane = 0; lane < 4; lane++) {
         if (highest_lanes[lane] > highest)
            highest = highest_lanes[lane];
         if (lowest_lanes[lane] < lowest)
            lowest = lowest_lanes[lane];
      }

      /* Everything up to the last whole vector has been handled. */
      pos = num_vecs * 4;
   }

   /* Scalar epilogue: leftover elements after the vector loop, or the whole
    * remainder when the SSE path was skipped.
    */
   while (pos < remaining) {
      const unsigned value = ui_indices[pos];

      if (value > highest)
         highest = value;
      if (value < lowest)
         lowest = value;
      pos++;
   }

   *min_index = lowest;
   *max_index = highest;
}
95