1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2014 Timothy Arceri 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg * 23b8e80941Smrg * Author: 24b8e80941Smrg * Timothy Arceri <t_arceri@yahoo.com.au> 25b8e80941Smrg * 26b8e80941Smrg */ 27b8e80941Smrg 28b8e80941Smrg#include "main/sse_minmax.h" 29b8e80941Smrg#include <smmintrin.h> 30b8e80941Smrg#include <stdint.h> 31b8e80941Smrg 32b8e80941Smrgvoid 33b8e80941Smrg_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index, 34b8e80941Smrg unsigned *max_index, const unsigned count) 35b8e80941Smrg{ 36b8e80941Smrg unsigned max_ui = 0; 37b8e80941Smrg unsigned min_ui = ~0U; 38b8e80941Smrg unsigned i = 0; 39b8e80941Smrg unsigned aligned_count = count; 40b8e80941Smrg 41b8e80941Smrg /* handle the first few values without SSE until the pointer is aligned */ 42b8e80941Smrg while (((uintptr_t)ui_indices & 15) && aligned_count) { 43b8e80941Smrg if (*ui_indices > max_ui) 44b8e80941Smrg max_ui = *ui_indices; 45b8e80941Smrg if (*ui_indices < min_ui) 46b8e80941Smrg min_ui = *ui_indices; 47b8e80941Smrg 48b8e80941Smrg aligned_count--; 49b8e80941Smrg ui_indices++; 50b8e80941Smrg } 51b8e80941Smrg 52b8e80941Smrg /* TODO: The actual threshold for SSE begin useful may be higher than 8. 53b8e80941Smrg * Some careful microbenchmarks and measurement are required to 54b8e80941Smrg * find the actual tipping point. 55b8e80941Smrg */ 56b8e80941Smrg if (aligned_count >= 8) { 57b8e80941Smrg unsigned max_arr[4] __attribute__ ((aligned (16))); 58b8e80941Smrg unsigned min_arr[4] __attribute__ ((aligned (16))); 59b8e80941Smrg unsigned vec_count; 60b8e80941Smrg __m128i max_ui4 = _mm_setzero_si128(); 61b8e80941Smrg __m128i min_ui4 = _mm_set1_epi32(~0U); 62b8e80941Smrg __m128i ui_indices4; 63b8e80941Smrg __m128i *ui_indices_ptr; 64b8e80941Smrg 65b8e80941Smrg vec_count = aligned_count & ~0x3; 66b8e80941Smrg ui_indices_ptr = (__m128i *)ui_indices; 67b8e80941Smrg for (i = 0; i < vec_count / 4; i++) { 68b8e80941Smrg ui_indices4 = _mm_load_si128(&ui_indices_ptr[i]); 69b8e80941Smrg max_ui4 = _mm_max_epu32(ui_indices4, max_ui4); 70b8e80941Smrg min_ui4 = _mm_min_epu32(ui_indices4, min_ui4); 71b8e80941Smrg } 72b8e80941Smrg 73b8e80941Smrg _mm_store_si128((__m128i *)max_arr, max_ui4); 74b8e80941Smrg _mm_store_si128((__m128i *)min_arr, min_ui4); 75b8e80941Smrg 76b8e80941Smrg for (i = 0; i < 4; i++) { 77b8e80941Smrg if (max_arr[i] > max_ui) 78b8e80941Smrg max_ui = max_arr[i]; 79b8e80941Smrg if (min_arr[i] < min_ui) 80b8e80941Smrg min_ui = min_arr[i]; 81b8e80941Smrg } 82b8e80941Smrg i = vec_count; 83b8e80941Smrg } 84b8e80941Smrg 85b8e80941Smrg for (; i < aligned_count; i++) { 86b8e80941Smrg if (ui_indices[i] > max_ui) 87b8e80941Smrg max_ui = ui_indices[i]; 88b8e80941Smrg if (ui_indices[i] < min_ui) 89b8e80941Smrg min_ui = ui_indices[i]; 90b8e80941Smrg } 91b8e80941Smrg 92b8e80941Smrg *min_index = min_ui; 93b8e80941Smrg *max_index = max_ui; 94b8e80941Smrg} 95