/* streaming-load-memcpy.c revision af69d88d */
/*
 * Copyright © 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Matt Turner <mattst88@gmail.com>
 *
 */

#ifdef __SSE4_1__
#include "main/macros.h"
#include "main/streaming-load-memcpy.h"
#include <smmintrin.h>

/**
 * Copy \p len bytes from \p src to \p dst, reading the bulk of the source
 * with SSE 4.1 streaming loads (MOVNTDQA) so that reads from uncached memory
 * get streaming performance.
 *
 * The copy proceeds in up to three phases:
 *   1. a plain memcpy() of the bytes in front of the first 16-byte boundary,
 *   2. 64-byte chunks moved as four streaming loads followed by four aligned
 *      128-bit stores,
 *   3. a plain memcpy() of whatever is left (fewer than 64 bytes).
 *
 * When the two pointers do not share the same offset within a 16-byte block
 * the streaming path cannot be used for both sides, so the whole request is
 * handed to memcpy() instead.
 *
 * NOTE(review): no fence is issued before the streaming loads.  Intel's
 * MOVNTDQA documentation recommends MFENCE when the reads must be ordered
 * against writes made by other agents -- confirm whether callers are expected
 * to take care of that.
 *
 * \param dst  destination buffer; must not overlap \p src (restrict).
 * \param src  source buffer; must not overlap \p dst (restrict).
 * \param len  number of bytes to copy.
 */
void
_mesa_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
{
   char *restrict out = dst;
   char *restrict in = src;
   const uintptr_t misalignment = (uintptr_t)out & 15;

   /* Both sides must sit at the same offset inside a 16-byte block,
    * otherwise aligning one would leave the other misaligned.
    */
   if (misalignment != ((uintptr_t)in & 15)) {
      memcpy(out, in, len);
      return;
   }

   /* Peel off the unaligned head.  Afterwards either both pointers are
    * 16-byte aligned or nothing remains to be copied.
    */
   if (misalignment != 0) {
      const uintptr_t to_boundary = 16 - misalignment;
      assert(to_boundary < 16);

      /* Never copy more than the caller asked for. */
      const size_t head = MIN2(to_boundary, len);

      memcpy(out, in, head);

      out = (char *)ALIGN((uintptr_t)out, 16);
      in = (char *)ALIGN((uintptr_t)in, 16);
      len -= head;
   }

   /* Bulk phase: 64 bytes per iteration, four 16-byte quadwords at a time. */
   for (; len >= 64; len -= 64, out += 64, in += 64) {
      __m128i *dst_line = (__m128i *)out;
      __m128i *src_line = (__m128i *)in;

      /* Issue all four streaming loads before any of the stores. */
      const __m128i q0 = _mm_stream_load_si128(&src_line[0]);
      const __m128i q1 = _mm_stream_load_si128(&src_line[1]);
      const __m128i q2 = _mm_stream_load_si128(&src_line[2]);
      const __m128i q3 = _mm_stream_load_si128(&src_line[3]);

      _mm_store_si128(&dst_line[0], q0);
      _mm_store_si128(&dst_line[1], q1);
      _mm_store_si128(&dst_line[2], q2);
      _mm_store_si128(&dst_line[3], q3);
   }

   /* Anything shorter than one 64-byte chunk goes through memcpy(). */
   if (len != 0) {
      memcpy(out, in, len);
   }
}

#endif