streaming-load-memcpy.c revision af69d88d
1af69d88dSmrg/*
2af69d88dSmrg * Copyright © 2013 Intel Corporation
3af69d88dSmrg *
4af69d88dSmrg * Permission is hereby granted, free of charge, to any person obtaining a
5af69d88dSmrg * copy of this software and associated documentation files (the "Software"),
6af69d88dSmrg * to deal in the Software without restriction, including without limitation
7af69d88dSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8af69d88dSmrg * and/or sell copies of the Software, and to permit persons to whom the
9af69d88dSmrg * Software is furnished to do so, subject to the following conditions:
10af69d88dSmrg *
11af69d88dSmrg * The above copyright notice and this permission notice (including the next
12af69d88dSmrg * paragraph) shall be included in all copies or substantial portions of the
13af69d88dSmrg * Software.
14af69d88dSmrg *
15af69d88dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16af69d88dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17af69d88dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18af69d88dSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19af69d88dSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20af69d88dSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21af69d88dSmrg * IN THE SOFTWARE.
22af69d88dSmrg *
23af69d88dSmrg * Authors:
24af69d88dSmrg *    Eric Anholt <eric@anholt.net>
25af69d88dSmrg *    Matt Turner <mattst88@gmail.com>
26af69d88dSmrg *
27af69d88dSmrg */
28af69d88dSmrg
29af69d88dSmrg#ifdef __SSE4_1__
30af69d88dSmrg#include "main/macros.h"
31af69d88dSmrg#include "main/streaming-load-memcpy.h"
32af69d88dSmrg#include <smmintrin.h>
33af69d88dSmrg
/* Copies <len> bytes from <src> to <dst>, using SSE 4.1's MOVNTDQA to get
 * streaming read performance from uncached memory.
 *
 * The streaming path is only used when <dst> and <src> have the same offset
 * within a 16-byte line; otherwise the whole copy is handed to memcpy().
 * Within the streaming path, any misaligned head and any tail shorter than
 * 64 bytes are also copied with memcpy().
 *
 * The two regions must not overlap: both pointers are restrict-qualified and
 * memcpy() is used for the partial pieces.
 */
void
_mesa_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
{
   char *restrict d = dst;
   char *restrict s = src;

   /* If dst and src are not co-aligned, fallback to memcpy(). */
   if (((uintptr_t)d & 15) != ((uintptr_t)s & 15)) {
      memcpy(d, s, len);
      return;
   }

   /* memcpy() the misaligned header. At the end of this if block, <d> and <s>
    * are aligned to a 16-byte boundary or <len> == 0.
    */
   if ((uintptr_t)d & 15) {
      size_t head = 16 - ((uintptr_t)d & 15);
      assert(head < 16);

      /* Never copy (or step) further than the caller asked for, so that the
       * pointers stay inside the buffers even for very short copies.
       */
      if (head > len)
         head = len;

      memcpy(d, s, head);

      d += head;
      s += head;
      len -= head;
   }

   /* MOVNTDQA is weakly ordered. Intel's guidance is to issue an MFENCE
    * before streaming loads so they cannot return stale data when another
    * agent has written the source memory. Skip the fence when the streaming
    * loop below will not run at all.
    */
   if (len >= 64)
      _mm_mfence();

   /* Move one 64-byte chunk per iteration: four 16-byte streaming loads
    * followed by four aligned stores.
    */
   while (len >= 64) {
      __m128i *dst_cacheline = (__m128i *)d;
      __m128i *src_cacheline = (__m128i *)s;

      __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0);
      __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1);
      __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2);
      __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);

      _mm_store_si128(dst_cacheline + 0, temp1);
      _mm_store_si128(dst_cacheline + 1, temp2);
      _mm_store_si128(dst_cacheline + 2, temp3);
      _mm_store_si128(dst_cacheline + 3, temp4);

      d += 64;
      s += 64;
      len -= 64;
   }

   /* memcpy() the tail. */
   if (len) {
      memcpy(d, s, len);
   }
}
87af69d88dSmrg
88af69d88dSmrg#endif
89