config/nvptx/doacross.h

1.1.1.8  mrg /* Copyright (C) 2015-2024 Free Software Foundation, Inc.
    1.1  mrg    Contributed by Alexander Monakov <amonakov (at) ispras.ru>
    1.1  mrg
    1.1  mrg    This file is part of the GNU Offloading and Multi Processing Library
    1.1  mrg    (libgomp).
    1.1  mrg
    1.1  mrg    Libgomp is free software; you can redistribute it and/or modify it
    1.1  mrg    under the terms of the GNU General Public License as published by
    1.1  mrg    the Free Software Foundation; either version 3, or (at your option)
    1.1  mrg    any later version.
    1.1  mrg
    1.1  mrg    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
    1.1  mrg    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    1.1  mrg    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
    1.1  mrg    more details.
    1.1  mrg
    1.1  mrg    Under Section 7 of GPL version 3, you are granted additional
    1.1  mrg    permissions described in the GCC Runtime Library Exception, version
    1.1  mrg    3.1, as published by the Free Software Foundation.
    1.1  mrg
    1.1  mrg    You should have received a copy of the GNU General Public License and
    1.1  mrg    a copy of the GCC Runtime Library Exception along with this program;
    1.1  mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    1.1  mrg    <http://www.gnu.org/licenses/>.  */
    1.1  mrg
    1.1  mrg /* This is the NVPTX implementation of doacross spinning.  */
    1.1  mrg
    1.1  mrg #ifndef GOMP_DOACROSS_H
    1.1  mrg #define GOMP_DOACROSS_H 1
    1.1  mrg
    1.1  mrg #include "libgomp.h"
    1.1  mrg
    1.1  mrg static int zero;
    1.1  mrg
    1.1  mrg static inline int
    1.1  mrg cpu_relax (void)
    1.1  mrg {
    1.1  mrg   int r;
    1.1  mrg   /* Here we need a long-latency operation to make the current warp yield.
    1.1  mrg      We could use ld.cv, uncached load from system (host) memory, but that
    1.1  mrg      would require allocating locked memory in the plugin.  Alternatively,
    1.1  mrg      we can use ld.cg, which evicts from L1 and caches in L2.  */
    1.1  mrg   asm volatile ("ld.cg.s32 %0, [%1];" : "=r" (r) : "i" (&zero) : "memory");
    1.1  mrg   return r;
    1.1  mrg }
    1.1  mrg
    1.1  mrg static inline void doacross_spin (unsigned long *addr, unsigned long expected,
    1.1  mrg 				  unsigned long cur)
    1.1  mrg {
    1.1  mrg   /* Prevent compiler from optimizing based on bounds of containing object.  */
    1.1  mrg   asm ("" : "+r" (addr));
    1.1  mrg   do
    1.1  mrg     {
    1.1  mrg       int i = cpu_relax ();
    1.1  mrg       cur = addr[i];
    1.1  mrg     }
    1.1  mrg   while (cur <= expected);
    1.1  mrg }
    1.1  mrg
    1.1  mrg #endif /* GOMP_DOACROSS_H */