/* libuv src/unix/linux.c — Linux event loop backend (epoll + io_uring). */
      1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
      2  * Permission is hereby granted, free of charge, to any person obtaining a copy
      3  * of this software and associated documentation files (the "Software"), to
      4  * deal in the Software without restriction, including without limitation the
      5  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
      6  * sell copies of the Software, and to permit persons to whom the Software is
      7  * furnished to do so, subject to the following conditions:
      8  *
      9  * The above copyright notice and this permission notice shall be included in
     10  * all copies or substantial portions of the Software.
     11  *
     12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     17  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     18  * IN THE SOFTWARE.
     19  */
     20 
     21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
     22  * EPOLL* counterparts.  We use the POLL* variants in this file because that
     23  * is what libuv uses elsewhere.
     24  */
     25 
     26 #include "uv.h"
     27 #include "internal.h"
     28 
     29 #include <inttypes.h>
     30 #include <stdatomic.h>
     31 #include <stddef.h>  /* offsetof */
     32 #include <stdint.h>
     33 #include <stdio.h>
     34 #include <stdlib.h>
     35 #include <string.h>
     36 #include <assert.h>
     37 #include <errno.h>
     38 
     39 #include <fcntl.h>
     40 #include <ifaddrs.h>
     41 #include <net/ethernet.h>
     42 #include <net/if.h>
     43 #include <netpacket/packet.h>
     44 #include <sys/epoll.h>
     45 #include <sys/inotify.h>
     46 #include <sys/mman.h>
     47 #include <sys/param.h>
     48 #include <sys/prctl.h>
     49 #include <sys/socket.h>
     50 #include <sys/stat.h>
     51 #include <sys/syscall.h>
     52 #include <sys/sysinfo.h>
     53 #include <sys/sysmacros.h>
     54 #include <sys/types.h>
     55 #include <sys/utsname.h>
     56 #include <time.h>
     57 #include <unistd.h>
     58 
/* Fallback syscall numbers for libc headers that predate these syscalls.
 * The io_uring syscall numbers are the same on every architecture; the
 * older syscalls were assigned per-architecture numbers.  Architectures
 * without a known number simply leave the macro undefined and the
 * wrappers below fail with ENOSYS.
 */
#ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
#endif

#ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
#endif

#ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
#endif

#ifndef __NR_copy_file_range
# if defined(__x86_64__)
#  define __NR_copy_file_range 326
# elif defined(__i386__)
#  define __NR_copy_file_range 377
# elif defined(__s390__)
#  define __NR_copy_file_range 375
# elif defined(__arm__)
#  define __NR_copy_file_range 391
# elif defined(__aarch64__)
#  define __NR_copy_file_range 285
# elif defined(__powerpc__)
#  define __NR_copy_file_range 379
# elif defined(__arc__)
#  define __NR_copy_file_range 285
# elif defined(__riscv)
#  define __NR_copy_file_range 285
# endif
#endif /* __NR_copy_file_range */

#ifndef __NR_statx
# if defined(__x86_64__)
#  define __NR_statx 332
# elif defined(__i386__)
#  define __NR_statx 383
# elif defined(__aarch64__)
#  define __NR_statx 397
# elif defined(__arm__)
#  define __NR_statx 397
# elif defined(__ppc__)
#  define __NR_statx 383
# elif defined(__s390__)
#  define __NR_statx 379
# elif defined(__riscv)
#  define __NR_statx 291
# endif
#endif /* __NR_statx */

#ifndef __NR_getrandom
# if defined(__x86_64__)
#  define __NR_getrandom 318
# elif defined(__i386__)
#  define __NR_getrandom 355
# elif defined(__aarch64__)
#  define __NR_getrandom 384
# elif defined(__arm__)
#  define __NR_getrandom 384
# elif defined(__ppc__)
#  define __NR_getrandom 359
# elif defined(__s390__)
#  define __NR_getrandom 349
# elif defined(__riscv)
#  define __NR_getrandom 278
# endif
#endif /* __NR_getrandom */
    126 
/* Subset of the io_uring uapi constants used by libuv, redeclared with a
 * UV__ prefix so this file builds against kernel headers that predate
 * io_uring.  Values mirror the IORING_* constants in <linux/io_uring.h>.
 */

/* io_uring_setup(2) flags. */
enum {
  UV__IORING_SETUP_SQPOLL = 2u,
  UV__IORING_SETUP_NO_SQARRAY = 0x10000u,  /* linux v6.6; see uv__iou_init */
};

/* Feature bits reported in params.features by io_uring_setup(2). */
enum {
  UV__IORING_FEAT_SINGLE_MMAP = 1u,
  UV__IORING_FEAT_NODROP = 2u,
  UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
};

/* Submission opcodes (sqe->opcode). */
enum {
  UV__IORING_OP_READV = 1,
  UV__IORING_OP_WRITEV = 2,
  UV__IORING_OP_FSYNC = 3,
  UV__IORING_OP_OPENAT = 18,
  UV__IORING_OP_CLOSE = 19,
  UV__IORING_OP_STATX = 21,
  UV__IORING_OP_EPOLL_CTL = 29,
  UV__IORING_OP_RENAMEAT = 35,
  UV__IORING_OP_UNLINKAT = 36,
  UV__IORING_OP_MKDIRAT = 37,
  UV__IORING_OP_SYMLINKAT = 38,
  UV__IORING_OP_LINKAT = 39,
  UV__IORING_OP_FTRUNCATE = 55,  /* linux v6.9; see uv__iou_fs_ftruncate */
};

/* io_uring_enter(2) flags. */
enum {
  UV__IORING_ENTER_GETEVENTS = 1u,
  UV__IORING_ENTER_SQ_WAKEUP = 2u,
};

/* Submission-queue ring flags (read from the shared sqflags word). */
enum {
  UV__IORING_SQ_NEED_WAKEUP = 1u,
  UV__IORING_SQ_CQ_OVERFLOW = 2u,
};
    163 
/* Ring layout descriptors filled in by the kernel on io_uring_setup(2).
 * Each field is a byte offset into the ring mmap.  The STATIC_ASSERTs
 * pin the sizes so the structs stay ABI-compatible with the kernel's
 * io_cqring_offsets / io_sqring_offsets.
 */
struct uv__io_cqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t overflow;
  uint32_t cqes;          /* offset of the completion-entry array */
  uint64_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));

struct uv__io_sqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t flags;         /* offset of the kernel-updated SQ flags word */
  uint32_t dropped;
  uint32_t array;         /* offset of the slot -> sqe indirection array */
  uint32_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));

/* Completion queue entry; user_data echoes back what was put in the sqe
 * (here: the uv_fs_t pointer), res is the syscall result or -errno.
 */
struct uv__io_uring_cqe {
  uint64_t user_data;
  int32_t res;
  uint32_t flags;
};

STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));
    198 
/* Submission queue entry.  Only the fields libuv actually uses are
 * named; the unions and trailing pad keep every member at the same
 * offset as the kernel's struct io_uring_sqe, verified by the
 * offsetof asserts below.
 */
struct uv__io_uring_sqe {
  uint8_t opcode;          /* UV__IORING_OP_* */
  uint8_t flags;
  uint16_t ioprio;
  int32_t fd;
  union {
    uint64_t off;
    uint64_t addr2;
  };
  union {
    uint64_t addr;
  };
  uint32_t len;
  union {
    uint32_t rw_flags;
    uint32_t fsync_flags;
    uint32_t open_flags;
    uint32_t statx_flags;
  };
  uint64_t user_data;      /* libuv stores the uv_fs_t pointer here */
  union {
    uint16_t buf_index;
    uint64_t pad[3];       /* pads the struct to the kernel's 64 bytes */
  };
};

STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));
    236 
/* In/out parameter block for io_uring_setup(2): libuv sets `flags` and
 * `sq_thread_idle`; the kernel fills in the entry counts, feature bits
 * and the two ring layout descriptors.
 */
struct uv__io_uring_params {
  uint32_t sq_entries;
  uint32_t cq_entries;
  uint32_t flags;
  uint32_t sq_thread_cpu;
  uint32_t sq_thread_idle;
  uint32_t features;
  uint32_t reserved[4];
  struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
  struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
};

STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));

/* NOTE(review): presumably the epoll op is packed into two bits
 * somewhere in the epoll-ctl batching code (uv__epoll_ctl_prep is not
 * in view) — these asserts guarantee that packing stays valid.
 */
STATIC_ASSERT(EPOLL_CTL_ADD < 4);
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
STATIC_ASSERT(EPOLL_CTL_MOD < 4);
    256 
/* One inotify watch: a node in a red-black tree of watches plus the
 * queue of uv_fs_event_t handles interested in `path`.  `iterating` is
 * set while uv__inotify_read() walks `watchers` so concurrent removal
 * can be deferred (see maybe_free_watcher_list).  Ordering key:
 * presumably the watch descriptor `wd` — compare_watchers() is not in
 * view; confirm.
 */
struct watcher_list {
  RB_ENTRY(watcher_list) entry;
  struct uv__queue watchers;
  int iterating;
  char* path;
  int wd;        /* inotify watch descriptor */
};

/* Root of the tree; layout must stay a single pointer, see
 * uv__inotify_watchers() below.
 */
struct watcher_root {
  struct watcher_list* rbh_root;
};
    268 
    269 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
    270 static void uv__inotify_read(uv_loop_t* loop,
    271                              uv__io_t* w,
    272                              unsigned int revents);
    273 static int compare_watchers(const struct watcher_list* a,
    274                             const struct watcher_list* b);
    275 static void maybe_free_watcher_list(struct watcher_list* w,
    276                                     uv_loop_t* loop);
    277 
    278 static void uv__epoll_ctl_flush(int epollfd,
    279                                 struct uv__iou* ctl,
    280                                 struct epoll_event (*events)[256]);
    281 
    282 static void uv__epoll_ctl_prep(int epollfd,
    283                                struct uv__iou* ctl,
    284                                struct epoll_event (*events)[256],
    285                                int op,
    286                                int fd,
    287                                struct epoll_event* e);
    288 
    289 RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
    290 
    291 
/* Reinterpret loop->inotify_watchers (a void*) as the RB tree root. */
static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
  /* This cast works because watcher_root is a struct with a pointer as its
   * sole member. Such type punning is unsafe in the presence of strict
   * pointer aliasing (and is just plain nasty) but that is why libuv
   * is compiled with -fno-strict-aliasing.
   */
  return (struct watcher_root*) &loop->inotify_watchers;
}
    300 
    301 
/* Return the running kernel's version packed as
 * major * 65536 + minor * 256 + patch (e.g. 5.15.90 -> 0x050F5A), or 0
 * when it cannot be determined.  The result is cached in a relaxed
 * atomic; a race between first callers is harmless because every
 * thread computes the same value.
 */
unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;
  char v_sig[256];
  char* needle;

  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  /* Check /proc/version_signature first as it's the way to get the mainline
   * kernel version in Ubuntu. The format is:
   *   Ubuntu ubuntu_kernel_version mainline_kernel_version
   * For example:
   *   Ubuntu 5.15.0-79.86-generic 5.15.111
   */
  if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
    if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (-1 == uname(&u))
    return 0;

  /* In Debian we need to check `version` instead of `release` to extract the
   * mainline kernel version. This is an example of how it looks like:
   *  #1 SMP Debian 5.10.46-4 (2021-08-03)
   */
  needle = strstr(u.version, "Debian ");
  if (needle != NULL)
    if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  /* Generic fallback: parse the plain release string, e.g. "6.1.0-13". */
  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* Handle it when the process runs under the UNAME26 personality:
   *
   * - kernels >= 3.x identify as 2.6.40+x
   * - kernels >= 4.x identify as 2.6.60+x
   *
   * UNAME26 is a poorly conceived hack that doesn't let us distinguish
   * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
   * that 2.6.60+x means 4.x.
   *
   * Fun fact of the day: it's technically possible to observe the actual
   * kernel version for a brief moment because uname() first copies out the
   * real release string before overwriting it with the backcompat string.
   */
  if (major == 2 && minor == 6) {
    if (patch >= 60) {
      major = 4;
      minor = patch - 60;
      patch = 0;
    } else if (patch >= 40) {
      major = 3;
      minor = patch - 40;
      patch = 0;
    }
  }

calculate_version:
  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}
    372 
    373 
/* Raw copy_file_range(2) via syscall(2), so libuv does not depend on
 * the libc exposing a wrapper.  Returns bytes copied, or -1 with errno
 * set; ENOSYS when no syscall number is known for this architecture.
 */
ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
#ifdef __NR_copy_file_range
  return syscall(__NR_copy_file_range,
                 fd_in,
                 off_in,
                 fd_out,
                 off_out,
                 len,
                 flags);
#else
  return errno = ENOSYS, -1;
#endif
}
    394 
    395 
/* Raw statx(2) via syscall(2).  Compiled out (ENOSYS) when the syscall
 * number is unknown or on Android API < 30, where the call is not
 * usable (presumably blocked by seccomp — same rationale as
 * uv__use_io_uring below; confirm).  On success the buffer is
 * explicitly unpoisoned for MemorySanitizer, which cannot see kernel
 * writes.
 */
int uv__statx(int dirfd,
              const char* path,
              int flags,
              unsigned int mask,
              struct uv__statx* statxbuf) {
#if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
  return errno = ENOSYS, -1;
#else
  int rc;

  rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
  if (rc >= 0)
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));

  return rc;
#endif
}
    413 
    414 
/* Raw getrandom(2) via syscall(2); ENOSYS when unavailable (unknown
 * syscall number, or Android API < 28).  Unpoisons the output buffer
 * for MemorySanitizer since the bytes are written by the kernel.
 */
ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
#if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
  return errno = ENOSYS, -1;
#else
  ssize_t rc;

  rc = syscall(__NR_getrandom, buf, buflen, flags);
  if (rc >= 0)
    uv__msan_unpoison(buf, buflen);

  return rc;
#endif
}
    428 
    429 
/* io_uring_setup(2): returns a ring fd (O_CLOEXEC) or -1 with errno. */
int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  return syscall(__NR_io_uring_setup, entries, params);
}
    433 
    434 
/* io_uring_enter(2): submit and/or wait for completions on the ring. */
int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  /* io_uring_enter used to take a sigset_t but it's unused
   * in newer kernels unless IORING_ENTER_EXT_ARG is set,
   * in which case it takes a struct io_uring_getevents_arg.
   */
  return syscall(__NR_io_uring_enter,
                 fd,
                 to_submit,
                 min_complete,
                 flags,
                 NULL,
                 0L);
}
    451 
    452 
/* io_uring_register(2): register/unregister ring resources. */
int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
}
    456 
    457 
    458 static int uv__use_io_uring(uint32_t flags) {
    459 #if defined(__ANDROID_API__)
    460   return 0;  /* Possibly available but blocked by seccomp. */
    461 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
    462   /* See https://github.com/libuv/libuv/issues/4158. */
    463   return 0;  /* All 32 bits kernels appear buggy. */
    464 #elif defined(__powerpc64__) || defined(__ppc64__)
    465   /* See https://github.com/libuv/libuv/issues/4283. */
    466   return 0; /* Random SIGSEGV in signal handler. */
    467 #else
    468   /* Ternary: unknown=0, yes=1, no=-1 */
    469   static _Atomic int use_io_uring;
    470   char* val;
    471   int use;
    472 
    473 #if defined(__hppa__)
    474   /* io_uring first supported on parisc in 6.1, functional in .51
    475    * https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/
    476    */
    477   if (uv__kernel_version() < /*6.1.51*/0x060133)
    478     return 0;
    479 #endif
    480 
    481   /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
    482   if (0 == (flags & UV__IORING_SETUP_SQPOLL))
    483     return 1;
    484 
    485   /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
    486   if (uv__kernel_version() < /*5.10.186*/0x050ABA)
    487     return 0;
    488 
    489   use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
    490 
    491   if (use == 0) {
    492     val = getenv("UV_USE_IO_URING");
    493     use = val != NULL && atoi(val) > 0 ? 1 : -1;
    494     atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
    495   }
    496 
    497   return use > 0;
    498 #endif
    499 }
    500 
    501 
/* Create an io_uring and fill in `*iou`.  For SQPOLL rings the ring fd
 * is also registered with `epollfd` so completions wake up the event
 * loop.  On any failure the function returns without touching `*iou`,
 * so `iou->ringfd` keeps whatever sentinel the caller put there.
 */
static void uv__iou_init(int epollfd,
                         struct uv__iou* iou,
                         uint32_t entries,
                         uint32_t flags) {
  struct uv__io_uring_params params;
  struct epoll_event e;
  size_t cqlen;
  size_t sqlen;
  size_t maxlen;
  size_t sqelen;
  unsigned kernel_version;
  uint32_t* sqarray;
  uint32_t i;
  char* sq;
  char* sqe;
  int ringfd;
  int no_sqarray;

  sq = MAP_FAILED;
  sqe = MAP_FAILED;

  if (!uv__use_io_uring(flags))
    return;

  /* Kernels >= 6.6 can drop the slot -> sqe indirection array entirely. */
  kernel_version = uv__kernel_version();
  no_sqarray =
      UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);

  /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
   * Mostly academic because we check for a v5.13 kernel afterwards anyway.
   */
  memset(&params, 0, sizeof(params));
  params.flags = flags | no_sqarray;

  if (flags & UV__IORING_SETUP_SQPOLL)
    params.sq_thread_idle = 10;  /* milliseconds */

  /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
  ringfd = uv__io_uring_setup(entries, &params);
  if (ringfd == -1)
    return;

  /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
   * actually detecting is whether IORING_OP_STATX works with SQPOLL.
   */
  if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_NODROP))
    goto fail;

  /* With SINGLE_MMAP the SQ and CQ rings share one mapping of
   * max(sqlen, cqlen) bytes; the sqe array is a separate mapping.
   */
  sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
  cqlen =
      params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
  maxlen = sqlen < cqlen ? cqlen : sqlen;
  sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);

  sq = mmap(0,
            maxlen,
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE,
            ringfd,
            0);  /* IORING_OFF_SQ_RING */

  sqe = mmap(0,
             sqelen,
             PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_POPULATE,
             ringfd,
             0x10000000ull);  /* IORING_OFF_SQES */

  if (sq == MAP_FAILED || sqe == MAP_FAILED)
    goto fail;

  if (flags & UV__IORING_SETUP_SQPOLL) {
    /* Only interested in completion events. To get notified when
     * the kernel pulls items from the submission ring, add POLLOUT.
     */
    memset(&e, 0, sizeof(e));
    e.events = POLLIN;
    e.data.fd = ringfd;

    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
      goto fail;
  }

  /* Cache ring pointers.  The masks are constants so they are read
   * once here rather than on every submission.
   */
  iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
  iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
  iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
  iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
  iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
  iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
  iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
  iou->sq = sq;
  iou->cqe = sq + params.cq_off.cqes;
  iou->sqe = sqe;
  iou->sqlen = sqlen;
  iou->cqlen = cqlen;
  iou->maxlen = maxlen;
  iou->sqelen = sqelen;
  iou->ringfd = ringfd;
  iou->in_flight = 0;

  if (no_sqarray)
    return;

  sqarray = (uint32_t*) (sq + params.sq_off.array);
  for (i = 0; i <= iou->sqmask; i++)
    sqarray[i] = i;  /* Slot -> sqe identity mapping. */

  return;

fail:
  if (sq != MAP_FAILED)
    munmap(sq, maxlen);

  if (sqe != MAP_FAILED)
    munmap(sqe, sqelen);

  uv__close(ringfd);
}
    628 
    629 
    630 static void uv__iou_delete(struct uv__iou* iou) {
    631   if (iou->ringfd > -1) {
    632     munmap(iou->sq, iou->maxlen);
    633     munmap(iou->sqe, iou->sqelen);
    634     uv__close(iou->ringfd);
    635     iou->ringfd = -1;
    636   }
    637 }
    638 
    639 
    640 int uv__platform_loop_init(uv_loop_t* loop) {
    641   uv__loop_internal_fields_t* lfields;
    642 
    643   lfields = uv__get_internal_fields(loop);
    644   lfields->ctl.ringfd = -1;
    645   lfields->iou.ringfd = -2;  /* "uninitialized" */
    646 
    647   loop->inotify_watchers = NULL;
    648   loop->inotify_fd = -1;
    649   loop->backend_fd = epoll_create1(O_CLOEXEC);
    650 
    651   if (loop->backend_fd == -1)
    652     return UV__ERR(errno);
    653 
    654   uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
    655 
    656   return 0;
    657 }
    658 
    659 
    660 int uv__io_fork(uv_loop_t* loop) {
    661   int err;
    662   struct watcher_list* root;
    663 
    664   root = uv__inotify_watchers(loop)->rbh_root;
    665 
    666   uv__close(loop->backend_fd);
    667   loop->backend_fd = -1;
    668 
    669   /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
    670   uv__platform_loop_delete(loop);
    671 
    672   err = uv__platform_loop_init(loop);
    673   if (err)
    674     return err;
    675 
    676   return uv__inotify_fork(loop, root);
    677 }
    678 
    679 
    680 void uv__platform_loop_delete(uv_loop_t* loop) {
    681   uv__loop_internal_fields_t* lfields;
    682 
    683   lfields = uv__get_internal_fields(loop);
    684   uv__iou_delete(&lfields->ctl);
    685   uv__iou_delete(&lfields->iou);
    686 
    687   if (loop->inotify_fd != -1) {
    688     uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
    689     uv__close(loop->inotify_fd);
    690     loop->inotify_fd = -1;
    691   }
    692 }
    693 
    694 
/* Snapshot of the epoll events currently being dispatched, stashed in
 * the loop's internal fields so uv__platform_invalidate_fd() can scrub
 * entries for fds that are closed mid-dispatch.  `prep` feeds
 * uv__epoll_ctl_flush (declared above; definition not in view).
 */
struct uv__invalidate {
  struct epoll_event (*prep)[256];
  struct epoll_event* events;
  int nfds;
};
    700 
    701 
    702 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
    703   uv__loop_internal_fields_t* lfields;
    704   struct uv__invalidate* inv;
    705   struct epoll_event dummy;
    706   int i;
    707 
    708   lfields = uv__get_internal_fields(loop);
    709   inv = lfields->inv;
    710 
    711   /* Invalidate events with same file descriptor */
    712   if (inv != NULL)
    713     for (i = 0; i < inv->nfds; i++)
    714       if (inv->events[i].data.fd == fd)
    715         inv->events[i].data.fd = -1;
    716 
    717   /* Remove the file descriptor from the epoll.
    718    * This avoids a problem where the same file description remains open
    719    * in another process, causing repeated junk epoll events.
    720    *
    721    * Perform EPOLL_CTL_DEL immediately instead of going through
    722    * io_uring's submit queue, otherwise the file descriptor may
    723    * be closed by the time the kernel starts the operation.
    724    *
    725    * We pass in a dummy epoll_event, to work around a bug in old kernels.
    726    *
    727    * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
    728    * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
    729    */
    730   memset(&dummy, 0, sizeof(dummy));
    731   epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
    732 }
    733 
    734 
    735 int uv__io_check_fd(uv_loop_t* loop, int fd) {
    736   struct epoll_event e;
    737   int rc;
    738 
    739   memset(&e, 0, sizeof(e));
    740   e.events = POLLIN;
    741   e.data.fd = -1;
    742 
    743   rc = 0;
    744   if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    745     if (errno != EEXIST)
    746       rc = UV__ERR(errno);
    747 
    748   if (rc == 0)
    749     if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
    750       abort();
    751 
    752   return rc;
    753 }
    754 
    755 
/* Caller must initialize SQE and call uv__iou_submit().
 *
 * Reserve and zero the next submission-queue entry, registering `req`
 * with the loop as an active request.  Returns NULL when io_uring is
 * unavailable or the SQ ring is full; callers treat NULL as "fall back
 * to the threadpool" (see the uv__iou_fs_* functions below).
 */
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
                                                uv_loop_t* loop,
                                                uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t slot;

  /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
   * initialization failed. Anything else is a valid ring file descriptor.
   */
  if (iou->ringfd == -2) {
    /* By default, the SQPOLL is not created. Enable only if the loop is
     * configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
     * environment variable is unset or a positive number.
     */
    if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
      if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
        uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);

    if (iou->ringfd == -2)
      iou->ringfd = -1;  /* "failed" */
  }

  if (iou->ringfd == -1)
    return NULL;

  /* Acquire pairs with the kernel's update of the head; the tail is
   * only ever written by us so a plain load suffices.
   */
  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
                              memory_order_acquire);
  tail = *iou->sqtail;
  mask = iou->sqmask;

  if ((head & mask) == ((tail + 1) & mask))
    return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */

  slot = tail & mask;
  sqe = iou->sqe;
  sqe = &sqe[slot];
  memset(sqe, 0, sizeof(*sqe));
  sqe->user_data = (uintptr_t) req;  /* Echoed back in the cqe. */

  /* Pacify uv_cancel(). */
  req->work_req.loop = loop;
  req->work_req.work = NULL;
  req->work_req.done = NULL;
  uv__queue_init(&req->work_req.wq);

  uv__req_register(loop);
  iou->in_flight++;

  return sqe;
}
    811 
/* Publish the sqe prepared at the current tail: the release store makes
 * the sqe contents visible before the kernel observes the new tail.
 * For SQPOLL rings, kick the kernel's poller thread if it has gone
 * idle and flagged that it needs a wakeup.
 */
static void uv__iou_submit(struct uv__iou* iou) {
  uint32_t flags;

  atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
                        *iou->sqtail + 1,
                        memory_order_release);

  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_NEED_WAKEUP)
    if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
      if (errno != EOWNERDEAD)  /* Kernel bug. Harmless, ignore. */
        perror("libuv: io_uring_enter(wakeup)");  /* Can't happen. */
}
    827 
    828 
    829 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
    830   struct uv__io_uring_sqe* sqe;
    831   struct uv__iou* iou;
    832   int kv;
    833 
    834   kv = uv__kernel_version();
    835   /* Work around a poorly understood bug in older kernels where closing a file
    836    * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
    837    * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
    838    * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
    839    * but good candidates are the several data race fixes. Interestingly, it
    840    * seems to manifest only when running under Docker so the possibility of
    841    * a Docker bug can't be completely ruled out either. Yay, computers.
    842    * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
    843    * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
    844    * solved.
    845    */
    846   if (kv < /* 5.15.90 */ 0x050F5A)
    847     return 0;
    848 
    849   if (kv >= /* 5.16.0 */ 0x050A00 && kv < /* 6.1.0 */ 0x060100)
    850     return 0;
    851 
    852 
    853   iou = &uv__get_internal_fields(loop)->iou;
    854 
    855   sqe = uv__iou_get_sqe(iou, loop, req);
    856   if (sqe == NULL)
    857     return 0;
    858 
    859   sqe->fd = req->file;
    860   sqe->opcode = UV__IORING_OP_CLOSE;
    861 
    862   uv__iou_submit(iou);
    863 
    864   return 1;
    865 }
    866 
    867 
    868 int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
    869   struct uv__io_uring_sqe* sqe;
    870   struct uv__iou* iou;
    871 
    872   if (uv__kernel_version() < /* 6.9 */0x060900)
    873     return 0;
    874 
    875   iou = &uv__get_internal_fields(loop)->iou;
    876   sqe = uv__iou_get_sqe(iou, loop, req);
    877   if (sqe == NULL)
    878     return 0;
    879 
    880   sqe->fd = req->file;
    881   sqe->len = req->off;
    882   sqe->opcode = UV__IORING_OP_FTRUNCATE;
    883   uv__iou_submit(iou);
    884 
    885   return 1;
    886 }
    887 
/* Submit an IORING_OP_FSYNC for uv_fs_fsync()/uv_fs_fdatasync();
 * `fsync_flags` selects fdatasync semantics.  Returns 1 when queued,
 * 0 to fall back to the threadpool.
 */
int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
                                  uv_fs_t* req,
                                  uint32_t fsync_flags) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  /* Little known fact: setting sqe->off and sqe->len turns
   * it into an asynchronous sync_file_range() operation.
   */
  sqe->fd = req->file;
  sqe->fsync_flags = fsync_flags;
  sqe->opcode = UV__IORING_OP_FSYNC;

  uv__iou_submit(iou);

  return 1;
}
    911 
    912 
    913 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
    914   struct uv__io_uring_sqe* sqe;
    915   struct uv__iou* iou;
    916 
    917   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    918     return 0;
    919 
    920   iou = &uv__get_internal_fields(loop)->iou;
    921   sqe = uv__iou_get_sqe(iou, loop, req);
    922   if (sqe == NULL)
    923     return 0;
    924 
    925   sqe->addr = (uintptr_t) req->path;
    926   sqe->fd = AT_FDCWD;
    927   sqe->addr2 = (uintptr_t) req->new_path;
    928   sqe->len = AT_FDCWD;
    929   sqe->opcode = UV__IORING_OP_LINKAT;
    930 
    931   uv__iou_submit(iou);
    932 
    933   return 1;
    934 }
    935 
    936 
    937 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
    938   struct uv__io_uring_sqe* sqe;
    939   struct uv__iou* iou;
    940 
    941   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    942     return 0;
    943 
    944   iou = &uv__get_internal_fields(loop)->iou;
    945   sqe = uv__iou_get_sqe(iou, loop, req);
    946   if (sqe == NULL)
    947     return 0;
    948 
    949   sqe->addr = (uintptr_t) req->path;
    950   sqe->fd = AT_FDCWD;
    951   sqe->len = req->mode;
    952   sqe->opcode = UV__IORING_OP_MKDIRAT;
    953 
    954   uv__iou_submit(iou);
    955 
    956   return 1;
    957 }
    958 
    959 
    960 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
    961   struct uv__io_uring_sqe* sqe;
    962   struct uv__iou* iou;
    963 
    964   iou = &uv__get_internal_fields(loop)->iou;
    965 
    966   sqe = uv__iou_get_sqe(iou, loop, req);
    967   if (sqe == NULL)
    968     return 0;
    969 
    970   sqe->addr = (uintptr_t) req->path;
    971   sqe->fd = AT_FDCWD;
    972   sqe->len = req->mode;
    973   sqe->opcode = UV__IORING_OP_OPENAT;
    974   sqe->open_flags = req->flags | O_CLOEXEC;
    975 
    976   uv__iou_submit(iou);
    977 
    978   return 1;
    979 }
    980 
    981 
    982 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
    983   struct uv__io_uring_sqe* sqe;
    984   struct uv__iou* iou;
    985 
    986   iou = &uv__get_internal_fields(loop)->iou;
    987 
    988   sqe = uv__iou_get_sqe(iou, loop, req);
    989   if (sqe == NULL)
    990     return 0;
    991 
    992   sqe->addr = (uintptr_t) req->path;
    993   sqe->fd = AT_FDCWD;
    994   sqe->addr2 = (uintptr_t) req->new_path;
    995   sqe->len = AT_FDCWD;
    996   sqe->opcode = UV__IORING_OP_RENAMEAT;
    997 
    998   uv__iou_submit(iou);
    999 
   1000   return 1;
   1001 }
   1002 
   1003 
   1004 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
   1005   struct uv__io_uring_sqe* sqe;
   1006   struct uv__iou* iou;
   1007 
   1008   if (uv__kernel_version() < /* 5.15.0 */0x050F00)
   1009     return 0;
   1010 
   1011   iou = &uv__get_internal_fields(loop)->iou;
   1012   sqe = uv__iou_get_sqe(iou, loop, req);
   1013   if (sqe == NULL)
   1014     return 0;
   1015 
   1016   sqe->addr = (uintptr_t) req->path;
   1017   sqe->fd = AT_FDCWD;
   1018   sqe->addr2 = (uintptr_t) req->new_path;
   1019   sqe->opcode = UV__IORING_OP_SYMLINKAT;
   1020 
   1021   uv__iou_submit(iou);
   1022 
   1023   return 1;
   1024 }
   1025 
   1026 
   1027 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
   1028   struct uv__io_uring_sqe* sqe;
   1029   struct uv__iou* iou;
   1030 
   1031   iou = &uv__get_internal_fields(loop)->iou;
   1032 
   1033   sqe = uv__iou_get_sqe(iou, loop, req);
   1034   if (sqe == NULL)
   1035     return 0;
   1036 
   1037   sqe->addr = (uintptr_t) req->path;
   1038   sqe->fd = AT_FDCWD;
   1039   sqe->opcode = UV__IORING_OP_UNLINKAT;
   1040 
   1041   uv__iou_submit(iou);
   1042 
   1043   return 1;
   1044 }
   1045 
   1046 
   1047 int uv__iou_fs_read_or_write(uv_loop_t* loop,
   1048                              uv_fs_t* req,
   1049                              int is_read) {
   1050   struct uv__io_uring_sqe* sqe;
   1051   struct uv__iou* iou;
   1052 
   1053   /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fallback
   1054    * to the threadpool on writes */
   1055   if (req->nbufs > IOV_MAX) {
   1056     if (is_read)
   1057       req->nbufs = IOV_MAX;
   1058     else
   1059       return 0;
   1060   }
   1061 
   1062   iou = &uv__get_internal_fields(loop)->iou;
   1063 
   1064   sqe = uv__iou_get_sqe(iou, loop, req);
   1065   if (sqe == NULL)
   1066     return 0;
   1067 
   1068   sqe->addr = (uintptr_t) req->bufs;
   1069   sqe->fd = req->file;
   1070   sqe->len = req->nbufs;
   1071   sqe->off = req->off < 0 ? -1 : req->off;
   1072   sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
   1073 
   1074   uv__iou_submit(iou);
   1075 
   1076   return 1;
   1077 }
   1078 
   1079 
   1080 int uv__iou_fs_statx(uv_loop_t* loop,
   1081                      uv_fs_t* req,
   1082                      int is_fstat,
   1083                      int is_lstat) {
   1084   struct uv__io_uring_sqe* sqe;
   1085   struct uv__statx* statxbuf;
   1086   struct uv__iou* iou;
   1087 
   1088   statxbuf = uv__malloc(sizeof(*statxbuf));
   1089   if (statxbuf == NULL)
   1090     return 0;
   1091 
   1092   iou = &uv__get_internal_fields(loop)->iou;
   1093 
   1094   sqe = uv__iou_get_sqe(iou, loop, req);
   1095   if (sqe == NULL) {
   1096     uv__free(statxbuf);
   1097     return 0;
   1098   }
   1099 
   1100   req->ptr = statxbuf;
   1101 
   1102   sqe->addr = (uintptr_t) req->path;
   1103   sqe->addr2 = (uintptr_t) statxbuf;
   1104   sqe->fd = AT_FDCWD;
   1105   sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
   1106   sqe->opcode = UV__IORING_OP_STATX;
   1107 
   1108   if (is_fstat) {
   1109     sqe->addr = (uintptr_t) "";
   1110     sqe->fd = req->file;
   1111     sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
   1112   }
   1113 
   1114   if (is_lstat)
   1115     sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
   1116 
   1117   uv__iou_submit(iou);
   1118 
   1119   return 1;
   1120 }
   1121 
   1122 
   1123 void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
   1124   buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
   1125   buf->st_mode = statxbuf->stx_mode;
   1126   buf->st_nlink = statxbuf->stx_nlink;
   1127   buf->st_uid = statxbuf->stx_uid;
   1128   buf->st_gid = statxbuf->stx_gid;
   1129   buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
   1130   buf->st_ino = statxbuf->stx_ino;
   1131   buf->st_size = statxbuf->stx_size;
   1132   buf->st_blksize = statxbuf->stx_blksize;
   1133   buf->st_blocks = statxbuf->stx_blocks;
   1134   buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
   1135   buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
   1136   buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
   1137   buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
   1138   buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
   1139   buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
   1140   buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
   1141   buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
   1142   buf->st_flags = 0;
   1143   buf->st_gen = 0;
   1144 }
   1145 
   1146 
   1147 static void uv__iou_fs_statx_post(uv_fs_t* req) {
   1148   struct uv__statx* statxbuf;
   1149   uv_stat_t* buf;
   1150 
   1151   buf = &req->statbuf;
   1152   statxbuf = req->ptr;
   1153   req->ptr = NULL;
   1154 
   1155   if (req->result == 0) {
   1156     uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
   1157     uv__statx_to_stat(statxbuf, buf);
   1158     req->ptr = buf;
   1159   }
   1160 
   1161   uv__free(statxbuf);
   1162 }
   1163 
   1164 
/* Drain the io_uring completion queue and run the callback of every finished
 * uv_fs_t request. Single-consumer: only the loop thread reads the CQ ring,
 * hence the plain read of cqhead paired with acquire/release on cqtail/cqhead
 * shared with the kernel.
 */
static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
  struct uv__io_uring_cqe* cqe;
  struct uv__io_uring_cqe* e;
  uv_fs_t* req;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t i;
  uint32_t flags;
  int nevents;
  int rc;

  /* Acquire-load the tail so all CQE writes by the kernel are visible. */
  head = *iou->cqhead;
  tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
                              memory_order_acquire);
  mask = iou->cqmask;
  cqe = iou->cqe;
  nevents = 0;

  for (i = head; i != tail; i++) {
    e = &cqe[i & mask];  /* Indices wrap; mask selects the ring slot. */

    /* user_data was set to the request pointer at submission time. */
    req = (uv_fs_t*) (uintptr_t) e->user_data;
    assert(req->type == UV_FS);

    uv__req_unregister(loop);
    iou->in_flight--;

    /* If the op is not supported by the kernel retry using the thread pool */
    if (e->res == -EOPNOTSUPP) {
      uv__fs_post(loop, req);
      continue;
    }

    /* io_uring stores error codes as negative numbers, same as libuv. */
    req->result = e->res;

    switch (req->fs_type) {
      case UV_FS_FSTAT:
      case UV_FS_LSTAT:
      case UV_FS_STAT:
        /* Convert and free the statx buffer allocated at submission. */
        uv__iou_fs_statx_post(req);
        break;
      default:  /* Squelch -Wswitch warnings. */
        break;
    }

    uv__metrics_update_idle_time(loop);
    req->cb(req);
    nevents++;
  }

  /* Release-store the new head so the kernel may reuse the consumed slots. */
  atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
                        tail,
                        memory_order_release);

  /* Check whether CQE's overflowed, if so enter the kernel to make them
   * available. Don't grab them immediately but in the next loop iteration to
   * avoid loop starvation. */
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
    do
      rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
    while (rc == -1 && errno == EINTR);

    if (rc < 0)
      perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
  }

  uv__metrics_inc_events(loop, nevents);
  if (uv__get_internal_fields(loop)->current_timeout == 0)
    uv__metrics_inc_events_waiting(loop, nevents);
}
   1240 
   1241 
/* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
 * executed immediately, otherwise the file descriptor may have been closed
 * by the time the kernel starts the operation.
 *
 * Queues one epoll_ctl() operation as an IORING_OP_EPOLL_CTL SQE on the
 * ctl ring; the epoll_event is copied into (*events)[slot] so it stays
 * alive until the kernel consumes it. Flushes the ring when it is full.
 */
static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e) {
  struct uv__io_uring_sqe* sqe;
  struct epoll_event* pe;
  uint32_t mask;
  uint32_t slot;

  assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
  assert(ctl->ringfd != -1);

  /* Claim the next SQ slot; indices wrap, mask selects the ring entry. */
  mask = ctl->sqmask;
  slot = (*ctl->sqtail)++ & mask;

  /* Stash a copy of the event in stable storage for the kernel to read. */
  pe = &(*events)[slot];
  *pe = *e;

  sqe = ctl->sqe;
  sqe = &sqe[slot];

  memset(sqe, 0, sizeof(*sqe));
  sqe->addr = (uintptr_t) pe;
  sqe->fd = epollfd;
  sqe->len = op;
  sqe->off = fd;
  sqe->opcode = UV__IORING_OP_EPOLL_CTL;
  /* Pack op (bits 0-1), slot (bits 2-9) and fd (upper 32 bits) so
   * uv__epoll_ctl_flush() can reconstruct failed submissions from the CQE.
   */
  sqe->user_data = op | slot << 2 | (int64_t) fd << 32;

  /* Ring full (head == tail modulo size): submit everything now. */
  if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
    uv__epoll_ctl_flush(epollfd, ctl, events);
}
   1280 
   1281 
/* Submit all pending epoll_ctl SQEs on the ctl ring and wait for their
 * completions, retrying EPOLL_CTL_ADD failures on already-watched fds as
 * EPOLL_CTL_MOD. May recurse indirectly through uv__epoll_ctl_prep().
 */
static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]) {
  struct epoll_event oldevents[256];
  struct uv__io_uring_cqe* cqe;
  uint32_t oldslot;
  uint32_t slot;
  uint32_t n;
  int fd;
  int op;
  int rc;

  STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
  assert(ctl->ringfd != -1);
  assert(*ctl->sqhead != *ctl->sqtail);

  /* Submit every queued SQE and block until all completions arrive. */
  n = *ctl->sqtail - *ctl->sqhead;
  do
    rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
  while (rc == -1 && errno == EINTR);

  if (rc < 0)
    perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */

  if (rc != (int) n)
    abort();

  assert(*ctl->sqhead == *ctl->sqtail);

  /* Retried submissions below reuse *events; keep a snapshot so the
   * original epoll_event payloads remain addressable by old slot index.
   */
  memcpy(oldevents, *events, sizeof(*events));

  /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
   * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
   * that we are already watching. Ignore the former and retry the latter
   * with EPOLL_CTL_MOD.
   */
  while (*ctl->cqhead != *ctl->cqtail) {
    slot = (*ctl->cqhead)++ & ctl->cqmask;

    cqe = ctl->cqe;
    cqe = &cqe[slot];

    if (cqe->res == 0)
      continue;

    /* Unpack the fields encoded into user_data by uv__epoll_ctl_prep(). */
    fd = cqe->user_data >> 32;
    op = 3 & cqe->user_data;
    oldslot = 255 & (cqe->user_data >> 2);

    if (op == EPOLL_CTL_DEL)
      continue;

    if (op != EPOLL_CTL_ADD)
      abort();

    if (cqe->res != -EEXIST)
      abort();

    uv__epoll_ctl_prep(epollfd,
                       ctl,
                       events,
                       EPOLL_CTL_MOD,
                       fd,
                       &oldevents[oldslot]);
  }
}
   1348 
   1349 
/* Core Linux poll step of the event loop: flush pending watcher changes to
 * epoll (directly or batched through the ctl io_uring), block in
 * epoll_pwait() for at most `timeout` ms (-1 = indefinitely, 0 = poll),
 * then dispatch fd events, io_uring completions and signal events.
 */
void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct epoll_event events[1024];
  struct epoll_event prep[256];
  struct uv__invalidate inv;
  struct epoll_event* pe;
  struct epoll_event e;
  struct uv__iou* ctl;
  struct uv__iou* iou;
  int real_timeout;
  struct uv__queue* q;
  uv__io_t* w;
  sigset_t* sigmask;
  sigset_t sigset;
  uint64_t base;
  int have_iou_events;
  int have_signals;
  int nevents;
  int epollfd;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  lfields = uv__get_internal_fields(loop);
  ctl = &lfields->ctl;
  iou = &lfields->iou;

  /* Block SIGPROF while polling when requested; see UV_LOOP_BLOCK_SIGPROF. */
  sigmask = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask = &sigset;
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  /* With idle-time metrics enabled, first do a non-blocking poll so the
   * time spent waiting can be attributed correctly; the user's timeout is
   * restored after the first pass (reset_timeout).
   */
  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  epollfd = loop->backend_fd;

  memset(&e, 0, sizeof(e));

  /* Apply all queued watcher event-mask changes to the epoll set. */
  while (!uv__queue_empty(&loop->watcher_queue)) {
    q = uv__queue_head(&loop->watcher_queue);
    w = uv__queue_data(q, uv__io_t, watcher_queue);
    uv__queue_remove(q);
    uv__queue_init(q);

    op = EPOLL_CTL_MOD;
    if (w->events == 0)
      op = EPOLL_CTL_ADD;  /* Not registered with epoll yet. */

    w->events = w->pevents;
    e.events = w->pevents;
    e.data.fd = w->fd;
    fd = w->fd;

    /* Batch through the ctl io_uring when available. */
    if (ctl->ringfd != -1) {
      uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
      continue;
    }

    if (!epoll_ctl(epollfd, op, fd, &e))
      continue;

    assert(op == EPOLL_CTL_ADD);
    assert(errno == EEXIST);

    /* File descriptor that's been watched before, update event mask. */
    if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
      abort();
  }

  /* Exposed via lfields->inv so uv__platform_invalidate_fd() can blank out
   * events for fds closed from inside a callback.
   */
  inv.events = events;
  inv.prep = &prep;
  inv.nfds = -1;

  for (;;) {
    /* Nothing to wait for: no fd watchers and no in-flight io_uring SQEs. */
    if (loop->nfds == 0)
      if (iou->in_flight == 0)
        break;

    /* All event mask mutations should be visible to the kernel before
     * we enter epoll_pwait().
     */
    if (ctl->ringfd != -1)
      while (*ctl->sqhead != *ctl->sqtail)
        uv__epoll_ctl_flush(epollfd, ctl, &prep);

    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* Store the current timeout in a location that's globally accessible so
     * other locations like uv__work_done() can determine whether the queue
     * of events in the callback were waiting when poll was called.
     */
    lfields->current_timeout = timeout;

    nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (nfds == 0 || nfds == -1) {
      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;  /* Genuine timeout expiry. */
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_iou_events = 0;
    have_signals = 0;
    nevents = 0;

    inv.nfds = nfds;
    lfields->inv = &inv;

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      /* Readiness on the io_uring fd means FS request completions. */
      if (fd == iou->ringfd) {
        uv__poll_io_uring(loop, iou);
        have_iou_events = 1;
        continue;
      }

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         *
         * Perform EPOLL_CTL_DEL immediately instead of going through
         * io_uring's submit queue, otherwise the file descriptor may
         * be closed by the time the kernel starts the operation.
         */
        epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when previous callback invocation in this loop has stopped
       * the current watcher. Also, filters out events that users has not
       * requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event.  In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again.  By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read.  If anything, libuv is to blame here.  The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event.  We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last.  This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    uv__metrics_inc_events(loop, nevents);
    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
      uv__metrics_inc_events_waiting(loop, nevents);
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    lfields->inv = NULL;

    if (have_iou_events != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (have_signals != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      break;
    }

update_timeout:
    if (timeout == 0)
      break;

    if (timeout == -1)
      continue;

    assert(timeout > 0);

    /* Reduce the remaining budget by the time already spent waiting. */
    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      break;

    timeout = real_timeout;
  }

  /* Flush any epoll_ctl SQEs queued by callbacks before returning. */
  if (ctl->ringfd != -1)
    while (*ctl->sqhead != *ctl->sqtail)
      uv__epoll_ctl_flush(epollfd, ctl, &prep);
}
   1621 
   1622 uint64_t uv__hrtime(uv_clocktype_t type) {
   1623   static _Atomic clock_t fast_clock_id = -1;
   1624   struct timespec t;
   1625   clock_t clock_id;
   1626 
   1627   /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
   1628    * millisecond granularity or better.  CLOCK_MONOTONIC_COARSE is
   1629    * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
   1630    * decide to make a costly system call.
   1631    */
   1632   /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
   1633    * when it has microsecond granularity or better (unlikely).
   1634    */
   1635   clock_id = CLOCK_MONOTONIC;
   1636   if (type != UV_CLOCK_FAST)
   1637     goto done;
   1638 
   1639   clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
   1640   if (clock_id != -1)
   1641     goto done;
   1642 
   1643   clock_id = CLOCK_MONOTONIC;
   1644   if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
   1645     if (t.tv_nsec <= 1 * 1000 * 1000)
   1646       clock_id = CLOCK_MONOTONIC_COARSE;
   1647 
   1648   atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);
   1649 
   1650 done:
   1651 
   1652   if (clock_gettime(clock_id, &t))
   1653     return 0;  /* Not really possible. */
   1654 
   1655   return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
   1656 }
   1657 
   1658 
   1659 int uv_resident_set_memory(size_t* rss) {
   1660   char buf[1024];
   1661   const char* s;
   1662   long val;
   1663   int rc;
   1664   int i;
   1665 
   1666   /* rss: 24th element */
   1667   rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
   1668   if (rc < 0)
   1669     return rc;
   1670 
   1671   /* find the last ')' */
   1672   s = strrchr(buf, ')');
   1673   if (s == NULL)
   1674     goto err;
   1675 
   1676   for (i = 1; i <= 22; i++) {
   1677     s = strchr(s + 1, ' ');
   1678     if (s == NULL)
   1679       goto err;
   1680   }
   1681 
   1682   errno = 0;
   1683   val = strtol(s, NULL, 10);
   1684   if (val < 0 || errno != 0)
   1685     goto err;
   1686 
   1687   *rss = val * getpagesize();
   1688   return 0;
   1689 
   1690 err:
   1691   return UV_EINVAL;
   1692 }
   1693 
   1694 int uv_uptime(double* uptime) {
   1695   struct timespec now;
   1696   char buf[128];
   1697 
   1698   /* Consult /proc/uptime when present (common case), or fall back to
   1699    * clock_gettime. Why not always clock_gettime? It doesn't always return the
   1700    * right result under OpenVZ and possibly other containerized environments.
   1701    */
   1702   if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
   1703     if (1 == sscanf(buf, "%lf", uptime))
   1704       return 0;
   1705 
   1706   if (clock_gettime(CLOCK_BOOTTIME, &now))
   1707     return UV__ERR(errno);
   1708 
   1709   *uptime = now.tv_sec;
   1710   return 0;
   1711 }
   1712 
   1713 
   1714 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
   1715 #if defined(__PPC__)
   1716   static const char model_marker[] = "cpu\t\t: ";
   1717   static const char model_marker2[] = "";
   1718 #elif defined(__arm__)
   1719   static const char model_marker[] = "model name\t: ";
   1720   static const char model_marker2[] = "Processor\t: ";
   1721 #elif defined(__aarch64__)
   1722   static const char model_marker[] = "CPU part\t: ";
   1723   static const char model_marker2[] = "";
   1724 #elif defined(__mips__)
   1725   static const char model_marker[] = "cpu model\t\t: ";
   1726   static const char model_marker2[] = "";
   1727 #elif defined(__loongarch__)
   1728   static const char model_marker[] = "cpu family\t\t: ";
   1729   static const char model_marker2[] = "";
   1730 #else
   1731   static const char model_marker[] = "model name\t: ";
   1732   static const char model_marker2[] = "";
   1733 #endif
   1734   static const char parts[] =
   1735 #ifdef __aarch64__
   1736     "0x811\nARM810\n"       "0x920\nARM920\n"      "0x922\nARM922\n"
   1737     "0x926\nARM926\n"       "0x940\nARM940\n"      "0x946\nARM946\n"
   1738     "0x966\nARM966\n"       "0xa20\nARM1020\n"      "0xa22\nARM1022\n"
   1739     "0xa26\nARM1026\n"      "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
   1740     "0xb56\nARM1156\n"      "0xb76\nARM1176\n"      "0xc05\nCortex-A5\n"
   1741     "0xc07\nCortex-A7\n"    "0xc08\nCortex-A8\n"    "0xc09\nCortex-A9\n"
   1742     "0xc0d\nCortex-A17\n"   /* Originally A12 */
   1743     "0xc0f\nCortex-A15\n"   "0xc0e\nCortex-A17\n"   "0xc14\nCortex-R4\n"
   1744     "0xc15\nCortex-R5\n"    "0xc17\nCortex-R7\n"    "0xc18\nCortex-R8\n"
   1745     "0xc20\nCortex-M0\n"    "0xc21\nCortex-M1\n"    "0xc23\nCortex-M3\n"
   1746     "0xc24\nCortex-M4\n"    "0xc27\nCortex-M7\n"    "0xc60\nCortex-M0+\n"
   1747     "0xd01\nCortex-A32\n"   "0xd03\nCortex-A53\n"   "0xd04\nCortex-A35\n"
   1748     "0xd05\nCortex-A55\n"   "0xd06\nCortex-A65\n"   "0xd07\nCortex-A57\n"
   1749     "0xd08\nCortex-A72\n"   "0xd09\nCortex-A73\n"   "0xd0a\nCortex-A75\n"
   1750     "0xd0b\nCortex-A76\n"   "0xd0c\nNeoverse-N1\n"  "0xd0d\nCortex-A77\n"
   1751     "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n"   "0xd20\nCortex-M23\n"
   1752     "0xd21\nCortex-M33\n"   "0xd41\nCortex-A78\n"   "0xd42\nCortex-A78AE\n"
   1753     "0xd4a\nNeoverse-E1\n"  "0xd4b\nCortex-A78C\n"
   1754 #endif
   1755     "";
   1756   struct cpu {
   1757     unsigned long long freq, user, nice, sys, idle, irq;
   1758     unsigned model;
   1759   };
   1760   FILE* fp;
   1761   char* p;
   1762   int found;
   1763   int n;
   1764   unsigned i;
   1765   unsigned cpu;
   1766   unsigned maxcpu;
   1767   unsigned size;
   1768   unsigned long long skip;
   1769   struct cpu (*cpus)[8192];  /* Kernel maximum. */
   1770   struct cpu* c;
   1771   struct cpu t;
   1772   char (*model)[64];
   1773   unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
   1774   /* Assumption: even big.LITTLE systems will have only a handful
   1775    * of different CPU models. Most systems will just have one.
   1776    */
   1777   char models[8][64];
   1778   char buf[1024];
   1779 
   1780   memset(bitmap, 0, sizeof(bitmap));
   1781   memset(models, 0, sizeof(models));
   1782   snprintf(*models, sizeof(*models), "unknown");
   1783   maxcpu = 0;
   1784 
   1785   cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
   1786   if (cpus == NULL)
   1787     return UV_ENOMEM;
   1788 
   1789   fp = uv__open_file("/proc/stat");
   1790   if (fp == NULL) {
   1791     uv__free(cpus);
   1792     return UV__ERR(errno);
   1793   }
   1794 
   1795   if (NULL == fgets(buf, sizeof(buf), fp))
   1796     abort();
   1797 
   1798   for (;;) {
   1799     memset(&t, 0, sizeof(t));
   1800 
   1801     n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
   1802                &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
   1803 
   1804     if (n != 7)
   1805       break;
   1806 
   1807     if (NULL == fgets(buf, sizeof(buf), fp))
   1808       abort();
   1809 
   1810     if (cpu >= ARRAY_SIZE(*cpus))
   1811       continue;
   1812 
   1813     (*cpus)[cpu] = t;
   1814 
   1815     bitmap[cpu >> 3] |= 1 << (cpu & 7);
   1816 
   1817     if (cpu >= maxcpu)
   1818       maxcpu = cpu + 1;
   1819   }
   1820 
   1821   fclose(fp);
   1822 
   1823   fp = uv__open_file("/proc/cpuinfo");
   1824   if (fp == NULL)
   1825     goto nocpuinfo;
   1826 
   1827   for (;;) {
   1828     if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
   1829       break;  /* Parse error. */
   1830 
   1831     while (fgets(buf, sizeof(buf), fp)) {
   1832       if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) {
   1833         p = buf + sizeof(model_marker) - 1;
   1834         goto parts;
   1835       }
   1836       if (!*model_marker2)
   1837         continue;
   1838       if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) {
   1839         p = buf + sizeof(model_marker2) - 1;
   1840         goto parts;
   1841       }
   1842     }
   1843 
   1844     goto next;  /* Not found. */
   1845 
   1846 parts:
   1847     n = (int) strcspn(p, "\n");
   1848 
   1849     /* arm64: translate CPU part code to model name. */
   1850     if (*parts) {
   1851       p = memmem(parts, sizeof(parts) - 1, p, n + 1);
   1852       if (p == NULL)
   1853         p = "unknown";
   1854       else
   1855         p += n + 1;
   1856       n = (int) strcspn(p, "\n");
   1857     }
   1858 
   1859     found = 0;
   1860     for (model = models; !found && model < ARRAY_END(models); model++)
   1861       found = !strncmp(p, *model, strlen(*model));
   1862 
   1863     if (!found)
   1864       goto next;
   1865 
   1866     if (**model == '\0')
   1867       snprintf(*model, sizeof(*model), "%.*s", n, p);
   1868 
   1869     if (cpu < maxcpu)
   1870       (*cpus)[cpu].model = model - models;
   1871 
   1872 next:
   1873     while (fgets(buf, sizeof(buf), fp))
   1874       if (*buf == '\n')
   1875         break;
   1876   }
   1877 
   1878   fclose(fp);
   1879   fp = NULL;
   1880 
   1881 nocpuinfo:
   1882 
   1883   n = 0;
   1884   for (cpu = 0; cpu < maxcpu; cpu++) {
   1885     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
   1886       continue;
   1887 
   1888     n++;
   1889     snprintf(buf, sizeof(buf),
   1890              "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
   1891 
   1892     fp = uv__open_file(buf);
   1893     if (fp == NULL)
   1894       continue;
   1895 
   1896     if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
   1897       abort();
   1898     fclose(fp);
   1899     fp = NULL;
   1900   }
   1901 
   1902   size = n * sizeof(**ci) + sizeof(models);
   1903   *ci = uv__malloc(size);
   1904   *count = 0;
   1905 
   1906   if (*ci == NULL) {
   1907     uv__free(cpus);
   1908     return UV_ENOMEM;
   1909   }
   1910 
   1911   *count = n;
   1912   p = memcpy(*ci + n, models, sizeof(models));
   1913 
   1914   i = 0;
   1915   for (cpu = 0; cpu < maxcpu; cpu++) {
   1916     if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
   1917       continue;
   1918 
   1919     c = *cpus + cpu;
   1920 
   1921     (*ci)[i++] = (uv_cpu_info_t) {
   1922       .model     = p + c->model * sizeof(*model),
   1923       .speed     = c->freq / 1000,
   1924       /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
   1925        * therefore the multiplier is always 1000/100 = 10.
   1926        */
   1927       .cpu_times = (struct uv_cpu_times_s) {
   1928         .user = 10 * c->user,
   1929         .nice = 10 * c->nice,
   1930         .sys  = 10 * c->sys,
   1931         .idle = 10 * c->idle,
   1932         .irq  = 10 * c->irq,
   1933       },
   1934     };
   1935   }
   1936 
   1937   uv__free(cpus);
   1938 
   1939   return 0;
   1940 }
   1941 
   1942 
   1943 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
   1944   if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
   1945     return 1;
   1946   if (ent->ifa_addr == NULL)
   1947     return 1;
   1948   /*
   1949    * On Linux getifaddrs returns information related to the raw underlying
   1950    * devices. We're not interested in this information yet.
   1951    */
   1952   if (ent->ifa_addr->sa_family == PF_PACKET)
   1953     return exclude_type;
   1954   return !exclude_type;
   1955 }
   1956 
   1957 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */
   1958 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
   1959   uv_interface_address_t* address;
   1960   struct sockaddr_ll* sll;
   1961   struct ifaddrs* addrs;
   1962   struct ifaddrs* ent;
   1963   size_t namelen;
   1964   char* name;
   1965   int i;
   1966 
   1967   *count = 0;
   1968   *addresses = NULL;
   1969 
   1970   if (getifaddrs(&addrs))
   1971     return UV__ERR(errno);
   1972 
   1973   /* Count the number of interfaces */
   1974   namelen = 0;
   1975   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
   1976     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
   1977       continue;
   1978 
   1979     namelen += strlen(ent->ifa_name) + 1;
   1980     (*count)++;
   1981   }
   1982 
   1983   if (*count == 0) {
   1984     freeifaddrs(addrs);
   1985     return 0;
   1986   }
   1987 
   1988   /* Make sure the memory is initiallized to zero using calloc() */
   1989   *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen);
   1990   if (*addresses == NULL) {
   1991     freeifaddrs(addrs);
   1992     return UV_ENOMEM;
   1993   }
   1994 
   1995   name = (char*) &(*addresses)[*count];
   1996   address = *addresses;
   1997 
   1998   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
   1999     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
   2000       continue;
   2001 
   2002     namelen = strlen(ent->ifa_name) + 1;
   2003     address->name = memcpy(name, ent->ifa_name, namelen);
   2004     name += namelen;
   2005 
   2006     if (ent->ifa_addr->sa_family == AF_INET6) {
   2007       address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
   2008     } else {
   2009       address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
   2010     }
   2011 
   2012     if (ent->ifa_netmask->sa_family == AF_INET6) {
   2013       address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
   2014     } else {
   2015       address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
   2016     }
   2017 
   2018     address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
   2019 
   2020     address++;
   2021   }
   2022 
   2023   /* Fill in physical addresses for each interface */
   2024   for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
   2025     if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
   2026       continue;
   2027 
   2028     address = *addresses;
   2029 
   2030     for (i = 0; i < (*count); i++) {
   2031       size_t namelen = strlen(ent->ifa_name);
   2032       /* Alias interface share the same physical address */
   2033       if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
   2034           (address->name[namelen] == 0 || address->name[namelen] == ':')) {
   2035         sll = (struct sockaddr_ll*)ent->ifa_addr;
   2036         memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
   2037       }
   2038       address++;
   2039     }
   2040   }
   2041 
   2042   freeifaddrs(addrs);
   2043 
   2044   return 0;
   2045 }
   2046 
   2047 
   2048 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */
   2049 void uv_free_interface_addresses(uv_interface_address_t* addresses,
   2050                                  int count) {
   2051   uv__free(addresses);
   2052 }
   2053 
   2054 
/* Update the kernel-visible process/thread name (comm) via prctl(2).
 * Silently does nothing on platforms without PR_SET_NAME.
 */
void uv__set_process_title(const char* title) {
#if defined(PR_SET_NAME)
  prctl(PR_SET_NAME, title);  /* Only copies first 16 characters. */
#endif
}
   2060 
   2061 
/* Look up a field like "MemTotal:" in /proc/meminfo and return its value
 * in bytes.  Returns 0 when the file can't be read or the field is absent.
 */
static uint64_t uv__read_proc_meminfo(const char* what) {
  char buf[4096];  /* Large enough to hold all of /proc/meminfo. */
  char* field;
  uint64_t value;

  if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
    return 0;

  field = strstr(buf, what);
  if (field == NULL)
    return 0;

  /* /proc/meminfo reports sizes in kB; convert to bytes.  A parse failure
   * leaves value at 0. */
  value = 0;
  sscanf(field + strlen(what), "%" PRIu64 " kB", &value);

  return value * 1024;
}
   2082 
   2083 
uint64_t uv_get_free_memory(void) {
  struct sysinfo info;
  uint64_t amount;

  /* Prefer MemAvailable from /proc/meminfo when present. */
  amount = uv__read_proc_meminfo("MemAvailable:");
  if (amount != 0)
    return amount;

  /* Fall back to sysinfo(2)'s free-page count. */
  if (sysinfo(&info) == 0)
    return (uint64_t) info.freeram * info.mem_unit;

  return 0;
}
   2098 
   2099 
uint64_t uv_get_total_memory(void) {
  struct sysinfo info;
  uint64_t amount;

  /* Prefer MemTotal from /proc/meminfo when present. */
  amount = uv__read_proc_meminfo("MemTotal:");
  if (amount != 0)
    return amount;

  /* Fall back to sysinfo(2)'s total-page count. */
  if (sysinfo(&info) == 0)
    return (uint64_t) info.totalram * info.mem_unit;

  return 0;
}
   2114 
   2115 
/* Read a single unsigned 64-bit value from a file.  cgroup2's literal "max"
 * is mapped to UINT64_MAX.  Returns 0 on read or parse failure.
 */
static uint64_t uv__read_uint64(const char* filename) {
  char buf[32];  /* Large enough to hold an encoded uint64_t. */
  uint64_t value;

  value = 0;

  if (uv__slurp(filename, buf, sizeof(buf)) != 0)
    return 0;

  if (sscanf(buf, "%" PRIu64, &value) == 1)
    return value;

  if (strcmp(buf, "max\n") == 0)
    return UINT64_MAX;

  return 0;
}
   2128 
   2129 
/* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
 * finds the location and length of the memory controller mount path.
 * This disregards the leading / for easy concatenation of paths.
 * Returns NULL if the memory controller wasn't found. */
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
                                                int* n) {
  char* line;

  /* Hop from field separator to field separator until one of them starts
   * a ":memory:" controller field. */
  line = strchr(buf, ':');
  while (line != NULL && strncmp(line, ":memory:", 8) != 0) {
    line = strchr(line, '\n');
    if (line != NULL)
      line = strchr(line, ':');
  }

  if (line == NULL)
    return NULL;

  /* Step past ":memory:/" so the result concatenates cleanly with a base
   * path, and measure up to (but excluding) the newline. */
  line += strlen(":memory:/");
  *n = (int) strcspn(line, "\n");

  return line;
}
   2154 
/* Read the cgroup1 soft (*high) and hard (*max) memory limits for the
 * calling process.  Values of 0 mean "could not be determined"; cgroup1's
 * "unlimited" sentinel is translated to UINT64_MAX.
 */
static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* mount_path;
  int len;
  int have_limits;
  uint64_t cgroup1_max;

  /* Find out where the controller is mounted. */
  have_limits = 0;
  mount_path = uv__cgroup1_find_memory_controller(buf, &len);
  if (mount_path != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", len,
             mount_path);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", len,
             mount_path);
    *max = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0. */
    have_limits = (*high != 0 && *max != 0);
  }

  /* Fall back to the limits of the global memory controller. */
  if (!have_limits) {
    *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
    *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
  }

  /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
   * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
   */
  cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
  if (*high == cgroup1_max)
    *high = UINT64_MAX;
  if (*max == cgroup1_max)
    *max = UINT64_MAX;
}
   2194 
/* Read the cgroup2 memory.high (*high) and memory.max (*max) limits for the
 * calling process.  uv__read_uint64 maps "max" to UINT64_MAX and failures
 * to 0.
 */
static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* cgroup_path;
  int len;

  /* The cgroup2 membership line has the form "0::$PATH". */
  cgroup_path = buf + strlen("0::/");
  len = (int) strcspn(cgroup_path, "\n");

  /* Read the memory limits of the controller. */
  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.high", len, cgroup_path);
  *high = uv__read_uint64(filename);

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.max", len, cgroup_path);
  *max = uv__read_uint64(filename);
}
   2211 
/* Return the effective cgroup memory limit (the stricter of the soft and
 * hard limits), or 0 when no limit could be determined.
 */
static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t high;
  uint64_t max;

  /* A cgroup2 membership is always a single "0::$PATH" entry. */
  if (strncmp(buf, "0::/", 4) == 0)
    uv__get_cgroup2_memory_limits(buf, &high, &max);
  else
    uv__get_cgroup1_memory_limits(buf, &high, &max);

  if (high == 0 || max == 0)
    return 0;

  return max < high ? max : high;
}
   2227 
uint64_t uv_get_constrained_memory(void) {
  char buf[1024];

  /* Without cgroup membership info there is no constraint to report. */
  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)) != 0)
    return 0;

  return uv__get_cgroup_constrained_memory(buf);
}
   2236 
   2237 
/* Return the cgroup1 current memory usage in bytes, or 0 when it could not
 * be determined.
 */
static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char filename[4097];
  uint64_t usage;
  char* mount_path;
  int len;

  /* Try the controller mount for this process's cgroup first. */
  mount_path = uv__cgroup1_find_memory_controller(buf, &len);
  if (mount_path != NULL) {
    snprintf(filename, sizeof(filename),
            "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", len,
            mount_path);
    usage = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the read above will have failed,
     * as indicated by uv__read_uint64 returning 0. */
    if (usage != 0)
      return usage;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}
   2261 
/* Return the cgroup2 memory.current value in bytes, or 0 when it could not
 * be determined.
 */
static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char filename[4097];
  char* cgroup_path;
  int len;

  /* The cgroup2 membership line has the form "0::$PATH". */
  cgroup_path = buf + strlen("0::/");
  len = (int) strcspn(cgroup_path, "\n");

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.current", len, cgroup_path);
  return uv__read_uint64(filename);
}
   2275 
uint64_t uv_get_available_memory(void) {
  char buf[1024];
  uint64_t constrained;
  uint64_t current;
  uint64_t total;

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  /* With no cgroup limit in force, "available" is simply free memory. */
  constrained = uv__get_cgroup_constrained_memory(buf);
  if (constrained == 0)
    return uv_get_free_memory();

  /* A limit above physical memory is effectively no limit at all. */
  total = uv_get_total_memory();
  if (constrained > total)
    return uv_get_free_memory();

  /* A cgroup2 membership is always a single "0::$PATH" entry. */
  if (strncmp(buf, "0::/", 4) == 0)
    current = uv__get_cgroup2_current_memory(buf);
  else
    current = uv__get_cgroup1_current_memory(buf);

  /* Memory usage can exceed the limit for short bursts of time; clamp to
   * zero instead of underflowing. */
  if (current > constrained)
    return 0;

  return constrained - current;
}
   2305 
   2306 
   2307 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
   2308                                             long long* quota) {
   2309   static const char cgroup_mount[] = "/sys/fs/cgroup";
   2310   const char* cgroup_trimmed;
   2311   char buf[1024];
   2312   char full_path[256];
   2313   char path[256];
   2314   char quota_buf[16];
   2315   char* last_slash;
   2316   int cgroup_size;
   2317   long long limit;
   2318   long long min_quota;
   2319   long long period;
   2320 
   2321   if (strncmp(cgroup, "0::/", 4) != 0)
   2322     return UV_EINVAL;
   2323 
   2324   /* Trim ending \n by replacing it with a 0 */
   2325   cgroup_trimmed = cgroup + sizeof("0::/") - 1;      /* Skip the prefix "0::/" */
   2326   cgroup_size = (int)strcspn(cgroup_trimmed, "\n");  /* Find the first \n */
   2327   min_quota = LLONG_MAX;
   2328 
   2329   /* Construct the path to the cpu.max files */
   2330   snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount,
   2331            cgroup_size, cgroup_trimmed);
   2332 
   2333   /* Read controllers, if not exists, not really a cgroup */
   2334   if (uv__slurp(path, buf, sizeof(buf)) < 0)
   2335     return UV_EIO;
   2336 
   2337   snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size,
   2338            cgroup_trimmed);
   2339 
   2340   /*
   2341    * Traverse up the cgroup v2 hierarchy, starting from the current cgroup path.
   2342    * At each level, attempt to read the "cpu.max" file, which defines the CPU
   2343    * quota and period.
   2344    *
   2345    * This reflects how Linux applies cgroup limits hierarchically.
   2346    *
   2347    * e.g: given a path like /sys/fs/cgroup/foo/bar/baz, we check:
   2348    *   - /sys/fs/cgroup/foo/bar/baz/cpu.max
   2349    *   - /sys/fs/cgroup/foo/bar/cpu.max
   2350    *   - /sys/fs/cgroup/foo/cpu.max
   2351    *   - /sys/fs/cgroup/cpu.max
   2352    */
   2353   while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) {
   2354     snprintf(full_path, sizeof(full_path), "%s/cpu.max", path);
   2355 
   2356     /* Silently ignore and continue if the file does not exist */
   2357     if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0)
   2358       goto next;
   2359 
   2360     /* No limit, move on */
   2361     if (strncmp(quota_buf, "max", 3) == 0)
   2362       goto next;
   2363 
   2364     /* Read cpu.max */
   2365     if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2)
   2366       goto next;
   2367 
   2368     /* Can't divide by 0 */
   2369     if (period == 0)
   2370       goto next;
   2371 
   2372     *quota = limit / period;
   2373     if (*quota < min_quota)
   2374       min_quota = *quota;
   2375 
   2376 next:
   2377     /* Move up one level in the cgroup hierarchy by trimming the last path.
   2378      * The loop ends once we reach the cgroup root mount point.
   2379      */
   2380     last_slash = strrchr(path, '/');
   2381     if (last_slash == NULL || strcmp(path, cgroup_mount) == 0)
   2382       break;
   2383     *last_slash = '\0';
   2384   }
   2385 
   2386   return 0;
   2387 }
   2388 
/* Locate the cpu controller entry in a cgroup1 /proc/self/cgroup buffer and
 * report the length of its path (excluding the newline) through
 * *cgroup_size.  Returns NULL when no cpu controller line is present, in
 * which case *cgroup_size is left untouched.
 */
static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  char* path;

  /* Seek to the cpu controller line. */
  path = strstr(cgroup, ":cpu,");
  if (path == NULL)
    return NULL;

  /* Skip the controller prefix to the start of the cgroup path. */
  path += sizeof(":cpu,") - 1;

  /* Determine the length of the cgroup path, excluding the newline. */
  *cgroup_size = (int) strcspn(path, "\n");

  return path;
}
   2403 
   2404 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
   2405                                             long long* quota) {
   2406   char path[256];
   2407   char buf[1024];
   2408   int cgroup_size;
   2409   char* cgroup_cpu;
   2410   long long period_length;
   2411   long long quota_per_period;
   2412 
   2413   cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
   2414 
   2415   if (cgroup_cpu == NULL)
   2416     return UV_EIO;
   2417 
   2418   /* Construct the path to the cpu.cfs_quota_us file */
   2419   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
   2420            cgroup_size, cgroup_cpu);
   2421 
   2422   /* Read cpu.cfs_quota_us */
   2423   if (uv__slurp(path, buf, sizeof(buf)) < 0)
   2424     return UV_EIO;
   2425 
   2426   if (sscanf(buf, "%lld", &quota_per_period) != 1)
   2427     return UV_EINVAL;
   2428 
   2429   /* Construct the path to the cpu.cfs_period_us file */
   2430   snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
   2431            cgroup_size, cgroup_cpu);
   2432 
   2433   /* Read cpu.cfs_period_us */
   2434   if (uv__slurp(path, buf, sizeof(buf)) < 0)
   2435     return UV_EIO;
   2436 
   2437   if (sscanf(buf, "%lld", &period_length) != 1)
   2438     return UV_EINVAL;
   2439 
   2440   /* Can't divide by 0 */
   2441   if (period_length == 0)
   2442     return UV_EINVAL;
   2443 
   2444   *quota = quota_per_period / period_length;
   2445 
   2446   return 0;
   2447 }
   2448 
   2449 int uv__get_constrained_cpu(long long* quota) {
   2450   char cgroup[1024];
   2451 
   2452   /* Read the cgroup from /proc/self/cgroup */
   2453   if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
   2454     return UV_EIO;
   2455 
   2456   /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
   2457    * The entry for cgroup v2 is always in the format "0::$PATH"
   2458    * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
   2459   if (strncmp(cgroup, "0::/", 4) == 0)
   2460     return uv__get_cgroupv2_constrained_cpu(cgroup, quota);
   2461   else
   2462     return uv__get_cgroupv1_constrained_cpu(cgroup, quota);
   2463 }
   2464 
   2465 
void uv_loadavg(double avg[3]) {
  char buf[128];  /* Large enough to hold all of /proc/loadavg. */
  struct sysinfo info;
  int i;

  /* Fast path: parse /proc/loadavg directly. */
  if (uv__slurp("/proc/loadavg", buf, sizeof(buf)) == 0 &&
      sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]) == 3)
    return;

  /* Fallback: sysinfo(2) reports the averages in 16.16 fixed point. */
  if (sysinfo(&info) < 0)
    return;

  for (i = 0; i < 3; i++)
    avg[i] = (double) info.loads[i] / 65536.0;
}
   2481 
   2482 
   2483 static int compare_watchers(const struct watcher_list* a,
   2484                             const struct watcher_list* b) {
   2485   if (a->wd < b->wd) return -1;
   2486   if (a->wd > b->wd) return 1;
   2487   return 0;
   2488 }
   2489 
   2490 
   2491 static int init_inotify(uv_loop_t* loop) {
   2492   int err;
   2493   int fd;
   2494 
   2495   if (loop->inotify_fd != -1)
   2496     return 0;
   2497 
   2498   fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
   2499   if (fd < 0)
   2500     return UV__ERR(errno);
   2501 
   2502   err = uv__io_init_start(loop, &loop->inotify_read_watcher, uv__inotify_read,
   2503                           fd, POLLIN);
   2504   if (err) {
   2505     uv__close(fd);
   2506     return err;
   2507   }
   2508 
   2509   loop->inotify_fd = fd;
   2510   return 0;
   2511 }
   2512 
   2513 
/* After fork(): re-create the inotify state in the child.  Every handle from
 * the pre-fork watcher tree is stopped (tearing down the stale watches) and
 * then restarted against the child's fresh inotify fd.
 */
static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the start and stop()
   * functions, making uv__queue_foreach unsafe to use. So we use the
   * uv__queue_move trick to safely iterate. Also don't free the watcher
   * list until we're done iterating. c.f. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    /* Block maybe_free_watcher_list() while we walk this list's queue. */
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      /* Re-link the handle into the live queue so stop() can unlink it. */
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      /* Park the stopped handle, stashing the saved path on it, so it can
       * be restarted once all old watches are gone. */
      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    /* done iterating, time to (maybe) free empty watcher_list */
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  /* Restart every parked handle; uv_fs_event_start() duplicates the path,
   * so the strdup'd copy is freed here either way. */
  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      uv__queue_remove(q);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      tmp_path = handle->path;
      handle->path = NULL;
      err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
      uv__free(tmp_path);
      if (err)
        return err;
  }

  return 0;
}
   2578 
   2579 
   2580 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
   2581   struct watcher_list w;
   2582   w.wd = wd;
   2583   return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
   2584 }
   2585 
   2586 
   2587 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
   2588   /* if the watcher_list->watchers is being iterated over, we can't free it. */
   2589   if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
   2590     /* No watchers left for this path. Clean up. */
   2591     RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
   2592     inotify_rm_watch(loop->inotify_fd, w->wd);
   2593     uv__free(w);
   2594   }
   2595 }
   2596 
   2597 
/* Drain the loop's inotify fd and dispatch UV_CHANGE / UV_RENAME events to
 * every fs-event handle watching the affected watch descriptor.
 */
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* dummy,
                             unsigned int events) {
  const struct inotify_event* e;
  struct watcher_list* w;
  uv_fs_event_t* h;
  struct uv__queue queue;
  struct uv__queue* q;
  const char* path;
  ssize_t size;
  const char *p;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  /* The fd is non-blocking; keep reading until the kernel queue is empty,
   * which is signalled by EAGAIN/EWOULDBLOCK. */
  for (;;) {
    do
      size = read(loop->inotify_fd, buf, sizeof(buf));
    while (size == -1 && errno == EINTR);

    if (size == -1) {
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs.  Each record is the
     * fixed header followed by e->len bytes of (optional) name. */
    for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
      e = (const struct inotify_event*) p;

      /* Collapse the inotify mask onto libuv's two event kinds:
       * content/metadata changes map to UV_CHANGE, everything else
       * (create/delete/move/...) to UV_RENAME. */
      events = 0;
      if (e->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      w = find_watcher(loop, e->wd);
      if (w == NULL)
        continue; /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications. Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);

      /* We're about to iterate over the queue and call user's callbacks.
       * What can go wrong?
       * A callback could call uv_fs_event_stop()
       * and the queue can change under our feet.
       * So, we use uv__queue_move() trick to safely iterate over the queue.
       * And we don't free the watcher_list until we're done iterating.
       *
       * First,
       * tell uv_fs_event_stop() (that could be called from a user's callback)
       * not to free watcher_list.
       */
      w->iterating = 1;
      uv__queue_move(&w->watchers, &queue);
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        h = uv__queue_data(q, uv_fs_event_t, watchers);

        /* Re-link the handle before the callback runs so a stop() from
         * inside the callback can unlink it normally. */
        uv__queue_remove(q);
        uv__queue_insert_tail(&w->watchers, q);

        h->cb(h, path, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      w->iterating = 0;
      maybe_free_watcher_list(w, loop);
    }
  }
}
   2672 
   2673 
/* Initialize an fs-event handle.  The inotify watch itself is created
 * lazily by uv_fs_event_start(). */
int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}
   2678 
   2679 
   2680 int uv_fs_event_start(uv_fs_event_t* handle,
   2681                       uv_fs_event_cb cb,
   2682                       const char* path,
   2683                       unsigned int flags) {
   2684   struct watcher_list* w;
   2685   uv_loop_t* loop;
   2686   size_t len;
   2687   int events;
   2688   int err;
   2689   int wd;
   2690 
   2691   if (uv__is_active(handle))
   2692     return UV_EINVAL;
   2693 
   2694   loop = handle->loop;
   2695 
   2696   err = init_inotify(loop);
   2697   if (err)
   2698     return err;
   2699 
   2700   events = IN_ATTRIB
   2701          | IN_CREATE
   2702          | IN_MODIFY
   2703          | IN_DELETE
   2704          | IN_DELETE_SELF
   2705          | IN_MOVE_SELF
   2706          | IN_MOVED_FROM
   2707          | IN_MOVED_TO;
   2708 
   2709   wd = inotify_add_watch(loop->inotify_fd, path, events);
   2710   if (wd == -1)
   2711     return UV__ERR(errno);
   2712 
   2713   w = find_watcher(loop, wd);
   2714   if (w)
   2715     goto no_insert;
   2716 
   2717   len = strlen(path) + 1;
   2718   w = uv__malloc(sizeof(*w) + len);
   2719   if (w == NULL)
   2720     return UV_ENOMEM;
   2721 
   2722   w->wd = wd;
   2723   w->path = memcpy(w + 1, path, len);
   2724   uv__queue_init(&w->watchers);
   2725   w->iterating = 0;
   2726   RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
   2727 
   2728 no_insert:
   2729   uv__handle_start(handle);
   2730   uv__queue_insert_tail(&w->watchers, &handle->watchers);
   2731   handle->path = w->path;
   2732   handle->cb = cb;
   2733   handle->wd = wd;
   2734 
   2735   return 0;
   2736 }
   2737 
   2738 
   2739 int uv_fs_event_stop(uv_fs_event_t* handle) {
   2740   struct watcher_list* w;
   2741 
   2742   if (!uv__is_active(handle))
   2743     return 0;
   2744 
   2745   w = find_watcher(handle->loop, handle->wd);
   2746   assert(w != NULL);
   2747 
   2748   handle->wd = -1;
   2749   handle->path = NULL;
   2750   uv__handle_stop(handle);
   2751   uv__queue_remove(&handle->watchers);
   2752 
   2753   maybe_free_watcher_list(w, handle->loop);
   2754 
   2755   return 0;
   2756 }
   2757 
   2758 
/* Close hook: stopping the handle is all the teardown needed; shared state
 * (watcher_list, inotify watch) is released by uv_fs_event_stop() when the
 * last watcher for the path goes away. */
void uv__fs_event_close(uv_fs_event_t* handle) {
  uv_fs_event_stop(handle);
}
   2762