1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved. 2 * Permission is hereby granted, free of charge, to any person obtaining a copy 3 * of this software and associated documentation files (the "Software"), to 4 * deal in the Software without restriction, including without limitation the 5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 6 * sell copies of the Software, and to permit persons to whom the Software is 7 * furnished to do so, subject to the following conditions: 8 * 9 * The above copyright notice and this permission notice shall be included in 10 * all copies or substantial portions of the Software. 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 18 * IN THE SOFTWARE. 19 */ 20 21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their 22 * EPOLL* counterparts. We use the POLL* variants in this file because that 23 * is what libuv uses elsewhere. 
24 */ 25 26 #include "uv.h" 27 #include "internal.h" 28 29 #include <inttypes.h> 30 #include <stdatomic.h> 31 #include <stddef.h> /* offsetof */ 32 #include <stdint.h> 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <assert.h> 37 #include <errno.h> 38 39 #include <fcntl.h> 40 #include <ifaddrs.h> 41 #include <net/ethernet.h> 42 #include <net/if.h> 43 #include <netpacket/packet.h> 44 #include <sys/epoll.h> 45 #include <sys/inotify.h> 46 #include <sys/mman.h> 47 #include <sys/param.h> 48 #include <sys/prctl.h> 49 #include <sys/socket.h> 50 #include <sys/stat.h> 51 #include <sys/syscall.h> 52 #include <sys/sysinfo.h> 53 #include <sys/sysmacros.h> 54 #include <sys/types.h> 55 #include <sys/utsname.h> 56 #include <time.h> 57 #include <unistd.h> 58 59 #ifndef __NR_io_uring_setup 60 # define __NR_io_uring_setup 425 61 #endif 62 63 #ifndef __NR_io_uring_enter 64 # define __NR_io_uring_enter 426 65 #endif 66 67 #ifndef __NR_io_uring_register 68 # define __NR_io_uring_register 427 69 #endif 70 71 #ifndef __NR_copy_file_range 72 # if defined(__x86_64__) 73 # define __NR_copy_file_range 326 74 # elif defined(__i386__) 75 # define __NR_copy_file_range 377 76 # elif defined(__s390__) 77 # define __NR_copy_file_range 375 78 # elif defined(__arm__) 79 # define __NR_copy_file_range 391 80 # elif defined(__aarch64__) 81 # define __NR_copy_file_range 285 82 # elif defined(__powerpc__) 83 # define __NR_copy_file_range 379 84 # elif defined(__arc__) 85 # define __NR_copy_file_range 285 86 # elif defined(__riscv) 87 # define __NR_copy_file_range 285 88 # endif 89 #endif /* __NR_copy_file_range */ 90 91 #ifndef __NR_statx 92 # if defined(__x86_64__) 93 # define __NR_statx 332 94 # elif defined(__i386__) 95 # define __NR_statx 383 96 # elif defined(__aarch64__) 97 # define __NR_statx 397 98 # elif defined(__arm__) 99 # define __NR_statx 397 100 # elif defined(__ppc__) 101 # define __NR_statx 383 102 # elif defined(__s390__) 103 # define __NR_statx 379 104 # 
elif defined(__riscv) 105 # define __NR_statx 291 106 # endif 107 #endif /* __NR_statx */ 108 109 #ifndef __NR_getrandom 110 # if defined(__x86_64__) 111 # define __NR_getrandom 318 112 # elif defined(__i386__) 113 # define __NR_getrandom 355 114 # elif defined(__aarch64__) 115 # define __NR_getrandom 384 116 # elif defined(__arm__) 117 # define __NR_getrandom 384 118 # elif defined(__ppc__) 119 # define __NR_getrandom 359 120 # elif defined(__s390__) 121 # define __NR_getrandom 349 122 # elif defined(__riscv) 123 # define __NR_getrandom 278 124 # endif 125 #endif /* __NR_getrandom */ 126 127 enum { 128 UV__IORING_SETUP_SQPOLL = 2u, 129 UV__IORING_SETUP_NO_SQARRAY = 0x10000u, 130 }; 131 132 enum { 133 UV__IORING_FEAT_SINGLE_MMAP = 1u, 134 UV__IORING_FEAT_NODROP = 2u, 135 UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */ 136 }; 137 138 enum { 139 UV__IORING_OP_READV = 1, 140 UV__IORING_OP_WRITEV = 2, 141 UV__IORING_OP_FSYNC = 3, 142 UV__IORING_OP_OPENAT = 18, 143 UV__IORING_OP_CLOSE = 19, 144 UV__IORING_OP_STATX = 21, 145 UV__IORING_OP_EPOLL_CTL = 29, 146 UV__IORING_OP_RENAMEAT = 35, 147 UV__IORING_OP_UNLINKAT = 36, 148 UV__IORING_OP_MKDIRAT = 37, 149 UV__IORING_OP_SYMLINKAT = 38, 150 UV__IORING_OP_LINKAT = 39, 151 UV__IORING_OP_FTRUNCATE = 55, 152 }; 153 154 enum { 155 UV__IORING_ENTER_GETEVENTS = 1u, 156 UV__IORING_ENTER_SQ_WAKEUP = 2u, 157 }; 158 159 enum { 160 UV__IORING_SQ_NEED_WAKEUP = 1u, 161 UV__IORING_SQ_CQ_OVERFLOW = 2u, 162 }; 163 164 struct uv__io_cqring_offsets { 165 uint32_t head; 166 uint32_t tail; 167 uint32_t ring_mask; 168 uint32_t ring_entries; 169 uint32_t overflow; 170 uint32_t cqes; 171 uint64_t reserved0; 172 uint64_t reserved1; 173 }; 174 175 STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets)); 176 177 struct uv__io_sqring_offsets { 178 uint32_t head; 179 uint32_t tail; 180 uint32_t ring_mask; 181 uint32_t ring_entries; 182 uint32_t flags; 183 uint32_t dropped; 184 uint32_t array; 185 uint32_t reserved0; 186 uint64_t reserved1; 187 
}; 188 189 STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets)); 190 191 struct uv__io_uring_cqe { 192 uint64_t user_data; 193 int32_t res; 194 uint32_t flags; 195 }; 196 197 STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe)); 198 199 struct uv__io_uring_sqe { 200 uint8_t opcode; 201 uint8_t flags; 202 uint16_t ioprio; 203 int32_t fd; 204 union { 205 uint64_t off; 206 uint64_t addr2; 207 }; 208 union { 209 uint64_t addr; 210 }; 211 uint32_t len; 212 union { 213 uint32_t rw_flags; 214 uint32_t fsync_flags; 215 uint32_t open_flags; 216 uint32_t statx_flags; 217 }; 218 uint64_t user_data; 219 union { 220 uint16_t buf_index; 221 uint64_t pad[3]; 222 }; 223 }; 224 225 STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe)); 226 STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode)); 227 STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags)); 228 STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio)); 229 STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd)); 230 STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off)); 231 STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr)); 232 STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len)); 233 STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags)); 234 STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data)); 235 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index)); 236 237 struct uv__io_uring_params { 238 uint32_t sq_entries; 239 uint32_t cq_entries; 240 uint32_t flags; 241 uint32_t sq_thread_cpu; 242 uint32_t sq_thread_idle; 243 uint32_t features; 244 uint32_t reserved[4]; 245 struct uv__io_sqring_offsets sq_off; /* 40 bytes */ 246 struct uv__io_cqring_offsets cq_off; /* 40 bytes */ 247 }; 248 249 STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params)); 250 STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off)); 251 STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off)); 252 253 STATIC_ASSERT(EPOLL_CTL_ADD < 4); 254 
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
STATIC_ASSERT(EPOLL_CTL_MOD < 4);

/* Per-watch-descriptor bookkeeping for inotify; nodes live in a red-black
 * tree rooted in the loop (see uv__inotify_watchers below).
 */
struct watcher_list {
  RB_ENTRY(watcher_list) entry;
  struct uv__queue watchers;  /* Handles interested in this watch. */
  int iterating;              /* Non-zero while the queue is being walked. */
  char* path;                 /* Watched path; ownership is managed by the
                               * inotify code elsewhere in this file. */
  int wd;                     /* inotify watch descriptor. */
};

/* Mirrors the RB tree root layout so loop->inotify_watchers (a plain
 * pointer) can be reinterpreted as a tree root; see uv__inotify_watchers.
 */
struct watcher_root {
  struct watcher_list* rbh_root;
};

static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* w,
                             unsigned int revents);
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b);
static void maybe_free_watcher_list(struct watcher_list* w,
                                    uv_loop_t* loop);

static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]);

static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e);

RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)


static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
  /* This cast works because watcher_root is a struct with a pointer as its
   * sole member. Such type punning is unsafe in the presence of strict
   * pointer aliasing (and is just plain nasty) but that is why libuv
   * is compiled with -fno-strict-aliasing.
   */
  return (struct watcher_root*) &loop->inotify_watchers;
}


/* Returns the running kernel's version encoded as major*65536 + minor*256 +
 * patch (e.g. 5.15.90 -> 0x050F5A), or 0 when it cannot be determined.
 * The result is cached in a relaxed atomic; racing initializers compute
 * the same value, so the race is benign.
 */
unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;
  char v_sig[256];
  char* needle;

  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  /* Check /proc/version_signature first as it's the way to get the mainline
   * kernel version in Ubuntu. The format is:
   *   Ubuntu ubuntu_kernel_version mainline_kernel_version
   * For example:
   *   Ubuntu 5.15.0-79.86-generic 5.15.111
   */
  if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
    if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (-1 == uname(&u))
    return 0;

  /* In Debian we need to check `version` instead of `release` to extract the
   * mainline kernel version. This is an example of how it looks like:
   *   #1 SMP Debian 5.10.46-4 (2021-08-03)
   */
  needle = strstr(u.version, "Debian ");
  if (needle != NULL)
    if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* Handle it when the process runs under the UNAME26 personality:
   *
   * - kernels >= 3.x identify as 2.6.40+x
   * - kernels >= 4.x identify as 2.6.60+x
   *
   * UNAME26 is a poorly conceived hack that doesn't let us distinguish
   * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
   * that 2.6.60+x means 4.x.
   *
   * Fun fact of the day: it's technically possible to observe the actual
   * kernel version for a brief moment because uname() first copies out the
   * real release string before overwriting it with the backcompat string.
   */
  if (major == 2 && minor == 6) {
    if (patch >= 60) {
      major = 4;
      minor = patch - 60;
      patch = 0;
    } else if (patch >= 40) {
      major = 3;
      minor = patch - 40;
      patch = 0;
    }
  }

calculate_version:
  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}


/* Raw copy_file_range(2) wrapper; returns -1/ENOSYS when the syscall number
 * is unknown for this architecture at compile time.
 */
ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
#ifdef __NR_copy_file_range
  return syscall(__NR_copy_file_range,
                 fd_in,
                 off_in,
                 fd_out,
                 off_out,
                 len,
                 flags);
#else
  return errno = ENOSYS, -1;
#endif
}


/* Raw statx(2) wrapper. Disabled on Android API < 30 where the syscall is
 * blocked by seccomp.
 */
int uv__statx(int dirfd,
              const char* path,
              int flags,
              unsigned int mask,
              struct uv__statx* statxbuf) {
#if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
  return errno = ENOSYS, -1;
#else
  int rc;

  rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
  if (rc >= 0)
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));  /* Kernel wrote it; MSan can't see that. */

  return rc;
#endif
}


/* Raw getrandom(2) wrapper. Disabled on Android API < 28 where the syscall
 * is blocked by seccomp.
 */
ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
#if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
  return errno = ENOSYS, -1;
#else
  ssize_t rc;

  rc = syscall(__NR_getrandom, buf, buflen, flags);
  if (rc >= 0)
    uv__msan_unpoison(buf, buflen);  /* Kernel wrote it; MSan can't see that. */

  return rc;
#endif
}


/* Raw io_uring_setup(2) wrapper. */
int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  return syscall(__NR_io_uring_setup, entries, params);
}


/* Raw io_uring_enter(2) wrapper. */
int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  /* io_uring_enter used to take a sigset_t but it's unused
   * in newer kernels unless IORING_ENTER_EXT_ARG is set,
   * in which case it takes a struct
   * io_uring_getevents_arg.
   */
  return syscall(__NR_io_uring_enter,
                 fd,
                 to_submit,
                 min_complete,
                 flags,
                 NULL,
                 0L);
}


/* Raw io_uring_register(2) wrapper. */
int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
}


/* Policy check: should io_uring be used for the ring described by |flags|?
 * Non-SQPOLL (epoll batching) rings only need a working kernel; SQPOLL
 * rings additionally require kernel >= 5.10.186 and an opt-in through the
 * UV_USE_IO_URING environment variable (cached in a process-wide atomic).
 */
static int uv__use_io_uring(uint32_t flags) {
#if defined(__ANDROID_API__)
  return 0;  /* Possibly available but blocked by seccomp. */
#elif defined(__arm__) && __SIZEOF_POINTER__ == 4
  /* See https://github.com/libuv/libuv/issues/4158. */
  return 0;  /* All 32 bits kernels appear buggy. */
#elif defined(__powerpc64__) || defined(__ppc64__)
  /* See https://github.com/libuv/libuv/issues/4283. */
  return 0;  /* Random SIGSEGV in signal handler. */
#else
  /* Ternary: unknown=0, yes=1, no=-1 */
  static _Atomic int use_io_uring;
  char* val;
  int use;

#if defined(__hppa__)
  /* io_uring first supported on parisc in 6.1, functional in .51
   * https://lore.kernel.org/all/cb912694-b1fe-dbb0-4d8c-d608f3526905@gmx.de/
   */
  if (uv__kernel_version() < /*6.1.51*/0x060133)
    return 0;
#endif

  /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
  if (0 == (flags & UV__IORING_SETUP_SQPOLL))
    return 1;

  /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
  if (uv__kernel_version() < /*5.10.186*/0x050ABA)
    return 0;

  use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);

  if (use == 0) {
    val = getenv("UV_USE_IO_URING");
    use = val != NULL && atoi(val) > 0 ? 1 : -1;
    atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
  }

  return use > 0;
#endif
}


/* Create an io_uring and mmap its rings into |iou|. Best-effort: on any
 * failure all acquired resources are released and |iou| is left untouched,
 * so callers must pre-initialize iou->ringfd to a sentinel (-1/-2) to detect
 * whether initialization succeeded.
 */
static void uv__iou_init(int epollfd,
                         struct uv__iou* iou,
                         uint32_t entries,
                         uint32_t flags) {
  struct uv__io_uring_params params;
  struct epoll_event e;
  size_t cqlen;
  size_t sqlen;
  size_t maxlen;
  size_t sqelen;
  unsigned kernel_version;
  uint32_t* sqarray;
  uint32_t i;
  char* sq;
  char* sqe;
  int ringfd;
  int no_sqarray;

  sq = MAP_FAILED;
  sqe = MAP_FAILED;

  if (!uv__use_io_uring(flags))
    return;

  kernel_version = uv__kernel_version();
  /* Kernels >= 6.6 can skip the indirection array entirely. */
  no_sqarray =
      UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);

  /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
   * Mostly academic because we check for a v5.13 kernel afterwards anyway.
   */
  memset(&params, 0, sizeof(params));
  params.flags = flags | no_sqarray;

  if (flags & UV__IORING_SETUP_SQPOLL)
    params.sq_thread_idle = 10;  /* milliseconds */

  /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
  ringfd = uv__io_uring_setup(entries, &params);
  if (ringfd == -1)
    return;

  /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
   * actually detecting is whether IORING_OP_STATX works with SQPOLL.
   */
  if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_NODROP))
    goto fail;

  sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
  cqlen =
      params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
  /* SINGLE_MMAP: SQ and CQ rings share one mapping sized for the larger. */
  maxlen = sqlen < cqlen ? cqlen : sqlen;
  sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);

  sq = mmap(0,
            maxlen,
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE,
            ringfd,
            0);  /* IORING_OFF_SQ_RING */

  sqe = mmap(0,
             sqelen,
             PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_POPULATE,
             ringfd,
             0x10000000ull);  /* IORING_OFF_SQES */

  if (sq == MAP_FAILED || sqe == MAP_FAILED)
    goto fail;

  if (flags & UV__IORING_SETUP_SQPOLL) {
    /* Only interested in completion events. To get notified when
     * the kernel pulls items from the submission ring, add POLLOUT.
     */
    memset(&e, 0, sizeof(e));
    e.events = POLLIN;
    e.data.fd = ringfd;

    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
      goto fail;
  }

  iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
  iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
  iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
  iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
  iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
  iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
  iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
  iou->sq = sq;
  iou->cqe = sq + params.cq_off.cqes;
  iou->sqe = sqe;
  iou->sqlen = sqlen;
  iou->cqlen = cqlen;
  iou->maxlen = maxlen;
  iou->sqelen = sqelen;
  iou->ringfd = ringfd;
  iou->in_flight = 0;

  if (no_sqarray)
    return;

  sqarray = (uint32_t*) (sq + params.sq_off.array);
  for (i = 0; i <= iou->sqmask; i++)
    sqarray[i] = i;  /* Slot -> sqe identity mapping. */

  return;

fail:
  if (sq != MAP_FAILED)
    munmap(sq, maxlen);

  if (sqe != MAP_FAILED)
    munmap(sqe, sqelen);

  uv__close(ringfd);
}


/* Tear down a ring created by uv__iou_init(); safe to call on a ring that
 * was never initialized (ringfd <= -1).
 */
static void uv__iou_delete(struct uv__iou* iou) {
  if (iou->ringfd > -1) {
    munmap(iou->sq, iou->maxlen);
    munmap(iou->sqe, iou->sqelen);
    uv__close(iou->ringfd);
    iou->ringfd = -1;
  }
}


/* Platform-specific loop setup: create the epoll fd and the epoll-batching
 * ring. The fs ring (lfields->iou) is created lazily; see uv__iou_get_sqe.
 */
int uv__platform_loop_init(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  lfields->ctl.ringfd = -1;
  lfields->iou.ringfd = -2;  /* "uninitialized" */

  loop->inotify_watchers = NULL;
  loop->inotify_fd = -1;
  loop->backend_fd = epoll_create1(O_CLOEXEC);

  if (loop->backend_fd == -1)
    return UV__ERR(errno);

  uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);

  return 0;
}


/* Re-create the epoll fd and rings in the child after fork(), then re-arm
 * the inotify watchers that were registered in the parent.
 */
int uv__io_fork(uv_loop_t* loop) {
  int err;
  struct watcher_list* root;

  root = uv__inotify_watchers(loop)->rbh_root;

  uv__close(loop->backend_fd);
  loop->backend_fd = -1;

  /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
  uv__platform_loop_delete(loop);

  err = uv__platform_loop_init(loop);
  if (err)
    return err;

  return uv__inotify_fork(loop, root);
}


/* Release platform resources: both io_uring rings and the inotify fd. */
void uv__platform_loop_delete(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  uv__iou_delete(&lfields->ctl);
  uv__iou_delete(&lfields->iou);

  if (loop->inotify_fd != -1) {
    uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
    uv__close(loop->inotify_fd);
    loop->inotify_fd = -1;
  }
}


/* Scratch state shared with the poll loop so a closing fd can be scrubbed
 * from events that were already read but not yet dispatched.
 */
struct uv__invalidate {
  struct epoll_event (*prep)[256];
  struct epoll_event* events;
  int nfds;
};


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  uv__loop_internal_fields_t* lfields;
  struct uv__invalidate* inv;
  struct epoll_event dummy;
  int i;

  lfields = uv__get_internal_fields(loop);
  inv = lfields->inv;

  /* Invalidate events with same file descriptor */
  if (inv != NULL)
    for (i = 0; i < inv->nfds; i++)
      if (inv->events[i].data.fd == fd)
        inv->events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * Perform EPOLL_CTL_DEL immediately instead of going through
   * io_uring's submit queue, otherwise the file descriptor may
   * be closed by the time the kernel starts the operation.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   *
   * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
   * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
   */
  memset(&dummy, 0, sizeof(dummy));
  epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
}


/* Probe whether |fd| is pollable by doing a throwaway EPOLL_CTL_ADD.
 * Returns 0 when pollable, a UV__ERR value otherwise.
 */
int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)  /* EEXIST means it's pollable and already watched. */
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


/* Caller must initialize SQE and call uv__iou_submit(). */
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
                                                uv_loop_t* loop,
                                                uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t slot;

  /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
   * initialization failed. Anything else is a valid ring file descriptor.
   */
  if (iou->ringfd == -2) {
    /* By default, the SQPOLL is not created. Enable only if the loop is
     * configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
     * environment variable is unset or a positive number.
     */
    if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
      if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
        uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);

    if (iou->ringfd == -2)
      iou->ringfd = -1;  /* "failed" */
  }

  if (iou->ringfd == -1)
    return NULL;

  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
                              memory_order_acquire);
  tail = *iou->sqtail;
  mask = iou->sqmask;

  if ((head & mask) == ((tail + 1) & mask))
    return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */

  slot = tail & mask;
  sqe = iou->sqe;
  sqe = &sqe[slot];
  memset(sqe, 0, sizeof(*sqe));
  sqe->user_data = (uintptr_t) req;

  /* Pacify uv_cancel().
*/ 800 req->work_req.loop = loop; 801 req->work_req.work = NULL; 802 req->work_req.done = NULL; 803 uv__queue_init(&req->work_req.wq); 804 805 uv__req_register(loop); 806 iou->in_flight++; 807 808 return sqe; 809 } 810 811 812 static void uv__iou_submit(struct uv__iou* iou) { 813 uint32_t flags; 814 815 atomic_store_explicit((_Atomic uint32_t*) iou->sqtail, 816 *iou->sqtail + 1, 817 memory_order_release); 818 819 flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags, 820 memory_order_acquire); 821 822 if (flags & UV__IORING_SQ_NEED_WAKEUP) 823 if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP)) 824 if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */ 825 perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */ 826 } 827 828 829 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) { 830 struct uv__io_uring_sqe* sqe; 831 struct uv__iou* iou; 832 int kv; 833 834 kv = uv__kernel_version(); 835 /* Work around a poorly understood bug in older kernels where closing a file 836 * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to 837 * execve("/foo/bar") later on. The bug seems to have been fixed somewhere 838 * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit 839 * but good candidates are the several data race fixes. Interestingly, it 840 * seems to manifest only when running under Docker so the possibility of 841 * a Docker bug can't be completely ruled out either. Yay, computers. 842 * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and 843 * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be 844 * solved. 
845 */ 846 if (kv < /* 5.15.90 */ 0x050F5A) 847 return 0; 848 849 if (kv >= /* 5.16.0 */ 0x050A00 && kv < /* 6.1.0 */ 0x060100) 850 return 0; 851 852 853 iou = &uv__get_internal_fields(loop)->iou; 854 855 sqe = uv__iou_get_sqe(iou, loop, req); 856 if (sqe == NULL) 857 return 0; 858 859 sqe->fd = req->file; 860 sqe->opcode = UV__IORING_OP_CLOSE; 861 862 uv__iou_submit(iou); 863 864 return 1; 865 } 866 867 868 int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) { 869 struct uv__io_uring_sqe* sqe; 870 struct uv__iou* iou; 871 872 if (uv__kernel_version() < /* 6.9 */0x060900) 873 return 0; 874 875 iou = &uv__get_internal_fields(loop)->iou; 876 sqe = uv__iou_get_sqe(iou, loop, req); 877 if (sqe == NULL) 878 return 0; 879 880 sqe->fd = req->file; 881 sqe->len = req->off; 882 sqe->opcode = UV__IORING_OP_FTRUNCATE; 883 uv__iou_submit(iou); 884 885 return 1; 886 } 887 888 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop, 889 uv_fs_t* req, 890 uint32_t fsync_flags) { 891 struct uv__io_uring_sqe* sqe; 892 struct uv__iou* iou; 893 894 iou = &uv__get_internal_fields(loop)->iou; 895 896 sqe = uv__iou_get_sqe(iou, loop, req); 897 if (sqe == NULL) 898 return 0; 899 900 /* Little known fact: setting seq->off and seq->len turns 901 * it into an asynchronous sync_file_range() operation. 
   */
  sqe->fd = req->file;
  sqe->fsync_flags = fsync_flags;
  sqe->opcode = UV__IORING_OP_FSYNC;

  uv__iou_submit(iou);

  return 1;
}


/* Submit a linkat request through io_uring (IORING_OP_LINKAT, linux v5.15+).
 * Returns 1 when queued, 0 to fall back to the thread pool.
 */
int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;            /* Both paths resolve relative to the cwd. */
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;           /* linkat's newdirfd travels in sqe->len. */
  sqe->opcode = UV__IORING_OP_LINKAT;

  uv__iou_submit(iou);

  return 1;
}


/* Submit a mkdirat request through io_uring (linux v5.15+).
 * Returns 1 when queued, 0 to fall back to the thread pool.
 */
int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;          /* mkdirat's mode travels in sqe->len. */
  sqe->opcode = UV__IORING_OP_MKDIRAT;

  uv__iou_submit(iou);

  return 1;
}


/* Submit an openat request through io_uring.
 * Returns 1 when queued, 0 to fall back to the thread pool.
 */
int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;          /* openat's mode travels in sqe->len. */
  sqe->opcode = UV__IORING_OP_OPENAT;
  sqe->open_flags = req->flags | O_CLOEXEC;

  uv__iou_submit(iou);

  return 1;
}


/* Submit a renameat request through io_uring.
 * Returns 1 when queued, 0 to fall back to the thread pool.
 */
int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;           /* renameat's newdirfd travels in sqe->len. */
  sqe->opcode = UV__IORING_OP_RENAMEAT;

  uv__iou_submit(iou);

  return 1;
}


/* Submit a symlinkat request through io_uring (linux v5.15+).
 * Returns 1 when queued, 0 to fall back to the thread pool.
 */
int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->opcode = UV__IORING_OP_SYMLINKAT;

  uv__iou_submit(iou);

  return 1;
}


/* Submit an unlinkat request through io_uring.
 * Returns 1 when queued, 0 to fall back to the thread pool.
 */
int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_UNLINKAT;

  uv__iou_submit(iou);

  return 1;
}


/* Submit a readv or writev request through io_uring.
 * Returns 1 when queued, 0 to fall back to the thread pool.
 */
int uv__iou_fs_read_or_write(uv_loop_t* loop,
                             uv_fs_t* req,
                             int is_read) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fallback
   * to the threadpool on writes */
  if (req->nbufs > IOV_MAX) {
    if (is_read)
      req->nbufs = IOV_MAX;  /* Short read; the caller retries the rest. */
    else
      return 0;
  }

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->bufs;
  sqe->fd = req->file;
  sqe->len = req->nbufs;
  sqe->off = req->off < 0 ? -1 : req->off;  /* -1 means "use the file offset". */
  sqe->opcode = is_read ?
                UV__IORING_OP_READV : UV__IORING_OP_WRITEV;

  uv__iou_submit(iou);

  return 1;
}


/* Submit a statx request through io_uring; covers stat, fstat and lstat.
 * The statx result buffer is heap-allocated and stashed in req->ptr until
 * uv__iou_fs_statx_post() consumes it. Returns 1 when queued, 0 to fall
 * back to the thread pool.
 */
int uv__iou_fs_statx(uv_loop_t* loop,
                     uv_fs_t* req,
                     int is_fstat,
                     int is_lstat) {
  struct uv__io_uring_sqe* sqe;
  struct uv__statx* statxbuf;
  struct uv__iou* iou;

  statxbuf = uv__malloc(sizeof(*statxbuf));
  if (statxbuf == NULL)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL) {
    uv__free(statxbuf);
    return 0;
  }

  req->ptr = statxbuf;

  sqe->addr = (uintptr_t) req->path;
  sqe->addr2 = (uintptr_t) statxbuf;
  sqe->fd = AT_FDCWD;
  sqe->len = 0xFFF;  /* STATX_BASIC_STATS + STATX_BTIME */
  sqe->opcode = UV__IORING_OP_STATX;

  if (is_fstat) {
    sqe->addr = (uintptr_t) "";  /* Empty path + AT_EMPTY_PATH == fstat. */
    sqe->fd = req->file;
    sqe->statx_flags |= 0x1000;  /* AT_EMPTY_PATH */
  }

  if (is_lstat)
    sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;

  uv__iou_submit(iou);

  return 1;
}


/* Translate a kernel struct statx into libuv's portable uv_stat_t. */
void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
  buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
  buf->st_mode = statxbuf->stx_mode;
  buf->st_nlink = statxbuf->stx_nlink;
  buf->st_uid = statxbuf->stx_uid;
  buf->st_gid = statxbuf->stx_gid;
  buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
  buf->st_ino = statxbuf->stx_ino;
  buf->st_size = statxbuf->stx_size;
  buf->st_blksize = statxbuf->stx_blksize;
  buf->st_blocks = statxbuf->stx_blocks;
  buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
  buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
  buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
  buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
  buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
  buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
  buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
  buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
  buf->st_flags = 0;   /* Not provided by statx. */
  buf->st_gen = 0;     /* Not provided by statx. */
}


/* Completion handler for uv__iou_fs_statx(): convert the statx buffer that
 * the submit path stored in req->ptr into req->statbuf, then free it.
 */
static void uv__iou_fs_statx_post(uv_fs_t* req) {
  struct uv__statx* statxbuf;
  uv_stat_t* buf;

  buf = &req->statbuf;
  statxbuf = req->ptr;
  req->ptr = NULL;

  if (req->result == 0) {
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));  /* Kernel wrote it. */
    uv__statx_to_stat(statxbuf, buf);
    req->ptr = buf;
  }

  uv__free(statxbuf);
}


/* Drain the completion queue of the fs ring: finish each completed request,
 * run its callback, and recover from CQ overflow if the kernel reported one.
 */
static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
  struct uv__io_uring_cqe* cqe;
  struct uv__io_uring_cqe* e;
  uv_fs_t* req;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t i;
  uint32_t flags;
  int nevents;
  int rc;

  head = *iou->cqhead;
  tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
                              memory_order_acquire);
  mask = iou->cqmask;
  cqe = iou->cqe;
  nevents = 0;

  for (i = head; i != tail; i++) {
    e = &cqe[i & mask];

    req = (uv_fs_t*) (uintptr_t) e->user_data;
    assert(req->type == UV_FS);

    uv__req_unregister(loop);
    iou->in_flight--;

    /* If the op is not supported by the kernel retry using the thread pool */
    if (e->res == -EOPNOTSUPP) {
      uv__fs_post(loop, req);
      continue;
    }

    /* io_uring stores error codes as negative numbers, same as libuv. */
    req->result = e->res;

    switch (req->fs_type) {
      case UV_FS_FSTAT:
      case UV_FS_LSTAT:
      case UV_FS_STAT:
        uv__iou_fs_statx_post(req);
        break;
      default:  /* Squelch -Wswitch warnings. */
        break;
    }

    uv__metrics_update_idle_time(loop);
    req->cb(req);
    nevents++;
  }

  atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
                        tail,
                        memory_order_release);

  /* Check whether CQE's overflowed, if so enter the kernel to make them
   * available. Don't grab them immediately but in the next loop iteration to
   * avoid loop starvation. */
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
    do
      rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
    while (rc == -1 && errno == EINTR);

    if (rc < 0)
      perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
  }

  uv__metrics_inc_events(loop, nevents);
  if (uv__get_internal_fields(loop)->current_timeout == 0)
    uv__metrics_inc_events_waiting(loop, nevents);
}


/* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
 * executed immediately, otherwise the file descriptor may have been closed
 * by the time the kernel starts the operation.
 */
static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e) {
  struct uv__io_uring_sqe* sqe;
  struct epoll_event* pe;
  uint32_t mask;
  uint32_t slot;

  assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
  assert(ctl->ringfd != -1);

  mask = ctl->sqmask;
  slot = (*ctl->sqtail)++ & mask;

  /* Stash the epoll_event in the caller's scratch array; the kernel reads
   * it asynchronously so it must outlive this function call.
   */
  pe = &(*events)[slot];
  *pe = *e;

  sqe = ctl->sqe;
  sqe = &sqe[slot];

  memset(sqe, 0, sizeof(*sqe));
  sqe->addr = (uintptr_t) pe;
  sqe->fd = epollfd;
  sqe->len = op;
  sqe->off = fd;
  sqe->opcode = UV__IORING_OP_EPOLL_CTL;
  /* Pack op (bits 0-1), slot (bits 2-9) and fd (upper 32 bits) so
   * uv__epoll_ctl_flush() can decode failed submissions.
   */
  sqe->user_data = op | slot << 2 | (int64_t) fd << 32;

  /* Ring full: flush before the next prep would overwrite a live slot. */
  if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
    uv__epoll_ctl_flush(epollfd, ctl, events);
}


static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]) {
  struct epoll_event oldevents[256];
  struct uv__io_uring_cqe* cqe;
  uint32_t oldslot;
  uint32_t slot;
  uint32_t n;
  int fd;
  int op;
  int rc;

  STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
  assert(ctl->ringfd != -1);
  assert(*ctl->sqhead != *ctl->sqtail);

  n = *ctl->sqtail - *ctl->sqhead;
  do
    rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
  while (rc == -1 && errno == EINTR);

  if (rc < 0)
    perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */

  if (rc != (int) n)
    abort();

  assert(*ctl->sqhead == *ctl->sqtail);

  /* Retrying a failed ADD below re-preps into *events; keep a copy. */
  memcpy(oldevents, *events, sizeof(*events));

  /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
   * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
   * that we are already watching.
Ignore the former and retry the latter
   * with EPOLL_CTL_MOD.
   */
  while (*ctl->cqhead != *ctl->cqtail) {
    slot = (*ctl->cqhead)++ & ctl->cqmask;

    cqe = ctl->cqe;
    cqe = &cqe[slot];

    if (cqe->res == 0)
      continue;

    /* Decode the packing from uv__epoll_ctl_prep(). */
    fd = cqe->user_data >> 32;
    op = 3 & cqe->user_data;
    oldslot = 255 & (cqe->user_data >> 2);

    if (op == EPOLL_CTL_DEL)
      continue;

    if (op != EPOLL_CTL_ADD)
      abort();

    if (cqe->res != -EEXIST)
      abort();

    uv__epoll_ctl_prep(epollfd,
                       ctl,
                       events,
                       EPOLL_CTL_MOD,
                       fd,
                       &oldevents[oldslot]);
  }
}


/* Run one poll iteration: sync pending watcher changes into epoll, wait in
 * epoll_pwait() for at most `timeout` milliseconds (-1 blocks indefinitely),
 * then dispatch I/O, io_uring and signal callbacks.
 */
void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct epoll_event events[1024];
  struct epoll_event prep[256];
  struct uv__invalidate inv;
  struct epoll_event* pe;
  struct epoll_event e;
  struct uv__iou* ctl;
  struct uv__iou* iou;
  int real_timeout;
  struct uv__queue* q;
  uv__io_t* w;
  sigset_t* sigmask;
  sigset_t sigset;
  uint64_t base;
  int have_iou_events;
  int have_signals;
  int nevents;
  int epollfd;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  lfields = uv__get_internal_fields(loop);
  ctl = &lfields->ctl;
  iou = &lfields->iou;

  sigmask = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask = &sigset;
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  /* With idle-time metrics enabled, first poll non-blocking so the time
   * spent waiting can be attributed correctly; restore later.
   */
  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  epollfd = loop->backend_fd;

  memset(&e, 0, sizeof(e));

  /* Push queued watcher event-mask changes into the kernel. */
  while (!uv__queue_empty(&loop->watcher_queue)) {
    q = uv__queue_head(&loop->watcher_queue);
    w = uv__queue_data(q, uv__io_t, watcher_queue);
    uv__queue_remove(q);
    uv__queue_init(q);

    op = EPOLL_CTL_MOD;
    if (w->events == 0)
      op = EPOLL_CTL_ADD;

    w->events = w->pevents;
    e.events = w->pevents;
    e.data.fd = w->fd;
    fd = w->fd;

    if (ctl->ringfd != -1) {
      uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
      continue;
    }

    if (!epoll_ctl(epollfd, op, fd, &e))
      continue;

    assert(op == EPOLL_CTL_ADD);
    assert(errno == EEXIST);

    /* File descriptor that's been watched before, update event mask. */
    if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
      abort();
  }

  inv.events = events;
  inv.prep = &prep;
  inv.nfds = -1;

  for (;;) {
    /* Nothing to wait for: no watchers and no in-flight io_uring requests. */
    if (loop->nfds == 0)
      if (iou->in_flight == 0)
        break;

    /* All event mask mutations should be visible to the kernel before
     * we enter epoll_pwait().
     */
    if (ctl->ringfd != -1)
      while (*ctl->sqhead != *ctl->sqtail)
        uv__epoll_ctl_flush(epollfd, ctl, &prep);

    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* Store the current timeout in a location that's globally accessible so
     * other locations like uv__work_done() can determine whether the queue
     * of events in the callback were waiting when poll was called.
     */
    lfields->current_timeout = timeout;

    nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (nfds == 0 || nfds == -1) {
      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_iou_events = 0;
    have_signals = 0;
    nevents = 0;

    inv.nfds = nfds;
    lfields->inv = &inv;

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      if (fd == iou->ringfd) {
        uv__poll_io_uring(loop, iou);
        have_iou_events = 1;
        continue;
      }

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         *
         * Perform EPOLL_CTL_DEL immediately instead of going through
         * io_uring's submit queue, otherwise the file descriptor may
         * be closed by the time the kernel starts the operation.
         */
        epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when previous callback invocation in this loop has stopped
       * the current watcher. Also, filters out events that users has not
       * requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event. In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again. By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read. If anything, libuv is to blame here. The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event. We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last. This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    uv__metrics_inc_events(loop, nevents);
    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
      uv__metrics_inc_events_waiting(loop, nevents);
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    lfields->inv = NULL;

    if (have_iou_events != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (have_signals != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      break;
    }

update_timeout:
    if (timeout == 0)
      break;

    if (timeout == -1)
      continue;

    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      break;

    timeout = real_timeout;
  }

  /* Flush any epoll-ctl submissions prepped by callbacks above. */
  if (ctl->ringfd != -1)
    while (*ctl->sqhead != *ctl->sqtail)
      uv__epoll_ctl_flush(epollfd, ctl, &prep);
}

uint64_t uv__hrtime(uv_clocktype_t type) {
  static _Atomic clock_t fast_clock_id = -1;
  struct timespec t;
  clock_t clock_id;

  /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
   * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
   * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
   * decide to make a costly system call.
   */
  /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
   * when it has microsecond granularity or better (unlikely).
1634 */ 1635 clock_id = CLOCK_MONOTONIC; 1636 if (type != UV_CLOCK_FAST) 1637 goto done; 1638 1639 clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed); 1640 if (clock_id != -1) 1641 goto done; 1642 1643 clock_id = CLOCK_MONOTONIC; 1644 if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t)) 1645 if (t.tv_nsec <= 1 * 1000 * 1000) 1646 clock_id = CLOCK_MONOTONIC_COARSE; 1647 1648 atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed); 1649 1650 done: 1651 1652 if (clock_gettime(clock_id, &t)) 1653 return 0; /* Not really possible. */ 1654 1655 return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec; 1656 } 1657 1658 1659 int uv_resident_set_memory(size_t* rss) { 1660 char buf[1024]; 1661 const char* s; 1662 long val; 1663 int rc; 1664 int i; 1665 1666 /* rss: 24th element */ 1667 rc = uv__slurp("/proc/self/stat", buf, sizeof(buf)); 1668 if (rc < 0) 1669 return rc; 1670 1671 /* find the last ')' */ 1672 s = strrchr(buf, ')'); 1673 if (s == NULL) 1674 goto err; 1675 1676 for (i = 1; i <= 22; i++) { 1677 s = strchr(s + 1, ' '); 1678 if (s == NULL) 1679 goto err; 1680 } 1681 1682 errno = 0; 1683 val = strtol(s, NULL, 10); 1684 if (val < 0 || errno != 0) 1685 goto err; 1686 1687 *rss = val * getpagesize(); 1688 return 0; 1689 1690 err: 1691 return UV_EINVAL; 1692 } 1693 1694 int uv_uptime(double* uptime) { 1695 struct timespec now; 1696 char buf[128]; 1697 1698 /* Consult /proc/uptime when present (common case), or fall back to 1699 * clock_gettime. Why not always clock_gettime? It doesn't always return the 1700 * right result under OpenVZ and possibly other containerized environments. 
1701 */ 1702 if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf))) 1703 if (1 == sscanf(buf, "%lf", uptime)) 1704 return 0; 1705 1706 if (clock_gettime(CLOCK_BOOTTIME, &now)) 1707 return UV__ERR(errno); 1708 1709 *uptime = now.tv_sec; 1710 return 0; 1711 } 1712 1713 1714 int uv_cpu_info(uv_cpu_info_t** ci, int* count) { 1715 #if defined(__PPC__) 1716 static const char model_marker[] = "cpu\t\t: "; 1717 static const char model_marker2[] = ""; 1718 #elif defined(__arm__) 1719 static const char model_marker[] = "model name\t: "; 1720 static const char model_marker2[] = "Processor\t: "; 1721 #elif defined(__aarch64__) 1722 static const char model_marker[] = "CPU part\t: "; 1723 static const char model_marker2[] = ""; 1724 #elif defined(__mips__) 1725 static const char model_marker[] = "cpu model\t\t: "; 1726 static const char model_marker2[] = ""; 1727 #elif defined(__loongarch__) 1728 static const char model_marker[] = "cpu family\t\t: "; 1729 static const char model_marker2[] = ""; 1730 #else 1731 static const char model_marker[] = "model name\t: "; 1732 static const char model_marker2[] = ""; 1733 #endif 1734 static const char parts[] = 1735 #ifdef __aarch64__ 1736 "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n" 1737 "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n" 1738 "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n" 1739 "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n" 1740 "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n" 1741 "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n" 1742 "0xc0d\nCortex-A17\n" /* Originally A12 */ 1743 "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n" 1744 "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n" 1745 "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n" 1746 "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n" 1747 "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n" 1748 "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" 
"0xd07\nCortex-A57\n" 1749 "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n" 1750 "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n" 1751 "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n" 1752 "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n" 1753 "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n" 1754 #endif 1755 ""; 1756 struct cpu { 1757 unsigned long long freq, user, nice, sys, idle, irq; 1758 unsigned model; 1759 }; 1760 FILE* fp; 1761 char* p; 1762 int found; 1763 int n; 1764 unsigned i; 1765 unsigned cpu; 1766 unsigned maxcpu; 1767 unsigned size; 1768 unsigned long long skip; 1769 struct cpu (*cpus)[8192]; /* Kernel maximum. */ 1770 struct cpu* c; 1771 struct cpu t; 1772 char (*model)[64]; 1773 unsigned char bitmap[ARRAY_SIZE(*cpus) / 8]; 1774 /* Assumption: even big.LITTLE systems will have only a handful 1775 * of different CPU models. Most systems will just have one. 1776 */ 1777 char models[8][64]; 1778 char buf[1024]; 1779 1780 memset(bitmap, 0, sizeof(bitmap)); 1781 memset(models, 0, sizeof(models)); 1782 snprintf(*models, sizeof(*models), "unknown"); 1783 maxcpu = 0; 1784 1785 cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus)); 1786 if (cpus == NULL) 1787 return UV_ENOMEM; 1788 1789 fp = uv__open_file("/proc/stat"); 1790 if (fp == NULL) { 1791 uv__free(cpus); 1792 return UV__ERR(errno); 1793 } 1794 1795 if (NULL == fgets(buf, sizeof(buf), fp)) 1796 abort(); 1797 1798 for (;;) { 1799 memset(&t, 0, sizeof(t)); 1800 1801 n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu", 1802 &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq); 1803 1804 if (n != 7) 1805 break; 1806 1807 if (NULL == fgets(buf, sizeof(buf), fp)) 1808 abort(); 1809 1810 if (cpu >= ARRAY_SIZE(*cpus)) 1811 continue; 1812 1813 (*cpus)[cpu] = t; 1814 1815 bitmap[cpu >> 3] |= 1 << (cpu & 7); 1816 1817 if (cpu >= maxcpu) 1818 maxcpu = cpu + 1; 1819 } 1820 1821 fclose(fp); 1822 1823 fp = uv__open_file("/proc/cpuinfo"); 1824 
if (fp == NULL) 1825 goto nocpuinfo; 1826 1827 for (;;) { 1828 if (1 != fscanf(fp, "processor\t: %u\n", &cpu)) 1829 break; /* Parse error. */ 1830 1831 while (fgets(buf, sizeof(buf), fp)) { 1832 if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) { 1833 p = buf + sizeof(model_marker) - 1; 1834 goto parts; 1835 } 1836 if (!*model_marker2) 1837 continue; 1838 if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) { 1839 p = buf + sizeof(model_marker2) - 1; 1840 goto parts; 1841 } 1842 } 1843 1844 goto next; /* Not found. */ 1845 1846 parts: 1847 n = (int) strcspn(p, "\n"); 1848 1849 /* arm64: translate CPU part code to model name. */ 1850 if (*parts) { 1851 p = memmem(parts, sizeof(parts) - 1, p, n + 1); 1852 if (p == NULL) 1853 p = "unknown"; 1854 else 1855 p += n + 1; 1856 n = (int) strcspn(p, "\n"); 1857 } 1858 1859 found = 0; 1860 for (model = models; !found && model < ARRAY_END(models); model++) 1861 found = !strncmp(p, *model, strlen(*model)); 1862 1863 if (!found) 1864 goto next; 1865 1866 if (**model == '\0') 1867 snprintf(*model, sizeof(*model), "%.*s", n, p); 1868 1869 if (cpu < maxcpu) 1870 (*cpus)[cpu].model = model - models; 1871 1872 next: 1873 while (fgets(buf, sizeof(buf), fp)) 1874 if (*buf == '\n') 1875 break; 1876 } 1877 1878 fclose(fp); 1879 fp = NULL; 1880 1881 nocpuinfo: 1882 1883 n = 0; 1884 for (cpu = 0; cpu < maxcpu; cpu++) { 1885 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7)))) 1886 continue; 1887 1888 n++; 1889 snprintf(buf, sizeof(buf), 1890 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu); 1891 1892 fp = uv__open_file(buf); 1893 if (fp == NULL) 1894 continue; 1895 1896 if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq)) 1897 abort(); 1898 fclose(fp); 1899 fp = NULL; 1900 } 1901 1902 size = n * sizeof(**ci) + sizeof(models); 1903 *ci = uv__malloc(size); 1904 *count = 0; 1905 1906 if (*ci == NULL) { 1907 uv__free(cpus); 1908 return UV_ENOMEM; 1909 } 1910 1911 *count = n; 1912 p = memcpy(*ci + n, models, sizeof(models)); 
1913 1914 i = 0; 1915 for (cpu = 0; cpu < maxcpu; cpu++) { 1916 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7)))) 1917 continue; 1918 1919 c = *cpus + cpu; 1920 1921 (*ci)[i++] = (uv_cpu_info_t) { 1922 .model = p + c->model * sizeof(*model), 1923 .speed = c->freq / 1000, 1924 /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz, 1925 * therefore the multiplier is always 1000/100 = 10. 1926 */ 1927 .cpu_times = (struct uv_cpu_times_s) { 1928 .user = 10 * c->user, 1929 .nice = 10 * c->nice, 1930 .sys = 10 * c->sys, 1931 .idle = 10 * c->idle, 1932 .irq = 10 * c->irq, 1933 }, 1934 }; 1935 } 1936 1937 uv__free(cpus); 1938 1939 return 0; 1940 } 1941 1942 1943 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) { 1944 if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING))) 1945 return 1; 1946 if (ent->ifa_addr == NULL) 1947 return 1; 1948 /* 1949 * On Linux getifaddrs returns information related to the raw underlying 1950 * devices. We're not interested in this information yet. 
1951 */ 1952 if (ent->ifa_addr->sa_family == PF_PACKET) 1953 return exclude_type; 1954 return !exclude_type; 1955 } 1956 1957 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */ 1958 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) { 1959 uv_interface_address_t* address; 1960 struct sockaddr_ll* sll; 1961 struct ifaddrs* addrs; 1962 struct ifaddrs* ent; 1963 size_t namelen; 1964 char* name; 1965 int i; 1966 1967 *count = 0; 1968 *addresses = NULL; 1969 1970 if (getifaddrs(&addrs)) 1971 return UV__ERR(errno); 1972 1973 /* Count the number of interfaces */ 1974 namelen = 0; 1975 for (ent = addrs; ent != NULL; ent = ent->ifa_next) { 1976 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR)) 1977 continue; 1978 1979 namelen += strlen(ent->ifa_name) + 1; 1980 (*count)++; 1981 } 1982 1983 if (*count == 0) { 1984 freeifaddrs(addrs); 1985 return 0; 1986 } 1987 1988 /* Make sure the memory is initiallized to zero using calloc() */ 1989 *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen); 1990 if (*addresses == NULL) { 1991 freeifaddrs(addrs); 1992 return UV_ENOMEM; 1993 } 1994 1995 name = (char*) &(*addresses)[*count]; 1996 address = *addresses; 1997 1998 for (ent = addrs; ent != NULL; ent = ent->ifa_next) { 1999 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR)) 2000 continue; 2001 2002 namelen = strlen(ent->ifa_name) + 1; 2003 address->name = memcpy(name, ent->ifa_name, namelen); 2004 name += namelen; 2005 2006 if (ent->ifa_addr->sa_family == AF_INET6) { 2007 address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr); 2008 } else { 2009 address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr); 2010 } 2011 2012 if (ent->ifa_netmask->sa_family == AF_INET6) { 2013 address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask); 2014 } else { 2015 address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask); 2016 } 2017 2018 address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK); 2019 2020 address++; 2021 } 
2022 2023 /* Fill in physical addresses for each interface */ 2024 for (ent = addrs; ent != NULL; ent = ent->ifa_next) { 2025 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS)) 2026 continue; 2027 2028 address = *addresses; 2029 2030 for (i = 0; i < (*count); i++) { 2031 size_t namelen = strlen(ent->ifa_name); 2032 /* Alias interface share the same physical address */ 2033 if (strncmp(address->name, ent->ifa_name, namelen) == 0 && 2034 (address->name[namelen] == 0 || address->name[namelen] == ':')) { 2035 sll = (struct sockaddr_ll*)ent->ifa_addr; 2036 memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr)); 2037 } 2038 address++; 2039 } 2040 } 2041 2042 freeifaddrs(addrs); 2043 2044 return 0; 2045 } 2046 2047 2048 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */ 2049 void uv_free_interface_addresses(uv_interface_address_t* addresses, 2050 int count) { 2051 uv__free(addresses); 2052 } 2053 2054 2055 void uv__set_process_title(const char* title) { 2056 #if defined(PR_SET_NAME) 2057 prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */ 2058 #endif 2059 } 2060 2061 2062 static uint64_t uv__read_proc_meminfo(const char* what) { 2063 uint64_t rc; 2064 char* p; 2065 char buf[4096]; /* Large enough to hold all of /proc/meminfo. 
*/ 2066 2067 if (uv__slurp("/proc/meminfo", buf, sizeof(buf))) 2068 return 0; 2069 2070 p = strstr(buf, what); 2071 2072 if (p == NULL) 2073 return 0; 2074 2075 p += strlen(what); 2076 2077 rc = 0; 2078 sscanf(p, "%" PRIu64 " kB", &rc); 2079 2080 return rc * 1024; 2081 } 2082 2083 2084 uint64_t uv_get_free_memory(void) { 2085 struct sysinfo info; 2086 uint64_t rc; 2087 2088 rc = uv__read_proc_meminfo("MemAvailable:"); 2089 2090 if (rc != 0) 2091 return rc; 2092 2093 if (0 == sysinfo(&info)) 2094 return (uint64_t) info.freeram * info.mem_unit; 2095 2096 return 0; 2097 } 2098 2099 2100 uint64_t uv_get_total_memory(void) { 2101 struct sysinfo info; 2102 uint64_t rc; 2103 2104 rc = uv__read_proc_meminfo("MemTotal:"); 2105 2106 if (rc != 0) 2107 return rc; 2108 2109 if (0 == sysinfo(&info)) 2110 return (uint64_t) info.totalram * info.mem_unit; 2111 2112 return 0; 2113 } 2114 2115 2116 static uint64_t uv__read_uint64(const char* filename) { 2117 char buf[32]; /* Large enough to hold an encoded uint64_t. */ 2118 uint64_t rc; 2119 2120 rc = 0; 2121 if (0 == uv__slurp(filename, buf, sizeof(buf))) 2122 if (1 != sscanf(buf, "%" PRIu64, &rc)) 2123 if (0 == strcmp(buf, "max\n")) 2124 rc = UINT64_MAX; 2125 2126 return rc; 2127 } 2128 2129 2130 /* Given a buffer with the contents of a cgroup1 /proc/self/cgroups, 2131 * finds the location and length of the memory controller mount path. 2132 * This disregards the leading / for easy concatenation of paths. 2133 * Returns NULL if the memory controller wasn't found. */ 2134 static char* uv__cgroup1_find_memory_controller(char buf[static 1024], 2135 int* n) { 2136 char* p; 2137 2138 /* Seek to the memory controller line. */ 2139 p = strchr(buf, ':'); 2140 while (p != NULL && strncmp(p, ":memory:", 8)) { 2141 p = strchr(p, '\n'); 2142 if (p != NULL) 2143 p = strchr(p, ':'); 2144 } 2145 2146 if (p != NULL) { 2147 /* Determine the length of the mount path. 
 */
    p = p + strlen(":memory:/");
    *n = (int) strcspn(p, "\n");
  }

  return p;
}

/* Read the cgroup1 soft (high) and hard (max) memory limits, in bytes,
 * falling back to the global memory controller when the per-process one
 * isn't mounted. Values equal to cgroup1's "no limit" sentinel are
 * normalized to UINT64_MAX.
 */
static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;
  uint64_t cgroup1_max;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
    *max = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (*high != 0 && *max != 0)
      goto update_limits;
  }

  /* Fall back to the limits of the global memory controller. */
  *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
  *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");

  /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
   * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
   */
update_limits:
  cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
  if (*high == cgroup1_max)
    *high = UINT64_MAX;
  if (*max == cgroup1_max)
    *max = UINT64_MAX;
}

/* Read the cgroup2 memory.high and memory.max limits, in bytes. */
static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  /* Read the memory limits of the controller.
 */
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
  *max = uv__read_uint64(filename);
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
  *high = uv__read_uint64(filename);
}

/* Return the effective cgroup memory limit in bytes (the smaller of the
 * high/soft and max/hard limits), or 0 when no limit could be determined.
 */
static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t high;
  uint64_t max;

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    uv__get_cgroup1_memory_limits(buf, &high, &max);
  else
    uv__get_cgroup2_memory_limits(buf, &high, &max);

  if (high == 0 || max == 0)
    return 0;

  return high < max ? high : max;
}

/* Public API: cgroup-imposed memory limit in bytes, 0 when unknown. */
uint64_t uv_get_constrained_memory(void) {
  char buf[1024];

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  return uv__get_cgroup_constrained_memory(buf);
}


/* Current cgroup1 memory usage in bytes, falling back to the global memory
 * controller when the per-process one isn't mounted.
 */
static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char filename[4097];
  uint64_t current;
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
            "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
    current = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (current != 0)
      return current;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}

/* Current cgroup2 memory usage in bytes (memory.current). */
static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted.
 */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.current", n, p);
  return uv__read_uint64(filename);
}

/* Memory still available to this process in bytes: the cgroup limit minus
 * current cgroup usage when a meaningful limit exists, otherwise the
 * system-wide free memory.
 */
uint64_t uv_get_available_memory(void) {
  char buf[1024];
  uint64_t constrained;
  uint64_t current;
  uint64_t total;

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  constrained = uv__get_cgroup_constrained_memory(buf);
  if (constrained == 0)
    return uv_get_free_memory();

  /* A "limit" above physical memory is no limit at all. */
  total = uv_get_total_memory();
  if (constrained > total)
    return uv_get_free_memory();

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    current = uv__get_cgroup1_current_memory(buf);
  else
    current = uv__get_cgroup2_current_memory(buf);

  /* memory usage can be higher than the limit (for short bursts of time) */
  if (constrained < current)
    return 0;

  return constrained - current;
}


/* Derive the cgroup2 CPU quota (whole CPUs) by walking up the hierarchy
 * and reading each level's cpu.max. Returns 0 on success, UV_EINVAL when
 * `cgroup` is not a cgroup2 entry, UV_EIO when the cgroup can't be read.
 */
static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
                                            long long* quota) {
  static const char cgroup_mount[] = "/sys/fs/cgroup";
  const char* cgroup_trimmed;
  char buf[1024];
  char full_path[256];
  char path[256];
  char quota_buf[16];
  char* last_slash;
  int cgroup_size;
  long long limit;
  long long min_quota;
  long long period;

  if (strncmp(cgroup, "0::/", 4) != 0)
    return UV_EINVAL;

  /* Trim ending \n by replacing it with a 0 */
  cgroup_trimmed = cgroup + sizeof("0::/") - 1;      /* Skip the prefix "0::/" */
  cgroup_size = (int)strcspn(cgroup_trimmed, "\n");  /* Find the first \n */
  /* NOTE(review): min_quota is tracked across levels but never folded back
   * into *quota below -- *quota ends up holding the last level read, not
   * the minimum. Looks unintended; confirm against upstream before relying
   * on hierarchical-minimum semantics.
   */
  min_quota = LLONG_MAX;

  /* Construct the path to the cpu.max files */
  snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount,
           cgroup_size, cgroup_trimmed);

  /* Read controllers, if not exists, not
really a cgroup */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size,
           cgroup_trimmed);

  /*
   * Traverse up the cgroup v2 hierarchy, starting from the current cgroup path.
   * At each level, attempt to read the "cpu.max" file, which defines the CPU
   * quota and period.
   *
   * This reflects how Linux applies cgroup limits hierarchically.
   *
   * e.g: given a path like /sys/fs/cgroup/foo/bar/baz, we check:
   *   - /sys/fs/cgroup/foo/bar/baz/cpu.max
   *   - /sys/fs/cgroup/foo/bar/cpu.max
   *   - /sys/fs/cgroup/foo/cpu.max
   *   - /sys/fs/cgroup/cpu.max
   */
  while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) {
    snprintf(full_path, sizeof(full_path), "%s/cpu.max", path);

    /* Silently ignore and continue if the file does not exist */
    if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0)
      goto next;

    /* No limit, move on */
    if (strncmp(quota_buf, "max", 3) == 0)
      goto next;

    /* Read cpu.max: "<limit> <period>" in microseconds. */
    if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2)
      goto next;

    /* Can't divide by 0 */
    if (period == 0)
      goto next;

    *quota = limit / period;
    if (*quota < min_quota)
      min_quota = *quota;

next:
    /* Move up one level in the cgroup hierarchy by trimming the last path.
     * The loop ends once we reach the cgroup root mount point.
     */
    last_slash = strrchr(path, '/');
    if (last_slash == NULL || strcmp(path, cgroup_mount) == 0)
      break;
    *last_slash = '\0';
  }

  return 0;
}

/* Locate the cgroup1 cpu controller's path within a /proc/self/cgroup
 * buffer; stores its length (sans newline) in *cgroup_size. Returns NULL
 * when no cpu controller line is present.
 */
static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  /* Seek to the cpu controller line. */
  char* cgroup_cpu = strstr(cgroup, ":cpu,");

  if (cgroup_cpu != NULL) {
    /* Skip the controller prefix to the start of the cgroup path.
*/ 2396 cgroup_cpu += sizeof(":cpu,") - 1; 2397 /* Determine the length of the cgroup path, excluding the newline. */ 2398 *cgroup_size = (int)strcspn(cgroup_cpu, "\n"); 2399 } 2400 2401 return cgroup_cpu; 2402 } 2403 2404 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup, 2405 long long* quota) { 2406 char path[256]; 2407 char buf[1024]; 2408 int cgroup_size; 2409 char* cgroup_cpu; 2410 long long period_length; 2411 long long quota_per_period; 2412 2413 cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size); 2414 2415 if (cgroup_cpu == NULL) 2416 return UV_EIO; 2417 2418 /* Construct the path to the cpu.cfs_quota_us file */ 2419 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us", 2420 cgroup_size, cgroup_cpu); 2421 2422 /* Read cpu.cfs_quota_us */ 2423 if (uv__slurp(path, buf, sizeof(buf)) < 0) 2424 return UV_EIO; 2425 2426 if (sscanf(buf, "%lld", "a_per_period) != 1) 2427 return UV_EINVAL; 2428 2429 /* Construct the path to the cpu.cfs_period_us file */ 2430 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us", 2431 cgroup_size, cgroup_cpu); 2432 2433 /* Read cpu.cfs_period_us */ 2434 if (uv__slurp(path, buf, sizeof(buf)) < 0) 2435 return UV_EIO; 2436 2437 if (sscanf(buf, "%lld", &period_length) != 1) 2438 return UV_EINVAL; 2439 2440 /* Can't divide by 0 */ 2441 if (period_length == 0) 2442 return UV_EINVAL; 2443 2444 *quota = quota_per_period / period_length; 2445 2446 return 0; 2447 } 2448 2449 int uv__get_constrained_cpu(long long* quota) { 2450 char cgroup[1024]; 2451 2452 /* Read the cgroup from /proc/self/cgroup */ 2453 if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0) 2454 return UV_EIO; 2455 2456 /* Check if the system is using cgroup v2 by examining /proc/self/cgroup 2457 * The entry for cgroup v2 is always in the format "0::$PATH" 2458 * see https://docs.kernel.org/admin-guide/cgroup-v2.html */ 2459 if (strncmp(cgroup, "0::/", 4) == 0) 2460 return 
uv__get_cgroupv2_constrained_cpu(cgroup, quota); 2461 else 2462 return uv__get_cgroupv1_constrained_cpu(cgroup, quota); 2463 } 2464 2465 2466 void uv_loadavg(double avg[3]) { 2467 struct sysinfo info; 2468 char buf[128]; /* Large enough to hold all of /proc/loadavg. */ 2469 2470 if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf))) 2471 if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2])) 2472 return; 2473 2474 if (sysinfo(&info) < 0) 2475 return; 2476 2477 avg[0] = (double) info.loads[0] / 65536.0; 2478 avg[1] = (double) info.loads[1] / 65536.0; 2479 avg[2] = (double) info.loads[2] / 65536.0; 2480 } 2481 2482 2483 static int compare_watchers(const struct watcher_list* a, 2484 const struct watcher_list* b) { 2485 if (a->wd < b->wd) return -1; 2486 if (a->wd > b->wd) return 1; 2487 return 0; 2488 } 2489 2490 2491 static int init_inotify(uv_loop_t* loop) { 2492 int err; 2493 int fd; 2494 2495 if (loop->inotify_fd != -1) 2496 return 0; 2497 2498 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); 2499 if (fd < 0) 2500 return UV__ERR(errno); 2501 2502 err = uv__io_init_start(loop, &loop->inotify_read_watcher, uv__inotify_read, 2503 fd, POLLIN); 2504 if (err) { 2505 uv__close(fd); 2506 return err; 2507 } 2508 2509 loop->inotify_fd = fd; 2510 return 0; 2511 } 2512 2513 2514 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) { 2515 /* Open the inotify_fd, and re-arm all the inotify watchers. */ 2516 int err; 2517 struct watcher_list* tmp_watcher_list_iter; 2518 struct watcher_list* watcher_list; 2519 struct watcher_list tmp_watcher_list; 2520 struct uv__queue queue; 2521 struct uv__queue* q; 2522 uv_fs_event_t* handle; 2523 char* tmp_path; 2524 2525 if (root == NULL) 2526 return 0; 2527 2528 /* We must restore the old watcher list to be able to close items 2529 * out of it. 
2530 */ 2531 loop->inotify_watchers = root; 2532 2533 uv__queue_init(&tmp_watcher_list.watchers); 2534 /* Note that the queue we use is shared with the start and stop() 2535 * functions, making uv__queue_foreach unsafe to use. So we use the 2536 * uv__queue_move trick to safely iterate. Also don't free the watcher 2537 * list until we're done iterating. c.f. uv__inotify_read. 2538 */ 2539 RB_FOREACH_SAFE(watcher_list, watcher_root, 2540 uv__inotify_watchers(loop), tmp_watcher_list_iter) { 2541 watcher_list->iterating = 1; 2542 uv__queue_move(&watcher_list->watchers, &queue); 2543 while (!uv__queue_empty(&queue)) { 2544 q = uv__queue_head(&queue); 2545 handle = uv__queue_data(q, uv_fs_event_t, watchers); 2546 /* It's critical to keep a copy of path here, because it 2547 * will be set to NULL by stop() and then deallocated by 2548 * maybe_free_watcher_list 2549 */ 2550 tmp_path = uv__strdup(handle->path); 2551 assert(tmp_path != NULL); 2552 uv__queue_remove(q); 2553 uv__queue_insert_tail(&watcher_list->watchers, q); 2554 uv_fs_event_stop(handle); 2555 2556 uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers); 2557 handle->path = tmp_path; 2558 } 2559 watcher_list->iterating = 0; 2560 maybe_free_watcher_list(watcher_list, loop); 2561 } 2562 2563 uv__queue_move(&tmp_watcher_list.watchers, &queue); 2564 while (!uv__queue_empty(&queue)) { 2565 q = uv__queue_head(&queue); 2566 uv__queue_remove(q); 2567 handle = uv__queue_data(q, uv_fs_event_t, watchers); 2568 tmp_path = handle->path; 2569 handle->path = NULL; 2570 err = uv_fs_event_start(handle, handle->cb, tmp_path, 0); 2571 uv__free(tmp_path); 2572 if (err) 2573 return err; 2574 } 2575 2576 return 0; 2577 } 2578 2579 2580 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) { 2581 struct watcher_list w; 2582 w.wd = wd; 2583 return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w); 2584 } 2585 2586 2587 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) { 2588 /* 
if the watcher_list->watchers is being iterated over, we can't free it. */ 2589 if ((!w->iterating) && uv__queue_empty(&w->watchers)) { 2590 /* No watchers left for this path. Clean up. */ 2591 RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w); 2592 inotify_rm_watch(loop->inotify_fd, w->wd); 2593 uv__free(w); 2594 } 2595 } 2596 2597 2598 static void uv__inotify_read(uv_loop_t* loop, 2599 uv__io_t* dummy, 2600 unsigned int events) { 2601 const struct inotify_event* e; 2602 struct watcher_list* w; 2603 uv_fs_event_t* h; 2604 struct uv__queue queue; 2605 struct uv__queue* q; 2606 const char* path; 2607 ssize_t size; 2608 const char *p; 2609 /* needs to be large enough for sizeof(inotify_event) + strlen(path) */ 2610 char buf[4096]; 2611 2612 for (;;) { 2613 do 2614 size = read(loop->inotify_fd, buf, sizeof(buf)); 2615 while (size == -1 && errno == EINTR); 2616 2617 if (size == -1) { 2618 assert(errno == EAGAIN || errno == EWOULDBLOCK); 2619 break; 2620 } 2621 2622 assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */ 2623 2624 /* Now we have one or more inotify_event structs. */ 2625 for (p = buf; p < buf + size; p += sizeof(*e) + e->len) { 2626 e = (const struct inotify_event*) p; 2627 2628 events = 0; 2629 if (e->mask & (IN_ATTRIB|IN_MODIFY)) 2630 events |= UV_CHANGE; 2631 if (e->mask & ~(IN_ATTRIB|IN_MODIFY)) 2632 events |= UV_RENAME; 2633 2634 w = find_watcher(loop, e->wd); 2635 if (w == NULL) 2636 continue; /* Stale event, no watchers left. */ 2637 2638 /* inotify does not return the filename when monitoring a single file 2639 * for modifications. Repurpose the filename for API compatibility. 2640 * I'm not convinced this is a good thing, maybe it should go. 2641 */ 2642 path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path); 2643 2644 /* We're about to iterate over the queue and call user's callbacks. 2645 * What can go wrong? 2646 * A callback could call uv_fs_event_stop() 2647 * and the queue can change under our feet. 
2648 * So, we use uv__queue_move() trick to safely iterate over the queue. 2649 * And we don't free the watcher_list until we're done iterating. 2650 * 2651 * First, 2652 * tell uv_fs_event_stop() (that could be called from a user's callback) 2653 * not to free watcher_list. 2654 */ 2655 w->iterating = 1; 2656 uv__queue_move(&w->watchers, &queue); 2657 while (!uv__queue_empty(&queue)) { 2658 q = uv__queue_head(&queue); 2659 h = uv__queue_data(q, uv_fs_event_t, watchers); 2660 2661 uv__queue_remove(q); 2662 uv__queue_insert_tail(&w->watchers, q); 2663 2664 h->cb(h, path, events, 0); 2665 } 2666 /* done iterating, time to (maybe) free empty watcher_list */ 2667 w->iterating = 0; 2668 maybe_free_watcher_list(w, loop); 2669 } 2670 } 2671 } 2672 2673 2674 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) { 2675 uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT); 2676 return 0; 2677 } 2678 2679 2680 int uv_fs_event_start(uv_fs_event_t* handle, 2681 uv_fs_event_cb cb, 2682 const char* path, 2683 unsigned int flags) { 2684 struct watcher_list* w; 2685 uv_loop_t* loop; 2686 size_t len; 2687 int events; 2688 int err; 2689 int wd; 2690 2691 if (uv__is_active(handle)) 2692 return UV_EINVAL; 2693 2694 loop = handle->loop; 2695 2696 err = init_inotify(loop); 2697 if (err) 2698 return err; 2699 2700 events = IN_ATTRIB 2701 | IN_CREATE 2702 | IN_MODIFY 2703 | IN_DELETE 2704 | IN_DELETE_SELF 2705 | IN_MOVE_SELF 2706 | IN_MOVED_FROM 2707 | IN_MOVED_TO; 2708 2709 wd = inotify_add_watch(loop->inotify_fd, path, events); 2710 if (wd == -1) 2711 return UV__ERR(errno); 2712 2713 w = find_watcher(loop, wd); 2714 if (w) 2715 goto no_insert; 2716 2717 len = strlen(path) + 1; 2718 w = uv__malloc(sizeof(*w) + len); 2719 if (w == NULL) 2720 return UV_ENOMEM; 2721 2722 w->wd = wd; 2723 w->path = memcpy(w + 1, path, len); 2724 uv__queue_init(&w->watchers); 2725 w->iterating = 0; 2726 RB_INSERT(watcher_root, uv__inotify_watchers(loop), w); 2727 2728 no_insert: 2729 
uv__handle_start(handle); 2730 uv__queue_insert_tail(&w->watchers, &handle->watchers); 2731 handle->path = w->path; 2732 handle->cb = cb; 2733 handle->wd = wd; 2734 2735 return 0; 2736 } 2737 2738 2739 int uv_fs_event_stop(uv_fs_event_t* handle) { 2740 struct watcher_list* w; 2741 2742 if (!uv__is_active(handle)) 2743 return 0; 2744 2745 w = find_watcher(handle->loop, handle->wd); 2746 assert(w != NULL); 2747 2748 handle->wd = -1; 2749 handle->path = NULL; 2750 uv__handle_stop(handle); 2751 uv__queue_remove(&handle->watchers); 2752 2753 maybe_free_watcher_list(w, handle->loop); 2754 2755 return 0; 2756 } 2757 2758 2759 void uv__fs_event_close(uv_fs_event_t* handle) { 2760 uv_fs_event_stop(handle); 2761 } 2762