summaryrefslogtreecommitdiff
path: root/src/libnm-systemd-shared/src/basic/fd-util.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libnm-systemd-shared/src/basic/fd-util.c')
-rw-r--r--src/libnm-systemd-shared/src/basic/fd-util.c501
1 files changed, 78 insertions, 423 deletions
diff --git a/src/libnm-systemd-shared/src/basic/fd-util.c b/src/libnm-systemd-shared/src/basic/fd-util.c
index 1a873601b2..6b6457dbc2 100644
--- a/src/libnm-systemd-shared/src/basic/fd-util.c
+++ b/src/libnm-systemd-shared/src/basic/fd-util.c
@@ -2,19 +2,20 @@
#include <errno.h>
#include <fcntl.h>
+#include <linux/btrfs.h>
+#include <linux/magic.h>
+#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>
#include "alloc-util.h"
-#include "copy.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "fs-util.h"
#include "io-util.h"
#include "macro.h"
-#include "memfd-util.h"
#include "missing_fcntl.h"
#include "missing_syscall.h"
#include "parse-util.h"
@@ -207,10 +208,9 @@ static int get_max_fd(void) {
return (int) (m - 1);
}
-int close_all_fds(const int except[], size_t n_except) {
+int close_all_fds_full(int except[], size_t n_except, bool allow_alloc) {
static bool have_close_range = true; /* Assume we live in the future */
_cleanup_closedir_ DIR *d = NULL;
- struct dirent *de;
int r = 0;
assert(n_except == 0 || except);
@@ -226,129 +226,104 @@ int close_all_fds(const int except[], size_t n_except) {
/* Close everything. Yay! */
if (close_range(3, -1, 0) >= 0)
- return 1;
+ return 0;
- if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+ if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
+ have_close_range = false;
+ else
return -errno;
- have_close_range = false;
} else {
- _cleanup_free_ int *sorted_malloc = NULL;
- size_t n_sorted;
- int *sorted;
-
- assert(n_except < SIZE_MAX);
- n_sorted = n_except + 1;
-
- if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */
- sorted = sorted_malloc = new(int, n_sorted);
- else
- sorted = newa(int, n_sorted);
-
- if (sorted) {
- int c = 0;
-
- memcpy(sorted, except, n_except * sizeof(int));
+ typesafe_qsort(except, n_except, cmp_int);
- /* Let's add fd 2 to the list of fds, to simplify the loop below, as this
- * allows us to cover the head of the array the same way as the body */
- sorted[n_sorted-1] = 2;
+ for (size_t i = 0; i < n_except; i++) {
+ int start = i == 0 ? 2 : MAX(except[i-1], 2); /* The first three fds shall always remain open */
+ int end = MAX(except[i], 2);
- typesafe_qsort(sorted, n_sorted, cmp_int);
+ assert(end >= start);
- for (size_t i = 0; i < n_sorted-1; i++) {
- int start, end;
-
- start = MAX(sorted[i], 2); /* The first three fds shall always remain open */
- end = MAX(sorted[i+1], 2);
-
- assert(end >= start);
-
- if (end - start <= 1)
- continue;
-
- /* Close everything between the start and end fds (both of which shall stay open) */
- if (close_range(start + 1, end - 1, 0) < 0) {
- if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
- return -errno;
+ if (end - start <= 1)
+ continue;
+ /* Close everything between the start and end fds (both of which shall stay open) */
+ if (close_range(start + 1, end - 1, 0) < 0) {
+ if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
have_close_range = false;
- break;
- }
-
- c += end - start - 1;
+ else
+ return -errno;
+ goto opendir_fallback;
}
+ }
- if (have_close_range) {
- /* The loop succeeded. Let's now close everything beyond the end */
+ /* The loop succeeded. Let's now close everything beyond the end */
- if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
- return c;
+ if (except[n_except-1] >= INT_MAX) /* Don't let the addition below overflow */
+ return 0;
- if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)
- return c + 1;
+ int start = MAX(except[n_except-1], 2);
- if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
- return -errno;
+ if (close_range(start + 1, -1, 0) >= 0)
+ return 0;
- have_close_range = false;
- }
- }
+ if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
+ have_close_range = false;
+ else
+ return -errno;
}
-
- /* Fallback on OOM or if close_range() is not supported */
}
- d = opendir("/proc/self/fd");
- if (!d) {
- int fd, max_fd;
+ /* Fallback for when close_range() is not supported */
+ opendir_fallback:
+ d = allow_alloc ? opendir("/proc/self/fd") : NULL;
+ if (d) {
+ struct dirent *de;
- /* When /proc isn't available (for example in chroots) the fallback is brute forcing through
- * the fd table */
+ FOREACH_DIRENT(de, d, return -errno) {
+ int fd = -1, q;
- max_fd = get_max_fd();
- if (max_fd < 0)
- return max_fd;
+ if (safe_atoi(de->d_name, &fd) < 0)
+ /* Let's better ignore this, just in case */
+ continue;
- /* Refuse to do the loop over more too many elements. It's better to fail immediately than to
- * spin the CPU for a long time. */
- if (max_fd > MAX_FD_LOOP_LIMIT)
- return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
- "/proc/self/fd is inaccessible. Refusing to loop over %d potential fds.",
- max_fd);
+ if (fd < 3)
+ continue;
- for (fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -1) {
- int q;
+ if (fd == dirfd(d))
+ continue;
if (fd_in_set(fd, except, n_except))
continue;
q = close_nointr(fd);
- if (q < 0 && q != -EBADF && r >= 0)
+ if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
r = q;
}
return r;
}
- FOREACH_DIRENT(de, d, return -errno) {
- int fd = -1, q;
+ /* Fallback for when /proc isn't available (for example in chroots) or when we cannot allocate by
+ * brute-forcing through the file descriptor table. */
- if (safe_atoi(de->d_name, &fd) < 0)
- /* Let's better ignore this, just in case */
- continue;
+ int max_fd = get_max_fd();
+ if (max_fd < 0)
+ return max_fd;
- if (fd < 3)
- continue;
+ /* Refuse to do the loop over more too many elements. It's better to fail immediately than to
+ * spin the CPU for a long time. */
+ if (max_fd > MAX_FD_LOOP_LIMIT)
+ return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
+ "/proc/self/fd is inaccessible. Refusing to loop over %d potential fds.",
+ max_fd);
- if (fd == dirfd(d))
- continue;
+ for (int fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -1) {
+ int q;
if (fd_in_set(fd, except, n_except))
continue;
q = close_nointr(fd);
- if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
+ if (q < 0 && q != -EBADF && r >= 0)
r = q;
}
@@ -520,343 +495,6 @@ int move_fd(int from, int to, int cloexec) {
return to;
}
-int acquire_data_fd(const void *data, size_t size, unsigned flags) {
-
- _cleanup_close_pair_ int pipefds[2] = { -1, -1 };
- char pattern[] = "/dev/shm/data-fd-XXXXXX";
- _cleanup_close_ int fd = -1;
- int isz = 0, r;
- ssize_t n;
- off_t f;
-
- assert(data || size == 0);
-
- /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
- * complex than I wish it was. But here's why:
- *
- * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
- * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
- *
- * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
- * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
- * clients can only bump their size to a system-wide limit, which might be quite low.
- *
- * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
- * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
- * /proc/self/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
- *
- * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
- *
- * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
- * figure. */
-
- if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0)) {
- /* As a special case, return /dev/null if we have been called for an empty data block */
- r = open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY);
- if (r < 0)
- return -errno;
-
- return r;
- }
-
- if ((flags & ACQUIRE_NO_MEMFD) == 0) {
- fd = memfd_new("data-fd");
- if (fd < 0)
- goto try_pipe;
-
- n = write(fd, data, size);
- if (n < 0)
- return -errno;
- if ((size_t) n != size)
- return -EIO;
-
- f = lseek(fd, 0, SEEK_SET);
- if (f != 0)
- return -errno;
-
- r = memfd_set_sealed(fd);
- if (r < 0)
- return r;
-
- return TAKE_FD(fd);
- }
-
-try_pipe:
- if ((flags & ACQUIRE_NO_PIPE) == 0) {
- if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
- return -errno;
-
- isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
- if (isz < 0)
- return -errno;
-
- if ((size_t) isz < size) {
- isz = (int) size;
- if (isz < 0 || (size_t) isz != size)
- return -E2BIG;
-
- /* Try to bump the pipe size */
- (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
-
- /* See if that worked */
- isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
- if (isz < 0)
- return -errno;
-
- if ((size_t) isz < size)
- goto try_dev_shm;
- }
-
- n = write(pipefds[1], data, size);
- if (n < 0)
- return -errno;
- if ((size_t) n != size)
- return -EIO;
-
- (void) fd_nonblock(pipefds[0], false);
-
- return TAKE_FD(pipefds[0]);
- }
-
-try_dev_shm:
- if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
- fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
- if (fd < 0)
- goto try_dev_shm_without_o_tmpfile;
-
- n = write(fd, data, size);
- if (n < 0)
- return -errno;
- if ((size_t) n != size)
- return -EIO;
-
- /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
- return fd_reopen(fd, O_RDONLY|O_CLOEXEC);
- }
-
-try_dev_shm_without_o_tmpfile:
- if ((flags & ACQUIRE_NO_REGULAR) == 0) {
- fd = mkostemp_safe(pattern);
- if (fd < 0)
- return fd;
-
- n = write(fd, data, size);
- if (n < 0) {
- r = -errno;
- goto unlink_and_return;
- }
- if ((size_t) n != size) {
- r = -EIO;
- goto unlink_and_return;
- }
-
- /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
- r = open(pattern, O_RDONLY|O_CLOEXEC);
- if (r < 0)
- r = -errno;
-
- unlink_and_return:
- (void) unlink(pattern);
- return r;
- }
-
- return -EOPNOTSUPP;
-}
-
-/* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */
-#define DATA_FD_MEMORY_LIMIT (64U*1024U)
-
-/* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's large than that use /var/tmp instead. */
-#define DATA_FD_TMP_LIMIT (1024U*1024U)
-
-int fd_duplicate_data_fd(int fd) {
-
- _cleanup_close_ int copy_fd = -1, tmp_fd = -1;
- _cleanup_free_ void *remains = NULL;
- size_t remains_size = 0;
- const char *td;
- struct stat st;
- int r;
-
- /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
- * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
- * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
- * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
- * /var/tmp. */
-
- if (fstat(fd, &st) < 0)
- return -errno;
-
- /* For now, let's only accept regular files, sockets, pipes and char devices */
- if (S_ISDIR(st.st_mode))
- return -EISDIR;
- if (S_ISLNK(st.st_mode))
- return -ELOOP;
- if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode))
- return -EBADFD;
-
- /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
- * that we use the reported regular file size only as a hint, given that there are plenty special files in
- * /proc and /sys which report a zero file size but can be read from. */
-
- if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) {
-
- /* Try a memfd first */
- copy_fd = memfd_new("data-fd");
- if (copy_fd >= 0) {
- off_t f;
-
- r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0);
- if (r < 0)
- return r;
-
- f = lseek(copy_fd, 0, SEEK_SET);
- if (f != 0)
- return -errno;
-
- if (r == 0) {
- /* Did it fit into the limit? If so, we are done. */
- r = memfd_set_sealed(copy_fd);
- if (r < 0)
- return r;
-
- return TAKE_FD(copy_fd);
- }
-
- /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
-
- } else {
- _cleanup_(close_pairp) int pipefds[2] = { -1, -1 };
- int isz;
-
- /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
- * then block indefinitely when we hit the pipe size limit */
-
- if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
- return -errno;
-
- isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
- if (isz < 0)
- return -errno;
-
- /* Try to enlarge the pipe size if necessary */
- if ((size_t) isz < DATA_FD_MEMORY_LIMIT) {
-
- (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT);
-
- isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
- if (isz < 0)
- return -errno;
- }
-
- if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) {
-
- r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL);
- if (r < 0 && r != -EAGAIN)
- return r; /* If we get EAGAIN it could be because of the source or because of
- * the destination fd, we can't know, as sendfile() and friends won't
- * tell us. Hence, treat this as reason to fall back, just to be
- * sure. */
- if (r == 0) {
- /* Everything fit in, yay! */
- (void) fd_nonblock(pipefds[0], false);
-
- return TAKE_FD(pipefds[0]);
- }
-
- /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
- * when writing the new file we incorporate this first. */
- copy_fd = TAKE_FD(pipefds[0]);
- }
- }
- }
-
- /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
- if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) &&
- (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) {
- off_t f;
-
- tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC);
- if (tmp_fd < 0)
- return tmp_fd;
-
- if (copy_fd >= 0) {
- /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
- * temporary file first. */
-
- r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0);
- if (r < 0)
- return r;
-
- assert(r == 0);
- }
-
- if (remains_size > 0) {
- /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
- * failed copy operation, let's flush them out next. */
-
- r = loop_write(tmp_fd, remains, remains_size, false);
- if (r < 0)
- return r;
- }
-
- r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK);
- if (r < 0)
- return r;
- if (r == 0)
- goto finish; /* Yay, it fit in */
-
- /* It didn't fit in. Let's not forget to use what we already used */
- f = lseek(tmp_fd, 0, SEEK_SET);
- if (f != 0)
- return -errno;
-
- CLOSE_AND_REPLACE(copy_fd, tmp_fd);
-
- remains = mfree(remains);
- remains_size = 0;
- }
-
- /* As last fallback use /var/tmp */
- r = var_tmp_dir(&td);
- if (r < 0)
- return r;
-
- tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC);
- if (tmp_fd < 0)
- return tmp_fd;
-
- if (copy_fd >= 0) {
- /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, than copy this
- * into the temporary file first. */
- r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
- if (r < 0)
- return r;
-
- assert(r == 0);
- }
-
- if (remains_size > 0) {
- /* Then, copy in any read but not yet written bytes. */
- r = loop_write(tmp_fd, remains, remains_size, false);
- if (r < 0)
- return r;
- }
-
- /* Copy in the rest */
- r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
- if (r < 0)
- return r;
-
- assert(r == 0);
-
-finish:
- /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
- * file again */
-
- return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC);
-}
-
int fd_move_above_stdio(int fd) {
int flags, copy;
PROTECT_ERRNO;
@@ -1057,3 +695,20 @@ int read_nr_open(void) {
/* If we fail, fall back to the hard-coded kernel limit of 1024 * 1024. */
return 1024 * 1024;
}
+
+/* This is here because it's fd-related and is called from sd-journal code. Other btrfs-related utilities are
+ * in src/shared, but libsystemd must not link to libsystemd-shared, see docs/ARCHITECTURE.md. */
+int btrfs_defrag_fd(int fd) {
+ int r;
+
+ assert(fd >= 0);
+
+ r = fd_verify_regular(fd);
+ if (r < 0)
+ return r;
+
+ if (ioctl(fd, BTRFS_IOC_DEFRAG, NULL) < 0)
+ return -errno;
+
+ return 0;
+}