Projects
home:zhoujc:Mega:24.03
lxc
_service:tar_scm:0006-remove-isulad_cgfsng.patch
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File _service:tar_scm:0006-remove-isulad_cgfsng.patch of Package lxc
From 8db83e2f87cc5377b7a0d3a895d05df37c4abba1 Mon Sep 17 00:00:00 2001 From: zhangxiaoyu <zhangxiaoyu58@huawei.com> Date: Wed, 18 Oct 2023 11:01:26 +0800 Subject: [PATCH 1/2] remove isulad_cgfsng Signed-off-by: zhangxiaoyu <zhangxiaoyu58@huawei.com> --- src/lxc/cgroups/isulad_cgfsng.c | 4137 ------------------------------- 1 file changed, 4137 deletions(-) delete mode 100644 src/lxc/cgroups/isulad_cgfsng.c diff --git a/src/lxc/cgroups/isulad_cgfsng.c b/src/lxc/cgroups/isulad_cgfsng.c deleted file mode 100644 index 1160af5..0000000 --- a/src/lxc/cgroups/isulad_cgfsng.c +++ /dev/null @@ -1,4137 +0,0 @@ -/****************************************************************************** - * Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. - * Author: lifeng - * Create: 2020-11-02 - * Description: provide container definition - * lxc: linux Container library - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - ******************************************************************************/ - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif -#include <ctype.h> -#include <dirent.h> -#include <errno.h> -#include <grp.h> -#include <linux/kdev_t.h> -#include <linux/types.h> -#include <poll.h> -#include <signal.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/epoll.h> -#include <sys/types.h> -#include <unistd.h> - -#include "af_unix.h" -#include "caps.h" -#include "cgroup.h" -#include "cgroup2_devices.h" -#include "cgroup_utils.h" -#include "commands.h" -#include "commands_utils.h" -#include "conf.h" -#include "config.h" -#include "log.h" -#include "macro.h" -#include "mainloop.h" -#include "memory_utils.h" -#include "open_utils.h" -#include "storage/storage.h" -#include "utils.h" - -#if !HAVE_STRLCPY -#include "include/strlcpy.h" -#endif - -#if !HAVE_STRLCAT -#include "include/strlcat.h" -#endif - -#if HAVE_LIBSYSTEMD -#include <systemd/sd-bus.h> -#include <systemd/sd-event.h> -#endif - -lxc_log_define(isulad_cgfsng, cgroup); - -/* - * Given a pointer to a null-terminated array of pointers, realloc to add one - * entry, and point the new entry to NULL. Do not fail. Return the index to the - * second-to-last entry - that is, the one which is now available for use - * (keeping the list null-terminated). - */ -static int cg_list_add(void ***list) -{ - int idx = 0; - void **p; - - if (*list) - for (; (*list)[idx]; idx++) - ; - - p = realloc(*list, (idx + 2) * sizeof(void **)); - if (!p) - return ret_errno(ENOMEM); - - p[idx + 1] = NULL; - *list = p; - - return idx; -} - -/* Given a null-terminated array of strings, check whether @entry is one of the - * strings. - */ -static bool string_in_list(char **list, const char *entry) -{ - if (!list) - return false; - - for (int i = 0; list[i]; i++) - if (strcmp(list[i], entry) == 0) - return true; - - return false; -} - -/* Given a handler's cgroup data, return the struct hierarchy for the controller - * @c, or NULL if there is none. - */ -static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller) -{ - if (!ops->hierarchies) - return log_trace_errno(NULL, errno, "There are no useable cgroup controllers"); - - for (int i = 0; ops->hierarchies[i]; i++) { - if (!controller) { - /* This is the empty unified hierarchy. */ - if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0]) - return ops->hierarchies[i]; - - continue; - } - - /* - * Handle controllers with significant implementation changes - * from cgroup to cgroup2. - */ - if (pure_unified_layout(ops)) { - if (strequal(controller, "devices")) { - if (device_utility_controller(ops->unified)) - return ops->unified; - - break; - } else if (strequal(controller, "freezer")) { - if (freezer_utility_controller(ops->unified)) - return ops->unified; - - break; - } - } - - if (string_in_list(ops->hierarchies[i]->controllers, controller)) - return ops->hierarchies[i]; - } - - if (controller) - WARN("There is no useable %s controller", controller); - else - WARN("There is no empty unified cgroup hierarchy"); - - return ret_set_errno(NULL, ENOENT); -} - -int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit) -{ - int dfd; - const struct hierarchy *h; - - h = get_hierarchy(ops, fd->controller); - if (!h) - return ret_errno(ENOENT); - - /* - * The client requested that the controller must be in a specific - * cgroup version. - */ - if (fd->type != 0 && (cgroupfs_type_magic_t)fd->type != h->fs_type) - return ret_errno(EINVAL); - - if (limit) - dfd = h->dfd_con; - else - dfd = h->dfd_lim; - if (dfd < 0) - return ret_errno(EBADF); - - fd->layout = ops->cgroup_layout; - fd->type = h->fs_type; - if (fd->type == UNIFIED_HIERARCHY) - fd->utilities = h->utilities; - fd->fd = dfd; - - return 0; -} - -#define BATCH_SIZE 50 -static void batch_realloc(char **mem, size_t oldlen, size_t newlen) -{ - int newbatches = (newlen / BATCH_SIZE) + 1; - int oldbatches = (oldlen / BATCH_SIZE) + 1; - - if (!*mem || newbatches > oldbatches) - *mem = must_realloc(*mem, newbatches * BATCH_SIZE); -} - -static void append_line(char **dest, size_t oldlen, char *new, size_t newlen) -{ - size_t full = oldlen + newlen; - - batch_realloc(dest, oldlen, full + 1); - - memcpy(*dest + oldlen, new, newlen + 1); -} - -/* Slurp in a whole file */ -static char *read_file(const char *fnam) -{ - __do_free char *buf = NULL, *line = NULL; - __do_fclose FILE *f = NULL; - size_t len = 0, fulllen = 0; - int linelen; - - f = fopen(fnam, "re"); - if (!f) - return NULL; - - while ((linelen = getline(&line, &len, f)) != -1) { - append_line(&buf, fulllen, line, linelen); - fulllen += linelen; - } - - return move_ptr(buf); -} - -static inline bool is_unified_hierarchy(const struct hierarchy *h) -{ - return h->fs_type == UNIFIED_HIERARCHY; -} - -static char *trim(char *s) -{ - size_t len; - - len = strlen(s); - while ((len > 1) && (s[len - 1] == '\n')) - s[--len] = '\0'; - - return s; -} - -/* Return true if the controller @entry is found in the null-terminated list of - * hierarchies @hlist. - */ -static bool controller_available(struct hierarchy **hlist, char *entry) -{ - if (!hlist) - return false; - - for (int i = 0; hlist[i]; i++) - if (string_in_list(hlist[i]->controllers, entry)) - return true; - - return false; -} - -static bool controllers_available(struct cgroup_ops *ops) -{ - struct hierarchy **hlist; - - if (!ops->cgroup_use) - return true; - - hlist = ops->hierarchies; - for (char **cur = ops->cgroup_use; cur && *cur; cur++) - if (!controller_available(hlist, *cur)) - return log_error(false, "The %s controller found", *cur); - - return true; -} - -static char **list_new(void) -{ - __do_free_string_list char **list = NULL; - int idx; - - idx = cg_list_add((void ***)&list); - if (idx < 0) - return NULL; - - list[idx] = NULL; - return move_ptr(list); -} - -static int list_add_string(char ***list, char *entry) -{ - __do_free char *dup = NULL; - int idx; - - dup = strdup(entry); - if (!dup) - return ret_errno(ENOMEM); - - idx = cg_list_add((void ***)list); - if (idx < 0) - return idx; - - (*list)[idx] = move_ptr(dup); - return 0; -} - -static char **list_add_controllers(char *controllers) -{ - __do_free_string_list char **list = NULL; - char *it; - - lxc_iterate_parts(it, controllers, ", \t\n") { - int ret; - - ret = list_add_string(&list, it); - if (ret < 0) - return NULL; - } - - return move_ptr(list); -} - -static char **unified_controllers(int dfd, const char *file) -{ - __do_free char *buf = NULL; - - buf = read_file_at(dfd, file, PROTECT_OPEN, 0); - if (!buf) - return NULL; - - return list_add_controllers(buf); -} - -static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers) -{ - if (!ops->cgroup_use) - return false; - - for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) { - bool found = false; - - for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) { - if (!strequal(*cur_use, *cur_ctrl)) - continue; - - found = true; - break; - } - - if (found) - continue; - - return true; - } - - return false; -} - -static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt, - int dfd_base, char *base_cgroup, - char **controllers, cgroupfs_type_magic_t fs_type) -{ - __do_free struct hierarchy *new = NULL; - int idx; - - if (abspath(base_cgroup)) - return syserror_set(-EINVAL, "Container base path must be relative to controller mount"); - - new = zalloc(sizeof(*new)); - if (!new) - return ret_errno(ENOMEM); - - new->dfd_con = -EBADF; - new->dfd_lim = -EBADF; - new->dfd_mon = -EBADF; - - new->fs_type = fs_type; - new->controllers = controllers; - new->at_mnt = mnt; - new->at_base = base_cgroup; - - new->dfd_mnt = dfd_mnt; - new->dfd_base = dfd_base; - - TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s", - mnt, maybe_empty(base_cgroup)); - for (char *const *it = new->controllers; it && *it; it++) - TRACE("The hierarchy contains the %s controller", *it); - - idx = cg_list_add((void ***)&ops->hierarchies); - if (idx < 0) - return ret_errno(idx); - - if (fs_type == UNIFIED_HIERARCHY) - ops->unified = new; - (ops->hierarchies)[idx] = move_ptr(new); - - return 0; -} - -struct generic_userns_exec_data { - struct hierarchy **hierarchies; - const char *path_prune; - struct lxc_conf *conf; - uid_t origuid; /* target uid in parent namespace */ - char *path; -}; - -static int isulad_cgroup_tree_remove(struct hierarchy **hierarchies, - const char *container_cgroup) -{ - if (!container_cgroup || !hierarchies) - return 0; - - for (int i = 0; hierarchies[i]; i++) { - struct hierarchy *h = hierarchies[i]; - int ret; - - if (!h->path_con) { - h->path_con = must_make_path(h->at_mnt, h->at_base, container_cgroup, NULL); - } - - ret = lxc_rm_rf(h->path_con); - if (ret < 0) { - if (errno == ENOENT) { - WARN("Destroy path: \"%s\" do not exist", h->path_con); - return 0; - } - SYSERROR("Failed to destroy \"%s\"", h->path_con); - return -1; - } - - free_disarm(h->path_con); - } - - return 0; -} - -static int isulad_cgroup_tree_remove_wrapper(void *data) -{ - struct generic_userns_exec_data *arg = data; - uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; - gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid; - int ret; - - if (!lxc_drop_groups() && errno != EPERM) - return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); - - ret = setresgid(nsgid, nsgid, nsgid); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", - (int)nsgid, (int)nsgid, (int)nsgid); - - ret = setresuid(nsuid, nsuid, nsuid); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", - (int)nsuid, (int)nsuid, (int)nsuid); - - return isulad_cgroup_tree_remove(arg->hierarchies, arg->path_prune); -} - -__cgfsng_ops static bool isulad_cgfsng_payload_destroy(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ - int ret; - - if (!ops) { - ERROR("Called with uninitialized cgroup operations"); - return false; - } - - if (ops->no_controller) { - DEBUG("no controller found, ignore isulad_cgfsng_payload_destroy"); - return true; - } - - if (!ops->hierarchies) { - DEBUG("no hierarchies found, ignore isulad_cgfsng_payload_destroy"); - return true; - } - - if (!handler) { - ERROR("Called with uninitialized handler"); - return false; - } - - if (!handler->conf) { - ERROR("Called with uninitialized conf"); - return false; - } - -#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX - ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices); - if (ret < 0) - WARN("Failed to detach bpf program from cgroup"); -#endif - - if (!list_empty(&handler->conf->id_map) && !handler->am_root) { - struct generic_userns_exec_data wrap = { - .conf = handler->conf, - .path_prune = ops->container_limit_cgroup, - .hierarchies = ops->hierarchies, - .origuid = 0, - }; - ret = userns_exec_1(handler->conf, isulad_cgroup_tree_remove_wrapper, - &wrap, "cgroup_tree_remove_wrapper"); - } else { - ret = isulad_cgroup_tree_remove(ops->hierarchies, ops->container_cgroup); - } - if (ret < 0) { - SYSWARN("Failed to destroy cgroups"); - return false; - } - - return true; -} - -__cgfsng_ops static void isulad_cgfsng_monitor_destroy(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ - return; -} - -#define SYSTEMD_SCOPE_FAILED 2 -#define SYSTEMD_SCOPE_UNSUPP 1 -#define SYSTEMD_SCOPE_SUCCESS 0 - -#if HAVE_LIBSYSTEMD -struct sd_callback_data { - char *scope_name; - bool job_complete; -}; - -static int systemd_jobremoved_callback(sd_bus_message *m, void *userdata, sd_bus_error *error) -{ - char *path, *unit, *result; - struct sd_callback_data *sd_data = userdata; - uint32_t id; - int r; - - r = sd_bus_message_read(m, "uoss", &id, &path, &unit, &result); - if (r < 0) - return log_error(-1, "bad message received in callback: %s", strerror(-r)); - - if (sd_data->scope_name && strcmp(unit, sd_data->scope_name) != 0) - return log_trace(-1, "unit was '%s' not '%s'", unit, sd_data->scope_name); - if (strcmp(result, "done") == 0) { - sd_data->job_complete = true; - return log_info(1, "job is done"); - } - return log_debug(0, "result was '%s', not 'done'", result); -} - -#define DESTINATION "org.freedesktop.systemd1" -#define PATH "/org/freedesktop/systemd1" -#define INTERFACE "org.freedesktop.systemd1.Manager" -#define MEMBER "StartTransientUnit" -static bool start_scope(sd_bus *bus, struct sd_callback_data *data, struct sd_event *event) -{ - __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;; - __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL; - __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL; - char *path = NULL; - int r; - - r = sd_bus_message_new_method_call(bus, &m, - DESTINATION, PATH, INTERFACE, MEMBER); - if (r < 0) - return log_error(false, "Failed creating sdbus message"); - - r = sd_bus_message_append(m, "ss", data->scope_name, "fail"); - if (r < 0) - return log_error(false, "Failed setting systemd scope name"); - - r = sd_bus_message_open_container(m, 'a', "(sv)"); - if (r < 0) - return log_error(false, "Failed allocating sdbus msg properties"); - - r = sd_bus_message_append(m, "(sv)(sv)(sv)", - "PIDs", "au", 1, getpid(), - "Delegate", "b", 1, - "CollectMode", "s", "inactive-or-failed"); - if (r < 0) - return log_error(false, "Failed setting properties on sdbus message"); - - r = sd_bus_message_close_container(m); - if (r < 0) - return log_error(false, "Failed closing sdbus message properties"); - - r = sd_bus_message_append(m, "a(sa(sv))", 0); - if (r < 0) - return log_error(false, "Failed appending aux boilerplate\n"); - - r = sd_bus_call(NULL, m, 0, &error, &reply); - if (r < 0) - return log_error(false, "Failed sending sdbus message: %s", error.message); - - /* Parse the response message */ - r = sd_bus_message_read(reply, "o", &path); - if (r < 0) - return log_error(false, "Failed to parse response message: %s", strerror(-r)); - - /* Now spin up a mini-event-loop to wait for the "job completed" message */ - int tries = 0; - - while (!data->job_complete) { - r = sd_event_run(event, 1000 * 1000); - if (r < 0) { - log_debug(stderr, "Error waiting for JobRemoved: %s\n", strerror(-r)); - continue; - } - if (data->job_complete || tries == 5) - break; - if (r > 0) { - log_trace(stderr, "Debug: we processed an event (%d), but not the one we wanted\n", r); - continue; - } - if (r == 0) // timeout - tries++; - } - if (!data->job_complete) { - return log_error(false, "Error: %s job was never removed", data->scope_name); - } - return true; -} - -static bool string_pure_unified_system(char *contents) -{ - char *p; - bool first_line_read = false; - - lxc_iterate_parts(p, contents, "\n") { - if (first_line_read) // if >1 line, this is not pure unified - return false; - first_line_read = true; - - if (strlen(p) > 3 && strncmp(p, "0:", 2) == 0) - return true; - } - - return false; -} - -/* - * Only call get_current_unified_cgroup() when we are in a pure - * unified (v2-only) cgroup - */ -static char *get_current_unified_cgroup(void) -{ - __do_free char *buf = NULL; - __do_free_string_list char **list = NULL; - char *p; - - buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); - if (!buf) - return NULL; - - if (!string_pure_unified_system(buf)) - return NULL; - - // 0::/user.slice/user-1000.slice/session-136.scope - // Get past the "0::" - p = buf; - if (strnequal(p, "0::", STRLITERALLEN("0::"))) - p += STRLITERALLEN("0::"); - - return strdup(p); -} - -static bool pure_unified_system(void) -{ - __do_free char *buf = NULL; - - buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); - if (!buf) - return false; - - return string_pure_unified_system(buf); -} - -#define MEMBER_JOIN "AttachProcessesToUnit" -static bool enter_scope(char *scope_name, pid_t pid) -{ - __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; - __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;; - __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL; - __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL; - int r; - - r = sd_bus_open_user(&bus); - if (r < 0) - return log_error(false, "Failed to connect to user bus: %s", strerror(-r)); - - r = sd_bus_message_new_method_call(bus, &m, - DESTINATION, PATH, INTERFACE, MEMBER_JOIN); - if (r < 0) - return log_error(false, "Failed creating sdbus message"); - - r = sd_bus_message_append(m, "ssau", scope_name, "/init", 1, pid); - if (r < 0) - return log_error(false, "Failed setting systemd scope name"); - - - r = sd_bus_call(NULL, m, 0, &error, &reply); - if (r < 0) - return log_error(false, "Failed sending sdbus message: %s", error.message); - - return true; -} - -static bool enable_controllers_delegation(int fd_dir, char *cg) -{ - __do_free char *rbuf = NULL; - __do_free char *wbuf = NULL; - __do_free_string_list char **cpulist = NULL; - char *controller; - size_t full_len = 0; - bool first = true; - int ret; - - rbuf = read_file_at(fd_dir, "cgroup.controllers", PROTECT_OPEN, 0); - if (!rbuf) - return false; - - lxc_iterate_parts(controller, rbuf, " ") { - full_len += strlen(controller) + 2; - wbuf = must_realloc(wbuf, full_len + 1); - if (first) { - wbuf[0] = '\0'; - first = false; - } else { - (void)strlcat(wbuf, " ", full_len + 1); - } - strlcat(wbuf, "+", full_len + 1); - strlcat(wbuf, controller, full_len + 1); - } - if (!wbuf) - return log_debug(true, "No controllers to delegate!"); - - ret = lxc_writeat(fd_dir, "cgroup.subtree_control", wbuf, strlen(wbuf)); - if (ret < 0) - return log_error_errno(false, errno, "Failed to write \"%s\" to %s/cgroup.subtree_control", wbuf, cg); - - return true; -} - -/* - * systemd places us in say .../lxc-1.scope. We create lxc-1.scope/init, - * move ourselves to there, then enable controllers in lxc-1.scope - */ -static bool move_and_delegate_unified(char *parent_cgroup) -{ - __do_free char *buf = NULL; - __do_close int fd_parent = -EBADF; - int ret; - - fd_parent = open_at(-EBADF, parent_cgroup, O_DIRECTORY, 0, 0); - if (fd_parent < 0) - return syserror_ret(false, "Failed opening cgroup dir \"%s\"", parent_cgroup); - - ret = mkdirat(fd_parent, "init", 0755); - if (ret < 0 && errno != EEXIST) - return syserror_ret(false, "Failed to create \"%d/init\" cgroup", fd_parent); - - buf = read_file_at(fd_parent, "cgroup.procs", PROTECT_OPEN, 0); - if (!buf) - return false; - - ret = lxc_writeat(fd_parent, "init/cgroup.procs", buf, strlen(buf)); - if (ret) - return syserror_ret(false, "Failed to escape to cgroup \"init/cgroup.procs\""); - - /* enable controllers in parent_cgroup */ - return enable_controllers_delegation(fd_parent, parent_cgroup); -} - -static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf) -{ - __do_free char *full_scope_name = NULL; - __do_free char *fs_cg_path = NULL; - sd_event *event = NULL; - __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; // free the bus before the names it references, just to be sure - struct sd_callback_data sd_data; - int idx = 0; - size_t len; - int r; - - if (geteuid() == 0) - return log_info(SYSTEMD_SCOPE_UNSUPP, "Running privileged, not using a systemd unit"); - // Pure_unified_layout() can't be used as that info is not yet setup. At - // the same time, we don't want to calculate current cgroups until after - // we optionally enter a new systemd user scope. So let's just do a quick - // check for pure unified cgroup system: single line /proc/self/cgroup with - // only index '0:' - if (!pure_unified_system()) - return log_info(SYSTEMD_SCOPE_UNSUPP, "Not in unified layout, not using a systemd unit"); - - r = sd_bus_open_user(&bus); - if (r < 0) - return log_error(SYSTEMD_SCOPE_FAILED, "Failed to connect to user bus: %s", strerror(-r)); - - r = sd_bus_call_method_async(bus, NULL, DESTINATION, PATH, INTERFACE, "Subscribe", NULL, NULL, NULL); - if (r < 0) - return log_error(SYSTEMD_SCOPE_FAILED, "Failed to subscribe to signals: %s", strerror(-r)); - - sd_data.job_complete = false; - sd_data.scope_name = NULL; - r = sd_bus_match_signal(bus, - NULL, // no slot - DESTINATION, PATH, INTERFACE, "JobRemoved", - systemd_jobremoved_callback, &sd_data); - if (r < 0) - return log_error(SYSTEMD_SCOPE_FAILED, "Failed to register systemd event loop signal handler: %s", strerror(-r)); - - // NEXT: create and attach event - r = sd_event_new(&event); - if (r < 0) - return log_error(SYSTEMD_SCOPE_FAILED, "Failed allocating new event: %s\n", strerror(-r)); - r = sd_bus_attach_event(bus, event, SD_EVENT_PRIORITY_NORMAL); - if (r < 0) { - // bus won't clean up event since the attach failed - sd_event_unrefp(&event); - return log_error(SYSTEMD_SCOPE_FAILED, "Failed attaching event: %s\n", strerror(-r)); - } - - // "lxc-" + (conf->name) + "-NN" + ".scope" + '\0' - len = STRLITERALLEN("lxc-") + strlen(conf->name) + 3 + STRLITERALLEN(".scope") + 1; - full_scope_name = malloc(len); - if (!full_scope_name) - return syserror("Out of memory"); - - do { - r = strnprintf(full_scope_name, len, "lxc-%s-%d.scope", conf->name, idx); - if (r < 0) - return log_error_errno(-1, errno, "Failed to build scope name for \"%s\"", conf->name); - sd_data.scope_name = full_scope_name; - if (start_scope(bus, &sd_data, event)) { - conf->cgroup_meta.systemd_scope = get_current_unified_cgroup(); - if (!conf->cgroup_meta.systemd_scope) - return log_trace(SYSTEMD_SCOPE_FAILED, "Out of memory"); - fs_cg_path = must_make_path("/sys/fs/cgroup", conf->cgroup_meta.systemd_scope, NULL); - if (!move_and_delegate_unified(fs_cg_path)) - return log_error(SYSTEMD_SCOPE_FAILED, "Failed delegating the controllers to our cgroup"); - return log_trace(SYSTEMD_SCOPE_SUCCESS, "Created systemd scope %s", full_scope_name); - } - idx++; - } while (idx < 99); - - return SYSTEMD_SCOPE_FAILED; // failed, let's try old-school after all -} -#else /* !HAVE_LIBSYSTEMD */ -static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf) -{ - TRACE("unpriv_systemd_create_scope: no systemd support"); - return SYSTEMD_SCOPE_UNSUPP; // not supported -} -#endif /* HAVE_LIBSYSTEMD */ - -// Return a duplicate of cgroup path @cg without leading /, so -// that caller can own+free it and be certain it's not abspath. -static char *cgroup_relpath(char *cg) -{ - char *p; - - if (!cg || strequal(cg, "/")) - return NULL; - p = strdup(deabs(cg)); - if (!p) - return ERR_PTR(-ENOMEM); - - return p; -} - -__cgfsng_ops static inline bool isulad_cgfsng_monitor_create(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ - return true; -} - -static bool isulad_copy_parent_file(char *path, char *file) -{ - int ret; - int len = 0; - char *value = NULL; - char *current = NULL; - char *fpath = NULL; - char *lastslash = NULL; - char oldv; - - fpath = must_make_path(path, file, NULL); - current = read_file(fpath); - - if (current == NULL) { - SYSERROR("Failed to read file \"%s\"", fpath); - free(fpath); - return false; - } - - if (strcmp(current, "\n") != 0) { - free(fpath); - free(current); - return true; - } - - free(fpath); - free(current); - - lastslash = strrchr(path, '/'); - if (lastslash == NULL) { - ERROR("Failed to detect \"/\" in \"%s\"", path); - return false; - } - oldv = *lastslash; - *lastslash = '\0'; - fpath = must_make_path(path, file, NULL); - *lastslash = oldv; - len = lxc_read_from_file(fpath, NULL, 0); - if (len <= 0) - goto on_error; - - value = must_realloc(NULL, len + 1); - ret = lxc_read_from_file(fpath, value, len); - if (ret != len) - goto on_error; - free(fpath); - - fpath = must_make_path(path, file, NULL); - ret = lxc_write_to_file(fpath, value, len, false, 0666); - if (ret < 0) - SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath); - free(fpath); - free(value); - return ret >= 0; - -on_error: - SYSERROR("Failed to read file \"%s\"", fpath); - free(fpath); - free(value); - return false; -} - -static bool build_sub_cpuset_cgroup_dir(char *cgpath) -{ - int ret; - - ret = mkdir_p(cgpath, 0755); - if (ret < 0) { - if (errno != EEXIST) { - SYSERROR("Failed to create directory \"%s\"", cgpath); - return false; - } - } - - /* copy parent's settings */ - if (!isulad_copy_parent_file(cgpath, "cpuset.cpus")) { - SYSERROR("Failed to copy \"cpuset.cpus\" settings"); - return false; - } - - /* copy parent's settings */ - if (!isulad_copy_parent_file(cgpath, "cpuset.mems")) { - SYSERROR("Failed to copy \"cpuset.mems\" settings"); - return false; - } - - return true; -} - -static bool isulad_cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname) -{ - char *cgpath, *slash; - bool sub_mk_success = false; - - if (is_unified_hierarchy(h)) - return true; - - if (!string_in_list(h->controllers, "cpuset")) - return true; - - cgname += strspn(cgname, "/"); - - slash = strchr(cgname, '/'); - - if (slash != NULL) { - while (slash) { - *slash = '\0'; - cgpath = must_make_path(h->at_mnt, h->at_base, cgname, NULL); - sub_mk_success = build_sub_cpuset_cgroup_dir(cgpath); - free(cgpath); - *slash = '/'; - if (!sub_mk_success) { - return false; - } - slash = strchr(slash + 1, '/'); - } - } - - cgpath = must_make_path(h->at_mnt, h->at_base, cgname, NULL); - sub_mk_success = build_sub_cpuset_cgroup_dir(cgpath); - free(cgpath); - if (!sub_mk_success) { - return false; - } - - return true; -} - -static int isulad_mkdir_eexist_on_last(const char *dir, mode_t mode) -{ - const char *tmp = dir; - const char *orig = dir; - - do { - int ret; - size_t cur_len; - char *makeme; - - dir = tmp + strspn(tmp, "/"); - tmp = dir + strcspn(dir, "/"); - - errno = ENOMEM; - cur_len = dir - orig; - makeme = strndup(orig, cur_len); - if (!makeme) - return -1; - - ret = mkdir(makeme, mode); - if (ret < 0) { - if (errno != EEXIST) { - SYSERROR("Failed to create directory \"%s\"", makeme); - free(makeme); - return -1; - } - } - free(makeme); - - } while (tmp != dir); - - return 0; -} - -static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname, int errfd) -{ - int ret; - __do_free char *path = NULL; - - path = must_make_path(h->at_mnt, h->at_base, cgname, NULL); - - if (file_exists(path)) { // it must not already exist - ERROR("Cgroup path \"%s\" already exist.", path); - lxc_write_error_message(errfd, "%s:%d: Cgroup path \"%s\" already exist.", - __FILE__, __LINE__, path); - return false; - } - - if (!isulad_cg_legacy_handle_cpuset_hierarchy(h, cgname)) { - ERROR("Failed to handle legacy cpuset controller"); - return false; - } - - ret = isulad_mkdir_eexist_on_last(path, 0755); - if (ret < 0) { - ERROR("Failed to create cgroup \"%s\"", path); - return false; - } - - h->dfd_con = lxc_open_dirfd(path); - if (h->dfd_con < 0) - return log_error_errno(false, errno, "Failed to open %s", path); - - if (h->path_con == NULL) { - h->path_con = move_ptr(path); - } - - return true; -} - -/* isulad: create hierarchies path, if fail, return the error */ -__cgfsng_ops static inline bool isulad_cgfsng_payload_create(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ - int i; - - if (!ops) - return ret_set_errno(false, ENOENT); - - char *container_cgroup = ops->container_cgroup; - - if (!ops->hierarchies) - return true; - -#ifdef HAVE_ISULAD - if (ops->no_controller) { - DEBUG("no controller found, isgnore isulad_cgfsng_payload_create"); - return true; - } -#endif - - if (!container_cgroup) { - ERROR("cgfsng_create container_cgroup is invalid"); - return false; - } - - for (i = 0; ops->hierarchies[i]; i++) { - if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup, ops->errfd)) { - SYSERROR("Failed to create %s", ops->hierarchies[i]->path_con); - return false; - } - } - - return true; -} - -__cgfsng_ops static bool isulad_cgfsng_monitor_enter(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ - return true; -} - -__cgfsng_ops static bool isulad_cgfsng_payload_enter(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ - int len; - char pidstr[INTTYPE_TO_STRLEN(pid_t)]; - - if (!ops) - return ret_set_errno(false, ENOENT); - -#ifdef HAVE_ISULAD - if (ops->no_controller) { - DEBUG("no controller found, isgnore isulad_cgfsng_payload_enter"); - return true; - } -#endif - - if (!ops->hierarchies) - return true; - - if (!ops->container_cgroup) - return ret_set_errno(false, ENOENT); - - if (!handler || !handler->conf) - return ret_set_errno(false, EINVAL); - - len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid); - - for (int i = 0; ops->hierarchies[i]; i++) { - int ret; - char *fullpath; - int retry_count = 0; - int max_retry = 10; - - fullpath = must_make_path(ops->hierarchies[i]->path_con, - "cgroup.procs", NULL); -retry: - ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666); - if (ret != 0) { - if (retry_count < max_retry) { - SYSERROR("Failed to enter cgroup \"%s\" with retry count:%d", fullpath, retry_count); - (void)isulad_cg_legacy_handle_cpuset_hierarchy(ops->hierarchies[i], ops->container_cgroup); - (void)isulad_mkdir_eexist_on_last(ops->hierarchies[i]->path_con, 0755); - usleep(100 * 1000); /* 100 millisecond */ - retry_count++; - goto retry; - } - SYSERROR("Failed to enter cgroup \"%s\"", fullpath); - free(fullpath); - return false; - } - free(fullpath); - } - - return true; -} - -static int fchowmodat(int dirfd, const char *path, uid_t chown_uid, - gid_t chown_gid, mode_t chmod_mode) -{ - int ret; - - ret = fchownat(dirfd, path, chown_uid, chown_gid, - AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); - if (ret < 0) - return log_warn_errno(-1, - errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )", - dirfd, path, (int)chown_uid, - (int)chown_gid); - - ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0); - if (ret < 0) - return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)", - dirfd, path, (int)chmod_mode); - - return 0; -} - -/* chgrp the container cgroups to container group. We leave - * the container owner as cgroup owner. So we must make the - * directories 775 so that the container can create sub-cgroups. - * - * Also chown the tasks and cgroup.procs files. Those may not - * exist depending on kernel version. - */ -static int chown_cgroup_wrapper(void *data) -{ - int ret; - uid_t destuid; - struct generic_userns_exec_data *arg = data; - uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; - gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid; - - if (!lxc_drop_groups() && errno != EPERM) - return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); - - ret = setresgid(nsgid, nsgid, nsgid); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", - (int)nsgid, (int)nsgid, (int)nsgid); - - ret = setresuid(nsuid, nsuid, nsuid); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", - (int)nsuid, (int)nsuid, (int)nsuid); - - destuid = get_ns_uid(arg->origuid); - if (destuid == LXC_INVALID_UID) - destuid = 0; - - for (int i = 0; arg->hierarchies[i]; i++) { - int dirfd = arg->hierarchies[i]->dfd_con; - - if (dirfd < 0) - return syserror_set(-EBADF, "Invalid cgroup file descriptor"); - - (void)fchowmodat(dirfd, "", destuid, nsgid, 0775); - - /* - * Failures to chown() these are inconvenient but not - * detrimental We leave these owned by the container launcher, - * so that container root can write to the files to attach. We - * chmod() them 664 so that container systemd can write to the - * files (which systemd in wily insists on doing). - */ - - if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY) - (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664); - - (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664); - - if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY) - continue; - - for (char **p = arg->hierarchies[i]->delegate; p && *p; p++) - (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664); - } - - return 0; -} - -__cgfsng_ops static bool isulad_cgfsng_chown(struct cgroup_ops *ops, - struct lxc_conf *conf) -{ - struct generic_userns_exec_data wrap; - - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!ops->hierarchies) - return true; - - if (!ops->container_cgroup) - return ret_set_errno(false, ENOENT); - - if (!conf) - return ret_set_errno(false, EINVAL); - - if (list_empty(&conf->id_map)) - return true; - - wrap.origuid = geteuid(); - wrap.path = NULL; - wrap.hierarchies = ops->hierarchies; - wrap.conf = conf; - - if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0) - return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace"); - - return true; -} - -__cgfsng_ops static void isulad_cgfsng_finalize(struct cgroup_ops *ops) -{ - if (!ops) - return; - -#ifdef HAVE_ISULAD - if (ops->no_controller) { - DEBUG("no controller found, isgnore isulad_cgfsng_payload_finalize"); - return; - } -#endif - - if (!ops->hierarchies) - return; - - for (int i = 0; ops->hierarchies[i]; i++) { - struct hierarchy *h = ops->hierarchies[i]; - - /* Close all monitor cgroup file descriptors. */ - close_prot_errno_disarm(h->dfd_mon); - } - /* Close the cgroup root file descriptor. */ - close_prot_errno_disarm(ops->dfd_mnt); - - /* - * The checking for freezer support should obviously be done at cgroup - * initialization time but that doesn't work reliable. The freezer - * controller has been demoted (rightly so) to a simple file located in - * each non-root cgroup. At the time when the container is created we - * might still be located in /sys/fs/cgroup and so checking for - * cgroup.freeze won't tell us anything because this file doesn't exist - * in the root cgroup. We could then iterate through /sys/fs/cgroup and - * find an already existing cgroup and then check within that cgroup - * for the existence of cgroup.freeze but that will only work on - * systemd based hosts. Other init systems might not manage cgroups and - * so no cgroup will exist. So we defer until we have created cgroups - * for our container which means we check here. - */ - if (pure_unified_layout(ops) && - !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK, - AT_SYMLINK_NOFOLLOW)) { - TRACE("Unified hierarchy supports freezer"); - ops->unified->utilities |= FREEZER_CONTROLLER; - } -} - -/* cgroup-full:* is done, no need to create subdirs */ -static inline bool cg_mount_needs_subdirs(int type) -{ - return !(type >= LXC_AUTO_CGROUP_FULL_RO); -} - -/* After $rootfs/sys/fs/container/controller/the/cg/path has been created, - * remount controller ro if needed and bindmount the cgroupfs onto - * control/the/cg/path. - */ -static int cg_legacy_mount_controllers(int type, struct hierarchy *h, - char *controllerpath, char *cgpath, - const char *container_cgroup) -{ - __do_free char *sourcepath = NULL; - int ret, remount_flags; - int flags = MS_BIND; - - if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) { - ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"", - controllerpath, controllerpath); - - remount_flags = add_required_remount_flags(controllerpath, - controllerpath, - flags | MS_REMOUNT); - ret = mount(controllerpath, controllerpath, "cgroup", - remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY, - NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath); - - INFO("Remounted %s read-only", controllerpath); - } - - sourcepath = must_make_path(h->at_mnt, h->at_base, - container_cgroup, NULL); - if (type == LXC_AUTO_CGROUP_RO) - flags |= MS_RDONLY; - - ret = mount(sourcepath, cgpath, "cgroup", flags, NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"", - h->controllers[0], cgpath); - INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath); - - if (flags & MS_RDONLY) { - remount_flags = add_required_remount_flags(sourcepath, cgpath, - flags | MS_REMOUNT); - ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath); - INFO("Remounted %s read-only", cgpath); - } - - INFO("Completed second stage cgroup automounts for \"%s\"", cgpath); - return 0; -} - -/* __cgroupfs_mount - * - * Mount cgroup hierarchies directly without using bind-mounts. The main - * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting - * cgroups for the LXC_AUTO_CGROUP_FULL option. - */ -static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, - struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs, - const char *hierarchy_mnt) -{ - __do_close int fd_fs = -EBADF; - unsigned int flags = 0; - char *fstype; - int ret; - - if (dfd_mnt_cgroupfs < 0) - return ret_errno(EINVAL); - - flags |= MOUNT_ATTR_NOSUID; - flags |= MOUNT_ATTR_NOEXEC; - flags |= MOUNT_ATTR_NODEV; - flags |= MOUNT_ATTR_RELATIME; - - if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) || - (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO) || - (cgroup_automount_type == LXC_AUTO_CGROUP2_RO)) - flags |= MOUNT_ATTR_RDONLY; - - if (is_unified_hierarchy(h)) - fstype = "cgroup2"; - else - fstype = "cgroup"; - - if (can_use_mount_api()) { - fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0); - if (fd_fs < 0) - return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype); - - if (!is_unified_hierarchy(h)) { - for (const char **it = (const char **)h->controllers; it && *it; it++) { - if (strnequal(*it, "name=", STRLITERALLEN("name="))) - ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name=")); - else - ret = fs_set_property(fd_fs, *it, ""); - if (ret < 0) - return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs); - } - } - - ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, - flags); - } else { - __do_free char *controllers = NULL, *target = NULL; - unsigned int old_flags = 0; - const char *rootfs_mnt; - - if (!is_unified_hierarchy(h)) { - controllers = lxc_string_join(",", (const char **)h->controllers, false); - if (!controllers) - return ret_errno(ENOMEM); - } - - rootfs_mnt = get_rootfs_mnt(rootfs); - ret = mnt_attributes_old(flags, &old_flags); - if (ret) - return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified"); - - target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL); -#ifdef HAVE_ISULAD - ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt, NULL); -#else - ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt); -#endif - } - if (ret < 0) - return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)", - fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); - - DEBUG("Mounted cgroup filesystem %s onto %d(%s)", - fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); - return 0; -} - -static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, - struct lxc_rootfs *rootfs, - int dfd_mnt_cgroupfs, const char *hierarchy_mnt) -{ - return __cgroupfs_mount(cgroup_automount_type, h, rootfs, - dfd_mnt_cgroupfs, hierarchy_mnt); -} - -static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h, - struct lxc_rootfs *rootfs, - int dfd_mnt_cgroupfs, - const char *hierarchy_mnt) -{ - switch (cgroup_automount_type) { - case LXC_AUTO_CGROUP_FULL_RO: - break; - case LXC_AUTO_CGROUP_FULL_RW: - break; - case LXC_AUTO_CGROUP_FULL_MIXED: - break; - default: - return 0; - } - - return __cgroupfs_mount(cgroup_automount_type, h, rootfs, - dfd_mnt_cgroupfs, hierarchy_mnt); -} - -/* __cg_mount_direct - * - * Mount cgroup hierarchies directly without using bind-mounts. The main - * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting - * cgroups for the LXC_AUTO_CGROUP_FULL option. - */ -static int __cg_mount_direct(int type, struct hierarchy *h, - const char *controllerpath) -{ - __do_free char *controllers = NULL; - char *fstype = "cgroup2"; - unsigned long flags = 0; - int ret; - - flags |= MS_NOSUID; - flags |= MS_NOEXEC; - flags |= MS_NODEV; - flags |= MS_RELATIME; - - if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO) - flags |= MS_RDONLY; - - if (h->fs_type != CGROUP2_SUPER_MAGIC) { - controllers = lxc_string_join(",", (const char **)h->controllers, false); - if (!controllers) - return -ENOMEM; - fstype = "cgroup"; - } - - ret = mount("cgroup", controllerpath, fstype, flags, controllers); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to mount \"%s\" with cgroup filesystem type %s", - controllerpath, fstype); - - DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype); - return 0; -} - -static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h, - const char *controllerpath) -{ - return __cg_mount_direct(type, h, controllerpath); -} - -static inline int cg_mount_cgroup_full(int type, struct hierarchy *h, - const char *controllerpath) -{ - if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) - return 0; - - return __cg_mount_direct(type, h, controllerpath); -} - -__cgfsng_ops static bool isulad_cgfsng_mount(struct cgroup_ops *ops, - struct lxc_handler *handler, int cg_flags) -{ - __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF; - __do_free char *cgroup_root = NULL; - int cgroup_automount_type; - bool in_cgroup_ns = false, wants_force_mount = false; - struct lxc_conf *conf = handler->conf; - struct lxc_rootfs *rootfs = &conf->rootfs; - const char *rootfs_mnt = get_rootfs_mnt(rootfs); - int ret; -#ifdef HAVE_ISULAD - char **merged = NULL; - __do_free char *systemdpath = NULL; - __do_free char *unifiedpath = NULL; -#endif - - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!ops->hierarchies) - return true; - - if (!conf) - return ret_set_errno(false, EINVAL); - - if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0) - return log_trace(true, "No cgroup mounts requested"); - - if (cg_flags & LXC_AUTO_CGROUP_FORCE) { - cg_flags &= ~LXC_AUTO_CGROUP_FORCE; - wants_force_mount = true; - } - - switch (cg_flags) { - case LXC_AUTO_CGROUP_RO: - TRACE("Read-only cgroup mounts requested"); - break; - case LXC_AUTO_CGROUP_RW: - TRACE("Read-write cgroup mounts requested"); - break; - case LXC_AUTO_CGROUP_MIXED: - TRACE("Mixed cgroup mounts requested"); - break; - case LXC_AUTO_CGROUP_FULL_RO: - TRACE("Full read-only cgroup mounts requested"); - break; - case LXC_AUTO_CGROUP_FULL_RW: - TRACE("Full read-write cgroup mounts requested"); - break; - case LXC_AUTO_CGROUP_FULL_MIXED: - TRACE("Full mixed cgroup mounts requested"); - break; - case LXC_AUTO_CGROUP2_RW: - TRACE("Read-write cgroup2 mount requested"); - break; - case LXC_AUTO_CGROUP2_RO: - TRACE("Read-only cgroup2 mount requested"); - break; - default: - return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified"); - } - cgroup_automount_type = cg_flags; - - if (!wants_force_mount) { - wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf); - - /* - * Most recent distro versions currently have init system that - * do support cgroup2 but do not mount it by default unless - * explicitly told so even if the host is cgroup2 only. That - * means they often will fail to boot. Fix this by pre-mounting - * cgroup2 by default. We will likely need to be doing this a - * few years until all distros have switched over to cgroup2 at - * which point we can safely assume that their init systems - * will mount it themselves. - */ - if (pure_unified_layout(ops)) - wants_force_mount = true; - } - - if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP)) - in_cgroup_ns = true; - - if (in_cgroup_ns && !wants_force_mount) - return log_trace(true, "Mounting cgroups not requested or needed"); - - /* This is really the codepath that we want. */ - if (pure_unified_layout(ops) || - (cgroup_automount_type == LXC_AUTO_CGROUP2_RW) || - (cgroup_automount_type == LXC_AUTO_CGROUP2_RO)) { - __do_close int dfd_mnt_unified = -EBADF; - - if (!ops->unified) - return log_error_errno(false, EINVAL, "No unified cgroup hierarchy mounted on the host"); - - dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); - if (dfd_mnt_unified < 0) - return syserror_ret(false, "Failed to open %d(%s)", - rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); - /* - * If cgroup namespaces are supported but the container will - * not have CAP_SYS_ADMIN after it has started we need to mount - * the cgroups manually. - * - * Note that here we know that wants_force_mount is true. - * Otherwise we would've returned early above. - */ - if (in_cgroup_ns) { - /* - * 1. cgroup:rw:force -> Mount the cgroup2 filesystem. - * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only. - * 3. cgroup:mixed:force -> See comment above how this - * does not apply so - * cgroup:mixed is equal to - * cgroup:rw when cgroup - * namespaces are supported. - - * 4. cgroup:rw -> No-op; init system responsible for mounting. - * 5. cgroup:ro -> No-op; init system responsible for mounting. - * 6. cgroup:mixed -> No-op; init system responsible for mounting. - * - * 7. cgroup-full:rw -> Not supported. - * 8. cgroup-full:ro -> Not supported. - * 9. cgroup-full:mixed -> Not supported. - - * 10. cgroup-full:rw:force -> Not supported. - * 11. cgroup-full:ro:force -> Not supported. - * 12. cgroup-full:mixed:force -> Not supported. - * - * 13. cgroup2 -> No-op; init system responsible for mounting. - * 14. cgroup2:ro -> No-op; init system responsible for mounting. - * 15. cgroup2:force -> Mount the cgroup2 filesystem read-write - * 16. cgroup2:ro:force -> Mount the cgroup2 filesystem read-only - */ - ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, ""); - if (ret < 0) - return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace"); - - return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace"); - } else { - /* - * Either no cgroup namespace supported (highly - * unlikely unless we're dealing with a Frankenkernel. - * Or the user requested to keep the cgroup namespace - * of the host or another container. - */ - errno = EOPNOTSUPP; - if (wants_force_mount) - SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); - else - SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); - } - - return syserror_ret(false, "Failed to mount cgroups"); - } - - /* - * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're - * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the - * DEFAULT_CGROUP_MOUNTPOINT define. - */ - if (can_use_mount_api()) { - fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0); - if (fd_fs < 0) - return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs"); - - ret = fs_set_property(fd_fs, "mode", "0755"); - if (ret < 0) - return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - - ret = fs_set_property(fd_fs, "size", "10240k"); - if (ret < 0) - return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); - - ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, - MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | - MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME); - } else { - cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); - ret = safe_mount(NULL, cgroup_root, "tmpfs", - MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, - "size=10240k,mode=755", rootfs_mnt, handler->conf->rootfs.lsm_se_mount_context); - } - if (ret < 0) - return log_error_errno(false, errno, "Failed to mount tmpfs on %s", - DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); - - dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); - if (dfd_mnt_tmpfs < 0) - return syserror_ret(false, "Failed to open %d(%s)", - rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); - - for (int i = 0; ops->hierarchies[i]; i++) { - __do_free char *hierarchy_mnt = NULL, *path2 = NULL; - struct hierarchy *h = ops->hierarchies[i]; - -#ifdef HAVE_ISULAD - // isulad: symlink subcgroup - if (strchr(h->at_mnt, ',') != NULL) { - int pret; - pret = lxc_append_string(&merged, h->at_mnt); - if (pret < 0) - return false; - } -#endif - - ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000); -#ifdef HAVE_ISULAD - if (ret < 0) { - lxc_free_array((void **)merged, free); - return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt); - } -#else - if (ret < 0) - return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt); -#endif - - if (in_cgroup_ns && wants_force_mount) { - /* - * If cgroup namespaces are supported but the container - * will not have CAP_SYS_ADMIN after it has started we - * need to mount the cgroups manually. - */ - ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, - dfd_mnt_tmpfs, h->at_mnt); -#ifdef HAVE_ISULAD - if (ret < 0) { - lxc_free_array((void **)merged, free); - return false; - } -#else - if (ret < 0) - return false; -#endif - continue; - } - - /* Here is where the ancient kernel section begins. */ - ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, - dfd_mnt_tmpfs, h->at_mnt); -#ifdef HAVE_ISULAD - if (ret < 0) { - lxc_free_array((void **)merged, free); - return false; - } -#else - if (ret < 0) - return false; -#endif - - if (!cg_mount_needs_subdirs(cgroup_automount_type)) - continue; - - if (!cgroup_root) - cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); - - hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL); -#ifdef HAVE_ISULAD - // isulad: ignore ops->container_cgroup so we will not see directory lxc after /sys/fs/cgroup/xxx in container, - // isulad: ignore h->container_base_path so we will not see subgroup of /sys/fs/cgroup/xxx/subgroup in container - path2 = must_make_path(h->at_mnt, NULL); -#else - path2 = must_make_path(hierarchy_mnt, h->at_base, - ops->container_cgroup, NULL); -#endif - ret = mkdir_p(path2, 0755); -#ifdef HAVE_ISULAD - if (ret < 0 && (errno != EEXIST)) { - lxc_free_array((void **)merged, free); - return false; - } -#else - if (ret < 0 && (errno != EEXIST)) - return false; -#endif - - ret = cg_legacy_mount_controllers(cgroup_automount_type, h, - hierarchy_mnt, path2, - ops->container_cgroup); -#ifdef HAVE_ISULAD - if (ret < 0) { - lxc_free_array((void **)merged, free); - return false; - } -#else - if (ret < 0) - return false; -#endif - } - -#ifdef HAVE_ISULAD - // isulad: symlink subcgroup - if (merged) { - char **mc = NULL; - for (mc = merged; *mc; mc++) { - char *token = NULL; - char *copy = must_copy_string(*mc); - lxc_iterate_parts(token, copy, ",") { - int mret; - char *link; - link = must_make_path(cgroup_root, token, NULL); - mret = symlink(*mc, link); - if (mret < 0 && errno != EEXIST) { - SYSERROR("Failed to create link %s for target %s", link, *mc); - free(copy); - free(link); - lxc_free_array((void **)merged, free); - return false; - } - free(link); - } - free(copy); - } - } - - // isulad: remount /sys/fs/cgroup to readonly - if (cg_flags == LXC_AUTO_CGROUP_FULL_RO || cg_flags == LXC_AUTO_CGROUP_RO) { - ret = mount(cgroup_root, cgroup_root, "bind", - MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME|MS_RDONLY|MS_BIND|MS_REMOUNT, NULL); - if (ret < 0) { - SYSERROR("Failed to remount /sys/fs/cgroup."); - lxc_free_array((void **)merged, free); - return false; - } - } - - // isulad: remount /sys/fs/cgroup/systemd to readwrite for system container - if (handler->conf->systemd != NULL && strcmp(handler->conf->systemd, "true") == 0) - { - unifiedpath = must_make_path(get_rootfs_mnt(rootfs), "/sys/fs/cgroup/unified", NULL); - if (dir_exists(unifiedpath)) - { - ret = umount2(unifiedpath, MNT_DETACH); - if (ret < 0) - { - SYSERROR("Failed to umount /sys/fs/cgroup/unified."); - lxc_free_array((void **)merged, free); - return false; - } - } - - systemdpath = must_make_path(get_rootfs_mnt(rootfs), "/sys/fs/cgroup/systemd", NULL); - ret = mount(systemdpath, systemdpath, "bind", - MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME | MS_BIND | MS_REMOUNT, NULL); - if (ret < 0) - { - SYSERROR("Failed to remount /sys/fs/cgroup/systemd."); - lxc_free_array((void **)merged, free); - return false; - } - } -#endif - - return true; -} - -/* Only root needs to escape to the cgroup of its init. */ -__cgfsng_ops static bool isulad_cgfsng_criu_escape(const struct cgroup_ops *ops, - struct lxc_conf *conf) -{ - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!ops->hierarchies) - return true; - - if (!conf) - return ret_set_errno(false, EINVAL); - - if (conf->cgroup_meta.relative || geteuid()) - return true; - - for (int i = 0; ops->hierarchies[i]; i++) { - __do_free char *fullpath = NULL; - int ret; - - fullpath = - must_make_path(ops->hierarchies[i]->at_mnt, - ops->hierarchies[i]->at_base, - "cgroup.procs", NULL); - ret = lxc_write_to_file(fullpath, "0", 2, false, 0666); - if (ret != 0) - return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath); - } - - return true; -} - -__cgfsng_ops static int isulad_cgfsng_criu_num_hierarchies(struct cgroup_ops *ops) -{ - int i = 0; - - if (!ops) - return ret_set_errno(-1, ENOENT); - - if (!ops->hierarchies) - return 0; - - for (; ops->hierarchies[i]; i++) - ; - - return i; -} - -__cgfsng_ops static bool isulad_cgfsng_criu_get_hierarchies(struct cgroup_ops *ops, int n, - char ***out) -{ - int i; - - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!ops->hierarchies) - return ret_set_errno(false, ENOENT); - - /* sanity check n */ - for (i = 0; i < n; i++) - if (!ops->hierarchies[i]) - return ret_set_errno(false, ENOENT); - - *out = ops->hierarchies[i]->controllers; - - return true; -} - -static bool cg_legacy_freeze(struct cgroup_ops *ops) -{ - struct hierarchy *h; - - h = get_hierarchy(ops, "freezer"); - if (!h) - return ret_set_errno(-1, ENOENT); - - return lxc_write_openat(h->path_con, "freezer.state", - "FROZEN", STRLITERALLEN("FROZEN")); -} - -static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata, - struct lxc_async_descr *descr) -{ - __do_close int duped_fd = -EBADF; - __do_free char *line = NULL; - __do_fclose FILE *f = NULL; - int state = PTR_TO_INT(cbdata); - size_t len; - const char *state_string; - - duped_fd = dup(fd); - if (duped_fd < 0) - return LXC_MAINLOOP_ERROR; - - if (lseek(duped_fd, 0, SEEK_SET) < (off_t)-1) - return LXC_MAINLOOP_ERROR; - - f = fdopen(duped_fd, "re"); - if (!f) - return LXC_MAINLOOP_ERROR; - move_fd(duped_fd); - - if (state == 1) - state_string = "frozen 1"; - else - state_string = "frozen 0"; - - while (getline(&line, &len, f) != -1) - if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0) - return LXC_MAINLOOP_CLOSE; - - return LXC_MAINLOOP_CONTINUE; -} - -static int cg_unified_freeze(struct cgroup_ops *ops, int timeout) -{ - __do_close int fd = -EBADF; - call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL; - int ret; - struct lxc_async_descr descr; - struct hierarchy *h; - - h = ops->unified; - if (!h) - return ret_set_errno(-1, ENOENT); - - if (!h->path_con) - return ret_set_errno(-1, EEXIST); - - if (timeout != 0) { - __do_free char *events_file = NULL; - - events_file = must_make_path(h->path_con, "cgroup.events", NULL); - fd = open(events_file, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return log_error_errno(-1, errno, "Failed to open cgroup.events file"); - - ret = lxc_mainloop_open(&descr); - if (ret) - return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze"); - - /* automatically cleaned up now */ - descr_ptr = &descr; - - ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, default_cleanup_handler, - INT_TO_PTR((int){1}), "freezer_cgroup_events"); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); - } - - ret = lxc_write_openat(h->path_con, "cgroup.freeze", "1", 1); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); - - if (timeout != 0 && lxc_mainloop(&descr, timeout)) - return log_error_errno(-1, errno, "Failed to wait for container to be frozen"); - - return 0; -} - -__cgfsng_ops static int isulad_cgfsng_freeze(struct cgroup_ops *ops, int timeout) -{ - if (!ops->hierarchies) - return ret_set_errno(-1, ENOENT); - - if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) - return cg_legacy_freeze(ops); - - return cg_unified_freeze(ops, timeout); -} - -static int cg_legacy_unfreeze(struct cgroup_ops *ops) -{ - struct hierarchy *h; - - h = get_hierarchy(ops, "freezer"); - if (!h) - return ret_set_errno(-1, ENOENT); - - return lxc_write_openat(h->path_con, "freezer.state", - "THAWED", STRLITERALLEN("THAWED")); -} - -static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout) -{ - __do_close int fd = -EBADF; - call_cleaner(lxc_mainloop_close)struct lxc_async_descr *descr_ptr = NULL; - int ret; - struct lxc_async_descr descr; - struct hierarchy *h; - - h = ops->unified; - if (!h) - return ret_set_errno(-1, ENOENT); - - if (!h->path_con) - return ret_set_errno(-1, EEXIST); - - if (timeout != 0) { - __do_free char *events_file = NULL; - - events_file = must_make_path(h->path_con, "cgroup.events", NULL); - fd = open(events_file, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return log_error_errno(-1, errno, "Failed to open cgroup.events file"); - - ret = lxc_mainloop_open(&descr); - if (ret) - return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze"); - - /* automatically cleaned up now */ - descr_ptr = &descr; - - ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, default_cleanup_handler, - INT_TO_PTR((int){0}), "freezer_cgroup_events"); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); - } - - ret = lxc_write_openat(h->path_con, "cgroup.freeze", "0", 1); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); - - if (timeout != 0 && lxc_mainloop(&descr, timeout)) - return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen"); - - return 0; -} - -__cgfsng_ops static int isulad_cgfsng_unfreeze(struct cgroup_ops *ops, int timeout) -{ - if (!ops->hierarchies) - return ret_set_errno(-1, ENOENT); - - if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) - return cg_legacy_unfreeze(ops); - - return cg_unified_unfreeze(ops, timeout); -} - -__cgfsng_ops static const char *isulad_cgfsng_get_cgroup(struct cgroup_ops *ops, - const char *controller) -{ - struct hierarchy *h; - - h = get_hierarchy(ops, controller); - if (!h) - return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"", - controller ? controller : "(null)"); - - if (!h->path_con) - h->path_con = must_make_path(h->at_mnt, h->at_base, ops->container_cgroup, NULL); - - return h->path_con - ? h->path_con + strlen(h->at_mnt) - : NULL; -} - -__cgfsng_ops static const char *isulad_cgfsng_get_cgroup_full_path(struct cgroup_ops *ops, - const char *controller) -{ - struct hierarchy *h; - - h = get_hierarchy(ops, controller); - if (!h) - return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"", - controller ? controller : "(null)"); - - if (!h->path_con) - h->path_con = must_make_path(h->at_mnt, h->at_base, ops->container_cgroup, NULL); - - return h->path_con; -} - -/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path, - * which must be freed by the caller. - */ -static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h, - const char *inpath, - const char *filename) -{ - return must_make_path(h->at_mnt, inpath, filename, NULL); -} - -static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid) -{ - int idx = 1; - int ret; - char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; - size_t pidstr_len; - - /* Create leaf cgroup. */ - ret = mkdirat(unified_fd, ".lxc", 0755); - if (ret < 0 && errno != EEXIST) - return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\""); - - pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid); - ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len); - if (ret < 0) - ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len); - if (ret == 0) - return 0; - - /* this is a non-leaf node */ - if (errno != EBUSY) - return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); - - do { - bool rm = false; - char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1]; - char *slash; - - ret = snprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx); - if (ret < 0 || (size_t)ret >= sizeof(attach_cgroup)) - return ret_errno(EIO); - - /* - * This shouldn't really happen but the compiler might complain - * that a short write would cause a buffer overrun. So be on - * the safe side. - */ - if ((size_t)ret < STRLITERALLEN(".lxc-/cgroup.procs")) - return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun"); - - slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs"); - *slash = '\0'; - - ret = mkdirat(unified_fd, attach_cgroup, 0755); - if (ret < 0 && errno != EEXIST) - return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup); - if (ret == 0) - rm = true; - - *slash = '/'; - - ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len); - if (ret == 0) - return 0; - - if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR)) - SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup); - - /* this is a non-leaf node */ - if (errno != EBUSY) - return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); - - idx++; - } while (idx < 1000); - - return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); -} - -static int cgroup_attach_create_leaf(const struct lxc_conf *conf, - int unified_fd, int *sk_fd, bool unprivileged) -{ - __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; - int target_fds[2]; - ssize_t ret; - - /* Create leaf cgroup. */ - ret = mkdirat(unified_fd, ".lxc", 0755); - if (ret < 0 && errno != EEXIST) - return syserror("Failed to create leaf cgroup \".lxc\""); - - if (unprivileged) { - target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); - if (target_fd0 < 0) - return syserror("Failed to open \".lxc/cgroup.procs\""); - target_fds[0] = target_fd0; - - target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); - if (target_fd1 < 0) - return syserror("Failed to open \".lxc/cgroup.procs\""); - target_fds[1] = target_fd1; - - ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0); - if (ret <= 0) - return syserror("Failed to send \".lxc/cgroup.procs\" fds %d and %d", - target_fd0, target_fd1); - - TRACE("Sent cgroup file descriptors %d and %d", target_fd0, target_fd1); - } else { - ret = lxc_abstract_unix_send_credential(sk, NULL, 0); - if (ret < 0) - return syserror("Failed to inform parent that we are done setting up mounts"); - - TRACE("Informed parent process that cgroup has been created"); - } - - return 0; -} - -static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, - const char *lxcpath, - int unified_fd, int *sk_fd, pid_t pid, - bool unprivileged) -{ - __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; - char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; - size_t pidstr_len; -#if HAVE_LIBSYSTEMD - __do_free char *scope = NULL; -#endif - ssize_t ret; - -#if HAVE_LIBSYSTEMD - scope = lxc_cmd_get_systemd_scope(conf->name, lxcpath); - if (scope) { - TRACE("%s:%s is running under systemd-created scope '%s'. Attaching...", lxcpath, conf->name, scope); - if (enter_scope(scope, pid)) - TRACE("Successfully entered scope '%s'", scope); - else - ERROR("Failed entering scope '%s'", scope); - } else { - TRACE("%s:%s is not running under a systemd-created scope", lxcpath, conf->name); - } -#endif - if (unprivileged) { - ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to receive target cgroup fd"); - } else { - ret = lxc_abstract_unix_rcv_credential(sk, NULL, 0); - if (ret < 0) - return syserror("Failed to receive notification from parent process"); - - TRACE("Child process informed us that cgroup has been created"); - - target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); - if (target_fd0 < 0) - return syserror("Failed to open \".lxc/cgroup.procs\""); - - target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); - if (target_fd1 < 0) - return syserror("Failed to open \".lxc/cgroup.procs\""); - - TRACE("Opened target cgroup file descriptors %d and %d", target_fd0, target_fd1); - } - - pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid); - - ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len); - if (ret > 0 && (size_t)ret == pidstr_len) - return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0); - - ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len); - if (ret > 0 && (size_t)ret == pidstr_len) - return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1); - - return syserror("Failed to move process into target cgroup via fd %d and %d", target_fd0, target_fd1); -} - -struct userns_exec_unified_attach_data { - const struct lxc_conf *conf; - const char *lxcpath; - int unified_fd; - int sk_pair[2]; - pid_t pid; - bool unprivileged; -}; - -static int cgroup_unified_attach_child_wrapper(void *data) -{ - struct userns_exec_unified_attach_data *args = data; - - if (!args->conf || !args->lxcpath || args->unified_fd < 0 || - args->pid <= 0 || args->sk_pair[0] < 0 || args->sk_pair[1] < 0) - return ret_errno(EINVAL); - - close_prot_errno_disarm(args->sk_pair[0]); - return cgroup_attach_create_leaf(args->conf, args->unified_fd, - &args->sk_pair[1], args->unprivileged); -} - -static int cgroup_unified_attach_parent_wrapper(void *data) -{ - struct userns_exec_unified_attach_data *args = data; - - if (!args->conf || args->unified_fd < 0 || args->pid <= 0 || - args->sk_pair[0] < 0 || args->sk_pair[1] < 0) - return ret_errno(EINVAL); - - close_prot_errno_disarm(args->sk_pair[1]); - return cgroup_attach_move_into_leaf(args->conf, args->lxcpath, - args->unified_fd, - &args->sk_pair[0], args->pid, - args->unprivileged); -} - -/* Technically, we're always at a delegation boundary here (This is especially - * true when cgroup namespaces are available.). The reasoning is that in order - * for us to have been able to start a container in the first place the root - * cgroup must have been a leaf node. Now, either the container's init system - * has populated the cgroup and kept it as a leaf node or it has created - * subtrees. In the former case we will simply attach to the leaf node we - * created when we started the container in the latter case we create our own - * cgroup for the attaching process. - */ -static int __cg_unified_attach(const struct hierarchy *h, - const struct lxc_conf *conf, const char *name, - const char *lxcpath, pid_t pid, - const char *controller) -{ - __do_close int unified_fd = -EBADF; - __do_free char *path = NULL, *cgroup = NULL; - int ret; - - if (!conf || !name || !lxcpath || pid <= 0) - return ret_errno(EINVAL); - - ret = cgroup_attach(conf, name, lxcpath, pid); - if (ret == 0) - return log_trace(0, "Attached to unified cgroup via command handler"); - TRACE("__cg_unified_attach: cgroup_attach returned %d", ret); - if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2) - return log_error_errno(ret, errno, "Failed to attach to unified cgroup"); - - /* Fall back to retrieving the path for the unified cgroup. */ - cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller); - /* not running */ - if (!cgroup) - return 0; - TRACE("lxc_cmd_get_cgroup_path returned %s", cgroup); - - path = make_cgroup_path(h, cgroup, NULL); - - unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC); - if (unified_fd < 0) - return ret_errno(EBADF); - - if (!list_empty(&conf->id_map)) { - struct userns_exec_unified_attach_data args = { - .conf = conf, - .unified_fd = unified_fd, - .pid = pid, - .unprivileged = am_guest_unpriv(), - .lxcpath = lxcpath, - }; - - ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); - if (ret < 0) - return -errno; - - ret = userns_exec_minimal(conf, - cgroup_unified_attach_parent_wrapper, - &args, - cgroup_unified_attach_child_wrapper, - &args); - } else { - ret = cgroup_attach_leaf(conf, unified_fd, pid); - } - - return ret; -} - -__cgfsng_ops static bool isulad_cgfsng_attach(struct cgroup_ops *ops, - const struct lxc_conf *conf, - const char *name, const char *lxcpath, - pid_t pid) -{ - int len, ret; - char pidstr[INTTYPE_TO_STRLEN(pid_t)]; - - if (!ops) - return ret_set_errno(false, ENOENT); - -#ifdef HAVE_ISULAD - if (ops->no_controller) { - DEBUG("no controller found, isgnore isulad_cgfsng_attach"); - return true; - } -#endif - - if (!ops->hierarchies) - return true; - - len = snprintf(pidstr, sizeof(pidstr), "%d", pid); - if (len < 0 || (size_t)len >= sizeof(pidstr)) - return false; - - for (int i = 0; ops->hierarchies[i]; i++) { - __do_free char *fullpath = NULL, *path = NULL; - struct hierarchy *h = ops->hierarchies[i]; - - if (h->fs_type == CGROUP2_SUPER_MAGIC) { - ret = __cg_unified_attach(h, conf, name, lxcpath, pid, - h->controllers[0]); - if (ret < 0) - return false; - - continue; - } - - path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]); - /* not running */ - if (!path) - return false; - - fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs"); - ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666); - if (ret < 0) - return log_error_errno(false, errno, "Failed to attach %d to %s", - (int)pid, fullpath); - } - - return true; -} - -__cgfsng_ops static int isulad_cgfsng_get(struct cgroup_ops *ops, const char *filename, - char *value, size_t len, const char *name, - const char *lxcpath) -{ - int ret = -1; - size_t controller_len; - char *controller, *p, *path; - struct hierarchy *h; - - controller_len = strlen(filename); - controller = alloca(controller_len + 1); - (void)strlcpy(controller, filename, controller_len + 1); - - p = strchr(controller, '.'); - if (p) - *p = '\0'; - - const char *ori_path = ops->get_cgroup(ops, controller); - if (ori_path == NULL) { - ERROR("Failed to get cgroup path:%s", controller); - return -1; - } - path = safe_strdup(ori_path); - - h = get_hierarchy(ops, controller); - if (h) { - char *fullpath; - - fullpath = build_full_cgpath_from_monitorpath(h, path, filename); - ret = lxc_read_from_file(fullpath, value, len); - free(fullpath); - } - free(path); - - return ret; -} - -static int device_cgroup_parse_access(struct device_item *device, const char *val) -{ - for (int count = 0; count < 3; count++, val++) { - switch (*val) { - case 'r': - device->access[count] = *val; - break; - case 'w': - device->access[count] = *val; - break; - case 'm': - device->access[count] = *val; - break; - case '\n': - case '\0': - count = 3; - break; - default: - return ret_errno(EINVAL); - } - } - - return 0; -} - -static int device_cgroup_rule_parse(struct device_item *device, const char *key, - const char *val) -{ - size_t count; - int ret; - char temp[50]; - - if (strequal("devices.allow", key)) - device->allow = 1; /* allow the device */ - else - device->allow = 0; /* deny the device */ - - if (strequal(val, "a")) { - /* global rule */ - device->type = 'a'; - device->major = -1; - device->minor = -1; - return 0; - } - - switch (*val) { - case 'a': - __fallthrough; - case 'b': - __fallthrough; - case 'c': - device->type = *val; - break; - default: - return -1; - } - - val++; - if (!isspace(*val)) - return -1; - val++; - if (*val == '*') { - device->major = -1; - val++; - } else if (isdigit(*val)) { - memset(temp, 0, sizeof(temp)); - for (count = 0; count < sizeof(temp) - 1; count++) { - temp[count] = *val; - val++; - if (!isdigit(*val)) - break; - } - ret = lxc_safe_int(temp, &device->major); - if (ret) - return -1; - } else { - return -1; - } - if (*val != ':') - return -1; - val++; - - /* read minor */ - if (*val == '*') { - device->minor = -1; - val++; - } else if (isdigit(*val)) { - memset(temp, 0, sizeof(temp)); - for (count = 0; count < sizeof(temp) - 1; count++) { - temp[count] = *val; - val++; - if (!isdigit(*val)) - break; - } - ret = lxc_safe_int(temp, &device->minor); - if (ret) - return -1; - } else { - return -1; - } - if (!isspace(*val)) - return -1; - - return device_cgroup_parse_access(device, ++val); -} - -__cgfsng_ops static int isulad_cgfsng_set(struct cgroup_ops *ops, - const char *filename, const char *value, - const char *name, const char *lxcpath) -{ - int ret = -1; - size_t controller_len; - char *controller, *p, *path; - struct hierarchy *h; - - controller_len = strlen(filename); - controller = alloca(controller_len + 1); - (void)strlcpy(controller, filename, controller_len + 1); - - p = strchr(controller, '.'); - if (p) - *p = '\0'; - - const char *ori_path = ops->get_cgroup(ops, controller); - if (ori_path == NULL) { - ERROR("Failed to get cgroup path:%s", controller); - return -1; - } - path = safe_strdup(ori_path); - - h = get_hierarchy(ops, controller); - if (h) { - char *fullpath; - fullpath = build_full_cgpath_from_monitorpath(h, path, filename); - - if (strcmp(filename, "io.weight") == 0 || strcmp(filename, "io.bfq.weight") == 0) { - if (!file_exists(fullpath)) { - free(path); - free(fullpath); - return 0; - } - } - - ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); - free(fullpath); - } - free(path); - - return ret; -} - -/* take devices cgroup line - * /dev/foo rwx - * and convert it to a valid - * type major:minor mode - * line. Return <0 on error. Dest is a preallocated buffer long enough to hold - * the output. - */ -static int device_cgroup_rule_parse_devpath(struct device_item *device, - const char *devpath) -{ - __do_free char *path = NULL; - char *mode = NULL; - int n_parts, ret; - char *p; - struct stat sb; - - path = strdup(devpath); - if (!path) - return ret_errno(ENOMEM); - - /* - * Read path followed by mode. Ignore any trailing text. - * A ' # comment' would be legal. Technically other text is not - * legal, we could check for that if we cared to. - */ - for (n_parts = 1, p = path; *p; p++) { - if (*p != ' ') - continue; - *p = '\0'; - - if (n_parts != 1) - break; - p++; - n_parts++; - - while (*p == ' ') - p++; - - mode = p; - - if (*p == '\0') - return ret_set_errno(-1, EINVAL); - } - - if (device_cgroup_parse_access(device, mode) < 0) - return -1; - - ret = stat(path, &sb); - if (ret < 0) - return ret_set_errno(-1, errno); - - mode_t m = sb.st_mode & S_IFMT; - switch (m) { - case S_IFBLK: - device->type = 'b'; - break; - case S_IFCHR: - device->type = 'c'; - break; - default: - return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path); - } - - device->major = MAJOR(sb.st_rdev); - device->minor = MINOR(sb.st_rdev); - device->allow = 1; - - return 0; -} - -static int convert_devpath(const char *invalue, char *dest) -{ - struct device_item device = {0}; - int ret; - - ret = device_cgroup_rule_parse_devpath(&device, invalue); - if (ret < 0) - return -1; - - ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major, - device.minor, device.access); - if (ret < 0 || ret >= 50) - return log_error_errno(-1, ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)", - device.type, device.major, device.minor, device.access); - - return 0; -} - -/* Called from setup_limits - here we have the container's cgroup_data because - * we created the cgroups. - */ -static int isulad_cg_legacy_get_data(struct cgroup_ops *ops, const char *filename, - char *value, size_t len) -{ - char *fullpath = NULL; - char *p = NULL; - struct hierarchy *h = NULL; - int ret = 0; - char *controller = NULL; - - len = strlen(filename); - if (SIZE_MAX - 1 < len) { - errno = EINVAL; - return -1; - } - controller = calloc(1, len + 1); - if (controller == NULL) { - errno = ENOMEM; - return -1; - } - (void)strlcpy(controller, filename, len + 1); - - p = strchr(controller, '.'); - if (p) - *p = '\0'; - - - h = get_hierarchy(ops, controller); - if (!h) { - ERROR("Failed to setup limits for the \"%s\" controller. " - "The controller seems to be unused by \"cgfsng\" cgroup " - "driver or not enabled on the cgroup hierarchy", - controller); - errno = ENOENT; - free(controller); - return -ENOENT; - } - - fullpath = must_make_path(h->path_con, filename, NULL); - ret = lxc_read_from_file(fullpath, value, len); - free(fullpath); - free(controller); - return ret; -} - -static int isulad_cg_legacy_set_data(struct cgroup_ops *ops, const char *filename, - const char *value) -{ - size_t len; - char *fullpath, *p; - /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */ - char converted_value[50]; - struct hierarchy *h; - int ret = 0; - char *controller = NULL; - int retry_count = 0; - int max_retry = 10; - char *container_cgroup = ops->container_cgroup; - - len = strlen(filename); - controller = alloca(len + 1); - (void)strlcpy(controller, filename, len + 1); - - p = strchr(controller, '.'); - if (p) - *p = '\0'; - - if (strcmp("devices.allow", filename) == 0 && value[0] == '/') { - ret = convert_devpath(value, converted_value); - if (ret < 0) - return ret; - value = converted_value; - } - - h = get_hierarchy(ops, controller); - if (!h) { - ERROR("Failed to setup limits for the \"%s\" controller. " - "The controller seems to be unused by \"cgfsng\" cgroup " - "driver or not enabled on the cgroup hierarchy", - controller); - errno = ENOENT; - return -ENOENT; - } - - fullpath = must_make_path(h->path_con, filename, NULL); - -retry: - ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); - if (ret != 0) { - if (retry_count < max_retry) { - SYSERROR("setting cgroup config for ready process caused \"failed to write %s to %s\".", value, fullpath); - (void)isulad_cg_legacy_handle_cpuset_hierarchy(h, container_cgroup); - (void)isulad_mkdir_eexist_on_last(h->path_con, 0755); - usleep(100 * 1000); /* 100 millisecond */ - retry_count++; - goto retry; - } - lxc_write_error_message(ops->errfd, - "%s:%d: setting cgroup config for ready process caused failed to write %s to %s: %s", - __FILE__, __LINE__, value, fullpath, strerror(errno)); - } - free(fullpath); - return ret; -} - -/* - * Return the list of cgroup_settings sorted according to the following rules - * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes - */ -static void sort_cgroup_settings(struct lxc_conf *conf) -{ - LIST_HEAD(memsw_list); - struct lxc_cgroup *cgroup, *ncgroup; - - /* Iterate over the cgroup settings and copy them to the output list. */ - list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) { - if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes")) - continue; - - /* Move the memsw entry from the cgroup settings list. */ - list_move_tail(&cgroup->head, &memsw_list); - } - - /* - * Append all the memsw entries to the end of the cgroup settings list - * to make sure they are applied after all memory limit settings. - */ - list_splice_tail(&memsw_list, &conf->cgroup); - -} - -__cgfsng_ops static bool isulad_cgfsng_setup_limits_legacy(struct cgroup_ops *ops, - struct lxc_conf *conf, - bool do_devices) -{ - struct list_head *cgroup_settings; - struct lxc_cgroup *cgroup; - char value[21 + 1] = { 0 }; - long long int readvalue, setvalue; - - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!conf) - return ret_set_errno(false, EINVAL); - - cgroup_settings = &conf->cgroup; - if (list_empty(cgroup_settings)) - return true; - - if (!ops->hierarchies) - return ret_set_errno(false, EINVAL); - - if (pure_unified_layout(ops)) - return true; - - sort_cgroup_settings(conf); - list_for_each_entry(cgroup, cgroup_settings, head) { - if (do_devices == strnequal("devices", cgroup->subsystem, 7)) { - const char *cgvalue = cgroup->value; - if (strcmp(cgroup->subsystem, "files.limit") == 0) { - if (lxc_safe_long_long(cgvalue, &setvalue) != 0) { - SYSERROR("Invalid integer value %s", cgvalue); - return false; - } - if (setvalue <= 0) { - cgvalue = "max"; - } - } - if (isulad_cg_legacy_set_data(ops, cgroup->subsystem, cgvalue)) { - if (do_devices && (errno == EACCES || errno == EPERM)) { - SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgvalue); - continue; - } - SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgvalue); - return false; - } - DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgvalue); - } - - // isulad: check cpu shares - if (strcmp(cgroup->subsystem, "cpu.shares") == 0) { - if (isulad_cg_legacy_get_data(ops, cgroup->subsystem, value, sizeof(value) - 1) < 0) { - SYSERROR("Error get %s", cgroup->subsystem); - return false; - } - trim(value); - if (lxc_safe_long_long(cgroup->value, &setvalue) != 0) { - SYSERROR("Invalid value %s", cgroup->value); - return false; - } - if (lxc_safe_long_long(value, &readvalue) != 0) { - SYSERROR("Invalid value %s", value); - return false; - } - if (setvalue > readvalue) { - ERROR("The maximum allowed cpu-shares is %s", value); - lxc_write_error_message(ops->errfd, - "%s:%d: setting cgroup config for ready process caused \"The maximum allowed cpu-shares is %s\".", - __FILE__, __LINE__, value); - return false; - } else if (setvalue < readvalue) { - ERROR("The minimum allowed cpu-shares is %s", value); - lxc_write_error_message(ops->errfd, - "%s:%d: setting cgroup config for ready process caused \"The minimum allowed cpu-shares is %s\".", - __FILE__, __LINE__, value); - return false; - } - } - } - - INFO("Limits for the legacy cgroup hierarchies have been setup"); - return true; -} - -/* - * Some of the parsing logic comes from the original cgroup device v1 - * implementation in the kernel. - */ -static int bpf_device_cgroup_prepare(struct cgroup_ops *ops, - struct lxc_conf *conf, const char *key, - const char *val) -{ - struct device_item device_item = {}; - int ret; - - if (strequal("devices.allow", key) && abspath(val)) - ret = device_cgroup_rule_parse_devpath(&device_item, val); - else - ret = device_cgroup_rule_parse(&device_item, key, val); - if (ret < 0) - return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val); - - /* - * Note that bpf_list_add_device() returns 1 if it altered the device - * list and 0 if it didn't; both return values indicate success. - * Only a negative return value indicates an error. - */ - ret = bpf_list_add_device(&conf->bpf_devices, &device_item); - if (ret < 0) - return -1; - - return 0; -} -__cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ - __do_free char *path = NULL; - struct list_head *cgroup_settings; - struct hierarchy *h; - struct lxc_conf *conf; - struct lxc_cgroup *cg; - - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!ops->hierarchies) - return true; - - if (!ops->container_cgroup) - return ret_set_errno(false, EINVAL); - - if (!handler || !handler->conf) - return ret_set_errno(false, EINVAL); - conf = handler->conf; - - if (list_empty(&conf->cgroup2)) - return true; - cgroup_settings = &conf->cgroup2; - - if (!pure_unified_layout(ops)) - return true; - - if (!ops->unified) - return false; - h = ops->unified; - - list_for_each_entry(cg, cgroup_settings, head) { - int ret; - - if (strncmp("devices", cg->subsystem, 7) == 0) { - ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, - cg->value); - } else if (strcmp(cg->subsystem, "files.limit") == 0) { - long long int setvalue = 0; - const char *cgvalue = cg->value; - - if (lxc_safe_long_long(cgvalue, &setvalue) != 0) - return log_error(false, "Invalid integer value %s", cgvalue); - - if (setvalue <= 0) - cgvalue = "max"; - - ret = lxc_write_openat(h->path_con, - cg->subsystem, cgvalue, - strlen(cgvalue)); - if (ret < 0) - return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", - cg->subsystem, cgvalue); - } else { - if (strcmp(cg->subsystem, "io.weight") == 0 || strcmp(cg->subsystem, "io.bfq.weight") == 0) { - path = must_make_path(h->path_con, cg->subsystem, NULL); - if (!file_exists(path)) { - continue; - } - } - ret = lxc_write_openat(h->path_con, - cg->subsystem, cg->value, - strlen(cg->value)); - if (ret < 0) - return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", - cg->subsystem, cg->value); - } - TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value); - } - - return log_info(true, "Limits for the unified cgroup hierarchy have been setup"); -} - -__cgfsng_ops bool isulad_cgfsng_devices_activate(struct cgroup_ops *ops, - struct lxc_handler *handler) -{ -#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX - __do_bpf_program_free struct bpf_program *devices = NULL; - int ret; - struct lxc_conf *conf; - struct hierarchy *unified; - struct lxc_list *it; - struct bpf_program *devices_old; - - if (!ops) - return ret_set_errno(false, ENOENT); - - if (!ops->hierarchies) - return true; - - if (!ops->container_cgroup) - return ret_set_errno(false, EEXIST); - - if (!handler || !handler->conf) - return ret_set_errno(false, EINVAL); - conf = handler->conf; - - unified = ops->unified; - if (!unified || !unified->bpf_device_controller || - !unified->path_con || lxc_list_empty(&conf->devices)) - return true; - - devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE); - if (!devices) - return log_error_errno(false, ENOMEM, "Failed to create new bpf program"); - - ret = bpf_program_init(devices); - if (ret) - return log_error_errno(false, ENOMEM, "Failed to initialize bpf program"); - - lxc_list_for_each(it, &conf->devices) { - struct device_item *cur = it->elem; - - ret = bpf_program_append_device(devices, cur); - if (ret) - return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d", - cur->type, - cur->major, - cur->minor, - cur->access, - cur->allow, - cur->global_rule); - TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d", - cur->type, - cur->major, - cur->minor, - cur->access, - cur->allow, - cur->global_rule); - } - - ret = bpf_program_finalize(devices); - if (ret) - return log_error_errno(false, ENOMEM, "Failed to finalize bpf program"); - - ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE, - unified->path_con, - BPF_F_ALLOW_MULTI); - if (ret) - return log_error_errno(false, ENOMEM, "Failed to attach bpf program"); - - /* Replace old bpf program. */ - devices_old = move_ptr(conf->cgroup2_devices); - conf->cgroup2_devices = move_ptr(devices); - devices = move_ptr(devices_old); -#endif - return true; -} - -bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup) -{ - __do_free char *add_controllers = NULL, *base_path = NULL; - __do_free_string_list char **parts = NULL; - struct hierarchy *unified = ops->unified; - ssize_t parts_len; - char **it; - size_t full_len = 0; - - if (!ops->hierarchies || !pure_unified_layout(ops) || - !unified->controllers[0]) - return true; - - /* For now we simply enable all controllers that we have detected by - * creating a string like "+memory +pids +cpu +io". - * TODO: In the near future we might want to support "-<controller>" - * etc. but whether supporting semantics like this make sense will need - * some thinking. - */ - for (it = unified->controllers; it && *it; it++) { - full_len += strlen(*it) + 2; - add_controllers = must_realloc(add_controllers, full_len + 1); - - if (unified->controllers[0] == *it) - add_controllers[0] = '\0'; - - (void)strlcat(add_controllers, "+", full_len + 1); - (void)strlcat(add_controllers, *it, full_len + 1); - - if (*(it + 1)) - (void)strlcat(add_controllers, " ", full_len + 1); - } - - parts = lxc_string_split(cgroup, '/'); - if (!parts) - return false; - - parts_len = lxc_array_len((void **)parts); - if (parts_len > 0) - parts_len--; - - base_path = must_make_path(unified->at_mnt, unified->at_base, NULL); - for (ssize_t i = -1; i < parts_len; i++) { - int ret; - __do_free char *target = NULL; - - if (i >= 0) - base_path = must_append_path(base_path, parts[i], NULL); - target = must_make_path(base_path, "cgroup.subtree_control", NULL); - ret = lxc_writeat(-1, target, add_controllers, full_len); - if (ret < 0) - return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"", - add_controllers, target); - TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target); - } - - return true; -} - -__cgfsng_ops bool isulad_cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops) -{ - return true; -} - -__cgfsng_ops bool isulad_cgfsng_payload_delegate_controllers(struct cgroup_ops *ops) -{ - if (!ops) - return ret_set_errno(false, ENOENT); - -#ifdef HAVE_ISULAD - if (ops->no_controller) { - DEBUG("no controller found, isgnore isulad_cgfsng_payload_delegate_controllers"); - return true; - } -#endif - - return __cgfsng_delegate_controllers(ops, ops->container_cgroup); -} - -static inline bool unified_cgroup(const char *line) -{ - return *line == '0'; -} - -static inline char *current_unified_cgroup(bool relative, char *line) -{ - char *current_cgroup; - - line += STRLITERALLEN("0::"); - - if (!abspath(line)) - return ERR_PTR(-EINVAL); - - /* remove init.scope */ - if (!relative) - line = prune_init_scope(line); - - /* create a relative path */ - line = deabs(line); - - current_cgroup = strdup(line); - if (!current_cgroup) - return ERR_PTR(-ENOMEM); - - return current_cgroup; -} - -static inline const char *unprefix(const char *controllers) -{ - if (strnequal(controllers, "name=", STRLITERALLEN("name="))) - return controllers + STRLITERALLEN("name="); - return controllers; -} - -static int __list_cgroup_delegate(char ***delegate) -{ - __do_free char **list = NULL; - __do_free char *buf = NULL; - char *standard[] = { - "cgroup.procs", - "cgroup.threads", - "cgroup.subtree_control", - "memory.oom.group", - NULL, - }; - char *token; - int ret; - - buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0); - if (!buf) { - for (char **p = standard; p && *p; p++) { - ret = list_add_string(&list, *p); - if (ret < 0) - return ret; - } - - *delegate = move_ptr(list); - return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate"); - } - - lxc_iterate_parts(token, buf, " \t\n") { - /* - * We always need to chown this for both cgroup and - * cgroup2. - */ - if (strequal(token, "cgroup.procs")) - continue; - - ret = list_add_string(&list, token); - if (ret < 0) - return ret; - } - - *delegate = move_ptr(list); - return 0; -} - -static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files) -{ - __do_free_string_list char **list = NULL; - int ret; - - ret = __list_cgroup_delegate(&list); - if (ret < 0) - return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements"); - - for (char *const *s = list; s && *s; s++) { - if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT) - continue; - - return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s); - } - - *ret_files = move_ptr(list); - return true; -} - -static bool legacy_hierarchy_delegated(int dfd_base) -{ - int ret; - - ret = faccessat(dfd_base, ".", W_OK, 0); - if (ret < 0 && errno != ENOENT) - return sysinfo_ret(false, "Legacy hierarchy not writable, skipping"); - - return true; -} - -/** - * systemd guarantees that the order of co-mounted controllers is stable. On - * some systems the order of the controllers might be reversed though. - * - * For example, this is how the order is mismatched on CentOS 7: - * - * [root@localhost ~]# cat /proc/self/cgroup - * 11:perf_event:/ - * 10:pids:/ - * 9:freezer:/ - * >>>> 8:cpuacct,cpu:/ - * 7:memory:/ - * 6:blkio:/ - * 5:devices:/ - * 4:hugetlb:/ - * >>>> 3:net_prio,net_cls:/ - * 2:cpuset:/ - * 1:name=systemd:/user.slice/user-0.slice/session-c1.scope - * - * whereas the mountpoint: - * - * | |-/sys/fs/cgroup tmpfs tmpfs ro,nosuid,nodev,noexec,mode=755 - * | | |-/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd - * | | |-/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset - * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls - * | | |-/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb - * | | |-/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices - * | | |-/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio - * | | |-/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory - * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu - * | | |-/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer - * | | |-/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids - * | | `-/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event - * - * Ensure that we always use the systemd-guaranteed stable order when checking - * for the mountpoint. - */ -#if HAVE_COMPILER_ATTR_NONNULL -__attribute__((nonnull)) -#endif -#if HAVE_COMPILER_ATTR_RETURNS_NONNULL -__attribute__((returns_nonnull)) -#endif -static const char *stable_order(const char *controllers) -{ - if (strequal(controllers, "cpuacct,cpu")) - return "cpu,cpuacct"; - - if (strequal(controllers, "net_prio,net_cls")) - return "net_cls,net_prio"; - - return unprefix(controllers); -} - -#define CGFSNG_LAYOUT_LEGACY BIT(0) -#define CGFSNG_LAYOUT_UNIFIED BIT(1) - -static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, - bool unprivileged, struct lxc_conf *conf) -{ - __do_free char *cgroup_info = NULL; - unsigned int layout_mask = 0; - int ret; - char *it; - - ret = unpriv_systemd_create_scope(ops, conf); - if (ret < 0) - return ret_set_errno(false, ret); - else if (ret == 0) - TRACE("Entered an unpriv systemd scope"); - - /* - * Root spawned containers escape the current cgroup, so use init's - * cgroups as our base in that case. - */ - if (!relative && (geteuid() == 0)) - cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0); - else - cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); - if (!cgroup_info) - return ret_errno(ENOMEM); - - lxc_iterate_parts(it, cgroup_info, "\n") { - __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF; - __do_free char *controllers = NULL, *current_cgroup = NULL; - __do_free_string_list char **controller_list = NULL, - **delegate = NULL; - char *line; - int dfd, type; - - /* Handle the unified cgroup hierarchy. */ - line = it; - if (unified_cgroup(line)) { - char *unified_mnt; - - type = UNIFIED_HIERARCHY; - layout_mask |= CGFSNG_LAYOUT_UNIFIED; - - if (conf->cgroup_meta.systemd_scope) - current_cgroup = cgroup_relpath(conf->cgroup_meta.systemd_scope); - if (IS_ERR_OR_NULL(current_cgroup)) - current_cgroup = current_unified_cgroup(relative, line); - if (IS_ERR(current_cgroup)) - return PTR_ERR(current_cgroup); - - if (unified_cgroup_fd(ops->dfd_mnt)) { - dfd_mnt = dup_cloexec(ops->dfd_mnt); - unified_mnt = ""; - } else { - dfd_mnt = open_at(ops->dfd_mnt, - "unified", - PROTECT_OPATH_DIRECTORY, - PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); - unified_mnt = "unified"; - } - if (dfd_mnt < 0) { - if (errno != ENOENT) - return syserror("Failed to open %d/unified", ops->dfd_mnt); - - SYSTRACE("Unified cgroup not mounted"); - continue; - } - - if (!fhas_fs_type(dfd_mnt, CGROUP2_SUPER_MAGIC)) { - SYSTRACE("Opened file descriptor %d is not a cgroup2 mountpoint", dfd_mnt); - continue; - } - - dfd = dfd_mnt; - - if (!is_empty_string(current_cgroup)) { - dfd_base = open_at(dfd_mnt, current_cgroup, - PROTECT_OPATH_DIRECTORY, - PROTECT_LOOKUP_BENEATH_XDEV, 0); - if (dfd_base < 0) { - if (errno != ENOENT) - return syserror("Failed to open %d/%s", - dfd_mnt, current_cgroup); - - SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)", - dfd_mnt, current_cgroup); - continue; - } - dfd = dfd_base; - } - - if (!unified_hierarchy_delegated(dfd, &delegate)) - continue; - - controller_list = unified_controllers(dfd, "cgroup.controllers"); - if (!controller_list) { - TRACE("No controllers are enabled for delegation in the unified hierarchy"); - controller_list = list_new(); - if (!controller_list) - return syserror_set(-ENOMEM, "Failed to create empty controller list"); - } - - controllers = strdup(unified_mnt); - if (!controllers) - return ret_errno(ENOMEM); - } else { - char *__controllers, *__current_cgroup; - - type = LEGACY_HIERARCHY; - layout_mask |= CGFSNG_LAYOUT_LEGACY; - - __controllers = strchr(line, ':'); - if (!__controllers) - return ret_errno(EINVAL); - __controllers++; - - __current_cgroup = strchr(__controllers, ':'); - if (!__current_cgroup) - return ret_errno(EINVAL); - *__current_cgroup = '\0'; - __current_cgroup++; - - controllers = strdup(stable_order(__controllers)); - if (!controllers) - return ret_errno(ENOMEM); - - dfd_mnt = open_at(ops->dfd_mnt, - controllers, - PROTECT_OPATH_DIRECTORY, - PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); - if (dfd_mnt < 0) { - if (errno != ENOENT) - return syserror("Failed to open %d/%s", - ops->dfd_mnt, controllers); - - SYSTRACE("%s not mounted", controllers); - continue; - } - - if (!fhas_fs_type(dfd_mnt, CGROUP_SUPER_MAGIC)) { - SYSTRACE("Opened file descriptor %d is not a cgroup mountpoint", dfd_mnt); - continue; - } - - dfd = dfd_mnt; - - if (!abspath(__current_cgroup)) - return ret_errno(EINVAL); - - /* remove init.scope */ - if (!relative) - __current_cgroup = prune_init_scope(__current_cgroup); - - /* create a relative path */ - __current_cgroup = deabs(__current_cgroup); - - current_cgroup = strdup(__current_cgroup); - if (!current_cgroup) - return ret_errno(ENOMEM); - - if (!is_empty_string(current_cgroup)) { - dfd_base = open_at(dfd_mnt, current_cgroup, - PROTECT_OPATH_DIRECTORY, - PROTECT_LOOKUP_BENEATH_XDEV, 0); - if (dfd_base < 0) { - if (errno != ENOENT) - return syserror("Failed to open %d/%s", - dfd_mnt, current_cgroup); - - SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)", - dfd_mnt, current_cgroup); - continue; - } - dfd = dfd_base; - } - - if (!legacy_hierarchy_delegated(dfd)) - continue; - - /* - * We intentionally pass __current_cgroup here and not - * controllers because we would otherwise chop the - * mountpoint. - */ - controller_list = list_add_controllers(__controllers); - if (!controller_list) - return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers); - - if (skip_hierarchy(ops, controller_list)) - continue; - - ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; - } - - ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd, - current_cgroup, controller_list, type); - if (ret < 0) - return syserror_ret(ret, "Failed to add %s hierarchy", controllers); - - /* Transfer ownership. */ - move_fd(dfd_mnt); - move_fd(dfd_base); - move_ptr(current_cgroup); - move_ptr(controllers); - move_ptr(controller_list); - if (type == UNIFIED_HIERARCHY) - ops->unified->delegate = move_ptr(delegate); - } - - /* determine cgroup layout */ - if (ops->unified) { - if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { - ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; - } else { - if (bpf_devices_cgroup_supported()) - ops->unified->utilities |= DEVICES_CONTROLLER; - ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; - } - } - - /* - * If we still don't know the cgroup layout at this point it means we - * have not found any writable cgroup hierarchies. Infer the layout - * from the layout bitmask we created when parsing the cgroups. - * - * Keep the ordering in the switch otherwise the bistmask-based - * matching won't work. - */ - if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) { - switch (layout_mask) { - case (CGFSNG_LAYOUT_LEGACY | CGFSNG_LAYOUT_UNIFIED): - ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; - break; - case CGFSNG_LAYOUT_LEGACY: - ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; - break; - case CGFSNG_LAYOUT_UNIFIED: - ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; - break; - } - } - - if (!controllers_available(ops)) - return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated"); - - return 0; -} - -static int isulad_initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf) -{ - __do_close int dfd = -EBADF; - int ret; - const char *controllers_use; - - if (ops->dfd_mnt >= 0) - return ret_errno(EBUSY); - - /* - * I don't see the need for allowing symlinks here. If users want to - * have their hierarchy available in different locations I strongly - * suggest bind-mounts. - */ - dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT, - PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); - if (dfd < 0) - return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT); - - controllers_use = lxc_global_config_value("lxc.cgroup.use"); - if (controllers_use) { - __do_free char *dup = NULL; - char *it; - - dup = strdup(controllers_use); - if (!dup) - return -errno; - - lxc_iterate_parts(it, dup, ",") { - ret = list_add_string(&ops->cgroup_use, it); - if (ret < 0) - return ret; - } - } - - /* - * Keep dfd referenced by the cleanup function and actually move the fd - * once we know the initialization succeeded. So if we fail we clean up - * the dfd. - */ - ops->dfd_mnt = dfd; - - ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map), conf); - if (ret < 0) - return syserror_ret(ret, "Failed to initialize cgroups"); - - /* Transfer ownership to cgroup_ops. */ - move_fd(dfd); - return 0; -} - -__cgfsng_ops static int isulad_cgfsng_data_init(struct cgroup_ops *ops, struct lxc_conf *conf) -{ - const char *cgroup_pattern; -#ifdef HAVE_ISULAD - const char *cgroup_tree; - __do_free char *container_cgroup = NULL, *__cgroup_tree = NULL; - size_t len; -#endif - - if (!ops) - return ret_set_errno(-1, ENOENT); - - /* copy system-wide cgroup information */ - cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern"); - if (cgroup_pattern && !strequal(cgroup_pattern, "")) { - ops->cgroup_pattern = strdup(cgroup_pattern); - if (!ops->cgroup_pattern) - return ret_errno(ENOMEM); - } - -#ifdef HAVE_ISULAD - if (conf->cgroup_meta.dir) { - cgroup_tree = conf->cgroup_meta.dir; - container_cgroup = must_concat(&len, cgroup_tree, "/", conf->name, NULL); - } else if (ops->cgroup_pattern) { - __cgroup_tree = lxc_string_replace("%n", conf->name, ops->cgroup_pattern); - if (!__cgroup_tree) - return ret_set_errno(-1, ENOMEM); - - cgroup_tree = __cgroup_tree; - container_cgroup = must_concat(&len, cgroup_tree, NULL); - } else { - cgroup_tree = NULL; - container_cgroup = must_concat(&len, conf->name, NULL); - } - if (!container_cgroup) - return ret_set_errno(-1, ENOMEM); - - ops->container_cgroup = move_ptr(container_cgroup); -#endif - - return 0; -} - -struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) -{ - __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL; - - cgfsng_ops = zalloc(sizeof(struct cgroup_ops)); - if (!cgfsng_ops) - return ret_set_errno(NULL, ENOMEM); - - cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; - cgfsng_ops->dfd_mnt = -EBADF; - - if (isulad_initialize_cgroups(cgfsng_ops, conf)) - return NULL; - - cgfsng_ops->data_init = isulad_cgfsng_data_init; - - cgfsng_ops->errfd = conf ? conf->errpipe[1] : -1; - cgfsng_ops->get_cgroup_full_path = isulad_cgfsng_get_cgroup_full_path; - cgfsng_ops->payload_destroy = isulad_cgfsng_payload_destroy; - cgfsng_ops->monitor_destroy = isulad_cgfsng_monitor_destroy; - cgfsng_ops->monitor_create = isulad_cgfsng_monitor_create; - cgfsng_ops->monitor_enter = isulad_cgfsng_monitor_enter; - cgfsng_ops->monitor_delegate_controllers = isulad_cgfsng_monitor_delegate_controllers; - cgfsng_ops->payload_delegate_controllers = isulad_cgfsng_payload_delegate_controllers; - cgfsng_ops->payload_create = isulad_cgfsng_payload_create; - cgfsng_ops->payload_enter = isulad_cgfsng_payload_enter; - cgfsng_ops->finalize = isulad_cgfsng_finalize; - cgfsng_ops->get_cgroup = isulad_cgfsng_get_cgroup; - cgfsng_ops->get = isulad_cgfsng_get; - cgfsng_ops->set = isulad_cgfsng_set; - cgfsng_ops->freeze = isulad_cgfsng_freeze; - cgfsng_ops->unfreeze = isulad_cgfsng_unfreeze; - cgfsng_ops->setup_limits_legacy = isulad_cgfsng_setup_limits_legacy; - cgfsng_ops->setup_limits = isulad_cgfsng_setup_limits; - cgfsng_ops->driver = "isulad_cgfsng"; - cgfsng_ops->version = "1.0.0"; - cgfsng_ops->attach = isulad_cgfsng_attach; - cgfsng_ops->chown = isulad_cgfsng_chown; - cgfsng_ops->mount = isulad_cgfsng_mount; - cgfsng_ops->devices_activate = isulad_cgfsng_devices_activate; - - cgfsng_ops->criu_escape = isulad_cgfsng_criu_escape; - cgfsng_ops->criu_num_hierarchies = isulad_cgfsng_criu_num_hierarchies; - cgfsng_ops->criu_get_hierarchies = isulad_cgfsng_criu_get_hierarchies; - - return move_ptr(cgfsng_ops); -} - -static int __unified_attach_fd(const struct lxc_conf *conf, const char *lxcpath, int fd_unified, pid_t pid) -{ - int ret; - - if (!list_empty(&conf->id_map)) { - struct userns_exec_unified_attach_data args = { - .conf = conf, - .unified_fd = fd_unified, - .pid = pid, - .unprivileged = am_guest_unpriv(), - .lxcpath = lxcpath, - }; - - ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); - if (ret < 0) - return -errno; - - ret = userns_exec_minimal(conf, - cgroup_unified_attach_parent_wrapper, - &args, - cgroup_unified_attach_child_wrapper, - &args); - } else { - ret = cgroup_attach_leaf(conf, fd_unified, pid); - } - - return ret; -} - -static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name, - const char *lxcpath, pid_t pid) -{ - call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){}; - int ret; - size_t idx; - ssize_t pidstr_len; - char pidstr[INTTYPE_TO_STRLEN(pid_t)]; - - ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx); - if (ret < 0) - return ret_errno(ENOSYS); - - if (ctx->fd_len == 0) - return log_trace(0, "Container runs with unwritable %s cgroup layout", - cgroup_layout_name(ctx->layout)); - - pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid); - if (pidstr_len < 0) - return pidstr_len; - - for (idx = 0; idx < ctx->fd_len; idx++) { - int dfd_con = ctx->fd[idx]; - - if (unified_cgroup_fd(dfd_con)) - ret = __unified_attach_fd(conf, lxcpath, dfd_con, pid); - else - ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len); - if (ret) - return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con); - else - TRACE("Attached to cgroup fd %d", dfd_con); - } - - TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout)); - return 0; -} - -static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name, - const char *lxcpath, pid_t pid) -{ - __do_close int dfd_unified = -EBADF; - - if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0) - return ret_errno(EINVAL); - - dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath); - if (dfd_unified < 0) - return ret_errno(ENOSYS); - - return __unified_attach_fd(conf, lxcpath, dfd_unified, pid); -} - -int cgroup_attach(const struct lxc_conf *conf, const char *name, - const char *lxcpath, pid_t pid) -{ - int ret; - - ret = __cgroup_attach_many(conf, name, lxcpath, pid); - if (ret < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(ret)) - return ret; - - ret = __cgroup_attach_unified(conf, name, lxcpath, pid); - if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret)) - return ret_errno(ENOSYS); - } - - return ret; -} - -/* Connects to command socket therefore isn't callable from command handler. */ -int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len) -{ - __do_close int dfd = -EBADF; - struct cgroup_fd fd = { - .fd = -EBADF, - }; - size_t len_controller; - int ret; - - if (is_empty_string(name) || is_empty_string(lxcpath) || - is_empty_string(key)) - return ret_errno(EINVAL); - - if ((buf && !len) || (len && !buf)) - return ret_errno(EINVAL); - - len_controller = strcspn(key, "."); - len_controller++; /* Don't forget the \0 byte. */ - if (len_controller >= MAX_CGROUP_ROOT_NAMELEN) - return ret_errno(EINVAL); - (void)strlcpy(fd.controller, key, len_controller); - - ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd); - if (ret < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(ret)) - return ret; - - dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); - if (dfd < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(ret)) - return ret; - - return ret_errno(ENOSYS); - } - fd.type = UNIFIED_HIERARCHY; - fd.fd = move_fd(dfd); - } - dfd = move_fd(fd.fd); - - TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type)); - - if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) - return ret_errno(EOPNOTSUPP); - else - ret = lxc_read_try_buf_at(dfd, key, buf, len); - - return ret; -} - -/* Connects to command socket therefore isn't callable from command handler. */ -int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value) -{ - __do_close int dfd = -EBADF; - struct cgroup_fd fd = { - .fd = -EBADF, - }; - size_t len_controller; - int ret; - - if (is_empty_string(name) || is_empty_string(lxcpath) || - is_empty_string(key) || is_empty_string(value)) - return ret_errno(EINVAL); - - len_controller = strcspn(key, "."); - len_controller++; /* Don't forget the \0 byte. */ - if (len_controller >= MAX_CGROUP_ROOT_NAMELEN) - return ret_errno(EINVAL); - (void)strlcpy(fd.controller, key, len_controller); - - ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd); - if (ret < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(ret)) - return ret; - - dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); - if (dfd < 0) { - if (!ERRNO_IS_NOT_SUPPORTED(ret)) - return ret; - - return ret_errno(ENOSYS); - } - fd.type = UNIFIED_HIERARCHY; - fd.fd = move_fd(dfd); - } - dfd = move_fd(fd.fd); - - TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type)); - - if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) { - struct device_item device = {}; - - ret = device_cgroup_rule_parse(&device, key, value); - if (ret < 0) - return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", - key, value); - - ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device); - } else { - ret = lxc_writeat(dfd, key, value, strlen(value)); - } - - return ret; -} - -static int do_cgroup_freeze(int unified_fd, - const char *state_string, - int state_num, - int timeout, - const char *epoll_error, - const char *wait_error) -{ - __do_close int events_fd = -EBADF; - call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL; - int ret; - struct lxc_async_descr descr = {}; - - if (timeout != 0) { - ret = lxc_mainloop_open(&descr); - if (ret) - return log_error_errno(-1, errno, "%s", epoll_error); - - /* automatically cleaned up now */ - descr_ptr = &descr; - - events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0); - if (events_fd < 0) - return log_error_errno(-errno, errno, "Failed to open cgroup.events file"); - - ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, - freezer_cgroup_events_cb, - default_cleanup_handler, - INT_TO_PTR(state_num), - "freezer_cgroup_events_cb"); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); - } - - ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1); - if (ret < 0) - return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); - - if (timeout != 0) { - ret = lxc_mainloop(&descr, timeout); - if (ret) - return log_error_errno(-1, errno, "%s", wait_error); - } - - return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen"); -} - -static inline int __cgroup_freeze(int unified_fd, int timeout) -{ - return do_cgroup_freeze(unified_fd, "1", 1, timeout, - "Failed to create epoll instance to wait for container freeze", - "Failed to wait for container to be frozen"); -} - -int cgroup_freeze(const char *name, const char *lxcpath, int timeout) -{ - __do_close int unified_fd = -EBADF; - int ret; - - if (is_empty_string(name) || is_empty_string(lxcpath)) - return ret_errno(EINVAL); - - unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); - if (unified_fd < 0) - return ret_errno(ENOCGROUP2); - - lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING); - ret = __cgroup_freeze(unified_fd, timeout); - lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING); - return ret; -} - -int __cgroup_unfreeze(int unified_fd, int timeout) -{ - return do_cgroup_freeze(unified_fd, "0", 0, timeout, - "Failed to create epoll instance to wait for container freeze", - "Failed to wait for container to be frozen"); -} - -int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout) -{ - __do_close int unified_fd = -EBADF; - int ret; - - if (is_empty_string(name) || is_empty_string(lxcpath)) - return ret_errno(EINVAL); - - unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); - if (unified_fd < 0) - return ret_errno(ENOCGROUP2); - - lxc_cmd_notify_state_listeners(name, lxcpath, THAWED); - ret = __cgroup_unfreeze(unified_fd, timeout); - lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN); - return ret; -} -- 2.25.1
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2