jdk/src/hotspot/os/linux/cgroupSubsystem_linux.cpp
2026-01-19 14:44:37 +00:00

910 lines
36 KiB
C++

/*
* Copyright (c) 2019, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "cgroupSubsystem_linux.hpp"
#include "cgroupUtil_linux.hpp"
#include "cgroupV1Subsystem_linux.hpp"
#include "cgroupV2Subsystem_linux.hpp"
#include "logging/log.hpp"
#include "memory/allocation.hpp"
#include "os_linux.hpp"
#include "runtime/globals.hpp"
#include "runtime/os.hpp"
#include "utilities/globalDefinitions.hpp"
#include <errno.h>
#include <math.h>
#include <string.h>
#include <sys/vfs.h>
// Inlined from <linux/magic.h> for portability.
#ifndef CGROUP2_SUPER_MAGIC
# define CGROUP2_SUPER_MAGIC 0x63677270
#endif
// Controller names used in log messages about a cg_infos[] slot.
// The order of the entries has to match the *_IDX indices, because the
// table is indexed with the same index as the cg_infos[] array.
static const char* cg_controller_name[] = {
  "cpuset",
  "cpu",
  "cpuacct",
  "memory",
  "pids"
};
// Translate a cgroups v2 controller name into its *_IDX slot.
//
// Only "cpuset", "cpu", "memory" and "pids" are recognized; "cpuacct" is
// deliberately not mapped here. Any other string (including the empty
// string) yields -1. 'name' must not be null.
static inline int cg_v2_controller_index(const char* name) {
  if (strcmp(name, "cpuset") == 0) {
    return CPUSET_IDX;
  }
  if (strcmp(name, "cpu") == 0) {
    return CPU_IDX;
  }
  if (strcmp(name, "memory") == 0) {
    return MEMORY_IDX;
  }
  if (strcmp(name, "pids") == 0) {
    return PIDS_IDX;
  }
  // Not a controller we keep information about.
  return -1;
}
/*
 * Detect the cgroup hierarchy this process runs under and build the matching
 * subsystem object.
 *
 * return:
 *    a new CgroupV2Subsystem or CgroupV1Subsystem, or nullptr if
 *    determine_type() could not identify a usable cgroup configuration.
 */
CgroupSubsystem* CgroupSubsystemFactory::create() {
  CgroupInfo cg_infos[CG_INFO_LENGTH];
  u1 cg_type_flags = INVALID_CGROUPS_GENERIC;

  const char* proc_cgroups = "/proc/cgroups";
  const char* sys_fs_cgroup_cgroup_controllers = "/sys/fs/cgroup/cgroup.controllers";
  const char* proc_self_cgroup = "/proc/self/cgroup";
  const char* proc_self_mountinfo = "/proc/self/mountinfo";
  const char* sys_fs_cgroup = "/sys/fs/cgroup";

  // Assume cgroups v2 is usable by the JDK iff /sys/fs/cgroup has the cgroup v2
  // file system magic. If it does not (or statfs fails) then heuristics are
  // required to determine if cgroups v1 is usable or not.
  struct statfs fsstat = {};
  const bool cgroups_v2_enabled = (statfs(sys_fs_cgroup, &fsstat) != -1) &&
                                  (fsstat.f_type == CGROUP2_SUPER_MAGIC);
  // The list of controllers lives in a different file for v2 than for v1.
  const char* controllers_file = cgroups_v2_enabled ? sys_fs_cgroup_cgroup_controllers
                                                    : proc_cgroups;

  if (!determine_type(cg_infos, cgroups_v2_enabled, controllers_file,
                      proc_self_cgroup, proc_self_mountinfo, &cg_type_flags)) {
    // Could not detect cgroup type
    return nullptr;
  }
  assert(is_valid_cgroup(&cg_type_flags), "Expected valid cgroup type");

  if (is_cgroup_v2(&cg_type_flags)) {
    // Cgroups v2 case, we have all the info we need.
    // Construct the subsystem, free resources and return
    // Note: We use the memory for non-cpu non-memory controller look-ups.
    //       Perhaps we ought to have separate controllers for all.
    CgroupInfo& mem_info = cg_infos[MEMORY_IDX];
    CgroupInfo& cpu_info = cg_infos[CPU_IDX];
    CgroupV2Controller mem_other = CgroupV2Controller(mem_info._mount_path,
                                                      mem_info._cgroup_path,
                                                      mem_info._read_only);
    CgroupV2MemoryController* v2_memory = new CgroupV2MemoryController(mem_other);
    CgroupV2CpuController* v2_cpu = new CgroupV2CpuController(CgroupV2Controller(cpu_info._mount_path,
                                                                                 cpu_info._cgroup_path,
                                                                                 cpu_info._read_only));
    CgroupV2CpuacctController* v2_cpuacct = new CgroupV2CpuacctController(v2_cpu);
    log_debug(os, container)("Detected cgroups v2 unified hierarchy");
    cleanup(cg_infos);
    return new CgroupV2Subsystem(v2_memory, v2_cpu, v2_cpuacct, mem_other);
  }

  /*
   * Cgroup v1 case:
   *
   * Use info gathered previously from /proc/self/cgroup
   * and map host mount point to
   * local one via /proc/self/mountinfo content above
   *
   * Docker example:
   * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
   *
   * Host example:
   * 5:memory:/user.slice
   *
   * Construct a path to the process specific memory and cpuset
   * cgroup directory.
   *
   * For a container running under Docker from memory example above
   * the paths would be:
   *
   * /sys/fs/cgroup/memory
   *
   * For a Host from memory example above the path would be:
   *
   * /sys/fs/cgroup/memory/user.slice
   *
   */
  assert(is_cgroup_v1(&cg_type_flags), "Cgroup v1 expected");

  // Controllers whose info is incomplete stay null.
  CgroupV1MemoryController* memory = nullptr;
  CgroupV1Controller* cpuset = nullptr;
  CgroupV1CpuController* cpu = nullptr;
  CgroupV1CpuacctController* cpuacct = nullptr;
  CgroupV1Controller* pids = nullptr;

  for (int i = 0; i < CG_INFO_LENGTH; i++) {
    CgroupInfo& info = cg_infos[i];
    if (!info._data_complete) {
      // pids controller might have incomplete data
      log_debug(os, container)("CgroupInfo for %s not complete", cg_controller_name[i]);
      continue;
    }
    if (strcmp(info._name, "memory") == 0) {
      memory = new CgroupV1MemoryController(CgroupV1Controller(info._root_mount_path, info._mount_path, info._read_only));
      memory->set_subsystem_path(info._cgroup_path);
    } else if (strcmp(info._name, "cpuset") == 0) {
      cpuset = new CgroupV1Controller(info._root_mount_path, info._mount_path, info._read_only);
      cpuset->set_subsystem_path(info._cgroup_path);
    } else if (strcmp(info._name, "cpu") == 0) {
      cpu = new CgroupV1CpuController(CgroupV1Controller(info._root_mount_path, info._mount_path, info._read_only));
      cpu->set_subsystem_path(info._cgroup_path);
    } else if (strcmp(info._name, "cpuacct") == 0) {
      cpuacct = new CgroupV1CpuacctController(CgroupV1Controller(info._root_mount_path, info._mount_path, info._read_only));
      cpuacct->set_subsystem_path(info._cgroup_path);
    } else if (strcmp(info._name, "pids") == 0) {
      pids = new CgroupV1Controller(info._root_mount_path, info._mount_path, info._read_only);
      pids->set_subsystem_path(info._cgroup_path);
    }
  }
  cleanup(cg_infos);
  return new CgroupV1Subsystem(cpuset, cpu, cpuacct, pids, memory);
}
/*
 * Record the mount path, root mount path and read-only state for the
 * controller slot 'controller' of 'cg_infos'.
 *
 * The paths are copied with os::strdup. If the slot already has a mount path
 * (a duplicate mount of the same controller), the existing entry is kept when
 * it starts with "/sys/fs/cgroup"; otherwise it is freed and replaced by the
 * new paths. 'name' is only used for logging.
 */
void CgroupSubsystemFactory::set_controller_paths(CgroupInfo* cg_infos,
                                                  int controller,
                                                  const char* name,
                                                  char* mount_path,
                                                  char* root_path,
                                                  bool read_only) {
  CgroupInfo& entry = cg_infos[controller];
  if (entry._mount_path != nullptr) {
    // On some systems duplicate controllers get mounted in addition to
    // the main cgroup controllers most likely under /sys/fs/cgroup. In that
    // case pick the one under /sys/fs/cgroup and discard others.
    const char* const preferred_prefix = "/sys/fs/cgroup";
    const bool keep_existing =
        strncmp(entry._mount_path, preferred_prefix, strlen(preferred_prefix)) == 0;
    if (keep_existing) {
      log_debug(os, container)("Duplicate %s controllers detected. Picking %s, skipping %s.",
                               name, entry._mount_path, mount_path);
      return;
    }
    // Log first: the message still needs the path that is about to be freed.
    log_debug(os, container)("Duplicate %s controllers detected. Picking %s, skipping %s.",
                             name, mount_path, entry._mount_path);
    os::free(entry._mount_path);
    os::free(entry._root_mount_path);
  }
  entry._mount_path = os::strdup(mount_path);
  entry._root_mount_path = os::strdup(root_path);
  entry._read_only = read_only;
}
/*
 * Check whether a comma separated list of mount options contains the
 * option 'ro' as a complete entry (e.g. 'errors=remount-ro' does not count).
 *
 * Note that the string is tokenized in place with strsep(), so the commas
 * that have been visited are overwritten with NUL characters.
 */
static bool find_ro_opt(char* mount_opts) {
  // mount options are comma-separated (man proc).
  char* rest = mount_opts;
  for (char* opt = strsep(&rest, ","); opt != nullptr; opt = strsep(&rest, ",")) {
    if (strcmp(opt, "ro") == 0) {
      return true;
    }
  }
  return false;
}
/*
 * Parse one line of /proc/self/mountinfo into the caller supplied buffers.
 * For cgroups v1 the super options are needed. On cgroups v2 the super
 * options are not used.
 *
 * Returns true iff all five captured fields could be read from the line.
 *
 * The scanning of a single mountinfo line entry is as follows:
 *
 * 36  35  98:0      /mnt1 /mnt2 rw,noatime master:1 -   ext3 /dev/root rw,errors=continue
 * (1) (2) (3):(4)   (5)   (6)      (7)      (8)    (9)  (10)   (11)         (12)
 *
 * The numbers in parentheses are labels for the descriptions below:
 *
 *  (1)   mount ID:        matched with '%*d' and discarded
 *  (2)   parent ID:       matched with '%*d' and discarded
 *  (3)   major:      ---,---> major, minor separated by ':'. matched with '%*d:%*d' and discarded
 *  (4)   minor:      ---'
 *  (5)   root:            matched with '%s' and captured in 'tmproot'. Must be non-empty.
 *  (6)   mount point:     matched with '%s' and captured in 'tmpmount'. Must be non-empty.
 *  (7)   mount options:   matched with '%s' and captured in 'mount_opts'. Must be non-empty.
 *  (8)   optional fields: ---,---> matched with '%*[^-]-'. Anything not a hyphen, followed by a hyphen
 *  (9)   separator:       ---'     and discarded. Note: The discarded match is space characters if there
 *                                  are no optionals. Otherwise it includes the optional fields as well.
 * (10)   filesystem type: matched with '%s' and captured in 'tmp_fs_type'
 * (11)   mount source:    matched with '%*s' and discarded
 * (12)   super options:   matched with '%s' and captured in 'tmpcgroups'
 *
 * The '%s' conversions carry no field width, so every output buffer has to be
 * large enough to hold the longest token that can occur in 'line'.
 */
static inline bool match_mount_info_line(char* line,
                                         char* tmproot,
                                         char* tmpmount,
                                         char* mount_opts,
                                         char* tmp_fs_type,
                                         char* tmpcgroups) {
  // Fields (5), (6), (7), (10) and (12) are stored; everything else is skipped.
  const int fields_wanted = 5;
  const int fields_stored = sscanf(line,
                                   "%*d %*d %*d:%*d %s %s %s%*[^-]- %s %*s %s",
                                   tmproot,
                                   tmpmount,
                                   mount_opts,
                                   tmp_fs_type,
                                   tmpcgroups);
  return fields_stored == fields_wanted;
}
/*
* Determine whether the process runs under a usable cgroups v1 or cgroups v2
* setup and collect the per-controller information in 'cg_infos'.
*
* Three files are parsed in this order:
*   1. 'controllers_file': which controllers exist / are enabled
*   2. 'proc_self_cgroup': the cgroup path(s) of this process
*   3. 'proc_self_mountinfo': mount point, root and read-only state
*
* params:
*   cg_infos             array of CG_INFO_LENGTH entries, indexed by the *_IDX
*                        constants, filled in by this function
*   cgroups_v2_enabled   selects the cgroups v2 parsing paths when true and the
*                        cgroups v1 paths otherwise
*   controllers_file     file listing the controllers; read as a single line of
*                        whitespace separated names when cgroups_v2_enabled,
*                        otherwise as "name hierarchy-id num enabled" lines
*   proc_self_cgroup     file with "hierarchy-id:controllers:path" lines
*   proc_self_mountinfo  file with mountinfo lines (see match_mount_info_line)
*   flags                out parameter; CGROUPS_V1 or CGROUPS_V2 on success,
*                        one of the INVALID_CGROUPS_* values on failure
*
* return:
*   true on success, false otherwise. Every failure path after the controllers
*   file has been parsed calls cleanup(cg_infos) before returning.
*/
bool CgroupSubsystemFactory::determine_type(CgroupInfo* cg_infos,
bool cgroups_v2_enabled,
const char* controllers_file,
const char* proc_self_cgroup,
const char* proc_self_mountinfo,
u1* flags) {
FILE *mntinfo = nullptr;
FILE* controllers = nullptr;
FILE *cgroup = nullptr;
// Line buffer shared by all three parsing loops below.
char buf[MAXPATHLEN+1];
char *p;
// true iff all required controllers, memory, cpu, cpuacct are enabled
// at the kernel level.
// pids might not be enabled on older Linux distros (SLES 12.1, RHEL 7.1)
// cpuset might not be enabled on newer Linux distros (Fedora 41)
bool all_required_controllers_enabled = true;
// If cgroups v2 is enabled, open /sys/fs/cgroup/cgroup.controllers. If not, open /proc/cgroups.
controllers = os::fopen(controllers_file, "r");
if (controllers == nullptr) {
log_debug(os, container)("Can't open %s, %s", controllers_file, os::strerror(errno));
*flags = INVALID_CGROUPS_GENERIC;
return false;
}
if (cgroups_v2_enabled) {
/*
* cgroups v2 is enabled. For cgroups v2 (unified hierarchy), the cpu and memory
* controllers must be enabled.
*/
// Only the first line of the controllers file is read.
if ((p = fgets(buf, MAXPATHLEN, controllers)) != nullptr) {
char* controller = nullptr;
#define ISSPACE_CHARS " \n\t\r\f\v"
// Tokenize the line in place. Tokens that are not one of the known
// controller names (including empty tokens) map to -1 and are ignored.
while ((controller = strsep(&p, ISSPACE_CHARS)) != nullptr) {
int i;
if ((i = cg_v2_controller_index(controller)) != -1) {
cg_infos[i]._name = os::strdup(controller);
cg_infos[i]._enabled = true;
if (i == PIDS_IDX || i == CPUSET_IDX) {
log_debug(os, container)("Detected optional %s controller entry in %s",
controller, controllers_file);
}
}
}
#undef ISSPACE_CHARS
} else {
// This function has not allocated anything yet at this point, hence
// no cleanup(cg_infos) call on this path.
log_debug(os, container)("Can't read %s, %s", controllers_file, os::strerror(errno));
*flags = INVALID_CGROUPS_V2;
fclose(controllers);
return false;
}
for (int i = 0; i < CG_INFO_LENGTH; i++) {
// cgroups v2 does not have cpuacct.
if (i == CPUACCT_IDX) {
continue;
}
// For cgroups v2, cpuacct is rolled into cpu, and the pids and cpuset controllers
// are optional; the remaining controllers, cpu and memory, are required.
if (i == CPU_IDX || i == MEMORY_IDX) {
all_required_controllers_enabled = all_required_controllers_enabled && cg_infos[i]._enabled;
}
if (log_is_enabled(Debug, os, container) && !cg_infos[i]._enabled) {
log_debug(os, container)("controller %s is not enabled", cg_controller_name[i]);
}
}
} else {
/*
* The /sys/fs/cgroup filesystem magic hint suggests we have cg v1. Read /proc/cgroups; for
* cgroups v1 hierarchy (hybrid or legacy), cpu, cpuacct, cpuset, and memory controllers must
* have non-zero for the hierarchy ID field and relevant controllers mounted.
*/
while ((p = fgets(buf, MAXPATHLEN, controllers)) != nullptr) {
char name[MAXPATHLEN+1];
int hierarchy_id;
int enabled;
// Format of /proc/cgroups documented via man 7 cgroups
// Lines that do not yield all three values are skipped.
if (sscanf(p, "%s %d %*d %d", name, &hierarchy_id, &enabled) != 3) {
continue;
}
// Record name, hierarchy id and enabled state for the controllers of interest.
if (strcmp(name, "memory") == 0) {
cg_infos[MEMORY_IDX]._name = os::strdup(name);
cg_infos[MEMORY_IDX]._hierarchy_id = hierarchy_id;
cg_infos[MEMORY_IDX]._enabled = (enabled == 1);
} else if (strcmp(name, "cpuset") == 0) {
cg_infos[CPUSET_IDX]._name = os::strdup(name);
cg_infos[CPUSET_IDX]._hierarchy_id = hierarchy_id;
cg_infos[CPUSET_IDX]._enabled = (enabled == 1);
} else if (strcmp(name, "cpu") == 0) {
cg_infos[CPU_IDX]._name = os::strdup(name);
cg_infos[CPU_IDX]._hierarchy_id = hierarchy_id;
cg_infos[CPU_IDX]._enabled = (enabled == 1);
} else if (strcmp(name, "cpuacct") == 0) {
cg_infos[CPUACCT_IDX]._name = os::strdup(name);
cg_infos[CPUACCT_IDX]._hierarchy_id = hierarchy_id;
cg_infos[CPUACCT_IDX]._enabled = (enabled == 1);
} else if (strcmp(name, "pids") == 0) {
log_debug(os, container)("Detected optional pids controller entry in %s", controllers_file);
cg_infos[PIDS_IDX]._name = os::strdup(name);
cg_infos[PIDS_IDX]._hierarchy_id = hierarchy_id;
cg_infos[PIDS_IDX]._enabled = (enabled == 1);
}
}
for (int i = 0; i < CG_INFO_LENGTH; i++) {
// pids controller is optional. All other controllers are required
if (i != PIDS_IDX) {
all_required_controllers_enabled = all_required_controllers_enabled && cg_infos[i]._enabled;
}
if (log_is_enabled(Debug, os, container) && !cg_infos[i]._enabled) {
log_debug(os, container)("controller %s is not enabled", cg_controller_name[i]);
}
}
}
fclose(controllers);
if (!all_required_controllers_enabled) {
// one or more required controllers disabled, disable container support
log_debug(os, container)("One or more required controllers disabled at kernel level.");
cleanup(cg_infos);
*flags = INVALID_CGROUPS_GENERIC;
return false;
}
/*
* Read /proc/self/cgroup and determine:
* - the cgroup path for cgroups v2 or
* - on a cgroups v1 system, collect info for mapping
* the host mount point to the local one via /proc/self/mountinfo below.
*/
cgroup = os::fopen(proc_self_cgroup, "r");
if (cgroup == nullptr) {
log_debug(os, container)("Can't open %s, %s",
proc_self_cgroup, os::strerror(errno));
cleanup(cg_infos);
*flags = INVALID_CGROUPS_GENERIC;
return false;
}
while ((p = fgets(buf, MAXPATHLEN, cgroup)) != nullptr) {
// Note: this local 'controllers' shadows the FILE* of the same name above.
char *controllers;
char *token;
char *hierarchy_id_str;
int hierarchy_id;
char *cgroup_path;
// Split "hierarchy-id:controllers:path\n" in place into its three parts.
hierarchy_id_str = strsep(&p, ":");
hierarchy_id = atoi(hierarchy_id_str);
/* Get controllers and base */
controllers = strsep(&p, ":");
cgroup_path = strsep(&p, "\n");
// NOTE(review): for a line with exactly one ':' 'controllers' is non-null
// but 'cgroup_path' is null and is handed to os::strdup below unchecked.
// Confirm such lines cannot occur or that os::strdup tolerates null.
if (controllers == nullptr) {
continue;
}
// cgroups v1 only: the controller list is comma separated, each known
// controller on the line gets the line's cgroup path.
while (!cgroups_v2_enabled && (token = strsep(&controllers, ",")) != nullptr) {
if (strcmp(token, "memory") == 0) {
assert(hierarchy_id == cg_infos[MEMORY_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch for memory");
cg_infos[MEMORY_IDX]._cgroup_path = os::strdup(cgroup_path);
} else if (strcmp(token, "cpuset") == 0) {
assert(hierarchy_id == cg_infos[CPUSET_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch for cpuset");
cg_infos[CPUSET_IDX]._cgroup_path = os::strdup(cgroup_path);
} else if (strcmp(token, "cpu") == 0) {
assert(hierarchy_id == cg_infos[CPU_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch for cpu");
cg_infos[CPU_IDX]._cgroup_path = os::strdup(cgroup_path);
} else if (strcmp(token, "cpuacct") == 0) {
assert(hierarchy_id == cg_infos[CPUACCT_IDX]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch for cpuacct");
cg_infos[CPUACCT_IDX]._cgroup_path = os::strdup(cgroup_path);
} else if (strcmp(token, "pids") == 0) {
assert(hierarchy_id == cg_infos[PIDS_IDX]._hierarchy_id, "/proc/cgroups (%d) and /proc/self/cgroup (%d) hierarchy mismatch for pids",
cg_infos[PIDS_IDX]._hierarchy_id, hierarchy_id);
cg_infos[PIDS_IDX]._cgroup_path = os::strdup(cgroup_path);
}
}
if (cgroups_v2_enabled) {
// On some systems we have mixed cgroups v1 and cgroups v2 controllers (e.g. freezer on cg1 and
// all relevant controllers on cg2). Only set the cgroup path when we see a hierarchy id of 0.
if (hierarchy_id != 0) {
continue;
}
// Every slot gets its own copy of the single unified cgroup path.
for (int i = 0; i < CG_INFO_LENGTH; i++) {
assert(cg_infos[i]._cgroup_path == nullptr, "cgroup path must only be set once");
cg_infos[i]._cgroup_path = os::strdup(cgroup_path);
}
}
}
fclose(cgroup);
// Find various mount points by reading /proc/self/mountinfo
// mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
mntinfo = os::fopen(proc_self_mountinfo, "r");
if (mntinfo == nullptr) {
log_debug(os, container)("Can't open %s, %s",
proc_self_mountinfo, os::strerror(errno));
cleanup(cg_infos);
*flags = INVALID_CGROUPS_GENERIC;
return false;
}
bool cgroupv2_mount_point_found = false;
bool any_cgroup_mounts_found = false;
while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != nullptr) {
char tmp_fs_type[MAXPATHLEN+1];
char tmproot[MAXPATHLEN+1];
char tmpmount[MAXPATHLEN+1];
char tmpcgroups[MAXPATHLEN+1];
char mount_opts[MAXPATHLEN+1];
// Cursor for tokenizing the super options once tmpcgroups has been filled.
char *cptr = tmpcgroups;
char *token;
/* Cgroup v2 relevant info. We only look for the _mount_path iff cgroups_v2_enabled so
* as to avoid memory stomping of the _mount_path pointer later on in the cgroup v1
* block in the hybrid case.
*
* We collect the read only mount option in the cgroup infos so as to have that
* info ready when determining is_containerized().
*/
if (cgroups_v2_enabled && match_mount_info_line(p,
tmproot,
tmpmount,
mount_opts,
tmp_fs_type,
tmpcgroups /* unused */)) {
// we likely have an early match return (e.g. cgroup fs match), be sure we have cgroup2 as fstype
if (strcmp("cgroup2", tmp_fs_type) == 0) {
cgroupv2_mount_point_found = true;
any_cgroup_mounts_found = true;
// For unified we only have a single line with cgroup2 fs type.
// Therefore use that option for all CG info structs.
bool ro_option = find_ro_opt(mount_opts);
for (int i = 0; i < CG_INFO_LENGTH; i++) {
set_controller_paths(cg_infos, i, "(cg2, unified)", tmpmount, tmproot, ro_option);
}
}
}
/* Cgroup v1 relevant info
*
* Find the cgroup mount point for memory, cpuset, cpu, cpuacct, pids. For each controller
* determine whether or not they show up as mounted read only or not.
*
* Example for docker:
* 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
*
* Example for host:
* 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
*
* 44 31 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:23 - cgroup cgroup rw,pids
*
*/
// This scan is not guarded by !cgroups_v2_enabled, so the line is parsed
// here regardless of the outcome of the cgroup2 check above.
if (match_mount_info_line(p, tmproot, tmpmount, mount_opts, tmp_fs_type, tmpcgroups)) {
if (strcmp("cgroup", tmp_fs_type) != 0) {
// Skip cgroup2 fs lines on hybrid or unified hierarchy.
continue;
}
// The super options name the controllers attached to this mount.
while ((token = strsep(&cptr, ",")) != nullptr) {
if (strcmp(token, "memory") == 0) {
any_cgroup_mounts_found = true;
bool ro_option = find_ro_opt(mount_opts);
set_controller_paths(cg_infos, MEMORY_IDX, token, tmpmount, tmproot, ro_option);
cg_infos[MEMORY_IDX]._data_complete = true;
} else if (strcmp(token, "cpuset") == 0) {
any_cgroup_mounts_found = true;
bool ro_option = find_ro_opt(mount_opts);
set_controller_paths(cg_infos, CPUSET_IDX, token, tmpmount, tmproot, ro_option);
cg_infos[CPUSET_IDX]._data_complete = true;
} else if (strcmp(token, "cpu") == 0) {
any_cgroup_mounts_found = true;
bool ro_option = find_ro_opt(mount_opts);
set_controller_paths(cg_infos, CPU_IDX, token, tmpmount, tmproot, ro_option);
cg_infos[CPU_IDX]._data_complete = true;
} else if (strcmp(token, "cpuacct") == 0) {
any_cgroup_mounts_found = true;
bool ro_option = find_ro_opt(mount_opts);
set_controller_paths(cg_infos, CPUACCT_IDX, token, tmpmount, tmproot, ro_option);
cg_infos[CPUACCT_IDX]._data_complete = true;
} else if (strcmp(token, "pids") == 0) {
any_cgroup_mounts_found = true;
bool ro_option = find_ro_opt(mount_opts);
set_controller_paths(cg_infos, PIDS_IDX, token, tmpmount, tmproot, ro_option);
cg_infos[PIDS_IDX]._data_complete = true;
}
}
}
}
fclose(mntinfo);
// Neither cgroup2 nor cgroup filesystems mounted via /proc/self/mountinfo
// No point in continuing.
if (!any_cgroup_mounts_found) {
log_trace(os, container)("No relevant cgroup controllers mounted.");
cleanup(cg_infos);
*flags = INVALID_CGROUPS_NO_MOUNT;
return false;
}
if (cgroups_v2_enabled) {
if (!cgroupv2_mount_point_found) {
log_trace(os, container)("Mount point for cgroupv2 not found in /proc/self/mountinfo");
cleanup(cg_infos);
*flags = INVALID_CGROUPS_V2;
return false;
}
// Cgroups v2 case, we have all the info we need.
*flags = CGROUPS_V2;
return true;
}
// What follows is cgroups v1
log_debug(os, container)("Detected cgroups hybrid or legacy hierarchy, using cgroups v1 controllers");
// memory, cpuset, cpu and cpuacct each need a mount entry; a missing one
// makes the whole detection fail.
if (!cg_infos[MEMORY_IDX]._data_complete) {
log_debug(os, container)("Required cgroup v1 memory subsystem not found");
cleanup(cg_infos);
*flags = INVALID_CGROUPS_V1;
return false;
}
if (!cg_infos[CPUSET_IDX]._data_complete) {
log_debug(os, container)("Required cgroup v1 cpuset subsystem not found");
cleanup(cg_infos);
*flags = INVALID_CGROUPS_V1;
return false;
}
if (!cg_infos[CPU_IDX]._data_complete) {
log_debug(os, container)("Required cgroup v1 cpu subsystem not found");
cleanup(cg_infos);
*flags = INVALID_CGROUPS_V1;
return false;
}
if (!cg_infos[CPUACCT_IDX]._data_complete) {
log_debug(os, container)("Required cgroup v1 cpuacct subsystem not found");
cleanup(cg_infos);
*flags = INVALID_CGROUPS_V1;
return false;
}
if (log_is_enabled(Debug, os, container) && !cg_infos[PIDS_IDX]._data_complete) {
log_debug(os, container)("Optional cgroup v1 pids subsystem not found");
// keep the other controller info, pids is optional
}
// Cgroups v1 case, we have all the info we need.
*flags = CGROUPS_V1;
return true;
};
/*
 * Release the strings held by all CG_INFO_LENGTH entries of 'cg_infos'
 * (name, cgroup path, root mount path and mount path).
 *
 * The pointers are not reset afterwards, so the entries must not be used or
 * cleaned up a second time.
 */
void CgroupSubsystemFactory::cleanup(CgroupInfo* cg_infos) {
  assert(cg_infos != nullptr, "Invariant");
  CgroupInfo* const end = cg_infos + CG_INFO_LENGTH;
  for (CgroupInfo* info = cg_infos; info != end; ++info) {
    os::free(info->_name);
    os::free(info->_cgroup_path);
    os::free(info->_root_mount_path);
    os::free(info->_mount_path);
  }
}
/* active_processor_count
*
* Calculate an appropriate number of active processors for the
* VM to use based on these three inputs.
*
* cpu affinity
* cgroup cpu quota & cpu period
*
* Algorithm:
*
* Determine the number of available CPUs from sched_getaffinity
*
* If user specified a quota (quota != -1), calculate the number of
* required CPUs by dividing quota by period.
*
* All results of division are rounded up to the next whole number.
*
* If quotas have not been specified, sets the result reference to
* the number of active processors in the system.
*
* If quotas have been specified, the number set in the result
* reference will never exceed the number of active processors.
*
* return:
* true if there were no errors. false otherwise.
*/
bool CgroupSubsystem::active_processor_count(double& value) {
// We use a cache with a timeout to avoid performing expensive
// computations in the event this function is called frequently.
// [See 8227006].
CachingCgroupController<CgroupCpuController, double>* contrl = cpu_controller();
CachedMetric<double>* cpu_limit = contrl->metrics_cache();
if (!cpu_limit->should_check_metric()) {
value = cpu_limit->value();
log_trace(os, container)("CgroupSubsystem::active_processor_count (cached): %.2f", value);
return true;
}
int cpu_count = os::Linux::active_processor_count();
double result = -1;
if (!CgroupUtil::processor_count(contrl->controller(), cpu_count, result)) {
return false;
}
assert(result > 0 && result <= cpu_count, "must be");
// Update cached metric to avoid re-reading container settings too often
cpu_limit->set_value(result, OSCONTAINER_CACHE_TIMEOUT);
value = result;
return true;
}
/* memory_limit_in_bytes
 *
 * Return the limit of available memory for this process in the provided
 * physical_memory_size_type reference. If there was no limit value set in the
 * underlying interface files 'value_unlimited' is returned.
 *
 * The value read from the memory controller is cached for
 * OSCONTAINER_CACHE_TIMEOUT; while the cache is fresh the cached value is
 * handed out without consulting the controller.
 *
 * return:
 *    false if retrieving the value failed
 *    true if retrieving the value was successful and the value was
 *    set in the 'value' reference.
 */
bool CgroupSubsystem::memory_limit_in_bytes(physical_memory_size_type upper_bound,
                                            physical_memory_size_type& value) {
  CachingCgroupController<CgroupMemoryController, physical_memory_size_type>* caching_mem = memory_controller();
  CachedMetric<physical_memory_size_type>* cache = caching_mem->metrics_cache();

  if (cache->should_check_metric()) {
    physical_memory_size_type fresh_limit = 0;
    if (!caching_mem->controller()->read_memory_limit_in_bytes(upper_bound, fresh_limit)) {
      return false;
    }
    // Update cached metric to avoid re-reading container settings too often
    cache->set_value(fresh_limit, OSCONTAINER_CACHE_TIMEOUT);
    value = fresh_limit;
    return true;
  }

  value = cache->value();
  return true;
}
/*
 * Read the first line of the interface file 'filename' into 'buf'.
 *
 * The file path is formed by appending 'filename' to subsystem_path(). At
 * most buf_size - 1 characters are read and a trailing newline, if any, is
 * removed.
 *
 * return:
 *    true if a line was read into 'buf'.
 *    false if the subsystem path is null, the resulting path is longer than
 *    MAXPATHLEN, the file cannot be opened or nothing could be read from it.
 */
bool CgroupController::read_string(const char* filename, char* buf, size_t buf_size) {
  assert(buf != nullptr, "buffer must not be null");
  assert(filename != nullptr, "filename must be given");
  const char* s_path = subsystem_path();
  if (s_path == nullptr) {
    log_debug(os, container)("read_string: subsystem path is null");
    return false;
  }

  stringStream file_path;
  file_path.print_raw(s_path);
  file_path.print_raw(filename);

  if (file_path.size() > MAXPATHLEN) {
    log_debug(os, container)("File path too long %s, %s", file_path.base(), filename);
    return false;
  }
  const char* absolute_path = file_path.freeze();
  log_trace(os, container)("Path to %s is %s", filename, absolute_path);

  FILE* fp = os::fopen(absolute_path, "r");
  if (fp == nullptr) {
    log_debug(os, container)("Open of file %s failed, %s", absolute_path, os::strerror(errno));
    return false;
  }

  // Read a single line into the provided buffer.
  // At most buf_size - 1 characters.
  char* line = fgets(buf, buf_size, fp);
  fclose(fp);
  if (line == nullptr) {
    log_debug(os, container)("Empty file %s", absolute_path);
    return false;
  }
  size_t len = strlen(line);
  assert(len <= buf_size - 1, "At most buf_size - 1 bytes can be read");
  // fgets can hand back an empty string (buf_size == 1, or the data starts
  // with a NUL byte); only look at the last character if there is one.
  if (len > 0 && line[len - 1] == '\n') {
    line[len - 1] = '\0'; // trim trailing new line
  }
  return true;
}
/*
 * Read the first line of the interface file 'filename' and parse it as an
 * unsigned 64 bit number.
 *
 * return:
 *    true if the line could be read and a number was parsed into 'result',
 *    false otherwise.
 */
bool CgroupController::read_number(const char* filename, uint64_t& result) {
  char line[1024];
  if (!read_string(filename, line, sizeof(line))) {
    return false;
  }
  return sscanf(line, UINT64_FORMAT, &result) == 1;
}
/*
 * Read the first line of the interface file 'filename' and convert it with
 * limit_from_str(), i.e. the literal 'max' is accepted in addition to a
 * plain number.
 *
 * return:
 *    true if the value was read and converted; 'result' is only written in
 *    that case. false otherwise.
 */
bool CgroupController::read_number_handle_max(const char* filename, uint64_t& result) {
  char line[1024];
  if (!read_string(filename, line, sizeof(line))) {
    return false;
  }
  uint64_t parsed = 0;
  const bool converted = limit_from_str(line, parsed);
  if (converted) {
    result = parsed;
  }
  return converted;
}
/*
 * Search the interface file 'filename' for a line of the form "<key> <value>"
 * and parse <value> as an unsigned 64 bit number.
 *
 * The file path is formed by appending 'filename' to subsystem_path(). A line
 * matches if it starts with 'key' and the character right after the key is a
 * white space character other than a newline. The first matching line whose
 * value parses as a number wins.
 *
 * return:
 *    true if a value was found and stored in 'result'.
 *    false if the subsystem path is null, the resulting path is longer than
 *    MAXPATHLEN, the file cannot be opened or no matching line was found.
 */
bool CgroupController::read_numerical_key_value(const char* filename, const char* key, uint64_t& result) {
  assert(key != nullptr, "key must be given");
  assert(filename != nullptr, "file to search in must be given");
  const char* s_path = subsystem_path();
  if (s_path == nullptr) {
    log_debug(os, container)("read_numerical_key_value: subsystem path is null");
    return false;
  }
  stringStream file_path;
  file_path.print_raw(s_path);
  file_path.print_raw(filename);

  if (file_path.size() > MAXPATHLEN) {
    log_debug(os, container)("File path too long %s, %s", file_path.base(), filename);
    return false;
  }
  const char* absolute_path = file_path.freeze();
  log_trace(os, container)("Path to %s is %s", filename, absolute_path);
  FILE* fp = os::fopen(absolute_path, "r");
  if (fp == nullptr) {
    log_debug(os, container)("Open of file %s failed, %s", absolute_path, os::strerror(errno));
    return false;
  }

  const int buf_len = MAXPATHLEN+1;
  char buf[buf_len];
  char* line = fgets(buf, buf_len, fp);
  bool found_match = false;
  // File consists of multiple lines in a "key value"
  // fashion, we have to find the key.
  const size_t key_len = strlen(key);
  for (; line != nullptr; line = fgets(buf, buf_len, fp)) {
    if (strncmp(line, key, key_len) != 0) {
      continue;
    }
    // The prefix matched, so the line holds at least key_len characters and
    // line[key_len] is at worst the terminating NUL. Reading it before the
    // comparison would touch bytes beyond the end of a shorter line.
    const char after_key = line[key_len];
    if (isspace((unsigned char) after_key) != 0 && after_key != '\n') {
      // Skip key, skip space
      const char* value_substr = line + key_len + 1;
      int matched = sscanf(value_substr, UINT64_FORMAT, &result);
      found_match = matched == 1;
      if (found_match) {
        break;
      }
    }
  }
  fclose(fp);
  if (found_match) {
    return true;
  }
  log_debug(os, container)("Type %s (key == %s) not found in file %s", UINT64_FORMAT,
                           key, absolute_path);
  return false;
}
/*
 * Read the first line of the interface file 'filename', which is expected to
 * hold two white space separated tokens, and convert one of them with
 * limit_from_str().
 *
 * 'use_first' selects the first token when true and the second one otherwise.
 *
 * return:
 *    true if the selected token was read and converted; 'result' is only
 *    written in that case. false otherwise.
 */
bool CgroupController::read_numerical_tuple_value(const char* filename, bool use_first, uint64_t& result) {
  char line[1024];
  if (!read_string(filename, line, sizeof(line))) {
    return false;
  }

  // The field width keeps the token within the bounds of its buffer.
  char token[1024];
  int matched;
  if (use_first) {
    matched = sscanf(line, "%1023s %*s", token);
  } else {
    matched = sscanf(line, "%*s %1023s", token);
  }
  if (matched != 1) {
    return false;
  }

  uint64_t parsed = 0;
  if (!limit_from_str(token, parsed)) {
    return false;
  }
  result = parsed;
  return true;
}
bool CgroupController::limit_from_str(char* limit_str, uint64_t& value) {
if (limit_str == nullptr) {
return false;
}
// Unlimited memory in cgroups is the literal string 'max' for
// some controllers, for example the pids controller.
if (strcmp("max", limit_str) == 0) {
value = value_unlimited;
return true;
}
uint64_t limit;
if (sscanf(limit_str, UINT64_FORMAT, &limit) != 1) {
return false;
}
value = limit;
return true;
}
// CgroupSubsystem implementations
// Forwards to memory_and_swap_limit_in_bytes() of the underlying memory
// controller. The bounds are passed through unchanged; 'value' and the
// return value are whatever the controller reports.
bool CgroupSubsystem::memory_and_swap_limit_in_bytes(physical_memory_size_type upper_mem_bound,
                                                     physical_memory_size_type upper_swap_bound,
                                                     physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->memory_and_swap_limit_in_bytes(upper_mem_bound, upper_swap_bound, value);
}
// Forwards to memory_and_swap_usage_in_bytes() of the underlying memory
// controller. The bounds are passed through unchanged; 'value' and the
// return value are whatever the controller reports.
bool CgroupSubsystem::memory_and_swap_usage_in_bytes(physical_memory_size_type upper_mem_bound,
                                                     physical_memory_size_type upper_swap_bound,
                                                     physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->memory_and_swap_usage_in_bytes(upper_mem_bound, upper_swap_bound, value);
}
// Forwards to memory_soft_limit_in_bytes() of the underlying memory
// controller; 'value' and the return value are whatever it reports.
bool CgroupSubsystem::memory_soft_limit_in_bytes(physical_memory_size_type upper_bound,
                                                 physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->memory_soft_limit_in_bytes(upper_bound, value);
}
// Forwards to memory_throttle_limit_in_bytes() of the underlying memory
// controller; 'value' and the return value are whatever it reports.
bool CgroupSubsystem::memory_throttle_limit_in_bytes(physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->memory_throttle_limit_in_bytes(value);
}
// Forwards to memory_usage_in_bytes() of the underlying memory controller;
// 'value' and the return value are whatever it reports.
bool CgroupSubsystem::memory_usage_in_bytes(physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->memory_usage_in_bytes(value);
}
// Forwards to memory_max_usage_in_bytes() of the underlying memory
// controller; 'value' and the return value are whatever it reports.
bool CgroupSubsystem::memory_max_usage_in_bytes(physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->memory_max_usage_in_bytes(value);
}
// Forwards to rss_usage_in_bytes() of the underlying memory controller;
// 'value' and the return value are whatever it reports.
bool CgroupSubsystem::rss_usage_in_bytes(physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->rss_usage_in_bytes(value);
}
// Forwards to cache_usage_in_bytes() of the underlying memory controller;
// 'value' and the return value are whatever it reports.
bool CgroupSubsystem::cache_usage_in_bytes(physical_memory_size_type& value) {
  CgroupMemoryController* mem = memory_controller()->controller();
  return mem->cache_usage_in_bytes(value);
}
bool CgroupSubsystem::cpu_quota(int& value) {
return cpu_controller()->controller()->cpu_quota(value);
}
bool CgroupSubsystem::cpu_period(int& value) {
return cpu_controller()->controller()->cpu_period(value);
}
bool CgroupSubsystem::cpu_shares(int& value) {
return cpu_controller()->controller()->cpu_shares(value);
}
// Forwards to cpu_usage_in_micros() of the cpuacct controller; 'value' and
// the return value are whatever it reports.
bool CgroupSubsystem::cpu_usage_in_micros(uint64_t& value) {
  const bool ok = cpuacct_controller()->cpu_usage_in_micros(value);
  return ok;
}
// Lets the underlying memory controller print its version specific
// information to 'st'; 'upper_mem_bound' is passed through unchanged.
void CgroupSubsystem::print_version_specific_info(outputStream* st, physical_memory_size_type upper_mem_bound) {
  CgroupMemoryController* mem = memory_controller()->controller();
  mem->print_version_specific_info(st, upper_mem_bound);
}