/* * Copyright (c) 2019, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. * */ #include "cgroupUtil_linux.hpp" #include "cgroupV1Subsystem_linux.hpp" #include "logging/log.hpp" #include "memory/allocation.hpp" #include "os_linux.hpp" #include "runtime/globals.hpp" #include "runtime/os.hpp" #include "utilities/globalDefinitions.hpp" #include #include #include /* * Set directory to subsystem specific files based * on the contents of the mountinfo and cgroup files. * * The method determines whether it runs in * - host mode * - container mode * * In the host mode, _root is equal to "/" and * the subsystem path is equal to the _mount_point path * joined with cgroup_path. * * In the container mode, it can be two possibilities: * - private namespace (cgroupns=private) * - host namespace (cgroupns=host, default mode in cgroup V1 hosts) * * Private namespace is equivalent to the host mode, i.e. * the subsystem path is set by concatenating * _mount_point and cgroup_path. * * In the host namespace, _root is equal to host's cgroup path * of the control group to which the containerized process * belongs to at the moment of creation. The mountinfo and * cgroup files are mirrored from the host, while the subsystem * specific files are mapped directly at _mount_point, i.e. * at /sys/fs/cgroup//, the subsystem path is * then set equal to _mount_point. * * A special case of the subsystem path is when a cgroup path * includes a subgroup, when a containerized process was associated * with an existing cgroup, that is different from cgroup * in which the process has been created. * Here, the _root is equal to the host's initial cgroup path, * cgroup_path will be equal to host's new cgroup path. * As host cgroup hierarchies are not accessible in the container, * it needs to be determined which part of cgroup path * is accessible inside container, i.e. mapped under * /sys/fs/cgroup//. * In Docker default setup, host's cgroup path can be * of the form: /docker//, * from which only is mapped. * The method trims cgroup path from left, until the subgroup * component is found. The subsystem path will be set to * the _mount_point joined with the subgroup path. */ void CgroupV1Controller::set_subsystem_path(const char* cgroup_path) { if (_cgroup_path != nullptr) { os::free(_cgroup_path); } if (_path != nullptr) { os::free(_path); _path = nullptr; } _cgroup_path = os::strdup(cgroup_path); stringStream ss; if (_root != nullptr && cgroup_path != nullptr) { ss.print_raw(_mount_point); if (strcmp(_root, "/") == 0) { // host processes and containers with cgroupns=private if (strcmp(cgroup_path,"/") != 0) { ss.print_raw(cgroup_path); } } else { // containers with cgroupns=host, default setting is _root==cgroup_path if (strcmp(_root, cgroup_path) != 0) { if (*cgroup_path != '\0' && strcmp(cgroup_path, "/") != 0) { // When moved to a subgroup, between subgroups, the path suffix will change. const char *suffix = cgroup_path; while (suffix != nullptr) { stringStream pp; pp.print_raw(_mount_point); pp.print_raw(suffix); if (os::file_exists(pp.base())) { ss.print_raw(suffix); if (suffix != cgroup_path) { log_trace(os, container)("set_subsystem_path: cgroup v1 path reduced to: %s.", suffix); } break; } log_trace(os, container)("set_subsystem_path: skipped non-existent directory: %s.", suffix); suffix = strchr(suffix + 1, '/'); } } } } _path = os::strdup(ss.base()); } } bool CgroupV1MemoryController::read_use_hierarchy_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.use_hierarchy", "Use Hierarchy", result); } bool CgroupV1MemoryController::uses_mem_hierarchy() { physical_memory_size_type use_hierarchy = 0; return read_use_hierarchy_val(use_hierarchy) && use_hierarchy > 0; } /* * The common case, containers, we have _root == _cgroup_path, and thus set the * controller path to the _mount_point. This is where the limits are exposed in * the cgroup pseudo filesystem (at the leaf) and adjustment of the path won't * be needed for that reason. */ bool CgroupV1Controller::needs_hierarchy_adjustment() { assert(_cgroup_path != nullptr, "sanity"); return strcmp(_root, _cgroup_path) != 0; } bool CgroupV1MemoryController::read_memory_limit_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.limit_in_bytes", "Memory Limit", result); } bool CgroupV1MemoryController::read_hierarchical_memory_limit_val(physical_memory_size_type& result) { CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(reader(), "/memory.stat", "hierarchical_memory_limit", "Hierarchical Memory Limit", result); } bool CgroupV1MemoryController::read_memory_limit_in_bytes(physical_memory_size_type upper_bound, physical_memory_size_type& result) { physical_memory_size_type memlimit = 0; if (!read_memory_limit_val(memlimit)) { log_trace(os, container)("container memory limit failed, upper bound is " PHYS_MEM_TYPE_FORMAT, upper_bound); return false; } if (memlimit >= upper_bound) { physical_memory_size_type hierlimit = 0; if (uses_mem_hierarchy() && read_hierarchical_memory_limit_val(hierlimit) && hierlimit < upper_bound) { log_trace(os, container)("Memory Limit is: " PHYS_MEM_TYPE_FORMAT, hierlimit); result = hierlimit; } else { // Exceeding physical memory is treated as unlimited. This implementation // caps it at host_mem since Cg v1 has no value to represent 'max'. log_trace(os, container)("container memory limit ignored: " PHYS_MEM_TYPE_FORMAT ", upper bound is " PHYS_MEM_TYPE_FORMAT, memlimit, upper_bound); result = value_unlimited; } } else { result = memlimit; } return true; } bool CgroupV1MemoryController::read_mem_swap(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.memsw.limit_in_bytes", "Memory and Swap Limit", result); } bool CgroupV1MemoryController::read_hierarchical_mem_swap_val(physical_memory_size_type& result) { CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(reader(), "/memory.stat", "hierarchical_memsw_limit", "Hierarchical Memory and Swap Limit", result); } /* memory_and_swap_limit_in_bytes * * Determine the memory and swap limit metric. Sets the 'result' reference to a positive limit value or * 'value_unlimited' (for unlimited). * * returns: * * false if an error occurred. 'result' reference remains unchanged. * * true if the limit value has been set in the 'result' reference */ bool CgroupV1MemoryController::memory_and_swap_limit_in_bytes(physical_memory_size_type upper_mem_bound, physical_memory_size_type upper_swap_bound, physical_memory_size_type& result) { physical_memory_size_type total_mem_swap = upper_mem_bound + upper_swap_bound; physical_memory_size_type memory_swap = 0; bool mem_swap_read_failed = false; if (!read_mem_swap(memory_swap)) { mem_swap_read_failed = true; } if (memory_swap >= total_mem_swap) { physical_memory_size_type hiermswlimit = 0; if (uses_mem_hierarchy() && read_hierarchical_mem_swap_val(hiermswlimit) && hiermswlimit < total_mem_swap) { log_trace(os, container)("Memory and Swap Limit is: " PHYS_MEM_TYPE_FORMAT, hiermswlimit); memory_swap = hiermswlimit; } else { memory_swap = value_unlimited; } } if (memory_swap == value_unlimited) { log_trace(os, container)("Memory and Swap Limit is: Unlimited"); result = value_unlimited; return true; } // If there is a swap limit, but swappiness == 0, reset the limit // to the memory limit. Do the same for cases where swap isn't // supported. physical_memory_size_type swappiness = 0; if (!read_mem_swappiness(swappiness)) { // assume no swap mem_swap_read_failed = true; } if (swappiness == 0 || mem_swap_read_failed) { physical_memory_size_type memlimit = value_unlimited; if (!read_memory_limit_in_bytes(upper_mem_bound, memlimit)) { return false; } if (memlimit == value_unlimited) { result = value_unlimited; // No memory limit, thus no swap limit return true; } if (mem_swap_read_failed) { log_trace(os, container)("Memory and Swap Limit has been reset to " PHYS_MEM_TYPE_FORMAT " because swap is not supported", memlimit); } else { log_trace(os, container)("Memory and Swap Limit has been reset to " PHYS_MEM_TYPE_FORMAT " because swappiness is 0", memlimit); } result = memlimit; return true; } result = memory_swap; return true; } static inline bool memory_swap_usage_impl(CgroupController* ctrl, physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(ctrl, "/memory.memsw.usage_in_bytes", "mem swap usage", result); } bool CgroupV1MemoryController::memory_and_swap_usage_in_bytes(physical_memory_size_type upper_mem_bound, physical_memory_size_type upper_swap_bound, physical_memory_size_type& result) { physical_memory_size_type memory_sw_limit = value_unlimited; if (!memory_and_swap_limit_in_bytes(upper_mem_bound, upper_swap_bound, memory_sw_limit)) { return false; } physical_memory_size_type mem_limit_val = value_unlimited; physical_memory_size_type memory_limit = value_unlimited; if (read_memory_limit_in_bytes(upper_mem_bound, mem_limit_val)) { if (mem_limit_val != value_unlimited) { memory_limit = mem_limit_val; } } if (memory_sw_limit != value_unlimited && memory_limit != value_unlimited) { if (memory_limit < memory_sw_limit) { // swap allowed and > 0 physical_memory_size_type swap_usage = 0; if (!memory_swap_usage_impl(reader(), swap_usage)) { return false; } result = swap_usage; return true; } } return memory_usage_in_bytes(result); } bool CgroupV1MemoryController::read_mem_swappiness(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.swappiness", "Swappiness", result); } bool CgroupV1MemoryController::memory_soft_limit_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.soft_limit_in_bytes", "Memory Soft Limit", result); } bool CgroupV1MemoryController::memory_soft_limit_in_bytes(physical_memory_size_type upper_bound, physical_memory_size_type& result) { physical_memory_size_type mem_soft_limit = 0; if (!memory_soft_limit_val(mem_soft_limit)) { return false; } if (mem_soft_limit >= upper_bound) { log_trace(os, container)("Memory Soft Limit is: Unlimited"); result = value_unlimited; } else { result = mem_soft_limit; } return true; } bool CgroupV1MemoryController::memory_throttle_limit_in_bytes(physical_memory_size_type& result) { // Log this string at trace level so as to make tests happy. log_trace(os, container)("Memory Throttle Limit is not supported."); return false; } // Constructor CgroupV1Subsystem::CgroupV1Subsystem(CgroupV1Controller* cpuset, CgroupV1CpuController* cpu, CgroupV1CpuacctController* cpuacct, CgroupV1Controller* pids, CgroupV1MemoryController* memory) : _cpuset(cpuset), _cpuacct(cpuacct), _pids(pids) { CgroupUtil::adjust_controller(memory); CgroupUtil::adjust_controller(cpu); _memory = new CachingCgroupController(memory); _cpu = new CachingCgroupController(cpu); } bool CgroupV1Subsystem::is_containerized() { // containerized iff all required controllers are mounted // read-only. See OSContainer::is_containerized() for // the full logic. // return _memory->controller()->is_read_only() && _cpu->controller()->is_read_only() && _cpuacct->is_read_only() && _cpuset->is_read_only(); } bool CgroupV1MemoryController::memory_usage_in_bytes(physical_memory_size_type& result) { physical_memory_size_type memory_usage = 0; if (!memory_usage_val(memory_usage)) { return false; } result = memory_usage; return true; } /* memory_usage_val * * Read the amount of used memory for this process into the passed in reference 'result' * * return: * true when reading of the file was successful and 'result' was set appropriately * false when reading of the file failed */ bool CgroupV1MemoryController::memory_usage_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.usage_in_bytes", "Memory Usage", result); } bool CgroupV1MemoryController::memory_max_usage_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.max_usage_in_bytes", "Maximum Memory Usage", result); } /* memory_max_usage_in_bytes * * Return the maximum amount of used memory for this process in the * result reference. * * return: * true if the result reference has been set * false otherwise (e.g. on error) */ bool CgroupV1MemoryController::memory_max_usage_in_bytes(physical_memory_size_type& result) { physical_memory_size_type memory_max_usage = 0; if (!memory_max_usage_val(memory_max_usage)) { return false; } result = memory_max_usage; return true; } bool CgroupV1MemoryController::rss_usage_in_bytes(physical_memory_size_type& result) { physical_memory_size_type rss = 0; if (!reader()->read_numerical_key_value("/memory.stat", "rss", rss)) { return false; } log_trace(os, container)("RSS usage is: " PHYS_MEM_TYPE_FORMAT, rss); result = rss; return true; } bool CgroupV1MemoryController::cache_usage_in_bytes(physical_memory_size_type& result) { physical_memory_size_type cache = 0; if (!reader()->read_numerical_key_value("/memory.stat", "cache", cache)) { return false; } log_trace(os, container)("Cache usage is: " PHYS_MEM_TYPE_FORMAT, cache); result = cache; return true; } bool CgroupV1MemoryController::kernel_memory_usage_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.usage_in_bytes", "Kernel Memory Usage", result); } bool CgroupV1MemoryController::kernel_memory_usage_in_bytes(physical_memory_size_type& result) { physical_memory_size_type kmem_usage = 0; if (!kernel_memory_usage_val(kmem_usage)) { return false; } result = kmem_usage; return true; } bool CgroupV1MemoryController::kernel_memory_limit_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.limit_in_bytes", "Kernel Memory Limit", result); } bool CgroupV1MemoryController::kernel_memory_limit_in_bytes(physical_memory_size_type upper_bound, physical_memory_size_type& result) { physical_memory_size_type kmem_limit = 0; if (!kernel_memory_limit_val(kmem_limit)) { return false; } if (kmem_limit >= upper_bound) { kmem_limit = value_unlimited; } result = kmem_limit; return true; } bool CgroupV1MemoryController::kernel_memory_max_usage_val(physical_memory_size_type& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.max_usage_in_bytes", "Maximum Kernel Memory Usage", result); } bool CgroupV1MemoryController::kernel_memory_max_usage_in_bytes(physical_memory_size_type& result) { physical_memory_size_type kmem_max_usage = 0; if (!kernel_memory_max_usage_val(kmem_max_usage)) { return false; } result = kmem_max_usage; return true; } void CgroupV1MemoryController::print_version_specific_info(outputStream* st, physical_memory_size_type mem_bound) { MetricResult kmem_usage; physical_memory_size_type temp = 0; if (kernel_memory_usage_in_bytes(temp)) { kmem_usage.set_value(temp); } MetricResult kmem_limit; temp = value_unlimited; if (kernel_memory_limit_in_bytes(mem_bound, temp)) { kmem_limit.set_value(temp); } MetricResult kmem_max_usage; temp = 0; if (kernel_memory_max_usage_in_bytes(temp)) { kmem_max_usage.set_value(temp); } OSContainer::print_container_helper(st, kmem_limit, "kernel_memory_limit"); OSContainer::print_container_helper(st, kmem_usage, "kernel_memory_usage"); OSContainer::print_container_helper(st, kmem_max_usage, "kernel_memory_max_usage"); } char* CgroupV1Subsystem::cpu_cpuset_cpus() { char cpus[1024]; CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.cpus", "cpuset.cpus", cpus, 1024); return os::strdup(cpus); } char* CgroupV1Subsystem::cpu_cpuset_memory_nodes() { char mems[1024]; CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.mems", "cpuset.mems", mems, 1024); return os::strdup(mems); } /* cpu_quota * * Return the number of microseconds per period * a process is guaranteed to run in the provided * result reference. * * return: * true if the value was set in the result reference * false on failure to read the number from the file * and the result reference has not been touched. */ bool CgroupV1CpuController::cpu_quota(int& result) { uint64_t quota = 0; // intentionally not using the macro so as to not log a // negative value as a large unsiged int if (!reader()->read_number("/cpu.cfs_quota_us", quota)) { log_trace(os, container)("CPU Quota failed"); return false; } // cast to int since the read value might be negative // and we want to avoid logging -1 as a large unsigned value. int quota_int = static_cast(quota); log_trace(os, container)("CPU Quota is: %d", quota_int); result = quota_int; return true; } bool CgroupV1CpuController::cpu_period_val(uint64_t& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.cfs_period_us", "CPU Period", result); } bool CgroupV1CpuController::cpu_period(int& result) { uint64_t period = value_unlimited; if (!cpu_period_val(period)) { return false; } result = static_cast(period); return true; } bool CgroupV1CpuController::cpu_shares_val(uint64_t& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.shares", "CPU Shares", result); } /* cpu_shares * * Return the amount of cpu shares available to the process * - Share number (typically a number relative to 1024) * - (2048 typically expresses 2 CPUs worth of processing) * * return: * false on error * true if the result has been set in the result reference */ bool CgroupV1CpuController::cpu_shares(int& result) { uint64_t shares = 0; if (!cpu_shares_val(shares)) { return false; } int shares_int = static_cast(shares); // Convert 1024 to no shares setup (-1) if (shares_int == 1024) { shares_int = -1; } result = shares_int; return true; } bool CgroupV1CpuacctController::cpu_usage_in_micros_val(uint64_t& result) { CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpuacct.usage", "CPU Usage", result); } bool CgroupV1CpuacctController::cpu_usage_in_micros(uint64_t& result) { uint64_t cpu_usage = 0; if (!cpu_usage_in_micros_val(cpu_usage)) { return false; } // Output is in nanoseconds, convert to microseconds. result = static_cast(cpu_usage / 1000); return true; } static bool pids_max_val(CgroupController* ctrl, uint64_t& result) { CONTAINER_READ_NUMBER_CHECKED_MAX(ctrl, "/pids.max", "Maximum number of tasks", result); } /* pids_max * * Return the maximum number of tasks available to the process * in the passed result reference (might be value_unlimited). * * return: * false on error * true when the result reference has been appropriately set */ bool CgroupV1Subsystem::pids_max(uint64_t& result) { if (_pids == nullptr) return false; uint64_t pids_val = 0; if (!pids_max_val(_pids, pids_val)) { return false; } result = pids_val; return true; } static bool pids_current_val(CgroupController* ctrl, uint64_t& result) { CONTAINER_READ_NUMBER_CHECKED(ctrl, "/pids.current", "Current number of tasks", result); } /* pids_current * * The number of tasks currently in the cgroup (and its descendants) of the process * * return: * true if the current number of tasks has been set in the result reference * false if an error occurred */ bool CgroupV1Subsystem::pids_current(uint64_t& result) { if (_pids == nullptr) return false; uint64_t pids_current = 0; if (!pids_current_val(_pids, pids_current)) { return false; } result = pids_current; return true; }