jdk/src/hotspot/os/linux/cgroupV1Subsystem_linux.cpp

/*
 * Copyright (c) 2019, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "cgroupUtil_linux.hpp"
#include "cgroupV1Subsystem_linux.hpp"
#include "logging/log.hpp"
#include "memory/allocation.hpp"
#include "os_linux.hpp"
#include "runtime/globals.hpp"
#include "runtime/os.hpp"
#include "utilities/globalDefinitions.hpp"
#include <errno.h>
#include <math.h>
#include <string.h>

/*
 * Set the directory for the subsystem-specific files based
 * on the contents of the mountinfo and cgroup files.
 *
 * The method determines whether it runs in
 * - host mode
 * - container mode
 *
 * In host mode, _root is equal to "/" and the subsystem path
 * is the _mount_point path joined with cgroup_path.
 *
 * In container mode, there are two possibilities:
 * - private namespace (cgroupns=private)
 * - host namespace (cgroupns=host, the default on cgroup v1 hosts)
 *
 * A private namespace is equivalent to host mode, i.e. the
 * subsystem path is set by concatenating _mount_point and
 * cgroup_path.
 *
 * In the host namespace, _root is equal to the host's cgroup path
 * of the control group to which the containerized process belonged
 * at the moment of its creation. The mountinfo and cgroup files are
 * mirrored from the host, while the subsystem-specific files are
 * mapped directly at _mount_point, i.e. at
 * /sys/fs/cgroup/<controller>/; the subsystem path is then set
 * equal to _mount_point.
 *
 * A special case arises when the cgroup path includes a subgroup,
 * i.e. when a containerized process was associated with an existing
 * cgroup that is different from the one in which it was created.
 * Here, _root is equal to the host's initial cgroup path, while
 * cgroup_path is the host's new cgroup path. As host cgroup
 * hierarchies are not accessible inside the container, it needs to
 * be determined which part of the cgroup path is accessible there,
 * i.e. mapped under /sys/fs/cgroup/<controller>/<subgroup>.
 * In Docker's default setup, the host's cgroup path can be of the
 * form /docker/<CONTAINER_ID>/<subgroup>, of which only <subgroup>
 * is mapped. The method trims the cgroup path from the left until
 * the subgroup component is found. The subsystem path is then set
 * to _mount_point joined with the subgroup path.
 */
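//
// Illustrative walk-through (hypothetical paths, using the memory controller
// as an example): with _mount_point == "/sys/fs/cgroup/memory" and a host
// cgroup path of "/docker/<CONTAINER_ID>/mygroup", the trimming loop probes
//   /sys/fs/cgroup/memory/docker/<CONTAINER_ID>/mygroup   (not present)
//   /sys/fs/cgroup/memory/<CONTAINER_ID>/mygroup          (not present)
//   /sys/fs/cgroup/memory/mygroup                         (present)
// and therefore sets the subsystem path to "/sys/fs/cgroup/memory/mygroup".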
void CgroupV1Controller::set_subsystem_path(const char* cgroup_path) {
  if (_cgroup_path != nullptr) {
    os::free(_cgroup_path);
  }
  if (_path != nullptr) {
    os::free(_path);
    _path = nullptr;
  }
  _cgroup_path = os::strdup(cgroup_path);
  stringStream ss;
  if (_root != nullptr && cgroup_path != nullptr) {
    ss.print_raw(_mount_point);
    if (strcmp(_root, "/") == 0) {
      // host processes and containers with cgroupns=private
      if (strcmp(cgroup_path, "/") != 0) {
        ss.print_raw(cgroup_path);
      }
    } else {
      // containers with cgroupns=host; the default setting is _root == cgroup_path
      if (strcmp(_root, cgroup_path) != 0) {
        if (*cgroup_path != '\0' && strcmp(cgroup_path, "/") != 0) {
          // When the process has been moved to a subgroup or between subgroups,
          // the path suffix will change.
          const char* suffix = cgroup_path;
          while (suffix != nullptr) {
            stringStream pp;
            pp.print_raw(_mount_point);
            pp.print_raw(suffix);
            if (os::file_exists(pp.base())) {
              ss.print_raw(suffix);
              if (suffix != cgroup_path) {
                log_trace(os, container)("set_subsystem_path: cgroup v1 path reduced to: %s.", suffix);
              }
              break;
            }
            log_trace(os, container)("set_subsystem_path: skipped non-existent directory: %s.", suffix);
            suffix = strchr(suffix + 1, '/');
          }
        }
      }
    }
    _path = os::strdup(ss.base());
  }
}

bool CgroupV1MemoryController::read_use_hierarchy_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.use_hierarchy", "Use Hierarchy", result);
}

bool CgroupV1MemoryController::uses_mem_hierarchy() {
  physical_memory_size_type use_hierarchy = 0;
  return read_use_hierarchy_val(use_hierarchy) && use_hierarchy > 0;
}

/*
 * In the common container case we have _root == _cgroup_path and thus set the
 * controller path to the _mount_point. This is where the limits are exposed in
 * the cgroup pseudo filesystem (at the leaf), so no adjustment of the path is
 * needed in that case.
 */
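// For illustration (hypothetical paths): if _root == "/docker/<CONTAINER_ID>"
// and _cgroup_path is also "/docker/<CONTAINER_ID>", no adjustment is needed;
// if the process was later moved so that _cgroup_path became
// "/docker/<CONTAINER_ID>/mygroup", the paths differ and this returns true.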
bool CgroupV1Controller::needs_hierarchy_adjustment() {
  assert(_cgroup_path != nullptr, "sanity");
  return strcmp(_root, _cgroup_path) != 0;
}

bool CgroupV1MemoryController::read_memory_limit_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.limit_in_bytes", "Memory Limit", result);
}

bool CgroupV1MemoryController::read_hierarchical_memory_limit_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(reader(), "/memory.stat",
                                             "hierarchical_memory_limit", "Hierarchical Memory Limit",
                                             result);
}

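/* read_memory_limit_in_bytes
 *
 * Summary of the logic below: read memory.limit_in_bytes; a value at or above
 * 'upper_bound' (typically the host's physical memory) is treated as unlimited.
 * In that case, if use_hierarchy is enabled, fall back to
 * hierarchical_memory_limit from memory.stat before reporting 'value_unlimited'.
 *
 * return:
 *    true if 'result' has been set
 *    false if reading the limit failed
 */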
bool CgroupV1MemoryController::read_memory_limit_in_bytes(physical_memory_size_type upper_bound,
                                                          physical_memory_size_type& result) {
  physical_memory_size_type memlimit = 0;
  if (!read_memory_limit_val(memlimit)) {
    log_trace(os, container)("container memory limit failed, upper bound is " PHYS_MEM_TYPE_FORMAT, upper_bound);
    return false;
  }
  if (memlimit >= upper_bound) {
    physical_memory_size_type hierlimit = 0;
    if (uses_mem_hierarchy() && read_hierarchical_memory_limit_val(hierlimit) &&
        hierlimit < upper_bound) {
      log_trace(os, container)("Memory Limit is: " PHYS_MEM_TYPE_FORMAT, hierlimit);
      result = hierlimit;
    } else {
      // A limit at or above physical memory is treated as unlimited. Cgroup v1
      // has no explicit value to represent 'max', so anything >= host memory
      // is reported as 'value_unlimited'.
      log_trace(os, container)("container memory limit ignored: " PHYS_MEM_TYPE_FORMAT
                               ", upper bound is " PHYS_MEM_TYPE_FORMAT, memlimit, upper_bound);
      result = value_unlimited;
    }
  } else {
    result = memlimit;
  }
  return true;
}

bool CgroupV1MemoryController::read_mem_swap(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.memsw.limit_in_bytes", "Memory and Swap Limit", result);
}

bool CgroupV1MemoryController::read_hierarchical_mem_swap_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(reader(), "/memory.stat",
                                             "hierarchical_memsw_limit", "Hierarchical Memory and Swap Limit",
                                             result);
}

/* memory_and_swap_limit_in_bytes
 *
 * Determine the memory and swap limit metric. Sets the 'result' reference to a
 * positive limit value or 'value_unlimited' (for unlimited).
 *
 * return:
 *    false if an error occurred; the 'result' reference remains unchanged.
 *    true if the limit value has been set in the 'result' reference
 */
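// Outline of the logic below: read memory.memsw.limit_in_bytes (falling back to
// hierarchical_memsw_limit when use_hierarchy is enabled); a limit at or above
// upper_mem_bound + upper_swap_bound is treated as unlimited. If swap is not
// supported (the swap files cannot be read) or memory.swappiness is 0, the
// plain memory limit is used instead.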
bool CgroupV1MemoryController::memory_and_swap_limit_in_bytes(physical_memory_size_type upper_mem_bound,
                                                              physical_memory_size_type upper_swap_bound,
                                                              physical_memory_size_type& result) {
  physical_memory_size_type total_mem_swap = upper_mem_bound + upper_swap_bound;
  physical_memory_size_type memory_swap = 0;
  bool mem_swap_read_failed = false;
  if (!read_mem_swap(memory_swap)) {
    mem_swap_read_failed = true;
  }
  if (memory_swap >= total_mem_swap) {
    physical_memory_size_type hiermswlimit = 0;
    if (uses_mem_hierarchy() && read_hierarchical_mem_swap_val(hiermswlimit) &&
        hiermswlimit < total_mem_swap) {
      log_trace(os, container)("Memory and Swap Limit is: " PHYS_MEM_TYPE_FORMAT, hiermswlimit);
      memory_swap = hiermswlimit;
    } else {
      memory_swap = value_unlimited;
    }
  }
  if (memory_swap == value_unlimited) {
    log_trace(os, container)("Memory and Swap Limit is: Unlimited");
    result = value_unlimited;
    return true;
  }
  // If there is a swap limit, but swappiness == 0, reset the limit
  // to the memory limit. Do the same for cases where swap isn't
  // supported.
  physical_memory_size_type swappiness = 0;
  if (!read_mem_swappiness(swappiness)) {
    // assume no swap
    mem_swap_read_failed = true;
  }
  if (swappiness == 0 || mem_swap_read_failed) {
    physical_memory_size_type memlimit = value_unlimited;
    if (!read_memory_limit_in_bytes(upper_mem_bound, memlimit)) {
      return false;
    }
    if (memlimit == value_unlimited) {
      result = value_unlimited; // No memory limit, thus no swap limit
      return true;
    }
    if (mem_swap_read_failed) {
      log_trace(os, container)("Memory and Swap Limit has been reset to " PHYS_MEM_TYPE_FORMAT
                               " because swap is not supported", memlimit);
    } else {
      log_trace(os, container)("Memory and Swap Limit has been reset to " PHYS_MEM_TYPE_FORMAT
                               " because swappiness is 0", memlimit);
    }
    result = memlimit;
    return true;
  }
  result = memory_swap;
  return true;
}

static inline
bool memory_swap_usage_impl(CgroupController* ctrl, physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(ctrl, "/memory.memsw.usage_in_bytes", "mem swap usage", result);
}

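/* memory_and_swap_usage_in_bytes
 *
 * Summary of the logic below: when both a memory limit and a larger
 * memory-and-swap limit are in effect (i.e. swap is actually allowed),
 * report memory.memsw.usage_in_bytes; otherwise fall back to the plain
 * memory usage.
 *
 * return:
 *    true if 'result' has been set
 *    false if an error occurred
 */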
bool CgroupV1MemoryController::memory_and_swap_usage_in_bytes(physical_memory_size_type upper_mem_bound,
                                                              physical_memory_size_type upper_swap_bound,
                                                              physical_memory_size_type& result) {
  physical_memory_size_type memory_sw_limit = value_unlimited;
  if (!memory_and_swap_limit_in_bytes(upper_mem_bound, upper_swap_bound, memory_sw_limit)) {
    return false;
  }
  physical_memory_size_type mem_limit_val = value_unlimited;
  physical_memory_size_type memory_limit = value_unlimited;
  if (read_memory_limit_in_bytes(upper_mem_bound, mem_limit_val)) {
    if (mem_limit_val != value_unlimited) {
      memory_limit = mem_limit_val;
    }
  }
  if (memory_sw_limit != value_unlimited && memory_limit != value_unlimited) {
    if (memory_limit < memory_sw_limit) {
      // swap allowed and > 0
      physical_memory_size_type swap_usage = 0;
      if (!memory_swap_usage_impl(reader(), swap_usage)) {
        return false;
      }
      result = swap_usage;
      return true;
    }
  }
  return memory_usage_in_bytes(result);
}

bool CgroupV1MemoryController::read_mem_swappiness(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.swappiness", "Swappiness", result);
}

bool CgroupV1MemoryController::memory_soft_limit_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.soft_limit_in_bytes", "Memory Soft Limit", result);
}

bool CgroupV1MemoryController::memory_soft_limit_in_bytes(physical_memory_size_type upper_bound,
                                                          physical_memory_size_type& result) {
  physical_memory_size_type mem_soft_limit = 0;
  if (!memory_soft_limit_val(mem_soft_limit)) {
    return false;
  }
  if (mem_soft_limit >= upper_bound) {
    log_trace(os, container)("Memory Soft Limit is: Unlimited");
    result = value_unlimited;
  } else {
    result = mem_soft_limit;
  }
  return true;
}

bool CgroupV1MemoryController::memory_throttle_limit_in_bytes(physical_memory_size_type& result) {
  // Log this string at trace level so as to make tests happy.
  log_trace(os, container)("Memory Throttle Limit is not supported.");
  return false;
}

// Constructor
CgroupV1Subsystem::CgroupV1Subsystem(CgroupV1Controller* cpuset,
                                     CgroupV1CpuController* cpu,
                                     CgroupV1CpuacctController* cpuacct,
                                     CgroupV1Controller* pids,
                                     CgroupV1MemoryController* memory) :
    _cpuset(cpuset),
    _cpuacct(cpuacct),
    _pids(pids) {
  CgroupUtil::adjust_controller(memory);
  CgroupUtil::adjust_controller(cpu);
  _memory = new CachingCgroupController<CgroupMemoryController, physical_memory_size_type>(memory);
  _cpu = new CachingCgroupController<CgroupCpuController, double>(cpu);
}

bool CgroupV1Subsystem::is_containerized() {
  // containerized iff all required controllers are mounted
  // read-only. See OSContainer::is_containerized() for
  // the full logic.
  //
  return _memory->controller()->is_read_only() &&
         _cpu->controller()->is_read_only() &&
         _cpuacct->is_read_only() &&
         _cpuset->is_read_only();
}

bool CgroupV1MemoryController::memory_usage_in_bytes(physical_memory_size_type& result) {
  physical_memory_size_type memory_usage = 0;
  if (!memory_usage_val(memory_usage)) {
    return false;
  }
  result = memory_usage;
  return true;
}

/* memory_usage_val
 *
 * Read the amount of used memory for this process into the passed-in reference 'result'.
 *
 * return:
 *    true when reading the file was successful and 'result' was set appropriately
 *    false when reading the file failed
 */
bool CgroupV1MemoryController::memory_usage_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.usage_in_bytes", "Memory Usage", result);
}

bool CgroupV1MemoryController::memory_max_usage_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.max_usage_in_bytes", "Maximum Memory Usage", result);
}

/* memory_max_usage_in_bytes
 *
 * Return the maximum amount of used memory for this process in the
 * result reference.
 *
 * return:
 *    true if the result reference has been set
 *    false otherwise (e.g. on error)
 */
bool CgroupV1MemoryController::memory_max_usage_in_bytes(physical_memory_size_type& result) {
  physical_memory_size_type memory_max_usage = 0;
  if (!memory_max_usage_val(memory_max_usage)) {
    return false;
  }
  result = memory_max_usage;
  return true;
}

bool CgroupV1MemoryController::rss_usage_in_bytes(physical_memory_size_type& result) {
  physical_memory_size_type rss = 0;
  if (!reader()->read_numerical_key_value("/memory.stat", "rss", rss)) {
    return false;
  }
  log_trace(os, container)("RSS usage is: " PHYS_MEM_TYPE_FORMAT, rss);
  result = rss;
  return true;
}

bool CgroupV1MemoryController::cache_usage_in_bytes(physical_memory_size_type& result) {
  physical_memory_size_type cache = 0;
  if (!reader()->read_numerical_key_value("/memory.stat", "cache", cache)) {
    return false;
  }
  log_trace(os, container)("Cache usage is: " PHYS_MEM_TYPE_FORMAT, cache);
  result = cache;
  return true;
}

bool CgroupV1MemoryController::kernel_memory_usage_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.usage_in_bytes", "Kernel Memory Usage", result);
}

bool CgroupV1MemoryController::kernel_memory_usage_in_bytes(physical_memory_size_type& result) {
  physical_memory_size_type kmem_usage = 0;
  if (!kernel_memory_usage_val(kmem_usage)) {
    return false;
  }
  result = kmem_usage;
  return true;
}

bool CgroupV1MemoryController::kernel_memory_limit_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.limit_in_bytes", "Kernel Memory Limit", result);
}

bool CgroupV1MemoryController::kernel_memory_limit_in_bytes(physical_memory_size_type upper_bound,
                                                            physical_memory_size_type& result) {
  physical_memory_size_type kmem_limit = 0;
  if (!kernel_memory_limit_val(kmem_limit)) {
    return false;
  }
  if (kmem_limit >= upper_bound) {
    kmem_limit = value_unlimited;
  }
  result = kmem_limit;
  return true;
}

bool CgroupV1MemoryController::kernel_memory_max_usage_val(physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.max_usage_in_bytes", "Maximum Kernel Memory Usage", result);
}

bool CgroupV1MemoryController::kernel_memory_max_usage_in_bytes(physical_memory_size_type& result) {
  physical_memory_size_type kmem_max_usage = 0;
  if (!kernel_memory_max_usage_val(kmem_max_usage)) {
    return false;
  }
  result = kmem_max_usage;
  return true;
}

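/* print_version_specific_info
 *
 * Print the cgroup v1 specific kernel memory metrics (limit, usage and
 * maximum usage) to the given stream. Values that cannot be read are left
 * unset in their MetricResult before being passed to
 * OSContainer::print_container_helper().
 */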
void CgroupV1MemoryController::print_version_specific_info(outputStream* st, physical_memory_size_type mem_bound) {
  MetricResult kmem_usage;
  physical_memory_size_type temp = 0;
  if (kernel_memory_usage_in_bytes(temp)) {
    kmem_usage.set_value(temp);
  }
  MetricResult kmem_limit;
  temp = value_unlimited;
  if (kernel_memory_limit_in_bytes(mem_bound, temp)) {
    kmem_limit.set_value(temp);
  }
  MetricResult kmem_max_usage;
  temp = 0;
  if (kernel_memory_max_usage_in_bytes(temp)) {
    kmem_max_usage.set_value(temp);
  }
  OSContainer::print_container_helper(st, kmem_limit, "kernel_memory_limit");
  OSContainer::print_container_helper(st, kmem_usage, "kernel_memory_usage");
  OSContainer::print_container_helper(st, kmem_max_usage, "kernel_memory_max_usage");
}

char* CgroupV1Subsystem::cpu_cpuset_cpus() {
  char cpus[1024];
  CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.cpus", "cpuset.cpus", cpus, 1024);
  return os::strdup(cpus);
}

char* CgroupV1Subsystem::cpu_cpuset_memory_nodes() {
  char mems[1024];
  CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.mems", "cpuset.mems", mems, 1024);
  return os::strdup(mems);
}

/* cpu_quota
 *
 * Return, in the provided result reference, the number of microseconds
 * per period a process is guaranteed to run.
 *
 * return:
 *    true if the value was set in the result reference
 *    false on failure to read the number from the file; the result
 *    reference is left untouched.
 */
bool CgroupV1CpuController::cpu_quota(int& result) {
  uint64_t quota = 0;
  // Intentionally not using the macro, so as to not log a
  // negative value as a large unsigned int.
  if (!reader()->read_number("/cpu.cfs_quota_us", quota)) {
    log_trace(os, container)("CPU Quota failed");
    return false;
  }
  // Cast to int since the read value might be negative
  // and we want to avoid logging -1 as a large unsigned value.
  int quota_int = static_cast<int>(quota);
  log_trace(os, container)("CPU Quota is: %d", quota_int);
  result = quota_int;
  return true;
}

bool CgroupV1CpuController::cpu_period_val(uint64_t& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.cfs_period_us", "CPU Period", result);
}

bool CgroupV1CpuController::cpu_period(int& result) {
  uint64_t period = value_unlimited;
  if (!cpu_period_val(period)) {
    return false;
  }
  result = static_cast<int>(period);
  return true;
}

bool CgroupV1CpuController::cpu_shares_val(uint64_t& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.shares", "CPU Shares", result);
}

/* cpu_shares
 *
 * Return the number of CPU shares available to the process:
 *    a share value relative to 1024 (e.g. 2048 typically expresses
 *    two CPUs' worth of processing).
 *
 * return:
 *    false on error
 *    true if the result has been set in the result reference
 */
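// Note the special case below: a value of exactly 1024 (the cgroup v1
// default) is reported as -1, i.e. treated as "no shares configured";
// e.g. cpu.shares == 2048 yields 2048, while cpu.shares == 1024 yields -1.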
bool CgroupV1CpuController::cpu_shares(int& result) {
  uint64_t shares = 0;
  if (!cpu_shares_val(shares)) {
    return false;
  }
  int shares_int = static_cast<int>(shares);
  // Convert 1024 to no shares setup (-1)
  if (shares_int == 1024) {
    shares_int = -1;
  }
  result = shares_int;
  return true;
}

bool CgroupV1CpuacctController::cpu_usage_in_micros_val(uint64_t& result) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpuacct.usage", "CPU Usage", result);
}

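/* cpu_usage_in_micros
 *
 * Return the total CPU time consumed by the cgroup, as reported by
 * cpuacct.usage (nanoseconds), converted to microseconds.
 *
 * return:
 *    true if 'result' has been set
 *    false if reading cpuacct.usage failed
 */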
bool CgroupV1CpuacctController::cpu_usage_in_micros(uint64_t& result) {
  uint64_t cpu_usage = 0;
  if (!cpu_usage_in_micros_val(cpu_usage)) {
    return false;
  }
  // Output is in nanoseconds, convert to microseconds.
  result = static_cast<uint64_t>(cpu_usage / 1000);
  return true;
}

static
bool pids_max_val(CgroupController* ctrl, uint64_t& result) {
  CONTAINER_READ_NUMBER_CHECKED_MAX(ctrl, "/pids.max", "Maximum number of tasks", result);
}

/* pids_max
 *
 * Return the maximum number of tasks available to the process
 * in the passed result reference (might be value_unlimited).
 *
 * return:
 *    false on error
 *    true when the result reference has been appropriately set
 */
bool CgroupV1Subsystem::pids_max(uint64_t& result) {
  if (_pids == nullptr) return false;
  uint64_t pids_val = 0;
  if (!pids_max_val(_pids, pids_val)) {
    return false;
  }
  result = pids_val;
  return true;
}

static
bool pids_current_val(CgroupController* ctrl, uint64_t& result) {
  CONTAINER_READ_NUMBER_CHECKED(ctrl, "/pids.current", "Current number of tasks", result);
}

/* pids_current
 *
 * Return the number of tasks currently in the process's cgroup
 * (and its descendants).
 *
 * return:
 *    true if the current number of tasks has been set in the result reference
 *    false if an error occurred
 */
bool CgroupV1Subsystem::pids_current(uint64_t& result) {
  if (_pids == nullptr) return false;
  uint64_t pids_current = 0;
  if (!pids_current_val(_pids, pids_current)) {
    return false;
  }
  result = pids_current;
  return true;
}