jdk/src/hotspot/os/linux/cgroupV2Subsystem_linux.cpp

/*
 * Copyright (c) 2020, 2025, Red Hat Inc.
 * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "cgroupUtil_linux.hpp"
#include "cgroupV2Subsystem_linux.hpp"

#include <math.h>

// Construct a controller for the cgroup 'cgroup_path' below the mount point
// 'mount_path'. 'ro' is stored in _read_only, the combined path is computed
// up front by construct_path(), and private copies of both input strings are
// taken with os::strdup().
CgroupV2Controller::CgroupV2Controller(char* mount_path,
                                       char *cgroup_path,
                                       bool ro)
  : _read_only(ro),
    _path(construct_path(mount_path, cgroup_path)) {
  _mount_point = os::strdup(mount_path);
  _cgroup_path = os::strdup(cgroup_path);
}
// Shallow copy constructor: no string is duplicated here, so the new object
// refers to the very same _path, _cgroup_path and _mount_point storage as 'o'.
CgroupV2Controller::CgroupV2Controller(const CgroupV2Controller& o)
  : _read_only(o._read_only),
    _path(o._path) {
  _mount_point = o._mount_point;
  _cgroup_path = o._cgroup_path;
}

// Read the raw cgroup v2 CPU weight from the "/cpu.weight" interface file of
// 'ctrl' into 'value'.
// There is no explicit return statement here: the return value is produced
// inside the CONTAINER_READ_NUMBER_CHECKED macro, which is defined outside
// this file.
// NOTE(review): presumably the macro returns false when the read fails and
// true once 'value' has been set - confirm against the macro definition.
static
bool read_cpu_shares_value(CgroupV2Controller* ctrl, uint64_t& value) {
  CONTAINER_READ_NUMBER_CHECKED(ctrl, "/cpu.weight", "Raw value for CPU Shares", value);
}

/* cpu_shares
 *
 * Return the amount of cpu shares available to the process in the
 * 'result' reference.
 *
 *    Share number (typically a number relative to 1024)
 *                 (2048 typically expresses 2 CPUs worth of processing)
 *
 * return:
 *    true if the result reference got updated
 *    false if there was an error
 */
bool CgroupV2CpuController::cpu_shares(int& result) {
  uint64_t shares = 0;
  bool is_ok = read_cpu_shares_value(reader(), shares);
  if (!is_ok) {
    return false;
  }
  int shares_int = static_cast<int>(shares);
  // Convert default value of 100 to no shares setup
  if (shares_int == 100) {
    log_debug(os, container)("CPU Shares is: unlimited");
    result = -1;
    return true;
  }
  // cg v2 values must be in range [1-10000]
  assert(shares_int >= 1 && shares_int <= 10000, "invariant");

  // CPU shares (OCI) value needs to get translated into
  // a proper Cgroups v2 value. See:
  // https://github.com/containers/crun/blob/1.24/crun.1.md#cpu-controller
  //
  // Use the inverse of (x == OCI value, y == cgroupsv2 value):
  // y = 10^(log2(x)^2/612 + 125/612 * log2(x) - 7.0/34.0)
  //
  // By re-arranging it to the standard quadratic form:
  // log2(x)^2 + 125 * log2(x) - (126 + 612 * log_10(y)) = 0
  //
  // Therefore, log2(x) = (-125 + sqrt( 125^2 - 4 * (-(126 + 612 * log_10(y)))))/2
  //
  // As a result we have the inverse (we can discount substraction of the
  // square root value since those values result in very small numbers and the
  // cpu shares values - OCI - are in range [2,262144]):
  //
  // x = 2^((-125 + sqrt(16129 + 2448* log10(y)))/2)
  //
  double log_multiplicand = log10(shares_int);
  double discriminant = 16129 + 2448 * log_multiplicand;
  double square_root = sqrt(discriminant);
  double exponent = (-125 + square_root)/2;
  double scaled_val = pow(2, exponent);
  int x = (int) scaled_val;
  log_trace(os, container)("Scaled CPU shares value is: %d", x);
  // Since the scaled value is not precise, return the closest
  // multiple of PER_CPU_SHARES for a more conservative mapping
  if ( x <= PER_CPU_SHARES ) {
     // Don't do the multiples of PER_CPU_SHARES mapping since we
     // have a value <= PER_CPU_SHARES
     log_debug(os, container)("CPU Shares is: %d", x);
     result = x;
     return true;
  }
  int f = x/PER_CPU_SHARES;
  int lower_multiple = f * PER_CPU_SHARES;
  int upper_multiple = (f + 1) * PER_CPU_SHARES;
  int distance_lower = MAX2(lower_multiple, x) - MIN2(lower_multiple, x);
  int distance_upper = MAX2(upper_multiple, x) - MIN2(upper_multiple, x);
  x = distance_lower <= distance_upper ? lower_multiple : upper_multiple;
  log_trace(os, container)("Closest multiple of %d of the CPU Shares value is: %d", PER_CPU_SHARES, x);
  log_debug(os, container)("CPU Shares is: %d", x);
  result = x;
  return true;
}

/* cpu_quota
 *
 * Return the number of microseconds per period
 * process is guaranteed to run in the passed in 'result' reference.
 *
 * return:
 *    true if the result reference has been set
 *    false on error
 */
bool CgroupV2CpuController::cpu_quota(int& result) {
  uint64_t quota_val = 0;
  if (!reader()->read_numerical_tuple_value("/cpu.max", true /* use_first */, quota_val)) {
    return false;
  }
  int limit = -1;
  // The read first tuple value might be 'max' which maps
  // to value_unlimited. Keep that at -1;
  if (quota_val != value_unlimited) {
    limit = static_cast<int>(quota_val);
  }
  log_trace(os, container)("CPU Quota is: %d", limit);
  result = limit;
  return true;
}

// Assemble the cgroup v2 subsystem from its controllers.
// 'unified' is copied into _unified and 'cpuacct' is stored as passed in.
// The memory and cpu controllers are handed to CgroupUtil::adjust_controller()
// first and are then wrapped in caching controllers.
CgroupV2Subsystem::CgroupV2Subsystem(CgroupV2MemoryController * memory,
                                     CgroupV2CpuController* cpu,
                                     CgroupV2CpuacctController* cpuacct,
                                     CgroupV2Controller unified)
  : _unified(unified) {
  _cpuacct = cpuacct;

  CgroupUtil::adjust_controller(memory);
  CgroupUtil::adjust_controller(cpu);

  _memory = new CachingCgroupController<CgroupMemoryController>(memory);
  _cpu = new CachingCgroupController<CgroupCpuController>(cpu);
}

// Returns true only if the unified controller, the memory controller and the
// cpu controller are all read-only. The checks are made in that order and
// stop at the first controller that is not read-only.
bool CgroupV2Subsystem::is_containerized() {
  if (!_unified.is_read_only()) {
    return false;
  }
  if (!_memory->controller()->is_read_only()) {
    return false;
  }
  return _cpu->controller()->is_read_only();
}

// Read the "/cpuset.cpus" interface file of the unified controller into a
// local 1024 byte buffer and return a copy of it made with os::strdup().
// NOTE(review): the handling of a failed read lives inside the
// CONTAINER_READ_STRING_CHECKED macro (defined outside this file) - confirm
// what is returned in that case. The caller presumably owns the returned
// string and has to release it - verify against callers.
char* CgroupV2Subsystem::cpu_cpuset_cpus() {
  char cpus[1024];
  CONTAINER_READ_STRING_CHECKED(unified(), "/cpuset.cpus", "cpuset.cpus", cpus, 1024);
  return os::strdup(cpus);
}

// Read the "/cpuset.mems" interface file of the unified controller into a
// local 1024 byte buffer and return a copy of it made with os::strdup().
// NOTE(review): the handling of a failed read lives inside the
// CONTAINER_READ_STRING_CHECKED macro (defined outside this file) - confirm
// what is returned in that case. The caller presumably owns the returned
// string and has to release it - verify against callers.
char* CgroupV2Subsystem::cpu_cpuset_memory_nodes() {
  char mems[1024];
  CONTAINER_READ_STRING_CHECKED(unified(), "/cpuset.mems", "cpuset.mems", mems, 1024);
  return os::strdup(mems);
}

bool CgroupV2CpuController::cpu_period(int& result) {
  uint64_t cpu_period = 0;
  if (!reader()->read_numerical_tuple_value("/cpu.max", false /* use_first */, cpu_period)) {
    log_trace(os, container)("CPU Period failed");
    return false;
  }
  int period_int = static_cast<int>(cpu_period);
  log_trace(os, container)("CPU Period is: %d", period_int);
  result = period_int;
  return true;
}

/* cpu_usage_in_micros
 *
 * Read the "usage_usec" entry of the "/cpu.stat" interface file into the
 * passed in 'value' reference.
 *
 * return:
 *    true if the entry could be read
 *    false on error
 */
bool CgroupV2CpuController::cpu_usage_in_micros(uint64_t& value) {
  if (reader()->read_numerical_key_value("/cpu.stat", "usage_usec", value)) {
    log_trace(os, container)("CPU Usage is: " UINT64_FORMAT, value);
    return true;
  }
  log_trace(os, container)("CPU Usage failed");
  return false;
}

/* memory_usage_in_bytes
 *
 * Read the amount of memory used by this cgroup and its descendants
 * (the "/memory.current" interface file) into the passed in 'value'
 * reference.
 *
 * The return statement is supplied by the CONTAINER_READ_NUMBER_CHECKED
 * macro, which is defined outside this file.
 *
 * return:
 *    false on failure, true otherwise.
 */
bool CgroupV2MemoryController::memory_usage_in_bytes(physical_memory_size_type& value) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.current", "Memory Usage", value);
}

// Read the memory soft limit from the "/memory.low" interface file into
// 'value'. 'upper_bound' is not used by this implementation.
// The return statement is supplied by the CONTAINER_READ_NUMBER_CHECKED_MAX
// macro, which is defined outside this file.
// NOTE(review): presumably the _MAX variant maps the literal 'max' to
// value_unlimited and returns false on a failed read - confirm against the
// macro definition.
bool CgroupV2MemoryController::memory_soft_limit_in_bytes(physical_memory_size_type upper_bound,
                                                          physical_memory_size_type& value) {
  CONTAINER_READ_NUMBER_CHECKED_MAX(reader(), "/memory.low", "Memory Soft Limit", value);
}

// Read the memory throttle limit from the "/memory.high" interface file into
// 'value'.
// The return statement is supplied by the CONTAINER_READ_NUMBER_CHECKED_MAX
// macro, which is defined outside this file.
// NOTE(review): presumably the _MAX variant maps the literal 'max' to
// value_unlimited and returns false on a failed read - confirm against the
// macro definition.
bool CgroupV2MemoryController::memory_throttle_limit_in_bytes(physical_memory_size_type& value) {
  CONTAINER_READ_NUMBER_CHECKED_MAX(reader(), "/memory.high", "Memory Throttle Limit", value);
}

// Read the maximum memory usage from the "/memory.peak" interface file into
// 'value'.
// The return statement is supplied by the CONTAINER_READ_NUMBER_CHECKED
// macro, which is defined outside this file.
// NOTE(review): presumably returns false on a failed read and true once
// 'value' has been set - confirm against the macro definition.
bool CgroupV2MemoryController::memory_max_usage_in_bytes(physical_memory_size_type& value) {
  CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.peak", "Maximum Memory Usage", value);
}

// Report the "anon" entry of the "/memory.stat" interface file as RSS usage
// in 'value'. Returns false, without logging, if the entry cannot be read.
bool CgroupV2MemoryController::rss_usage_in_bytes(physical_memory_size_type& value) {
  const bool read_ok = reader()->read_numerical_key_value("/memory.stat", "anon", value);
  if (read_ok) {
    log_trace(os, container)("RSS usage is: " PHYS_MEM_TYPE_FORMAT, value);
  }
  return read_ok;
}

// Report the "file" entry of the "/memory.stat" interface file as cache usage
// in 'value'. Returns false, without logging, if the entry cannot be read.
bool CgroupV2MemoryController::cache_usage_in_bytes(physical_memory_size_type& value) {
  const bool read_ok = reader()->read_numerical_key_value("/memory.stat", "file", value);
  if (read_ok) {
    log_trace(os, container)("Cache usage is: " PHYS_MEM_TYPE_FORMAT, value);
  }
  return read_ok;
}

// Note that for cgroups v2 the actual limits set for swap and
// memory live in two different files, memory.swap.max and memory.max
// respectively. In order to properly report a cgroup v1 like
// compound value we need to sum the two values.
//
// Outcomes ('result' is only written when true is returned):
//  - memory.swap.max cannot be read: treated as "no swap", the plain memory
//    limit is reported (false if that cannot be read either).
//  - swap limit is unlimited: the compound limit is value_unlimited.
//  - swap limit is set: memory limit + swap limit. A swap limit without a
//    memory limit is not expected (see the assert below). Should it occur in
//    a build without asserts, the compound limit is reported as unlimited
//    rather than adding the swap limit to the value_unlimited sentinel.
//
// 'upper_swap_bound' is unused in cg v2.
bool CgroupV2MemoryController::memory_and_swap_limit_in_bytes(physical_memory_size_type upper_mem_bound,
                                                              physical_memory_size_type upper_swap_bound, /* unused in cg v2 */
                                                              physical_memory_size_type& result) {
  physical_memory_size_type swap_limit_val = 0;
  if (!reader()->read_number_handle_max("/memory.swap.max", swap_limit_val)) {
    // Some container tests rely on this trace logging to happen.
    log_trace(os, container)("Swap Limit failed");
    // swap disabled at kernel level, treat it as no swap
    physical_memory_size_type mem_limit = value_unlimited;
    if (!read_memory_limit_in_bytes(upper_mem_bound, mem_limit)) {
      return false;
    }
    result = mem_limit;
    return true;
  }
  if (swap_limit_val == value_unlimited) {
    log_trace(os, container)("Memory and Swap Limit is: Unlimited");
    result = swap_limit_val;
    return true;
  }
  log_trace(os, container)("Swap Limit is: " PHYS_MEM_TYPE_FORMAT, swap_limit_val);
  physical_memory_size_type memory_limit = 0;
  if (!read_memory_limit_in_bytes(upper_mem_bound, memory_limit)) {
    return false;
  }
  assert(memory_limit != value_unlimited, "swap limit without memory limit?");
  if (memory_limit == value_unlimited) {
    // Only reachable when asserts are compiled out. value_unlimited is a
    // sentinel, not an amount of memory, so it must not take part in the sum.
    log_trace(os, container)("Memory and Swap Limit is: Unlimited");
    result = memory_limit;
    return true;
  }
  result = memory_limit + swap_limit_val;
  log_trace(os, container)("Memory and Swap Limit is: " PHYS_MEM_TYPE_FORMAT, result);
  return true;
}

// memory.swap.current : total amount of swap currently used by the cgroup and its descendants
//
// Reads that interface file of 'ctrl' into 'result'. The return statement is
// supplied by the CONTAINER_READ_NUMBER_CHECKED macro, which is defined
// outside this file.
// NOTE(review): presumably returns false on a failed read and true once
// 'result' has been set - confirm against the macro definition.
static
bool memory_swap_current_value(CgroupV2Controller* ctrl, physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED(ctrl, "/memory.swap.current", "Swap currently used", result);
}

// Report the sum of the current memory usage and the current swap usage in
// 'result'. If the swap usage cannot be read it counts as zero, so only the
// memory usage is reported. Returns false (leaving 'result' untouched) only
// when the memory usage itself cannot be read. Neither 'upper_mem_bound' nor
// 'upper_swap_bound' is used by this implementation.
bool CgroupV2MemoryController::memory_and_swap_usage_in_bytes(physical_memory_size_type upper_mem_bound,
                                                              physical_memory_size_type upper_swap_bound,
                                                              physical_memory_size_type& result) {
  physical_memory_size_type mem_used = 0;
  if (!memory_usage_in_bytes(mem_used)) {
    return false;
  }
  physical_memory_size_type swap_used = 0;
  const bool have_swap_usage = memory_swap_current_value(reader(), swap_used);
  // A failed swap read is treated as no swap usage.
  result = have_swap_usage ? mem_used + swap_used : mem_used;
  return true;
}

// Read the memory limit from the "/memory.max" interface file of 'ctrl' into
// 'result'. The return statement is supplied by the
// CONTAINER_READ_NUMBER_CHECKED_MAX macro, which is defined outside this file.
// NOTE(review): presumably the _MAX variant maps the literal 'max' to
// value_unlimited and returns false on a failed read - confirm against the
// macro definition.
static
bool memory_limit_value(CgroupV2Controller* ctrl, physical_memory_size_type& result) {
  CONTAINER_READ_NUMBER_CHECKED_MAX(ctrl, "/memory.max", "Memory Limit", result);
}

/* read_memory_limit_in_bytes
 *
 * Read the memory limit of this cgroup and store it in the 'result'
 * reference. The value is passed on unchanged: it may be value_unlimited or
 * may be greater than or equal to 'upper_bound'. 'upper_bound' is only used
 * for trace logging.
 *
 * return:
 *    true when the limit could be read correctly.
 *    false in case of any error ('result' is left untouched).
 */
bool CgroupV2MemoryController::read_memory_limit_in_bytes(physical_memory_size_type upper_bound,
                                                          physical_memory_size_type& result) {
  physical_memory_size_type mem_limit = 0;
  if (!memory_limit_value(reader(), mem_limit)) {
    log_trace(os, container)("container memory limit failed, using host value " PHYS_MEM_TYPE_FORMAT,
                              upper_bound);
    return false;
  }
  if (log_is_enabled(Trace, os, container)) {
    if (mem_limit == value_unlimited) {
      log_trace(os, container)("Memory Limit is: Unlimited");
      log_trace(os, container)("container memory limit unlimited, using upper bound value " PHYS_MEM_TYPE_FORMAT, upper_bound);
    } else {
      log_trace(os, container)("Memory Limit is: " PHYS_MEM_TYPE_FORMAT, mem_limit);
      if (mem_limit >= upper_bound) {
        // A limit at or above the upper bound is only flagged in the log.
        log_trace(os, container)("container memory limit ignored: " PHYS_MEM_TYPE_FORMAT ", upper bound is " PHYS_MEM_TYPE_FORMAT,
                                 mem_limit, upper_bound);
      }
    }
  }
  result = mem_limit;
  return true;
}

// Read the swap limit from the "/memory.swap.max" interface file of 'ctrl'
// into 'value'. The return statement is supplied by the
// CONTAINER_READ_NUMBER_CHECKED_MAX macro, which is defined outside this file.
// NOTE(review): presumably the _MAX variant maps the literal 'max' to
// value_unlimited and returns false on a failed read - confirm against the
// macro definition.
static
bool memory_swap_limit_value(CgroupV2Controller* ctrl, physical_memory_size_type& value) {
  CONTAINER_READ_NUMBER_CHECKED_MAX(ctrl, "/memory.swap.max", "Swap Limit", value);
}

void CgroupV2Controller::set_subsystem_path(const char* cgroup_path) {
  if (_cgroup_path != nullptr) {
    os::free(_cgroup_path);
  }
  _cgroup_path = os::strdup(cgroup_path);
  if (_path != nullptr) {
    os::free(_path);
  }
  _path = construct_path(_mount_point, cgroup_path);
}

// For cgv2 a hierarchy walk is only needed when the cgroup path is something
// other than the root path "/".
bool CgroupV2Controller::needs_hierarchy_adjustment() {
  const bool is_root = (strcmp(_cgroup_path, "/") == 0);
  return !is_root;
}

// Print the cgroup v2 specific swap metrics to 'st': the swap currently in
// use ("memory_swap_current") followed by the swap limit
// ("memory_swap_max_limit"). A metric only gets a value set when the
// corresponding read succeeded. 'upper_mem_bound' is not used here.
void CgroupV2MemoryController::print_version_specific_info(outputStream* st, physical_memory_size_type upper_mem_bound) {
  MetricResult swap_usage;
  physical_memory_size_type swap_usage_bytes = 0;
  const bool have_usage = memory_swap_current_value(reader(), swap_usage_bytes);
  if (have_usage) {
    swap_usage.set_value(swap_usage_bytes);
  }

  MetricResult swap_max;
  physical_memory_size_type swap_max_bytes = 0;
  const bool have_max = memory_swap_limit_value(reader(), swap_max_bytes);
  if (have_max) {
    swap_max.set_value(swap_max_bytes);
  }

  OSContainer::print_container_helper(st, swap_usage, "memory_swap_current");
  OSContainer::print_container_helper(st, swap_max, "memory_swap_max_limit");
}

char* CgroupV2Controller::construct_path(char* mount_path, const char* cgroup_path) {
  stringStream ss;
  ss.print_raw(mount_path);
  if (strcmp(cgroup_path, "/") != 0) {
    ss.print_raw(cgroup_path);
  }
  return os::strdup(ss.base());
}

/* pids_max
 *
 * Read the maximum number of tasks available to the process from the
 * "/pids.max" interface file of the unified controller. Set the value in
 * the passed in 'value' reference. The value might be 'value_unlimited' when
 * there is no limit.
 *
 * The return statement is supplied by the CONTAINER_READ_NUMBER_CHECKED_MAX
 * macro, which is defined outside this file.
 *
 * return:
 *    true if the value has been set appropriately
 *    false if there was an error
 */
bool CgroupV2Subsystem::pids_max(uint64_t& value) {
  CONTAINER_READ_NUMBER_CHECKED_MAX(unified(), "/pids.max", "Maximum number of tasks", value);
}

/* pids_current
 *
 * Read the number of tasks currently in the cgroup (and its descendants) of
 * the process from the "/pids.current" interface file of the unified
 * controller. Set in the passed in 'value' reference.
 *
 * The return statement is supplied by the CONTAINER_READ_NUMBER_CHECKED
 * macro, which is defined outside this file.
 *
 * return:
 *    true on success
 *    false when there was an error
 */
bool CgroupV2Subsystem::pids_current(uint64_t& value) {
  CONTAINER_READ_NUMBER_CHECKED(unified(), "/pids.current", "Current number of tasks", value);
}