Skip to content

Commit 087747b

Browse files
Include direct memory and non-heap memory in ML memory calculations (take #2) (elastic#128742)
* Include direct memory and non-heap memory in ML memory calculations.
* Reduce ML_ONLY heap size, so that direct memory is accounted for.
* [CI] Auto commit changes from spotless
* changelog
* improve docs
* Reuse direct memory to heap factor
* feature flag
---------
Co-authored-by: elasticsearchmachine <[email protected]>
1 parent a60f4ef commit 087747b

File tree

6 files changed

+58
-24
lines changed

6 files changed

+58
-24
lines changed

distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
*/
2929
final class JvmErgonomics {
3030

31+
static final double DIRECT_MEMORY_TO_HEAP_FACTOR = 0.5;
32+
3133
private JvmErgonomics() {
3234
throw new AssertionError("No instances intended");
3335
}
@@ -44,7 +46,7 @@ static List<String> choose(final List<String> userDefinedJvmOptions, Settings no
4446
final long heapSize = JvmOption.extractMaxHeapSize(finalJvmOptions);
4547
final long maxDirectMemorySize = JvmOption.extractMaxDirectMemorySize(finalJvmOptions);
4648
if (maxDirectMemorySize == 0) {
47-
ergonomicChoices.add("-XX:MaxDirectMemorySize=" + heapSize / 2);
49+
ergonomicChoices.add("-XX:MaxDirectMemorySize=" + (long) (DIRECT_MEMORY_TO_HEAP_FACTOR * heapSize));
4850
}
4951

5052
final boolean tuneG1GCForSmallHeap = tuneG1GCForSmallHeap(heapSize);

distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
1313
import org.elasticsearch.common.settings.Settings;
14+
import org.elasticsearch.common.util.FeatureFlag;
1415
import org.elasticsearch.node.NodeRoleSettings;
1516

1617
import java.io.IOException;
@@ -37,6 +38,8 @@ public class MachineDependentHeap {
3738
protected static final long MAX_HEAP_SIZE = GB * 31; // 31GB
3839
protected static final long MIN_HEAP_SIZE = 1024 * 1024 * 128; // 128MB
3940

41+
private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");
42+
4043
public MachineDependentHeap() {}
4144

4245
/**
@@ -76,12 +79,16 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
7679
/*
7780
* Machine learning only node.
7881
*
79-
* <p>Heap is computed as:
80-
* <ul>
81-
* <li>40% of total system memory when total system memory 16 gigabytes or less.</li>
82-
* <li>40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.</li>
83-
* <li>The absolute maximum heap size is 31 gigabytes.</li>
84-
* </ul>
82+
* The memory reserved for Java is computed as:
83+
* - 40% of total system memory when total system memory is 16 gigabytes or less.
84+
* - 40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.
85+
* - The absolute maximum heap size is 31 gigabytes.
86+
*
87+
* This Java memory is divided as follows:
88+
* - 2/3 of the Java memory is reserved for the Java heap.
89+
* - 1/3 of the Java memory is reserved for the Java direct memory.
90+
*
91+
* The direct memory being half of the heap is set by the JvmErgonomics class.
8592
*
8693
* In all cases the result is rounded down to the next whole multiple of 4 megabytes.
8794
* The reason for doing this is that Java will round requested heap sizes to a multiple
@@ -95,13 +102,22 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av
95102
*
96103
* If this formula is changed then corresponding changes must be made to the {@code NativeMemoryCalculator} and
97104
* {@code MlAutoscalingDeciderServiceTests} classes in the ML plugin code. Failure to keep the logic synchronized
98-
* could result in repeated autoscaling up and down.
105+
* could result in ML processes crashing with OOM errors or repeated autoscaling up and down.
99106
*/
100107
case ML_ONLY -> {
101-
if (availableMemory <= (GB * 16)) {
102-
yield mb((long) (availableMemory * .4), 4);
108+
double heapFractionBelow16GB = 0.4;
109+
double heapFractionAbove16GB = 0.1;
110+
if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
111+
heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
112+
heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR);
113+
}
114+
if (availableMemory <= GB * 16) {
115+
yield mb((long) (availableMemory * heapFractionBelow16GB), 4);
103116
} else {
104-
yield mb((long) min((GB * 16) * .4 + (availableMemory - GB * 16) * .1, MAX_HEAP_SIZE), 4);
117+
yield mb(
118+
(long) min(GB * 16 * heapFractionBelow16GB + (availableMemory - GB * 16) * heapFractionAbove16GB, MAX_HEAP_SIZE),
119+
4
120+
);
105121
}
106122
}
107123
/*

distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,13 @@ public void testMasterOnlyOptions() throws Exception {
5656
}
5757

5858
public void testMlOnlyOptions() throws Exception {
59-
assertHeapOptions(1, containsInAnyOrder("-Xmx408m", "-Xms408m"), "ml");
60-
assertHeapOptions(4, containsInAnyOrder("-Xmx1636m", "-Xms1636m"), "ml");
61-
assertHeapOptions(32, containsInAnyOrder("-Xmx8192m", "-Xms8192m"), "ml");
62-
assertHeapOptions(64, containsInAnyOrder("-Xmx11468m", "-Xms11468m"), "ml");
59+
assertHeapOptions(1, containsInAnyOrder("-Xmx272m", "-Xms272m"), "ml");
60+
assertHeapOptions(4, containsInAnyOrder("-Xmx1092m", "-Xms1092m"), "ml");
61+
assertHeapOptions(32, containsInAnyOrder("-Xmx5460m", "-Xms5460m"), "ml");
62+
assertHeapOptions(64, containsInAnyOrder("-Xmx7644m", "-Xms7644m"), "ml");
6363
// We'd never see a node this big in Cloud. Before direct memory was accounted for, this size hit the 31GB absolute
6464
// maximum (because 0.4 * 16 + 0.1 * (263 - 16) > 31); with both fractions divided by 1.5 the heap now stays below that cap
65-
assertHeapOptions(263, containsInAnyOrder("-Xmx31744m", "-Xms31744m"), "ml");
65+
assertHeapOptions(263, containsInAnyOrder("-Xmx21228m", "-Xms21228m"), "ml");
6666
}
6767

6868
public void testDataNodeOptions() throws Exception {

docs/changelog/128742.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 128742
2+
summary: "Account for Java direct memory on machine learning nodes to prevent out-of-memory crashes."
3+
area: Machine Learning
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,7 @@ public class JvmInfo implements ReportingService.Info {
4343
long nonHeapInit = memoryMXBean.getNonHeapMemoryUsage().getInit() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getInit();
4444
long nonHeapMax = memoryMXBean.getNonHeapMemoryUsage().getMax() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getMax();
4545
long directMemoryMax = 0;
46-
try {
47-
Class<?> vmClass = Class.forName("sun.misc.VM");
48-
directMemoryMax = (Long) vmClass.getMethod("maxDirectMemory").invoke(null);
49-
} catch (Exception t) {
50-
// ignore
51-
}
5246
String[] inputArguments = runtimeMXBean.getInputArguments().toArray(new String[runtimeMXBean.getInputArguments().size()]);
53-
Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);
5447

5548
String bootClassPath;
5649
try {
@@ -130,6 +123,11 @@ public class JvmInfo implements ReportingService.Info {
130123
configuredMaxHeapSize = Long.parseLong((String) valueMethod.invoke(maxHeapSizeVmOptionObject));
131124
} catch (Exception ignored) {}
132125

126+
try {
127+
Object maxDirectMemorySizeVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "MaxDirectMemorySize");
128+
directMemoryMax = Long.parseLong((String) valueMethod.invoke(maxDirectMemorySizeVmOptionObject));
129+
} catch (Exception ignored) {}
130+
133131
try {
134132
Object useSerialGCVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "UseSerialGC");
135133
useSerialGC = (String) valueMethod.invoke(useSerialGCVmOptionObject);
@@ -139,6 +137,8 @@ public class JvmInfo implements ReportingService.Info {
139137

140138
}
141139

140+
Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax);
141+
142142
INSTANCE = new JvmInfo(
143143
ProcessHandle.current().pid(),
144144
System.getProperty("java.version"),
@@ -496,5 +496,8 @@ public ByteSizeValue getHeapMax() {
496496
return ByteSizeValue.ofBytes(heapMax);
497497
}
498498

499+
public ByteSizeValue getTotalMax() {
500+
return ByteSizeValue.ofBytes(heapMax + nonHeapMax + directMemoryMax);
501+
}
499502
}
500503
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import org.elasticsearch.common.settings.SettingsModule;
4242
import org.elasticsearch.common.unit.ByteSizeValue;
4343
import org.elasticsearch.common.unit.Processors;
44+
import org.elasticsearch.common.util.FeatureFlag;
4445
import org.elasticsearch.common.util.concurrent.EsExecutors;
4546
import org.elasticsearch.core.TimeValue;
4647
import org.elasticsearch.env.Environment;
@@ -557,6 +558,8 @@ public class MachineLearning extends Plugin
557558
License.OperationMode.PLATINUM
558559
);
559560

561+
private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation");
562+
560563
@Override
561564
public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
562565
if (this.enabled == false) {
@@ -874,7 +877,12 @@ public Settings additionalSettings() {
874877
machineMemoryAttrName,
875878
Long.toString(OsProbe.getInstance().osStats().getMem().getAdjustedTotal().getBytes())
876879
);
877-
addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(Runtime.getRuntime().maxMemory()));
880+
881+
long jvmSize = Runtime.getRuntime().maxMemory();
882+
if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) {
883+
jvmSize = JvmInfo.jvmInfo().getMem().getTotalMax().getBytes();
884+
}
885+
addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(jvmSize));
878886
addMlNodeAttribute(
879887
additionalSettings,
880888
deprecatedAllocatedProcessorsAttrName,

0 commit comments

Comments
 (0)