From d9e558110b8bf118f3e8bf107969a1e76c19f09f Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Thu, 30 May 2019 13:12:04 -0600 Subject: [PATCH 1/6] Return 0 for negative "free" and "total" memory reported by the OS We've had a situation where the MX bean reported negative values for the free memory of the OS, in those rare cases we want to return a value of 0 rather than blowing up later down the pipeline. In the event that there is a serialization or creation error with regard to memory use, this adds asserts so the failure will occur as soon as possible and give us a better location for investigation. Resolves #42157 --- .../java/org/elasticsearch/monitor/os/OsProbe.java | 14 ++++++++++++-- .../java/org/elasticsearch/monitor/os/OsStats.java | 4 ++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 18173dd275a46..3050ed51c9f0d 100644 --- a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -70,7 +70,12 @@ public long getFreePhysicalMemorySize() { return -1; } try { - return (long) getFreePhysicalMemorySize.invoke(osMxBean); + final long freeMem = (long) getFreePhysicalMemorySize.invoke(osMxBean); + if (freeMem < 0) { + logger.warn("OS reported a negative free memory value [{}]", freeMem); + return 0; + } + return freeMem; } catch (Exception e) { return -1; } @@ -84,7 +89,12 @@ public long getTotalPhysicalMemorySize() { return -1; } try { - return (long) getTotalPhysicalMemorySize.invoke(osMxBean); + final long totalMem = (long) getTotalPhysicalMemorySize.invoke(osMxBean); + if (totalMem < 0) { + logger.warn("OS reported a negative total memory value [{}]", totalMem); + return 0; + } + return totalMem; } catch (Exception e) { return -1; } diff --git a/server/src/main/java/org/elasticsearch/monitor/os/OsStats.java b/server/src/main/java/org/elasticsearch/monitor/os/OsStats.java index 86047281a22fb..9dbd9e4365a53 100644 --- a/server/src/main/java/org/elasticsearch/monitor/os/OsStats.java +++ b/server/src/main/java/org/elasticsearch/monitor/os/OsStats.java @@ -228,13 +228,17 @@ public static class Mem implements Writeable, ToXContentFragment { private final long free; public Mem(long total, long free) { + assert total >= 0 : "expected total memory to be positive, got: " + total; + assert free >= 0 : "expected free memory to be positive, got: " + total; this.total = total; this.free = free; } public Mem(StreamInput in) throws IOException { this.total = in.readLong(); + assert total >= 0 : "expected total memory to be positive, got: " + total; this.free = in.readLong(); + assert free >= 0 : "expected free memory to be positive, got: " + total; } @Override From 1797f2cc8f4325f8594968917ddc5d810ca3c932 Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Thu, 30 May 2019 14:02:31 -0600 Subject: [PATCH 2/6] Fix test passing in invalid memory value --- .../monitoring/collector/node/NodeStatsMonitoringDocTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsMonitoringDocTests.java b/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsMonitoringDocTests.java index 4296bedfe0da3..2906adac892b0 100644 --- a/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsMonitoringDocTests.java +++ b/x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsMonitoringDocTests.java @@ -329,7 +329,7 @@ private static NodeStats mockNodeStats() { final OsStats.Cgroup osCgroup = new OsStats.Cgroup("_cpu_acct_ctrl_group", ++iota, "_cpu_ctrl_group", ++iota, ++iota, osCpuStat, "_memory_ctrl_group", "2000000000", "1000000000"); - final OsStats.Mem osMem = new OsStats.Mem(no, no); + final OsStats.Mem osMem = new OsStats.Mem(0, 0); final OsStats.Swap osSwap = new OsStats.Swap(no, no); final OsStats os = new OsStats(no, osCpu, osMem, osSwap, osCgroup); From 2c86349f2cbeaa80179da85249a2a1f1b5b57c0e Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Thu, 30 May 2019 15:01:18 -0600 Subject: [PATCH 3/6] Fix another test passing in invalid memory value --- .../java/org/elasticsearch/xpack/ml/MachineLearningTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MachineLearningTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MachineLearningTests.java index 9504cbe7a7011..e83fcdb5af6fb 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MachineLearningTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MachineLearningTests.java @@ -97,8 +97,8 @@ public void testNoAttributes_givenClash() { public void testMachineMemory_givenStatsFailure() throws IOException { OsStats stats = mock(OsStats.class); - when(stats.getMem()).thenReturn(new OsStats.Mem(-1, -1)); - assertEquals(-1L, MachineLearning.machineMemoryFromStats(stats)); + when(stats.getMem()).thenReturn(new OsStats.Mem(0, 0)); + assertEquals(0L, MachineLearning.machineMemoryFromStats(stats)); } public void testMachineMemory_givenNoCgroup() throws IOException { From 04b27ea227dbdb4080c388f8e6a4217bbf52f85c Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Fri, 14 Jun 2019 14:05:30 -0600 Subject: [PATCH 4/6] Also change mem check in MachineLearning.machineMemoryFromStats --- .../main/java/org/elasticsearch/xpack/ml/MachineLearning.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java index 42e945ffec8fd..92512224e58a8 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java @@ -797,8 +797,8 @@ static long machineMemoryFromStats(OsStats stats) { if (containerLimitStr != null) { BigInteger containerLimit = new BigInteger(containerLimitStr); if ((containerLimit.compareTo(BigInteger.valueOf(mem)) < 0 && containerLimit.compareTo(BigInteger.ZERO) > 0) - // mem < 0 means the value couldn't be obtained for some reason - || (mem < 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) { + // mem <= 0 means the value couldn't be obtained for some reason + || (mem <= 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) { mem = containerLimit.longValue(); } } From 2fad4254413fc6349bb192005aaf5db8ad78b5c0 Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Fri, 14 Jun 2019 16:20:57 -0600 Subject: [PATCH 5/6] Add background documentation for why we prevent negative return values --- .../org/elasticsearch/monitor/os/OsProbe.java | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 3050ed51c9f0d..36e9c9666af22 100644 --- a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -42,6 +42,23 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; +/** + * The {@link OsProbe} class retrieves information about the physical and swap size of the machine + * memory, as well as the system load average and cpu load. + * + * In some exceptional cases, it's possible the underlying native method used by + * {@link #getFreePhysicalMemorySize()} and {@link #getTotalPhysicalMemorySize()} can return a + * negative value. Because of this, we prevent those methods from returning negative values, + * returning 0 instead. + * + * The OS can report a negative number in a number of cases: + * - Non-supported OSes (HP-UX, AIX, OSX after failing to initialize host statistics + * - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call + * - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE} + * - An error case retrieving these values from a linux kernel + * - A non-standard libc implementation not implementing the required values + * For a more exhaustive explanation, see https://github.com/elastic/elasticsearch/pull/42725 + */ public class OsProbe { private static final OperatingSystemMXBean osMxBean = ManagementFactory.getOperatingSystemMXBean(); @@ -67,7 +84,8 @@ public class OsProbe { */ public long getFreePhysicalMemorySize() { if (getFreePhysicalMemorySize == null) { - return -1; + logger.warn("getFreePhysicalMemorySize is not available"); + return 0; } try { final long freeMem = (long) getFreePhysicalMemorySize.invoke(osMxBean); @@ -77,7 +95,8 @@ public long getFreePhysicalMemorySize() { } return freeMem; } catch (Exception e) { - return -1; + logger.warn("exception retrieving free physical memory", e); + return 0; } } @@ -86,7 +105,8 @@ public long getFreePhysicalMemorySize() { */ public long getTotalPhysicalMemorySize() { if (getTotalPhysicalMemorySize == null) { - return -1; + logger.warn("getTotalPhysicalMemorySize is not available"); + return 0; } try { final long totalMem = (long) getTotalPhysicalMemorySize.invoke(osMxBean); @@ -96,7 +116,8 @@ public long getTotalPhysicalMemorySize() { } return totalMem; } catch (Exception e) { - return -1; + logger.warn("exception retrieving total physical memory", e); + return 0; } } From 6e87a2d16b103c76167364d3c50657d1fbd5944b Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Mon, 17 Jun 2019 08:43:54 -0600 Subject: [PATCH 6/6] Clarify comment a bit more --- server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 36e9c9666af22..320bc15fda1f4 100644 --- a/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -52,7 +52,8 @@ * returning 0 instead. * * The OS can report a negative number in a number of cases: - * - Non-supported OSes (HP-UX, AIX, OSX after failing to initialize host statistics + * - Non-supported OSes (HP-UX, or AIX) + * - A failure of macOS to initialize host statistics * - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call * - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE} * - An error case retrieving these values from a linux kernel