Simple strategy with OOMKill data (#257)

* Simple strategy with OOMKill data
* Merge oom into simple, minor other improvements
* Fix join bug in MaxOOMKilledMemoryLoader
* Minor improvements
* Colored oom message, comments improvements
* One more comment improvement
author: Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com> 2024-04-29 17:08:26 +0300
committer: GitHub <noreply@github.com> 2024-04-29 17:08:26 +0300
commit: dd13deeac4ca62f8187d77d944305675abc9dba8 (patch)
tree: 8aeda833042b293519bce28896ace24144aadbf2 /robusta_krr/strategies
parent: 4ce0e9ac47ea60b5d03eda159e3c76b739b14b64 (diff)
2 files changed, 65 insertions, 9 deletions
diff --git a/robusta_krr/strategies/__init__.py b/robusta_krr/strategies/__init__.py
index 05e029b..3409ebd 100644
--- a/robusta_krr/strategies/__init__.py
+++ b/robusta_krr/strategies/__init__.py
@@ -1 +1 @@
-from .simple import SimpleStrategy
+from .simple import SimpleStrategy
\ No newline at end of file
diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py
index 64e58ac..3cecd18 100644
--- a/robusta_krr/strategies/simple.py
+++ b/robusta_krr/strategies/simple.py
@@ -19,8 +19,10 @@ from robusta_krr.core.integrations.prometheus.metrics import (
     MemoryAmountLoader,
     PercentileCPULoader,
     PrometheusMetric,
+    MaxOOMKilledMemoryLoader,
 )
 
+
 class SimpleStrategySettings(StrategySettings):
     cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.")
     memory_buffer_percentage: float = pd.Field(
@@ -29,14 +31,27 @@ class SimpleStrategySettings(StrategySettings):
     points_required: int = pd.Field(
         100, ge=1, description="The number of data points required to make a recommendation for a resource."
     )
-    allow_hpa: bool = pd.Field(False, description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.")
+    allow_hpa: bool = pd.Field(
+        False,
+        description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.",
+    )
+    use_oomkill_data: bool = pd.Field(
+        False,
+        description="Whether to bump the memory when OOMKills are detected (experimental).",
+    )
+    oom_memory_buffer_percentage: float = pd.Field(
+        25, gt=0, description="What percentage to increase the memory when there are OOMKill events."
+    )
 
-    def calculate_memory_proposal(self, data: PodsTimeData) -> float:
+    def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float:
         data_ = [np.max(values[:, 1]) for values in data.values()]
         if len(data_) == 0:
             return float("NaN")
 
-        return np.max(data_) * (1 + self.memory_buffer_percentage / 100)
+        return max(
+            np.max(data_) * (1 + self.memory_buffer_percentage / 100),
+            max_oomkill * (1 + self.oom_memory_buffer_percentage / 100),
+        )
 
     def calculate_cpu_proposal(self, data: PodsTimeData) -> float:
         if len(data) == 0:
@@ -75,7 +90,17 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
 
     @property
     def metrics(self) -> list[type[PrometheusMetric]]:
-        return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader]
+        metrics = [
+            PercentileCPULoader(self.settings.cpu_percentile),
+            MaxMemoryLoader,
+            CPUAmountLoader,
+            MemoryAmountLoader,
+        ]
+
+        if self.settings.use_oomkill_data:
+            metrics.append(MaxOOMKilledMemoryLoader)
+
+        return metrics
 
     def __calculate_cpu_proposal(
         self, history_data: MetricsPodData, object_data: K8sObjectData
@@ -85,13 +110,20 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
         if len(data) == 0:
             return ResourceRecommendation.undefined(info="No data")
 
+        # NOTE: metrics for each pod are returned as list[values], where each entry is [timestamp, value].
+        # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get that value.
+        # So each pod key is a string with the pod name, and values is a numpy array of shape (N, 2).
         data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()}
         total_points_count = sum(data_count.values())
 
         if total_points_count < self.settings.points_required:
             return ResourceRecommendation.undefined(info="Not enough data")
 
-        if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None and not self.settings.allow_hpa:
+        if (
+            object_data.hpa is not None
+            and object_data.hpa.target_cpu_utilization_percentage is not None
+            and not self.settings.allow_hpa
+        ):
             return ResourceRecommendation.undefined(info="HPA detected")
 
         cpu_usage = self.settings.calculate_cpu_proposal(data)
@@ -102,20 +134,44 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
     ) -> ResourceRecommendation:
         data = history_data["MaxMemoryLoader"]
 
+        oomkill_detected = False
+
+        if self.settings.use_oomkill_data:
+            max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"]
+            # NOTE: metrics for each pod are returned as list[values], where each entry is [timestamp, value].
+            # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get that value.
+            # So each value in the mapping is a numpy array of shape (N, 2).
+            max_oomkill_value = (
+                np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0
+            )
+            if max_oomkill_value != 0:
+                oomkill_detected = True
+        else:
+            max_oomkill_value = 0
+
         if len(data) == 0:
             return ResourceRecommendation.undefined(info="No data")
 
+        # NOTE: metrics for each pod are returned as list[values], where each entry is [timestamp, value].
+        # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get that value.
+        # So each pod key is a string with the pod name, and values is a numpy array of shape (N, 2).
         data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()}
         total_points_count = sum(data_count.values())
 
         if total_points_count < self.settings.points_required:
             return ResourceRecommendation.undefined(info="Not enough data")
 
-        if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None and not self.settings.allow_hpa:
+        if (
+            object_data.hpa is not None
+            and object_data.hpa.target_memory_utilization_percentage is not None
+            and not self.settings.allow_hpa
+        ):
             return ResourceRecommendation.undefined(info="HPA detected")
 
-        memory_usage = self.settings.calculate_memory_proposal(data)
-        return ResourceRecommendation(request=memory_usage, limit=memory_usage)
+        memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value)
+        return ResourceRecommendation(
+            request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None
+        )
 
     def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
         return {
author	Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com>	2024-04-29 17:08:26 +0300
committer	GitHub <noreply@github.com>	2024-04-29 17:08:26 +0300
commit	dd13deeac4ca62f8187d77d944305675abc9dba8 (patch)
tree	8aeda833042b293519bce28896ace24144aadbf2 /robusta_krr/strategies
parent	4ce0e9ac47ea60b5d03eda159e3c76b739b14b64 (diff)