summary | refs | log | tree | commit | diff
path: root/robusta_krr/strategies
diff options
context:
space:
mode:
author: Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com> 2024-04-29 17:08:26 +0300
committer: GitHub <noreply@github.com> 2024-04-29 17:08:26 +0300
commit: dd13deeac4ca62f8187d77d944305675abc9dba8 (patch)
tree: 8aeda833042b293519bce28896ace24144aadbf2 /robusta_krr/strategies
parent: 4ce0e9ac47ea60b5d03eda159e3c76b739b14b64 (diff)
Simple strategy with OOMKill data (#257)
* Simple strategy with OOMKill data
* Merge oom into simple, minor other improvements
* Fix join bug in MaxOOMKilledMemoryLoader
* Minor improvements
* Colored oom message, comments improvements
* One more comment improvement
Diffstat (limited to 'robusta_krr/strategies')
-rw-r--r-- robusta_krr/strategies/__init__.py | 2
-rw-r--r-- robusta_krr/strategies/simple.py | 72
2 files changed, 65 insertions, 9 deletions
diff --git a/robusta_krr/strategies/__init__.py b/robusta_krr/strategies/__init__.py
index 05e029b..3409ebd 100644
--- a/robusta_krr/strategies/__init__.py
+++ b/robusta_krr/strategies/__init__.py
@@ -1 +1 @@
-from .simple import SimpleStrategy
+from .simple import SimpleStrategy
\ No newline at end of file
diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py
index 64e58ac..3cecd18 100644
--- a/robusta_krr/strategies/simple.py
+++ b/robusta_krr/strategies/simple.py
@@ -19,8 +19,10 @@ from robusta_krr.core.integrations.prometheus.metrics import (
MemoryAmountLoader,
PercentileCPULoader,
PrometheusMetric,
+ MaxOOMKilledMemoryLoader,
)
+
class SimpleStrategySettings(StrategySettings):
cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.")
memory_buffer_percentage: float = pd.Field(
@@ -29,14 +31,27 @@ class SimpleStrategySettings(StrategySettings):
points_required: int = pd.Field(
100, ge=1, description="The number of data points required to make a recommendation for a resource."
)
- allow_hpa: bool = pd.Field(False, description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.")
+ allow_hpa: bool = pd.Field(
+ False,
+ description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.",
+ )
+ use_oomkill_data: bool = pd.Field(
+ False,
+ description="Whether to bump the memory when OOMKills are detected (experimental).",
+ )
+ oom_memory_buffer_percentage: float = pd.Field(
+ 25, gt=0, description="What percentage to increase the memory when there are OOMKill events."
+ )
- def calculate_memory_proposal(self, data: PodsTimeData) -> float:
+ def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float:
data_ = [np.max(values[:, 1]) for values in data.values()]
if len(data_) == 0:
return float("NaN")
- return np.max(data_) * (1 + self.memory_buffer_percentage / 100)
+ return max(
+ np.max(data_) * (1 + self.memory_buffer_percentage / 100),
+ max_oomkill * (1 + self.oom_memory_buffer_percentage / 100),
+ )
def calculate_cpu_proposal(self, data: PodsTimeData) -> float:
if len(data) == 0:
@@ -75,7 +90,17 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
@property
def metrics(self) -> list[type[PrometheusMetric]]:
- return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader]
+ metrics = [
+ PercentileCPULoader(self.settings.cpu_percentile),
+ MaxMemoryLoader,
+ CPUAmountLoader,
+ MemoryAmountLoader,
+ ]
+
+ if self.settings.use_oomkill_data:
+ metrics.append(MaxOOMKilledMemoryLoader)
+
+ return metrics
def __calculate_cpu_proposal(
self, history_data: MetricsPodData, object_data: K8sObjectData
@@ -85,13 +110,20 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
if len(data) == 0:
return ResourceRecommendation.undefined(info="No data")
+ # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
+ # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
+ # So each pod is string with pod name, and values is numpy array of shape (N, 2)
data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()}
total_points_count = sum(data_count.values())
if total_points_count < self.settings.points_required:
return ResourceRecommendation.undefined(info="Not enough data")
- if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None and not self.settings.allow_hpa:
+ if (
+ object_data.hpa is not None
+ and object_data.hpa.target_cpu_utilization_percentage is not None
+ and not self.settings.allow_hpa
+ ):
return ResourceRecommendation.undefined(info="HPA detected")
cpu_usage = self.settings.calculate_cpu_proposal(data)
@@ -102,20 +134,44 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
) -> ResourceRecommendation:
data = history_data["MaxMemoryLoader"]
+ oomkill_detected = False
+
+ if self.settings.use_oomkill_data:
+ max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"]
+ # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
+ # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value
+ # So each value is numpy array of shape (N, 2)
+ max_oomkill_value = (
+ np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0
+ )
+ if max_oomkill_value != 0:
+ oomkill_detected = True
+ else:
+ max_oomkill_value = 0
+
if len(data) == 0:
return ResourceRecommendation.undefined(info="No data")
+ # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
+ # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
+ # So each pod is string with pod name, and values is numpy array of shape (N, 2)
data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()}
total_points_count = sum(data_count.values())
if total_points_count < self.settings.points_required:
return ResourceRecommendation.undefined(info="Not enough data")
- if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None and not self.settings.allow_hpa:
+ if (
+ object_data.hpa is not None
+ and object_data.hpa.target_memory_utilization_percentage is not None
+ and not self.settings.allow_hpa
+ ):
return ResourceRecommendation.undefined(info="HPA detected")
- memory_usage = self.settings.calculate_memory_proposal(data)
- return ResourceRecommendation(request=memory_usage, limit=memory_usage)
+ memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value)
+ return ResourceRecommendation(
+ request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None
+ )
def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
return {