diff options
| author | Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com> | 2024-04-29 17:08:26 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-04-29 17:08:26 +0300 |
| commit | dd13deeac4ca62f8187d77d944305675abc9dba8 (patch) | |
| tree | 8aeda833042b293519bce28896ace24144aadbf2 /robusta_krr/strategies | |
| parent | 4ce0e9ac47ea60b5d03eda159e3c76b739b14b64 (diff) | |
Simple strategy with OOMKill data (#257)
* Simple strategy with OOMKill data
* Merge oom into simple, minor other improvements
* Fix join bug in MaxOOMKilledMemoryLoader
* Minor improvements
* Colored oom message, comments improvements
* One more comment improvement
Diffstat (limited to 'robusta_krr/strategies')
| -rw-r--r-- | robusta_krr/strategies/__init__.py | 2 |
| -rw-r--r-- | robusta_krr/strategies/simple.py | 72 |
2 files changed, 65 insertions, 9 deletions
diff --git a/robusta_krr/strategies/__init__.py b/robusta_krr/strategies/__init__.py index 05e029b..3409ebd 100644 --- a/robusta_krr/strategies/__init__.py +++ b/robusta_krr/strategies/__init__.py @@ -1 +1 @@ -from .simple import SimpleStrategy +from .simple import SimpleStrategy
\ No newline at end of file diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index 64e58ac..3cecd18 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -19,8 +19,10 @@ from robusta_krr.core.integrations.prometheus.metrics import ( MemoryAmountLoader, PercentileCPULoader, PrometheusMetric, + MaxOOMKilledMemoryLoader, ) + class SimpleStrategySettings(StrategySettings): cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.") memory_buffer_percentage: float = pd.Field( @@ -29,14 +31,27 @@ class SimpleStrategySettings(StrategySettings): points_required: int = pd.Field( 100, ge=1, description="The number of data points required to make a recommendation for a resource." ) - allow_hpa: bool = pd.Field(False, description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.") + allow_hpa: bool = pd.Field( + False, + description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.", + ) + use_oomkill_data: bool = pd.Field( + False, + description="Whether to bump the memory when OOMKills are detected (experimental).", + ) + oom_memory_buffer_percentage: float = pd.Field( + 25, gt=0, description="What percentage to increase the memory when there are OOMKill events." 
+ ) - def calculate_memory_proposal(self, data: PodsTimeData) -> float: + def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float: data_ = [np.max(values[:, 1]) for values in data.values()] if len(data_) == 0: return float("NaN") - return np.max(data_) * (1 + self.memory_buffer_percentage / 100) + return max( + np.max(data_) * (1 + self.memory_buffer_percentage / 100), + max_oomkill * (1 + self.oom_memory_buffer_percentage / 100), + ) def calculate_cpu_proposal(self, data: PodsTimeData) -> float: if len(data) == 0: @@ -75,7 +90,17 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): @property def metrics(self) -> list[type[PrometheusMetric]]: - return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader] + metrics = [ + PercentileCPULoader(self.settings.cpu_percentile), + MaxMemoryLoader, + CPUAmountLoader, + MemoryAmountLoader, + ] + + if self.settings.use_oomkill_data: + metrics.append(MaxOOMKilledMemoryLoader) + + return metrics def __calculate_cpu_proposal( self, history_data: MetricsPodData, object_data: K8sObjectData @@ -85,13 +110,20 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] + # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value + # So each pod is string with pod name, and values is numpy array of shape (N, 2) data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} total_points_count = sum(data_count.values()) if total_points_count < self.settings.points_required: return ResourceRecommendation.undefined(info="Not enough data") - if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None and not self.settings.allow_hpa: + if ( + object_data.hpa is not None + and 
object_data.hpa.target_cpu_utilization_percentage is not None + and not self.settings.allow_hpa + ): return ResourceRecommendation.undefined(info="HPA detected") cpu_usage = self.settings.calculate_cpu_proposal(data) @@ -102,20 +134,44 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): ) -> ResourceRecommendation: data = history_data["MaxMemoryLoader"] + oomkill_detected = False + + if self.settings.use_oomkill_data: + max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"] + # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] + # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value + # So each value is numpy array of shape (N, 2) + max_oomkill_value = ( + np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0 + ) + if max_oomkill_value != 0: + oomkill_detected = True + else: + max_oomkill_value = 0 + if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] + # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value + # So each pod is string with pod name, and values is numpy array of shape (N, 2) data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} total_points_count = sum(data_count.values()) if total_points_count < self.settings.points_required: return ResourceRecommendation.undefined(info="Not enough data") - if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None and not self.settings.allow_hpa: + if ( + object_data.hpa is not None + and object_data.hpa.target_memory_utilization_percentage is not None + and not self.settings.allow_hpa + ): return ResourceRecommendation.undefined(info="HPA detected") - memory_usage = self.settings.calculate_memory_proposal(data) - return 
ResourceRecommendation(request=memory_usage, limit=memory_usage) + memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value) + return ResourceRecommendation( + request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None + ) def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: return { |
