diff options
| author | Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com> | 2024-04-30 19:06:27 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-04-30 19:06:27 +0300 |
| commit | 09c372bfa7039c22eab9e411040585ec675caf6d (patch) | |
| tree | c6428b6c1ed3b6b2139ede20ff704b42a8b23fb3 /robusta_krr/strategies | |
| parent | f7d841278afe46fe3b21da64ca57691b44670ab4 (diff) | |
| parent | dd13deeac4ca62f8187d77d944305675abc9dba8 (diff) | |
Merge branch 'main' into prometheus-workload-loader
Diffstat (limited to 'robusta_krr/strategies')
| -rw-r--r-- | robusta_krr/strategies/__init__.py | 2 | ||||
| -rw-r--r-- | robusta_krr/strategies/simple.py | 72 |
2 files changed, 65 insertions, 9 deletions
diff --git a/robusta_krr/strategies/__init__.py b/robusta_krr/strategies/__init__.py index 05e029b..3409ebd 100644 --- a/robusta_krr/strategies/__init__.py +++ b/robusta_krr/strategies/__init__.py @@ -1 +1 @@ -from .simple import SimpleStrategy +from .simple import SimpleStrategy
\ No newline at end of file diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index db15f6c..498401f 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -19,8 +19,10 @@ from robusta_krr.core.integrations.prometheus.metrics import ( MemoryAmountLoader, PercentileCPULoader, PrometheusMetric, + MaxOOMKilledMemoryLoader, ) + class SimpleStrategySettings(StrategySettings): cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.") memory_buffer_percentage: float = pd.Field( @@ -29,14 +31,27 @@ class SimpleStrategySettings(StrategySettings): points_required: int = pd.Field( 100, ge=1, description="The number of data points required to make a recommendation for a resource." ) - allow_hpa: bool = pd.Field(False, description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.") + allow_hpa: bool = pd.Field( + False, + description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.", + ) + use_oomkill_data: bool = pd.Field( + False, + description="Whether to bump the memory when OOMKills are detected (experimental).", + ) + oom_memory_buffer_percentage: float = pd.Field( + 25, gt=0, description="What percentage to increase the memory when there are OOMKill events." + ) - def calculate_memory_proposal(self, data: PodsTimeData) -> float: + def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float: data_ = [np.max(values[:, 1]) for values in data.values()] if len(data_) == 0: return float("NaN") - return np.max(data_) * (1 + self.memory_buffer_percentage / 100) + return max( + np.max(data_) * (1 + self.memory_buffer_percentage / 100), + max_oomkill * (1 + self.oom_memory_buffer_percentage / 100), + ) def calculate_cpu_proposal(self, data: PodsTimeData) -> float: if len(data) == 0: @@ -75,7 +90,17 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): @property def metrics(self) -> list[type[PrometheusMetric]]: - return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader] + metrics = [ + PercentileCPULoader(self.settings.cpu_percentile), + MaxMemoryLoader, + CPUAmountLoader, + MemoryAmountLoader, + ] + + if self.settings.use_oomkill_data: + metrics.append(MaxOOMKilledMemoryLoader) + + return metrics def __calculate_cpu_proposal( self, history_data: MetricsPodData, object_data: K8sWorkload @@ -85,13 +110,20 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] + # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value + # So each pod is string with pod name, and values is numpy array of shape (N, 2) data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} total_points_count = sum(data_count.values()) if total_points_count < self.settings.points_required: return ResourceRecommendation.undefined(info="Not enough data") - if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None and not self.settings.allow_hpa: + if ( + object_data.hpa is not None + and object_data.hpa.target_cpu_utilization_percentage is not None + and not self.settings.allow_hpa + ): return ResourceRecommendation.undefined(info="HPA detected") cpu_usage = self.settings.calculate_cpu_proposal(data) @@ -102,20 +134,44 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): ) -> ResourceRecommendation: data = history_data["MaxMemoryLoader"] + oomkill_detected = False + + if self.settings.use_oomkill_data: + max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"] + # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] + # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value + # So each value is numpy array of shape (N, 2) + max_oomkill_value = ( + np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0 + ) + if max_oomkill_value != 0: + oomkill_detected = True + else: + max_oomkill_value = 0 + if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] + # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value + # So each pod is string with pod name, and values is numpy array of shape (N, 2) data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} total_points_count = sum(data_count.values()) if total_points_count < self.settings.points_required: return ResourceRecommendation.undefined(info="Not enough data") - if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None and not self.settings.allow_hpa: + if ( + object_data.hpa is not None + and object_data.hpa.target_memory_utilization_percentage is not None + and not self.settings.allow_hpa + ): return ResourceRecommendation.undefined(info="HPA detected") - memory_usage = self.settings.calculate_memory_proposal(data) - return ResourceRecommendation(request=memory_usage, limit=memory_usage) + memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value) + return ResourceRecommendation( + request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None + ) def run(self, history_data: MetricsPodData, object_data: K8sWorkload) -> RunResult: return { |
