From 118108e41710c05e1dfaf906cdebe4776227d5c7 Mon Sep 17 00:00:00 2001 From: LeaveMyYard Date: Wed, 16 Aug 2023 18:52:04 +0300 Subject: Fix CPU calculation, improve get_query --- .../integrations/prometheus/metrics/__init__.py | 4 +- .../core/integrations/prometheus/metrics/base.py | 24 +++++------ .../core/integrations/prometheus/metrics/cpu.py | 46 ++++++---------------- .../core/integrations/prometheus/metrics/memory.py | 28 ++----------- 4 files changed, 32 insertions(+), 70 deletions(-) diff --git a/robusta_krr/core/integrations/prometheus/metrics/__init__.py b/robusta_krr/core/integrations/prometheus/metrics/__init__.py index 6212a35..35d53af 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/__init__.py +++ b/robusta_krr/core/integrations/prometheus/metrics/__init__.py @@ -1,3 +1,3 @@ from .base import PrometheusMetric -from .cpu import CPULoader, MaxCPULoader, PercentileCPULoader -from .memory import MaxMemoryLoader, MemoryLoader, PercentileMemoryLoader +from .cpu import CPULoader, PercentileCPULoader +from .memory import MaxMemoryLoader, MemoryLoader diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py index 177d908..92a393c 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/base.py +++ b/robusta_krr/core/integrations/prometheus/metrics/base.py @@ -7,7 +7,7 @@ import datetime import enum from concurrent.futures import ThreadPoolExecutor from functools import reduce -from typing import TYPE_CHECKING, Any, Optional +from typing import Any, Optional import numpy as np import pydantic as pd @@ -17,9 +17,7 @@ from robusta_krr.core.abstract.strategies import PodsTimeData from robusta_krr.core.models.config import Config from robusta_krr.core.models.objects import K8sObjectData from robusta_krr.utils.configurable import Configurable - -if TYPE_CHECKING: - from .. import CustomPrometheusConnect +from prometrix import CustomPrometheusConnect class QueryType(str, enum.Enum): @@ -69,13 +67,14 @@ class PrometheusMetric(BaseMetric, Configurable): return f', {self.config.prometheus_label}="{self.config.prometheus_cluster_label}"' @abc.abstractmethod - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: """ This method should be implemented by all subclasses to provide a query string to fetch metrics. Args: object (K8sObjectData): The object for which metrics need to be fetched. - resolution (Optional[str]): a string for configurable resolution to the query. + duration (str): a string for duration of the query. + step (str): a string for the step size of the query. Returns: str: The query string. @@ -108,7 +107,7 @@ class PrometheusMetric(BaseMetric, Configurable): return value else: # regular query, lighter on preformance - results = self.prometheus.custom_query(query=data.query) + results = self.prometheus.custom_query(query=data.query, params={"time": data.start_time.timestamp()}) # format the results to return the same format as custom_query_range for result in results: result["values"] = [result.pop("value")] @@ -142,9 +141,12 @@ class PrometheusMetric(BaseMetric, Configurable): Returns: ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics. """ - resolution = f"{self._step_to_string(period)}:{self._step_to_string(step)}" - query = self.get_query(object, resolution) - end_time = datetime.datetime.now().astimezone() + + step_str = "75s" # self._step_to_string(step) + duration_str = self._step_to_string(period) + + query = self.get_query(object, duration_str, step_str) + end_time = datetime.datetime.now().replace(second=0, microsecond=0).astimezone() start_time = end_time - period result = await self.query_prometheus( @@ -152,7 +154,7 @@ class PrometheusMetric(BaseMetric, Configurable): query=query, start_time=start_time, end_time=end_time, - step=self._step_to_string(step), + step=step_str, type=self.query_type, ) ) diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu.py b/robusta_krr/core/integrations/prometheus/metrics/cpu.py index 302e09d..6dafb06 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/cpu.py +++ b/robusta_krr/core/integrations/prometheus/metrics/cpu.py @@ -4,57 +4,37 @@ from .base import BatchedRequestMixin, FilterJobsMixin, QueryMetric, QueryRangeM class CPULoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" - sum( - irate( - container_cpu_usage_seconds_total{{ - namespace="{object.namespace}", - pod=~"{pods_selector}", - container="{object.container}" - {cluster_label} - }}[5m] - ) - ) by (container, pod, job) - """ - - -class MaxCPULoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: - pods_selector = "|".join(pod.name for pod in object.pods) - cluster_label = self.get_prometheus_cluster_label() - return f""" - max_over_time( - irate( - container_cpu_usage_seconds_total{{ - namespace="{object.namespace}", - pod=~"{pods_selector}", - container="{object.container}" - {cluster_label} - }}[{resolution}] - ) - ) by (container, pod, job) + rate( + container_cpu_usage_seconds_total{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }}[{step}] + ) """ def PercentileCPULoader(percentile: float) -> type[QueryMetric]: class PercentileCPULoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" quantile_over_time( {round(percentile / 100, 2)}, - irate( + rate( container_cpu_usage_seconds_total{{ namespace="{object.namespace}", pod=~"{pods_selector}", container="{object.container}" {cluster_label} - }}[1m] - )[{resolution}] + }}[{step}] + )[{duration}:{step}] ) """ diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory.py b/robusta_krr/core/integrations/prometheus/metrics/memory.py index ad84abe..b135ae5 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/memory.py +++ b/robusta_krr/core/integrations/prometheus/metrics/memory.py @@ -4,7 +4,7 @@ from .base import BatchedRequestMixin, FilterJobsMixin, QueryMetric, QueryRangeM class MemoryLoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" @@ -15,12 +15,12 @@ class MemoryLoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin): container="{object.container}" {cluster_label} }} - ) by (container, pod, job, id) + ) by (container, pod, job) """ class MaxMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" @@ -30,26 +30,6 @@ class MaxMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): pod=~"{pods_selector}", container="{object.container}" {cluster_label} - }}[{resolution}] + }}[{duration}:{step}] ) """ - - -def PercentileMemoryLoader(percentile: float) -> type[QueryMetric]: - class PercentileMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: - pods_selector = "|".join(pod.name for pod in object.pods) - cluster_label = self.get_prometheus_cluster_label() - return f""" - quantile_over_time( - {round(percentile / 100, 2)}, - container_memory_working_set_bytes{{ - namespace="{object.namespace}", - pod=~"{pods_selector}", - container="{object.container}" - {cluster_label} - }}[{resolution}] - ) - """ - - return PercentileMemoryLoader -- cgit v1.2.3 From 3daa36a89f33f2934400548525f0e0f9618e8195 Mon Sep 17 00:00:00 2001 From: LeaveMyYard Date: Thu, 17 Aug 2023 15:29:29 +0300 Subject: Remove time param for custom_query --- robusta_krr/core/integrations/prometheus/metrics/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py index 92a393c..87ee583 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/base.py +++ b/robusta_krr/core/integrations/prometheus/metrics/base.py @@ -107,7 +107,7 @@ class PrometheusMetric(BaseMetric, Configurable): return value else: # regular query, lighter on preformance - results = self.prometheus.custom_query(query=data.query, params={"time": data.start_time.timestamp()}) + results = self.prometheus.custom_query(query=data.query) # format the results to return the same format as custom_query_range for result in results: result["values"] = [result.pop("value")] -- cgit v1.2.3 From ef4c0deff3733cdc1d336b054239e20ff10ffd99 Mon Sep 17 00:00:00 2001 From: LeaveMyYard Date: Thu, 17 Aug 2023 16:43:02 +0300 Subject: Fix resource format bug --- robusta_krr/utils/resource_units.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/robusta_krr/utils/resource_units.py b/robusta_krr/utils/resource_units.py index 8489b75..9fd290d 100644 --- a/robusta_krr/utils/resource_units.py +++ b/robusta_krr/utils/resource_units.py @@ -42,6 +42,8 @@ def format(x: Union[float, int], /, *, base: Literal[1024, 1000] = 1024) -> str: if x < 1: return f"{int(x*1000)}m" + if x < base: + return str(x) units = ["", "K", "M", "G", "T", "P", "E"] binary_units = ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei"] -- cgit v1.2.3 From 1253ebfbdcaa404ff02eafde31ad331097d464e2 Mon Sep 17 00:00:00 2001 From: LeaveMyYard Date: Fri, 18 Aug 2023 14:07:46 +0300 Subject: Change default step to 1.25 minutes --- robusta_krr/core/abstract/strategies.py | 2 +- robusta_krr/core/integrations/prometheus/metrics/base.py | 2 +- robusta_krr/strategies/simple.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/robusta_krr/core/abstract/strategies.py b/robusta_krr/core/abstract/strategies.py index 9e6b269..204341f 100644 --- a/robusta_krr/core/abstract/strategies.py +++ b/robusta_krr/core/abstract/strategies.py @@ -49,7 +49,7 @@ class StrategySettings(pd.BaseModel): history_duration: float = pd.Field( 24 * 7 * 2, ge=1, description="The duration of the history data to use (in hours)." ) - timeframe_duration: float = pd.Field(2, ge=1, description="The step for the history data (in minutes).") + timeframe_duration: float = pd.Field(1.25, gt=0, description="The step for the history data (in minutes).") @property def history_timedelta(self) -> datetime.timedelta: diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py index 87ee583..bf44d60 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/base.py +++ b/robusta_krr/core/integrations/prometheus/metrics/base.py @@ -142,7 +142,7 @@ class PrometheusMetric(BaseMetric, Configurable): ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics. """ - step_str = "75s" # self._step_to_string(step) + step_str = f"{round(step.total_seconds())}s" duration_str = self._step_to_string(period) query = self.get_query(object, duration_str, step_str) diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index f4adee3..e15c4fa 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -43,6 +43,8 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): """ CPU request: {cpu_percentile}% percentile, limit: unset Memory request: max + {memory_buffer_percentage}%, limit: max + {memory_buffer_percentage}% + History: {history_duration} hours + Step: {timeframe_duration} minutes This strategy does not work with objects with HPA defined (Horizontal Pod Autoscaler). If HPA is defined for CPU or Memory, the strategy will return "?" for that resource. -- cgit v1.2.3 From 99ad46ea6917e5bac0b38ffc134184fbd36cb069 Mon Sep 17 00:00:00 2001 From: LeaveMyYard Date: Fri, 18 Aug 2023 14:08:42 +0300 Subject: Change default memory buffer to 15% --- robusta_krr/strategies/simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index e15c4fa..d7efae8 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -17,7 +17,7 @@ from robusta_krr.core.integrations.prometheus.metrics import MaxMemoryLoader, Pe class SimpleStrategySettings(StrategySettings): cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.") memory_buffer_percentage: float = pd.Field( - 5, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." + 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." ) def calculate_memory_proposal(self, data: PodsTimeData) -> float: -- cgit v1.2.3 From 357ce77e8419c6cb569accbaacd5470987217cfb Mon Sep 17 00:00:00 2001 From: LeaveMyYard Date: Fri, 18 Aug 2023 15:07:51 +0300 Subject: Add minimal points required for recommendation --- .../integrations/prometheus/metrics/__init__.py | 4 +-- .../core/integrations/prometheus/metrics/cpu.py | 16 +++++++++++ .../core/integrations/prometheus/metrics/memory.py | 16 +++++++++++ robusta_krr/strategies/simple.py | 31 +++++++++++++++++++--- 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/robusta_krr/core/integrations/prometheus/metrics/__init__.py b/robusta_krr/core/integrations/prometheus/metrics/__init__.py index 35d53af..fcce607 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/__init__.py +++ b/robusta_krr/core/integrations/prometheus/metrics/__init__.py @@ -1,3 +1,3 @@ from .base import PrometheusMetric -from .cpu import CPULoader, PercentileCPULoader -from .memory import MaxMemoryLoader, MemoryLoader +from .cpu import CPULoader, PercentileCPULoader, CPUAmountLoader +from .memory import MaxMemoryLoader, MemoryLoader, MemoryAmountLoader diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu.py b/robusta_krr/core/integrations/prometheus/metrics/cpu.py index 6dafb06..b9c281d 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/cpu.py +++ b/robusta_krr/core/integrations/prometheus/metrics/cpu.py @@ -39,3 +39,19 @@ def PercentileCPULoader(percentile: float) -> type[QueryMetric]: """ return PercentileCPULoader + + +class CPUAmountLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + count_over_time( + container_cpu_usage_seconds_total{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }}[{duration}] + ) + """ diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory.py b/robusta_krr/core/integrations/prometheus/metrics/memory.py index b135ae5..21843b9 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/memory.py +++ b/robusta_krr/core/integrations/prometheus/metrics/memory.py @@ -33,3 +33,19 @@ class MaxMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): }}[{duration}:{step}] ) """ + + +class MemoryAmountLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + count_over_time( + container_memory_working_set_bytes{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }}[{duration}:{step}] + ) + """ diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index d7efae8..8cb8bfc 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -11,7 +11,13 @@ from robusta_krr.core.abstract.strategies import ( RunResult, StrategySettings, ) -from robusta_krr.core.integrations.prometheus.metrics import MaxMemoryLoader, PercentileCPULoader, PrometheusMetric +from robusta_krr.core.integrations.prometheus.metrics import ( + MaxMemoryLoader, + PercentileCPULoader, + PrometheusMetric, + CPUAmountLoader, + MemoryAmountLoader, +) class SimpleStrategySettings(StrategySettings): @@ -19,6 +25,9 @@ class SimpleStrategySettings(StrategySettings): memory_buffer_percentage: float = pd.Field( 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." ) + points_required: int = pd.Field( + 100, ge=1, description="The number of data points required to make a recommendation for a resource." + ) def calculate_memory_proposal(self, data: PodsTimeData) -> float: data_ = [np.max(values[:, 1]) for values in data.values()] @@ -57,16 +66,24 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): @property def metrics(self) -> list[type[PrometheusMetric]]: - return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader] + return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader] def __calculate_cpu_proposal( self, history_data: MetricsPodData, object_data: K8sObjectData ) -> ResourceRecommendation: data = history_data["PercentileCPULoader"] + data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + filtered_data = { + pod: values for pod, values in data.items() if data_count.get(pod, 0) >= self.settings.points_required + } + + if len(filtered_data) == 0: + return ResourceRecommendation.undefined(info="Not enough data") + if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None: return ResourceRecommendation.undefined(info="HPA detected") @@ -77,14 +94,22 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): self, history_data: MetricsPodData, object_data: K8sObjectData ) -> ResourceRecommendation: data = history_data["MaxMemoryLoader"] + data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + filtered_data = { + pod: value for pod, value in data.items() if data_count.get(pod, 0) >= self.settings.points_required + } + + if len(filtered_data) == 0: + return ResourceRecommendation.undefined(info="Not enough data") + if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None: return ResourceRecommendation.undefined(info="HPA detected") - memory_usage = self.settings.calculate_memory_proposal(data) + memory_usage = self.settings.calculate_memory_proposal(filtered_data) return ResourceRecommendation(request=memory_usage, limit=memory_usage) def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: -- cgit v1.2.3 From 4d3ab4e92fb90fd34147fda85835e051f2e275a5 Mon Sep 17 00:00:00 2001 From: LeaveMyYard Date: Tue, 22 Aug 2023 16:26:20 +0300 Subject: Small fix for points filtering --- robusta_krr/strategies/simple.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index 8cb8bfc..93efb5f 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -72,11 +72,12 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): self, history_data: MetricsPodData, object_data: K8sObjectData ) -> ResourceRecommendation: data = history_data["PercentileCPULoader"] - data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} + # Here we filter out pods from calculation that have less than `points_required` data points filtered_data = { pod: values for pod, values in data.items() if data_count.get(pod, 0) >= self.settings.points_required } @@ -94,11 +95,12 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): self, history_data: MetricsPodData, object_data: K8sObjectData ) -> ResourceRecommendation: data = history_data["MaxMemoryLoader"] - data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} + # Here we filter out pods from calculation that have less than `points_required` data points filtered_data = { pod: value for pod, value in data.items() if data_count.get(pod, 0) >= self.settings.points_required } -- cgit v1.2.3