diff options
| author | Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com> | 2023-08-22 16:27:53 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-08-22 16:27:53 +0300 |
| commit | 80ef416ae5cc2f670ded18f846ce9acb0582afa5 (patch) | |
| tree | 77739268c3a51929c0825e89b7b5db45b38d4eda /robusta_krr | |
| parent | 9cba7cec2e9be89c9ea2d349b65e48460f6c1b5e (diff) | |
| parent | 4d3ab4e92fb90fd34147fda85835e051f2e275a5 (diff) | |
Merge pull request #128 from robusta-dev/calculation-improvement
Calculation algo improvement
Diffstat (limited to 'robusta_krr')
| -rw-r--r-- | robusta_krr/core/abstract/strategies.py | 2 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics/__init__.py | 4 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics/base.py | 22 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics/cpu.py | 62 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics/memory.py | 40 | ||||
| -rw-r--r-- | robusta_krr/strategies/simple.py | 37 | ||||
| -rw-r--r-- | robusta_krr/utils/resource_units.py | 2 |
7 files changed, 97 insertions, 72 deletions
diff --git a/robusta_krr/core/abstract/strategies.py b/robusta_krr/core/abstract/strategies.py index 9e6b269..204341f 100644 --- a/robusta_krr/core/abstract/strategies.py +++ b/robusta_krr/core/abstract/strategies.py @@ -49,7 +49,7 @@ class StrategySettings(pd.BaseModel): history_duration: float = pd.Field( 24 * 7 * 2, ge=1, description="The duration of the history data to use (in hours)." ) - timeframe_duration: float = pd.Field(2, ge=1, description="The step for the history data (in minutes).") + timeframe_duration: float = pd.Field(1.25, gt=0, description="The step for the history data (in minutes).") @property def history_timedelta(self) -> datetime.timedelta: diff --git a/robusta_krr/core/integrations/prometheus/metrics/__init__.py b/robusta_krr/core/integrations/prometheus/metrics/__init__.py index 6212a35..fcce607 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/__init__.py +++ b/robusta_krr/core/integrations/prometheus/metrics/__init__.py @@ -1,3 +1,3 @@ from .base import PrometheusMetric -from .cpu import CPULoader, MaxCPULoader, PercentileCPULoader -from .memory import MaxMemoryLoader, MemoryLoader, PercentileMemoryLoader +from .cpu import CPULoader, PercentileCPULoader, CPUAmountLoader +from .memory import MaxMemoryLoader, MemoryLoader, MemoryAmountLoader diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py index 177d908..bf44d60 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/base.py +++ b/robusta_krr/core/integrations/prometheus/metrics/base.py @@ -7,7 +7,7 @@ import datetime import enum from concurrent.futures import ThreadPoolExecutor from functools import reduce -from typing import TYPE_CHECKING, Any, Optional +from typing import Any, Optional import numpy as np import pydantic as pd @@ -17,9 +17,7 @@ from robusta_krr.core.abstract.strategies import PodsTimeData from robusta_krr.core.models.config import Config from robusta_krr.core.models.objects import K8sObjectData from robusta_krr.utils.configurable import Configurable - -if TYPE_CHECKING: - from .. import CustomPrometheusConnect +from prometrix import CustomPrometheusConnect class QueryType(str, enum.Enum): @@ -69,13 +67,14 @@ class PrometheusMetric(BaseMetric, Configurable): return f', {self.config.prometheus_label}="{self.config.prometheus_cluster_label}"' @abc.abstractmethod - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: """ This method should be implemented by all subclasses to provide a query string to fetch metrics. Args: object (K8sObjectData): The object for which metrics need to be fetched. - resolution (Optional[str]): a string for configurable resolution to the query. + duration (str): a string for duration of the query. + step (str): a string for the step size of the query. Returns: str: The query string. @@ -142,9 +141,12 @@ class PrometheusMetric(BaseMetric, Configurable): Returns: ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics. """ - resolution = f"{self._step_to_string(period)}:{self._step_to_string(step)}" - query = self.get_query(object, resolution) - end_time = datetime.datetime.now().astimezone() + + step_str = f"{round(step.total_seconds())}s" + duration_str = self._step_to_string(period) + + query = self.get_query(object, duration_str, step_str) + end_time = datetime.datetime.now().replace(second=0, microsecond=0).astimezone() start_time = end_time - period result = await self.query_prometheus( @@ -152,7 +154,7 @@ class PrometheusMetric(BaseMetric, Configurable): query=query, start_time=start_time, end_time=end_time, - step=self._step_to_string(step), + step=step_str, type=self.query_type, ) ) diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu.py b/robusta_krr/core/integrations/prometheus/metrics/cpu.py index 302e09d..b9c281d 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/cpu.py +++ b/robusta_krr/core/integrations/prometheus/metrics/cpu.py @@ -4,58 +4,54 @@ from .base import BatchedRequestMixin, FilterJobsMixin, QueryMetric, QueryRangeM class CPULoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" - sum( - irate( - container_cpu_usage_seconds_total{{ - namespace="{object.namespace}", - pod=~"{pods_selector}", - container="{object.container}" - {cluster_label} - }}[5m] - ) - ) by (container, pod, job) - """ - - -class MaxCPULoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: - pods_selector = "|".join(pod.name for pod in object.pods) - cluster_label = self.get_prometheus_cluster_label() - return f""" - max_over_time( - irate( - container_cpu_usage_seconds_total{{ - namespace="{object.namespace}", - pod=~"{pods_selector}", - container="{object.container}" - {cluster_label} - }}[{resolution}] - ) - ) by (container, pod, job) + rate( + container_cpu_usage_seconds_total{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }}[{step}] + ) """ def PercentileCPULoader(percentile: float) -> type[QueryMetric]: class PercentileCPULoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" quantile_over_time( {round(percentile / 100, 2)}, - irate( + rate( container_cpu_usage_seconds_total{{ namespace="{object.namespace}", pod=~"{pods_selector}", container="{object.container}" {cluster_label} - }}[1m] - )[{resolution}] + }}[{step}] + )[{duration}:{step}] ) """ return PercentileCPULoader + + +class CPUAmountLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + count_over_time( + container_cpu_usage_seconds_total{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }}[{duration}] + ) + """ diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory.py b/robusta_krr/core/integrations/prometheus/metrics/memory.py index ad84abe..21843b9 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/memory.py +++ b/robusta_krr/core/integrations/prometheus/metrics/memory.py @@ -4,7 +4,7 @@ from .base import BatchedRequestMixin, FilterJobsMixin, QueryMetric, QueryRangeM class MemoryLoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" @@ -15,12 +15,12 @@ class MemoryLoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin): container="{object.container}" {cluster_label} }} - ) by (container, pod, job, id) + ) by (container, pod, job) """ class MaxMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: pods_selector = "|".join(pod.name for pod in object.pods) cluster_label = self.get_prometheus_cluster_label() return f""" @@ -30,26 +30,22 @@ class MaxMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): pod=~"{pods_selector}", container="{object.container}" {cluster_label} - }}[{resolution}] + }}[{duration}:{step}] ) """ -def PercentileMemoryLoader(percentile: float) -> type[QueryMetric]: - class PercentileMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): - def get_query(self, object: K8sObjectData, resolution: str) -> str: - pods_selector = "|".join(pod.name for pod in object.pods) - cluster_label = self.get_prometheus_cluster_label() - return f""" - quantile_over_time( - {round(percentile / 100, 2)}, - container_memory_working_set_bytes{{ - namespace="{object.namespace}", - pod=~"{pods_selector}", - container="{object.container}" - {cluster_label} - }}[{resolution}] - ) - """ - - return PercentileMemoryLoader +class MemoryAmountLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin): + def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: + pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() + return f""" + count_over_time( + container_memory_working_set_bytes{{ + namespace="{object.namespace}", + pod=~"{pods_selector}", + container="{object.container}" + {cluster_label} + }}[{duration}:{step}] + ) + """ diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py index f4adee3..93efb5f 100644 --- a/robusta_krr/strategies/simple.py +++ b/robusta_krr/strategies/simple.py @@ -11,13 +11,22 @@ from robusta_krr.core.abstract.strategies import ( RunResult, StrategySettings, ) -from robusta_krr.core.integrations.prometheus.metrics import MaxMemoryLoader, PercentileCPULoader, PrometheusMetric +from robusta_krr.core.integrations.prometheus.metrics import ( + MaxMemoryLoader, + PercentileCPULoader, + PrometheusMetric, + CPUAmountLoader, + MemoryAmountLoader, +) class SimpleStrategySettings(StrategySettings): cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.") memory_buffer_percentage: float = pd.Field( - 5, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." + 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." + ) + points_required: int = pd.Field( + 100, ge=1, description="The number of data points required to make a recommendation for a resource." ) def calculate_memory_proposal(self, data: PodsTimeData) -> float: @@ -43,6 +52,8 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): """ CPU request: {cpu_percentile}% percentile, limit: unset Memory request: max + {memory_buffer_percentage}%, limit: max + {memory_buffer_percentage}% + History: {history_duration} hours + Step: {timeframe_duration} minutes This strategy does not work with objects with HPA defined (Horizontal Pod Autoscaler). If HPA is defined for CPU or Memory, the strategy will return "?" for that resource. @@ -55,7 +66,7 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): @property def metrics(self) -> list[type[PrometheusMetric]]: - return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader] + return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader] def __calculate_cpu_proposal( self, history_data: MetricsPodData, object_data: K8sObjectData @@ -65,6 +76,15 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} + # Here we filter out pods from calculation that have less than `points_required` data points + filtered_data = { + pod: values for pod, values in data.items() if data_count.get(pod, 0) >= self.settings.points_required + } + + if len(filtered_data) == 0: + return ResourceRecommendation.undefined(info="Not enough data") + if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None: return ResourceRecommendation.undefined(info="HPA detected") @@ -79,10 +99,19 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): if len(data) == 0: return ResourceRecommendation.undefined(info="No data") + data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} + # Here we filter out pods from calculation that have less than `points_required` data points + filtered_data = { + pod: value for pod, value in data.items() if data_count.get(pod, 0) >= self.settings.points_required + } + + if len(filtered_data) == 0: + return ResourceRecommendation.undefined(info="Not enough data") + if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None: return ResourceRecommendation.undefined(info="HPA detected") - memory_usage = self.settings.calculate_memory_proposal(data) + memory_usage = self.settings.calculate_memory_proposal(filtered_data) return ResourceRecommendation(request=memory_usage, limit=memory_usage) def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: diff --git a/robusta_krr/utils/resource_units.py b/robusta_krr/utils/resource_units.py index 8489b75..9fd290d 100644 --- a/robusta_krr/utils/resource_units.py +++ b/robusta_krr/utils/resource_units.py @@ -42,6 +42,8 @@ def format(x: Union[float, int], /, *, base: Literal[1024, 1000] = 1024) -> str: if x < 1: return f"{int(x*1000)}m" + if x < base: + return str(x) units = ["", "K", "M", "G", "T", "P", "E"] binary_units = ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei"] |
