diff options
| author | Павел Жуков <33721692+LeaveMyYard@users.noreply.github.com> | 2023-04-27 18:13:02 +0300 |
|---|---|---|
| committer | Павел Жуков <33721692+LeaveMyYard@users.noreply.github.com> | 2023-04-27 18:13:02 +0300 |
| commit | 3729eec24a8b3cec8fee2c89b5718ccd63fd4c67 (patch) | |
| tree | 0bd43f4da150be37cb153ce0b3654b10e66def99 /robusta_krr | |
| parent | afbb08efb31031ffee80b919256c0a1015365520 (diff) | |
Add metric filtering to fix bug with double value
Diffstat (limited to 'robusta_krr')
5 files changed, 69 insertions, 7 deletions
diff --git a/robusta_krr/core/integrations/prometheus/loader.py b/robusta_krr/core/integrations/prometheus/loader.py index 8938496..c0c1961 100644 --- a/robusta_krr/core/integrations/prometheus/loader.py +++ b/robusta_krr/core/integrations/prometheus/loader.py @@ -111,11 +111,10 @@ class PrometheusLoader(Configurable): resource: ResourceType, period: datetime.timedelta, *, - timeframe: datetime.timedelta = datetime.timedelta(minutes=30), + step: datetime.timedelta = datetime.timedelta(minutes=30), ) -> ResourceHistoryData: self.debug(f"Gathering data for {object} and {resource}") - step = f"{int(timeframe.total_seconds()) // 60}m" MetricLoaderType = BaseMetricLoader.get_by_resource(resource) metric_loader = MetricLoaderType(self.config, self.prometheus) return await metric_loader.load_data(object, period, step) diff --git a/robusta_krr/core/integrations/prometheus/metrics/base_metric.py b/robusta_krr/core/integrations/prometheus/metrics/base_metric.py index 9d366c8..407d3e3 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/base_metric.py +++ b/robusta_krr/core/integrations/prometheus/metrics/base_metric.py @@ -26,11 +26,23 @@ class BaseMetricLoader(Configurable, abc.ABC): def get_query(self, namespace: str, pod: str, container: str) -> str: ... - async def load_data(self, object: K8sObjectData, period: datetime.timedelta, step: str) -> ResourceHistoryData: + async def query_prometheus( + self, query: str, start_time: datetime.datetime, end_time: datetime.datetime, step: datetime.timedelta + ) -> list[dict]: + return await asyncio.to_thread( + self.prometheus.custom_query_range, + query=query, + start_time=start_time, + end_time=end_time, + step=f"{int(step.total_seconds()) // 60}m", + ) + + async def load_data( + self, object: K8sObjectData, period: datetime.timedelta, step: datetime.timedelta + ) -> ResourceHistoryData: result = await asyncio.gather( *[ - asyncio.to_thread( - self.prometheus.custom_query_range, + self.query_prometheus( query=self.get_query(object.namespace, pod, object.container), start_time=datetime.datetime.now() - period, end_time=datetime.datetime.now(), diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py b/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py index a8c2b8e..d2ad841 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py +++ b/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py @@ -6,4 +6,4 @@ from .base_metric import BaseMetricLoader, bind_metric @bind_metric(ResourceType.CPU) class CPUMetricLoader(BaseMetricLoader): def get_query(self, namespace: str, pod: str, container: str) -> str: - return f'sum(irate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod="{pod}", container="{container}"}}[1m]))' + return f'sum(irate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod="{pod}", container="{container}"}}[5m])) by (container, pod, job)' diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py b/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py index 4e018ae..6d5bc0c 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py +++ b/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py @@ -1,9 +1,60 @@ +import datetime +from typing import Any, Optional + from robusta_krr.core.models.allocations import ResourceType from .base_metric import BaseMetricLoader, bind_metric +PrometheusSeries = Any + @bind_metric(ResourceType.Memory) class MemoryMetricLoader(BaseMetricLoader): + @staticmethod + def get_target_name(series: PrometheusSeries) -> Optional[str]: + for label in ["container", "pod", "node"]: + if label in series.metric: + return series.metric[label] + return None + + @staticmethod + def filter_prom_jobs_results( + series_list_result: list[PrometheusSeries], + ) -> list[PrometheusSeries]: + """ + Because there might be multiple metrics with the same name, we need to filter them out. + + :param series_list_result: list of PrometheusSeries + """ + + if len(series_list_result) == 1: + return series_list_result + + target_names = { + MemoryMetricLoader.get_target_name(series) + for series in series_list_result + if MemoryMetricLoader.get_target_name(series) + } + return_list: list[PrometheusSeries] = [] + + # takes kubelet job if exists, return first job alphabetically if it doesn't + for target_name in target_names: + relevant_series = [ + series for series in series_list_result if MemoryMetricLoader.get_target_name(series) == target_name + ] + relevant_kubelet_metric = [series for series in relevant_series if series.metric.get("job") == "kubelet"] + if len(relevant_kubelet_metric) == 1: + return_list.append(relevant_kubelet_metric[0]) + continue + sorted_relevant_series = sorted(relevant_series, key=lambda series: series.metric.get("job"), reverse=False) + return_list.append(sorted_relevant_series[0]) + return return_list + + async def query_prometheus( + self, query: str, start_time: datetime.datetime, end_time: datetime.datetime, step: datetime.timedelta + ) -> list[PrometheusSeries]: + result = await super().query_prometheus(query, start_time, end_time, step) + return self.filter_prom_jobs_results(result) + def get_query(self, namespace: str, pod: str, container: str) -> str: return f'sum(container_memory_working_set_bytes{{image!="", namespace="{namespace}", pod="{pod}", container="{container}"}})' diff --git a/robusta_krr/core/runner.py b/robusta_krr/core/runner.py index 00dcbb9..f4d4d94 100644 --- a/robusta_krr/core/runner.py +++ b/robusta_krr/core/runner.py @@ -94,7 +94,7 @@ class Runner(Configurable): object, resource, self._strategy.settings.history_timedelta, - timeframe=self._strategy.settings.timeframe_timedelta, + step=self._strategy.settings.timeframe_timedelta, ) for resource in ResourceType ] |
