summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- robusta_krr/core/abstract/strategies.py 2
-rw-r--r-- robusta_krr/core/integrations/prometheus/metrics/__init__.py 4
-rw-r--r-- robusta_krr/core/integrations/prometheus/metrics/base.py 22
-rw-r--r-- robusta_krr/core/integrations/prometheus/metrics/cpu.py 62
-rw-r--r-- robusta_krr/core/integrations/prometheus/metrics/memory.py 40
-rw-r--r-- robusta_krr/strategies/simple.py 37
-rw-r--r-- robusta_krr/utils/resource_units.py 2
7 files changed, 97 insertions, 72 deletions
diff --git a/robusta_krr/core/abstract/strategies.py b/robusta_krr/core/abstract/strategies.py
index 9e6b269..204341f 100644
--- a/robusta_krr/core/abstract/strategies.py
+++ b/robusta_krr/core/abstract/strategies.py
@@ -49,7 +49,7 @@ class StrategySettings(pd.BaseModel):
history_duration: float = pd.Field(
24 * 7 * 2, ge=1, description="The duration of the history data to use (in hours)."
)
- timeframe_duration: float = pd.Field(2, ge=1, description="The step for the history data (in minutes).")
+ timeframe_duration: float = pd.Field(1.25, gt=0, description="The step for the history data (in minutes).")
@property
def history_timedelta(self) -> datetime.timedelta:
diff --git a/robusta_krr/core/integrations/prometheus/metrics/__init__.py b/robusta_krr/core/integrations/prometheus/metrics/__init__.py
index 6212a35..fcce607 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/__init__.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/__init__.py
@@ -1,3 +1,3 @@
from .base import PrometheusMetric
-from .cpu import CPULoader, MaxCPULoader, PercentileCPULoader
-from .memory import MaxMemoryLoader, MemoryLoader, PercentileMemoryLoader
+from .cpu import CPULoader, PercentileCPULoader, CPUAmountLoader
+from .memory import MaxMemoryLoader, MemoryLoader, MemoryAmountLoader
diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py
index 177d908..bf44d60 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/base.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/base.py
@@ -7,7 +7,7 @@ import datetime
import enum
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Any, Optional
import numpy as np
import pydantic as pd
@@ -17,9 +17,7 @@ from robusta_krr.core.abstract.strategies import PodsTimeData
from robusta_krr.core.models.config import Config
from robusta_krr.core.models.objects import K8sObjectData
from robusta_krr.utils.configurable import Configurable
-
-if TYPE_CHECKING:
- from .. import CustomPrometheusConnect
+from prometrix import CustomPrometheusConnect
class QueryType(str, enum.Enum):
@@ -69,13 +67,14 @@ class PrometheusMetric(BaseMetric, Configurable):
return f', {self.config.prometheus_label}="{self.config.prometheus_cluster_label}"'
@abc.abstractmethod
- def get_query(self, object: K8sObjectData, resolution: str) -> str:
+ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
"""
This method should be implemented by all subclasses to provide a query string to fetch metrics.
Args:
object (K8sObjectData): The object for which metrics need to be fetched.
- resolution (Optional[str]): a string for configurable resolution to the query.
+ duration (str): a string for duration of the query.
+ step (str): a string for the step size of the query.
Returns:
str: The query string.
@@ -142,9 +141,12 @@ class PrometheusMetric(BaseMetric, Configurable):
Returns:
ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics.
"""
- resolution = f"{self._step_to_string(period)}:{self._step_to_string(step)}"
- query = self.get_query(object, resolution)
- end_time = datetime.datetime.now().astimezone()
+
+ step_str = f"{round(step.total_seconds())}s"
+ duration_str = self._step_to_string(period)
+
+ query = self.get_query(object, duration_str, step_str)
+ end_time = datetime.datetime.now().replace(second=0, microsecond=0).astimezone()
start_time = end_time - period
result = await self.query_prometheus(
@@ -152,7 +154,7 @@ class PrometheusMetric(BaseMetric, Configurable):
query=query,
start_time=start_time,
end_time=end_time,
- step=self._step_to_string(step),
+ step=step_str,
type=self.query_type,
)
)
diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu.py b/robusta_krr/core/integrations/prometheus/metrics/cpu.py
index 302e09d..b9c281d 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/cpu.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/cpu.py
@@ -4,58 +4,54 @@ from .base import BatchedRequestMixin, FilterJobsMixin, QueryMetric, QueryRangeM
class CPULoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin):
- def get_query(self, object: K8sObjectData, resolution: str) -> str:
+ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
cluster_label = self.get_prometheus_cluster_label()
return f"""
- sum(
- irate(
- container_cpu_usage_seconds_total{{
- namespace="{object.namespace}",
- pod=~"{pods_selector}",
- container="{object.container}"
- {cluster_label}
- }}[5m]
- )
- ) by (container, pod, job)
- """
-
-
-class MaxCPULoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin):
- def get_query(self, object: K8sObjectData, resolution: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
- cluster_label = self.get_prometheus_cluster_label()
- return f"""
- max_over_time(
- irate(
- container_cpu_usage_seconds_total{{
- namespace="{object.namespace}",
- pod=~"{pods_selector}",
- container="{object.container}"
- {cluster_label}
- }}[{resolution}]
- )
- ) by (container, pod, job)
+ rate(
+ container_cpu_usage_seconds_total{{
+ namespace="{object.namespace}",
+ pod=~"{pods_selector}",
+ container="{object.container}"
+ {cluster_label}
+ }}[{step}]
+ )
"""
+# Factory returning a metric class whose query is bound to `percentile`.
+# `percentile` is divided by 100 before being passed to quantile_over_time.
+# NOTE(review): round(percentile / 100, 2) keeps only two decimals of the quantile, so a
+# fractional setting such as 99.9 is sent as 1.0 — confirm whether that is intended.
def PercentileCPULoader(percentile: float) -> type[QueryMetric]:
class PercentileCPULoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin):
- def get_query(self, object: K8sObjectData, resolution: str) -> str:
+ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
+ # All pod names are joined into one alternation for the `pod=~` regex matcher.
pods_selector = "|".join(pod.name for pod in object.pods)
cluster_label = self.get_prometheus_cluster_label()
+ # After this change the rate window and the subquery resolution are both `step`,
+ # and the subquery spans `duration`.
return f"""
quantile_over_time(
{round(percentile / 100, 2)},
- irate(
+ rate(
container_cpu_usage_seconds_total{{
namespace="{object.namespace}",
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
- }}[1m]
- )[{resolution}]
+ }}[{step}]
+ )[{duration}:{step}]
)
"""
return PercentileCPULoader
+
+
+class CPUAmountLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin):
+    """Metric that counts the CPU usage data points available for the selected pods' container."""
+
+    def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
+        """
+        Build the query counting CPU usage points over the history window.
+
+        Args:
+            object (K8sObjectData): The object whose namespace, pods and container are selected.
+            duration (str): a string for duration of the query.
+            step (str): a string for the step size of the query.
+
+        Returns:
+            str: The query string.
+        """
+        pods_selector = "|".join(pod.name for pod in object.pods)
+        cluster_label = self.get_prometheus_cluster_label()
+        # Count at the `duration:step` subquery resolution, exactly as MemoryAmountLoader does.
+        # A plain `[duration]` range would count raw scraped samples, so the result would depend
+        # on the scrape interval and would not be comparable with the memory count when both
+        # are checked against the same `points_required` threshold.
+        return f"""
+            count_over_time(
+                container_cpu_usage_seconds_total{{
+                    namespace="{object.namespace}",
+                    pod=~"{pods_selector}",
+                    container="{object.container}"
+                    {cluster_label}
+                }}[{duration}:{step}]
+            )
+        """
diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory.py b/robusta_krr/core/integrations/prometheus/metrics/memory.py
index ad84abe..21843b9 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/memory.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/memory.py
@@ -4,7 +4,7 @@ from .base import BatchedRequestMixin, FilterJobsMixin, QueryMetric, QueryRangeM
class MemoryLoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin):
- def get_query(self, object: K8sObjectData, resolution: str) -> str:
+ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
cluster_label = self.get_prometheus_cluster_label()
return f"""
@@ -15,12 +15,12 @@ class MemoryLoader(QueryRangeMetric, FilterJobsMixin, BatchedRequestMixin):
container="{object.container}"
{cluster_label}
}}
- ) by (container, pod, job, id)
+ ) by (container, pod, job)
"""
class MaxMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin):
- def get_query(self, object: K8sObjectData, resolution: str) -> str:
+ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
cluster_label = self.get_prometheus_cluster_label()
return f"""
@@ -30,26 +30,22 @@ class MaxMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin):
pod=~"{pods_selector}",
container="{object.container}"
{cluster_label}
- }}[{resolution}]
+ }}[{duration}:{step}]
)
"""
-def PercentileMemoryLoader(percentile: float) -> type[QueryMetric]:
- class PercentileMemoryLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin):
- def get_query(self, object: K8sObjectData, resolution: str) -> str:
- pods_selector = "|".join(pod.name for pod in object.pods)
- cluster_label = self.get_prometheus_cluster_label()
- return f"""
- quantile_over_time(
- {round(percentile / 100, 2)},
- container_memory_working_set_bytes{{
- namespace="{object.namespace}",
- pod=~"{pods_selector}",
- container="{object.container}"
- {cluster_label}
- }}[{resolution}]
- )
- """
-
- return PercentileMemoryLoader
+# Metric whose query counts the points of container_memory_working_set_bytes produced by a
+# `duration:step` subquery for the selected namespace, pods and container.
+# SimpleStrategy compares this count with `points_required` before recommending memory.
+class MemoryAmountLoader(QueryMetric, FilterJobsMixin, BatchedRequestMixin):
+ def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
+ # All pod names are joined into one alternation for the `pod=~` regex matcher.
+ pods_selector = "|".join(pod.name for pod in object.pods)
+ # Extra label matcher text appended inside the selector braces.
+ cluster_label = self.get_prometheus_cluster_label()
+ return f"""
+ count_over_time(
+ container_memory_working_set_bytes{{
+ namespace="{object.namespace}",
+ pod=~"{pods_selector}",
+ container="{object.container}"
+ {cluster_label}
+ }}[{duration}:{step}]
+ )
+ """
diff --git a/robusta_krr/strategies/simple.py b/robusta_krr/strategies/simple.py
index f4adee3..93efb5f 100644
--- a/robusta_krr/strategies/simple.py
+++ b/robusta_krr/strategies/simple.py
@@ -11,13 +11,22 @@ from robusta_krr.core.abstract.strategies import (
RunResult,
StrategySettings,
)
-from robusta_krr.core.integrations.prometheus.metrics import MaxMemoryLoader, PercentileCPULoader, PrometheusMetric
+from robusta_krr.core.integrations.prometheus.metrics import (
+ MaxMemoryLoader,
+ PercentileCPULoader,
+ PrometheusMetric,
+ CPUAmountLoader,
+ MemoryAmountLoader,
+)
class SimpleStrategySettings(StrategySettings):
cpu_percentile: float = pd.Field(99, gt=0, le=100, description="The percentile to use for the CPU recommendation.")
memory_buffer_percentage: float = pd.Field(
- 5, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation."
+ 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation."
+ )
+ points_required: int = pd.Field(
+ 100, ge=1, description="The number of data points required to make a recommendation for a resource."
)
def calculate_memory_proposal(self, data: PodsTimeData) -> float:
@@ -43,6 +52,8 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
"""
CPU request: {cpu_percentile}% percentile, limit: unset
Memory request: max + {memory_buffer_percentage}%, limit: max + {memory_buffer_percentage}%
+ History: {history_duration} hours
+ Step: {timeframe_duration} minutes
This strategy does not work with objects with HPA defined (Horizontal Pod Autoscaler).
If HPA is defined for CPU or Memory, the strategy will return "?" for that resource.
@@ -55,7 +66,7 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
@property
def metrics(self) -> list[type[PrometheusMetric]]:
- return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader]
+ return [PercentileCPULoader(self.settings.cpu_percentile), MaxMemoryLoader, CPUAmountLoader, MemoryAmountLoader]
def __calculate_cpu_proposal(
self, history_data: MetricsPodData, object_data: K8sObjectData
@@ -65,6 +76,15 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
if len(data) == 0:
return ResourceRecommendation.undefined(info="No data")
+ # Per-pod point count taken from element [0, 1] of each CPUAmountLoader result
+ # (presumably the value column of the single returned sample — TODO confirm).
+ data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()}
+ # Here we filter out pods from calculation that have less than `points_required` data points
+ # Pods missing from `data_count` are treated as having 0 points and are dropped as well.
+ filtered_data = {
+ pod: values for pod, values in data.items() if data_count.get(pod, 0) >= self.settings.points_required
+ }
+
+ # NOTE(review): within this hunk `filtered_data` is only used for the emptiness check below.
+ # The memory path in this same diff switches its proposal call to `filtered_data`, but no
+ # equivalent change is visible for the CPU proposal — confirm the downstream call does not
+ # still pass the unfiltered `data`.
+ if len(filtered_data) == 0:
+ return ResourceRecommendation.undefined(info="Not enough data")
+
if object_data.hpa is not None and object_data.hpa.target_cpu_utilization_percentage is not None:
return ResourceRecommendation.undefined(info="HPA detected")
@@ -79,10 +99,19 @@ class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
if len(data) == 0:
return ResourceRecommendation.undefined(info="No data")
+ data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()}
+ # Here we filter out pods from calculation that have less than `points_required` data points
+ filtered_data = {
+ pod: value for pod, value in data.items() if data_count.get(pod, 0) >= self.settings.points_required
+ }
+
+ if len(filtered_data) == 0:
+ return ResourceRecommendation.undefined(info="Not enough data")
+
if object_data.hpa is not None and object_data.hpa.target_memory_utilization_percentage is not None:
return ResourceRecommendation.undefined(info="HPA detected")
- memory_usage = self.settings.calculate_memory_proposal(data)
+ memory_usage = self.settings.calculate_memory_proposal(filtered_data)
return ResourceRecommendation(request=memory_usage, limit=memory_usage)
def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
diff --git a/robusta_krr/utils/resource_units.py b/robusta_krr/utils/resource_units.py
index 8489b75..9fd290d 100644
--- a/robusta_krr/utils/resource_units.py
+++ b/robusta_krr/utils/resource_units.py
@@ -42,6 +42,8 @@ def format(x: Union[float, int], /, *, base: Literal[1024, 1000] = 1024) -> str:
if x < 1:
return f"{int(x*1000)}m"
+ if x < base:
+ return str(x)
units = ["", "K", "M", "G", "T", "P", "E"]
binary_units = ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei"]