summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi-Robusta <97387909+Avi-Robusta@users.noreply.github.com>2023-06-14 12:08:31 +0300
committerGitHub <noreply@github.com>2023-06-14 12:08:31 +0300
commit31a192559010bb72e962844183a0a8bd4f83c5e9 (patch)
tree59dfca10049f5cf328820977048d333151d9096f
parentb69c44783dbf81dcf7087e4f395f5ce19baf3f8c (diff)
parent142b6ac3d310a187cae40e88f1c0c5a7b11b0fe3 (diff)
Merge pull request #70 from robusta-dev/centralized-support
Centralized multi-cluster Prometheus/Thanos/VM support
-rw-r--r--README.md33
-rw-r--r--robusta_krr/core/integrations/prometheus/__init__.py7
-rw-r--r--robusta_krr/core/integrations/prometheus/loader.py1
-rw-r--r--robusta_krr/core/integrations/prometheus/metrics/base_metric.py11
-rw-r--r--robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py2
-rw-r--r--robusta_krr/core/integrations/prometheus/metrics/memory_metric.py2
-rw-r--r--robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py6
-rw-r--r--robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py30
-rw-r--r--robusta_krr/core/models/config.py1
-rw-r--r--robusta_krr/core/models/objects.py3
-rw-r--r--robusta_krr/core/runner.py13
-rw-r--r--robusta_krr/main.py8
12 files changed, 108 insertions, 9 deletions
diff --git a/README.md b/README.md
index bc1ede3..dc5c278 100644
--- a/README.md
+++ b/README.md
@@ -274,9 +274,10 @@ krr simple --help
<!-- Port-forwarding -->
-## Prometheus auto-discovery
+## Prometheus, Victoria Metrics and Thanos auto-discovery
-By default, KRR will try to auto-discover the running Prometheus by scanning those labels:
+By default, KRR will try to auto-discover the running Prometheus, Victoria Metrics or Thanos instance.
+To discover Prometheus, it scans services for these labels:
```python
"app=kube-prometheus-stack-prometheus"
"app=prometheus,component=server"
@@ -286,8 +287,22 @@ By default, KRR will try to auto-discover the running Prometheus by scanning tho
"app=rancher-monitoring-prometheus"
"app=prometheus-prometheus"
```
+For Thanos, it scans for these labels:
+```python
+"app.kubernetes.io/component=query,app.kubernetes.io/name=thanos",
+"app.kubernetes.io/name=thanos-query",
+"app=thanos-query",
+"app=thanos-querier",
+```
+And for Victoria Metrics, it scans for the following labels:
+```python
+"app.kubernetes.io/name=vmsingle",
+"app.kubernetes.io/name=victoria-metrics-single",
+"app.kubernetes.io/name=vmselect",
+"app=vmselect",
+```
-If none of those labels result in finding Prometheus, you will get an error and will have to pass the working url explicitly (using the `-p` flag).
+If none of those labels result in finding Prometheus, Victoria Metrics or Thanos, you will get an error and will have to pass the working url explicitly (using the `-p` flag).
<p align="right">(<a href="#readme-top">back to top</a>)</p>
@@ -309,6 +324,18 @@ krr simple -p http://127.0.0.1:9090
<p align="right">(<a href="#readme-top">back to top</a>)</p>
+## Scanning with a centralized Prometheus
+
+If your Prometheus monitors multiple clusters, KRR requires the label you defined for your cluster in Prometheus.
+
+For example, if your cluster has the Prometheus label `cluster: "my-cluster-name"` and your Prometheus is at the URL `http://my-centralized-prometheus:9090`, then run this command:
+
+```sh
+krr.py simple -p http://my-centralized-prometheus:9090 -l my-cluster-name
+```
+
+<p align="right">(<a href="#readme-top">back to top</a>)</p>
+
<!-- Formatters -->
## Available formatters
diff --git a/robusta_krr/core/integrations/prometheus/__init__.py b/robusta_krr/core/integrations/prometheus/__init__.py
index 3d298c6..6cec9a6 100644
--- a/robusta_krr/core/integrations/prometheus/__init__.py
+++ b/robusta_krr/core/integrations/prometheus/__init__.py
@@ -1,2 +1,7 @@
from .loader import MetricsLoader
-from .metrics_service.prometheus_metrics_service import CustomPrometheusConnect, PrometheusDiscovery, PrometheusNotFound
+from .metrics_service.prometheus_metrics_service import (
+ ClusterNotSpecifiedException,
+ CustomPrometheusConnect,
+ PrometheusDiscovery,
+ PrometheusNotFound,
+)
diff --git a/robusta_krr/core/integrations/prometheus/loader.py b/robusta_krr/core/integrations/prometheus/loader.py
index 773e8b7..cb33a4e 100644
--- a/robusta_krr/core/integrations/prometheus/loader.py
+++ b/robusta_krr/core/integrations/prometheus/loader.py
@@ -62,6 +62,7 @@ class MetricsLoader(Configurable):
loader = metric_service_class(config, api_client=api_client, cluster=cluster)
loader.check_connection()
self.echo(f"{service_name} found")
+ loader.validate_cluster_name()
return loader
except MetricsNotFound as e:
self.debug(f"{service_name} not found")
diff --git a/robusta_krr/core/integrations/prometheus/metrics/base_metric.py b/robusta_krr/core/integrations/prometheus/metrics/base_metric.py
index 7118d8c..aea633c 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/base_metric.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/base_metric.py
@@ -30,6 +30,17 @@ class BaseMetricLoader(Configurable, abc.ABC):
super().__init__(config)
self.prometheus = prometheus
+ def get_prometheus_cluster_label(self) -> str:
+ """
+ Generates the cluster label for querying a centralized Prometheus
+
+ Returns:
+ str: a promql safe label string for querying the cluster.
+ """
+ if self.config.prometheus_cluster_label is None:
+ return ""
+ return f', cluster="{self.config.prometheus_cluster_label}"'
+
@abc.abstractmethod
def get_query(self, object: K8sObjectData) -> str:
"""
diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py b/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py
index 67ec98a..e2d364b 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py
@@ -9,10 +9,12 @@ from .base_metric import bind_metric
class CPUMetricLoader(BaseFilteredMetricLoader):
def get_query(self, object: K8sObjectData) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
+ cluster_label = self.get_prometheus_cluster_label()
return (
"sum(irate(container_cpu_usage_seconds_total{"
f'namespace="{object.namespace}", '
f'pod=~"{pods_selector}", '
f'container="{object.container}"'
+ f"{cluster_label}"
"}[5m])) by (container, pod, job)"
)
diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py b/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py
index ddb056a..5942ec1 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py
@@ -9,10 +9,12 @@ from .base_metric import bind_metric
class MemoryMetricLoader(BaseFilteredMetricLoader):
def get_query(self, object: K8sObjectData) -> str:
pods_selector = "|".join(pod.name for pod in object.pods)
+ cluster_label = self.get_prometheus_cluster_label()
return (
"sum(container_memory_working_set_bytes{"
f'namespace="{object.namespace}", '
f'pod=~"{pods_selector}", '
f'container="{object.container}"'
+ f"{cluster_label}"
"}) by (container, pod, job)"
)
diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py
index 21b3706..9ac308b 100644
--- a/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py
+++ b/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py
@@ -1,6 +1,6 @@
import abc
import datetime
-from typing import Optional
+from typing import List, Optional
from kubernetes.client.api_client import ApiClient
@@ -39,6 +39,10 @@ class MetricsService(Configurable, abc.ABC):
return classname.replace("MetricsService", "") if classname != MetricsService.__name__ else classname
@abc.abstractmethod
+ async def get_cluster_names(self) -> Optional[List[str]]:
+ ...
+
+ @abc.abstractmethod
async def gather_data(
self,
object: K8sObjectData,
diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py
index c5d42ef..4f1c6f2 100644
--- a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py
+++ b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py
@@ -1,6 +1,6 @@
import asyncio
import datetime
-from typing import Optional, no_type_check
+from typing import List, Optional, no_type_check
import requests
from kubernetes.client import ApiClient
@@ -50,6 +50,14 @@ class PrometheusNotFound(MetricsNotFound):
pass
+class ClusterNotSpecifiedException(Exception):
+ """
+ An exception raised when a Prometheus requires a cluster label but none, or an invalid one, is provided.
+ """
+
+ pass
+
+
class CustomPrometheusConnect(PrometheusConnect):
"""
Custom PrometheusConnect class to handle retries.
@@ -134,6 +142,26 @@ class PrometheusMetricsService(MetricsService):
async def query(self, query: str) -> dict:
return await asyncio.to_thread(self.prometheus.custom_query, query=query)
+ def validate_cluster_name(self):
+ cluster_label = self.config.prometheus_cluster_label
+ cluster_names = self.get_cluster_names()
+
+ if len(cluster_names) <= 1:
+ # there is only one cluster of metrics in this prometheus
+ return
+
+ if not cluster_label:
+ raise ClusterNotSpecifiedException(
+ f"No label specified, Rerun krr with the flag `-l <cluster>` where <cluster> is one of {cluster_names}"
+ )
+ if cluster_label not in cluster_names:
+ raise ClusterNotSpecifiedException(
+ f"Label {cluster_label} does not exist, Rerun krr with the flag `-l <cluster>` where <cluster> is one of {cluster_names}"
+ )
+
+ def get_cluster_names(self) -> Optional[List[str]]:
+ return self.prometheus.get_label_values(label_name="cluster")
+
async def gather_data(
self,
object: K8sObjectData,
diff --git a/robusta_krr/core/models/config.py b/robusta_krr/core/models/config.py
index 6cd0b56..72cf4d8 100644
--- a/robusta_krr/core/models/config.py
+++ b/robusta_krr/core/models/config.py
@@ -25,6 +25,7 @@ class Config(pd.BaseSettings):
prometheus_url: Optional[str] = pd.Field(None)
prometheus_auth_header: Optional[str] = pd.Field(None)
prometheus_ssl_enabled: bool = pd.Field(False)
+ prometheus_cluster_label: Optional[str] = pd.Field(None)
# Logging Settings
format: str
diff --git a/robusta_krr/core/models/objects.py b/robusta_krr/core/models/objects.py
index 72694fd..3fbbc9c 100644
--- a/robusta_krr/core/models/objects.py
+++ b/robusta_krr/core/models/objects.py
@@ -1,6 +1,7 @@
+from typing import Optional
+
import pydantic as pd
-from typing import Optional
from robusta_krr.core.models.allocations import ResourceAllocations
diff --git a/robusta_krr/core/runner.py b/robusta_krr/core/runner.py
index 7a64bfb..02c505b 100644
--- a/robusta_krr/core/runner.py
+++ b/robusta_krr/core/runner.py
@@ -4,7 +4,7 @@ from typing import Optional, Union
from robusta_krr.core.abstract.strategies import ResourceRecommendation, RunResult
from robusta_krr.core.integrations.kubernetes import KubernetesLoader
-from robusta_krr.core.integrations.prometheus import MetricsLoader, PrometheusNotFound
+from robusta_krr.core.integrations.prometheus import ClusterNotSpecifiedException, MetricsLoader, PrometheusNotFound
from robusta_krr.core.models.config import Config
from robusta_krr.core.models.objects import K8sObjectData
from robusta_krr.core.models.result import MetricsData, ResourceAllocations, ResourceScan, ResourceType, Result
@@ -139,6 +139,13 @@ class Runner(Configurable):
async def _collect_result(self) -> Result:
clusters = await self._k8s_loader.list_clusters()
+ if len(clusters) > 1 and self.config.prometheus_url:
+ # This can only happen when multiple clusters query a single centralized Prometheus.
+ # In this scenario we don't yet support determining which metrics belong to which cluster, so the recommendation can be incorrect.
+ raise ClusterNotSpecifiedException(
+ f"Cannot scan multiple clusters for this prometheus, Rerun with the flag `-c <cluster>` where <cluster> is one of {clusters}"
+ )
+
self.info(f'Using clusters: {clusters if clusters is not None else "inner cluster"}')
objects = await self._k8s_loader.list_scannable_objects(clusters)
@@ -173,5 +180,7 @@ class Runner(Configurable):
try:
result = await self._collect_result()
self._process_result(result)
- except Exception:
+ except ClusterNotSpecifiedException as e:
+ self.error(e)
+ except Exception as e:
self.console.print_exception(extra_lines=1, max_frames=10)
diff --git a/robusta_krr/main.py b/robusta_krr/main.py
index dbf12d8..33480e9 100644
--- a/robusta_krr/main.py
+++ b/robusta_krr/main.py
@@ -91,6 +91,13 @@ def load_commands() -> None:
help="Enable SSL for Prometheus requests.",
rich_help_panel="Prometheus Settings",
),
+ prometheus_cluster_label: Optional[str] = typer.Option(
+ None,
+ "--prometheus-cluster-label",
+ "-l",
+ help="The label in prometheus for your cluster.(Only relevant for centralized prometheus)",
+ rich_help_panel="Prometheus Settings",
+ ),
format: str = typer.Option("table", "--formatter", "-f", help="Output formatter ({formatters})", rich_help_panel="Logging Settings"),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose mode", rich_help_panel="Logging Settings"),
quiet: bool = typer.Option(False, "--quiet", "-q", help="Enable quiet mode", rich_help_panel="Logging Settings"),
@@ -106,6 +113,7 @@ def load_commands() -> None:
prometheus_url=prometheus_url,
prometheus_auth_header=prometheus_auth_header,
prometheus_ssl_enabled=prometheus_ssl_enabled,
+ prometheus_cluster_label=prometheus_cluster_label,
format=format,
verbose=verbose,
quiet=quiet,