diff options
| author | Avi-Robusta <97387909+Avi-Robusta@users.noreply.github.com> | 2023-06-14 12:08:31 +0300 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-06-14 12:08:31 +0300 |
| commit | 31a192559010bb72e962844183a0a8bd4f83c5e9 (patch) | |
| tree | 59dfca10049f5cf328820977048d333151d9096f | |
| parent | b69c44783dbf81dcf7087e4f395f5ce19baf3f8c (diff) | |
| parent | 142b6ac3d310a187cae40e88f1c0c5a7b11b0fe3 (diff) | |
Merge pull request #70 from robusta-dev/centralized-support
Centralized multi-cluster Prometheus/Thanos/VM support
| -rw-r--r-- | README.md | 33 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/__init__.py | 7 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/loader.py | 1 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics/base_metric.py | 11 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py | 2 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics/memory_metric.py | 2 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py | 6 | ||||
| -rw-r--r-- | robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py | 30 | ||||
| -rw-r--r-- | robusta_krr/core/models/config.py | 1 | ||||
| -rw-r--r-- | robusta_krr/core/models/objects.py | 3 | ||||
| -rw-r--r-- | robusta_krr/core/runner.py | 13 | ||||
| -rw-r--r-- | robusta_krr/main.py | 8 |
12 files changed, 108 insertions, 9 deletions
@@ -274,9 +274,10 @@ krr simple --help <!-- Port-forwarding --> -## Prometheus auto-discovery +## Prometheus, Victoria Metrics and Thanos auto-discovery -By default, KRR will try to auto-discover the running Prometheus by scanning those labels: +By default, KRR will try to auto-discover the running Prometheus Victoria Metrics and Thanos. +For discovering prometheus it scan services for those labels: ```python "app=kube-prometheus-stack-prometheus" "app=prometheus,component=server" @@ -286,8 +287,22 @@ By default, KRR will try to auto-discover the running Prometheus by scanning tho "app=rancher-monitoring-prometheus" "app=prometheus-prometheus" ``` +For Thanos its these labels: +```python +"app.kubernetes.io/component=query,app.kubernetes.io/name=thanos", +"app.kubernetes.io/name=thanos-query", +"app=thanos-query", +"app=thanos-querier", +``` +And for Victoria Metrics its the following labels: +```python +"app.kubernetes.io/name=vmsingle", +"app.kubernetes.io/name=victoria-metrics-single", +"app.kubernetes.io/name=vmselect", +"app=vmselect", +``` -If none of those labels result in finding Prometheus, you will get an error and will have to pass the working url explicitly (using the `-p` flag). +If none of those labels result in finding Prometheus, Victoria Metrics or Thanos, you will get an error and will have to pass the working url explicitly (using the `-p` flag). <p align="right">(<a href="#readme-top">back to top</a>)</p> @@ -309,6 +324,18 @@ krr simple -p http://127.0.0.1:9090 <p align="right">(<a href="#readme-top">back to top</a>)</p> +## Scanning with a centralized Prometheus + +If your Prometheus monitors multiple clusters you we require the label you defined for your cluster in Prometheus. + +For example, if your cluster has the Prometheus label `cluster: "my-cluster-name"` and your prometheus is at url `http://my-centralized-prometheus:9090`, then run this command: + +```sh +krr.py simple -p http://my-centralized-prometheus:9090 -l my-cluster-name +``` + +<p align="right">(<a href="#readme-top">back to top</a>)</p> + <!-- Formatters --> ## Available formatters diff --git a/robusta_krr/core/integrations/prometheus/__init__.py b/robusta_krr/core/integrations/prometheus/__init__.py index 3d298c6..6cec9a6 100644 --- a/robusta_krr/core/integrations/prometheus/__init__.py +++ b/robusta_krr/core/integrations/prometheus/__init__.py @@ -1,2 +1,7 @@ from .loader import MetricsLoader -from .metrics_service.prometheus_metrics_service import CustomPrometheusConnect, PrometheusDiscovery, PrometheusNotFound +from .metrics_service.prometheus_metrics_service import ( + ClusterNotSpecifiedException, + CustomPrometheusConnect, + PrometheusDiscovery, + PrometheusNotFound, +) diff --git a/robusta_krr/core/integrations/prometheus/loader.py b/robusta_krr/core/integrations/prometheus/loader.py index 773e8b7..cb33a4e 100644 --- a/robusta_krr/core/integrations/prometheus/loader.py +++ b/robusta_krr/core/integrations/prometheus/loader.py @@ -62,6 +62,7 @@ class MetricsLoader(Configurable): loader = metric_service_class(config, api_client=api_client, cluster=cluster) loader.check_connection() self.echo(f"{service_name} found") + loader.validate_cluster_name() return loader except MetricsNotFound as e: self.debug(f"{service_name} not found") diff --git a/robusta_krr/core/integrations/prometheus/metrics/base_metric.py b/robusta_krr/core/integrations/prometheus/metrics/base_metric.py index 7118d8c..aea633c 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/base_metric.py +++ b/robusta_krr/core/integrations/prometheus/metrics/base_metric.py @@ -30,6 +30,17 @@ class BaseMetricLoader(Configurable, abc.ABC): super().__init__(config) self.prometheus = prometheus + def get_prometheus_cluster_label(self) -> str: + """ + Generates the cluster label for querying a centralized Prometheus + + Returns: + str: a promql safe label string for querying the cluster. + """ + if self.config.prometheus_cluster_label is None: + return "" + return f', cluster="{self.config.prometheus_cluster_label}"' + @abc.abstractmethod def get_query(self, object: K8sObjectData) -> str: """ diff --git a/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py b/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py index 67ec98a..e2d364b 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py +++ b/robusta_krr/core/integrations/prometheus/metrics/cpu_metric.py @@ -9,10 +9,12 @@ from .base_metric import bind_metric class CPUMetricLoader(BaseFilteredMetricLoader): def get_query(self, object: K8sObjectData) -> str: pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() return ( "sum(irate(container_cpu_usage_seconds_total{" f'namespace="{object.namespace}", ' f'pod=~"{pods_selector}", ' f'container="{object.container}"' + f"{cluster_label}" "}[5m])) by (container, pod, job)" ) diff --git a/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py b/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py index ddb056a..5942ec1 100644 --- a/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py +++ b/robusta_krr/core/integrations/prometheus/metrics/memory_metric.py @@ -9,10 +9,12 @@ from .base_metric import bind_metric class MemoryMetricLoader(BaseFilteredMetricLoader): def get_query(self, object: K8sObjectData) -> str: pods_selector = "|".join(pod.name for pod in object.pods) + cluster_label = self.get_prometheus_cluster_label() return ( "sum(container_memory_working_set_bytes{" f'namespace="{object.namespace}", ' f'pod=~"{pods_selector}", ' f'container="{object.container}"' + f"{cluster_label}" "}) by (container, pod, job)" ) diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py index 21b3706..9ac308b 100644 --- a/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py +++ b/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py @@ -1,6 +1,6 @@ import abc import datetime -from typing import Optional +from typing import List, Optional from kubernetes.client.api_client import ApiClient @@ -39,6 +39,10 @@ class MetricsService(Configurable, abc.ABC): return classname.replace("MetricsService", "") if classname != MetricsService.__name__ else classname @abc.abstractmethod + async def get_cluster_names(self) -> Optional[List[str]]: + ... + + @abc.abstractmethod async def gather_data( self, object: K8sObjectData, diff --git a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py index c5d42ef..4f1c6f2 100644 --- a/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py +++ b/robusta_krr/core/integrations/prometheus/metrics_service/prometheus_metrics_service.py @@ -1,6 +1,6 @@ import asyncio import datetime -from typing import Optional, no_type_check +from typing import List, Optional, no_type_check import requests from kubernetes.client import ApiClient @@ -50,6 +50,14 @@ class PrometheusNotFound(MetricsNotFound): pass +class ClusterNotSpecifiedException(Exception): + """ + An exception raised when a prometheus requires a cluster label but an invalid one is provided. + """ + + pass + + class CustomPrometheusConnect(PrometheusConnect): """ Custom PrometheusConnect class to handle retries. @@ -134,6 +142,26 @@ class PrometheusMetricsService(MetricsService): async def query(self, query: str) -> dict: return await asyncio.to_thread(self.prometheus.custom_query, query=query) + def validate_cluster_name(self): + cluster_label = self.config.prometheus_cluster_label + cluster_names = self.get_cluster_names() + + if len(cluster_names) <= 1: + # there is only one cluster of metrics in this prometheus + return + + if not cluster_label: + raise ClusterNotSpecifiedException( + f"No label specified, Rerun krr with the flag `-l <cluster>` where <cluster> is one of {cluster_names}" + ) + if cluster_label not in cluster_names: + raise ClusterNotSpecifiedException( + f"Label {cluster_label} does not exist, Rerun krr with the flag `-l <cluster>` where <cluster> is one of {cluster_names}" + ) + + def get_cluster_names(self) -> Optional[List[str]]: + return self.prometheus.get_label_values(label_name="cluster") + async def gather_data( self, object: K8sObjectData, diff --git a/robusta_krr/core/models/config.py b/robusta_krr/core/models/config.py index 6cd0b56..72cf4d8 100644 --- a/robusta_krr/core/models/config.py +++ b/robusta_krr/core/models/config.py @@ -25,6 +25,7 @@ class Config(pd.BaseSettings): prometheus_url: Optional[str] = pd.Field(None) prometheus_auth_header: Optional[str] = pd.Field(None) prometheus_ssl_enabled: bool = pd.Field(False) + prometheus_cluster_label: Optional[str] = pd.Field(None) # Logging Settings format: str diff --git a/robusta_krr/core/models/objects.py b/robusta_krr/core/models/objects.py index 72694fd..3fbbc9c 100644 --- a/robusta_krr/core/models/objects.py +++ b/robusta_krr/core/models/objects.py @@ -1,6 +1,7 @@ +from typing import Optional + import pydantic as pd -from typing import Optional from robusta_krr.core.models.allocations import ResourceAllocations diff --git a/robusta_krr/core/runner.py b/robusta_krr/core/runner.py index 7a64bfb..02c505b 100644 --- a/robusta_krr/core/runner.py +++ b/robusta_krr/core/runner.py @@ -4,7 +4,7 @@ from typing import Optional, Union from robusta_krr.core.abstract.strategies import ResourceRecommendation, RunResult from robusta_krr.core.integrations.kubernetes import KubernetesLoader -from robusta_krr.core.integrations.prometheus import MetricsLoader, PrometheusNotFound +from robusta_krr.core.integrations.prometheus import ClusterNotSpecifiedException, MetricsLoader, PrometheusNotFound from robusta_krr.core.models.config import Config from robusta_krr.core.models.objects import K8sObjectData from robusta_krr.core.models.result import MetricsData, ResourceAllocations, ResourceScan, ResourceType, Result @@ -139,6 +139,13 @@ class Runner(Configurable): async def _collect_result(self) -> Result: clusters = await self._k8s_loader.list_clusters() + if len(clusters) > 1 and self.config.prometheus_url: + # this can only happen for multi-cluster querying a single centeralized prometheus + # In this scenario we dont yet support determining which metrics belong to which cluster so the reccomendation can be incorrect + raise ClusterNotSpecifiedException( + f"Cannot scan multiple clusters for this prometheus, Rerun with the flag `-c <cluster>` where <cluster> is one of {clusters}" + ) + self.info(f'Using clusters: {clusters if clusters is not None else "inner cluster"}') objects = await self._k8s_loader.list_scannable_objects(clusters) @@ -173,5 +180,7 @@ class Runner(Configurable): try: result = await self._collect_result() self._process_result(result) - except Exception: + except ClusterNotSpecifiedException as e: + self.error(e) + except Exception as e: self.console.print_exception(extra_lines=1, max_frames=10) diff --git a/robusta_krr/main.py b/robusta_krr/main.py index dbf12d8..33480e9 100644 --- a/robusta_krr/main.py +++ b/robusta_krr/main.py @@ -91,6 +91,13 @@ def load_commands() -> None: help="Enable SSL for Prometheus requests.", rich_help_panel="Prometheus Settings", ), + prometheus_cluster_label: Optional[str] = typer.Option( + None, + "--prometheus-cluster-label", + "-l", + help="The label in prometheus for your cluster.(Only relevant for centralized prometheus)", + rich_help_panel="Prometheus Settings", + ), format: str = typer.Option("table", "--formatter", "-f", help="Output formatter ({formatters})", rich_help_panel="Logging Settings"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose mode", rich_help_panel="Logging Settings"), quiet: bool = typer.Option(False, "--quiet", "-q", help="Enable quiet mode", rich_help_panel="Logging Settings"), @@ -106,6 +113,7 @@ def load_commands() -> None: prometheus_url=prometheus_url, prometheus_auth_header=prometheus_auth_header, prometheus_ssl_enabled=prometheus_ssl_enabled, + prometheus_cluster_label=prometheus_cluster_label, format=format, verbose=verbose, quiet=quiet, |
