diff --git a/README.rst b/README.rst index d254a2e..82f28ec 100644 --- a/README.rst +++ b/README.rst @@ -46,9 +46,14 @@ Usage :: - usage: pve_exporter [-h] [--collector.status] [--collector.version] - [--collector.node] [--collector.cluster] - [--collector.resources] [--collector.config] + usage: pve_exporter [-h] [--collector.status | --no-collector.status] + [--collector.version | --no-collector.version] + [--collector.node | --no-collector.node] + [--collector.cluster | --no-collector.cluster] + [--collector.resources | --no-collector.resources] + [--collector.config | --no-collector.config] + [--server.keyfile SERVER_KEYFILE] + [--server.certfile SERVER_CERTFILE] [config] [port] [address] positional arguments: @@ -56,27 +61,41 @@ Usage port Port on which the exporter is listening (9221) address Address to which the exporter will bind - optional arguments: + options: -h, --help show this help message and exit + --server.keyfile SERVER_KEYFILE + SSL key for server + --server.certfile SERVER_CERTFILE + SSL certificate for server + + cluster collectors: + cluster collectors are run if the url parameter cluster=1 is set and + skipped if the url parameter cluster=0 is set on a scrape url. + --collector.status, --no-collector.status - Exposes Node/VM/CT-Status (default: True) + Exposes Node/VM/CT-Status --collector.version, --no-collector.version - Exposes PVE version info (default: True) + Exposes PVE version info --collector.node, --no-collector.node - Exposes PVE node info (default: True) + Exposes PVE node info --collector.cluster, --no-collector.cluster - Exposes PVE cluster info (default: True) + Exposes PVE cluster info --collector.resources, --no-collector.resources - Exposes PVE resources info (default: True) + Exposes PVE resources info + + node collectors: + node collectors are run if the url parameter node=1 is set and skipped if + the url parameter node=0 is set on a scrape url. 
+ --collector.config, --no-collector.config - Exposes PVE onboot status (default: True) + Exposes PVE onboot status -Use `::` for the `address` argument in order to bind to both IPv6 and IPv4 +Use `[::]` for the `address` argument in order to bind to both IPv6 and IPv4 sockets on dual stacked machines. -Visit http://localhost:9221/pve?target=1.2.3.4 where 1.2.3.4 is the IP -of the Proxmox VE node to get metrics from. Specify the ``module`` +Visit http://localhost:9221/pve?target=1.2.3.4&cluster=1&node=1 where 1.2.3.4 +is the IP of the Proxmox VE node to get metrics from. Specify the ``module`` request parameter, to choose which module to use from the config file. The ``target`` request parameter defaults to ``localhost``. Hence if @@ -258,6 +277,8 @@ Example config for PVE exporter running on PVE node: metrics_path: /pve params: module: [default] + cluster: 1 + node: 1 Example config for PVE exporter running on Prometheus host: @@ -272,6 +293,8 @@ Example config for PVE exporter running on Prometheus host: metrics_path: /pve params: module: [default] + cluster: 1 + node: 1 relabel_configs: - source_labels: [__address__] target_label: __param_target @@ -280,6 +303,20 @@ Example config for PVE exporter running on Prometheus host: - target_label: __address__ replacement: 127.0.0.1:9221 # PVE exporter. +**Note on scraping large clusters:** + +It is advised to set up separate jobs to collect ``cluster`` metrics and +``node`` metrics in larger deployments. Scraping any node in a cluster with the +url params set to ``cluster=1&node=0`` results in the same set of metrics. Hence +cluster metrics can be scraped efficiently from a single node or from a subset +of cluster nodes (e.g., a different node selected on every scrape via +round-robin DNS). + +Node metrics can only be scraped from a given node. In order to compile a +complete set of node metrics it is necessary to scrape every node in a cluster +with url params set to ``cluster=0&node=1``. 
+ + Grafana Dashboards ------------------ diff --git a/src/pve_exporter/cli.py b/src/pve_exporter/cli.py index d9fc7fe..7599baf 100755 --- a/src/pve_exporter/cli.py +++ b/src/pve_exporter/cli.py @@ -10,32 +10,44 @@ from pve_exporter.config import config_from_env from pve_exporter.collector import CollectorsOptions + def main(): """ Main entry point. """ parser = ArgumentParser() - parser.add_argument('--collector.status', dest='collector_status', - action=BooleanOptionalAction, default=True, - help='Exposes Node/VM/CT-Status') - parser.add_argument('--collector.version', dest='collector_version', - action=BooleanOptionalAction, default=True, - help='Exposes PVE version info') - parser.add_argument('--collector.node', dest='collector_node', - action=BooleanOptionalAction, default=True, - help='Exposes PVE node info') - parser.add_argument('--collector.cluster', dest='collector_cluster', - action=BooleanOptionalAction, default=True, - help='Exposes PVE cluster info') - parser.add_argument('--collector.resources', dest='collector_resources', - action=BooleanOptionalAction, default=True, - help='Exposes PVE resources info') - parser.add_argument('--collector.config', dest='collector_config', - action=BooleanOptionalAction, default=True, - help='Exposes PVE onboot status') + clusterflags = parser.add_argument_group('cluster collectors', description=( + 'cluster collectors are run if the url parameter cluster=1 is set and ' + 'skipped if the url parameter cluster=0 is set on a scrape url.' 
+ )) + clusterflags.add_argument('--collector.status', dest='collector_status', + action=BooleanOptionalAction, default=True, + help='Exposes Node/VM/CT-Status') + clusterflags.add_argument('--collector.version', dest='collector_version', + action=BooleanOptionalAction, default=True, + help='Exposes PVE version info') + clusterflags.add_argument('--collector.node', dest='collector_node', + action=BooleanOptionalAction, default=True, + help='Exposes PVE node info') + clusterflags.add_argument('--collector.cluster', dest='collector_cluster', + action=BooleanOptionalAction, default=True, + help='Exposes PVE cluster info') + clusterflags.add_argument('--collector.resources', dest='collector_resources', + action=BooleanOptionalAction, default=True, + help='Exposes PVE resources info') + + nodeflags = parser.add_argument_group('node collectors', description=( + 'node collectors are run if the url parameter node=1 is set and ' + 'skipped if the url parameter node=0 is set on a scrape url.' + )) + nodeflags.add_argument('--collector.config', dest='collector_config', + action=BooleanOptionalAction, default=True, + help='Exposes PVE onboot status') + parser.add_argument('config', nargs='?', default='pve.yml', help='Path to configuration file (pve.yml)') + parser.add_argument('port', nargs='?', type=int, default='9221', help='Port on which the exporter is listening (9221)') parser.add_argument('address', nargs='?', default='', diff --git a/src/pve_exporter/collector.py b/src/pve_exporter/collector.py index 7c838d3..6855017 100644 --- a/src/pve_exporter/collector.py +++ b/src/pve_exporter/collector.py @@ -7,7 +7,6 @@ import itertools import logging from proxmoxer import ProxmoxAPI -from proxmoxer.core import ResourceException from prometheus_client import CollectorRegistry, generate_latest from prometheus_client.core import GaugeMetricFamily @@ -254,7 +253,7 @@ def collect(self): # pylint: disable=missing-docstring return itertools.chain(metrics.values(), 
info_metrics.values()) -class ClusterNodeConfigCollector: +class NodeConfigCollector: """ Collects Proxmox VE VM information directly from config, i.e. boot, name, onboot, etc. For manual test: "pvesh get /nodes////config" @@ -276,57 +275,49 @@ def collect(self): # pylint: disable=missing-docstring labels=['id', 'node', 'type']), } - for node in self._pve.nodes.get(): - # The nodes/{node} api call will result in requests being forwarded - # from the api node to the target node. Those calls can fail if the - # target node is offline or otherwise unable to respond to the - # request. In that case it is better to just skip scraping the - # config for guests on that particular node and continue with the - # next one in order to avoid failing the whole scrape. - try: - # Qemu - vmtype = 'qemu' - for vmdata in self._pve.nodes(node['node']).qemu.get(): - config = self._pve.nodes(node['node']).qemu(vmdata['vmid']).config.get().items() - for key, metric_value in config: - label_values = [f"{vmtype}/{vmdata['vmid']}", node['node'], vmtype] - if key in metrics: - metrics[key].add_metric(label_values, metric_value) - # LXC - vmtype = 'lxc' - for vmdata in self._pve.nodes(node['node']).lxc.get(): - config = self._pve.nodes(node['node']).lxc(vmdata['vmid']).config.get().items() - for key, metric_value in config: - label_values = [f"{vmtype}/{vmdata['vmid']}", node['node'], vmtype] - if key in metrics: - metrics[key].add_metric(label_values, metric_value) - - except ResourceException: - self._log.exception( - "Exception thrown while scraping quemu/lxc config from %s", - node['node'] - ) - continue + node = None + for entry in self._pve.cluster.status.get(): + if entry['type'] == 'node' and entry['local']: + node = entry['name'] + break + + # Scrape qemu config + vmtype = 'qemu' + for vmdata in self._pve.nodes(node).qemu.get(): + config = self._pve.nodes(node).qemu(vmdata['vmid']).config.get().items() + for key, metric_value in config: + label_values = 
[f"{vmtype}/{vmdata['vmid']}", node, vmtype] + if key in metrics: + metrics[key].add_metric(label_values, metric_value) + + # Scrape LXC config + vmtype = 'lxc' + for vmdata in self._pve.nodes(node).lxc.get(): + config = self._pve.nodes(node).lxc(vmdata['vmid']).config.get().items() + for key, metric_value in config: + label_values = [f"{vmtype}/{vmdata['vmid']}", node, vmtype] + if key in metrics: + metrics[key].add_metric(label_values, metric_value) return metrics.values() -def collect_pve(config, host, options: CollectorsOptions): +def collect_pve(config, host, cluster, node, options: CollectorsOptions): """Scrape a host and return prometheus text format for it""" pve = ProxmoxAPI(host, **config) registry = CollectorRegistry() - if options.status: + if cluster and options.status: registry.register(StatusCollector(pve)) - if options.resources: + if cluster and options.resources: registry.register(ClusterResourcesCollector(pve)) - if options.node: + if cluster and options.node: registry.register(ClusterNodeCollector(pve)) - if options.cluster: + if cluster and options.cluster: registry.register(ClusterInfoCollector(pve)) - if options.config: - registry.register(ClusterNodeConfigCollector(pve)) - if options.version: + if cluster and options.version: registry.register(VersionCollector(pve)) + if node and options.config: + registry.register(NodeConfigCollector(pve)) return generate_latest(registry) diff --git a/src/pve_exporter/http.py b/src/pve_exporter/http.py index 8528263..50cd4a6 100644 --- a/src/pve_exporter/http.py +++ b/src/pve_exporter/http.py @@ -28,14 +28,20 @@ def __init__(self, config, duration, errors, collectors): self._log = logging.getLogger(__name__) - def on_pve(self, module='default', target='localhost'): + def on_pve(self, module='default', target='localhost', cluster='1', node='1'): """ Request handler for /pve route """ if module in self._config: start = time.time() - output = collect_pve(self._config[module], target, self._collectors) + output 
= collect_pve( + self._config[module], + target, + cluster.lower() not in ['false', '0', ''], + node.lower() not in ['false', '0', ''], + self._collectors + ) response = Response(output) response.headers['content-type'] = CONTENT_TYPE_LATEST self._duration.labels(module).observe(time.time() - start) @@ -79,7 +85,7 @@ def view(self, endpoint, values, args): """ allowed_args = { - 'pve': ['module', 'target'] + 'pve': ['module', 'target', 'cluster', 'node'] } view_registry = {