From 604dd2f5a8b3b30073d5179a344079795580c40d Mon Sep 17 00:00:00 2001
From: paladox
Date: Fri, 22 Nov 2024 14:36:24 +0000
Subject: [PATCH] mediawiki: Add a way to view opcache/apcu stats

This is from Wikimedia's puppet repo [0] with minor tweaks to conform to how we do things.

[0] https://github.com/wikimedia/operations-puppet/blob/bd7cbb36d87e147da028c3c51b583f93f5851957/modules/profile/manifests/mediawiki/php/monitoring.pp
---
Reviewer note: the hunk line counts and the diffstat below are carried over
from the original submission and were not recomputed after the review changes.
The index.php hunk, the start of the lib.php hunk, most of php7adm.sh and the
start of the monitoring.pp hunk are missing from this copy of the patch.

 modules/mediawiki/files/php/admin/index.php | 65 +++
 modules/mediawiki/files/php/admin/lib.php | 473 ++++++++++++++++++
 .../mediawiki/files/php/nrpe_check_opcache.py | 113 +++++
 modules/mediawiki/files/php/php7adm.sh | 32 ++
 modules/mediawiki/manifests/monitoring.pp | 25 +
 .../mediawiki/templates/php-admin.conf.erb | 19 +
 6 files changed, 727 insertions(+)
 create mode 100644 modules/mediawiki/files/php/admin/index.php
 create mode 100644 modules/mediawiki/files/php/admin/lib.php
 create mode 100755 modules/mediawiki/files/php/nrpe_check_opcache.py
 create mode 100755 modules/mediawiki/files/php/php7adm.sh
 create mode 100644 modules/mediawiki/templates/php-admin.conf.erb

diff --git a/modules/mediawiki/files/php/admin/index.php b/modules/mediawiki/files/php/admin/index.php
new file mode 100644
index 0000000000..6d651b9527
--- /dev/null
+++ b/modules/mediawiki/files/php/admin/index.php
@@ -0,0 +1,65 @@
+ 1) {
+        $frag = $fragsize / $freetotal * 100;
+    } else {
+        $frag = 0;
+    }
+    return round($frag, 5, PHP_ROUND_HALF_UP);
+}
+
+/**
+ * Very simple class to manage prometheus metrics printing.
+ * Not intended to be complete or useful outside of this context.
+ *
+ * One instance renders one sample line of the Prometheus text format,
+ * preceded by a HELP/TYPE header when a description is set.
+ */
+class PrometheusMetric {
+    public $description;
+    public $key;
+    private $value;
+    private $labels;
+    private $type;
+
+    /**
+     * @param string $key         Metric name.
+     * @param string $type        Metric type written on the TYPE line.
+     * @param string $description HELP text; an empty string suppresses the HELP/TYPE header.
+     */
+    function __construct(string $key, string $type, string $description) {
+        $this->key = $key;
+        $this->description = $description;
+        // We need to tag the prometheus metrics with the php version as well.
+        // Given we sometimes report too much info in PHP_VERSION, let's limit
+        // this to major.minor.patch. The numeric version constants are used
+        // rather than a regex on PHP_VERSION so the label can never end up empty.
+        $php_ver = sprintf('%d.%d.%d', PHP_MAJOR_VERSION, PHP_MINOR_VERSION, PHP_RELEASE_VERSION);
+        $this->labels = ['php_version="' . $php_ver . '"'];
+        $this->type = $type;
+    }
+
+    /**
+     * Stores the sample value: booleans become 0/1, arrays are joined with
+     * spaces, anything else is kept as given.
+     */
+    public function setValue($value) {
+        if (is_bool($value) === true) {
+            $this->value = (int) $value;
+        } elseif (is_array($value)) {
+            $this->value = implode(" ", $value);
+        } else {
+            $this->value = $value;
+        }
+    }
+
+    /**
+     * Adds a name="value" label to the sample.
+     *
+     * Backslash, double quote and line feed are escaped as required for
+     * label values in the Prometheus text exposition format.
+     */
+    public function setLabel(string $name, string $value) {
+        $escaped = strtr($value, ['\\' => '\\\\', '"' => '\\"', "\n" => '\\n']);
+        $this->labels[] = "$name=\"{$escaped}\"";
+    }
+
+    /**
+     * Returns the "# HELP" and "# TYPE" lines, or an empty string when the
+     * metric has no description.
+     */
+    private function _helpLine(): string {
+        // If the description is empty, don't return
+        // any help header.
+        if ($this->description == "") {
+            return "";
+        }
+        return sprintf("# HELP %s %s\n# TYPE %s %s\n",
+            $this->key, $this->description,
+            $this->key, $this->type
+        );
+    }
+
+    /**
+     * Renders the optional header followed by "name{labels} value\n".
+     */
+    public function __toString() {
+        if ($this->labels != []) {
+            $full_name = sprintf('%s{%s}', $this->key, implode(",", $this->labels));
+        } else {
+            $full_name = $this->key;
+        }
+        return sprintf(
+            "%s%s %s\n",
+            $this->_helpLine(),
+            $full_name,
+            $this->value
+        );
+    }
+}
+
+
+/**
+ * Builds the PrometheusMetric objects for opcache and APCu from the data
+ * returned by opcache_stats(), apcu_stats() and apcu_frag().
+ *
+ * Only the first definition of each metric name carries a description, so the
+ * HELP/TYPE header is printed once per metric name.
+ *
+ * @return PrometheusMetric[]
+ */
+function prometheus_metrics(): array {
+    $oc = opcache_stats();
+    $ac = apcu_stats();
+    $af = apcu_frag();
+    $defs = [
+        [
+            'name' => 'php_opcache_enabled',
+            'type' => 'gauge',
+            'desc' => 'Opcache is enabled',
+            'value' => $oc['opcache_enabled'],
+        ],
+        [
+            'name' => 'php_opcache_full',
+            'type' => 'gauge',
+            'desc' => 'Opcache is full',
+            'value' => $oc['cache_full'],
+        ],
+        [
+            'name' => 'php_opcache_memory',
+            'type' => 'gauge',
+            'label' => ['type', 'used'],
+            'desc' => 'Used memory stats',
+            'value' => $oc['memory_usage']['used_memory'],
+        ],
+        [
+            'name' => 'php_opcache_memory',
+            'type' => 'gauge',
+            'label' => ['type', 'free'],
+            'desc' => '',
+            'value' => $oc['memory_usage']['free_memory'],
+        ],
+        [
+            'name' => 'php_opcache_memory',
+            'type' => 'gauge',
+            'label' => ['type', 'wasted'],
+            'desc' => '',
+            'value' => $oc['memory_usage']['wasted_memory'],
+        ],
+        [
+            'name' => 'php_opcache_wasted_memory',
+            'type' => 'gauge',
+            'desc' => 'Percentage of wasted memory in opcache',
+            'value' => round($oc['memory_usage']['current_wasted_percentage'], 5, PHP_ROUND_HALF_UP),
+        ],
+        [
+            'name' => 'php_opcache_strings_memory',
+            'type' => 'gauge',
+            'label' => ['type', 'used'],
+            'desc' => 'Memory usage from interned strings',
+            'value' => $oc['interned_strings_usage']['used_memory'],
+        ],
+        [
+            'name' => 'php_opcache_strings_memory',
+            'type' => 'gauge',
+            'label' => ['type', 'free'],
+            'desc' => '',
+            'value' => $oc['interned_strings_usage']['free_memory'],
+        ],
+        [
+            'name' => 'php_opcache_strings_numbers',
+            'type' => 'gauge',
+            'desc' => 'Number of interned strings',
+            'value' => $oc['interned_strings_usage']['number_of_strings'],
+        ],
+        [
+            'name' => 'php_opcache_stats_cached',
+            'type' => 'gauge',
+            'label' => ['type', 'scripts'],
+            'desc' => 'Stats about cached objects',
+            'value' => $oc['opcache_statistics']['num_cached_scripts'],
+        ],
+        [
+            'name' => 'php_opcache_stats_cached',
+            'type' => 'gauge',
+            'label' => ['type', 'keys'],
+            'desc' => '',
+            'value' => $oc['opcache_statistics']['num_cached_keys'],
+        ],
+        [
+            'name' => 'php_opcache_stats_cached',
+            // Same metric name as the two gauges above; a maximum is not a counter.
+            'type' => 'gauge',
+            'label' => ['type', 'max_keys'],
+            'desc' => '',
+            'value' => $oc['opcache_statistics']['max_cached_keys'],
+        ],
+        [
+            'name' => 'php_opcache_stats_cache_hit',
+            'type' => 'counter',
+            'label' => ['type', 'hits'],
+            'desc' => 'Stats about cached object hit/miss ratio',
+            'value' => $oc['opcache_statistics']['hits'],
+        ],
+        [
+            'name' => 'php_opcache_stats_cache_hit',
+            'type' => 'counter',
+            'label' => ['type', 'misses'],
+            'desc' => '',
+            'value' => $oc['opcache_statistics']['misses'],
+        ],
+        [
+            'name' => 'php_opcache_stats_cache_hit',
+            'type' => 'counter',
+            'label' => ['type', 'total'],
+            'desc' => '',
+            'value' => ($oc['opcache_statistics']['misses'] + $oc['opcache_statistics']['hits']),
+        ],
+        [
+            'name' => 'php_apcu_num_slots',
+            // A number of available slots is a size, not a monotonically increasing count.
+            'type' => 'gauge',
+            'desc' => 'Number of distinct APCu slots available',
+            'value' => $ac['num_slots'],
+        ],
+        [
+            'name' => 'php_apcu_cache_ops',
+            'type' => 'counter',
+            'label' => ['type', 'hits'],
+            'desc' => 'Stats about APCu operations',
+            'value' => $ac['num_hits'],
+        ],
+        [
+            'name' => 'php_apcu_cache_ops',
+            'type' => 'counter',
+            'label' => ['type', 'misses'],
+            'desc' => '',
+            'value' => $ac['num_misses'],
+        ],
+        [
+            'name' => 'php_apcu_cache_ops',
+            'type' => 'counter',
+            'label' => ['type', 'total_gets'],
+            'desc' => '',
+            'value' => ($ac['num_misses'] + $ac['num_hits']),
+        ],
+        [
+            'name' => 'php_apcu_cache_ops',
+            'type' => 'counter',
+            'label' => ['type', 'inserts'],
+            'desc' => '',
+            'value' => $ac['num_inserts'],
+        ],
+        [
+            'name' => 'php_apcu_cache_ops',
+            'type' => 'counter',
+            'label' => ['type', 'entries'],
+            'desc' => '',
+            'value' => $ac['num_entries'],
+        ],
+        [
+            'name' => 'php_apcu_cache_ops',
+            'type' => 'counter',
+            'label' => ['type', 'expunges'],
+            'desc' => '',
+            'value' => $ac['expunges'],
+        ],
+        [
+            'name' => 'php_apcu_memory',
+            'type' => 'gauge',
+            'label' => ['type', 'free'],
+            'desc' => 'APCu memory status',
+            'value' => $ac['avail_mem'],
+        ],
+        [
+            'name' => 'php_apcu_memory',
+            'type' => 'gauge',
+            'label' => ['type', 'total'],
+            'desc' => '',
+            'value' => $ac['seg_size'],
+        ],
+        [
+            'name' => 'php_apcu_fragmentation',
+            'type' => 'gauge',
+            'desc' => 'APCu fragmentation percentage',
+            'value' => $af,
+        ],
+    ];
+    $metrics = [];
+    foreach ($defs as $metric_def) {
+        // A sample without a value would be printed as "name{labels} " with
+        // nothing after the space, which is not a valid exposition line.
+        // Leave such samples out instead of emitting them.
+        if (!isset($metric_def['value'])) {
+            continue;
+        }
+        $p = new PrometheusMetric(
+            $metric_def['name'],
+            $metric_def['type'] ?? 'counter',
+            $metric_def['desc']
+        );
+        if (isset($metric_def['label'])) {
+            $p->setLabel(...$metric_def['label']);
+        }
+        $p->setValue($metric_def['value']);
+        $metrics[] = $p;
+    }
+    return $metrics;
+}
+
+
+/**
+ * Simple class to manage combining prometheus metrics from multiple ports/php versions
+ */
+class RemoteMetrics {
+    const VERSIONS_FILE = '/etc/php7adm.versions';
+    // Not referenced anywhere in this class.
+    const ADMIN_PORT_BASE = 9181;
+    private $_admin_ports;
+
+    function __construct() {
+        $this->_admin_ports = $this->_read_ports();
+    }
+
+    /**
+     * Reads VERSIONS_FILE, a JSON map keyed by "major.minor" PHP version,
+     * and returns it without the entry for the running PHP version.
+     *
+     * @return array Empty when the file is missing or does not decode to an array.
+     */
+    private function _read_ports() {
+        // Suppress the warning: a missing file is an expected situation.
+        $contents = @file_get_contents(self::VERSIONS_FILE);
+        // If no file is present, assume we're the only version available.
+        if ($contents === false) {
+            return [];
+        }
+        $ports = json_decode($contents, true);
+        // Invalid JSON decodes to null and a bare scalar decodes to a scalar;
+        // neither can be used as a version => port map.
+        if (!is_array($ports)) {
+            return [];
+        }
+        // We don't need the port for the current version.
+        $version_brief = sprintf('%d.%d', PHP_MAJOR_VERSION, PHP_MINOR_VERSION);
+        unset($ports[$version_brief]);
+        return $ports;
+    }
+
+    /**
+     * Returns the local-metrics URL served on the given admin port.
+     */
+    private static function _url($port) {
+        return sprintf("http://localhost:%d/local-metrics", $port);
+    }
+
+    /**
+     * Downloads the metrics of every other admin port and returns them
+     * concatenated, each response preceded by a newline. Failed downloads are
+     * logged with error_log() and skipped.
+     */
+    private function _get_remote_metrics() {
+        $output = "";
+        # Ok in theory this is bad - I'm downloading metrics from the other php versions in sequence.
+        # But in practice, I doubt we'll have more than two versions of php at any time, so this is a single download.
+        foreach (array_values($this->_admin_ports) as $additional_port) {
+            $url = self::_url($additional_port);
+            $ch = curl_init();
+            if ($ch === false) {
+                error_log("Error fetching {$url}: could not initialise curl");
+                continue;
+            }
+            curl_setopt($ch, CURLOPT_URL, $url);
+            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+            // Set a 2 seconds timeout, and a stringent connect timeout.
+            // The idea here is that if it takes more than 10 ms to connect to the other admin port, that
+            // php-fpm instance, or the apache server itself, are overwhelmed and should be left alone.
+            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 10);
+            curl_setopt($ch, CURLOPT_TIMEOUT, 2);
+            $response = curl_exec($ch);
+            if ($response !== false) {
+                $output .= "\n" . $response;
+            } else {
+                error_log("Error fetching {$url}: " . curl_error($ch));
+            }
+            curl_close($ch);
+        }
+        return $output;
+    }
+
+    /**
+     * Prints the metrics collected from the other admin ports.
+     */
+    public function show_metrics() {
+        echo $this->_get_remote_metrics();
+    }
+}
+
+/**
+ * Writes $contents as JSON to $name, replacing an existing file, and prints a
+ * confirmation. Terminates the request with a message when the old file cannot
+ * be removed or the new one cannot be encoded or written.
+ *
+ * NOTE(review): the callers in this file pass fixed names under /tmp - confirm
+ * that writing to a predictable path there is acceptable on these hosts.
+ */
+function dump_file($name, $contents) {
+    if (is_file($name)) {
+        if (!unlink($name)) {
+            die("Could not remove {$name}.\n");
+        }
+    }
+    $json = json_encode($contents);
+    if ($json === false) {
+        die("Could not encode data for {$name}: " . json_last_error_msg() . "\n");
+    }
+    // Do not claim success when nothing was written.
+    if (file_put_contents($name, $json) === false) {
+        die("Could not write {$name}.\n");
+    }
+    echo "Requested data dumped at {$name}.\n";
+}
+
+// Views
+
+/** Prints the metrics of this PHP version as plain text. */
+function show_prometheus_metrics() {
+    header("Content-Type: text/plain");
+    foreach (prometheus_metrics() as $k) {
+        printf("%s", $k);
+    }
+}
+
+/** Prints the local metrics followed by those of the other admin ports. */
+function show_all_prometheus_metrics() {
+    show_prometheus_metrics();
+    $rm = new RemoteMetrics();
+    $rm->show_metrics();
+}
+
+/** Prints apcu_stats() as JSON. */
+function show_apcu_info() {
+    header("Content-Type: application/json");
+    print json_encode(apcu_stats());
+}
+
+/** Prints the APCu fragmentation figure as JSON. */
+function show_apcu_frag() {
+    header("Content-Type: application/json");
+    print json_encode(['fragmentation' => apcu_frag()]);
+}
+
+/** Dumps the 'cache_list' part of apcu_stats(true) to /tmp/apcu_dump_meta. */
+function dump_apcu_full() {
+    header("Content-Type: text/plain");
+    $stats = apcu_stats(true);
+    dump_file('/tmp/apcu_dump_meta', $stats['cache_list']);
+}
+
+/** Clears the APCu cache and prints a confirmation. */
+function clear_apcu() {
+    header("Content-Type: text/plain");
+    apcu_clear_cache();
+    echo "APCu cache cleared\n";
+}
+
+/** Prints opcache_stats() as JSON. */
+function show_opcache_info() {
+    header("Content-Type: application/json");
+    print json_encode(opcache_stats());
+}
+
+/** Dumps the 'scripts' part of opcache_stats(true) to /tmp/opcache_dump_meta. */
+function dump_opcache_meta() {
+    header("Content-Type: text/plain");
+    $oc = opcache_stats(true);
+    dump_file('/tmp/opcache_dump_meta', $oc['scripts']);
+}
+
+/** Resets the opcache; prints nothing. */
+function clear_opcache() {
+    header("Content-Type: text/plain");
+    opcache_reset();
+}
+
+/**
+ * Prints ini settings as JSON: the single setting named by the "key" query
+ * parameter, or all of them when the parameter is absent. Responds with
+ * HTTP 400 when the key is unknown or is not a plain string.
+ */
+function ini_value() {
+    header("Content-Type: application/json");
+    $all_ini_values = ini_get_all();
+    if (isset($_GET['key'])) {
+        $key = $_GET['key'];
+        if (!is_string($key)) {
+            // ?key[]=x arrives as an array, which cannot be used as an array key.
+            http_response_code(400);
+            print json_encode(['error' => "parameter 'key' must be a string"]);
+        } elseif (array_key_exists($key, $all_ini_values)) {
+            $val = $all_ini_values[$key];
+            print json_encode([$key => $val]);
+        } else {
+            http_response_code(400);
+            print json_encode(['error' => "parameter '$key' not found"]);
+        }
+    } else {
+        print json_encode($all_ini_values);
+    }
+    # Add a new line to beautify output on the console
+    print "\n";
+}
+
diff --git a/modules/mediawiki/files/php/nrpe_check_opcache.py b/modules/mediawiki/files/php/nrpe_check_opcache.py
new file mode 100755
index 0000000000..d0910b8c51
--- /dev/null
+++ b/modules/mediawiki/files/php/nrpe_check_opcache.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Copyright (C) 2019-2020 Giuseppe Lavagetto
+Copyright (C) 2021 Kunal Mehta
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see https://www.gnu.org/licenses/.
+"""
+import argparse
+import sys
+import traceback
+
+import requests
+
+# Seconds to wait for the admin endpoint, so a stuck server cannot hang the check.
+REQUEST_TIMEOUT = 5
+
+
+class Alerts:
+    """Collects critical and warning messages and turns them into a check result."""
+
+    def __init__(self):
+        # Per-instance lists: as class attributes they would be shared by
+        # every Alerts object.
+        self.criticals = []
+        self.warnings = []
+
+    def critical(self, msg):
+        """Record a critical message."""
+        self.criticals.append(msg)
+
+    def warning(self, msg):
+        """Record a warning message."""
+        self.warnings.append(msg)
+
+    def __str__(self):
+        # If we have critical alerts, print them.
+        if self.criticals:
+            return f"CRITICAL: {'; '.join(self.criticals)}"
+        elif self.warnings:
+            return f"WARNING: {'; '.join(self.warnings)}"
+        else:
+            return "OK: opcache is healthy"
+
+    def retval(self):
+        """Return the exit code: 2 with criticals, 1 with only warnings, else 0."""
+        if self.criticals:
+            return 2
+        elif self.warnings:
+            return 1
+        else:
+            return 0
+
+
+def parse_args():
+    """Parse the required --warning and --critical free space thresholds."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-w", "--warning", help="free space warning threshold",
+                        type=int, required=True)
+    parser.add_argument("-c", "--critical", help="free space critical threshold",
+                        type=int, required=True)
+    return parser.parse_args()
+
+
+def opcache_info(port):
+    """Fetch /opcache-info from the admin port on localhost and return the decoded JSON.
+
+    Raises a requests exception on connection errors, timeouts and HTTP error
+    statuses.
+    """
+    resp = requests.get(
+        f"http://localhost:{port}/opcache-info",
+        headers={"user-agent": "nrpe_check_opcache.py"},
+        timeout=REQUEST_TIMEOUT,
+    )
+    resp.raise_for_status()
+    return resp.json()
+
+
+def main():
+    """Run the checks, print the result line and return the exit code."""
+    alerts = Alerts()
+    args = parse_args()
+    info = opcache_info(9181)
+    # First check if the opcache is full
+    if info["cache_full"]:
+        alerts.critical("opcache full on php.")
+    else:
+        # Now check for the opcache cache-hit ratio. If it's below 99.85%, it's a critical alert.
+        scripts = info["opcache_statistics"]["num_cached_scripts"]
+        hits = info["opcache_statistics"]["hits"]
+        # Skip the check if the service has been restarted since a few minutes, and we
+        # don't have enough traffic to reach the stats.
+        # Specifically, we need to have a number of hits that, given the number of scripts,
+        # would allow to reach such thresholds.
+        threshold = scripts * 10000  # 1 miss out of 10k => 99.99%
+        if hits > threshold:
+            hit_rate = info["opcache_statistics"]["opcache_hit_rate"]
+            if hit_rate < 99.85:
+                alerts.critical("opcache cache-hit ratio is below 99.85% on php.")
+            elif hit_rate < 99.99:
+                alerts.warning("opcache cache-hit ratio is below 99.99% on php.")
+
+    # Now check if the free space is below the critical level
+    free_space = info["memory_usage"]["free_memory"] / (1024 * 1024)
+    if free_space < args.critical:
+        alerts.critical(f"opcache free space is below {args.critical} MB on php.")
+    elif free_space < args.warning:
+        alerts.warning(f"opcache free space is below {args.warning} MB on php.")
+
+    # Print out the results!
+    print(alerts)
+    return alerts.retval()
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except Exception as e:
+        # Catch any unexpected errors, like requests or JSON errors
+        print(f"UNKNOWN: {e}")
+        traceback.print_exc()
+        sys.exit(3)
diff --git a/modules/mediawiki/files/php/php7adm.sh b/modules/mediawiki/files/php/php7adm.sh
new file mode 100755
index 0000000000..aa12509b44
--- /dev/null
+++ b/modules/mediawiki/files/php/php7adm.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+function usage() {
+    cat < directory,
+        recurse => true,
+        owner   => 'root',
+        group   => 'www-data',
+        mode    => '0555',
+        source  => 'puppet:///modules/mediawiki/php/admin'
+    }
+
+    nginx::site { 'php-admin':
+        ensure  => present,
+        content => template('mediawiki/php-admin.conf.erb'),
+    }
+
+    ## Admin script
+    file { '/usr/local/bin/php7adm':
+        ensure => present,
+        source => 'puppet:///modules/mediawiki/php/php7adm.sh',
+        owner  => 'root',
+        group  => 'root',
+        mode   => '0555',
+    }
+
     if ( $facts['networking']['interfaces']['ens19'] and $facts['networking']['interfaces']['ens18'] ) {
         $address = $facts['networking']['interfaces']['ens19']['ip']
     } elsif ( $facts['networking']['interfaces']['ens18'] ) {
diff --git a/modules/mediawiki/templates/php-admin.conf.erb b/modules/mediawiki/templates/php-admin.conf.erb
new file mode 100644
index 0000000000..0a3ebdd7f3
--- /dev/null
+++ b/modules/mediawiki/templates/php-admin.conf.erb
@@ -0,0 +1,19 @@
+# Serves the PHP scripts under /var/www/php-monitoring through php-fpm.
+# NOTE(review): this server listens on every interface and has no allow/deny
+# rules - confirm that access to port 9181 is restricted elsewhere.
+server {
+    listen 9181;
+    listen [::]:9181;
+
+    server_name ~.;
+    root /var/www/php-monitoring;
+
+    index index.php index.html;
+
+    location ~ \.php {
+        include fastcgi_params;
+        fastcgi_index index.php;
+        fastcgi_split_path_info ^(.+\.php)(.*)$;
+        fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name;
+        fastcgi_buffers 32 32k;
+        fastcgi_buffer_size 64k;
+        fastcgi_pass unix:/run/php/fpm-www.sock;
+    }
+}