Added self metrics

This commit is contained in:
Simon Gruber
2025-12-14 20:16:34 +01:00
parent ed432545ad
commit aab94a309d
5 changed files with 271 additions and 3 deletions
+44
View File
@@ -36,6 +36,17 @@ A lightweight, modular Docker monitoring tool that collects comprehensive metric
- Volume usage patterns (in-use vs unused) - Volume usage patterns (in-use vs unused)
- Container utilization percentage - Container utilization percentage
**Self-Metrics:**
- Service uptime and iteration count
- Collection duration (average and last)
- Metrics collected per iteration
- Collector success/error counts
- Export success/error counts
- Memory usage (RSS, VMS)
- CPU usage percentage
- Thread count
## Quick Start ## Quick Start
### Using Docker Compose ### Using Docker Compose
@@ -102,6 +113,26 @@ docker-metrics.aggregated.volumes.unused_count
docker-metrics.aggregated.system.container_utilization_percent docker-metrics.aggregated.system.container_utilization_percent
``` ```
### Self-Metrics (Service Health)
```
docker-metrics.service.uptime_seconds
docker-metrics.service.iterations_total
docker-metrics.service.metrics_collected_total
docker-metrics.service.metrics_collected_last
docker-metrics.service.collection_duration_avg_seconds
docker-metrics.service.collection_duration_last_seconds
docker-metrics.service.collector.{collector_name}.success_total
docker-metrics.service.collector.{collector_name}.errors_total
docker-metrics.service.exports_success_total
docker-metrics.service.exports_errors_total
docker-metrics.service.memory_rss_bytes
docker-metrics.service.memory_vms_bytes
docker-metrics.service.memory_rss_mb
docker-metrics.service.cpu_percent
docker-metrics.service.threads_count
```
## 📊 Grafana Queries ## 📊 Grafana Queries
Powerful queries to visualize your Docker metrics: Powerful queries to visualize your Docker metrics:
@@ -146,6 +177,19 @@ Powerful queries to visualize your Docker metrics:
- **Restart count trends**: `aliasByNode(docker-metrics.containers.*.restart_count, 2)` - **Restart count trends**: `aliasByNode(docker-metrics.containers.*.restart_count, 2)`
- **Containers restarted recently**: `aliasByNode(highestCurrent(docker-metrics.containers.*.restart_count, 5), 2)` - **Containers restarted recently**: `aliasByNode(highestCurrent(docker-metrics.containers.*.restart_count, 5), 2)`
### Self-Monitoring (Service Health)
- **Service uptime**: `docker-metrics.service.uptime_seconds`
- **Collection performance**: `aliasByNode(docker-metrics.service.collection_duration_{avg,last}_seconds, 2)`
- **Metrics collected per iteration**: `docker-metrics.service.metrics_collected_last`
- **Total metrics collected**: `docker-metrics.service.metrics_collected_total`
- **Service memory usage (MB)**: `docker-metrics.service.memory_rss_mb`
- **Service CPU usage**: `docker-metrics.service.cpu_percent`
- **Collector success rates**: `aliasByNode(docker-metrics.service.collector.*.success_total, 3)`
- **Collector error counts**: `aliasByNode(docker-metrics.service.collector.*.errors_total, 3)`
- **Export success vs errors**: `aliasByNode(docker-metrics.service.exports_{success,errors}_total, 2)`
- **Service health score (export success ratio, 0–1)**: `divideSeries(docker-metrics.service.exports_success_total, sumSeries(docker-metrics.service.exports_{success,errors}_total))`
### Advanced Queries ### Advanced Queries
- **Memory usage % across containers**: `aliasByNode(docker-metrics.containers.*.memory_percent, 2)` - **Memory usage % across containers**: `aliasByNode(docker-metrics.containers.*.memory_percent, 2)`
+3 -1
View File
@@ -5,10 +5,12 @@ from .base import BaseCollector
from .container_collector import ContainerCollector from .container_collector import ContainerCollector
from .volume_collector import VolumeCollector from .volume_collector import VolumeCollector
from .system_collector import SystemCollector from .system_collector import SystemCollector
from .self_collector import SelfMetricsCollector
__all__ = [ __all__ = [
'BaseCollector', 'BaseCollector',
'ContainerCollector', 'ContainerCollector',
'VolumeCollector', 'VolumeCollector',
'SystemCollector' 'SystemCollector',
'SelfMetricsCollector'
] ]
+206
View File
@@ -0,0 +1,206 @@
"""
Self-metrics collector - tracks metrics about the collector service itself.
"""
import time
import psutil
import os
from typing import List, Dict, Any
from .base import BaseCollector
class SelfMetricsCollector(BaseCollector):
    """Collects metrics about the Docker metrics collector service itself.

    The instance keeps running counters (iterations, per-collector
    successes/errors, export successes/errors) that the service updates
    through the ``record_*`` methods, and samples process-level figures
    (memory, CPU, threads) for the current process via ``psutil`` each time
    :meth:`collect` is called.
    """

    def __init__(self):
        """Initialize the counters and attach to the current process."""
        self.process = psutil.Process(os.getpid())
        self.start_time = time.time()
        self.iteration_count = 0
        self.total_metrics_collected = 0
        self.total_collection_time = 0.0
        self.collector_errors: Dict[str, int] = {}
        self.collector_successes: Dict[str, int] = {}
        self.export_errors = 0
        self.export_successes = 0
        self.last_collection_duration = 0.0
        self.last_metric_count = 0

        # psutil computes cpu_percent() relative to the previous call and
        # returns a meaningless 0.0 on the very first one. Prime it here so
        # the first collect() already reports a real value.
        try:
            self.process.cpu_percent()
        except (psutil.Error, OSError):
            pass  # Best effort: collect() simply skips CPU if it keeps failing

    @staticmethod
    def _metric(name: str, value: Any, timestamp: int) -> Dict[str, Any]:
        """Build one metric dictionary in the shape the exporters receive."""
        return {'name': name, 'value': value, 'timestamp': timestamp}

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect self-metrics about the service.

        Counter metrics are always emitted. The average collection duration
        is only emitted once at least one iteration has been recorded.
        Process metrics (memory, CPU, threads) are best effort and are left
        out individually when psutil cannot provide them.

        Returns:
            List of metric dictionaries with ``name``, ``value`` and
            ``timestamp`` keys
        """
        now = time.time()
        timestamp = int(now)

        metrics = [
            self._metric('service.uptime_seconds',
                         int(now - self.start_time), timestamp),
            self._metric('service.iterations_total',
                         self.iteration_count, timestamp),
            self._metric('service.metrics_collected_total',
                         self.total_metrics_collected, timestamp),
        ]

        # An average is undefined before the first recorded iteration.
        if self.iteration_count > 0:
            avg_collection_time = self.total_collection_time / self.iteration_count
            metrics.append(self._metric(
                'service.collection_duration_avg_seconds',
                round(avg_collection_time, 3),
                timestamp,
            ))

        metrics.append(self._metric(
            'service.collection_duration_last_seconds',
            round(self.last_collection_duration, 3),
            timestamp,
        ))
        metrics.append(self._metric(
            'service.metrics_collected_last', self.last_metric_count, timestamp
        ))

        # Per-collector outcome counters (one series per collector name).
        metrics.extend(
            self._metric(f'service.collector.{collector_name}.success_total',
                         count, timestamp)
            for collector_name, count in self.collector_successes.items()
        )
        metrics.extend(
            self._metric(f'service.collector.{collector_name}.errors_total',
                         count, timestamp)
            for collector_name, count in self.collector_errors.items()
        )

        metrics.append(self._metric(
            'service.exports_success_total', self.export_successes, timestamp
        ))
        metrics.append(self._metric(
            'service.exports_errors_total', self.export_errors, timestamp
        ))

        metrics.extend(self._collect_process_metrics(timestamp))
        return metrics

    def _collect_process_metrics(self, timestamp: int) -> List[Dict[str, Any]]:
        """Sample memory, CPU and thread figures for the current process.

        Each probe is independent: a psutil/OS failure in one of them only
        drops that probe's metrics instead of failing the whole collection.
        """
        metrics = []

        # Memory usage (bytes, plus RSS in MB for easier reading)
        try:
            mem_info = self.process.memory_info()
        except (psutil.Error, OSError):
            pass  # Skip memory metrics if we can't get them
        else:
            metrics.append(self._metric(
                'service.memory_rss_bytes', mem_info.rss, timestamp
            ))
            metrics.append(self._metric(
                'service.memory_vms_bytes', mem_info.vms, timestamp
            ))
            metrics.append(self._metric(
                'service.memory_rss_mb',
                round(mem_info.rss / (1024 * 1024), 2),
                timestamp,
            ))

        # CPU usage since the previous cpu_percent() call
        try:
            cpu_percent = self.process.cpu_percent()
        except (psutil.Error, OSError):
            pass  # Skip CPU metrics if we can't get them
        else:
            metrics.append(self._metric(
                'service.cpu_percent', round(cpu_percent, 2), timestamp
            ))

        # Thread count
        try:
            num_threads = self.process.num_threads()
        except (psutil.Error, OSError):
            pass  # Skip thread metrics if we can't get them
        else:
            metrics.append(self._metric(
                'service.threads_count', num_threads, timestamp
            ))

        return metrics

    def get_name(self) -> str:
        """Return the name of this collector."""
        return "self-metrics"

    def record_iteration(self, duration: float, metric_count: int) -> None:
        """
        Record metrics from a collection iteration.

        Args:
            duration: Duration of the collection in seconds
            metric_count: Number of metrics collected
        """
        self.iteration_count += 1
        self.total_collection_time += duration
        self.total_metrics_collected += metric_count
        self.last_collection_duration = duration
        self.last_metric_count = metric_count

    def record_collector_success(self, collector_name: str) -> None:
        """
        Record a successful collection from a collector.

        Args:
            collector_name: Name of the collector
        """
        self.collector_successes[collector_name] = (
            self.collector_successes.get(collector_name, 0) + 1
        )

    def record_collector_error(self, collector_name: str) -> None:
        """
        Record an error from a collector.

        Args:
            collector_name: Name of the collector
        """
        self.collector_errors[collector_name] = (
            self.collector_errors.get(collector_name, 0) + 1
        )

    def record_export_success(self) -> None:
        """Record a successful export."""
        self.export_successes += 1

    def record_export_error(self) -> None:
        """Record an export error."""
        self.export_errors += 1
+17 -2
View File
@@ -5,7 +5,7 @@ import time
import signal import signal
from typing import List from typing import List
from collectors import ContainerCollector, VolumeCollector, SystemCollector from collectors import ContainerCollector, VolumeCollector, SystemCollector, SelfMetricsCollector
from exporters import GraphiteExporter, ConsoleExporter from exporters import GraphiteExporter, ConsoleExporter
from aggregator import MetricsAggregator from aggregator import MetricsAggregator
@@ -17,11 +17,15 @@ class DockerMetricsCollector:
self.running = True self.running = True
self.config = self._load_config() self.config = self._load_config()
# Initialize self-metrics collector first
self.self_metrics = SelfMetricsCollector()
# Initialize collectors # Initialize collectors
self.collectors = [ self.collectors = [
ContainerCollector(), ContainerCollector(),
VolumeCollector(), VolumeCollector(),
SystemCollector() SystemCollector(),
self.self_metrics # Include self-metrics in collection
] ]
# Initialize aggregator # Initialize aggregator
@@ -96,8 +100,13 @@ class DockerMetricsCollector:
metrics = collector.collect() metrics = collector.collect()
all_metrics.extend(metrics) all_metrics.extend(metrics)
print(f" - {collector_name}: {len(metrics)} metrics") print(f" - {collector_name}: {len(metrics)} metrics")
# Track collector success
self.self_metrics.record_collector_success(collector_name)
except Exception as e: except Exception as e:
print(f" - Error in {collector.get_name()} collector: {e}") print(f" - Error in {collector.get_name()} collector: {e}")
# Track collector error
self.self_metrics.record_collector_error(collector.get_name())
# Aggregate metrics # Aggregate metrics
try: try:
@@ -115,10 +124,16 @@ class DockerMetricsCollector:
for exporter in self.exporters: for exporter in self.exporters:
try: try:
exporter.export(all_metrics) exporter.export(all_metrics)
self.self_metrics.record_export_success()
except Exception as e: except Exception as e:
print(f" - Error exporting: {e}") print(f" - Error exporting: {e}")
self.self_metrics.record_export_error()
elapsed = time.time() - start_time elapsed = time.time() - start_time
# Record iteration metrics (the count covers everything in all_metrics, including this iteration's self-metrics)
self.self_metrics.record_iteration(elapsed, len(all_metrics))
print(f" Collection completed in {elapsed:.2f}s\n") print(f" Collection completed in {elapsed:.2f}s\n")
# Sleep until next iteration # Sleep until next iteration
+1
View File
@@ -1 +1,2 @@
docker>=7.0.0 docker>=7.0.0
psutil>=5.9.0