Added self-metrics

This commit is contained in:
Simon Gruber
2025-12-14 20:16:34 +01:00
parent ed432545ad
commit aab94a309d
5 changed files with 271 additions and 3 deletions

View File

@@ -36,6 +36,17 @@ A lightweight, modular Docker monitoring tool that collects comprehensive metric
- Volume usage patterns (in-use vs unused)
- Container utilization percentage
**Self-Metrics:**
- Service uptime and iteration count
- Collection duration (average and last)
- Metrics collected per iteration
- Collector success/error counts
- Export success/error counts
- Memory usage (RSS, VMS)
- CPU usage percentage
- Thread count
## Quick Start
### Using Docker Compose
@@ -102,6 +113,26 @@ docker-metrics.aggregated.volumes.unused_count
docker-metrics.aggregated.system.container_utilization_percent
```
### Self-Metrics (Service Health)
```
docker-metrics.service.uptime_seconds
docker-metrics.service.iterations_total
docker-metrics.service.metrics_collected_total
docker-metrics.service.metrics_collected_last
docker-metrics.service.collection_duration_avg_seconds
docker-metrics.service.collection_duration_last_seconds
docker-metrics.service.collector.{collector_name}.success_total
docker-metrics.service.collector.{collector_name}.errors_total
docker-metrics.service.exports_success_total
docker-metrics.service.exports_errors_total
docker-metrics.service.memory_rss_bytes
docker-metrics.service.memory_vms_bytes
docker-metrics.service.memory_rss_mb
docker-metrics.service.cpu_percent
docker-metrics.service.threads_count
```
## 📊 Grafana Queries
Powerful queries to visualize your Docker metrics:
@@ -146,6 +177,19 @@ Powerful queries to visualize your Docker metrics:
- **Restart count trends**: `aliasByNode(docker-metrics.containers.*.restart_count, 2)`
- **Containers restarted recently**: `aliasByNode(highestCurrent(docker-metrics.containers.*.restart_count, 5), 2)`
### Self-Monitoring (Service Health)
- **Service uptime**: `docker-metrics.service.uptime_seconds`
- **Collection performance**: `aliasByNode(docker-metrics.service.collection_duration_{avg,last}_seconds, 3)`
- **Metrics collected per iteration**: `docker-metrics.service.metrics_collected_last`
- **Total metrics collected**: `docker-metrics.service.metrics_collected_total`
- **Service memory usage (MB)**: `docker-metrics.service.memory_rss_mb`
- **Service CPU usage**: `docker-metrics.service.cpu_percent`
- **Collector success rates**: `aliasByNode(docker-metrics.service.collector.*.success_total, 3)`
- **Collector error counts**: `aliasByNode(docker-metrics.service.collector.*.errors_total, 3)`
- **Export success vs errors**: `aliasByNode(docker-metrics.service.exports_{success,errors}_total, 2)`
- **Service health score**: `divideSeries(docker-metrics.service.exports_success_total, sumSeries(docker-metrics.service.exports_{success,errors}_total))`
### Advanced Queries
- **Memory usage % across containers**: `aliasByNode(docker-metrics.containers.*.memory_percent, 2)`

View File

@@ -5,10 +5,12 @@ from .base import BaseCollector
from .container_collector import ContainerCollector
from .volume_collector import VolumeCollector
from .system_collector import SystemCollector
from .self_collector import SelfMetricsCollector
__all__ = [
'BaseCollector',
'ContainerCollector',
'VolumeCollector',
'SystemCollector'
'SystemCollector',
'SelfMetricsCollector'
]

View File

@@ -0,0 +1,206 @@
"""
Self-metrics collector - tracks metrics about the collector service itself.
"""
import time
import psutil
import os
from typing import List, Dict, Any
from .base import BaseCollector
class SelfMetricsCollector(BaseCollector):
    """Collects metrics about the Docker metrics collector service itself.

    Tracks service uptime, per-iteration collection statistics,
    per-collector success/error counters, export success/error counters,
    and process resource usage (memory, CPU, threads) via psutil.
    """

    def __init__(self):
        """Initialize counters and capture the process handle and start time."""
        self.process = psutil.Process(os.getpid())
        self.start_time = time.time()           # wall-clock service start
        self.iteration_count = 0                # completed collection loops
        self.total_metrics_collected = 0        # cumulative metric count across all loops
        self.total_collection_time = 0.0        # cumulative loop duration in seconds
        self.collector_errors: Dict[str, int] = {}     # collector name -> error count
        self.collector_successes: Dict[str, int] = {}  # collector name -> success count
        self.export_errors = 0
        self.export_successes = 0
        self.last_collection_duration = 0.0
        self.last_metric_count = 0

    @staticmethod
    def _metric(name: str, value, timestamp: int) -> Dict[str, Any]:
        """Build a single metric dict in the common {name, value, timestamp} format."""
        return {'name': name, 'value': value, 'timestamp': timestamp}

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect self-metrics about the service.

        Returns:
            List of metric dictionaries with 'name', 'value' and 'timestamp' keys.
        """
        timestamp = int(time.time())
        metrics: List[Dict[str, Any]] = []

        # Service uptime and iteration bookkeeping
        uptime = time.time() - self.start_time
        metrics.append(self._metric('service.uptime_seconds', int(uptime), timestamp))
        metrics.append(self._metric('service.iterations_total', self.iteration_count, timestamp))
        metrics.append(self._metric('service.metrics_collected_total', self.total_metrics_collected, timestamp))

        # Average collection time — only meaningful after the first iteration
        # (also guards against division by zero).
        if self.iteration_count > 0:
            avg_collection_time = self.total_collection_time / self.iteration_count
            metrics.append(self._metric('service.collection_duration_avg_seconds',
                                        round(avg_collection_time, 3), timestamp))

        metrics.append(self._metric('service.collection_duration_last_seconds',
                                    round(self.last_collection_duration, 3), timestamp))
        metrics.append(self._metric('service.metrics_collected_last', self.last_metric_count, timestamp))

        # Per-collector success/error counters
        for collector_name, count in self.collector_successes.items():
            metrics.append(self._metric(f'service.collector.{collector_name}.success_total',
                                        count, timestamp))
        for collector_name, count in self.collector_errors.items():
            metrics.append(self._metric(f'service.collector.{collector_name}.errors_total',
                                        count, timestamp))

        # Export success/failure counters
        metrics.append(self._metric('service.exports_success_total', self.export_successes, timestamp))
        metrics.append(self._metric('service.exports_errors_total', self.export_errors, timestamp))

        # Process resource usage is best-effort: psutil calls can fail on
        # restricted platforms, so each group is skipped independently on error.
        try:
            mem_info = self.process.memory_info()
            metrics.append(self._metric('service.memory_rss_bytes', mem_info.rss, timestamp))
            metrics.append(self._metric('service.memory_vms_bytes', mem_info.vms, timestamp))
            # Memory usage in MB for easier reading on dashboards
            metrics.append(self._metric('service.memory_rss_mb',
                                        round(mem_info.rss / (1024 * 1024), 2), timestamp))
        except Exception:
            pass  # Skip memory metrics if we can't get them

        try:
            # NOTE: psutil's cpu_percent() returns 0.0 on its very first call
            # (no prior sample); subsequent iterations report usage since the
            # previous call.
            cpu_percent = self.process.cpu_percent()
            metrics.append(self._metric('service.cpu_percent', round(cpu_percent, 2), timestamp))
        except Exception:
            pass  # Skip CPU metrics if we can't get them

        try:
            metrics.append(self._metric('service.threads_count', self.process.num_threads(), timestamp))
        except Exception:
            pass  # Skip thread metrics if we can't get them

        return metrics

    def get_name(self) -> str:
        """Return the name of this collector."""
        return "self-metrics"

    def record_iteration(self, duration: float, metric_count: int):
        """
        Record metrics from a collection iteration.

        Args:
            duration: Duration of the collection in seconds
            metric_count: Number of metrics collected
        """
        self.iteration_count += 1
        self.total_collection_time += duration
        self.total_metrics_collected += metric_count
        self.last_collection_duration = duration
        self.last_metric_count = metric_count

    def record_collector_success(self, collector_name: str):
        """
        Record a successful collection from a collector.

        Args:
            collector_name: Name of the collector
        """
        self.collector_successes[collector_name] = self.collector_successes.get(collector_name, 0) + 1

    def record_collector_error(self, collector_name: str):
        """
        Record an error from a collector.

        Args:
            collector_name: Name of the collector
        """
        self.collector_errors[collector_name] = self.collector_errors.get(collector_name, 0) + 1

    def record_export_success(self):
        """Record a successful export."""
        self.export_successes += 1

    def record_export_error(self):
        """Record an export error."""
        self.export_errors += 1

View File

@@ -5,7 +5,7 @@ import time
import signal
from typing import List
from collectors import ContainerCollector, VolumeCollector, SystemCollector
from collectors import ContainerCollector, VolumeCollector, SystemCollector, SelfMetricsCollector
from exporters import GraphiteExporter, ConsoleExporter
from aggregator import MetricsAggregator
@@ -17,11 +17,15 @@ class DockerMetricsCollector:
self.running = True
self.config = self._load_config()
# Initialize self-metrics collector first
self.self_metrics = SelfMetricsCollector()
# Initialize collectors
self.collectors = [
ContainerCollector(),
VolumeCollector(),
SystemCollector()
SystemCollector(),
self.self_metrics # Include self-metrics in collection
]
# Initialize aggregator
@@ -96,8 +100,13 @@ class DockerMetricsCollector:
metrics = collector.collect()
all_metrics.extend(metrics)
print(f" - {collector_name}: {len(metrics)} metrics")
# Track collector success
self.self_metrics.record_collector_success(collector_name)
except Exception as e:
print(f" - Error in {collector.get_name()} collector: {e}")
# Track collector error
self.self_metrics.record_collector_error(collector.get_name())
# Aggregate metrics
try:
@@ -115,10 +124,16 @@ class DockerMetricsCollector:
for exporter in self.exporters:
try:
exporter.export(all_metrics)
self.self_metrics.record_export_success()
except Exception as e:
print(f" - Error exporting: {e}")
self.self_metrics.record_export_error()
elapsed = time.time() - start_time
# Record iteration metrics (excluding self-metrics from count to avoid recursion)
self.self_metrics.record_iteration(elapsed, len(all_metrics))
print(f" Collection completed in {elapsed:.2f}s\n")
# Sleep until next iteration

View File

@@ -1 +1,2 @@
docker>=7.0.0
psutil>=5.9.0