Added self metrics
This commit is contained in:
@@ -36,6 +36,17 @@ A lightweight, modular Docker monitoring tool that collects comprehensive metric
|
|||||||
- Volume usage patterns (in-use vs unused)
|
- Volume usage patterns (in-use vs unused)
|
||||||
- Container utilization percentage
|
- Container utilization percentage
|
||||||
|
|
||||||
|
**Self-Metrics:**
|
||||||
|
|
||||||
|
- Service uptime and iteration count
|
||||||
|
- Collection duration (average and last)
|
||||||
|
- Metrics collected per iteration
|
||||||
|
- Collector success/error counts
|
||||||
|
- Export success/error counts
|
||||||
|
- Memory usage (RSS, VMS)
|
||||||
|
- CPU usage percentage
|
||||||
|
- Thread count
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
### Using Docker Compose
|
### Using Docker Compose
|
||||||
@@ -102,6 +113,26 @@ docker-metrics.aggregated.volumes.unused_count
|
|||||||
docker-metrics.aggregated.system.container_utilization_percent
|
docker-metrics.aggregated.system.container_utilization_percent
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Self-Metrics (Service Health)
|
||||||
|
|
||||||
|
```
|
||||||
|
docker-metrics.service.uptime_seconds
|
||||||
|
docker-metrics.service.iterations_total
|
||||||
|
docker-metrics.service.metrics_collected_total
|
||||||
|
docker-metrics.service.metrics_collected_last
|
||||||
|
docker-metrics.service.collection_duration_avg_seconds
|
||||||
|
docker-metrics.service.collection_duration_last_seconds
|
||||||
|
docker-metrics.service.collector.{collector_name}.success_total
|
||||||
|
docker-metrics.service.collector.{collector_name}.errors_total
|
||||||
|
docker-metrics.service.exports_success_total
|
||||||
|
docker-metrics.service.exports_errors_total
|
||||||
|
docker-metrics.service.memory_rss_bytes
|
||||||
|
docker-metrics.service.memory_vms_bytes
|
||||||
|
docker-metrics.service.memory_rss_mb
|
||||||
|
docker-metrics.service.cpu_percent
|
||||||
|
docker-metrics.service.threads_count
|
||||||
|
```
|
||||||
|
|
||||||
## 📊 Grafana Queries
|
## 📊 Grafana Queries
|
||||||
|
|
||||||
Powerful queries to visualize your Docker metrics:
|
Powerful queries to visualize your Docker metrics:
|
||||||
@@ -146,6 +177,19 @@ Powerful queries to visualize your Docker metrics:
|
|||||||
- **Restart count trends**: `aliasByNode(docker-metrics.containers.*.restart_count, 2)`
|
- **Restart count trends**: `aliasByNode(docker-metrics.containers.*.restart_count, 2)`
|
||||||
- **Containers restarted recently**: `aliasByNode(highestCurrent(docker-metrics.containers.*.restart_count, 5), 2)`
|
- **Containers restarted recently**: `aliasByNode(highestCurrent(docker-metrics.containers.*.restart_count, 5), 2)`
|
||||||
|
|
||||||
|
### Self-Monitoring (Service Health)
|
||||||
|
|
||||||
|
- **Service uptime**: `docker-metrics.service.uptime_seconds`
|
||||||
|
- **Collection performance**: `aliasByNode(docker-metrics.service.collection_duration_{avg,last}_seconds, 2)`
|
||||||
|
- **Metrics collected per iteration**: `docker-metrics.service.metrics_collected_last`
|
||||||
|
- **Total metrics collected**: `docker-metrics.service.metrics_collected_total`
|
||||||
|
- **Service memory usage (MB)**: `docker-metrics.service.memory_rss_mb`
|
||||||
|
- **Service CPU usage**: `docker-metrics.service.cpu_percent`
|
||||||
|
- **Collector success counts**: `aliasByNode(docker-metrics.service.collector.*.success_total, 3)`
|
||||||
|
- **Collector error counts**: `aliasByNode(docker-metrics.service.collector.*.errors_total, 3)`
|
||||||
|
- **Export success vs errors**: `aliasByNode(docker-metrics.service.exports_{success,errors}_total, 2)`
|
||||||
|
- **Service health score**: `divideSeries(docker-metrics.service.exports_success_total, sumSeries(docker-metrics.service.exports_{success,errors}_total))`
|
||||||
|
|
||||||
### Advanced Queries
|
### Advanced Queries
|
||||||
|
|
||||||
- **Memory usage % across containers**: `aliasByNode(docker-metrics.containers.*.memory_percent, 2)`
|
- **Memory usage % across containers**: `aliasByNode(docker-metrics.containers.*.memory_percent, 2)`
|
||||||
|
|||||||
@@ -5,10 +5,12 @@ from .base import BaseCollector
|
|||||||
from .container_collector import ContainerCollector
|
from .container_collector import ContainerCollector
|
||||||
from .volume_collector import VolumeCollector
|
from .volume_collector import VolumeCollector
|
||||||
from .system_collector import SystemCollector
|
from .system_collector import SystemCollector
|
||||||
|
from .self_collector import SelfMetricsCollector
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'BaseCollector',
|
'BaseCollector',
|
||||||
'ContainerCollector',
|
'ContainerCollector',
|
||||||
'VolumeCollector',
|
'VolumeCollector',
|
||||||
'SystemCollector'
|
'SystemCollector',
|
||||||
|
'SelfMetricsCollector'
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -0,0 +1,206 @@
|
|||||||
|
"""
|
||||||
|
Self-metrics collector - tracks metrics about the collector service itself.
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
import psutil
|
||||||
|
import os
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from .base import BaseCollector
|
||||||
|
|
||||||
|
|
||||||
|
class SelfMetricsCollector(BaseCollector):
    """Collects metrics about the Docker metrics collector service itself.

    Counters (iterations, per-collector successes/errors, export
    successes/errors) are fed through the ``record_*`` methods and reported
    by ``collect()`` together with process statistics read via ``psutil``.
    """

    def __init__(self):
        """Initialize the self-metrics collector."""
        # NOTE(review): BaseCollector.__init__ is not invoked here — confirm
        # the base class needs no initialization of its own.
        self.process = psutil.Process(os.getpid())
        self.start_time = time.time()
        self.iteration_count = 0
        self.total_metrics_collected = 0
        self.total_collection_time = 0.0
        self.collector_errors = {}
        self.collector_successes = {}
        self.export_errors = 0
        self.export_successes = 0
        self.last_collection_duration = 0.0
        self.last_metric_count = 0

        # psutil measures CPU usage relative to the previous cpu_percent()
        # call, and the very first call returns a meaningless 0.0. Prime the
        # counter here so the first collect() reports usage since start-up.
        try:
            self.process.cpu_percent()
        except (psutil.Error, OSError):
            pass  # Best effort: collect() copes with a failing call too

    @staticmethod
    def _metric(name: str, value: Any, timestamp: int) -> Dict[str, Any]:
        """Build one metric dictionary in the shape every collector emits."""
        return {'name': name, 'value': value, 'timestamp': timestamp}

    def _collect_process_metrics(self, timestamp: int) -> List[Dict[str, Any]]:
        """Read memory, CPU and thread statistics of this process.

        Each group is best effort: when psutil cannot provide a value the
        corresponding metrics are left out rather than failing the whole
        collection.
        """
        metric = self._metric
        metrics = []

        # Memory usage
        try:
            mem_info = self.process.memory_info()
        except (psutil.Error, OSError):
            pass  # Skip memory metrics if we can't get them
        else:
            metrics.append(metric('service.memory_rss_bytes', mem_info.rss, timestamp))
            metrics.append(metric('service.memory_vms_bytes', mem_info.vms, timestamp))
            # Memory usage in MB for easier reading
            metrics.append(metric(
                'service.memory_rss_mb',
                round(mem_info.rss / (1024 * 1024), 2),
                timestamp,
            ))

        # CPU usage (relative to the previous cpu_percent() call)
        try:
            cpu_percent = self.process.cpu_percent()
        except (psutil.Error, OSError):
            pass  # Skip CPU metrics if we can't get them
        else:
            metrics.append(metric('service.cpu_percent', round(cpu_percent, 2), timestamp))

        # Thread count
        try:
            num_threads = self.process.num_threads()
        except (psutil.Error, OSError):
            pass  # Skip thread metrics if we can't get them
        else:
            metrics.append(metric('service.threads_count', num_threads, timestamp))

        return metrics

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect self-metrics about the service.

        Returns:
            List of metric dictionaries, each with 'name', 'value' and
            'timestamp' keys
        """
        # Take the clock once so uptime and timestamp agree with each other.
        now = time.time()
        timestamp = int(now)
        metric = self._metric

        metrics = [
            metric('service.uptime_seconds', int(now - self.start_time), timestamp),
            metric('service.iterations_total', self.iteration_count, timestamp),
            metric('service.metrics_collected_total', self.total_metrics_collected, timestamp),
        ]

        # The average is undefined until at least one iteration was recorded.
        if self.iteration_count > 0:
            avg_collection_time = self.total_collection_time / self.iteration_count
            metrics.append(metric(
                'service.collection_duration_avg_seconds',
                round(avg_collection_time, 3),
                timestamp,
            ))

        metrics.append(metric(
            'service.collection_duration_last_seconds',
            round(self.last_collection_duration, 3),
            timestamp,
        ))
        metrics.append(metric('service.metrics_collected_last', self.last_metric_count, timestamp))

        # Per-collector success and error counters
        metrics.extend(
            metric(f'service.collector.{collector_name}.success_total', count, timestamp)
            for collector_name, count in self.collector_successes.items()
        )
        metrics.extend(
            metric(f'service.collector.{collector_name}.errors_total', count, timestamp)
            for collector_name, count in self.collector_errors.items()
        )

        # Export success/failure counts
        metrics.append(metric('service.exports_success_total', self.export_successes, timestamp))
        metrics.append(metric('service.exports_errors_total', self.export_errors, timestamp))

        metrics.extend(self._collect_process_metrics(timestamp))
        return metrics

    def get_name(self) -> str:
        """Return the name of this collector."""
        return "self-metrics"

    def record_iteration(self, duration: float, metric_count: int) -> None:
        """
        Record metrics from a collection iteration.

        Args:
            duration: Duration of the collection in seconds
            metric_count: Number of metrics collected
        """
        self.iteration_count += 1
        self.total_collection_time += duration
        self.total_metrics_collected += metric_count
        self.last_collection_duration = duration
        self.last_metric_count = metric_count

    def record_collector_success(self, collector_name: str) -> None:
        """
        Record a successful collection from a collector.

        Args:
            collector_name: Name of the collector
        """
        self.collector_successes[collector_name] = (
            self.collector_successes.get(collector_name, 0) + 1
        )

    def record_collector_error(self, collector_name: str) -> None:
        """
        Record an error from a collector.

        Args:
            collector_name: Name of the collector
        """
        self.collector_errors[collector_name] = (
            self.collector_errors.get(collector_name, 0) + 1
        )

    def record_export_success(self) -> None:
        """Record a successful export."""
        self.export_successes += 1

    def record_export_error(self) -> None:
        """Record an export error."""
        self.export_errors += 1
|
||||||
+17
-2
@@ -5,7 +5,7 @@ import time
|
|||||||
import signal
|
import signal
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from collectors import ContainerCollector, VolumeCollector, SystemCollector
|
from collectors import ContainerCollector, VolumeCollector, SystemCollector, SelfMetricsCollector
|
||||||
from exporters import GraphiteExporter, ConsoleExporter
|
from exporters import GraphiteExporter, ConsoleExporter
|
||||||
from aggregator import MetricsAggregator
|
from aggregator import MetricsAggregator
|
||||||
|
|
||||||
@@ -17,11 +17,15 @@ class DockerMetricsCollector:
|
|||||||
self.running = True
|
self.running = True
|
||||||
self.config = self._load_config()
|
self.config = self._load_config()
|
||||||
|
|
||||||
|
# Initialize self-metrics collector first
|
||||||
|
self.self_metrics = SelfMetricsCollector()
|
||||||
|
|
||||||
# Initialize collectors
|
# Initialize collectors
|
||||||
self.collectors = [
|
self.collectors = [
|
||||||
ContainerCollector(),
|
ContainerCollector(),
|
||||||
VolumeCollector(),
|
VolumeCollector(),
|
||||||
SystemCollector()
|
SystemCollector(),
|
||||||
|
self.self_metrics # Include self-metrics in collection
|
||||||
]
|
]
|
||||||
|
|
||||||
# Initialize aggregator
|
# Initialize aggregator
|
||||||
@@ -96,8 +100,13 @@ class DockerMetricsCollector:
|
|||||||
metrics = collector.collect()
|
metrics = collector.collect()
|
||||||
all_metrics.extend(metrics)
|
all_metrics.extend(metrics)
|
||||||
print(f" - {collector_name}: {len(metrics)} metrics")
|
print(f" - {collector_name}: {len(metrics)} metrics")
|
||||||
|
|
||||||
|
# Track collector success
|
||||||
|
self.self_metrics.record_collector_success(collector_name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" - Error in {collector.get_name()} collector: {e}")
|
print(f" - Error in {collector.get_name()} collector: {e}")
|
||||||
|
# Track collector error
|
||||||
|
self.self_metrics.record_collector_error(collector.get_name())
|
||||||
|
|
||||||
# Aggregate metrics
|
# Aggregate metrics
|
||||||
try:
|
try:
|
||||||
@@ -115,10 +124,16 @@ class DockerMetricsCollector:
|
|||||||
for exporter in self.exporters:
|
for exporter in self.exporters:
|
||||||
try:
|
try:
|
||||||
exporter.export(all_metrics)
|
exporter.export(all_metrics)
|
||||||
|
self.self_metrics.record_export_success()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" - Error exporting: {e}")
|
print(f" - Error exporting: {e}")
|
||||||
|
self.self_metrics.record_export_error()
|
||||||
|
|
||||||
elapsed = time.time() - start_time
|
elapsed = time.time() - start_time
|
||||||
|
|
||||||
|
# Record iteration metrics (the count covers everything in all_metrics, including this iteration's self-metrics)
|
||||||
|
self.self_metrics.record_iteration(elapsed, len(all_metrics))
|
||||||
|
|
||||||
print(f" Collection completed in {elapsed:.2f}s\n")
|
print(f" Collection completed in {elapsed:.2f}s\n")
|
||||||
|
|
||||||
# Sleep until next iteration
|
# Sleep until next iteration
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
docker>=7.0.0
|
docker>=7.0.0
|
||||||
|
psutil>=5.9.0
|
||||||
|
|||||||
Reference in New Issue
Block a user