Added self-metrics
This commit is contained in:
44
Readme.md
44
Readme.md
@@ -36,6 +36,17 @@ A lightweight, modular Docker monitoring tool that collects comprehensive metric
|
||||
- Volume usage patterns (in-use vs unused)
|
||||
- Container utilization percentage
|
||||
|
||||
**Self-Metrics:**
|
||||
|
||||
- Service uptime and iteration count
|
||||
- Collection duration (average and last)
|
||||
- Metrics collected per iteration
|
||||
- Collector success/error counts
|
||||
- Export success/error counts
|
||||
- Memory usage (RSS, VMS)
|
||||
- CPU usage percentage
|
||||
- Thread count
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Using Docker Compose
|
||||
@@ -102,6 +113,26 @@ docker-metrics.aggregated.volumes.unused_count
|
||||
docker-metrics.aggregated.system.container_utilization_percent
|
||||
```
|
||||
|
||||
### Self-Metrics (Service Health)
|
||||
|
||||
```
|
||||
docker-metrics.service.uptime_seconds
|
||||
docker-metrics.service.iterations_total
|
||||
docker-metrics.service.metrics_collected_total
|
||||
docker-metrics.service.metrics_collected_last
|
||||
docker-metrics.service.collection_duration_avg_seconds
|
||||
docker-metrics.service.collection_duration_last_seconds
|
||||
docker-metrics.service.collector.{collector_name}.success_total
|
||||
docker-metrics.service.collector.{collector_name}.errors_total
|
||||
docker-metrics.service.exports_success_total
|
||||
docker-metrics.service.exports_errors_total
|
||||
docker-metrics.service.memory_rss_bytes
|
||||
docker-metrics.service.memory_vms_bytes
|
||||
docker-metrics.service.memory_rss_mb
|
||||
docker-metrics.service.cpu_percent
|
||||
docker-metrics.service.threads_count
|
||||
```
|
||||
|
||||
## 📊 Grafana Queries
|
||||
|
||||
Powerful queries to visualize your Docker metrics:
|
||||
@@ -146,6 +177,19 @@ Powerful queries to visualize your Docker metrics:
|
||||
- **Restart count trends**: `aliasByNode(docker-metrics.containers.*.restart_count, 2)`
|
||||
- **Containers restarted recently**: `aliasByNode(highestCurrent(docker-metrics.containers.*.restart_count, 5), 2)`
|
||||
|
||||
### Self-Monitoring (Service Health)
|
||||
|
||||
- **Service uptime**: `docker-metrics.service.uptime_seconds`
|
||||
- **Collection performance**: `aliasByNode(docker-metrics.service.collection_duration_{avg,last}_seconds, 3)`
|
||||
- **Metrics collected per iteration**: `docker-metrics.service.metrics_collected_last`
|
||||
- **Total metrics collected**: `docker-metrics.service.metrics_collected_total`
|
||||
- **Service memory usage (MB)**: `docker-metrics.service.memory_rss_mb`
|
||||
- **Service CPU usage**: `docker-metrics.service.cpu_percent`
|
||||
- **Collector success rates**: `aliasByNode(docker-metrics.service.collector.*.success_total, 3)`
|
||||
- **Collector error counts**: `aliasByNode(docker-metrics.service.collector.*.errors_total, 3)`
|
||||
- **Export success vs errors**: `aliasByNode(docker-metrics.service.exports_{success,errors}_total, 2)`
|
||||
- **Service health score**: `divideSeries(docker-metrics.service.exports_success_total, sumSeries(docker-metrics.service.exports_{success,errors}_total))`
|
||||
|
||||
### Advanced Queries
|
||||
|
||||
- **Memory usage % across containers**: `aliasByNode(docker-metrics.containers.*.memory_percent, 2)`
|
||||
|
||||
@@ -5,10 +5,12 @@ from .base import BaseCollector
|
||||
from .container_collector import ContainerCollector
|
||||
from .volume_collector import VolumeCollector
|
||||
from .system_collector import SystemCollector
|
||||
from .self_collector import SelfMetricsCollector
|
||||
|
||||
__all__ = [
|
||||
'BaseCollector',
|
||||
'ContainerCollector',
|
||||
'VolumeCollector',
|
||||
'SystemCollector'
|
||||
'SystemCollector',
|
||||
'SelfMetricsCollector'
|
||||
]
|
||||
|
||||
206
src/collectors/self_collector.py
Normal file
206
src/collectors/self_collector.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""
|
||||
Self-metrics collector - tracks metrics about the collector service itself.
|
||||
"""
|
||||
import time
|
||||
import psutil
|
||||
import os
|
||||
from typing import List, Dict, Any
|
||||
from .base import BaseCollector
|
||||
|
||||
|
||||
class SelfMetricsCollector(BaseCollector):
    """Collects metrics about the Docker metrics collector service itself.

    Tracks service uptime, iteration counts, collection timing,
    per-collector success/error tallies, export outcomes, and process
    resource usage (memory, CPU, threads) obtained via psutil.
    """

    def __init__(self):
        """Initialize counters and a psutil handle to the current process."""
        # NOTE(review): super().__init__() is not called, matching the
        # original code -- confirm BaseCollector has no required __init__.
        self.process = psutil.Process(os.getpid())
        self.start_time = time.time()   # wall-clock start, used for uptime
        self.iteration_count = 0        # completed collection iterations
        self.total_metrics_collected = 0
        self.total_collection_time = 0.0  # seconds, summed over iterations
        self.collector_errors: Dict[str, int] = {}     # name -> error count
        self.collector_successes: Dict[str, int] = {}  # name -> success count
        self.export_errors = 0
        self.export_successes = 0
        self.last_collection_duration = 0.0
        self.last_metric_count = 0

    @staticmethod
    def _metric(name: str, value: Any, timestamp: int) -> Dict[str, Any]:
        """Build one metric record in the shape the exporters expect."""
        return {'name': name, 'value': value, 'timestamp': timestamp}

    def collect(self) -> List[Dict[str, Any]]:
        """
        Collect self-metrics about the service.

        Returns:
            List of metric dictionaries ('name', 'value', 'timestamp').
            Process-level metrics (memory/CPU/threads) are best-effort and
            are silently omitted when psutil cannot provide them.
        """
        timestamp = int(time.time())
        metric = self._metric  # local alias to keep the lines below short

        metrics = [
            metric('service.uptime_seconds',
                   int(time.time() - self.start_time), timestamp),
            metric('service.iterations_total',
                   self.iteration_count, timestamp),
            metric('service.metrics_collected_total',
                   self.total_metrics_collected, timestamp),
        ]

        # The average is only defined once at least one iteration has run.
        if self.iteration_count > 0:
            avg_collection_time = self.total_collection_time / self.iteration_count
            metrics.append(metric('service.collection_duration_avg_seconds',
                                  round(avg_collection_time, 3), timestamp))

        metrics.append(metric('service.collection_duration_last_seconds',
                              round(self.last_collection_duration, 3), timestamp))
        metrics.append(metric('service.metrics_collected_last',
                              self.last_metric_count, timestamp))

        # Per-collector success and error tallies.
        for collector_name, count in self.collector_successes.items():
            metrics.append(metric(
                f'service.collector.{collector_name}.success_total',
                count, timestamp))
        for collector_name, count in self.collector_errors.items():
            metrics.append(metric(
                f'service.collector.{collector_name}.errors_total',
                count, timestamp))

        # Export success/failure counts.
        metrics.append(metric('service.exports_success_total',
                              self.export_successes, timestamp))
        metrics.append(metric('service.exports_errors_total',
                              self.export_errors, timestamp))

        # Process resource metrics are best-effort: psutil may raise (e.g.
        # AccessDenied), so skip them rather than fail the whole collection.
        # Narrowed from a bare `except Exception` to psutil/OS errors.
        try:
            mem_info = self.process.memory_info()
            metrics.append(metric('service.memory_rss_bytes',
                                  mem_info.rss, timestamp))
            metrics.append(metric('service.memory_vms_bytes',
                                  mem_info.vms, timestamp))
            # RSS in MB as well, for easier dashboard reading.
            metrics.append(metric('service.memory_rss_mb',
                                  round(mem_info.rss / (1024 * 1024), 2),
                                  timestamp))
        except (psutil.Error, OSError):
            pass  # skip memory metrics if we can't get them

        try:
            metrics.append(metric('service.cpu_percent',
                                  round(self.process.cpu_percent(), 2),
                                  timestamp))
        except (psutil.Error, OSError):
            pass  # skip CPU metrics if we can't get them

        try:
            metrics.append(metric('service.threads_count',
                                  self.process.num_threads(), timestamp))
        except (psutil.Error, OSError):
            pass  # skip thread metrics if we can't get them

        return metrics

    def get_name(self) -> str:
        """Return the name of this collector."""
        return "self-metrics"

    def record_iteration(self, duration: float, metric_count: int) -> None:
        """
        Record metrics from a collection iteration.

        Args:
            duration: Duration of the collection in seconds
            metric_count: Number of metrics collected
        """
        self.iteration_count += 1
        self.total_collection_time += duration
        self.total_metrics_collected += metric_count
        self.last_collection_duration = duration
        self.last_metric_count = metric_count

    def record_collector_success(self, collector_name: str) -> None:
        """
        Record a successful collection from a collector.

        Args:
            collector_name: Name of the collector
        """
        # dict.get replaces the manual "if key not in dict" initialization.
        self.collector_successes[collector_name] = \
            self.collector_successes.get(collector_name, 0) + 1

    def record_collector_error(self, collector_name: str) -> None:
        """
        Record an error from a collector.

        Args:
            collector_name: Name of the collector
        """
        self.collector_errors[collector_name] = \
            self.collector_errors.get(collector_name, 0) + 1

    def record_export_success(self) -> None:
        """Record a successful export."""
        self.export_successes += 1

    def record_export_error(self) -> None:
        """Record an export error."""
        self.export_errors += 1
|
||||
19
src/main.py
19
src/main.py
@@ -5,7 +5,7 @@ import time
|
||||
import signal
|
||||
from typing import List
|
||||
|
||||
from collectors import ContainerCollector, VolumeCollector, SystemCollector
|
||||
from collectors import ContainerCollector, VolumeCollector, SystemCollector, SelfMetricsCollector
|
||||
from exporters import GraphiteExporter, ConsoleExporter
|
||||
from aggregator import MetricsAggregator
|
||||
|
||||
@@ -17,11 +17,15 @@ class DockerMetricsCollector:
|
||||
self.running = True
|
||||
self.config = self._load_config()
|
||||
|
||||
# Initialize self-metrics collector first
|
||||
self.self_metrics = SelfMetricsCollector()
|
||||
|
||||
# Initialize collectors
|
||||
self.collectors = [
|
||||
ContainerCollector(),
|
||||
VolumeCollector(),
|
||||
SystemCollector()
|
||||
SystemCollector(),
|
||||
self.self_metrics # Include self-metrics in collection
|
||||
]
|
||||
|
||||
# Initialize aggregator
|
||||
@@ -96,8 +100,13 @@ class DockerMetricsCollector:
|
||||
metrics = collector.collect()
|
||||
all_metrics.extend(metrics)
|
||||
print(f" - {collector_name}: {len(metrics)} metrics")
|
||||
|
||||
# Track collector success
|
||||
self.self_metrics.record_collector_success(collector_name)
|
||||
except Exception as e:
|
||||
print(f" - Error in {collector.get_name()} collector: {e}")
|
||||
# Track collector error
|
||||
self.self_metrics.record_collector_error(collector.get_name())
|
||||
|
||||
# Aggregate metrics
|
||||
try:
|
||||
@@ -115,10 +124,16 @@ class DockerMetricsCollector:
|
||||
for exporter in self.exporters:
|
||||
try:
|
||||
exporter.export(all_metrics)
|
||||
self.self_metrics.record_export_success()
|
||||
except Exception as e:
|
||||
print(f" - Error exporting: {e}")
|
||||
self.self_metrics.record_export_error()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
# Record iteration metrics (excluding self-metrics from count to avoid recursion)
|
||||
self.self_metrics.record_iteration(elapsed, len(all_metrics))
|
||||
|
||||
print(f" Collection completed in {elapsed:.2f}s\n")
|
||||
|
||||
# Sleep until next iteration
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
docker>=7.0.0
|
||||
psutil>=5.9.0
|
||||
|
||||
Reference in New Issue
Block a user