Performance Metrics

Comprehensive performance monitoring and metrics collection for Athena.Cache. Track cache effectiveness and system performance, and identify optimization opportunities.

Core Metrics Collection

Enable detailed metrics collection for production monitoring.

Configuration

// Program.cs
// Registers Athena.Cache through AddAthenaCacheComplete and sets the monitoring options below.
builder.Services.AddAthenaCacheComplete(options =>
{
    // Enable metrics collection: 30-second collection interval, 24-hour retention
    options.Monitoring.EnableMetrics = true;
    options.Monitoring.MetricsCollectionInterval = TimeSpan.FromSeconds(30);
    options.Monitoring.RetainMetricsFor = TimeSpan.FromHours(24);
    
    // Detailed performance tracking: response times, key distribution, memory usage and error rates
    options.Monitoring.TrackResponseTimes = true;
    options.Monitoring.TrackCacheKeyDistribution = true;
    options.Monitoring.TrackMemoryUsage = true;
    options.Monitoring.TrackErrorRates = true;
    
    // Custom metrics: enabled, with MaxCustomMetrics set to 1000
    // NOTE(review): presumably MaxCustomMetrics caps the number of distinct custom metrics — confirm against the options type.
    options.Monitoring.EnableCustomMetrics = true;
    options.Monitoring.MaxCustomMetrics = 1000;
});

Metrics Collection Service

/// <summary>
/// Records cache hit, miss and error counters and, on a 30-second timer, assembles them into a
/// <see cref="CacheMetrics"/> snapshot that is stored and published.
/// </summary>
/// <remarks>
/// The collector owns a <see cref="Timer"/>; dispose the collector to stop periodic collection.
/// </remarks>
public class CacheMetricsCollector : ICacheMetricsCollector, IDisposable
{
    private readonly ConcurrentDictionary<string, MetricValue> _metrics = new();
    private readonly Timer _collectionTimer;
    private readonly ILogger<CacheMetricsCollector> _logger;

    /// <summary>
    /// Creates the collector and starts the collection timer (first run immediately, then every 30 seconds).
    /// </summary>
    /// <param name="logger">Logger used to report collection failures.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public CacheMetricsCollector(ILogger<CacheMetricsCollector> logger)
    {
        // Assign the logger before the timer is created: the first callback is due immediately
        // and may run before the constructor returns.
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _collectionTimer = new Timer(CollectMetrics, null, TimeSpan.Zero, TimeSpan.FromSeconds(30));
    }

    /// <summary>
    /// Timer callback: builds a snapshot of the current metrics, stores it and publishes it.
    /// Any exception is logged and swallowed so a failed collection does not escape the timer thread.
    /// </summary>
    /// <param name="state">Unused timer state (always <c>null</c>).</param>
    private void CollectMetrics(object state)
    {
        try
        {
            var currentMetrics = new CacheMetrics
            {
                Timestamp = DateTimeOffset.UtcNow,

                // Cache effectiveness
                HitRate = CalculateHitRate(),
                MissRate = CalculateMissRate(),
                HitCount = _metrics.GetValueOrDefault("hit_count", 0),
                MissCount = _metrics.GetValueOrDefault("miss_count", 0),

                // Performance metrics
                AverageResponseTime = CalculateAverageResponseTime(),
                P50ResponseTime = CalculatePercentileResponseTime(50),
                P95ResponseTime = CalculatePercentileResponseTime(95),
                P99ResponseTime = CalculatePercentileResponseTime(99),

                // Throughput metrics
                RequestsPerSecond = CalculateRequestsPerSecond(),
                CacheOperationsPerSecond = CalculateCacheOperationsPerSecond(),

                // Resource metrics
                MemoryUsage = GC.GetTotalMemory(false),
                CacheSize = GetCacheSize(),
                KeyCount = GetKeyCount(),

                // Error metrics
                ErrorRate = CalculateErrorRate(),
                TimeoutCount = _metrics.GetValueOrDefault("timeout_count", 0),
                ConnectionErrors = _metrics.GetValueOrDefault("connection_errors", 0)
            };

            // Store metrics for historical analysis
            StoreMetrics(currentMetrics);

            // Publish metrics to monitoring systems
            PublishMetrics(currentMetrics);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error collecting cache metrics");
        }
    }

    /// <summary>Records a cache hit: bumps <c>hit_count</c> and records the response time and key access.</summary>
    /// <param name="key">Cache key that was hit.</param>
    /// <param name="responseTime">Time taken to serve the request.</param>
    public void RecordCacheHit(string key, TimeSpan responseTime)
    {
        IncrementMetric("hit_count");
        RecordResponseTime(responseTime);
        RecordKeyAccess(key);
    }

    /// <summary>Records a cache miss: bumps <c>miss_count</c> and records the response time and key access.</summary>
    /// <param name="key">Cache key that was missed.</param>
    /// <param name="responseTime">Time taken to serve the request.</param>
    public void RecordCacheMiss(string key, TimeSpan responseTime)
    {
        IncrementMetric("miss_count");
        RecordResponseTime(responseTime);
        RecordKeyAccess(key);
    }

    /// <summary>
    /// Records a failed cache operation. Increments the overall and per-operation error counters and,
    /// for timeouts and connection failures, the matching dedicated counter.
    /// </summary>
    /// <param name="operation">Name of the operation that failed; used as the <c>error_{operation}</c> counter suffix.</param>
    /// <param name="exception">The exception that was raised.</param>
    public void RecordError(string operation, Exception exception)
    {
        IncrementMetric("error_count");
        IncrementMetric($"error_{operation}");

        if (exception is TimeoutException)
        {
            IncrementMetric("timeout_count");
        }
        else if (exception is ConnectionException)
        {
            IncrementMetric("connection_errors");
        }
    }

    /// <summary>Stops the collection timer and releases it.</summary>
    public void Dispose()
    {
        _collectionTimer.Dispose();
        GC.SuppressFinalize(this);
    }
}

Real-time Performance Dashboard

Create comprehensive dashboards for real-time monitoring.

Metrics API Controller

/// <summary>
/// Read-only HTTP API under <c>api/cache/metrics</c> exposing current and historical cache metrics,
/// cache-key and memory analysis, and health information.
/// </summary>
[ApiController]
[Route("api/cache/metrics")]
public class CacheMetricsController : ControllerBase
{
    private readonly ICacheMetricsCollector _metricsCollector;
    private readonly ICacheStatistics _statistics;
    private readonly ILogger<CacheMetricsController> _logger;

    /// <summary>Creates the controller with its required services.</summary>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public CacheMetricsController(
        ICacheMetricsCollector metricsCollector,
        ICacheStatistics statistics,
        ILogger<CacheMetricsController> logger)
    {
        _metricsCollector = metricsCollector ?? throw new ArgumentNullException(nameof(metricsCollector));
        _statistics = statistics ?? throw new ArgumentNullException(nameof(statistics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>Returns the most recent metrics snapshot from the collector.</summary>
    [HttpGet("current")]
    public async Task<ActionResult<CacheMetrics>> GetCurrentMetrics()
    {
        return Ok(await _metricsCollector.GetCurrentMetricsAsync());
    }

    /// <summary>Returns historical metrics for a time range, sampled at the requested interval.</summary>
    /// <param name="startTime">Range start; defaults to one hour before now (UTC).</param>
    /// <param name="endTime">Range end; defaults to now (UTC).</param>
    /// <param name="intervalMinutes">Sampling interval in minutes; must be greater than zero.</param>
    /// <returns>200 with the metrics, or 400 when the interval or the range is invalid.</returns>
    [HttpGet("history")]
    public async Task<ActionResult<IEnumerable<CacheMetrics>>> GetMetricsHistory(
        [FromQuery] DateTime? startTime = null,
        [FromQuery] DateTime? endTime = null,
        [FromQuery] int intervalMinutes = 5)
    {
        // A zero or negative interval cannot describe a sampling step; reject it before querying.
        if (intervalMinutes <= 0)
        {
            return BadRequest("intervalMinutes must be greater than zero.");
        }

        var start = startTime ?? DateTime.UtcNow.AddHours(-1);
        var end = endTime ?? DateTime.UtcNow;

        if (start > end)
        {
            return BadRequest("startTime must not be later than endTime.");
        }

        var metrics = await _metricsCollector.GetMetricsHistoryAsync(start, end, TimeSpan.FromMinutes(intervalMinutes));
        return Ok(metrics);
    }

    /// <summary>Projects the current statistics into a <see cref="PerformanceMetrics"/> payload.</summary>
    [HttpGet("performance")]
    public async Task<ActionResult<PerformanceMetrics>> GetPerformanceMetrics()
    {
        var stats = await _statistics.GetCurrentStatsAsync();

        return Ok(new PerformanceMetrics
        {
            // Response time metrics
            AverageResponseTime = stats.AverageResponseTime,
            MedianResponseTime = stats.MedianResponseTime,
            P95ResponseTime = stats.P95ResponseTime,
            P99ResponseTime = stats.P99ResponseTime,

            // Throughput metrics
            RequestsPerSecond = stats.RequestsPerSecond,
            CacheOperationsPerSecond = stats.CacheOperationsPerSecond,
            PeakRequestsPerSecond = stats.PeakRequestsPerSecond,

            // Cache effectiveness
            HitRate = stats.HitRate,
            MissRate = stats.MissRate,
            EvictionRate = stats.EvictionRate,

            // Resource utilization
            MemoryUsage = stats.MemoryUsage,
            CpuUsage = stats.CpuUsage,
            NetworkLatency = stats.NetworkLatency,

            // Error metrics
            ErrorRate = stats.ErrorRate,
            TimeoutRate = stats.TimeoutRate,
            ConnectionFailureRate = stats.ConnectionFailureRate
        });
    }

    /// <summary>
    /// Returns the cache-key analysis, limiting the most-accessed, least-accessed and largest key
    /// lists to 20 entries each.
    /// </summary>
    [HttpGet("cache-keys/analysis")]
    public async Task<ActionResult<CacheKeyAnalysis>> GetCacheKeyAnalysis()
    {
        var analysis = await _statistics.AnalyzeCacheKeysAsync();

        return Ok(new CacheKeyAnalysis
        {
            TotalKeys = analysis.TotalKeys,
            MostAccessedKeys = analysis.MostAccessedKeys.Take(20),
            LeastAccessedKeys = analysis.LeastAccessedKeys.Take(20),
            LargestKeys = analysis.LargestKeys.Take(20),
            KeysByController = analysis.KeysByController,
            KeysByPattern = analysis.KeysByPattern,
            ExpirationDistribution = analysis.ExpirationDistribution
        });
    }

    /// <summary>Returns process-level GC and memory figures together with pool and cache memory statistics.</summary>
    [HttpGet("memory/analysis")]
    public async Task<ActionResult<MemoryAnalysis>> GetMemoryAnalysis()
    {
        return Ok(new MemoryAnalysis
        {
            TotalMemoryUsage = GC.GetTotalMemory(false),
            Gen0Collections = GC.CollectionCount(0),
            Gen1Collections = GC.CollectionCount(1),
            Gen2Collections = GC.CollectionCount(2),
            TotalPauseDuration = GC.GetTotalPauseDuration(),
            WorkingSet = Environment.WorkingSet,

            // Pool statistics
            StringPoolStats = GetStringPoolStats(),
            CollectionPoolStats = GetCollectionPoolStats(),

            // Cache-specific memory
            CacheMemoryUsage = await _statistics.GetCacheMemoryUsageAsync(),
            KeyMemoryDistribution = await _statistics.GetKeyMemoryDistributionAsync()
        });
    }

    /// <summary>Returns overall and per-component health, active alerts, service-level indicators and trends.</summary>
    [HttpGet("health")]
    public async Task<ActionResult<CacheHealthMetrics>> GetHealthMetrics()
    {
        var health = await _statistics.GetHealthMetricsAsync();

        return Ok(new CacheHealthMetrics
        {
            OverallHealth = health.OverallHealth,
            ComponentHealth = health.ComponentHealth,
            Alerts = health.ActiveAlerts,

            // Service-level indicators
            Availability = health.Availability,
            Reliability = health.Reliability,
            Latency = health.Latency,
            Throughput = health.Throughput,

            // Trend indicators
            HealthTrend = health.HealthTrend,
            PerformanceTrend = health.PerformanceTrend,
            ErrorTrend = health.ErrorTrend
        });
    }
}

Performance Dashboard HTML

<!DOCTYPE html>
<html>
<head>
    <!-- Declare the encoding explicitly: the page contains non-ASCII text (the emoji in the heading). -->
    <meta charset="utf-8">
    <!-- Scale to the device width so the responsive grid below behaves on small screens. -->
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Athena Cache Performance Dashboard</title>
    <!-- Chart.js renders the line and doughnut charts created in the page script. -->
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <!-- SignalR client: loaded here although the inline script on this page only polls via fetch. -->
    <script src="https://unpkg.com/@microsoft/signalr/dist/browser/signalr.min.js"></script>
    <style>
        /* Responsive card grid: as many 400px-minimum columns as fit. */
        .dashboard {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
            gap: 20px;
            padding: 20px;
        }
        .metric-card {
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            padding: 20px;
        }
        .metric-value {
            font-size: 2.5rem;
            font-weight: bold;
            color: #2c3e50;
        }
        .metric-label {
            color: #7f8c8d;
            font-size: 0.9rem;
        }
        /* Fixed-height wrapper; the charts are created with maintainAspectRatio: false. */
        .chart-container {
            position: relative;
            height: 300px;
            width: 100%;
        }
        /* Health colours applied by getHealthStatusClass(). */
        .status-good { color: #27ae60; }
        .status-warning { color: #f39c12; }
        .status-error { color: #e74c3c; }
    </style>
</head>
<body>
    <h1>🏛️ Athena Cache Performance Dashboard</h1>
    
    <!-- Dashboard grid: each .metric-card below is one tile; the page script fills in values by element id. -->
    <div class="dashboard">
        <!-- Key Performance Indicators: values written by updateDashboard() from /api/cache/metrics/current -->
        <div class="metric-card">
            <h3>Cache Hit Rate</h3>
            <div class="metric-value" id="hit-rate">--%</div>
            <div class="metric-label">Current hit rate</div>
        </div>
        
        <div class="metric-card">
            <h3>Response Time</h3>
            <div class="metric-value" id="response-time">-- ms</div>
            <div class="metric-label">P95 response time</div>
        </div>
        
        <div class="metric-card">
            <h3>Throughput</h3>
            <div class="metric-value" id="throughput">-- /s</div>
            <div class="metric-label">Requests per second</div>
        </div>
        
        <div class="metric-card">
            <h3>Memory Usage</h3>
            <div class="metric-value" id="memory-usage">-- MB</div>
            <div class="metric-label">Total memory consumption</div>
        </div>
        
        <!-- Performance Charts: each canvas is bound to a Chart.js line chart created in the page script -->
        <div class="metric-card" style="grid-column: span 2;">
            <h3>Response Time Trends</h3>
            <div class="chart-container">
                <canvas id="responseTimeChart"></canvas>
            </div>
        </div>
        
        <div class="metric-card" style="grid-column: span 2;">
            <h3>Hit Rate Trends</h3>
            <div class="chart-container">
                <canvas id="hitRateChart"></canvas>
            </div>
        </div>
        
        <div class="metric-card" style="grid-column: span 2;">
            <h3>Throughput Analysis</h3>
            <div class="chart-container">
                <canvas id="throughputChart"></canvas>
            </div>
        </div>
        
        <!-- Cache Key Analysis: #top-keys is filled from /api/cache/metrics/cache-keys/analysis -->
        <div class="metric-card">
            <h3>Top Cache Keys</h3>
            <div id="top-keys">Loading...</div>
        </div>
        
        <!-- Doughnut chart fed from /api/cache/metrics/memory/analysis -->
        <div class="metric-card">
            <h3>Memory Distribution</h3>
            <div class="chart-container">
                <canvas id="memoryDistributionChart"></canvas>
            </div>
        </div>
        
        <!-- Health Status: #health-status and #health-details are filled from /api/cache/metrics/health -->
        <div class="metric-card">
            <h3>System Health</h3>
            <div class="metric-value" id="health-status">Unknown</div>
            <div class="metric-label">Overall system status</div>
            <div id="health-details"></div>
        </div>
    </div>

    <script>
        // Initialize charts
        //
        // The three trend charts share one shape (line chart, empty labels, responsive,
        // fixed container height) and differ only in their series and y-axis settings,
        // so they are built through the two small factories below.

        // Builds one line series with the shared curve tension and an empty data array.
        function trendSeries(label, borderColor, backgroundColor) {
            return {
                label: label,
                data: [],
                borderColor: borderColor,
                backgroundColor: backgroundColor,
                tension: 0.4
            };
        }

        // Creates a line chart on the canvas with the given id, series and y-axis options.
        function createTrendChart(canvasId, datasets, yAxis) {
            return new Chart(document.getElementById(canvasId), {
                type: 'line',
                data: {
                    labels: [],
                    datasets: datasets
                },
                options: {
                    responsive: true,
                    maintainAspectRatio: false,
                    scales: {
                        y: yAxis
                    }
                }
            });
        }

        // Y-axis title block in the form Chart.js expects.
        function axisTitle(text) {
            return { display: true, text: text };
        }

        const responseTimeChart = createTrendChart(
            'responseTimeChart',
            [
                trendSeries('P95 Response Time (ms)', '#3498db', 'rgba(52, 152, 219, 0.1)'),
                trendSeries('Average Response Time (ms)', '#2ecc71', 'rgba(46, 204, 113, 0.1)')
            ],
            { beginAtZero: true, title: axisTitle('Response Time (ms)') }
        );

        // Hit rate is a percentage, so the axis is pinned to 0-100.
        const hitRateChart = createTrendChart(
            'hitRateChart',
            [trendSeries('Hit Rate (%)', '#e74c3c', 'rgba(231, 76, 60, 0.1)')],
            { min: 0, max: 100, title: axisTitle('Hit Rate (%)') }
        );

        const throughputChart = createTrendChart(
            'throughputChart',
            [trendSeries('Requests/sec', '#f39c12', 'rgba(243, 156, 18, 0.1)')],
            { beginAtZero: true, title: axisTitle('Requests per Second') }
        );

        // Doughnut with four fixed slices; the values are replaced on every refresh.
        const memorySliceLabels = ['Cache Data', 'String Pool', 'Collection Pool', 'Other'];
        const memorySliceColors = ['#3498db', '#2ecc71', '#f39c12', '#95a5a6'];
        const memoryDistributionChart = new Chart(document.getElementById('memoryDistributionChart'), {
            type: 'doughnut',
            data: {
                labels: memorySliceLabels,
                datasets: [{
                    data: [0, 0, 0, 0],
                    backgroundColor: memorySliceColors
                }]
            },
            options: {
                responsive: true,
                maintainAspectRatio: false
            }
        });

        // Real-time data updates
        // Real-time data updates
        //
        // Fetches the current-metrics, cache-key, memory and health endpoints and pushes
        // the results into the KPI cards and charts. The four requests are independent:
        // a failure in one is logged to the console and does not block the others.
        function updateDashboard() {
            // Rejects on non-2xx responses so an error payload is never treated as metrics data.
            const fetchJson = url => fetch(url).then(response => {
                if (!response.ok) {
                    throw new Error(`Request to ${url} failed with status ${response.status}`);
                }
                return response.json();
            });

            // Escapes a value before it is interpolated into innerHTML. Cache keys and
            // component names come from the server and must not be interpreted as markup.
            const htmlEscapes = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
            const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => htmlEscapes[ch]);

            fetchJson('/api/cache/metrics/current')
                .then(data => {
                    // Update KPI values
                    document.getElementById('hit-rate').textContent = data.hitRate.toFixed(1) + '%';
                    document.getElementById('response-time').textContent = data.p95ResponseTime.toFixed(1) + ' ms';
                    document.getElementById('throughput').textContent = data.requestsPerSecond.toFixed(0) + ' /s';
                    // memoryUsage is converted from bytes to megabytes for display.
                    document.getElementById('memory-usage').textContent = (data.memoryUsage / 1024 / 1024).toFixed(1) + ' MB';

                    // Update charts
                    const now = new Date().toLocaleTimeString();

                    // Response time chart
                    responseTimeChart.data.labels.push(now);
                    responseTimeChart.data.datasets[0].data.push(data.p95ResponseTime);
                    responseTimeChart.data.datasets[1].data.push(data.averageResponseTime);

                    // Hit rate chart
                    hitRateChart.data.labels.push(now);
                    hitRateChart.data.datasets[0].data.push(data.hitRate);

                    // Throughput chart
                    throughputChart.data.labels.push(now);
                    throughputChart.data.datasets[0].data.push(data.requestsPerSecond);

                    // Limit data points: keep a sliding window of the latest 20 samples.
                    const maxPoints = 20;
                    [responseTimeChart, hitRateChart, throughputChart].forEach(chart => {
                        if (chart.data.labels.length > maxPoints) {
                            chart.data.labels.shift();
                            chart.data.datasets.forEach(dataset => dataset.data.shift());
                        }
                        // 'none' skips the update animation.
                        chart.update('none');
                    });
                })
                .catch(error => console.error('Error fetching metrics:', error));

            // Update cache key analysis
            fetchJson('/api/cache/metrics/cache-keys/analysis')
                .then(data => {
                    const topKeysHtml = (data.mostAccessedKeys || [])
                        .slice(0, 10)
                        .map(key => `<div><code>${escapeHtml(key.key)}</code> (${escapeHtml(key.accessCount)} hits)</div>`)
                        .join('');
                    document.getElementById('top-keys').innerHTML = topKeysHtml;
                })
                .catch(error => console.error('Error fetching cache key analysis:', error));

            // Update memory distribution
            fetchJson('/api/cache/metrics/memory/analysis')
                .then(data => {
                    const cacheBytes = data.cacheMemoryUsage;
                    const stringPoolBytes = data.stringPoolStats.memoryUsage;
                    const collectionPoolBytes = data.collectionPoolStats.memoryUsage;
                    // "Other" is the remainder of the total; clamp at zero because a negative
                    // slice cannot be drawn if the parts add up to more than the reported total.
                    const otherBytes = Math.max(0, data.totalMemoryUsage - cacheBytes - stringPoolBytes - collectionPoolBytes);

                    memoryDistributionChart.data.datasets[0].data = [
                        cacheBytes,
                        stringPoolBytes,
                        collectionPoolBytes,
                        otherBytes
                    ];
                    memoryDistributionChart.update();
                })
                .catch(error => console.error('Error fetching memory analysis:', error));

            // Update health status
            fetchJson('/api/cache/metrics/health')
                .then(data => {
                    const statusElement = document.getElementById('health-status');
                    statusElement.textContent = data.overallHealth;
                    statusElement.className = 'metric-value ' + getHealthStatusClass(data.overallHealth);

                    // The class attribute only ever receives one of the fixed strings returned by
                    // getHealthStatusClass(); the component name and status text are escaped.
                    const detailsHtml = Object.entries(data.componentHealth || {})
                        .map(([component, status]) => `<div>${escapeHtml(component)}: <span class="${getHealthStatusClass(status)}">${escapeHtml(status)}</span></div>`)
                        .join('');
                    document.getElementById('health-details').innerHTML = detailsHtml;
                })
                .catch(error => console.error('Error fetching health metrics:', error));
        }

        // Maps a health status to the CSS class that colours it.
        // Matching is case-insensitive; unknown, missing or non-string statuses map to ''.
        function getHealthStatusClass(status) {
            // Coerce before lower-casing: null/undefined or a numerically serialized
            // status has no toLowerCase() and would otherwise throw.
            const normalized = status == null ? '' : String(status).toLowerCase();
            switch (normalized) {
                case 'healthy': return 'status-good';
                case 'degraded': return 'status-warning';
                case 'unhealthy': return 'status-error';
                default: return '';
            }
        }

        // Render once on load, then refresh on a fixed 10-second (10000 ms) poll.
        const refreshIntervalMs = 10000;
        updateDashboard();
        setInterval(updateDashboard, refreshIntervalMs);
    </script>
</body>
</html>

Custom Metrics and Alerts

Implement custom metrics for business-specific monitoring.

Custom Metrics Configuration

/// <summary>
/// In-memory store for application-defined metrics (last-value metrics and counters) that
/// evaluates simple alert thresholds whenever a metric changes.
/// </summary>
public class CustomMetricsService : ICustomMetricsService
{
    private readonly ConcurrentDictionary<string, CustomMetric> _customMetrics = new();
    private readonly ILogger<CustomMetricsService> _logger;

    // Serializes counter read-modify-write cycles; Interlocked cannot be used on a double value.
    private readonly object _counterGate = new();

    /// <summary>Creates the service.</summary>
    /// <param name="logger">Logger that receives triggered alerts.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public CustomMetricsService(ILogger<CustomMetricsService> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Stores <paramref name="value"/> as the latest value of the metric <paramref name="name"/>
    /// and checks the alert conditions for it.
    /// </summary>
    /// <param name="name">Metric name; also the storage key.</param>
    /// <param name="value">New value.</param>
    /// <param name="tags">Optional tags; an empty dictionary is used when <c>null</c>.</param>
    public void RecordCustomMetric(string name, double value, Dictionary<string, string> tags = null)
    {
        var metric = new CustomMetric
        {
            Name = name,
            Value = value,
            Tags = tags ?? new Dictionary<string, string>(),
            Timestamp = DateTimeOffset.UtcNow
        };

        // An existing entry keeps its identity (and its original tags); only value and timestamp move.
        _customMetrics.AddOrUpdate(name, metric, (key, existing) =>
        {
            existing.Value = value;
            existing.Timestamp = DateTimeOffset.UtcNow;
            return existing;
        });

        // Check for alert conditions
        CheckAlertConditions(metric);
    }

    /// <summary>
    /// Adds one to the counter identified by <paramref name="name"/> and its tag values, creating it
    /// at zero on first use, then checks the alert conditions for the counter.
    /// </summary>
    /// <param name="name">Counter name.</param>
    /// <param name="tags">Optional tags; their values are part of the storage key.</param>
    public void IncrementCounter(string name, Dictionary<string, string> tags = null)
    {
        var key = $"{name}_{string.Join("_", tags?.Values ?? Array.Empty<string>())}";
        var current = _customMetrics.GetOrAdd(key, _ => new CustomMetric
        {
            Name = name,
            Value = 0,
            Tags = tags ?? new Dictionary<string, string>(),
            Timestamp = DateTimeOffset.UtcNow
        });

        // The increment is a read-modify-write on a double, so it is guarded by a lock.
        lock (_counterGate)
        {
            current.Value += 1;
            current.Timestamp = DateTimeOffset.UtcNow;
        }

        // Counter thresholds (such as the error-count alert) are only reachable from here.
        CheckAlertConditions(current);
    }

    /// <summary>
    /// Records one business operation: its duration, an operation count and, on failure, an error count.
    /// </summary>
    /// <param name="operation">Operation name, stored in the <c>operation</c> tag.</param>
    /// <param name="duration">How long the operation took.</param>
    /// <param name="success">Whether the operation succeeded; stored as <c>"true"</c> or <c>"false"</c>.</param>
    public void RecordBusinessMetric(string operation, TimeSpan duration, bool success)
    {
        var tags = new Dictionary<string, string>
        {
            ["operation"] = operation,
            ["success"] = success ? "true" : "false"
        };

        RecordCustomMetric("business_operation_duration_ms", duration.TotalMilliseconds, tags);
        IncrementCounter("business_operation_count", tags);

        if (!success)
        {
            IncrementCounter("business_operation_errors", new Dictionary<string, string> { ["operation"] = operation });
        }
    }

    /// <summary>Triggers an alert when <paramref name="metric"/> crosses one of the example thresholds.</summary>
    private void CheckAlertConditions(CustomMetric metric)
    {
        // Example: Alert if error rate exceeds threshold
        if (metric.Name == "business_operation_errors" && metric.Value > 10)
        {
            TriggerAlert($"High error rate detected: {metric.Value} errors for {metric.Tags.GetValueOrDefault("operation", "unknown")}");
        }

        // Example: Alert if response time is too high
        if (metric.Name == "business_operation_duration_ms" && metric.Value > 5000)
        {
            TriggerAlert($"Slow operation detected: {metric.Value}ms for {metric.Tags.GetValueOrDefault("operation", "unknown")}");
        }
    }

    /// <summary>Logs the alert as a warning.</summary>
    private void TriggerAlert(string message)
    {
        _logger.LogWarning("ALERT: {Message}", message);

        // Send to alerting system (PagerDuty, Slack, etc.)
        // Implementation depends on your alerting infrastructure
    }
}

// Usage in business logic
/// <summary>
/// Returns the product with the given id and records the call as a <c>get_product</c> business metric.
/// </summary>
/// <param name="id">Product identifier.</param>
/// <param name="customMetrics">Metrics service injected per request.</param>
/// <returns>200 with the product, or 500 when loading it throws.</returns>
[HttpGet("{id}")]
[AthenaCache(ExpirationMinutes = 30)]
public async Task<ActionResult<ProductDto>> GetProduct(
    int id,
    [FromServices] ICustomMetricsService customMetrics)
{
    var timer = Stopwatch.StartNew();
    var succeeded = false;

    try
    {
        var loaded = await _productService.GetProductAsync(id);
        succeeded = true;
        return Ok(loaded);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Failed to get product {ProductId}", id);
        return StatusCode(500);
    }
    finally
    {
        // Runs on both paths, so successes and failures are measured alike.
        timer.Stop();
        customMetrics.RecordBusinessMetric("get_product", timer.Elapsed, succeeded);
    }
}

Integration with Monitoring Systems

OpenTelemetry Integration

// Program.cs
// Registers the OpenTelemetry metrics pipeline: ASP.NET Core, HttpClient and Athena.Cache
// instrumentation, exported to Prometheus and the console.
// The lambda parameter is named meterProvider so it does not shadow the application builder.
// NOTE(review): newer OpenTelemetry.Extensions.Hosting releases replace AddOpenTelemetryMetrics
// with AddOpenTelemetry().WithMetrics(...) — confirm against the package version in use.
builder.Services.AddOpenTelemetryMetrics(meterProvider =>
    meterProvider
        .AddAspNetCoreInstrumentation()
        .AddHttpClientInstrumentation()
        .AddAthenaCacheInstrumentation() // Custom instrumentation
        .AddPrometheusExporter()
        .AddConsoleExporter());

/// <summary>
/// Publishes Athena.Cache hit/miss counters, an operation-duration histogram and a cache-size gauge
/// through the <c>Athena.Cache</c> meter, and emits a tracing activity per recorded operation.
/// </summary>
public class AthenaCacheInstrumentation
{
    private static readonly ActivitySource ActivitySource = new("Athena.Cache");
    private static readonly Meter Meter = new("Athena.Cache");

    private readonly Counter<long> _cacheHitCounter;
    private readonly Counter<long> _cacheMissCounter;
    private readonly Histogram<double> _cacheOperationDuration;
    private readonly ObservableGauge<long> _cacheSize;

    /// <summary>Creates the instruments on the shared meter.</summary>
    public AthenaCacheInstrumentation()
    {
        _cacheHitCounter = Meter.CreateCounter<long>(
            "athena_cache_hits_total",
            description: "Total number of cache hits");

        _cacheMissCounter = Meter.CreateCounter<long>(
            "athena_cache_misses_total",
            description: "Total number of cache misses");

        _cacheOperationDuration = Meter.CreateHistogram<double>(
            "athena_cache_operation_duration_ms",
            unit: "ms",
            description: "Cache operation duration in milliseconds");

        // The gauge reads the cache size through the callback each time it is observed.
        _cacheSize = Meter.CreateObservableGauge<long>(
            "athena_cache_size_bytes",
            description: "Current cache size in bytes",
            observeValue: () => GetCurrentCacheSize());
    }

    /// <summary>Records a cache hit for <paramref name="controller"/>.</summary>
    /// <param name="key">Cache key; attached to the activity only, not to the metric tags.</param>
    /// <param name="controller">Controller name; used as the <c>controller</c> tag.</param>
    /// <param name="duration">Duration of the cache operation.</param>
    public void RecordCacheHit(string key, string controller, TimeSpan duration)
    {
        RecordOperation(_cacheHitCounter, "hit", key, controller, duration);
    }

    /// <summary>Records a cache miss for <paramref name="controller"/>.</summary>
    /// <param name="key">Cache key; attached to the activity only, not to the metric tags.</param>
    /// <param name="controller">Controller name; used as the <c>controller</c> tag.</param>
    /// <param name="duration">Duration of the cache operation.</param>
    public void RecordCacheMiss(string key, string controller, TimeSpan duration)
    {
        RecordOperation(_cacheMissCounter, "miss", key, controller, duration);
    }

    /// <summary>
    /// Shared hit/miss path: starts a <c>cache.{outcome}</c> activity, increments
    /// <paramref name="counter"/> and records the duration, tagging both measurements
    /// with the controller and the outcome.
    /// </summary>
    private void RecordOperation(Counter<long> counter, string outcome, string key, string controller, TimeSpan duration)
    {
        using var activity = ActivitySource.StartActivity("cache." + outcome);
        activity?.SetTag("cache.key", key);
        activity?.SetTag("cache.controller", controller);

        // TagList is populated through Add(key, value); its indexer takes an int position,
        // so the ["name"] = value initializer form cannot be used.
        var tags = new TagList
        {
            { "controller", controller },
            { "operation", outcome }
        };

        counter.Add(1, tags);
        _cacheOperationDuration.Record(duration.TotalMilliseconds, tags);
    }
}

Prometheus Metrics Export

/// <summary>
/// Renders the current cache statistics in the Prometheus text exposition format (version 0.0.4).
/// </summary>
/// <param name="stats">Statistics source injected per request.</param>
/// <returns>A <c>text/plain</c> response containing the metric samples.</returns>
[HttpGet("metrics")]
public async Task<IActionResult> GetPrometheusMetrics([FromServices] ICacheStatistics stats)
{
    var metrics = await stats.GetCurrentStatsAsync();

    var prometheusMetrics = new StringBuilder();

    // The exposition format is line-feed terminated, so '\n' is written explicitly
    // instead of the platform newline that AppendLine would emit.
    void Line(string text) => prometheusMetrics.Append(text).Append('\n');

    // Sample values are formatted with the invariant culture so that decimals always use '.'.
    void Sample(FormattableString sample) => Line(FormattableString.Invariant(sample));

    // Cache hit rate
    Line("# HELP athena_cache_hit_rate Cache hit rate percentage");
    Line("# TYPE athena_cache_hit_rate gauge");
    Sample($"athena_cache_hit_rate {metrics.HitRate}");

    // Response time: every bucket sample needs its own "le" label, plus a +Inf bucket and _count.
    // NOTE(review): assumes the LeNNN values are cumulative counts — confirm against ResponseTimeBuckets.
    // NOTE(review): no total-duration figure is available here, so no _sum sample is emitted.
    Line("# HELP athena_cache_response_time_ms Response time in milliseconds");
    Line("# TYPE athena_cache_response_time_ms histogram");
    Sample($"athena_cache_response_time_ms_bucket{{le=\"50\"}} {metrics.ResponseTimeBuckets.Le50}");
    Sample($"athena_cache_response_time_ms_bucket{{le=\"100\"}} {metrics.ResponseTimeBuckets.Le100}");
    Sample($"athena_cache_response_time_ms_bucket{{le=\"250\"}} {metrics.ResponseTimeBuckets.Le250}");
    Sample($"athena_cache_response_time_ms_bucket{{le=\"500\"}} {metrics.ResponseTimeBuckets.Le500}");
    Sample($"athena_cache_response_time_ms_bucket{{le=\"+Inf\"}} {metrics.TotalRequests}");
    Sample($"athena_cache_response_time_ms_count {metrics.TotalRequests}");

    // Memory usage
    Line("# HELP athena_cache_memory_usage_bytes Memory usage in bytes");
    Line("# TYPE athena_cache_memory_usage_bytes gauge");
    Sample($"athena_cache_memory_usage_bytes {metrics.MemoryUsage}");

    // Request rate
    Line("# HELP athena_cache_requests_per_second Requests per second");
    Line("# TYPE athena_cache_requests_per_second gauge");
    Sample($"athena_cache_requests_per_second {metrics.RequestsPerSecond}");

    return Content(prometheusMetrics.ToString(), "text/plain; version=0.0.4");
}

Performance Baselines and SLOs

Establish service level objectives and track against baselines.

SLO Configuration

/// <summary>
/// Default service-level objectives for the cache. Each objective is a settable
/// <see cref="SLOConfig"/> initialised with the defaults below.
/// </summary>
public class CacheServiceLevelObjectives
{
    /// <summary>Availability objective: 99.9% over a 30-day window.</summary>
    public SLOConfig Availability { get; set; } = new SLOConfig
    {
        Target = 99.9,
        Window = TimeSpan.FromDays(30)
    };

    /// <summary>Latency objective: 95% of requests under the 100 ms threshold, over a 1-hour window.</summary>
    public SLOConfig Latency { get; set; } = new SLOConfig
    {
        Target = 95.0,
        Threshold = 100,
        Window = TimeSpan.FromHours(1)
    };

    /// <summary>Hit-rate objective: at least 80% over a 1-hour window.</summary>
    public SLOConfig HitRate { get; set; } = new SLOConfig
    {
        Target = 80.0,
        Window = TimeSpan.FromHours(1)
    };

    /// <summary>Error-rate objective: less than 1% over a 1-hour window.</summary>
    public SLOConfig ErrorRate { get; set; } = new SLOConfig
    {
        Target = 1.0,
        Window = TimeSpan.FromHours(1)
    };
}

/// <summary>
/// Background service that evaluates the cache statistics against the configured SLOs every five
/// minutes and logs any violation.
/// </summary>
public class SLOMonitoringService : BackgroundService
{
    private readonly ICacheStatistics _stats;
    private readonly CacheServiceLevelObjectives _slos;
    private readonly ILogger<SLOMonitoringService> _logger;

    /// <summary>Creates the service with its required dependencies.</summary>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public SLOMonitoringService(
        ICacheStatistics stats,
        CacheServiceLevelObjectives slos,
        ILogger<SLOMonitoringService> logger)
    {
        _stats = stats ?? throw new ArgumentNullException(nameof(stats));
        _slos = slos ?? throw new ArgumentNullException(nameof(slos));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the check loop until <paramref name="stoppingToken"/> is cancelled. A failed check is
    /// logged and the loop continues with the next cycle.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await CheckSLOs();
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error checking SLOs");
            }

            await Task.Delay(TimeSpan.FromMinutes(5), stoppingToken);
        }
    }

    /// <summary>
    /// Loads the statistics for the latency SLO window and logs latency, hit-rate, error-rate and
    /// error-budget violations.
    /// </summary>
    private async Task CheckSLOs()
    {
        var stats = await _stats.GetStatsForWindowAsync(_slos.Latency.Window);

        // Check latency SLO
        var latencyCompliance = CalculateLatencyCompliance(stats);
        if (latencyCompliance < _slos.Latency.Target)
        {
            _logger.LogWarning("Latency SLO violation: {Compliance}% (target: {Target}%)",
                latencyCompliance, _slos.Latency.Target);
        }

        // Check hit rate SLO
        if (stats.HitRate < _slos.HitRate.Target)
        {
            _logger.LogWarning("Hit rate SLO violation: {HitRate}% (target: {Target}%)",
                stats.HitRate, _slos.HitRate.Target);
        }

        // Check error rate SLO
        if (stats.ErrorRate > _slos.ErrorRate.Target)
        {
            _logger.LogError("Error rate SLO violation: {ErrorRate}% (target: < {Target}%)",
                stats.ErrorRate, _slos.ErrorRate.Target);
        }

        // Calculate error budget burn rate
        var errorBudgetBurnRate = CalculateErrorBudgetBurnRate(stats);
        if (errorBudgetBurnRate > 1.0) // Burning error budget faster than sustainable
        {
            _logger.LogWarning("Error budget burn rate is {BurnRate}x the sustainable rate",
                errorBudgetBurnRate);
        }
    }

    /// <summary>
    /// Percentage of requests served under the latency threshold; 100 when there were no requests.
    /// </summary>
    private double CalculateLatencyCompliance(CacheStatistics stats)
    {
        var totalRequests = stats.TotalRequests;
        var requestsUnderThreshold = stats.RequestsUnderThreshold(_slos.Latency.Threshold);

        return totalRequests > 0 ? (requestsUnderThreshold / (double)totalRequests) * 100 : 100;
    }

    /// <summary>
    /// Ratio of the observed error rate to the error rate allowed by the availability target.
    /// A value above 1 means the error budget is being consumed faster than the target allows.
    /// </summary>
    private double CalculateErrorBudgetBurnRate(CacheStatistics stats)
    {
        var allowedErrorRate = 100 - _slos.Availability.Target; // 0.1% for 99.9% availability
        var actualErrorRate = stats.ErrorRate;

        // A target of 100% (or more) leaves no budget. Report "no burn" when there are no errors
        // rather than letting 0/0 produce NaN, and an unbounded burn otherwise.
        if (allowedErrorRate <= 0)
        {
            return actualErrorRate > 0 ? double.PositiveInfinity : 0;
        }

        return actualErrorRate / allowedErrorRate;
    }
}

Troubleshooting Performance Issues

Performance Diagnostic Tools

/// <summary>
/// Builds a point-in-time performance diagnostics report: system metrics, detailed cache statistics,
/// profiler hotspots, operations slower than 100 ms, memory analysis and tuning recommendations.
/// </summary>
/// <param name="stats">Cache statistics source injected per request.</param>
/// <param name="profiler">Cache profiler injected per request.</param>
/// <returns>200 with the diagnostics payload.</returns>
[HttpGet("diagnostics/performance")]
public async Task<IActionResult> DiagnosePerformance(
    [FromServices] ICacheStatistics stats,
    [FromServices] ICacheProfiler profiler)
{
    // Process is IDisposable; take a single instance for both reads and release it afterwards.
    using var currentProcess = Process.GetCurrentProcess();

    var diagnostics = new PerformanceDiagnostics
    {
        Timestamp = DateTimeOffset.UtcNow,

        // System metrics
        SystemMetrics = new()
        {
            CpuUsage = await GetCpuUsageAsync(),
            MemoryUsage = GC.GetTotalMemory(false),
            ThreadCount = currentProcess.Threads.Count,
            HandleCount = currentProcess.HandleCount
        },

        // Cache metrics
        CacheMetrics = await stats.GetDetailedStatsAsync(),

        // Performance hotspots
        PerformanceHotspots = await profiler.GetHotspotsAsync(),

        // Slow operations
        SlowOperations = await profiler.GetSlowOperationsAsync(TimeSpan.FromMilliseconds(100)),

        // Memory analysis
        MemoryAnalysis = await AnalyzeMemoryUsageAsync(),

        // Recommendations
        Recommendations = GeneratePerformanceRecommendations(await stats.GetCurrentStatsAsync())
    };

    return Ok(diagnostics);
}

/// <summary>
/// Turns the current statistics into a list of tuning suggestions. Each rule that matches
/// contributes two recommendations, in the order hit rate, response time, memory, error rate.
/// </summary>
/// <param name="stats">Statistics to evaluate.</param>
/// <returns>The recommendations; empty when no rule matches.</returns>
private List<string> GeneratePerformanceRecommendations(CacheStatistics stats)
{
    var advice = new List<string>();

    // Appends the given suggestions when the rule applies.
    void AddWhen(bool applies, params string[] suggestions)
    {
        if (applies)
        {
            advice.AddRange(suggestions);
        }
    }

    AddWhen(stats.HitRate < 70,
        "Consider increasing cache expiration times to improve hit rate",
        "Review cache key patterns for potential improvements");

    AddWhen(stats.AverageResponseTime > 50,
        "Enable Source Generator for better performance",
        "Review serialization settings for optimization");

    // 512MB
    AddWhen(stats.MemoryUsage > 512 * 1024 * 1024,
        "Enable memory pressure management",
        "Consider reducing cache size or implementing eviction policies");

    AddWhen(stats.ErrorRate > 1,
        "Investigate cache errors and implement proper error handling",
        "Consider enabling fallback mechanisms");

    return advice;
}

For advanced topics, see:

Real-time Dashboards - Interactive monitoring
Analytics - Advanced analysis and insights
Production Tuning - Performance optimization
Troubleshooting - Diagnosing issues