Path Parameters
endpointId: The unique identifier of the endpoint to get metrics for
Query Parameters
timeframe: Time period for metrics (1h, 6h, 24h, 7d, 30d) - defaults to 24h
granularity: Data point granularity (1m, 5m, 15m, 1h, 1d) - defaults to 15m
metrics: Specific metrics to include (performance, usage, costs, resources, errors)
format: Response format (json, csv, prometheus) - defaults to json
aggregation: Aggregation method (avg, sum, min, max, p95, p99) for time series data
Example Usage
Basic Performance Metrics
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json"
Detailed Metrics with Custom Timeframe
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics?timeframe=7d&granularity=1h&metrics=performance,usage,costs,resources" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json"
Cost Analysis with Breakdown
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics?timeframe=30d&metrics=costs&format=json" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json"
Prometheus Format for Monitoring Integration
curl -X GET "https://api.tensorone.ai/v2/endpoints/ep_1234567890abcdef/metrics?format=prometheus" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Accept: text/plain"
Batch Metrics for Multiple Endpoints
curl -X POST "https://api.tensorone.ai/v2/endpoints/metrics/batch" \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"endpointIds": [
"ep_1234567890abcdef",
"ep_2345678901bcdefg",
"ep_3456789012cdefgh"
],
"timeframe": "24h",
"metrics": ["performance", "costs"],
"aggregation": "avg"
}'
Response
Comprehensive Metrics Response
{
"endpointId": "ep_1234567890abcdef",
"timeframe": "24h",
"granularity": "15m",
"generatedAt": "2024-01-15T14:35:22Z",
"summary": {
"totalExecutions": 2847,
"successfulExecutions": 2815,
"failedExecutions": 32,
"successRate": 98.88,
"totalCost": 47.32,
"averageLatency": 2.3,
"p95Latency": 4.8,
"p99Latency": 8.2,
"totalComputeTime": 6547.2,
"averageQueueTime": 0.15,
"coldStartCount": 23,
"coldStartRate": 0.81
},
"performance": {
"latency": {
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"avg": 2.1,
"p50": 1.8,
"p95": 4.2,
"p99": 7.8,
"min": 0.9,
"max": 12.5
}
],
"trends": {
"hourly": {
"average": 2.3,
"trend": "stable",
"changePercent": -2.1
},
"daily": {
"average": 2.4,
"trend": "improving",
"changePercent": -8.5
}
}
},
"throughput": {
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"requestsPerSecond": 8.2,
"tokensPerSecond": 456.7,
"itemsProcessed": 29400
}
],
"peak": {
"requestsPerSecond": 24.8,
"timestamp": "2024-01-15T09:30:00Z"
}
},
"errorRates": {
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"errorRate": 1.2,
"timeoutRate": 0.3,
"resourceErrorRate": 0.2,
"userErrorRate": 0.7
}
],
"breakdown": {
"4xx": 18,
"5xx": 14,
"timeouts": 8,
"resourceExhausted": 5
}
}
},
"usage": {
"requests": {
"total": 2847,
"successful": 2815,
"failed": 32,
"byHour": [
{
"hour": "2024-01-15T00:00:00Z",
"count": 124,
"avgLatency": 2.1
}
],
"patterns": {
"peakHour": "09:00-10:00",
"lowestHour": "03:00-04:00",
"weekdayAverage": 2650,
"weekendAverage": 1890
}
},
"compute": {
"totalSeconds": 6547.2,
"gpuSeconds": 6234.8,
"cpuSeconds": 312.4,
"idleTime": 145.2,
"utilizationRate": 91.5,
"efficiency": {
"score": 87.3,
"suggestions": [
"Consider batch processing for similar requests",
"Optimize model size to reduce memory usage"
]
}
},
"data": {
"inputBytes": 45672819200,
"outputBytes": 12847392000,
"cacheHits": 847,
"cacheMisses": 2000,
"cacheHitRate": 29.7,
"bandwidth": {
"ingress": "142.5 MB/hour",
"egress": "40.2 MB/hour"
}
}
},
"costs": {
"total": 47.32,
"breakdown": {
"compute": 42.15,
"storage": 2.84,
"network": 1.47,
"other": 0.86
},
"byCategory": {
"inference": 38.92,
"coldStarts": 3.23,
"idleTime": 2.17,
"dataTransfer": 1.47,
"storage": 1.53
},
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"cost": 1.85,
"computeCost": 1.67,
"storageCost": 0.12,
"networkCost": 0.06
}
],
"trends": {
"daily": {
"average": 47.32,
"trend": "increasing",
"changePercent": 12.4,
"projection": {
"monthly": 1419.60,
"confidence": 0.85
}
}
},
"optimization": {
"potentialSavings": 8.45,
"suggestions": [
{
"category": "cold_starts",
"description": "Enable warm pools to reduce cold start costs",
"savings": 3.23,
"effort": "low"
},
{
"category": "idle_time",
"description": "Implement auto-scaling to reduce idle time",
"savings": 2.17,
"effort": "medium"
},
{
"category": "resource_sizing",
"description": "Right-size GPU allocation based on usage patterns",
"savings": 3.05,
"effort": "high"
}
]
}
},
"resources": {
"gpu": {
"utilization": {
"average": 67.3,
"peak": 94.8,
"timeSeries": [
{
"timestamp": "2024-01-15T00:00:00Z",
"utilization": 65.2,
"memoryUsage": 78.4,
"temperature": 68
}
]
},
"memory": {
"average": 78.4,
"peak": 92.1,
"allocated": "40GB",
"efficiency": 82.7
},
"performance": {
"flops": 125.4,
"memoryBandwidth": 1250.8,
"efficiency": 89.2
}
},
"cpu": {
"utilization": {
"average": 23.8,
"peak": 67.2
},
"memory": {
"usage": 45.2,
"available": "64GB"
}
},
"scaling": {
"events": [
{
"timestamp": "2024-01-15T09:30:00Z",
"action": "scale_up",
"from": 2,
"to": 4,
"reason": "High request volume",
"duration": 45.2
}
],
"currentInstances": 3,
"targetInstances": 3,
"autoScalingEnabled": true
}
},
"coldStarts": {
"count": 23,
"rate": 0.81,
"averageDuration": 34.2,
"breakdown": {
"containerStart": 12.5,
"modelLoad": 18.7,
"dependencyLoad": 3.0
},
"impact": {
"latencyIncrease": 32.1,
"costIncrease": 3.23,
"userExperience": "moderate"
},
"optimization": {
"warmPoolRecommended": true,
"estimatedImprovement": {
"latencyReduction": 28.5,
"costReduction": 2.89
}
}
},
"insights": {
"performance": [
{
"type": "latency_spike",
"severity": "medium",
"message": "Latency increased by 15% during peak hours",
"timestamp": "2024-01-15T09:30:00Z",
"recommendation": "Consider enabling auto-scaling or warm pools"
}
],
"usage": [
{
"type": "usage_pattern",
"severity": "info",
"message": "Consistent daily usage pattern detected",
"recommendation": "Predictable usage allows for capacity planning optimization"
}
],
"costs": [
{
"type": "cost_trend",
"severity": "warning",
"message": "Monthly costs trending upward by 12.4%",
"recommendation": "Review usage patterns and consider optimization strategies"
}
]
}
}
Cost Breakdown Response
{
"endpointId": "ep_cost_analysis",
"timeframe": "30d",
"costs": {
"total": 1247.85,
"breakdown": {
"compute": {
"amount": 1089.24,
"percentage": 87.3,
"details": {
"gpuHours": 2847.5,
"rate": 0.382,
"tier": "premium"
}
},
"storage": {
"amount": 67.42,
"percentage": 5.4,
"details": {
"models": 45.20,
"outputs": 22.22
}
},
"network": {
"amount": 34.78,
"percentage": 2.8,
"details": {
"ingress": 12.45,
"egress": 22.33
}
},
"other": {
"amount": 56.41,
"percentage": 4.5,
"details": {
"coldStarts": 28.90,
"monitoring": 15.67,
"logging": 11.84
}
}
},
"trends": {
"daily": {
"average": 41.60,
"trend": "stable",
"changePercent": 2.1
},
"weekly": {
"average": 291.18,
"trend": "increasing",
"changePercent": 8.7
}
},
"budgetAnalysis": {
"monthlyBudget": 1500.00,
"currentUsage": 1247.85,
"remainingBudget": 252.15,
"projectedTotal": 1389.42,
"onTrack": true,
"burnRate": 41.60
},
"optimization": {
"totalPotentialSavings": 187.92,
"recommendations": [
{
"category": "resource_rightsizing",
"savings": 125.67,
"confidence": 0.92,
"description": "Downsize GPU during low-usage periods",
"implementation": "Configure auto-scaling policies"
},
{
"category": "warm_pools",
"savings": 34.56,
"confidence": 0.87,
"description": "Enable warm pools to reduce cold start costs",
"implementation": "Enable warm pool with 2 instances"
},
{
"category": "data_caching",
"savings": 27.69,
"confidence": 0.78,
"description": "Implement result caching for repeated requests",
"implementation": "Enable Redis cache with 1-hour TTL"
}
]
}
}
}
Prometheus Format Response
# HELP tensorone_endpoint_requests_total Total number of requests
# TYPE tensorone_endpoint_requests_total counter
tensorone_endpoint_requests_total{endpoint_id="ep_1234567890abcdef",status="success"} 2815
tensorone_endpoint_requests_total{endpoint_id="ep_1234567890abcdef",status="error"} 32
# HELP tensorone_endpoint_latency_seconds Request latency in seconds
# TYPE tensorone_endpoint_latency_seconds histogram
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="0.5"} 284
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="1.0"} 1256
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="2.0"} 2134
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="5.0"} 2678
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="10.0"} 2815
tensorone_endpoint_latency_seconds_bucket{endpoint_id="ep_1234567890abcdef",le="+Inf"} 2847
tensorone_endpoint_latency_seconds_sum{endpoint_id="ep_1234567890abcdef"} 6547.2
tensorone_endpoint_latency_seconds_count{endpoint_id="ep_1234567890abcdef"} 2847
# HELP tensorone_endpoint_gpu_utilization GPU utilization percentage
# TYPE tensorone_endpoint_gpu_utilization gauge
tensorone_endpoint_gpu_utilization{endpoint_id="ep_1234567890abcdef"} 67.3
# HELP tensorone_endpoint_cost_total Total cost in USD
# TYPE tensorone_endpoint_cost_total counter
tensorone_endpoint_cost_total{endpoint_id="ep_1234567890abcdef",category="compute"} 42.15
tensorone_endpoint_cost_total{endpoint_id="ep_1234567890abcdef",category="storage"} 2.84
Metric Categories
Performance Metrics
- Latency: Response times with percentiles (p50, p95, p99)
- Throughput: Requests per second, tokens per second
- Error Rates: Success rates, failure breakdown by type
- Cold Starts: Frequency, duration, impact on performance
Usage Metrics
- Request Volume: Total requests, patterns, trends
- Compute Utilization: GPU/CPU usage, efficiency scores
- Data Transfer: Input/output volumes, bandwidth usage
- Cache Performance: Hit rates, miss rates, efficiency
Cost Metrics
- Total Costs: Comprehensive cost breakdown
- Cost Trends: Historical analysis and projections
- Optimization: Potential savings and recommendations
- Budget Tracking: Budget vs. actual spending analysis
Resource Metrics
- Hardware Utilization: GPU, CPU, memory usage
- Scaling Events: Auto-scaling activities and effectiveness
- Resource Efficiency: Utilization optimization scores
- Capacity Planning: Usage patterns and capacity recommendations
Time Series Data
Granularity Options
1m: Minute-level granularity (last 6 hours max)
5m: 5-minute intervals (last 24 hours max)
15m: 15-minute intervals (last 7 days max)
1h: Hourly intervals (last 30 days max)
1d: Daily intervals (unlimited retention)
Aggregation Methods
avg: Average values over time period
sum: Sum of all values
min: Minimum value in period
max: Maximum value in period
p95: 95th percentile
p99: 99th percentile
Error Handling
400 Bad Request
{
"error": "INVALID_PARAMETERS",
"message": "Invalid timeframe specified",
"details": {
"parameter": "timeframe",
"value": "2y",
"allowedValues": ["1h", "6h", "24h", "7d", "30d"]
}
}
403 Forbidden
{
"error": "INSUFFICIENT_PERMISSIONS",
"message": "Metrics access requires analytics:read permission",
"details": {
"requiredPermission": "analytics:read",
"currentPermissions": ["endpoints:execute", "endpoints:read"]
}
}
429 Rate Limited
{
"error": "RATE_LIMIT_EXCEEDED",
"message": "Too many metrics requests",
"details": {
"limit": 60,
"window": "1h",
"retryAfter": 300,
"suggestion": "Use batch endpoints or increase polling interval"
}
}
SDK Examples
Python SDK
from tensorone import TensorOneClient
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
client = TensorOneClient(api_key="your_api_key")
# Basic metrics retrieval
def get_endpoint_metrics(endpoint_id, timeframe="24h"):
    """Fetch a full metrics snapshot for one endpoint and print the headline
    numbers (executions, success rate, latency, cost)."""
    result = client.endpoints.get_metrics(
        endpoint_id=endpoint_id,
        timeframe=timeframe,
        metrics=["performance", "usage", "costs", "resources"],
    )
    summary = result.summary
    print(f"Metrics for {endpoint_id} (last {timeframe}):")
    print(f"Total Executions: {summary.total_executions}")
    print(f"Success Rate: {summary.success_rate:.2f}%")
    print(f"Average Latency: {summary.average_latency:.2f}s")
    print(f"Total Cost: ${summary.total_cost:.2f}")
    return result
# Cost analysis and optimization
def analyze_costs(endpoint_id, timeframe="30d"):
    """Print a cost summary plus the provider's optimization suggestions for
    one endpoint, and return the costs object.

    Args:
        endpoint_id: Endpoint identifier (e.g. "ep_...").
        timeframe: Metrics window, one of 1h/6h/24h/7d/30d.
    """
    metrics = client.endpoints.get_metrics(
        endpoint_id=endpoint_id,
        timeframe=timeframe,
        metrics=["costs"],
    )
    costs = metrics.costs
    print(f"Cost Analysis for {endpoint_id}:")
    print(f"Total Cost: ${costs.total:.2f}")
    print(f"Daily Average: ${costs.trends.daily.average:.2f}")
    # BUG FIX: per the documented response schema, the projection lives under
    # trends.daily.projection, not directly under trends.
    print(f"Monthly Projection: ${costs.trends.daily.projection.monthly:.2f}")
    # Optimization recommendations
    if costs.optimization.potential_savings > 0:
        print(f"\nOptimization Opportunities:")
        print(f"Potential Savings: ${costs.optimization.potential_savings:.2f}")
        for suggestion in costs.optimization.suggestions:
            print(f"- {suggestion.description}")
            print(f" Savings: ${suggestion.savings:.2f}")
            print(f" Effort: {suggestion.effort}")
    return costs
# Performance trend analysis
def analyze_performance_trends(endpoint_id, timeframe="7d"):
    """Pull the hourly latency time series for an endpoint, load it into a
    pandas DataFrame, and print average / trend / percentile statistics."""
    metrics = client.endpoints.get_metrics(
        endpoint_id=endpoint_id,
        timeframe=timeframe,
        granularity="1h",
        metrics=["performance"],
    )
    # Convert to pandas DataFrame for analysis
    records = [
        {
            'timestamp': point.timestamp,
            'avg_latency': point.avg,
            'p95_latency': point.p95,
            'p99_latency': point.p99,
        }
        for point in metrics.performance.latency.time_series
    ]
    df = pd.DataFrame(records)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Performance analysis: mean latency and mean first difference (slope)
    avg_latency = df['avg_latency'].mean()
    latency_trend = df['avg_latency'].diff().mean()
    print(f"Performance Analysis:")
    print(f"Average Latency: {avg_latency:.2f}s")
    print(f"Latency Trend: {'Improving' if latency_trend < 0 else 'Degrading'}")
    print(f"95th Percentile: {df['p95_latency'].mean():.2f}s")
    print(f"99th Percentile: {df['p99_latency'].mean():.2f}s")
    return df
# Resource utilization monitoring
def monitor_resource_utilization(endpoint_id, timeframe="24h"):
    """Print GPU utilization / memory / efficiency figures and any scaling
    events for one endpoint, and return the resources object.

    Args:
        endpoint_id: Endpoint identifier.
        timeframe: Metrics window, one of 1h/6h/24h/7d/30d.
    """
    metrics = client.endpoints.get_metrics(
        endpoint_id=endpoint_id,
        timeframe=timeframe,
        granularity="15m",
        metrics=["resources"],
    )
    resources = metrics.resources
    gpu = resources.gpu
    print(f"Resource Utilization Analysis:")
    print(f"Average GPU Utilization: {gpu.utilization.average:.1f}%")
    print(f"Peak GPU Utilization: {gpu.utilization.peak:.1f}%")
    print(f"GPU Memory Usage: {gpu.memory.average:.1f}%")
    print(f"Resource Efficiency: {gpu.performance.efficiency:.1f}%")
    # Scaling analysis
    if resources.scaling.events:
        print(f"\nScaling Events:")
        for event in resources.scaling.events:
            # BUG FIX: `event.from` is a SyntaxError because `from` is a
            # reserved keyword in Python; the attribute must be read with
            # getattr() instead.
            scale_from = getattr(event, "from")
            print(f"- {event.timestamp}: {event.action} from {scale_from} to {event.to}")
            print(f" Reason: {event.reason}")
    return resources
# Cold start analysis
def analyze_cold_starts(endpoint_id, timeframe="7d"):
    """Report cold-start frequency, duration, and impact for an endpoint,
    plus the warm-pool recommendation when one is present."""
    response = client.endpoints.get_metrics(
        endpoint_id=endpoint_id,
        timeframe=timeframe,
        metrics=["performance"],
    )
    cold = response.cold_starts
    print(f"Cold Start Analysis:")
    print(f"Cold Start Count: {cold.count}")
    print(f"Cold Start Rate: {cold.rate:.2f}%")
    print(f"Average Duration: {cold.average_duration:.1f}s")
    print(f"Latency Impact: +{cold.impact.latency_increase:.1f}s")
    print(f"Cost Impact: +${cold.impact.cost_increase:.2f}")
    if cold.optimization.warm_pool_recommended:
        gains = cold.optimization.estimated_improvement
        print(f"\nOptimization Opportunity:")
        print(f"Warm pools could reduce latency by {gains.latency_reduction:.1f}s")
        print(f"Estimated cost reduction: ${gains.cost_reduction:.2f}")
    return cold
# Comprehensive dashboard data
def create_dashboard_data(endpoint_ids, timeframe="24h"):
    """Create comprehensive dashboard data for multiple endpoints"""
    rows = []
    for endpoint_id in endpoint_ids:
        # Best-effort per endpoint: a failure for one endpoint is reported
        # but does not abort the others.
        try:
            metrics = client.endpoints.get_metrics(
                endpoint_id=endpoint_id,
                timeframe=timeframe,
                metrics=["performance", "usage", "costs", "resources"],
            )
            summary = metrics.summary
            rows.append({
                'endpoint_id': endpoint_id,
                'executions': summary.total_executions,
                'success_rate': summary.success_rate,
                'avg_latency': summary.average_latency,
                'p95_latency': summary.p95_latency,
                'total_cost': summary.total_cost,
                'gpu_utilization': metrics.resources.gpu.utilization.average,
                'error_rate': 100 - summary.success_rate,
                'cold_starts': metrics.cold_starts.count if metrics.cold_starts else 0,
            })
        except Exception as e:
            print(f"Error fetching metrics for {endpoint_id}: {e}")
    return pd.DataFrame(rows)
# Usage examples
# Demonstrates each helper above against a single sample endpoint, then
# builds a small multi-endpoint dashboard table.
if __name__ == "__main__":
    endpoint_id = "ep_1234567890abcdef"
    # Basic metrics
    metrics = get_endpoint_metrics(endpoint_id)
    # Cost analysis
    costs = analyze_costs(endpoint_id, "30d")
    # Performance trends
    perf_df = analyze_performance_trends(endpoint_id, "7d")
    # Resource monitoring
    resources = monitor_resource_utilization(endpoint_id)
    # Cold start analysis
    cold_starts = analyze_cold_starts(endpoint_id)
    # Dashboard for multiple endpoints
    endpoints = ["ep_1234567890abcdef", "ep_2345678901bcdefg"]
    dashboard_df = create_dashboard_data(endpoints)
    print("\nDashboard Summary:")
    print(dashboard_df.to_string(index=False))
JavaScript SDK
import { TensorOneClient } from "@tensorone/sdk";
import Chart from 'chart.js/auto';
const client = new TensorOneClient({ apiKey: "your_api_key" });
// Basic metrics retrieval
// Fetches a full metrics snapshot for one endpoint and logs the headline
// numbers (executions, success rate, latency, cost).
async function getEndpointMetrics(endpointId, timeframe = "24h") {
  const metrics = await client.endpoints.getMetrics(endpointId, {
    timeframe,
    metrics: ["performance", "usage", "costs", "resources"]
  });
  const { summary } = metrics;
  console.log(`Metrics for ${endpointId} (last ${timeframe}):`);
  console.log(`Total Executions: ${summary.totalExecutions}`);
  console.log(`Success Rate: ${summary.successRate.toFixed(2)}%`);
  console.log(`Average Latency: ${summary.averageLatency.toFixed(2)}s`);
  console.log(`Total Cost: $${summary.totalCost.toFixed(2)}`);
  return metrics;
}
// Real-time metrics dashboard
// Renders three Chart.js charts (latency line, cost pie, GPU doughnut) into a
// container element and refreshes them from the metrics API every 30 seconds.
// NOTE(review): browser-only — relies on `document`, Chart.js, and the
// module-level `client`.
class MetricsDashboard {
  constructor(endpointId, containerId) {
    this.endpointId = endpointId;
    this.container = document.getElementById(containerId);
    this.charts = {}; // chart instances keyed by name: latency, cost, gpu
    this.initialize();
  }

  // Build the empty charts, then start the periodic refresh loop.
  async initialize() {
    await this.createCharts();
    this.startRealTimeUpdates();
  }

  // Create the three charts with empty datasets; updateCharts() fills them.
  async createCharts() {
    // Latency trend chart
    const latencyCanvas = this.createCanvas('latency-chart');
    this.charts.latency = new Chart(latencyCanvas, {
      type: 'line',
      data: {
        labels: [],
        datasets: [{
          label: 'Average Latency',
          data: [],
          borderColor: 'rgb(75, 192, 192)',
          tension: 0.1
        }, {
          label: 'P95 Latency',
          data: [],
          borderColor: 'rgb(255, 99, 132)',
          tension: 0.1
        }]
      },
      options: {
        responsive: true,
        plugins: {
          title: {
            display: true,
            text: 'Latency Trends'
          }
        },
        scales: {
          y: {
            beginAtZero: true,
            title: {
              display: true,
              text: 'Latency (seconds)'
            }
          }
        }
      }
    });

    // Cost breakdown pie chart
    const costCanvas = this.createCanvas('cost-chart');
    this.charts.cost = new Chart(costCanvas, {
      type: 'pie',
      data: {
        labels: ['Compute', 'Storage', 'Network', 'Other'],
        datasets: [{
          data: [],
          backgroundColor: [
            'rgb(255, 99, 132)',
            'rgb(54, 162, 235)',
            'rgb(255, 205, 86)',
            'rgb(75, 192, 192)'
          ]
        }]
      },
      options: {
        responsive: true,
        plugins: {
          title: {
            display: true,
            text: 'Cost Breakdown'
          }
        }
      }
    });

    // GPU utilization gauge (doughnut split into used vs. available)
    const gpuCanvas = this.createCanvas('gpu-chart');
    this.charts.gpu = new Chart(gpuCanvas, {
      type: 'doughnut',
      data: {
        labels: ['Used', 'Available'],
        datasets: [{
          data: [0, 100],
          backgroundColor: ['rgb(255, 99, 132)', 'rgb(200, 200, 200)']
        }]
      },
      options: {
        responsive: true,
        plugins: {
          title: {
            display: true,
            text: 'GPU Utilization'
          }
        }
      }
    });
  }

  // Append a new <canvas> with the given id to the container and return it.
  createCanvas(id) {
    const canvas = document.createElement('canvas');
    canvas.id = id;
    this.container.appendChild(canvas);
    return canvas;
  }

  // Fetch the last hour of metrics (5-minute granularity) and push the fresh
  // data into each chart. Errors are logged, not thrown, so the refresh loop
  // keeps running.
  async updateCharts() {
    try {
      const metrics = await client.endpoints.getMetrics(this.endpointId, {
        timeframe: "1h",
        granularity: "5m",
        metrics: ["performance", "costs", "resources"]
      });

      // Update latency chart
      if (metrics.performance?.latency?.timeSeries) {
        const latencyData = metrics.performance.latency.timeSeries;
        this.charts.latency.data.labels = latencyData.map(d =>
          new Date(d.timestamp).toLocaleTimeString()
        );
        this.charts.latency.data.datasets[0].data = latencyData.map(d => d.avg);
        this.charts.latency.data.datasets[1].data = latencyData.map(d => d.p95);
        this.charts.latency.update();
      }

      // Update cost chart
      if (metrics.costs?.breakdown) {
        const breakdown = metrics.costs.breakdown;
        this.charts.cost.data.datasets[0].data = [
          breakdown.compute,
          breakdown.storage,
          breakdown.network,
          breakdown.other
        ];
        this.charts.cost.update();
      }

      // Update GPU chart
      if (metrics.resources?.gpu?.utilization) {
        const utilization = metrics.resources.gpu.utilization.average;
        this.charts.gpu.data.datasets[0].data = [utilization, 100 - utilization];
        this.charts.gpu.update();
      }
    } catch (error) {
      console.error('Error updating charts:', error);
    }
  }

  // Refresh once immediately, then every 30 seconds.
  startRealTimeUpdates() {
    // Update every 30 seconds
    setInterval(() => this.updateCharts(), 30000);
    // Initial update
    this.updateCharts();
  }
}
// Cost optimization analyzer
// Summarizes costs for one endpoint, prints the provider's optimization
// suggestions, and adds a simple ROI estimate for each. Returns the costs
// object from the metrics response.
async function analyzeCostOptimization(endpointId, timeframe = "30d") {
  const metrics = await client.endpoints.getMetrics(endpointId, {
    timeframe,
    metrics: ["costs"]
  });
  const costs = metrics.costs;
  console.log(`Cost Optimization Analysis for ${endpointId}:`);
  console.log(`Current Monthly Cost: $${costs.total.toFixed(2)}`);
  if (costs.optimization?.potentialSavings > 0) {
    console.log(`\nOptimization Opportunities:`);
    console.log(`Total Potential Savings: $${costs.optimization.potentialSavings.toFixed(2)}`);
    costs.optimization.suggestions.forEach((suggestion, index) => {
      console.log(`\n${index + 1}. ${suggestion.description}`);
      console.log(` Savings: $${suggestion.savings.toFixed(2)}`);
      console.log(` Confidence: ${(suggestion.confidence * 100).toFixed(0)}%`);
      console.log(` Effort: ${suggestion.effort}`);
    });
    // Relative savings (as % of total cost) and annualized impact.
    const roiAnalysis = costs.optimization.suggestions.map(suggestion => ({
      ...suggestion,
      roi: (suggestion.savings / costs.total) * 100,
      // BUG FIX: savings * 12 annualizes a monthly figure; the old key name
      // "monthlyImpact" mislabeled it.
      annualizedImpact: suggestion.savings * 12
    }));
    console.log(`\nROI Analysis:`);
    roiAnalysis.forEach(suggestion => {
      console.log(`${suggestion.category}: ${suggestion.roi.toFixed(1)}% savings`);
    });
  }
  return costs;
}
// Performance alerting system
// Polls recent metrics for a set of endpoints and records an alert for every
// threshold violation (latency, success rate, error rate, cost growth).
class PerformanceAlerting {
  constructor(endpointIds, thresholds = {}) {
    this.endpointIds = endpointIds;
    // Caller-supplied thresholds override these defaults.
    this.thresholds = {
      maxLatency: 5.0,       // seconds
      minSuccessRate: 95.0,  // percent
      maxErrorRate: 5.0,     // percent
      maxCostIncrease: 20.0, // percent change of the daily cost trend
      ...thresholds
    };
    this.alerts = [];
  }

  // Fetch the last hour of metrics for each endpoint and evaluate thresholds.
  // Returns the alerts generated by THIS check.
  async checkAlerts() {
    // BUG FIX: reset the list so repeated checks don't accumulate duplicate
    // alerts from earlier runs.
    this.alerts = [];
    for (const endpointId of this.endpointIds) {
      try {
        const metrics = await client.endpoints.getMetrics(endpointId, {
          timeframe: "1h",
          metrics: ["performance", "costs"]
        });
        await this.evaluateMetrics(endpointId, metrics);
      } catch (error) {
        this.addAlert(endpointId, 'error', `Failed to fetch metrics: ${error.message}`);
      }
    }
    return this.alerts;
  }

  // Compare one endpoint's summary metrics against the configured thresholds.
  async evaluateMetrics(endpointId, metrics) {
    const summary = metrics.summary;

    // Latency alerts
    if (summary.averageLatency > this.thresholds.maxLatency) {
      this.addAlert(endpointId, 'warning',
        `High latency: ${summary.averageLatency.toFixed(2)}s (threshold: ${this.thresholds.maxLatency}s)`
      );
    }

    // Success rate alerts
    if (summary.successRate < this.thresholds.minSuccessRate) {
      this.addAlert(endpointId, 'critical',
        `Low success rate: ${summary.successRate.toFixed(2)}% (threshold: ${this.thresholds.minSuccessRate}%)`
      );
    }

    // Error rate alerts (derived from the success rate)
    const errorRate = 100 - summary.successRate;
    if (errorRate > this.thresholds.maxErrorRate) {
      this.addAlert(endpointId, 'warning',
        `High error rate: ${errorRate.toFixed(2)}% (threshold: ${this.thresholds.maxErrorRate}%)`
      );
    }

    // Cost increase alerts
    if (metrics.costs?.trends?.daily?.changePercent > this.thresholds.maxCostIncrease) {
      this.addAlert(endpointId, 'warning',
        `Cost increase: ${metrics.costs.trends.daily.changePercent.toFixed(1)}% (threshold: ${this.thresholds.maxCostIncrease}%)`
      );
    }
  }

  // Record a single alert with an ISO timestamp.
  addAlert(endpointId, severity, message) {
    this.alerts.push({
      endpointId,
      severity,
      message,
      timestamp: new Date().toISOString()
    });
  }

  // Log alerts grouped by severity; hook a real notifier in where indicated.
  async sendNotifications() {
    if (this.alerts.length === 0) return;

    // Group alerts by severity
    const alertsBySeverity = this.alerts.reduce((acc, alert) => {
      acc[alert.severity] = acc[alert.severity] || [];
      acc[alert.severity].push(alert);
      return acc;
    }, {});

    console.log('Performance Alerts:');
    Object.entries(alertsBySeverity).forEach(([severity, alerts]) => {
      console.log(`\n${severity.toUpperCase()} (${alerts.length}):`);
      alerts.forEach(alert => {
        console.log(` ${alert.endpointId}: ${alert.message}`);
      });
    });

    // Here you would integrate with your notification system
    // (email, Slack, PagerDuty, etc.)
  }
}
// Usage examples
// Demonstrates the metrics helpers end to end for a sample endpoint, then
// wires up threshold-based alerting for two endpoints.
async function main() {
  const endpointId = "ep_1234567890abcdef";
  const endpointIds = ["ep_1234567890abcdef", "ep_2345678901bcdefg"];

  try {
    // Basic metrics
    const metrics = await getEndpointMetrics(endpointId);

    // Cost optimization
    await analyzeCostOptimization(endpointId);

    // Create real-time dashboard (if running in browser)
    // const dashboard = new MetricsDashboard(endpointId, 'dashboard-container');

    // Set up performance alerting
    const alerting = new PerformanceAlerting(endpointIds, {
      maxLatency: 3.0,
      minSuccessRate: 98.0,
      maxErrorRate: 2.0,
      maxCostIncrease: 15.0
    });
    const alerts = await alerting.checkAlerts();
    await alerting.sendNotifications();
  } catch (error) {
    console.error("Metrics analysis error:", error);
  }
}

main();
Use Cases
Production Monitoring
- SLA Monitoring: Track performance against service level agreements
- Cost Control: Monitor spending and identify optimization opportunities
- Capacity Planning: Analyze usage patterns for infrastructure planning
- Performance Optimization: Identify bottlenecks and optimization opportunities
Business Intelligence
- Usage Analytics: Understand user behavior and usage patterns
- Cost Attribution: Track costs by team, project, or customer
- ROI Analysis: Measure return on investment for AI initiatives
- Trend Analysis: Identify growth patterns and seasonal variations
DevOps and Automation
- Auto-scaling: Use metrics to trigger scaling decisions
- Alerting: Set up automated alerts for performance and cost thresholds
- CI/CD Integration: Include performance metrics in deployment pipelines
- Resource Optimization: Automatically optimize resource allocation
Research and Development
- Model Performance: Compare different model versions and configurations
- A/B Testing: Analyze performance differences between model variants
- Optimization Research: Identify areas for model and infrastructure improvement
- Benchmarking: Compare performance against industry standards
Best Practices
Monitoring Strategy
- Key Metrics: Focus on metrics that align with business objectives
- Alerting Thresholds: Set appropriate thresholds to avoid alert fatigue
- Dashboards: Create role-specific dashboards for different stakeholders
- Historical Analysis: Maintain historical data for trend analysis
Performance Optimization
- Regular Review: Regularly review metrics to identify optimization opportunities
- Baseline Establishment: Establish performance baselines for comparison
- Proactive Monitoring: Monitor trends to prevent issues before they occur
- Cross-correlation: Analyze relationships between different metrics
Cost Management
- Budget Tracking: Set up budget alerts and tracking
- Cost Attribution: Tag resources for accurate cost attribution
- Optimization Cycles: Regularly review and implement cost optimizations
- Forecasting: Use historical data for cost forecasting and planning
Data Quality
- Metric Validation: Validate metric accuracy and completeness
- Data Retention: Define appropriate data retention policies
- Access Control: Implement proper access controls for sensitive metrics
- Documentation: Document metric definitions and calculation methods
Metrics are calculated in real-time and cached for 5 minutes to ensure optimal performance. For the most
current data, use granularities of 1m or 5m.
Historical metrics are retained for 90 days. Export important data for longer-term analysis and archival
purposes.
Use the batch metrics endpoint to efficiently retrieve metrics for multiple endpoints. Set up automated
reporting and alerting to proactively manage performance and costs.

