Monitor detailed performance metrics and resource utilization across TensorOne infrastructure and services.
Query parameters:

resource - which resources and services to include:
  all - All resources and services
  clusters - GPU clusters only
  endpoints - Serverless endpoints only
  training - Training jobs and services
  ai-services - AI generation services
  storage - Storage systems
  network - Network infrastructure

timeRange - reporting window:
  5m - Last 5 minutes
  15m - Last 15 minutes
  1h - Last hour
  6h - Last 6 hours
  24h - Last 24 hours
  7d - Last 7 days
  30d - Last 30 days

granularity - sampling interval for the time series:
  10s - 10-second intervals
  1m - 1-minute intervals
  5m - 5-minute intervals
  15m - 15-minute intervals
  1h - 1-hour intervals
  1d - Daily aggregation

metrics - which metrics to return:
  cpu_utilization - CPU usage percentage
  memory_utilization - Memory usage percentage
  gpu_utilization - GPU usage percentage
  disk_io - Disk I/O operations and throughput
  network_io - Network I/O operations and throughput
  response_time - API response times
  throughput - Request throughput
  error_rate - Error rates and failure counts
  queue_depth - Job queue depths

Regions:
  us-east-1 - US East (Virginia)
  us-west-2 - US West (Oregon)
  eu-west-1 - Europe (Ireland)
  ap-southeast-1 - Asia Pacific (Singapore)

aggregation - how values are aggregated:
  avg - Average values
  max - Maximum values
  min - Minimum values
  sum - Sum of values
  p95 - 95th percentile
  p99 - 99th percentile
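These values can be checked on the client before a request goes out. A minimal sketch in Python, assuming the parameter names used in the request examples below; the helper itself is illustrative and not part of the API:

# Illustrative client-side check of query parameters against the documented
# enumerations above. Parameter names follow the request examples below.
ALLOWED_VALUES = {
    "resource": {"all", "clusters", "endpoints", "training", "ai-services", "storage", "network"},
    "timeRange": {"5m", "15m", "1h", "6h", "24h", "7d", "30d"},
    "granularity": {"10s", "1m", "5m", "15m", "1h", "1d"},
    "metrics": {"cpu_utilization", "memory_utilization", "gpu_utilization", "disk_io",
                "network_io", "response_time", "throughput", "error_rate", "queue_depth"},
    "aggregation": {"avg", "max", "min", "sum", "p95", "p99"},
}

def validate_params(params):
    """Raise ValueError if a parameter value falls outside the documented set."""
    for key, value in params.items():
        if key not in ALLOWED_VALUES:
            continue  # leave parameters not covered by the enumerations alone
        for v in (value if isinstance(value, list) else [value]):
            if v not in ALLOWED_VALUES[key]:
                raise ValueError(f"{key}={v!r} is not one of {sorted(ALLOWED_VALUES[key])}")

validate_params({"resource": "clusters", "timeRange": "6h",
                 "granularity": "15m", "metrics": ["cpu_utilization", "gpu_utilization"]})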
Alert severity levels (reported in the alerts array of the response): low, medium, high, critical.

Example request (cURL):

curl -X GET "https://api.tensorone.ai/v2/monitoring/performance" \
-H "Authorization: Bearer YOUR_API_KEY" \
-G \
-d "resource=clusters" \
-d "timeRange=6h" \
-d "granularity=15m" \
-d "metrics[]=cpu_utilization&metrics[]=gpu_utilization&metrics[]=memory_utilization" \
-d "aggregation=avg"
import requests
import matplotlib.pyplot as plt
from datetime import datetime
def get_performance_metrics(resource="all", time_range="1h", metrics=None):
params = {
'resource': resource,
'timeRange': time_range,
'granularity': '1m',
'aggregation': 'avg'
}
    if metrics:
        # use the bracketed key so the query matches the cURL and JavaScript examples
        params['metrics[]'] = metrics
response = requests.get(
"https://api.tensorone.ai/v2/monitoring/performance",
headers={"Authorization": "Bearer YOUR_API_KEY"},
params=params
)
return response.json()
# Get comprehensive performance data
performance_data = get_performance_metrics(
resource="clusters",
time_range="6h",
metrics=["cpu_utilization", "gpu_utilization", "memory_utilization", "response_time"]
)
print("Overall Performance:")
overall = performance_data['overall']
print(f"Health Score: {overall['healthScore']}/100")
print(f"Performance Score: {overall['performanceScore']}/100")
print(f"Average Response Time: {overall['averageResponseTime']}ms")
print(f"Error Rate: {overall['errorRate']}%")
# Analyze resource performance
print("\nTop Resource Performance Issues:")
for resource in performance_data['resources']:
metrics = resource['metrics']
if metrics.get('cpu', {}).get('current', 0) > 80:
print(f"HIGH CPU: {resource['resourceName']} - {metrics['cpu']['current']}%")
if metrics.get('memory', {}).get('current', 0) > 85:
print(f"HIGH MEMORY: {resource['resourceName']} - {metrics['memory']['current']}%")
if metrics.get('gpu', {}).get('current', 0) > 95:
print(f"HIGH GPU: {resource['resourceName']} - {metrics['gpu']['current']}%")
# Check for performance alerts
if performance_data['alerts']:
print(f"\nActive Performance Alerts ({len(performance_data['alerts'])}):")
for alert in performance_data['alerts']:
print(f" {alert['severity'].upper()}: {alert['metric']} on {alert['resource']}")
print(f" Current: {alert['currentValue']}, Threshold: {alert['threshold']}")
# Plot performance trends
def plot_performance_trends(data):
"""Plot CPU and GPU utilization trends"""
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
for resource in data['resources'][:3]: # Plot top 3 resources
time_series = resource['timeSeries']
timestamps = [datetime.fromisoformat(point['timestamp'].replace('Z', '+00:00')) for point in time_series]
cpu_values = [point['values'].get('cpu_utilization', 0) for point in time_series]
gpu_values = [point['values'].get('gpu_utilization', 0) for point in time_series]
ax1.plot(timestamps, cpu_values, label=f"{resource['resourceName']} CPU")
ax2.plot(timestamps, gpu_values, label=f"{resource['resourceName']} GPU")
ax1.set_title('CPU Utilization Over Time')
ax1.set_ylabel('CPU Usage (%)')
ax1.legend()
ax1.grid(True)
ax2.set_title('GPU Utilization Over Time')
ax2.set_ylabel('GPU Usage (%)')
ax2.set_xlabel('Time')
ax2.legend()
ax2.grid(True)
plt.tight_layout()
plt.show()
# Uncomment to plot trends
# plot_performance_trends(performance_data)
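An equivalent request in JavaScript: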
const getPerformanceMetrics = async (options = {}) => {
const params = new URLSearchParams({
resource: options.resource || 'all',
timeRange: options.timeRange || '1h',
granularity: options.granularity || '1m',
aggregation: options.aggregation || 'avg'
});
if (options.metrics) {
options.metrics.forEach(metric => params.append('metrics[]', metric));
}
const response = await fetch(`https://api.tensorone.ai/v2/monitoring/performance?${params}`, {
headers: {
'Authorization': 'Bearer YOUR_API_KEY'
}
});
return await response.json();
};
// Monitor cluster performance
getPerformanceMetrics({
resource: 'clusters',
timeRange: '6h',
metrics: ['cpu_utilization', 'gpu_utilization', 'memory_utilization'],
granularity: '15m'
}).then(data => {
console.log('Overall Performance:', data.overall);
// Find performance bottlenecks
data.resources.forEach(resource => {
const metrics = resource.metrics;
const issues = [];
if (metrics.cpu?.current > 80) issues.push(`High CPU: ${metrics.cpu.current}%`);
if (metrics.memory?.current > 85) issues.push(`High Memory: ${metrics.memory.current}%`);
if (metrics.gpu?.current > 95) issues.push(`High GPU: ${metrics.gpu.current}%`);
if (issues.length > 0) {
console.log(`${resource.resourceName} Issues:`, issues);
}
});
// Check alerts
if (data.alerts.length > 0) {
console.log('Performance Alerts:', data.alerts);
}
});
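A sample response: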
{
"timeRange": {
"start": "2024-01-16T12:00:00Z",
"end": "2024-01-16T18:00:00Z",
"granularity": "15m"
},
"overall": {
"healthScore": 87,
"performanceScore": 82,
"availabilityScore": 99.2,
"averageResponseTime": 245,
"totalThroughput": 1847.3,
"errorRate": 0.8
},
"resources": [
{
"resourceType": "cluster",
"resourceId": "cluster-gpu-a100-001",
"resourceName": "GPU Cluster A100-001",
"region": "us-east-1",
"metrics": {
"cpu": {
"current": 78.5,
"average": 72.3,
"peak": 94.2,
"trend": "increasing"
},
"memory": {
"current": 84.2,
"average": 79.8,
"peak": 91.5,
"trend": "stable"
},
"gpu": {
"current": 92.8,
"average": 88.4,
"peak": 98.1,
"trend": "increasing"
},
"network": {
"inbound": "2.3 Gbps",
"outbound": "4.7 Gbps",
"latency": 12.3
},
"storage": {
"readThroughput": "850 MB/s",
"writeThroughput": "420 MB/s",
"iops": 12500
}
},
"timeSeries": [
{
"timestamp": "2024-01-16T17:45:00Z",
"values": {
"cpu_utilization": 78.5,
"memory_utilization": 84.2,
"gpu_utilization": 92.8,
"response_time": 234
}
}
]
}
],
"aggregatedMetrics": {
"cpuUtilization": {
"average": 68.4,
"p95": 87.2,
"p99": 94.8,
"peakTime": "2024-01-16T15:30:00Z"
},
"memoryUtilization": {
"average": 71.8,
"p95": 89.3,
"p99": 95.1,
"peakTime": "2024-01-16T16:15:00Z"
},
"gpuUtilization": {
"average": 83.2,
"p95": 96.7,
"p99": 98.9,
"peakTime": "2024-01-16T17:00:00Z"
},
"networkThroughput": {
"totalInbound": "45.8 Gbps",
"totalOutbound": "78.2 Gbps",
"averageLatency": 15.7
},
"storageThroughput": {
"totalRead": "12.4 GB/s",
"totalWrite": "6.8 GB/s",
"averageIOPS": 89500
},
"apiMetrics": {
"averageResponseTime": 245,
"throughput": 1847.3,
"successRate": 99.2,
"errorRate": 0.8
}
},
"alerts": [
{
"alertId": "perf-alert-001",
"severity": "medium",
"metric": "gpu_utilization",
"threshold": 90,
"currentValue": 92.8,
"resource": "cluster-gpu-a100-001",
"timestamp": "2024-01-16T17:45:00Z"
}
]
}
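The aggregated percentiles in the response lend themselves to simple client-side capacity checks. A minimal sketch, reusing performance_data from the Python example above; the 90/85/95 thresholds are illustrative, not defaults defined by the API:

# Sketch: flag aggregated p95 utilization above example thresholds.
def check_headroom(response, thresholds=None):
    thresholds = thresholds or {
        "cpuUtilization": 90,    # example threshold, not an API default
        "memoryUtilization": 85,
        "gpuUtilization": 95,
    }
    aggregated = response["aggregatedMetrics"]
    findings = []
    for metric, limit in thresholds.items():
        p95 = aggregated[metric]["p95"]
        if p95 > limit:
            findings.append(f"{metric}: p95 {p95}% exceeds {limit}% (peak at {aggregated[metric]['peakTime']})")
    return findings

for finding in check_headroom(performance_data):
    print("Capacity warning:", finding)

For continuous monitoring, the class below polls the endpoint on a fixed interval, keeps a rolling history of the key utilization metrics, and can render a live dashboard: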
import requests
import time
import threading
from collections import deque
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
class PerformanceMonitor:
def __init__(self, api_key, update_interval=30):
self.api_key = api_key
self.update_interval = update_interval
self.running = False
self.data_history = {
'timestamps': deque(maxlen=100),
'cpu': deque(maxlen=100),
'memory': deque(maxlen=100),
'gpu': deque(maxlen=100),
'response_time': deque(maxlen=100)
}
def fetch_metrics(self):
"""Fetch current performance metrics"""
response = requests.get(
"https://api.tensorone.ai/v2/monitoring/performance",
headers={"Authorization": f"Bearer {self.api_key}"},
params={
"resource": "all",
"timeRange": "5m",
"granularity": "1m",
"aggregation": "avg"
}
)
return response.json()
def update_data(self):
"""Update data history with latest metrics"""
try:
metrics = self.fetch_metrics()
now = time.time()
self.data_history['timestamps'].append(now)
self.data_history['cpu'].append(metrics['aggregatedMetrics']['cpuUtilization']['average'])
self.data_history['memory'].append(metrics['aggregatedMetrics']['memoryUtilization']['average'])
self.data_history['gpu'].append(metrics['aggregatedMetrics']['gpuUtilization']['average'])
self.data_history['response_time'].append(metrics['overall']['averageResponseTime'])
# Print current status
overall = metrics['overall']
print(f"[{time.strftime('%H:%M:%S')}] Health: {overall['healthScore']}/100, "
f"CPU: {self.data_history['cpu'][-1]:.1f}%, "
f"Memory: {self.data_history['memory'][-1]:.1f}%, "
f"GPU: {self.data_history['gpu'][-1]:.1f}%")
# Check for alerts
if metrics['alerts']:
for alert in metrics['alerts']:
print(f"⚠️ ALERT: {alert['metric']} on {alert['resource']} - {alert['currentValue']}")
except Exception as e:
print(f"Error fetching metrics: {e}")
def start_monitoring(self):
"""Start continuous monitoring"""
self.running = True
print("Starting performance monitoring...")
while self.running:
self.update_data()
time.sleep(self.update_interval)
def stop_monitoring(self):
"""Stop monitoring"""
self.running = False
print("Monitoring stopped")
def create_dashboard(self):
"""Create real-time dashboard"""
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
def animate(frame):
if len(self.data_history['timestamps']) > 1:
times = list(self.data_history['timestamps'])
# Clear and plot
ax1.clear()
ax1.plot(times, list(self.data_history['cpu']), 'b-', label='CPU')
ax1.set_title('CPU Utilization (%)')
ax1.set_ylim(0, 100)
ax2.clear()
ax2.plot(times, list(self.data_history['memory']), 'g-', label='Memory')
ax2.set_title('Memory Utilization (%)')
ax2.set_ylim(0, 100)
ax3.clear()
ax3.plot(times, list(self.data_history['gpu']), 'r-', label='GPU')
ax3.set_title('GPU Utilization (%)')
ax3.set_ylim(0, 100)
ax4.clear()
ax4.plot(times, list(self.data_history['response_time']), 'm-', label='Response Time')
ax4.set_title('Response Time (ms)')
ani = FuncAnimation(fig, animate, interval=1000, cache_frame_data=False)
plt.tight_layout()
plt.show()
# Usage
monitor = PerformanceMonitor("YOUR_API_KEY", update_interval=30)
# Start monitoring in background
monitor_thread = threading.Thread(target=monitor.start_monitoring)
monitor_thread.daemon = True
monitor_thread.start()
# Show dashboard (uncomment to run)
# monitor.create_dashboard()
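To catch regressions relative to normal behavior rather than fixed thresholds, a baseline can be established from historical data and compared against recent samples using z-scores: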
import requests
import numpy as np
class AnomalyDetector:
def __init__(self, api_key):
self.api_key = api_key
self.baseline_data = {}
def establish_baseline(self, days=7):
"""Establish performance baseline over specified days"""
response = requests.get(
"https://api.tensorone.ai/v2/monitoring/performance",
headers={"Authorization": f"Bearer {self.api_key}"},
params={
"resource": "all",
"timeRange": f"{days}d",
"granularity": "1h",
"aggregation": "avg"
}
)
data = response.json()
# Extract baseline metrics
cpu_values = []
memory_values = []
gpu_values = []
response_times = []
for resource in data['resources']:
for point in resource['timeSeries']:
values = point['values']
cpu_values.append(values.get('cpu_utilization', 0))
memory_values.append(values.get('memory_utilization', 0))
gpu_values.append(values.get('gpu_utilization', 0))
response_times.append(values.get('response_time', 0))
self.baseline_data = {
'cpu': {'mean': np.mean(cpu_values), 'std': np.std(cpu_values)},
'memory': {'mean': np.mean(memory_values), 'std': np.std(memory_values)},
'gpu': {'mean': np.mean(gpu_values), 'std': np.std(gpu_values)},
'response_time': {'mean': np.mean(response_times), 'std': np.std(response_times)}
}
print("Baseline established:")
for metric, stats in self.baseline_data.items():
print(f" {metric}: μ={stats['mean']:.1f}, σ={stats['std']:.1f}")
def detect_anomalies(self, threshold=2.0):
"""Detect performance anomalies using z-score"""
response = requests.get(
"https://api.tensorone.ai/v2/monitoring/performance",
headers={"Authorization": f"Bearer {self.api_key}"},
params={
"resource": "all",
"timeRange": "1h",
"granularity": "5m",
"aggregation": "avg"
}
)
data = response.json()
anomalies = []
for resource in data['resources']:
resource_anomalies = []
for point in resource['timeSeries']:
values = point['values']
timestamp = point['timestamp']
                for metric in ['cpu_utilization', 'memory_utilization', 'gpu_utilization', 'response_time']:
                    # map e.g. 'cpu_utilization' to the 'cpu' baseline key; 'response_time' maps to itself
                    baseline_key = metric.replace('_utilization', '')
                    if baseline_key in self.baseline_data:
                        current_value = values.get(metric, 0)
                        baseline = self.baseline_data[baseline_key]
# Calculate z-score
if baseline['std'] > 0:
z_score = abs(current_value - baseline['mean']) / baseline['std']
if z_score > threshold:
resource_anomalies.append({
'timestamp': timestamp,
'metric': metric,
'value': current_value,
'baseline_mean': baseline['mean'],
'z_score': z_score,
'severity': 'high' if z_score > 3 else 'medium'
})
if resource_anomalies:
anomalies.append({
'resource': resource['resourceName'],
'resource_id': resource['resourceId'],
'anomalies': resource_anomalies
})
return anomalies
def generate_report(self, anomalies):
"""Generate anomaly detection report"""
if not anomalies:
print("✅ No performance anomalies detected")
return
print(f"🚨 Detected {len(anomalies)} resources with performance anomalies:")
for resource_anomaly in anomalies:
print(f"\n📊 {resource_anomaly['resource']} ({resource_anomaly['resource_id']}):")
for anomaly in resource_anomaly['anomalies']:
severity_icon = "🔴" if anomaly['severity'] == 'high' else "🟡"
print(f" {severity_icon} {anomaly['metric']}: {anomaly['value']:.1f} "
f"(baseline: {anomaly['baseline_mean']:.1f}, z-score: {anomaly['z_score']:.2f})")
print(f" Time: {anomaly['timestamp']}")
# Usage
detector = AnomalyDetector("YOUR_API_KEY")
# Establish baseline
detector.establish_baseline(days=7)
# Detect current anomalies
anomalies = detector.detect_anomalies(threshold=2.0)
detector.generate_report(anomalies)
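Optimization recommendations and bottleneck analysis come from the related /monitoring/performance/optimization endpoint: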
import requests

def get_optimization_recommendations(resource_id=None):
"""Get performance optimization recommendations"""
    params = {
        # lowercase strings so the flags serialize as true rather than True in the query string
        'analyzeBottlenecks': 'true',
        'includeRecommendations': 'true',
        'timeRange': '24h'
    }
if resource_id:
params['resourceIds'] = [resource_id]
response = requests.get(
"https://api.tensorone.ai/v2/monitoring/performance/optimization",
headers={"Authorization": "Bearer YOUR_API_KEY"},
params=params
)
return response.json()
def analyze_performance_bottlenecks():
"""Analyze and provide optimization recommendations"""
recommendations = get_optimization_recommendations()
print("🔧 Performance Optimization Recommendations:")
print("=" * 50)
for resource in recommendations['resources']:
print(f"\n📈 {resource['resourceName']} ({resource['resourceType']}):")
# Resource-specific recommendations
if 'recommendations' in resource:
for rec in resource['recommendations']:
priority_icon = {"high": "🔴", "medium": "🟡", "low": "🟢"}.get(rec['priority'], "⚪")
print(f" {priority_icon} {rec['title']}")
print(f" {rec['description']}")
if 'estimatedImprovement' in rec:
print(f" 💡 Expected improvement: {rec['estimatedImprovement']}")
print()
# Bottleneck analysis
if 'bottlenecks' in resource:
print(" 🚧 Identified Bottlenecks:")
for bottleneck in resource['bottlenecks']:
print(f" - {bottleneck['component']}: {bottleneck['description']}")
print(f" Impact: {bottleneck['impact']}")
# Platform-wide recommendations
if 'platformRecommendations' in recommendations:
print("\n🌐 Platform-wide Recommendations:")
for rec in recommendations['platformRecommendations']:
print(f" • {rec['title']}")
print(f" {rec['description']}")
if 'costImpact' in rec:
print(f" 💰 Cost impact: {rec['costImpact']}")
# Run optimization analysis
analyze_performance_bottlenecks()