{
"success": true,
"data": {
"id": "cluster_abc123",
"name": "ml-training-cluster",
"description": "High-performance cluster for LLM training",
"status": "running",
"status_details": {
"message": "Cluster is running normally",
"last_status_change": "2024-01-15T14:35:00Z",
"health_checks": {
"gpu_health": "healthy",
"storage_health": "healthy",
"network_health": "healthy",
"docker_health": "healthy"
}
},
"configuration": {
"gpu_type": "A100",
"gpu_count": 4,
"cpu_cores": 32,
"memory_gb": 256,
"storage_gb": 1000,
"storage_type": "nvme",
"region": "us-west-2",
"availability_zone": "us-west-2a"
},
"project_info": {
"project_id": "proj_456",
"project_name": "ML Research Team",
"owner_id": "user_789",
"owner_email": "researcher@company.com"
},
"template_info": {
"template_id": "tmpl_pytorch_latest",
"template_name": "PyTorch 2.1 with CUDA 11.8",
"template_version": "v2.1.0",
"docker_image": "tensorone/pytorch:2.1-cuda11.8"
},
"network": {
"private_ip": "10.0.1.15",
"public_ip": "203.0.113.42",
"proxy_url": "https://cluster-abc123.tensorone.ai",
"ssh_connection": {
"host": "ssh-abc123.tensorone.ai",
"port": 22,
"username": "root",
"status": "connected",
"connection_string": "ssh root@ssh-abc123.tensorone.ai",
"last_connection": "2024-01-15T15:42:00Z"
},
"port_mappings": [
{
"internal_port": 8080,
"external_port": 32001,
"protocol": "tcp",
"description": "Web Application",
"url": "https://cluster-abc123.tensorone.ai:32001",
"status": "active"
},
{
"internal_port": 6006,
"external_port": 32002,
"protocol": "tcp",
"description": "TensorBoard",
"url": "https://cluster-abc123.tensorone.ai:32002",
"status": "active"
}
],
"security_groups": ["sg_ml_training", "sg_ssh_access"],
"firewall_rules": [
{
"direction": "inbound",
"protocol": "tcp",
"port_range": "22",
"source": "0.0.0.0/0",
"description": "SSH Access"
}
]
},
"metrics": {
"current": {
"timestamp": "2024-01-15T15:45:00Z",
"gpu_utilization": 87.3,
"gpu_memory_utilization": 94.2,
"cpu_utilization": 52.1,
"memory_utilization": 68.4,
"storage_utilization": 45.2,
"network_rx_mbps": 125.3,
"network_tx_mbps": 89.7,
"temperature_celsius": 72.5,
"power_usage_watts": 1250
},
"historical": {
"window": "24h",
"gpu_utilization": {
"avg": 82.1,
"min": 15.3,
"max": 98.7,
"trend": "increasing"
},
"memory_utilization": {
"avg": 65.4,
"min": 12.1,
"max": 94.2,
"trend": "stable"
},
"cost_efficiency": {
"utilization_score": 85.2,
"cost_per_compute_hour": 8.95
}
},
"alerts": [
{
"type": "high_gpu_utilization",
"severity": "info",
"message": "GPU utilization consistently above 85%",
"triggered_at": "2024-01-15T15:30:00Z"
}
]
},
"cost": {
"current_hourly_rate": 8.50,
"currency": "USD",
"session_cost": 68.25,
"total_lifetime_cost": 284.75,
"cost_breakdown": {
"gpu_cost": 6.80,
"cpu_cost": 0.85,
"memory_cost": 0.45,
"storage_cost": 0.25,
"network_cost": 0.15
},
"billing_period": {
"start": "2024-01-15T14:35:00Z",
"current": "2024-01-15T15:45:00Z",
"duration_hours": 1.17
},
"cost_projections": {
"daily_estimate": 204.00,
"weekly_estimate": 1428.00,
"monthly_estimate": 6120.00
}
},
"storage": {
"volumes": [
{
"name": "root",
"size_gb": 100,
"used_gb": 45,
"mount_path": "/",
"type": "nvme",
"encrypted": true
},
{
"name": "data",
"size_gb": 900,
"used_gb": 230,
"mount_path": "/data",
"type": "nvme",
"encrypted": true
}
],
"snapshots": [
{
"id": "snap_123",
"name": "pre_training_snapshot",
"size_gb": 45,
"created_at": "2024-01-15T14:00:00Z"
}
]
},
"environment": {
"variables": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3",
"NCCL_SOCKET_IFNAME": "eth0",
"PYTHONPATH": "/workspace"
},
"secrets": ["WANDB_API_KEY", "HUGGINGFACE_TOKEN"],
"runtime_info": {
"python_version": "3.9.18",
"cuda_version": "11.8",
"driver_version": "520.61.05",
"docker_version": "24.0.7"
}
},
"auto_terminate": {
"enabled": true,
"idle_minutes": 60,
"max_runtime_hours": 24,
"cost_limit_usd": 500.0,
"estimated_termination": "2024-01-16T14:35:00Z",
"current_idle_minutes": 5
},
"uptime_seconds": 4200,
"created_at": "2024-01-15T14:35:00Z",
"updated_at": "2024-01-15T15:45:00Z",
"last_accessed": "2024-01-15T15:42:00Z",
"tags": {
"team": "ml-research",
"project": "llm-training",
"environment": "production"
}
},
"meta": {
"request_id": "req_get_456",
"response_time_ms": 89,
"cache_hit": false,
"data_freshness_seconds": 15
}
}