# Fetch overall platform status. -G appends each -d pair to the URL as a
# query parameter and implies a GET request, so the redundant (and with -d
# otherwise method-overriding) "-X GET" has been dropped.
curl -G "https://api.tensorone.ai/v2/monitoring/system-status" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -d "includeMetrics=true" \
  -d "includeIncidents=true" \
  -d "services[]=api&services[]=clusters&services[]=ai-services"
{
"overallStatus": "operational",
"lastUpdated": "2024-01-16T18:30:00Z",
"services": [
{
"name": "API",
"status": "operational",
"uptime": 99.97,
"responseTime": 85,
"lastIncident": "2024-01-10T14:22:00Z",
"description": "All API endpoints are functioning normally",
"regions": [
{
"region": "us-east-1",
"status": "operational",
"latency": 78
},
{
"region": "us-west-2",
"status": "operational",
"latency": 92
}
]
},
{
"name": "GPU Clusters",
"status": "operational",
"uptime": 99.95,
"responseTime": 1200,
"lastIncident": "2024-01-08T09:15:00Z",
"description": "GPU infrastructure operating normally",
"regions": [
{
"region": "us-east-1",
"status": "operational",
"latency": 1150
},
{
"region": "us-west-2",
"status": "operational",
"latency": 1250
}
]
},
{
"name": "AI Services",
"status": "degraded",
"uptime": 98.5,
"responseTime": 3200,
"lastIncident": "2024-01-16T17:45:00Z",
"description": "Text-to-video generation experiencing higher latency",
"regions": [
{
"region": "us-east-1",
"status": "degraded",
"latency": 3800
},
{
"region": "us-west-2",
"status": "operational",
"latency": 2600
}
]
}
],
"incidents": [
{
"id": "INC-2024-001",
"title": "Increased latency in text-to-video generation",
"status": "monitoring",
"severity": "medium",
"startedAt": "2024-01-16T17:45:00Z",
"updatedAt": "2024-01-16T18:15:00Z",
"affectedServices": ["AI Services"],
"description": "We are monitoring increased response times for text-to-video requests. The issue appears to be isolated to the US East region."
}
],
"maintenanceWindows": [
{
"id": "MAINT-2024-002",
"title": "Database optimization and indexing",
"startTime": "2024-01-18T06:00:00Z",
"endTime": "2024-01-18T08:00:00Z",
"affectedServices": ["API", "Training Services"],
"impact": "minimal",
"status": "scheduled"
}
],
"metrics": {
"apiRequests": {
"total": 2486532,
"successful": 2461204,
"failed": 25328,
"averageResponseTime": 127
},
"resourceUsage": {
"cpuUtilization": 67.2,
"memoryUtilization": 78.5,
"storageUsage": 45.8,
"networkThroughput": {
"inbound": "2.3 Gbps",
"outbound": "4.7 Gbps"
}
},
"gpuClusters": {
"totalClusters": 1247,
"activeClusters": 892,
"averageUtilization": 82.4,
"queueLength": 23
}
}
}
Get the real-time status of TensorOne platform services, infrastructure, and operational metrics.
# Fetch overall platform status. -G appends each -d pair to the URL as a
# query parameter and implies a GET request, so the redundant "-X GET" has
# been dropped.
curl -G "https://api.tensorone.ai/v2/monitoring/system-status" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -d "includeMetrics=true" \
  -d "includeIncidents=true" \
  -d "services[]=api&services[]=clusters&services[]=ai-services"
{
"overallStatus": "operational",
"lastUpdated": "2024-01-16T18:30:00Z",
"services": [
{
"name": "API",
"status": "operational",
"uptime": 99.97,
"responseTime": 85,
"lastIncident": "2024-01-10T14:22:00Z",
"description": "All API endpoints are functioning normally",
"regions": [
{
"region": "us-east-1",
"status": "operational",
"latency": 78
},
{
"region": "us-west-2",
"status": "operational",
"latency": 92
}
]
},
{
"name": "GPU Clusters",
"status": "operational",
"uptime": 99.95,
"responseTime": 1200,
"lastIncident": "2024-01-08T09:15:00Z",
"description": "GPU infrastructure operating normally",
"regions": [
{
"region": "us-east-1",
"status": "operational",
"latency": 1150
},
{
"region": "us-west-2",
"status": "operational",
"latency": 1250
}
]
},
{
"name": "AI Services",
"status": "degraded",
"uptime": 98.5,
"responseTime": 3200,
"lastIncident": "2024-01-16T17:45:00Z",
"description": "Text-to-video generation experiencing higher latency",
"regions": [
{
"region": "us-east-1",
"status": "degraded",
"latency": 3800
},
{
"region": "us-west-2",
"status": "operational",
"latency": 2600
}
]
}
],
"incidents": [
{
"id": "INC-2024-001",
"title": "Increased latency in text-to-video generation",
"status": "monitoring",
"severity": "medium",
"startedAt": "2024-01-16T17:45:00Z",
"updatedAt": "2024-01-16T18:15:00Z",
"affectedServices": ["AI Services"],
"description": "We are monitoring increased response times for text-to-video requests. The issue appears to be isolated to the US East region."
}
],
"maintenanceWindows": [
{
"id": "MAINT-2024-002",
"title": "Database optimization and indexing",
"startTime": "2024-01-18T06:00:00Z",
"endTime": "2024-01-18T08:00:00Z",
"affectedServices": ["API", "Training Services"],
"impact": "minimal",
"status": "scheduled"
}
],
"metrics": {
"apiRequests": {
"total": 2486532,
"successful": 2461204,
"failed": 25328,
"averageResponseTime": 127
},
"resourceUsage": {
"cpuUtilization": 67.2,
"memoryUtilization": 78.5,
"storageUsage": 45.8,
"networkThroughput": {
"inbound": "2.3 Gbps",
"outbound": "4.7 Gbps"
}
},
"gpuClusters": {
"totalClusters": 1247,
"activeClusters": 892,
"averageUtilization": 82.4,
"queueLength": 23
}
}
}
Available services:
- api - Core API services
- clusters - GPU cluster infrastructure
- endpoints - Serverless endpoints
- training - ML training services
- ai-services - AI generation services
- storage - File and data storage
- database - Database services
- monitoring - Monitoring and logging systems

Available regions:
- us-east-1 - US East (Virginia)
- us-west-2 - US West (Oregon)
- eu-west-1 - Europe (Ireland)
- ap-southeast-1 - Asia Pacific (Singapore)
- global - Global services

Service Status — status values: operational, degraded, partial_outage, major_outage, maintenance.

Incident — status values: investigating, identified, monitoring, resolved; severity values: low, medium, high, critical.

Maintenance Window — impact values: none, minimal, moderate, significant; status values: scheduled, in_progress, completed, cancelled.

System Metrics — see the "metrics" object in the example response above.
# Fetch overall platform status. -G appends each -d pair to the URL as a
# query parameter and implies a GET request, so the redundant "-X GET" has
# been dropped.
curl -G "https://api.tensorone.ai/v2/monitoring/system-status" \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -d "includeMetrics=true" \
  -d "includeIncidents=true" \
  -d "services[]=api&services[]=clusters&services[]=ai-services"
{
"overallStatus": "operational",
"lastUpdated": "2024-01-16T18:30:00Z",
"services": [
{
"name": "API",
"status": "operational",
"uptime": 99.97,
"responseTime": 85,
"lastIncident": "2024-01-10T14:22:00Z",
"description": "All API endpoints are functioning normally",
"regions": [
{
"region": "us-east-1",
"status": "operational",
"latency": 78
},
{
"region": "us-west-2",
"status": "operational",
"latency": 92
}
]
},
{
"name": "GPU Clusters",
"status": "operational",
"uptime": 99.95,
"responseTime": 1200,
"lastIncident": "2024-01-08T09:15:00Z",
"description": "GPU infrastructure operating normally",
"regions": [
{
"region": "us-east-1",
"status": "operational",
"latency": 1150
},
{
"region": "us-west-2",
"status": "operational",
"latency": 1250
}
]
},
{
"name": "AI Services",
"status": "degraded",
"uptime": 98.5,
"responseTime": 3200,
"lastIncident": "2024-01-16T17:45:00Z",
"description": "Text-to-video generation experiencing higher latency",
"regions": [
{
"region": "us-east-1",
"status": "degraded",
"latency": 3800
},
{
"region": "us-west-2",
"status": "operational",
"latency": 2600
}
]
}
],
"incidents": [
{
"id": "INC-2024-001",
"title": "Increased latency in text-to-video generation",
"status": "monitoring",
"severity": "medium",
"startedAt": "2024-01-16T17:45:00Z",
"updatedAt": "2024-01-16T18:15:00Z",
"affectedServices": ["AI Services"],
"description": "We are monitoring increased response times for text-to-video requests. The issue appears to be isolated to the US East region."
}
],
"maintenanceWindows": [
{
"id": "MAINT-2024-002",
"title": "Database optimization and indexing",
"startTime": "2024-01-18T06:00:00Z",
"endTime": "2024-01-18T08:00:00Z",
"affectedServices": ["API", "Training Services"],
"impact": "minimal",
"status": "scheduled"
}
],
"metrics": {
"apiRequests": {
"total": 2486532,
"successful": 2461204,
"failed": 25328,
"averageResponseTime": 127
},
"resourceUsage": {
"cpuUtilization": 67.2,
"memoryUtilization": 78.5,
"storageUsage": 45.8,
"networkThroughput": {
"inbound": "2.3 Gbps",
"outbound": "4.7 Gbps"
}
},
"gpuClusters": {
"totalClusters": 1247,
"activeClusters": 892,
"averageUtilization": 82.4,
"queueLength": 23
}
}
}
import time
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import requests  # used by every HTTP call below but was never imported (NameError at runtime)
class StatusMonitor:
    """Polls the TensorOne system-status endpoint and keeps an in-memory history.

    Every successful check_status() call appends the parsed payload (plus a
    local 'timestamp' datetime) to self.status_history, oldest first.
    """

    # Status -> emoji map, hoisted to a class constant so it is not rebuilt
    # on every loop iteration in _print_status_summary.
    _STATUS_EMOJI = {
        'operational': '✅',
        'degraded': '⚠️',
        'partial_outage': '🔶',
        'major_outage': '🔴',
        'maintenance': '🔧'
    }

    def __init__(self, api_key):
        self.api_key = api_key        # bearer token for the monitoring API
        self.status_history = []      # status payloads; grows unbounded over long runs

    def check_status(self):
        """Fetch the current system status, record it in the history, return it.

        Raises:
            requests.HTTPError: on a non-2xx response (the original silently
                tried to JSON-decode error pages).
        """
        response = requests.get(
            "https://api.tensorone.ai/v2/monitoring/system-status",
            headers={"Authorization": f"Bearer {self.api_key}"},
            params={"includeMetrics": True},
            timeout=30  # keep a hung request from stalling the monitor loop
        )
        response.raise_for_status()
        status = response.json()
        status['timestamp'] = datetime.now()
        self.status_history.append(status)
        return status

    def get_service_health(self, service_name):
        """Return the service entry matching service_name (case-insensitive), or None.

        Uses the most recent cached status; only fetches a fresh one when the
        history is empty.
        """
        latest_status = self.status_history[-1] if self.status_history else self.check_status()
        for service in latest_status['services']:
            if service['name'].lower() == service_name.lower():
                return service
        return None

    def monitor_continuously(self, interval=60, duration=3600):
        """Poll every `interval` seconds until `duration` seconds have elapsed.

        Prints a summary per poll and an alert line whenever the overall
        status changes between consecutive polls. Ctrl-C stops the loop;
        any other error is logged and polling continues.
        """
        end_time = datetime.now() + timedelta(seconds=duration)
        print(f"Starting continuous monitoring for {duration/60:.0f} minutes...")
        while datetime.now() < end_time:
            try:
                status = self.check_status()
                self._print_status_summary(status)
                # Alert on status changes
                if len(self.status_history) > 1:
                    previous = self.status_history[-2]
                    if status['overallStatus'] != previous['overallStatus']:
                        print(f"🚨 ALERT: Overall status changed from {previous['overallStatus']} to {status['overallStatus']}")
                time.sleep(interval)
            except KeyboardInterrupt:
                print("Monitoring stopped by user")
                break
            except Exception as e:
                print(f"Error during monitoring: {e}")
                time.sleep(interval)

    def _print_status_summary(self, status):
        """Print an emoji-annotated, one-shot summary of a status payload."""
        timestamp = status['timestamp'].strftime("%H:%M:%S")
        overall = status['overallStatus'].upper()
        print(f"\n[{timestamp}] Overall Status: {overall}")
        for service in status['services']:
            status_emoji = self._STATUS_EMOJI.get(service['status'], '❓')
            print(f" {status_emoji} {service['name']}: {service['status']} ({service['uptime']:.1f}%)")
        if status.get('incidents'):
            print(f" 📋 Active Incidents: {len(status['incidents'])}")
# Usage
monitor = StatusMonitor("YOUR_API_KEY")  # NOTE(review): use a real key; avoid hardcoding secrets in source
# Single status check
current_status = monitor.check_status()  # network call to the monitoring API
print(f"Overall Status: {current_status['overallStatus']}")
# Check specific service
api_health = monitor.get_service_health("API")  # matched case-insensitively; None if unknown
if api_health:
    print(f"API Health: {api_health['status']} - {api_health['uptime']}% uptime")
# Continuous monitoring (uncomment to run)
# monitor.monitor_continuously(interval=30, duration=1800) # Monitor for 30 minutes
def check_regional_status():
    """Fetch the system status once per region of interest.

    Returns:
        dict mapping region name -> parsed status payload.

    Raises:
        requests.HTTPError: on any non-2xx response.

    NOTE(review): assumes the endpoint accepts a 'regions' query parameter —
    confirm against the API reference.
    """
    regions = ['us-east-1', 'us-west-2', 'eu-west-1', 'ap-southeast-1']
    regional_status = {}
    for region in regions:
        response = requests.get(
            "https://api.tensorone.ai/v2/monitoring/system-status",
            headers={"Authorization": "Bearer YOUR_API_KEY"},
            params={
                "regions": [region],
                "includeMetrics": True
            },
            timeout=30  # bound each request so one slow region cannot hang the whole scan
        )
        response.raise_for_status()  # surface HTTP errors instead of parsing an error body
        regional_status[region] = response.json()
    return regional_status
def analyze_regional_performance(regional_status):
    """Print a per-region performance report to stdout.

    regional_status maps a region name to its status payload, as returned by
    check_regional_status().
    """
    print("Regional Performance Analysis:")
    print("=" * 50)
    for region_name, payload in regional_status.items():
        print(f"\n{region_name.upper()}:")
        print(f" Overall Status: {payload['overallStatus']}")
        # Mean response time over the services that report a truthy responseTime.
        samples = [svc['responseTime'] for svc in payload['services'] if svc.get('responseTime')]
        mean_rt = sum(samples) / len(samples) if samples else 0
        print(f" Average Response Time: {mean_rt:.0f}ms")
        # Incidents scoped to this region only.
        scoped = [
            incident
            for incident in payload.get('incidents', [])
            if region_name in incident.get('affectedRegions', [])
        ]
        if scoped:
            print(f" Regional Incidents: {len(scoped)}")
        # Resource utilization, when the payload carries metrics.
        if 'metrics' in payload and 'resourceUsage' in payload['metrics']:
            usage = payload['metrics']['resourceUsage']
            print(f" CPU Utilization: {usage['cpuUtilization']}%")
            print(f" Memory Utilization: {usage['memoryUtilization']}%")
# Analyze regional performance
regional_data = check_regional_status()  # one HTTP request per region (network I/O)
analyze_regional_performance(regional_data)  # prints the report to stdout
def get_service_dependencies():
    """Fetch the service dependency graph from the monitoring API.

    Returns:
        Parsed JSON mapping of service name -> dependency info.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    response = requests.get(
        "https://api.tensorone.ai/v2/monitoring/service-dependencies",
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        timeout=30  # avoid hanging indefinitely on a stalled connection
    )
    response.raise_for_status()  # propagate HTTP errors instead of decoding an error body
    return response.json()
def analyze_impact(failing_service, dependencies):
    """Return the services that (transitively) depend on a failing service.

    Args:
        failing_service: name of the service assumed to be down.
        dependencies: mapping of service name -> info dict; each info dict may
            carry a 'dependencies' list of service names it depends on.

    Returns:
        List of dependent service names in discovery order, de-duplicated.
        (The original version could report the same service multiple times
        when it depended on several impacted services: the visited set only
        prevented re-recursion, not re-appending.)
    """
    def find_dependent_services(service, deps, visited):
        # Depth-first walk over the reverse dependency edges.
        if service in visited:
            return []
        visited.add(service)
        dependents = []
        for dep_service, dep_info in deps.items():
            if service in dep_info.get('dependencies', []):
                dependents.append(dep_service)
                dependents.extend(find_dependent_services(dep_service, deps, visited))
        return dependents

    ordered = find_dependent_services(failing_service, dependencies, set())
    # dict.fromkeys preserves first-seen order while removing duplicates.
    return list(dict.fromkeys(ordered))
# Get service dependencies
dependencies = get_service_dependencies()  # network call; returns service -> info mapping
# Analyze impact of API service failure
if 'API' in dependencies:
    impacted = analyze_impact('API', dependencies)  # transitive reverse-dependency walk
    print(f"If API service fails, these services may be impacted:")
    for service in impacted:
        print(f" - {service}")
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
class StatusNotifier:
    """Watches the system-status endpoint and notifies on meaningful changes.

    A "change" is either a different overallStatus or a different set of
    active incident IDs. Notifications go out over email and/or a webhook,
    whichever are configured.
    """

    def __init__(self, api_key, email_config=None, webhook_url=None):
        self.api_key = api_key
        # dict with keys: sender, recipient, smtp_server, port, password
        self.email_config = email_config
        self.webhook_url = webhook_url
        self.last_status = None  # previous payload; None until the first check

    def check_and_notify(self):
        """Fetch the current status; send notifications if it changed.

        Returns the freshly fetched status payload.

        Raises:
            requests.HTTPError: on a non-2xx response (the original silently
                JSON-decoded error pages).
        """
        response = requests.get(
            "https://api.tensorone.ai/v2/monitoring/system-status",
            headers={"Authorization": f"Bearer {self.api_key}"},
            timeout=30  # bound the request so a scheduled check cannot hang
        )
        response.raise_for_status()
        current_status = response.json()
        if self.last_status and self._status_changed(current_status):
            self._send_notifications(current_status)
        self.last_status = current_status
        return current_status

    def _status_changed(self, current_status):
        """True when overallStatus differs or the set of incident IDs differs."""
        if current_status['overallStatus'] != self.last_status['overallStatus']:
            return True
        # Check for new (or resolved) incidents
        current_incidents = {i['id'] for i in current_status.get('incidents', [])}
        last_incidents = {i['id'] for i in self.last_status.get('incidents', [])}
        return current_incidents != last_incidents

    def _send_notifications(self, status):
        """Send the formatted status message via every configured channel."""
        message = self._format_status_message(status)
        if self.email_config:
            self._send_email(message)
        if self.webhook_url:
            self._send_webhook(message, status)

    def _format_status_message(self, status):
        """Build the plain-text notification body."""
        message = f"TensorOne Status Update\n"
        message += f"Overall Status: {status['overallStatus']}\n"
        message += f"Time: {status['lastUpdated']}\n\n"
        if status.get('incidents'):
            message += "Active Incidents:\n"
            for incident in status['incidents']:
                message += f"- {incident['title']} ({incident['severity']})\n"
        return message

    def _send_email(self, message):
        """Send the message by SMTP; logs (never raises) on failure."""
        try:
            msg = MIMEMultipart()
            msg['From'] = self.email_config['sender']
            msg['To'] = self.email_config['recipient']
            msg['Subject'] = "TensorOne Status Alert"
            msg.attach(MIMEText(message, 'plain'))
            # Context manager closes the connection even if login/send fails;
            # the original leaked the socket when an error occurred mid-session.
            with smtplib.SMTP(self.email_config['smtp_server'], self.email_config['port']) as server:
                server.starttls()
                server.login(self.email_config['sender'], self.email_config['password'])
                server.sendmail(self.email_config['sender'], self.email_config['recipient'], msg.as_string())
            print("Email notification sent")
        except Exception as e:
            print(f"Failed to send email: {e}")

    def _send_webhook(self, message, status):
        """POST a compact JSON summary to the webhook; logs (never raises) on failure."""
        try:
            webhook_data = {
                "text": message,
                "status": status['overallStatus'],
                "timestamp": status['lastUpdated'],
                "incidents": len(status.get('incidents', []))
            }
            response = requests.post(self.webhook_url, json=webhook_data, timeout=15)
            if response.status_code == 200:
                print("Webhook notification sent")
            else:
                print(f"Webhook failed: {response.status_code}")
        except Exception as e:
            print(f"Failed to send webhook: {e}")
# Setup notifications
# NOTE(review): credentials belong in environment variables or a secrets
# manager, not in source — shown inline here only for illustration.
email_config = {
    'sender': 'alerts@yourcompany.com',
    'recipient': 'admin@yourcompany.com',
    'smtp_server': 'smtp.gmail.com',
    'port': 587,
    'password': 'your_app_password'
}
notifier = StatusNotifier(
    api_key="YOUR_API_KEY",
    email_config=email_config,
    webhook_url="https://hooks.slack.com/your/webhook/url"
)
# Check status and notify if changed
status = notifier.check_and_notify()  # network call; alerts only when status differs from last check