Manage and interact with model checkpoints created during training. Checkpoints allow you to save model state at specific points and restore or deploy models from those states.
# Regularly clean up old automatic checkpoints
def cleanup_old_checkpoints(job_id, keep_count=5):
    """Delete all but the newest ``keep_count`` automatic checkpoints of a job.

    The job's checkpoints are requested sorted by ``createdAt`` in descending
    order, so the first ``keep_count`` entries of type ``'automatic'`` are the
    most recent ones and are kept. Checkpoints of any other type (including
    ``'best'`` and ``'final'``) are never deleted.

    Args:
        job_id: Identifier of the training job whose checkpoints are cleaned.
        keep_count: Number of most recent automatic checkpoints to keep.

    Raises:
        requests.HTTPError: If listing the checkpoints or deleting one of
            them returns an error status.
        KeyError: If the listing response has no ``'checkpoints'`` entry.
    """
    # The listing call needs the same credentials as the delete calls.
    headers = {"Authorization": "Bearer YOUR_API_KEY"}
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    timeout_seconds = 30

    response = requests.get(
        f"https://api.tensorone.ai/v2/training/jobs/{job_id}/checkpoints",
        params={"sortBy": "createdAt", "sortOrder": "desc"},
        headers=headers,
        timeout=timeout_seconds,
    )
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()
    checkpoints = response.json()['checkpoints']

    # Keep best and final checkpoints, plus N most recent automatic ones
    to_delete = []
    automatic_count = 0
    for ckpt in checkpoints:
        if ckpt['type'] != 'automatic':
            continue
        automatic_count += 1
        if automatic_count > keep_count:
            to_delete.append(ckpt['checkpointId'])

    # Delete old checkpoints
    for ckpt_id in to_delete:
        delete_response = requests.delete(
            f"https://api.tensorone.ai/v2/training/checkpoints/{ckpt_id}",
            headers=headers,
            timeout=timeout_seconds,
        )
        # Surface failed deletions rather than silently leaving data behind.
        delete_response.raise_for_status()
# Download and backup critical checkpoints
def backup_checkpoint(checkpoint_id, local_path):
    """Download a checkpoint file and store it at ``local_path``.

    The checkpoint details are fetched first to obtain the ``downloadUrl``;
    the file behind that URL is then streamed to disk in 8 KiB chunks.

    Args:
        checkpoint_id: Identifier of the checkpoint to back up.
        local_path: Destination file path. Missing parent directories are
            created.

    Raises:
        requests.HTTPError: If the details request or the download returns an
            error status.
        KeyError: If the details response has no ``'downloadUrl'`` entry.
        OSError: If the destination cannot be created or written.
    """
    import os

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    timeout_seconds = 30

    # Get checkpoint details
    response = requests.get(
        f"https://api.tensorone.ai/v2/training/checkpoints/{checkpoint_id}",
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        timeout=timeout_seconds,
    )
    response.raise_for_status()
    checkpoint = response.json()
    download_url = checkpoint['downloadUrl']

    # Make sure the target directory exists so open() does not fail.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Download checkpoint file; the context manager releases the streamed
    # connection even if writing fails part-way through.
    with requests.get(
        download_url, stream=True, timeout=timeout_seconds
    ) as checkpoint_response:
        # Check the status before opening the file, so an HTTP error body is
        # never written to disk as if it were the checkpoint.
        checkpoint_response.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in checkpoint_response.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)

    print(f"Backed up checkpoint to {local_path}")


# Backup best checkpoint
backup_checkpoint("ckpt_best_abc123", "./backups/best_model.ckpt")
Checkpoints are automatically compressed and deduplicated to minimize storage costs. Similar model states share common data blocks to reduce overall storage usage.
Checkpoint download URLs expire after 1 hour for security reasons. Download the checkpoint immediately after obtaining its URL, or generate a new URL if the previous one has expired.