Add admin cluster nodes page with real-time monitoring and driver preferences

- Add hostname passing from cluster client to master
- Create client_driver_preferences database table for storing driver preferences
- Add /admin/cluster_nodes page with auto-updating node list
- Add API endpoints for fetching nodes and setting driver preferences
- Update admin navbar and API documentation
- Apply database migrations
parent c01dda41
{% extends "base.html" %}
{% block title %}Cluster Nodes - VidAI{% endblock %}
{% block head %}
<style>
.container { max-width: 1400px; margin: 2rem auto; padding: 0 2rem; }
.admin-card { background: white; padding: 2rem; border-radius: 12px; box-shadow: 0 2px 10px rgba(0,0,0,0.05); margin-bottom: 2rem; }
.card-header { margin-bottom: 1.5rem; }
.card-header h3 { margin: 0; color: #1e293b; }
.btn { padding: 0.75rem 2rem; background: #667eea; color: white; border: none; border-radius: 8px; font-size: 1rem; font-weight: 600; cursor: pointer; text-decoration: none; display: inline-block; }
.btn:hover { background: #5a67d8; }
.btn-sm { padding: 0.5rem 1rem; font-size: 0.875rem; }
.table { width: 100%; border-collapse: collapse; margin-top: 1rem; }
.table th, .table td { padding: 1rem; text-align: left; border-bottom: 1px solid #e5e7eb; }
.table th { background: #f8fafc; font-weight: 600; color: #374151; }
.status-connected { color: #065f46; font-weight: 500; }
.status-disconnected { color: #dc2626; font-weight: 500; }
.alert { padding: 0.75rem; border-radius: 8px; margin-bottom: 1rem; }
.alert-error { background: #fee2e2; color: #dc2626; border: 1px solid #fecaca; }
.alert-success { background: #d1fae5; color: #065f46; border: 1px solid #a7f3d0; }
/* Modal styles */
.modal { display: none; position: fixed; z-index: 1000; left: 0; top: 0; width: 100%; height: 100%; background-color: rgba(0,0,0,0.5); }
.modal-content { background-color: white; margin: 15% auto; padding: 2rem; border-radius: 12px; width: 90%; max-width: 500px; box-shadow: 0 4px 20px rgba(0,0,0,0.15); }
.modal-header { margin-bottom: 1.5rem; }
.modal-header h3 { margin: 0; color: #1e293b; }
.close { color: #aaa; float: right; font-size: 28px; font-weight: bold; cursor: pointer; }
.close:hover { color: #000; }
.form-group { margin-bottom: 1rem; }
.form-group label { display: block; margin-bottom: 0.5rem; font-weight: 500; color: #374151; }
.form-group select { width: 100%; padding: 0.75rem; border: 1px solid #d1d5db; border-radius: 8px; font-size: 1rem; }
.modal-footer { text-align: right; margin-top: 2rem; }
.btn-secondary { background: #6b7280; }
.btn-secondary:hover { background: #4b5563; }
</style>
{% endblock %}
{% block content %}
<div class="container">
<div class="admin-card">
<div class="card-header">
<h3><i class="fas fa-server"></i> Cluster Nodes</h3>
<p>Monitor and manage connected cluster nodes and their configurations.</p>
</div>
{% with messages = get_flashed_messages(with_categories=true) %}
{% if messages %}
{% for category, message in messages %}
<div class="alert alert-{{ 'error' if category == 'error' else 'success' }}">{{ message }}</div>
{% endfor %}
{% endif %}
{% endwith %}
<table class="table" id="nodesTable">
<thead>
<tr>
<th>Status</th>
<th>Token Name</th>
<th>Hostname</th>
<th>GPUs</th>
<th>GPU Memory</th>
<th>Workers</th>
<th>IP Address</th>
<th>Actions</th>
</tr>
</thead>
<tbody id="nodesTableBody">
<tr>
<td colspan="8" style="text-align: center; color: #6b7280;">Loading cluster nodes...</td>
</tr>
</tbody>
</table>
</div>
</div>
<!-- Driver Selection Modal -->
<div id="driverModal" class="modal">
<div class="modal-content">
<div class="modal-header">
<span class="close" onclick="closeModal()">&times;</span>
<h3>Select Driver for <span id="modalHostname"></span></h3>
</div>
<form id="driverForm">
<input type="hidden" id="modalHostnameInput" name="hostname">
<input type="hidden" id="modalTokenInput" name="token">
<div class="form-group">
<label for="driverSelect">Preferred Driver:</label>
<select id="driverSelect" name="driver">
<option value="cuda">CUDA</option>
<option value="rocm">ROCm</option>
</select>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" onclick="closeModal()">Cancel</button>
<button type="submit" class="btn">Save Preference</button>
</div>
</form>
</div>
</div>
<script>
// Most recent node list returned by the API; read by renderNodesTable().
let nodesData = [];

/**
 * Fetch the node list from the admin API and re-render the table.
 *
 * A non-2xx response is treated as a failure even if its body is valid JSON;
 * otherwise an error payload without a `nodes` key would be displayed as
 * "No cluster nodes found". Any failure (network, HTTP status, JSON parsing,
 * rendering) is logged and replaced by an error row in the table body.
 *
 * @returns {Promise<void>} Settles once the table has been updated; failures
 *   are handled internally, so callers may ignore the returned promise.
 */
function updateNodesTable() {
  return fetch('/api/admin/cluster_nodes')
    .then((response) => {
      if (!response.ok) {
        throw new Error(`Unexpected HTTP status ${response.status}`);
      }
      return response.json();
    })
    .then((data) => {
      nodesData = data.nodes || [];
      renderNodesTable();
    })
    .catch((error) => {
      console.error('Error fetching nodes:', error);
      document.getElementById('nodesTableBody').innerHTML =
        '<tr><td colspan="8" style="text-align: center; color: #dc2626;">Error loading cluster nodes</td></tr>';
    });
}
/**
 * Rebuild the body of the nodes table from the shared `nodesData` array.
 *
 * Every value taken from a node record is HTML-escaped before it is placed
 * into the markup: hostnames and token names are reported by remote machines
 * and must not be able to inject markup or script into the admin page.
 * The "Set Driver" button carries hostname and token in data attributes and
 * reads them back via `this.dataset`, so no node value is ever spliced into
 * a JavaScript string literal.
 */
function renderNodesTable() {
  const htmlEscapes = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  // Convert any value to text that is safe in element content and quoted attributes.
  const escapeHtml = (value) =>
    String(value ?? '').replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);

  const tbody = document.getElementById('nodesTableBody');
  if (nodesData.length === 0) {
    tbody.innerHTML = '<tr><td colspan="8" style="text-align: center; color: #6b7280;">No cluster nodes found</td></tr>';
    return;
  }

  tbody.innerHTML = nodesData.map((node) => {
    const statusClass = node.connected ? 'connected' : 'disconnected';
    const statusLabel = node.connected ? '● Connected' : '● Disconnected';
    // Tolerate records without a gpu_memory list instead of aborting the whole render.
    const gpuMemory = Array.isArray(node.gpu_memory) ? node.gpu_memory : [];
    const gpuLines = gpuMemory.length > 0
      ? gpuMemory.map((line) => escapeHtml(line)).join('<br>')
      : 'No GPU info';
    const totalLine = node.total_memory > 0
      ? `<br><strong>Total: ${escapeHtml(node.total_memory)}GB</strong>`
      : '';
    const hostname = escapeHtml(node.hostname);
    const token = escapeHtml(node.token);

    return `
<tr>
<td>
<span class="status-${statusClass}">
${statusLabel}
</span>
</td>
<td>${escapeHtml(node.token_name)}</td>
<td>${hostname}</td>
<td>${escapeHtml(node.gpus)}</td>
<td>
${gpuLines}
${totalLine}
</td>
<td>${escapeHtml(node.workers_available)}</td>
<td>${escapeHtml(node.ip_address)}</td>
<td>
<button class="btn btn-sm" data-hostname="${hostname}" data-token="${token}" onclick="openDriverModal(this.dataset.hostname, this.dataset.token, this.dataset.hostname)">
Set Driver
</button>
</td>
</tr>
`;
  }).join('');
}
/**
 * Populate and show the driver-selection modal for one node.
 *
 * @param {string} hostname - Text shown in the modal title.
 * @param {string} tokenName - Value stored in the hidden `token` form field.
 *   Despite the parameter name, the "Set Driver" button in the nodes table
 *   passes the node's `token` field here, not its display name.
 * @param {string} hostnameValue - Value stored in the hidden `hostname` form field.
 */
function openDriverModal(hostname, tokenName, hostnameValue) {
  const byId = (id) => document.getElementById(id);

  byId('modalHostname').textContent = hostname;
  byId('modalHostnameInput').value = hostnameValue;
  byId('modalTokenInput').value = tokenName;
  byId('driverModal').style.display = 'block';
}
/** Hide the driver-selection modal. */
function closeModal() {
  const modal = document.getElementById('driverModal');
  modal.style.display = 'none';
}
// Submit the driver form via fetch instead of a full page navigation.
// The form fields (hostname, token, driver) are posted as multipart form data;
// the JSON reply is expected to carry `success` and, on failure, `error`.
document.getElementById('driverForm').addEventListener('submit', async function (e) {
  e.preventDefault();
  const payload = new FormData(this);

  try {
    const response = await fetch('/api/admin/cluster_nodes/set_driver', {
      method: 'POST',
      body: payload
    });
    const result = await response.json();

    if (!result.success) {
      showNotification('Failed to save driver preference: ' + (result.error || 'Unknown error'), 'error');
      return;
    }

    closeModal();
    // Show success message
    showNotification('Driver preference saved successfully!', 'success');
  } catch (error) {
    // Network failure, non-JSON reply, or an exception raised while handling the reply.
    console.error('Error saving driver preference:', error);
    showNotification('Error saving driver preference', 'error');
  }
});
/**
 * Show a dismissible notification that removes itself after five seconds.
 *
 * The message is inserted as a text node, never parsed as HTML: callers pass
 * error strings received from the server, which must not be able to inject
 * markup into the admin page.
 *
 * @param {string} message - Text to display.
 * @param {string} type - Appended to the `notification` CSS class
 *   (callers in this page use 'success' and 'error').
 */
function showNotification(message, type) {
  let container = document.getElementById('notificationContainer');
  if (!container) {
    // This template does not define the container itself (presumably the base
    // template does - verify); create one rather than throwing on a null element.
    container = document.createElement('div');
    container.id = 'notificationContainer';
    document.body.appendChild(container);
  }

  const notification = document.createElement('div');
  notification.className = `notification ${type}`;

  const closeButton = document.createElement('span');
  closeButton.className = 'notification-close';
  closeButton.textContent = '\u00d7';
  closeButton.addEventListener('click', () => notification.remove());

  notification.appendChild(closeButton);
  notification.appendChild(document.createTextNode(String(message)));
  container.appendChild(notification);

  // Auto-remove after 5 seconds unless the user already dismissed it.
  setTimeout(() => {
    if (notification.parentElement) {
      notification.remove();
    }
  }, 5000);
}
// Auto-update every 10 seconds
setInterval(updateNodesTable, 10000);

// Initial load
document.addEventListener('DOMContentLoaded', () => {
  updateNodesTable();
});

// Close the modal when the backdrop (the modal element itself, not its content) is clicked.
// Registered with addEventListener so that a click handler installed elsewhere on
// `window` is neither overwritten by this page nor able to overwrite this one.
window.addEventListener('click', (event) => {
  const modal = document.getElementById('driverModal');
  if (event.target === modal) {
    closeModal();
  }
});
</script>
{% endblock %}
\ No newline at end of file
......@@ -407,6 +407,95 @@
]
}
]
}</div>
</div>
</div>
</div>
<div class="endpoint-card" id="cluster_nodes">
<div class="endpoint-header">
<span class="method">GET</span>
<h3>/api/admin/cluster_nodes</h3>
</div>
<div class="endpoint-content">
<p class="endpoint-description">
<i class="fas fa-network-wired text-success"></i>
Get detailed information about all cluster nodes, including connected and recently disconnected nodes.
</p>
<div class="curl-section">
<h4><i class="fas fa-terminal"></i> Curl Example</h4>
<div class="code-block">curl -H "Authorization: Bearer YOUR_ADMIN_API_TOKEN" \
{{ request.host_url }}api/admin/cluster_nodes</div>
</div>
<div class="response-section">
<h4><i class="fas fa-reply"></i> Response</h4>
<div class="code-block">{
"nodes": [
{
"token": "abc123...",
"token_name": "Worker Node 1",
"hostname": "worker1.example.com",
"gpus": 1,
"gpu_memory": ["CUDA Device 0: 8GB VRAM"],
"total_memory": 8,
"workers_available": 2,
"ip_address": "192.168.1.100",
"connected": true,
"last_seen": 1640995300.0
}
]
}</div>
</div>
</div>
</div>
<div class="endpoint-card" id="set_driver">
<div class="endpoint-header">
<span class="method">POST</span>
<h3>/api/admin/cluster_nodes/set_driver</h3>
</div>
<div class="endpoint-content">
<p class="endpoint-description">
<i class="fas fa-cogs text-warning"></i>
Set the preferred GPU driver (CUDA or ROCm) for a specific cluster node. This preference is saved and reused for future connections from the same hostname + token combination.
</p>
<div class="params-section">
<h4><i class="fas fa-cogs"></i> Parameters</h4>
<div class="params-list">
<div class="param-item">
<span class="param-name">hostname</span>
<span class="param-type">string (required)</span>
<div class="param-desc">Hostname of the cluster node</div>
</div>
<div class="param-item">
<span class="param-name">token</span>
<span class="param-type">string (required)</span>
<div class="param-desc">Authentication token for the node</div>
</div>
<div class="param-item">
<span class="param-name">driver</span>
<span class="param-type">string (required)</span>
<div class="param-desc">Preferred driver: "cuda" or "rocm"</div>
</div>
</div>
</div>
<div class="curl-section">
<h4><i class="fas fa-terminal"></i> Curl Example</h4>
<div class="code-block">curl -X POST -H "Authorization: Bearer YOUR_ADMIN_API_TOKEN" \
-F "hostname=worker1.example.com" \
-F "token=abc123..." \
-F "driver=cuda" \
{{ request.host_url }}api/admin/cluster_nodes/set_driver</div>
</div>
<div class="response-section">
<h4><i class="fas fa-reply"></i> Response</h4>
<div class="code-block">{
"success": true
}</div>
</div>
</div>
......
......@@ -79,6 +79,7 @@
<a href="/history" {% if active_page == 'history' %}class="active"{% endif %}>History</a>
{% if user.get('role') == 'admin' %}
<a href="/admin/cluster_tokens" {% if active_page == 'cluster_tokens' %}class="active"{% endif %}>Cluster Tokens</a>
<a href="/admin/cluster_nodes" {% if active_page == 'cluster_nodes' %}class="active"{% endif %}>Cluster Nodes</a>
<a href="/admin/settings" {% if active_page == 'settings' %}class="active"{% endif %}>Settings</a>
{% endif %}
</nav>
......
......@@ -66,6 +66,10 @@ class ClusterClient:
gpu_info = detect_gpu_backends()
available_backends = get_available_backends()
# Get hostname
import socket
hostname = socket.gethostname()
# Build capabilities list based on available backends
capabilities = []
for backend in available_backends:
......@@ -77,6 +81,7 @@ class ClusterClient:
'token': self.token,
'client_info': {
'type': 'worker_node',
'hostname': hostname,
'capabilities': capabilities,
'weight': self.client_weight,
'gpu_info': {
......
......@@ -359,6 +359,32 @@ def init_db(conn) -> None:
except sqlite3.OperationalError:
pass
# Client driver preferences table
if config['type'] == 'mysql':
cursor.execute('''
CREATE TABLE IF NOT EXISTS client_driver_preferences (
id INT AUTO_INCREMENT PRIMARY KEY,
hostname VARCHAR(255) NOT NULL,
token VARCHAR(255) NOT NULL,
driver VARCHAR(10) NOT NULL DEFAULT 'cuda',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY unique_hostname_token (hostname, token)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
''')
else:
cursor.execute('''
CREATE TABLE IF NOT EXISTS client_driver_preferences (
id INTEGER PRIMARY KEY,
hostname TEXT NOT NULL,
token TEXT NOT NULL,
driver TEXT NOT NULL DEFAULT 'cuda',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(hostname, token)
)
''')
# Sessions table for persistent sessions
if config['type'] == 'mysql':
cursor.execute('''
......@@ -1494,6 +1520,45 @@ def cleanup_expired_sessions() -> None:
conn.close()
# Client driver preferences functions
def get_client_driver_preference(hostname: str, token: str) -> str:
    """Get the preferred driver for a client (hostname + token).

    Args:
        hostname: Hostname the client reported.
        token: Token the client authenticated with.

    Returns:
        The stored driver name, or 'cuda' when no preference row exists.

    Raises:
        Whatever the database driver raises if the query fails; the
        connection is closed before the exception propagates.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT driver FROM client_driver_preferences WHERE hostname = ? AND token = ?',
                       (hostname, token))
        row = cursor.fetchone()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return row['driver'] if row else 'cuda'  # Default to cuda
def set_client_driver_preference(hostname: str, token: str, driver: str) -> bool:
    """Set the preferred driver for a client (hostname + token).

    Inserts a preference row or overwrites the existing one for the same
    (hostname, token) pair.

    Args:
        hostname: Hostname the client reported.
        token: Token the client authenticated with.
        driver: Either 'cuda' or 'rocm'.

    Returns:
        False when ``driver`` is not an accepted value, True once the
        upsert has been committed.

    Raises:
        Whatever the database driver raises if the statement or commit
        fails; the connection is closed before the exception propagates.
    """
    if driver not in ('cuda', 'rocm'):
        return False

    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        config = get_db_config()

        if config['type'] == 'mysql':
            cursor.execute('''
                INSERT INTO client_driver_preferences (hostname, token, driver, updated_at)
                VALUES (?, ?, ?, CURRENT_TIMESTAMP)
                ON DUPLICATE KEY UPDATE driver = VALUES(driver), updated_at = CURRENT_TIMESTAMP
            ''', (hostname, token, driver))
        else:
            # NOTE(review): INSERT OR REPLACE deletes the conflicting row before
            # inserting, so id and created_at are regenerated on every update -
            # confirm that is acceptable for the SQLite backend.
            cursor.execute('''
                INSERT OR REPLACE INTO client_driver_preferences (hostname, token, driver, updated_at)
                VALUES (?, ?, ?, CURRENT_TIMESTAMP)
            ''', (hostname, token, driver))

        conn.commit()
    finally:
        # Release the connection even when the statement or commit raises.
        conn.close()

    # Success is "the statement committed", not "rowcount > 0": MySQL reports
    # zero affected rows when ON DUPLICATE KEY UPDATE leaves the row unchanged,
    # which previously made a repeated identical save look like a failure.
    return True
# Admin dashboard stats functions
def get_admin_dashboard_stats() -> Dict[str, int]:
"""Get admin dashboard statistics."""
......
......@@ -369,6 +369,94 @@ def cluster_clients():
clients = []
return render_template('admin/cluster_clients.html', user=user, clients=clients, active_page='cluster_clients')
@app.route('/admin/cluster_nodes')
@admin_required
def cluster_nodes():
    """Render the admin page that lists cluster nodes.

    The template receives the current session user and marks the
    'cluster_nodes' navigation entry as active.
    """
    return render_template(
        'admin/cluster_nodes.html',
        user=get_current_user_session(),
        active_page='cluster_nodes',
    )
@app.route('/api/admin/cluster_nodes')
@admin_required
def api_cluster_nodes():
    """Return the currently connected cluster nodes as ``{'nodes': [...]}``.

    Each entry contains token, token_name, hostname, gpus, gpu_memory,
    total_memory, workers_available, ip_address, connected and last_seen.
    Only clients present in ``cluster_master.clients`` are reported, so
    ``connected`` is always True. Per-device memory sizes and the IP address
    are hard-coded placeholder values, not measurements.
    """
    from .cluster_master import cluster_master
    from .database import get_worker_tokens, get_client_driver_preference
    import time

    # Map token string -> display name.
    token_names = {}
    for entry in get_worker_tokens():
        token_names[entry['token']] = entry['name']

    current_time = time.time()  # not used below; kept from the original implementation

    nodes = []
    for client_id, client_info in cluster_master.clients.items():
        hostname = client_info['info'].get('hostname', 'unknown')
        token = client_info['token']
        token_name = token_names.get(token, 'Unknown Token')

        # NOTE(review): hostname is read from client_info['info'] but gpu_info
        # from the top level of client_info - confirm where the master stores it.
        gpu_info = client_info.get('gpu_info', {})
        cuda_devices = gpu_info.get('cuda_devices', 0)
        rocm_devices = gpu_info.get('rocm_devices', 0)

        # Placeholder descriptions: 8GB per CUDA device, 16GB per ROCm device.
        gpu_memory = []
        if cuda_devices > 0:
            for index in range(cuda_devices):
                gpu_memory.append(f"CUDA Device {index}: 8GB VRAM")
        if rocm_devices > 0:
            for index in range(rocm_devices):
                gpu_memory.append(f"ROCm Device {index}: 16GB VRAM")

        total_memory = 0
        for description in gpu_memory:
            if 'CUDA' in description:
                total_memory += 8
            elif 'ROCm' in description:
                total_memory += 16

        # Placeholder until the address is taken from the websocket connection.
        ip_address = '127.0.0.1'

        # Number of tracked processes that belong to this client.
        workers_available = sum(
            1 for proc in cluster_master.processes.values()
            if proc['client_id'] == client_id
        )

        node = {
            'token': token,
            'token_name': token_name,
            'hostname': hostname,
            'gpus': len(gpu_memory),
            'gpu_memory': gpu_memory,
            'total_memory': total_memory,
            'workers_available': workers_available,
            'ip_address': ip_address,
            'connected': True,
            'last_seen': client_info.get('last_seen', 0),
        }
        nodes.append(node)

    # Recently disconnected clients are not tracked yet; only active ones are returned.

    def sort_key(node):
        # Connected nodes first, then most recently seen first.
        return (not node['connected'], -node['last_seen'])

    nodes.sort(key=sort_key)
    return {'nodes': nodes}
@app.route('/api/admin/cluster_nodes/set_driver', methods=['POST'])
@admin_required
def api_set_client_driver():
    """Store the preferred driver for one hostname + token pair.

    Reads ``hostname``, ``token`` and ``driver`` from the submitted form.
    Responds with HTTP 400 and an ``error`` message when a field is missing
    or empty, or when ``driver`` is neither 'cuda' nor 'rocm'; otherwise
    responds with ``{'success': <result of the database call>}``.
    """
    from .database import set_client_driver_preference

    form = request.form
    hostname = form.get('hostname')
    token = form.get('token')
    driver = form.get('driver')

    if not (hostname and token and driver):
        return {'success': False, 'error': 'Missing required parameters'}, 400

    if driver not in ['cuda', 'rocm']:
        return {'success': False, 'error': 'Invalid driver'}, 400

    return {'success': set_client_driver_preference(hostname, token, driver)}
@app.route('/api_tokens')
@login_required
def api_tokens():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment