Add client weight display to cluster nodes page

- Add weight column to cluster nodes table showing load balancing weight
- Set default weights: master=0, clients=100
- Update API response to include client weight
- Update frontend to display weight information
- Update API documentation with weight field
parent b48679df
...@@ -91,6 +91,7 @@ ...@@ -91,6 +91,7 @@
<th>Status</th> <th>Status</th>
<th>Token Name</th> <th>Token Name</th>
<th>Hostname</th> <th>Hostname</th>
<th>Weight</th>
<th>GPUs</th> <th>GPUs</th>
<th>GPU Memory</th> <th>GPU Memory</th>
<th>Workers</th> <th>Workers</th>
...@@ -173,7 +174,7 @@ function renderNodesTable() { ...@@ -173,7 +174,7 @@ function renderNodesTable() {
const tbody = document.getElementById('nodesTableBody'); const tbody = document.getElementById('nodesTableBody');
if (nodesData.length === 0) { if (nodesData.length === 0) {
tbody.innerHTML = '<tr><td colspan="11" style="text-align: center; color: #6b7280;">No cluster nodes found</td></tr>'; tbody.innerHTML = '<tr><td colspan="12" style="text-align: center; color: #6b7280;">No cluster nodes found</td></tr>';
return; return;
} }
...@@ -186,6 +187,7 @@ function renderNodesTable() { ...@@ -186,6 +187,7 @@ function renderNodesTable() {
</td> </td>
<td>${node.token_name}</td> <td>${node.token_name}</td>
<td>${node.hostname}</td> <td>${node.hostname}</td>
<td>${node.weight || 100}</td>
<td>${node.gpus}</td> <td>${node.gpus}</td>
<td> <td>
${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'} ${node.gpu_memory.length > 0 ? node.gpu_memory.join('<br>') : 'No GPU info'}
......
...@@ -445,6 +445,7 @@ ...@@ -445,6 +445,7 @@
"token": "abc123...", "token": "abc123...",
"token_name": "Worker Node 1", "token_name": "Worker Node 1",
"hostname": "worker1.example.com", "hostname": "worker1.example.com",
"weight": 100,
"gpus": 1, "gpus": 1,
"gpu_memory": ["CUDA Device 0: 8GB VRAM"], "gpu_memory": ["CUDA Device 0: 8GB VRAM"],
"total_memory": 8, "total_memory": 8,
......
...@@ -36,9 +36,10 @@ from collections import defaultdict ...@@ -36,9 +36,10 @@ from collections import defaultdict
class ClusterMaster: class ClusterMaster:
"""Master server for cluster coordination.""" """Master server for cluster coordination."""
def __init__(self, port: int = 5003, shared_dir: str = None): def __init__(self, port: int = 5003, shared_dir: str = None, weight: int = 0):
self.port = port self.port = port
self.shared_dir = shared_dir self.shared_dir = shared_dir
self.weight = weight # Master weight (default 0)
self.server_sock: Optional[socket.socket] = None self.server_sock: Optional[socket.socket] = None
self.clients = {} # type: Dict[str, Dict[str, Any]] self.clients = {} # type: Dict[str, Dict[str, Any]]
self.client_websockets = {} # type: Dict[str, websockets.WebSocketServerProtocol] self.client_websockets = {} # type: Dict[str, websockets.WebSocketServerProtocol]
...@@ -608,10 +609,11 @@ class ClusterMaster: ...@@ -608,10 +609,11 @@ class ClusterMaster:
cluster_master = ClusterMaster() cluster_master = ClusterMaster()
def start_cluster_master(port: int = 5003, shared_dir: str = None) -> None: def start_cluster_master(port: int = 5003, shared_dir: str = None, weight: int = 0) -> None:
"""Start the cluster master server.""" """Start the cluster master server."""
cluster_master.port = port cluster_master.port = port
cluster_master.shared_dir = shared_dir cluster_master.shared_dir = shared_dir
cluster_master.weight = weight
asyncio.run(cluster_master.start()) asyncio.run(cluster_master.start())
...@@ -621,10 +623,11 @@ if __name__ == "__main__": ...@@ -621,10 +623,11 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description='VidAI Cluster Master') parser = argparse.ArgumentParser(description='VidAI Cluster Master')
parser.add_argument('--port', type=int, default=5003, help='Port to listen on (default: 5003)') parser.add_argument('--port', type=int, default=5003, help='Port to listen on (default: 5003)')
parser.add_argument('--shared-dir', help='Shared directory for file transfers') parser.add_argument('--shared-dir', help='Shared directory for file transfers')
parser.add_argument('--weight', type=int, default=0, help='Master weight for load balancing (default: 0)')
args = parser.parse_args() args = parser.parse_args()
print(f"Starting VidAI Cluster Master on port {args.port}") print(f"Starting VidAI Cluster Master on port {args.port} with weight {args.weight}")
if args.shared_dir: if args.shared_dir:
print(f"Using shared directory: {args.shared_dir}") print(f"Using shared directory: {args.shared_dir}")
# Validate shared directory exists # Validate shared directory exists
...@@ -632,4 +635,4 @@ if __name__ == "__main__": ...@@ -632,4 +635,4 @@ if __name__ == "__main__":
print(f"Warning: Shared directory {args.shared_dir} does not exist. Creating it.") print(f"Warning: Shared directory {args.shared_dir} does not exist. Creating it.")
os.makedirs(args.shared_dir, exist_ok=True) os.makedirs(args.shared_dir, exist_ok=True)
start_cluster_master(args.port, args.shared_dir) start_cluster_master(args.port, args.shared_dir, args.weight)
\ No newline at end of file \ No newline at end of file
...@@ -443,7 +443,8 @@ def api_cluster_nodes(): ...@@ -443,7 +443,8 @@ def api_cluster_nodes():
'last_seen': client_info.get('last_seen', 0), 'last_seen': client_info.get('last_seen', 0),
'uptime_seconds': uptime_seconds, 'uptime_seconds': uptime_seconds,
'active_jobs': active_jobs, 'active_jobs': active_jobs,
'completed_jobs': completed_jobs 'completed_jobs': completed_jobs,
'weight': client_info.get('weight', 100)
}) })
# Get recently disconnected clients (last 10 that were connected in last 10 minutes) # Get recently disconnected clients (last 10 that were connected in last 10 minutes)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment