forked from danielcamposramos/Knowledge3D
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_parallel_training.py
More file actions
100 lines (82 loc) · 3.08 KB
/
test_parallel_training.py
File metadata and controls
100 lines (82 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
"""Test parallel LoRA training to verify GPU utilization."""
import numpy as np
import time
from knowledge3d.cranium.sovereign.lora_gpu_trainer import LoRAGPUEngine
def test_parallel_training():
    """Smoke-test batched LoRA training and report throughput.

    Builds 150 synthetic samples (dims=128, rank=16), feeds them to
    ``LoRAGPUEngine.train_batch`` in batches of 15 for 10 epochs, and prints
    the mean loss and cumulative samples/sec after every epoch, followed by
    overall totals.

    Raises:
        AssertionError: If the mean loss of any epoch is NaN or infinite.
    """
    print("="*70)
    print("Testing Parallel LoRA Training")
    print("="*70)

    # Setup
    dims = 128
    rank = 16
    n_samples = 150  # Multiple batches
    batch_size = 15  # Process 15 at a time
    # Ceiling division: a trailing partial batch still counts as a batch.
    batches_per_epoch = (n_samples + batch_size - 1) // batch_size

    # Create synthetic data. A fixed seed keeps the printed losses comparable
    # from run to run; float32 is requested directly to avoid an astype copy.
    print(f"\nCreating {n_samples} samples (dims={dims}, rank={rank})")
    rng = np.random.default_rng(0)
    base_matrix = rng.standard_normal((dims, dims), dtype=np.float32) * 0.01
    # Low-rank factor pair: A is (dims, rank), B is (rank, dims).
    A = rng.standard_normal((dims, rank), dtype=np.float32) * 0.01
    B = rng.standard_normal((rank, dims), dtype=np.float32) * 0.01
    inputs = rng.standard_normal((n_samples, dims), dtype=np.float32)
    targets = rng.standard_normal((n_samples, dims), dtype=np.float32)

    # Initialize engine
    print("\nInitializing LoRA GPU engine...")
    engine = LoRAGPUEngine()

    # Allocate buffers
    print(f"Allocating GPU buffers (batch_size={batch_size})...")
    buffers = engine.allocate_buffers(
        base_matrix, A, B, inputs, targets,
        max_batch=batch_size
    )

    # Training parameters
    alpha = 0.5
    learning_rate = 0.001
    epochs = 10

    print(f"\nTraining for {epochs} epochs...")
    print(f"- Samples: {n_samples}")
    print(f"- Batch size: {batch_size}")
    print(f"- Batches per epoch: {batches_per_epoch}")
    print()

    # Train. perf_counter() is monotonic and high resolution, unlike
    # time.time(), which can jump when the wall clock is adjusted.
    start_time = time.perf_counter()
    for epoch in range(1, epochs + 1):
        epoch_loss = 0.0
        for batch_start in range(0, n_samples, batch_size):
            batch_end = min(batch_start + batch_size, n_samples)
            batch_indices = np.arange(batch_start, batch_end, dtype=np.int32)
            epoch_loss += engine.train_batch(
                buffers=buffers,
                batch_indices=batch_indices,
                dims=dims,
                rank=rank,
                alpha=alpha,
                learning_rate=learning_rate,
            )

        avg_loss = epoch_loss / batches_per_epoch
        # Fail loudly instead of reporting success on a diverged run.
        assert np.isfinite(avg_loss), (
            f"non-finite loss at epoch {epoch}: {avg_loss}"
        )

        # Clamp to a tiny positive value so a zero clock delta cannot
        # trigger ZeroDivisionError in the throughput figure.
        elapsed = max(time.perf_counter() - start_time, 1e-9)
        samples_per_sec = (epoch * n_samples) / elapsed
        print(f"Epoch {epoch:2d}/{epochs}: loss={avg_loss:.6f} "
              f"({samples_per_sec:.1f} samples/sec)")

    # Final stats
    total_time = max(time.perf_counter() - start_time, 1e-9)
    total_samples = epochs * n_samples
    throughput = total_samples / total_time

    print()
    print("="*70)
    print("RESULTS")
    print("="*70)
    print(f"Total time: {total_time:.2f} seconds")
    print(f"Total samples: {total_samples}")
    print(f"Throughput: {throughput:.1f} samples/sec")
    print(f"Time per sample: {1000 * total_time / total_samples:.2f} ms")
    print()
    print("✅ Training completed successfully!")
    print()
    print("NOTE: Check GPU utilization with: nvidia-smi dmon -c 10 -s ucm")
    print(" Expected: 80-95% GPU utilization during training")
# Script entry point: run the benchmark when this file is executed directly
# (e.g. `python test_parallel_training.py`) rather than imported.
if __name__ == "__main__":
    test_parallel_training()