mirror of
https://github.com/wassname/ml_debug.git
synced 2026-06-27 01:00:14 +08:00
refactor(ml_debug): extract grep patterns and diagnostics to refs/
Moved 6.1 (static analysis grep patterns) and 6.2 (diagnostic code snippets) to refs/static_analysis.md and refs/diagnostics.md. Triage tree (6.3) stays in main with references to the ref files. ml_debug/SKILL.md reduced from 7229w to 5093w (~30% from original).
This commit is contained in:
@@ -0,0 +1,169 @@
|
||||
# 6.2 Diagnostic code snippets
|
||||
|
||||
Copy-paste these. Each tests one thing.
|
||||
|
||||
**Data pipeline sanity check**
|
||||
```python
|
||||
# Pull one batch and print a one-line summary for every field in it.
batch = next(iter(train_loader))
fields = batch.items() if isinstance(batch, dict) else enumerate(batch)
for k, v in fields:
    if not isinstance(v, torch.Tensor):
        # Non-tensor fields only report their type and length.
        size = len(v) if hasattr(v, '__len__') else 'scalar'
        print(f"{k}: type={type(v)}, len={size}")
        continue
    as_float = v.float()  # mean/std are taken after casting to float
    lo, hi = v.min(), v.max()
    n_nan, n_inf = v.isnan().sum(), v.isinf().sum()
    print(f"{k}: shape={v.shape}, dtype={v.dtype}, "
          f"range=[{lo:.3f}, {hi:.3f}], "
          f"mean={as_float.mean():.3f}, std={as_float.std():.3f}, "
          f"nan={n_nan}, inf={n_inf}")
|
||||
# Check: inputs ~mean 0, std 1? Labels in expected range? No NaN/Inf? Shapes match model?
|
||||
```
|
||||
|
||||
**Init loss check**
|
||||
```python
|
||||
# Loss of the model on a single training batch, with gradients disabled.
model.eval()
with torch.no_grad():
    batch = next(iter(train_loader))
    out = model(batch['input'])  # adapt to your interface
    loss = loss_fn(out, batch['target'])
init_loss = loss.item()
print(f"Init loss: {init_loss:.4f}")
|
||||
|
||||
# Expected init loss (random predictions):
|
||||
# - CrossEntropy, C classes: -ln(1/C) = ln(C)
|
||||
# C=2: 0.693, C=10: 2.303, C=100: 4.605, C=1000: 6.908
|
||||
# - Binary CrossEntropy: -ln(0.5) = 0.693
|
||||
# - MSE (targets ~N(0,1)): ~1.0 (if init outputs ~0) or ~var(targets)
|
||||
# - L1 (targets ~N(0,1)): ~0.8
|
||||
#
|
||||
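# If init loss << expected: the model has not trained yet, so it cannot be exploiting leakage or a shortcut -- suspect the loss itself (wrong reduction/scaling, wrong expected value) or weights that are not freshly initialized (pretrained/checkpoint loaded)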
|
||||
# If init loss >> expected: wrong loss fn, bad init, or data pipeline broken
|
||||
```
|
||||
|
||||
**Overfit-one-batch test**
|
||||
```python
|
||||
# Train repeatedly on one fixed batch and log the loss every 20 steps.
model.train()
batch = next(iter(train_loader))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for step in range(200):
    optimizer.zero_grad()
    out = model(batch['input'])
    loss = loss_fn(out, batch['target'])
    loss.backward()
    # clip_grad_norm_ returns the total gradient norm, which is what gets logged.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 100.0)
    optimizer.step()
    if step % 20 != 0:
        continue
    print(f"step {step:3d} loss={loss.item():.4f} grad_norm={grad_norm:.4f}")
|
||||
|
||||
# Expected: loss drops to ~0 within 200 steps.
|
||||
# If not: model can't even memorize 1 batch -- architecture or gradient problem.
|
||||
```
|
||||
|
||||
**Gradient flow check (per-layer)**
|
||||
```python
|
||||
# Backpropagate once, then report gradient statistics for every parameter.
loss.backward()
for name, p in model.named_parameters():
    g = p.grad
    if g is None:
        print(f"{name:40s} grad: None")  # <-- not in computation graph!
        continue
    zero_pct = 100*(g==0).float().mean()
    print(f"{name:40s} grad: mean={g.mean():+.2e}, std={g.std():.2e}, "
          f"max={g.abs().max():.2e}, zero%={zero_pct:.0f}")
|
||||
# Check: no None grads (disconnected), no all-zero grads (dead layer),
|
||||
# no huge grads (explosion), reasonable magnitude across layers.
|
||||
```
|
||||
|
||||
**NaN/Inf detector hooks**
|
||||
```python
|
||||
def nan_hook(module, input, output):
    """Forward hook that raises as soon as a module emits NaN or Inf.

    Args:
        module: the module whose forward pass just finished; only its class
            name is used, for the error message.
        input: the module's inputs (unused; forward hooks are called with
            ``(module, input, output)``).
        output: the module's output -- a tensor, or an arbitrarily nested
            dict/tuple/list of tensors (e.g. an LSTM's ``(out, (h, c))``).
            Non-tensor leaves are ignored.

    Raises:
        RuntimeError: if any tensor found in ``output`` contains NaN or Inf.
    """
    def _check(value, label):
        if isinstance(value, torch.Tensor):
            if torch.isnan(value).any() or torch.isinf(value).any():
                raise RuntimeError(
                    f"NaN/Inf in {module.__class__.__name__} {label}, "
                    f"shape={value.shape}, nan={value.isnan().sum()}, inf={value.isinf().sum()}")
        elif isinstance(value, dict):
            # Recurse so tensors nested below the top level are not skipped.
            for k, v in value.items():
                _check(v, f"{label}[{k!r}]")
        elif isinstance(value, (tuple, list)):
            for i, v in enumerate(value):
                _check(v, f"{label}[{i}]")

    _check(output, "output")
|
||||
|
||||
# Attach the detector to every submodule and keep the returned handles;
# without them the hooks stay registered for every later forward pass.
nan_hook_handles = [module.register_forward_hook(nan_hook)
                    for module in model.modules()]
# Run one forward pass. First module to raise = source of the NaN.
# When done, detach the detectors again:
#   for handle in nan_hook_handles: handle.remove()
|
||||
```
|
||||
|
||||
**Random input test** [Slavv]
|
||||
```python
|
||||
# Pass random noise instead of real data. If loss/error behaves the same,
|
||||
# the data pipeline is destroying information before the model sees it.
|
||||
model.eval()
real_batch = next(iter(train_loader))
real_input = real_batch['input']
if real_input.is_floating_point() or real_input.is_complex():
    fake_input = torch.randn_like(real_input)
else:
    # randn_like is not defined for integer dtypes (token ids, indices):
    # draw uniform random integers from the observed value range instead.
    fake_input = torch.randint_like(
        real_input, int(real_input.min()), int(real_input.max()) + 1)
with torch.no_grad():
    real_out = model(real_input)
    fake_out = model(fake_input)
    # Both losses use the real targets, so only the input differs.
    real_loss = loss_fn(real_out, real_batch['target']).item()
    fake_loss = loss_fn(fake_out, real_batch['target']).item()
print(f"Real input loss: {real_loss:.4f}")
print(f"Random input loss: {fake_loss:.4f}")
|
||||
# If similar: model isn't using the input. Check preprocessing, data loading, feature selection.
|
||||
# If very different: model sees real signal. Problem is elsewhere.
|
||||
```
|
||||
|
||||
**Prime dimension trick** [Slavv]
|
||||
```python
|
||||
# Use prime/weird numbers for each dimension to catch silent broadcasting.
|
||||
# If batch=7, seq=13, hidden=17, any mismatched reshape/view that "works"
|
||||
# by accident with powers-of-2 will fail with primes.
|
||||
# Distinct primes per axis: (batch=7, seq=13, hidden=17)
prime_dims = (7, 13, 17)
x = torch.randn(*prime_dims)
out = model(x)
print(f"in={x.shape} -> out={out.shape}")
|
||||
# If this crashes but normal shapes don't: you have a broadcasting bug.
|
||||
```
|
||||
|
||||
**Class imbalance check**
|
||||
```python
|
||||
from collections import Counter

# Tally labels batch by batch instead of first collecting them in a list.
counts = Counter()
for batch in train_loader:
    labels = batch['target'] if isinstance(batch, dict) else batch[1]
    counts.update(labels.flatten().tolist())
total = sum(counts.values())
# most_common() yields classes from most to least frequent.
for cls, n in counts.most_common():
    print(f" class {cls}: {n:6d} ({100*n/total:.1f}%)")
|
||||
# Ratio > 10:1 = likely need weighted loss or resampling.
|
||||
# Ratio > 100:1 = model will predict majority class and look "accurate".
|
||||
```
|
||||
|
||||
**Confidence-sorted error inspection** [common practice, cf. FSDL error analysis]
|
||||
```python
|
||||
# Find the model's most confident wrong predictions. These reveal
|
||||
# systematic bugs (e.g., cropping cutting off relevant features).
|
||||
model.eval()
errors = []
seen = 0  # samples consumed so far = offset of the current batch
with torch.no_grad():
    for batch in val_loader:
        target = batch['target']
        logits = model(batch['input'])
        probs = torch.softmax(logits, dim=-1)
        confidence, predicted = probs.max(dim=-1)
        wrong = predicted != target
        for i in wrong.nonzero(as_tuple=True)[0].tolist():
            # Store seen + i, not i: a batch-local position repeats in
            # every batch and cannot be traced back to a sample.
            errors.append((confidence[i].item(), predicted[i].item(),
                           target[i].item(), seen + i))
        seen += len(target)
errors.sort(reverse=True)  # most confident mistakes first
for conf, pred, true, idx in errors[:10]:
    print(f" conf={conf:.3f} predicted={pred} true={true} idx={idx}")
# idx counts samples in iteration order; it equals the dataset index only if val_loader does not shuffle.
# Inspect the actual inputs for these indices. Pattern = systematic bug.
|
||||
```
|
||||
|
||||
**Weight/bias distribution check** [Slavv, CS231n]
|
||||
```python
|
||||
# Print summary statistics of every parameter tensor.
for name, p in model.named_parameters():
    w = p.detach()  # read-only stats; avoids the legacy .data attribute
    print(f"{name:40s} mean={w.mean():+.4f} std={w.std():.4f} "
          f"min={w.min():+.4f} max={w.max():+.4f} "
          f"shape={list(p.shape)}")
|
||||
# Healthy: roughly Gaussian, std ~0.01-1.0 depending on init scheme.
|
||||
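# Bad signs (weight matrices): all zeros, huge values (>100), std ~0 (collapsed), NaN. Zero-initialized biases and constant norm scales (e.g. LayerNorm weight = 1) are normal.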
|
||||
# After training: weights diverging to +/-inf = exploding. All same value = dead.
|
||||
```
|
||||
@@ -0,0 +1,114 @@
|
||||
# 6.1 Static analysis: grep for silent bugs
|
||||
|
||||
Run these searches on the codebase before anything else. Each catches a common bug that produces no error but wrong results.
|
||||
|
||||
**Shape mismatches (silent broadcasting)**
|
||||
```
|
||||
# Grep patterns:
|
||||
\.view\(|\.reshape\( # check dims match intent
|
||||
unsqueeze\(|squeeze\( # dimension insertion/removal
|
||||
\.expand\(|\.repeat\( # broadcasting
|
||||
# Action: for every hit, trace the tensor shape backward. Add assert statements.
|
||||
```
|
||||
|
||||
**Autograd breakers**
|
||||
```
|
||||
# Grep patterns:
|
||||
\.detach\(\) # breaks gradient flow
|
||||
\.data\b # bypasses autograd entirely
|
||||
with torch\.no_grad # check this isn't wrapping training code
|
||||
\.item\(\) # in a loss computation = broken
|
||||
\.numpy\(\) # in forward pass = broken
|
||||
# Action: every .detach() should have a comment explaining WHY grad is intentionally stopped.
|
||||
```
|
||||
|
||||
**Missing train/eval mode**
|
||||
```
|
||||
# Grep patterns:
|
||||
\.train\(\) # count occurrences
|
||||
\.eval\(\) # should pair with .train()
|
||||
# Action: verify .eval() before every val loop, .train() before every train loop.
|
||||
# Dropout and batchnorm behave differently -- this silently degrades results.
|
||||
```
|
||||
|
||||
**In-place ops on tensors requiring grad**
|
||||
```
|
||||
# Grep patterns:
|
||||
\+=|\-=|\*=|/= # in-place assignment on tensors
|
||||
\.add_\(|\.mul_\(|\.zero_\( # in-place methods
|
||||
\[.*\]\s*=[^=] # index assignment (excludes ==)
|
||||
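# Action: in-place ops on leaf tensors with requires_grad=True raise a RuntimeError; on tensors saved for backward they fail at backward(); done through .data they bypass that check and silently corrupt gradients.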
|
||||
# Replace x += y with x = x + y.
|
||||
```
|
||||
|
||||
**Double softmax (softmax input to CrossEntropyLoss)**
|
||||
```
|
||||
# Grep patterns:
|
||||
CrossEntropyLoss|cross_entropy # expects raw logits
|
||||
softmax|log_softmax|\.softmax # softmax applied BEFORE CrossEntropyLoss = double softmax (log_softmax before it is redundant but harmless)
|
||||
# Action: CrossEntropyLoss = log_softmax + NLLLoss internally.
|
||||
# If you softmax first, CE computes log_softmax(softmax(x)) -- the softmax
|
||||
# compresses logits into (0,1), so log_softmax sees near-uniform inputs.
|
||||
# Gradients vanish. Loss plateaus near ln(n_classes).
|
||||
```
|
||||
|
||||
**Wrong optimizer step ordering**
|
||||
```
|
||||
# Grep patterns -- verify this exact order exists:
|
||||
# 1. optimizer.zero_grad()
|
||||
# 2. loss.backward()
|
||||
# 3. [optional: clip_grad_norm_]
|
||||
# 4. optimizer.step()
|
||||
# 5. [optional: scheduler.step()]
|
||||
# Common bugs: zero_grad after backward (kills grads), step before backward (stale grads),
|
||||
# scheduler.step() in wrong loop: per-epoch schedulers (StepLR, CosineAnnealingLR)
|
||||
# called per-batch = decays too fast. Per-step schedulers (OneCycleLR) called per-epoch = too slow.
|
||||
```
|
||||
|
||||
**Broadcasting traps**
|
||||
```python
|
||||
# Diagnostic: print shapes at every binary operation between tensors of different ndim
|
||||
# Shapes (3,) and (3,1) silently broadcast to (3,3) -- probably not intended.
|
||||
# Shapes (B,1) and (B,N) broadcast fine but verify it's intentional.
|
||||
# a has shape (3,), b has shape (3, 1).
a = torch.randn(3)
|
||||
b = torch.randn(3, 1)
|
||||
# Broadcasting right-aligns the shapes: (3,) is treated as (1, 3), then both operands expand to (3, 3).
print((a + b).shape) # (3, 3) -- wanted (3,)?
|
||||
```
|
||||
|
||||
**Wrong loss sign**
|
||||
```
|
||||
# Grep patterns:
|
||||
maximize|ascent # gradient ascent when descent intended?
|
||||
\-\s*loss # negating loss -- intentional (e.g., reward maximization)?
|
||||
1\.0\s*-\s*|1\s*-\s* # 1 - metric as loss -- is the metric bounded [0,1]?
|
||||
# Action: verify that minimizing the loss = improving the metric you care about.
|
||||
```
|
||||
|
||||
**Frozen parameters not intended**
|
||||
```
|
||||
# Grep patterns:
|
||||
requires_grad\s*=\s*False # intentional freeze?
|
||||
\.freeze\(|\.requires_grad_ # parameter freezing
|
||||
for.*param.*\.parameters # check nothing is skipped
|
||||
# Diagnostic:
|
||||
# List every parameter that will not receive gradient updates.
frozen_names = [name for name, p in model.named_parameters() if not p.requires_grad]
for name in frozen_names:
    print(f"FROZEN: {name}")
|
||||
```
|
||||
|
||||
**Data leakage**
|
||||
```
|
||||
# Grep patterns:
|
||||
\.fit_transform\( # on test data = leakage
|
||||
train_test_split.*shuffle=True # for time series = leakage (shuffle=True is also the default, so check every train_test_split call that lacks shuffle=False)
|
||||
# Action: fit on train only, transform on both. Use temporal split for time series.
|
||||
```
|
||||
|
||||
**Class imbalance**
|
||||
```
|
||||
# Grep patterns:
|
||||
CrossEntropyLoss\(\) # no weight= argument? check if classes balanced
|
||||
weight=.*class # existing balancing -- verify weights are correct
|
||||
# Diagnostic: count labels per class (see diagnostics.md "Class imbalance check").
|
||||
# 100:1 ratio with unweighted loss = model predicts majority class.
|
||||
```
|
||||
Reference in New Issue
Block a user