wip

2026-06-27 16:14:27 +08:00 · 2026-04-04 23:57:11 +08:00
parent fc46d878cf
commit f55c18ac6e
7 changed files with 125 additions and 275 deletions
@@ -25,6 +25,13 @@ We optimize: **{metric name}** ({lower/higher} is better)

 Baseline (untrained or current best): **{value}**

+Baseline std across 3 seeds: **{value}** (must be < effect size of a meaningful improvement)
+
+Train + eval wall time: **{X} minutes** (target: 5-40 min)
+
+If variance is too high relative to effect size, use more eval data, a smaller proxy task,
+or a lower-variance metric -- before running any experiments.
+
 ## Data

 {FILL_IN: dataset, size, splits}
@@ -1,30 +0,0 @@
-<!-- APPEND-ONLY: new entries at top. Never edit old entries. -->
-<!-- Written by agents (not humans -- see human_journal.md). -->
-<!-- Format: ## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME -->
-
-# Agent Journal
-
-Short-term notes from agents: what was attempted in this session,
-blockers hit, context for the next agent picking this up.
-
-For lasting research learnings, write to RESEARCH_JOURNAL.md instead.
-
---
-
-## Template entry
-
-```
-## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME
-
-### Session goal
-[what this agent was asked to do]
-
-### What was done
-[brief -- the commit history has the details]
-
-### Blockers / open questions
-[anything the next agent needs to know]
-
-### Pueue queue state
-[any jobs queued: `pueue status` output or summary]
-```
@@ -21,7 +21,7 @@ import tyro
 from loguru import logger

 # {FILL_IN}: import your model
-# from model import Config, build_model
+# from train import Config, build_model


 EVAL_SEED = 42          # FROZEN: never change
@@ -85,6 +85,7 @@ def main(cfg: EvalConfig):
    data = load_eval_data()

    # {FILL_IN}: build and load your model
+    # from train import Config, build_model
    # model_cfg = Config()
    # model = build_model(model_cfg)
    # if cfg.checkpoint:
@@ -13,7 +13,7 @@ default:
 # Fast smoke test: shape checks on CPU, short run
 smoke:
    #!/usr/bin/env bash
-    BEARTYPE=1 JAX_PLATFORMS=cpu uv run python train.py --max_steps=1
+    BEARTYPE=1 uv run python train.py --max_steps=2
    echo smoke passed

 # --- Evaluation (FROZEN logic -- see eval.py) ---
@@ -1,74 +0,0 @@
-"""
-Model definition and config.
-
-This file is NOT frozen -- agents modify it freely in worktrees.
-Keep eval.py's evaluate() interface stable: it calls build_model(cfg) and model.forward(x).
-"""
-
-from dataclasses import dataclass
-
-import torch
-import torch.nn as nn
-import tyro
-from jaxtyping import Float
-from loguru import logger
-from torch import Tensor
-
-# beartype checking enabled only when BEARTYPE=1 (smoke tests)
-import os
-if os.environ.get("BEARTYPE"):
-    from beartype import beartype as typechecker
-    from jaxtyping import jaxtyped
-else:
-    def typechecker(f): return f
-    def jaxtyped(**_): return lambda f: f
-
-
-@dataclass
-class Config:
-    """Model and training hyperparameters. Edit freely."""
-
-    # model
-    d_model: int = 256
-    n_layers: int = 4
-    # {FILL_IN}: add your architecture params
-
-    # training
-    lr: float = 3e-4
-    batch_size: int = 32
-    max_steps: int = 1000
-    seed: int = 42
-
-    # data
-    # {FILL_IN}: add data params
-
-
-class Model(nn.Module):
-    """
-    {FILL_IN}: replace with your actual model.
-    """
-
-    def __init__(self, cfg: Config):
-        super().__init__()
-        self.cfg = cfg
-        # {FILL_IN}: define layers
-        # e.g. self.embed = nn.Embedding(vocab_size, cfg.d_model)
-
-    @jaxtyped(typechecker=typechecker)
-    def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]:
-        # {FILL_IN}: implement forward pass
-        raise NotImplementedError("{FILL_IN}: implement forward()")
-
-
-def build_model(cfg: Config) -> Model:
-    torch.manual_seed(cfg.seed)
-    model = Model(cfg)
-    n_params = sum(p.numel() for p in model.parameters())
-    logger.info(f"Model: {n_params:,} parameters")
-    return model
-
-
-if __name__ == "__main__":
-    cfg = tyro.cli(Config)
-    model = build_model(cfg)
-    logger.info(f"Config: {cfg}")
@@ -1,21 +1,15 @@
 <!-- FROZEN: only edit in meta-mode (META_MODE=1) -->
-<!-- Symlinked to AGENTS.md and CLAUDE.md -- always loaded by Claude/Cursor/etc. -->
+<!-- Symlinked to AGENTS.md and CLAUDE.md -- always loaded. -->

 # Research Program

-**Project**: {FILL_IN: one sentence describing the research problem}
+**Project**: {FILL_IN: one sentence}

-**Metric**: {FILL_IN: what we optimize, e.g. val_bpb, accuracy, F1} (lower/higher is better)
+**Metric**: {FILL_IN: metric name} ({lower/higher} is better). Target: train + eval completes in 5-40 min per run.

-**Metric design requirements** (enforce before first real experiment):
- Train + eval runs in 5-40 minutes on your GPU
- Variance across seeds < effect size of a meaningful improvement (run baseline x3, check std)
- Deterministic given same seed (fixed data order, fixed eval split)
- If variance is too high: use more eval data, smaller model, or a proxy metric with less noise
+**Hypothesis space**: {FILL_IN: what class of approaches is in scope}

-**Hypothesis space**: {FILL_IN: what class of approaches are in scope}
-
-Read `0_docs/problem.md` for full context.
+See `0_docs/problem.md` for full problem context and baseline numbers.

 ---

@@ -24,182 +18,69 @@ Read `0_docs/problem.md` for full context.
 | Type | Files | Rule |
 |------|-------|------|
 | FROZEN | `program.md`, `eval.py`, `meta_journal.md` | Never edit without `META_MODE=1` |
-| GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Only commit from main; worktrees append to root copy |
-| APPEND-ONLY | `*_journal.md` | New entries at top, never edit old ones |
+| GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Commit from main only; worktrees append to root copy |
+| APPEND-ONLY | `RESEARCH_JOURNAL.md`, `human_journal.md`, `meta_journal.md` | New entries at top, never edit old entries |
 | REGULAR | everything else | Modify freely in your worktree |

 ---

 ## Agent Algorithm

-```
-YOU ARE AN AGENT. Follow this loop:
+```python
+# MUST read before starting:
+read RESEARCH_JOURNAL.md        # what has been tried, what worked/failed
+read "Lessons Learned" below    # gotchas -- don't repeat past failures

-read RESEARCH_JOURNAL.md          # what has been tried
-read 0_docs/problem.md            # what we're solving
-
-n_ideas = count files in 1_ideas/ (not _TEMPLATE.md)
+n_ideas = count(1_ideas/*.md) - 1  # minus _TEMPLATE.md

 if n_ideas < 30:
-    ## IDEATE
-    - Read at least one file from 0_docs/papers/ (or fetch a new paper)
-    - Do at least one web search for recent approaches
-    - Fetch papers: use /semantic-search or /exa-search skills
-      -> save FULL paper text to 0_docs/papers/{slug}.md (not summaries -- full text)
-      -> optionally add a vargdown-style argument map to 0_docs/papers/{slug}_analysis.argdown
-      -> add key insight (1-3 observations with sources) to RESEARCH_JOURNAL.md
-    - Brainstorm ideas. Quality bar:
-        * Novel (not in RESEARCH_JOURNAL.md already)
-        * Mechanistically grounded (not just hyperparameter tuning)
-        * Not sklearn slop -- must be a real ML research contribution
-        * Bold enough that it could be a paper contribution
-    - For each idea:
-        write 1_ideas/{YYYY-MM-DD}_{slug}.md  (use _TEMPLATE.md format)
-        spawn subagent to critique the idea (prompt: "Is this idea sound?
-          What are the failure modes? Is the hypothesis testable?")
-        append subagent feedback to the idea file
-    - Append summary of new ideas + paper insights to RESEARCH_JOURNAL.md
+    # IDEATE
+    # MUST read: 0_docs/ideation_guide.md  (epistemics, paper fetching, brainstorm discipline)
+    read 0_docs/problem.md
+    search 1+ papers (semantic-search, exa-search, bibtex MCP)
+    save full paper text to 0_docs/papers/{slug}.md   # full text, not summaries
+    for each idea:
+        write 1_ideas/{YYYY-MM-DD}_{slug}.md   # use _TEMPLATE.md
+        subagent critique: "Is this sound? Failure modes? Testable?"
+        append subagent feedback to idea file
+    append paper insights + ideas to RESEARCH_JOURNAL.md

 else:
-    ## IMPLEMENT
-    pick the best idea from 1_ideas/ based on:
-        - subagent rating (see feedback section in idea file)
-        - novelty relative to RESEARCH_JOURNAL.md
-        - expected impact on metric
-        - implementation feasibility
+    # IMPLEMENT
+    # MUST read: 0_docs/conventions.md  (coding style)
+    pick best idea (subagent rating + novelty + expected impact)
+    just worktree {slug}           # creates 5_worktrees/{slug} on branch exp/{slug}
+    implement in worktree (edit train.py; do NOT touch eval.py, program.md, meta_journal.md)

-    slug = idea filename slug
-    run: git worktree add 5_worktrees/{slug} -b exp/{slug}
-    cd 5_worktrees/{slug}
+    # TEST
+    subagent code review vs idea doc
+    just smoke
+    just eval                      # appends row to results.tsv

-    implement the idea (modify train.py, model.py, etc.)
-    do NOT modify: eval.py, program.md, meta_journal.md
+    # REPORT
+    write 9_reports/{YYYY-MM-DD}_{slug}.md   # use _TEMPLATE.md
+    append to RESEARCH_JOURNAL.md: what tried, delta metric, observation vs inference

-    ## TEST
-    spawn subagent: "Code review this against the idea doc 1_ideas/{slug}.md.
-      Does the implementation match the hypothesis? Any bugs?"
-    run: just smoke                    # fast sanity check
-    run: just eval                     # appends to results.tsv
-
-    ## REPORT
-    write 9_reports/{YYYY-MM-DD}_{slug}.md  (use _TEMPLATE.md format)
-    append short summary to RESEARCH_JOURNAL.md:
-        - what was tried, what metric changed, what you learned
-        - key observation vs inference distinction
-
-    ## SUBMIT
-    git commit -m "exp({slug}): {one-line description}"
+    # SUBMIT
+    git commit -m "exp({slug}): {one line}"
    git push origin exp/{slug}
-    if result beats best in results.tsv:
-        create PR for human to merge
+    if result beats best in results.tsv: open PR for human to merge

-## QUEUING EXPERIMENTS (pueue)
-
-Use pueue to queue experiments for the single GPU -- one at a time, no collision:
-
-    # Queue with a label showing the question and expected resolution
-    pueue add --label "Q: does X help? H: expect +0.05 metric" -- just eval --config=path
-
-    # Check queue / status / logs
-    pueue status
-    pueue log {task_id}       # full stdout
-    pueue follow {task_id}    # live tail
-
-Labels encode the hypothesis being tested. After the run, append observed vs expected
-to RESEARCH_JOURNAL.md. The label shows up in `pueue status` so you can track what
-question each running/queued job is answering.
-
-    # Example: multiple experiments queued with different hypotheses
-    pueue add --label "Q: rotary vs sinusoidal? H: rotary saves 0.1 bpb" -- just eval rotary
-    pueue add --label "Q: flash-attn memory? H: 2x batch size same speed" -- just eval flash
-    pueue add --label "Q: does layer norm placement matter? H: pre-norm better" -- just eval prenorm
+# GPU QUEUE (pueue -- one GPU, no collision)
+just queue "Q: does X help? H: expect +delta" eval {args}
+pueue status          # shows hypothesis label for each queued/running job
 ```

 ---

-## Coding Conventions
+## Lessons Learned and Gotchas

-Fail fast. No defensive programming. No silent fallbacks.
+Format: `YYYY-MM-DD | title | lesson (one line)`

-```python
-# shape ops: einops for clarity
-from einops import rearrange, reduce
-x = rearrange(x, 'b s h d -> b h s d')
-
-# einsum for explicit contraction
-out = torch.einsum('b h s d, b h d v -> b h s v', q, k)
-
-# jaxtyping on function boundaries (docs + smoke-test checking)
-from jaxtyping import Float
-from torch import Tensor
-def encode(x: Float[Tensor, 'b s d']) -> Float[Tensor, 'b s h']:
-    ...
-
-# logging: loguru not print
-from loguru import logger
-logger.info(f"loss={loss:.4f}")
-
-# dataframes: polars v1
-import polars as pl
-df.group_by("exp").agg(pl.col("metric").mean())
-
-# config: tyro dataclass
-import tyro
-from dataclasses import dataclass
-
-@dataclass
-class Config:
-    lr: float = 3e-4
-    # {FILL_IN}
-
-cfg = tyro.cli(Config)
-```
-
---
-
-## Research Epistemics
-
-Separate observations from inferences:
- **Observation**: "val_bpb dropped from 3.2 to 2.9 on run X" (measured fact)
- **Inference**: "this suggests the attention head is learning positional structure" (interpretation)
- **Claim from paper**: "authors claim X" -- not "X is true" unless you verified it
-
-For complex arguments, use `/vargdown` skill: verified argument maps with credences.
-
-Trust signals: community adoption > papers citing it > open source code > author reputation.
-
---
-
-## Available Skills
-
-Assume installed at `~/.claude/skills/` (from https://github.com/wassname/skills):
-
-| Skill | Use for |
-|-------|---------|
-| `/semantic-search` | Search arXiv, Semantic Scholar, DBLP, OpenAlex |
-| `/arxiv-fetch` | Download full paper text given arXiv ID/URL |
-| `/exa-search` | Neural web search for recent approaches |
-| `/vargdown` | Verified argument maps with credences for complex reasoning |
-| `/gsd` | Get Shit Done: spec -> implement -> test -> review -> wrap |
-| `/jaxtyping` | Runtime tensor shape/dtype checking |
-| `/justfile` | Project recipes (`just smoke`, `just eval`, `just queue`) |
-| `/ml_debug` | ML convergence, gradient analysis, sweep methodology |
-| `/brainstorm` | Wide + deep ideation without tunnel vision |
-| `/external-review` | Code/plan review via a different model |
-| `pueue` | Queue GPU jobs sequentially; label each with Q/hypothesis |
-
-Also available: bibtex MCP (search_reference, fetch), wandb MCP (query runs).
+<!-- {FILL_IN: add entries as experiments run} -->

 ---

 ## Meta-Mode

-Human sets `META_MODE=1` to enable editing of FROZEN files and committing to main.
-
-Use meta-mode to:
- Revise this program.md (agent instructions)
- Update eval.py (e.g., add new metric columns)
- Reflect on the overall research process in meta_journal.md
- Exit-interview style: what worked, what didn't, what would you change?
-
-To enter: human writes `META_MODE=1` in human_journal.md entry before asking agent.
+Human writes `META_MODE=1` in `human_journal.md` to unlock editing FROZEN files and committing to main. Use for: revising this program.md, updating eval.py, and exit-interview-style process reflection in meta_journal.md.
@@ -1,25 +1,90 @@
 """
-Training loop. NOT frozen -- agents modify freely in worktrees.
+Model + training. NOT frozen -- agents modify freely in worktrees.

-Convention: eval.py runs the frozen evaluation. This file handles training.
-Keep them separate so eval is never accidentally changed during experimentation.
+eval.py is frozen (anti-p-hacking). This file is not.
+Everything agents change lives here: architecture, optimizer, loss, data pipeline.
 """

+import os
+from dataclasses import dataclass
+
 import torch
+import torch.nn as nn
 import tyro
 import wandb
+from einops import rearrange  # noqa: F401 -- available for use
+from jaxtyping import Float
 from loguru import logger
+from torch import Tensor

-from model import Config, build_model
+if os.environ.get("BEARTYPE"):
+    from beartype import beartype as typechecker
+    from jaxtyping import jaxtyped
+else:
+    def typechecker(f): return f
+    def jaxtyped(**_): return lambda f: f


+# --- Config -------------------------------------------------------------------
+
+@dataclass
+class Config:
+    """All hyperparameters. Edit freely. eval.py imports this."""
+
+    # model
+    d_model: int = 256
+    n_layers: int = 4
+    # {FILL_IN}: add architecture params
+
+    # training
+    lr: float = 3e-4
+    batch_size: int = 32
+    max_steps: int = 1000
+    seed: int = 42
+
+    # data
+    # {FILL_IN}: add data params
+
+    # logging
+    wandb_project: str = "{FILL_IN}"
+
+
+# --- Model --------------------------------------------------------------------
+
+class Model(nn.Module):
+    """
+    {FILL_IN}: replace with your architecture.
+    Agents: this is the main thing you modify between experiments.
+    """
+
+    def __init__(self, cfg: Config):
+        super().__init__()
+        self.cfg = cfg
+        # {FILL_IN}: define layers
+
+    @jaxtyped(typechecker=typechecker)
+    def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]:
+        # {FILL_IN}: implement forward pass
+        raise NotImplementedError("{FILL_IN}: implement forward()")
+
+
+def build_model(cfg: Config) -> Model:
+    torch.manual_seed(cfg.seed)
+    model = Model(cfg)
+    n_params = sum(p.numel() for p in model.parameters())
+    logger.info(f"Model: {n_params:,} parameters")
+    return model
+
+
+# --- Training -----------------------------------------------------------------
+
 def train(cfg: Config):
    torch.manual_seed(cfg.seed)

    wandb.init(
-        project="{FILL_IN}",  # replace with your W&B project name
+        project=cfg.wandb_project,
        config=vars(cfg),
-        # group is set by justfile sweep recipes via WANDB_RUN_GROUP env var
+        # WANDB_RUN_GROUP env var set by justfile sweep recipes
    )

    model = build_model(cfg)