From f55c18ac6e077a615c96d02f71681ec05fbacc19 Mon Sep 17 00:00:00 2001 From: wassname <1103714+wassname@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:57:11 +0800 Subject: [PATCH] wip --- 0_docs/problem.md | 7 ++ agent_journal.md | 30 ------- eval.py | 3 +- justfile | 2 +- model.py | 74 ----------------- program.md | 207 ++++++++++------------------------------------ train.py | 77 +++++++++++++++-- 7 files changed, 125 insertions(+), 275 deletions(-) delete mode 100644 agent_journal.md delete mode 100644 model.py diff --git a/0_docs/problem.md b/0_docs/problem.md index 4f7a186..b659163 100644 --- a/0_docs/problem.md +++ b/0_docs/problem.md @@ -25,6 +25,13 @@ We optimize: **{metric name}** ({lower/higher} is better) Baseline (untrained or current best): **{value}** +Baseline std across 3 seeds: **{value}** (must be < effect size of a meaningful improvement) + +Train + eval wall time: **{X} minutes** (target: 5-40 min) + +If variance is too high relative to effect size, use more eval data, a smaller proxy task, +or a lower-variance metric -- before running any experiments. + ## Data {FILL_IN: dataset, size, splits} diff --git a/agent_journal.md b/agent_journal.md deleted file mode 100644 index 248f54c..0000000 --- a/agent_journal.md +++ /dev/null @@ -1,30 +0,0 @@ - - - - -# Agent Journal - -Short-term notes from agents: what was attempted in this session, -blockers hit, context for the next agent picking this up. - -For lasting research learnings, write to RESEARCH_JOURNAL.md instead. - ---- - -## Template entry - -``` -## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME - -### Session goal -[what this agent was asked to do] - -### What was done -[brief -- the commit history has the details] - -### Blockers / open questions -[anything the next agent needs to know] - -### Pueue queue state -[any jobs queued: `pueue status` output or summary] -``` diff --git a/eval.py b/eval.py index ca46564..094f58e 100644 --- a/eval.py +++ b/eval.py @@ -21,7 +21,7 @@ import tyro from loguru import logger # {FILL_IN}: import your model -# from model import Config, build_model +# from train import Config, build_model EVAL_SEED = 42 # FROZEN: never change @@ -85,6 +85,7 @@ def main(cfg: EvalConfig): data = load_eval_data() # {FILL_IN}: build and load your model + # from train import Config, build_model # model_cfg = Config() # model = build_model(model_cfg) # if cfg.checkpoint: diff --git a/justfile b/justfile index 3b979ed..db55ae2 100644 --- a/justfile +++ b/justfile @@ -13,7 +13,7 @@ default: # Fast smoke test: shape checks on CPU, short run smoke: #!/usr/bin/env bash - BEARTYPE=1 JAX_PLATFORMS=cpu uv run python train.py --max_steps=1 + BEARTYPE=1 uv run python train.py --max_steps=2 echo smoke passed # --- Evaluation (FROZEN logic -- see eval.py) --- diff --git a/model.py b/model.py deleted file mode 100644 index 144f925..0000000 --- a/model.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Model definition and config. - -This file is NOT frozen -- agents modify it freely in worktrees. -Keep eval.py's evaluate() interface stable: it calls build_model(cfg) and model.forward(x). -""" - -from dataclasses import dataclass - -import torch -import torch.nn as nn -import tyro -from jaxtyping import Float -from loguru import logger -from torch import Tensor - -# beartype checking enabled only when BEARTYPE=1 (smoke tests) -import os -if os.environ.get("BEARTYPE"): - from beartype import beartype as typechecker - from jaxtyping import jaxtyped -else: - def typechecker(f): return f - def jaxtyped(**_): return lambda f: f - - -@dataclass -class Config: - """Model and training hyperparameters. Edit freely.""" - - # model - d_model: int = 256 - n_layers: int = 4 - # {FILL_IN}: add your architecture params - - # training - lr: float = 3e-4 - batch_size: int = 32 - max_steps: int = 1000 - seed: int = 42 - - # data - # {FILL_IN}: add data params - - -class Model(nn.Module): - """ - {FILL_IN}: replace with your actual model. - """ - - def __init__(self, cfg: Config): - super().__init__() - self.cfg = cfg - # {FILL_IN}: define layers - # e.g. self.embed = nn.Embedding(vocab_size, cfg.d_model) - - @jaxtyped(typechecker=typechecker) - def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]: - # {FILL_IN}: implement forward pass - raise NotImplementedError("{FILL_IN}: implement forward()") - - -def build_model(cfg: Config) -> Model: - torch.manual_seed(cfg.seed) - model = Model(cfg) - n_params = sum(p.numel() for p in model.parameters()) - logger.info(f"Model: {n_params:,} parameters") - return model - - -if __name__ == "__main__": - cfg = tyro.cli(Config) - model = build_model(cfg) - logger.info(f"Config: {cfg}") diff --git a/program.md b/program.md index 85b6b38..2e39d3a 100644 --- a/program.md +++ b/program.md @@ -1,21 +1,15 @@ - + # Research Program -**Project**: {FILL_IN: one sentence describing the research problem} +**Project**: {FILL_IN: one sentence} -**Metric**: {FILL_IN: what we optimize, e.g. val_bpb, accuracy, F1} (lower/higher is better) +**Metric**: {FILL_IN: metric name} ({lower/higher} is better). Target: 5-40 min per run. -**Metric design requirements** (enforce before first real experiment): -- Train + eval runs in 5-40 minutes on your GPU -- Variance across seeds < effect size of a meaningful improvement (run baseline x3, check std) -- Deterministic given same seed (fixed data order, fixed eval split) -- If variance is too high: use more eval data, smaller model, or a proxy metric with less noise +**Hypothesis space**: {FILL_IN: what class of approaches} -**Hypothesis space**: {FILL_IN: what class of approaches are in scope} - -Read `0_docs/problem.md` for full context. +See `0_docs/problem.md` for full problem context and baseline numbers. --- @@ -24,182 +18,69 @@ Read `0_docs/problem.md` for full context. | Type | Files | Rule | |------|-------|------| | FROZEN | `program.md`, `eval.py`, `meta_journal.md` | Never edit without `META_MODE=1` | -| GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Only commit from main; worktrees append to root copy | -| APPEND-ONLY | `*_journal.md` | New entries at top, never edit old ones | +| GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Commit from main only; worktrees append to root copy | +| APPEND-ONLY | `RESEARCH_JOURNAL.md`, `human_journal.md`, `meta_journal.md` | New entries at top, never edit old entries | | REGULAR | everything else | Modify freely in your worktree | --- ## Agent Algorithm -``` -YOU ARE AN AGENT. Follow this loop: +```python +# MUST read before starting: +read RESEARCH_JOURNAL.md # what has been tried, what worked/failed +read "Lessons Learned" below # gotchas -- don't repeat past failures -read RESEARCH_JOURNAL.md # what has been tried -read 0_docs/problem.md # what we're solving - -n_ideas = count files in 1_ideas/ (not _TEMPLATE.md) +n_ideas = count(1_ideas/*.md) - 1 # minus _TEMPLATE.md if n_ideas < 30: - ## IDEATE - - Read at least one file from 0_docs/papers/ (or fetch a new paper) - - Do at least one web search for recent approaches - - Fetch papers: use /semantic-search or /exa-search skills - -> save FULL paper text to 0_docs/papers/{slug}.md (not summaries -- full text) - -> optionally add a vargdown-style argument map to 0_docs/papers/{slug}_analysis.argdown - -> add key insight (1-3 observations with sources) to RESEARCH_JOURNAL.md - - Brainstorm ideas. Quality bar: - * Novel (not in RESEARCH_JOURNAL.md already) - * Mechanistically grounded (not just hyperparameter tuning) - * Not sklearn slop -- must be a real ML research contribution - * Bold enough that it could be a paper contribution - - For each idea: - write 1_ideas/{YYYY-MM-DD}_{slug}.md (use _TEMPLATE.md format) - spawn subagent to critique the idea (prompt: "Is this idea sound? - What are the failure modes? Is the hypothesis testable?") - append subagent feedback to the idea file - - Append summary of new ideas + paper insights to RESEARCH_JOURNAL.md + # IDEATE + # MUST read: 0_docs/ideation_guide.md (epistemics, paper fetching, brainstorm discipline) + read 0_docs/problem.md + search 1+ papers (semantic-search, exa-search, bibtex MCP) + save full paper text to 0_docs/papers/{slug}.md # full text, not summaries + for each idea: + write 1_ideas/{YYYY-MM-DD}_{slug}.md # use _TEMPLATE.md + subagent critique: "Is this sound? Failure modes? Testable?" + append subagent feedback to idea file + append paper insights + ideas to RESEARCH_JOURNAL.md else: - ## IMPLEMENT - pick the best idea from 1_ideas/ based on: - - subagent rating (see feedback section in idea file) - - novelty relative to RESEARCH_JOURNAL.md - - expected impact on metric - - implementation feasibility + # IMPLEMENT + # MUST read: 0_docs/conventions.md (coding style) + pick best idea (subagent rating + novelty + expected impact) + just worktree {slug} # creates 5_worktrees/{slug} on branch exp/{slug} + implement in worktree (edit train.py; do NOT touch eval.py, program.md) - slug = idea filename slug - run: git worktree add 5_worktrees/{slug} -b exp/{slug} - cd 5_worktrees/{slug} + # TEST + subagent code review vs idea doc + just smoke + just eval # appends row to results.tsv - implement the idea (modify train.py, model.py, etc.) - do NOT modify: eval.py, program.md, meta_journal.md + # REPORT + write 9_reports/{YYYY-MM-DD}_{slug}.md # use _TEMPLATE.md + append to RESEARCH_JOURNAL.md: what tried, delta metric, observation vs inference - ## TEST - spawn subagent: "Code review this against the idea doc 1_ideas/{slug}.md. - Does the implementation match the hypothesis? Any bugs?" - run: just smoke # fast sanity check - run: just eval # appends to results.tsv - - ## REPORT - write 9_reports/{YYYY-MM-DD}_{slug}.md (use _TEMPLATE.md format) - append short summary to RESEARCH_JOURNAL.md: - - what was tried, what metric changed, what you learned - - key observation vs inference distinction - - ## SUBMIT - git commit -m "exp({slug}): {one-line description}" + # SUBMIT + git commit -m "exp({slug}): {one line}" git push origin exp/{slug} - if result beats best in results.tsv: - create PR for human to merge + if beats best in results.tsv: open PR for human -## QUEUING EXPERIMENTS (pueue) - -Use pueue to queue experiments for the single GPU -- one at a time, no collision: - - # Queue with a label showing the question and expected resolution - pueue add --label "Q: does X help? H: expect +0.05 metric" -- just eval --config=path - - # Check queue / status / logs - pueue status - pueue log {task_id} # full stdout - pueue follow {task_id} # live tail - -Labels encode the hypothesis being tested. After the run, append observed vs expected -to RESEARCH_JOURNAL.md. The label shows up in `pueue status` so you can track what -question each running/queued job is answering. - - # Example: multiple experiments queued with different hypotheses - pueue add --label "Q: rotary vs sinusoidal? H: rotary saves 0.1 bpb" -- just eval rotary - pueue add --label "Q: flash-attn memory? H: 2x batch size same speed" -- just eval flash - pueue add --label "Q: does layer norm placement matter? H: pre-norm better" -- just eval prenorm +# GPU QUEUE (pueue -- one GPU, no collision) +just queue "Q: does X help? H: expect +delta" eval {args} +pueue status # shows hypothesis label for each queued/running job ``` --- -## Coding Conventions +## Lessons Learned and Gotchas -Fail fast. No defensive programming. No silent fallbacks. +Format: `YYYY-MM-DD | title | lesson (one line)` -```python -# shape ops: einops for clarity -from einops import rearrange, reduce -x = rearrange(x, 'b s h d -> b h s d') - -# einsum for explicit contraction -out = torch.einsum('b h s d, b h d v -> b h s v', q, k) - -# jaxtyping on function boundaries (docs + smoke-test checking) -from jaxtyping import Float -from torch import Tensor -def encode(x: Float[Tensor, 'b s d']) -> Float[Tensor, 'b s h']: - ... - -# logging: loguru not print -from loguru import logger -logger.info(f"loss={loss:.4f}") - -# dataframes: polars v1 -import polars as pl -df.group_by("exp").agg(pl.col("metric").mean()) - -# config: tyro dataclass -import tyro -from dataclasses import dataclass - -@dataclass -class Config: - lr: float = 3e-4 - # {FILL_IN} - -cfg = tyro.cli(Config) -``` - ---- - -## Research Epistemics - -Separate observations from inferences: -- **Observation**: "val_bpb dropped from 3.2 to 2.9 on run X" (measured fact) -- **Inference**: "this suggests the attention head is learning positional structure" (interpretation) -- **Claim from paper**: "authors claim X" -- not "X is true" unless you verified it - -For complex arguments, use `/vargdown` skill: verified argument maps with credences. - -Trust signals: community adoption > papers citing it > open source code > author reputation. - ---- - -## Available Skills - -Assume installed at `~/.claude/skills/` (from https://github.com/wassname/skills): - -| Skill | Use for | -|-------|---------| -| `/semantic-search` | Search arXiv, Semantic Scholar, DBLP, OpenAlex | -| `/arxiv-fetch` | Download full paper text given arXiv ID/URL | -| `/exa-search` | Neural web search for recent approaches | -| `/vargdown` | Verified argument maps with credences for complex reasoning | -| `/gsd` | Get Shit Done: spec -> implement -> test -> review -> wrap | -| `/jaxtyping` | Runtime tensor shape/dtype checking | -| `/justfile` | Project recipes (`just smoke`, `just eval`, `just queue`) | -| `/ml_debug` | ML convergence, gradient analysis, sweep methodology | -| `/brainstorm` | Wide + deep ideation without tunnel vision | -| `/external-review` | Code/plan review via a different model | -| `pueue` | Queue GPU jobs sequentially; label each with Q/hypothesis | - -Also available: bibtex MCP (search_reference, fetch), wandb MCP (query runs). + --- ## Meta-Mode -Human sets `META_MODE=1` to enable editing of FROZEN files and committing to main. - -Use meta-mode to: -- Revise this program.md (agent instructions) -- Update eval.py (e.g., add new metric columns) -- Reflect on the overall research process in meta_journal.md -- Exit-interview style: what worked, what didn't, what would you change? - -To enter: human writes `META_MODE=1` in human_journal.md entry before asking agent. +Human writes `META_MODE=1` in `human_journal.md` to unlock editing FROZEN files and committing to main. Use for: revising this program.md, updating eval.py, exit-interview style process reflection in meta_journal.md. diff --git a/train.py b/train.py index 974d381..c94bb52 100644 --- a/train.py +++ b/train.py @@ -1,25 +1,90 @@ """ -Training loop. NOT frozen -- agents modify freely in worktrees. +Model + training. NOT frozen -- agents modify freely in worktrees. -Convention: eval.py runs the frozen evaluation. This file handles training. -Keep them separate so eval is never accidentally changed during experimentation. +eval.py is frozen (anti-p-hacking). This file is not. +Everything agents change lives here: architecture, optimizer, loss, data pipeline. """ +import os +from dataclasses import dataclass + import torch +import torch.nn as nn import tyro import wandb +from einops import rearrange # noqa: F401 -- available for use +from jaxtyping import Float from loguru import logger +from torch import Tensor -from model import Config, build_model +if os.environ.get("BEARTYPE"): + from beartype import beartype as typechecker + from jaxtyping import jaxtyped +else: + def typechecker(f): return f + def jaxtyped(**_): return lambda f: f +# --- Config ------------------------------------------------------------------- + +@dataclass +class Config: + """All hyperparameters. Edit freely. eval.py imports this.""" + + # model + d_model: int = 256 + n_layers: int = 4 + # {FILL_IN}: add architecture params + + # training + lr: float = 3e-4 + batch_size: int = 32 + max_steps: int = 1000 + seed: int = 42 + + # data + # {FILL_IN}: add data params + + # logging + wandb_project: str = "{FILL_IN}" + + +# --- Model -------------------------------------------------------------------- + +class Model(nn.Module): + """ + {FILL_IN}: replace with your architecture. + Agents: this is the main thing you modify between experiments. + """ + + def __init__(self, cfg: Config): + super().__init__() + self.cfg = cfg + # {FILL_IN}: define layers + + @jaxtyped(typechecker=typechecker) + def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]: + # {FILL_IN}: implement forward pass + raise NotImplementedError("{FILL_IN}: implement forward()") + + +def build_model(cfg: Config) -> Model: + torch.manual_seed(cfg.seed) + model = Model(cfg) + n_params = sum(p.numel() for p in model.parameters()) + logger.info(f"Model: {n_params:,} parameters") + return model + + +# --- Training ----------------------------------------------------------------- + def train(cfg: Config): torch.manual_seed(cfg.seed) wandb.init( - project="{FILL_IN}", # replace with your W&B project name + project=cfg.wandb_project, config=vars(cfg), - # group is set by justfile sweep recipes via WANDB_RUN_GROUP env var + # WANDB_RUN_GROUP env var set by justfile sweep recipes ) model = build_model(cfg)