From fc46d878cf1981a3fbdd7c16ae99c13575c13d12 Mon Sep 17 00:00:00 2001
From: wassname <1103714+wassname@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:40:34 +0800
Subject: [PATCH] init

---
 .claude/settings.local.json |   8 ++
 .githooks/pre-commit        |  62 +++++++++++
 .gitignore                  |  26 +++++
 0_docs/papers/.gitkeep      |   0
 0_docs/problem.md           |  53 ++++++++++
 1_ideas/_TEMPLATE.md        |  64 +++++++++++
 9_reports/_TEMPLATE.md      |  71 +++++++++++++
 AGENTS.md                   |   1 +
 CLAUDE.md                   |   1 +
 RESEARCH_JOURNAL.md         |  40 +++++++
 agent_journal.md            |  30 ++++++
 eval.py                     | 110 +++++++++++++++++++
 human_journal.md            |  52 +++++++++
 justfile                    |  69 ++++++++++++
 meta_journal.md             |  47 +++++++++
 model.py                    |  74 +++++++++++++
 program.md                  | 205 ++++++++++++++++++++++++++++++++++++
 pyproject.toml              |  31 ++++++
 train.py                    |  51 +++++++++
 19 files changed, 995 insertions(+)
 create mode 100644 .claude/settings.local.json
 create mode 100755 .githooks/pre-commit
 create mode 100644 .gitignore
 create mode 100644 0_docs/papers/.gitkeep
 create mode 100644 0_docs/problem.md
 create mode 100644 1_ideas/_TEMPLATE.md
 create mode 100644 9_reports/_TEMPLATE.md
 create mode 120000 AGENTS.md
 create mode 120000 CLAUDE.md
 create mode 100644 RESEARCH_JOURNAL.md
 create mode 100644 agent_journal.md
 create mode 100644 eval.py
 create mode 100644 human_journal.md
 create mode 100644 justfile
 create mode 100644 meta_journal.md
 create mode 100644 model.py
 create mode 100644 program.md
 create mode 100644 pyproject.toml
 create mode 100644 train.py

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 0000000..94b3710
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,8 @@
+{
+  "permissions": {
+    "allow": [
+      "Read(//home/wassname/.claude/skills/setup-repo/**)",
+      "Read(//home/wassname/dev/my-skills/setup-repo/**)"
+    ]
+  }
+}
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
new file mode 100755
index 0000000..c108685
--- /dev/null
+++ b/.githooks/pre-commit
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Pre-commit hook: enforce autoresearch repo invariants.
+# Install: git config core.hooksPath .githooks
+
+set -euo pipefail  # abort on command failure, unset variables, or a failing pipeline stage
+
+FROZEN_FILES=("program.md" "eval.py" "meta_journal.md")  # staged changes are rejected unless META_MODE=1
+GLOBAL_FILES=("RESEARCH_JOURNAL.md" "results.tsv")  # staging these off main asks for confirmation
+YELLOW='\033[1;33m'  # ANSI escapes, interpreted by `echo -e` below
+RED='\033[0;31m'
+NC='\033[0m'  # no color
+
+CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")  # "unknown" if git fails
+IS_WORKTREE=$(git rev-parse --git-dir | grep -q "worktrees" && echo "yes" || echo "no")  # "yes" if the git dir path contains "worktrees"
+META_MODE="${META_MODE:-0}"  # defaults to 0 when unset or empty
+
+# ── 1. Warn on direct commits to main ────────────────────────────────────────
+if [[ "$CURRENT_BRANCH" == "main" && "$META_MODE" != "1" ]]; then  # META_MODE=1 skips the prompt
+    echo -e "${YELLOW}[hook] WARNING: committing directly to main.${NC}"
+    echo "  Agents should work in worktrees: just worktree {slug}"
+    echo "  To suppress: META_MODE=1 git commit ..."
+    echo ""
+    echo -n "  Are you sure? (y/N) "
+    read -r reply < /dev/tty  # NOTE(review): fails without a terminal and, under `set -e`, aborts the commit -- confirm intended
+    if [[ "$reply" != "y" && "$reply" != "Y" ]]; then  # anything except y/Y cancels the commit
+        echo "Aborted."
+        exit 1
+    fi
+fi
+
+# ── 2. Protect FROZEN files ───────────────────────────────────────────────────
+if [[ "$META_MODE" != "1" ]]; then  # META_MODE=1 lifts the protection
+    for f in "${FROZEN_FILES[@]}"; do
+        if ! git diff --cached --quiet -- "$f"; then  # exit 1 => $f has staged changes (no pipe, no regex on the name)
+            echo -e "${RED}[hook] ERROR: '$f' is FROZEN.${NC}"
+            echo "  Only edit in meta-mode: META_MODE=1 git commit ..."
+            echo "  See meta_journal.md for instructions."
+            exit 1
+        fi
+    done
+fi
+
+# ── 3. Warn about GLOBAL files committed from worktrees ───────────────────────
+if [[ "$IS_WORKTREE" == "yes" || "$CURRENT_BRANCH" != "main" ]]; then  # any linked worktree or non-main branch
+    for f in "${GLOBAL_FILES[@]}"; do
+        if ! git diff --cached --quiet -- "$f"; then  # exit 1 => $f has staged changes (no pipe, no regex on the name)
+            echo -e "${YELLOW}[hook] WARNING: '$f' is a GLOBAL file.${NC}"
+            echo "  It should only be committed from main (not from worktree branches)."
+            echo "  Agents: append to the root project copy instead:"
+            echo "    echo '...' >> \"\$(dirname \"\$(git rev-parse --path-format=absolute --git-common-dir)\")/$f\""
+            echo ""
+            echo -n "  Commit anyway? (y/N) "
+            read -r reply < /dev/tty  # prompt is answered from the terminal, not stdin
+            if [[ "$reply" != "y" && "$reply" != "Y" ]]; then  # anything except y/Y cancels the commit
+                echo "Aborted. Unstage with: git reset HEAD $f"
+                exit 1
+            fi
+        fi
+    done
+fi
+
+exit 0
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3f33575
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,26 @@
+# GLOBAL file: lives only in the root project directory; ignored so no branch or worktree commits it
+# (git hook enforces this -- see .githooks/pre-commit)
+results.tsv
+
+# experiment worktrees are created here (see `just worktree`); keep them out of the parent repo's index
+5_worktrees/
+
+# python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.venv/
+dist/
+
+# ML artifacts
+outputs/
+wandb/
+data/
+*.ckpt
+*.pt
+*.safetensors
+
+# editor
+.vscode/
+.idea/
+*.swp
diff --git a/0_docs/papers/.gitkeep b/0_docs/papers/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/0_docs/problem.md b/0_docs/problem.md
new file mode 100644
index 0000000..4f7a186
--- /dev/null
+++ b/0_docs/problem.md
@@ -0,0 +1,53 @@
+# Problem Statement
+
+<!-- FILL IN: replace everything below with your actual problem description -->
+<!-- This file is read by agents at the start of every IDEATE session -->
+
+## What we're trying to solve
+
+{FILL_IN: 2-3 sentences describing the research problem}
+
+## Why it matters
+
+{FILL_IN: why does solving this matter? What's the impact?}
+
+## Current best approaches
+
+{FILL_IN: brief description of SOTA and why it's insufficient}
+
+## Our approach / hypothesis space
+
+{FILL_IN: what class of solutions are we exploring?}
+
+## Metric
+
+We optimize: **{metric name}** ({lower/higher} is better)
+
+Baseline (untrained or current best): **{value}**
+
+## Data
+
+{FILL_IN: dataset, size, splits}
+- Train: {description}
+- Val (eval.py uses this, FROZEN split): {description}
+
+## Constraints
+
+- {FILL_IN: compute budget, e.g. "5 min training on single A100"}
+- {FILL_IN: any other hard constraints}
+
+## Key papers
+
+See `0_docs/papers/` for full summaries. Quick list:
+
+| Paper | Key claim | Trust signal |
+|-------|-----------|--------------|
+| {FILL_IN} | {claim} | {citations / code / self-report} |
+
+## Lessons so far
+
+{start empty, accumulate as experiments run}
+
+## What has NOT worked
+
+{start empty, accumulate}
diff --git a/1_ideas/_TEMPLATE.md b/1_ideas/_TEMPLATE.md
new file mode 100644
index 0000000..21b9ad3
--- /dev/null
+++ b/1_ideas/_TEMPLATE.md
@@ -0,0 +1,64 @@
+# {slug}: {one-line title}
+
+<!-- Copy to: 1_ideas/YYYY-MM-DD_{slug}.md -->
+
+## Metadata
+- **Date**: YYYY-MM-DD
+- **Author**: {agent or human name}
+- **Status**: draft | validated | in-progress | done | rejected
+
+## Hypothesis
+
+**Question**: What happens if we {specific change}?
+
+**Prediction**: We expect {metric} to change by {amount} because {mechanism}.
+
+**Falsification**: If {metric} does NOT change by at least {threshold}, the hypothesis is wrong.
+
+## Approach
+
+Brief description of the implementation. What files change? What's the core modification?
+
+```python
+# pseudocode or key snippet showing the idea
+```
+
+## Why this is novel
+
+How does this differ from what's already in RESEARCH_JOURNAL.md? What paper/idea is it based on?
+
+## Expected implementation effort
+
+Small (< 1 hour) / Medium (1-4 hours) / Large (> 4 hours)
+
+## Pueue label (for queuing)
+
+```
+Q: does {change} improve {metric}? H: expect {delta}
+```
+
+---
+
+## Subagent Critique
+
+<!-- Append subagent feedback here before implementing -->
+<!-- Prompt used: "Is this idea sound? What are the failure modes? Is the hypothesis testable?" -->
+
+**Rating**: 1-5 (1=bad, 5=excellent)
+
+**Strengths**:
+-
+
+**Failure modes**:
+-
+
+**Verdict**: proceed / revise / reject
+
+---
+
+## Result (fill in after experiment)
+
+**Commit**: SHORT_SHA
+**Report**: [9_reports/YYYY-MM-DD_{slug}.md](../9_reports/YYYY-MM-DD_{slug}.md)
+**Observed**: {metric} = {value} (expected {expected})
+**Verdict**: confirmed / partially confirmed / refuted / inconclusive
diff --git a/9_reports/_TEMPLATE.md b/9_reports/_TEMPLATE.md
new file mode 100644
index 0000000..940c20c
--- /dev/null
+++ b/9_reports/_TEMPLATE.md
@@ -0,0 +1,71 @@
+# Lab Report: {slug}
+
+<!-- Copy to: 9_reports/YYYY-MM-DD_{slug}.md -->
+<!-- Self-contained: a reader should understand the experiment without reading anything else -->
+
+## Metadata
+
+| Field | Value |
+|-------|-------|
+| Date | YYYY-MM-DD |
+| Commit | SHORT_SHA |
+| Branch | exp/{slug} |
+| Worktree | 5_worktrees/{slug} |
+| Agent | {name} |
+| Idea doc | [1_ideas/YYYY-MM-DD_{slug}.md](../1_ideas/YYYY-MM-DD_{slug}.md) |
+
+## Context
+
+{1-2 sentences: what problem are we solving and why does this experiment matter}
+
+## Hypothesis
+
+**Question**: What happens if we {change}?
+
+**Prediction**: {metric} improves by {amount} because {mechanism}.
+
+## Experiment
+
+What was changed vs the baseline:
+
+```diff
+# key diff -- just the essential change, not the whole file
+```
+
+Baseline: commit {SHORT_SHA} (or "untrained")
+
+## Observations
+
+Measured facts only -- no interpretation here.
+
+| Run | Metric | Value | Delta vs baseline |
+|-----|--------|-------|-------------------|
+| baseline | {metric} | {value} | -- |
+| this exp | {metric} | {value} | {+/-delta} |
+
+```
+# pueue log output or relevant stdout snippet
+```
+
+## Diagnosis
+
+> Caution: 95% of ML failures are bugs, engineering issues, or misconceptions -- not deep theory.
+> State credences explicitly. Don't overclaim.
+
+**Most likely explanation** (credence: X%): {explanation}
+
+**Alternative explanations**:
+- {alternative} (credence: Y%)
+
+**What would distinguish these**: {test or observation that would separate them}
+
+## Limitations
+
+- {what this experiment doesn't test}
+- {confounds}
+
+## Future work
+
+- {most promising next step given this result}
+- {if confirmed: what's the natural follow-up?}
+- {if refuted: what's the diagnosis and what would we try instead?}
diff --git a/AGENTS.md b/AGENTS.md
new file mode 120000
index 0000000..855cf16
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1 @@
+program.md
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 120000
index 0000000..855cf16
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1 @@
+program.md
\ No newline at end of file
diff --git a/RESEARCH_JOURNAL.md b/RESEARCH_JOURNAL.md
new file mode 100644
index 0000000..b513d49
--- /dev/null
+++ b/RESEARCH_JOURNAL.md
@@ -0,0 +1,40 @@
+<!-- APPEND-ONLY: new entries at top. Never edit old entries. -->
+<!-- Tracked in main branch only. Git hook asks for confirmation before committing it from worktrees or other branches. -->
+<!-- All worktrees append here: write to the root project copy. -->
+<!--
+Format:
+## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME
+
+### What I did
+[1-3 sentences: what experiment, what change]
+
+### Observations (facts)
+- "val_bpb = 2.91 (baseline 3.02, delta -0.11)" -- measured, not interpreted
+
+### Inferences (interpretations)
+- "suggests X because Y" -- label as inference, give credence
+
+### What surprised me
+[anything unexpected -- good signal for future ideation]
+
+### Next questions
+[what to try next based on this result]
+-->
+
+# Research Journal
+
+---
+
+## 2026-04-04 | commit: c687a68 | branch: main | agent: human
+
+### What I did
+Initialized autoresearch template. No experiments yet.
+
+### Observations
+- Repo structure created: program.md (FROZEN), eval.py (FROZEN), justfile, pyproject.toml
+- Git hooks configured to protect FROZEN files and warn on worktree commits of GLOBAL files
+
+### Next questions
+- Fill in `{FILL_IN}` markers in program.md, eval.py, 0_docs/problem.md
+- Run `just smoke` to verify setup
+- Start IDEATE loop: read 0_docs/problem.md, search for papers, generate first ideas
diff --git a/agent_journal.md b/agent_journal.md
new file mode 100644
index 0000000..248f54c
--- /dev/null
+++ b/agent_journal.md
@@ -0,0 +1,30 @@
+<!-- APPEND-ONLY: new entries at top. Never edit old entries. -->
+<!-- Written by agents (not humans -- see human_journal.md). -->
+<!-- Format: ## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME -->
+
+# Agent Journal
+
+Short-term notes from agents: what was attempted in this session,
+blockers hit, context for the next agent picking this up.
+
+For lasting research learnings, write to RESEARCH_JOURNAL.md instead.
+
+---
+
+## Template entry
+
+```
+## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME
+
+### Session goal
+[what this agent was asked to do]
+
+### What was done
+[brief -- the commit history has the details]
+
+### Blockers / open questions
+[anything the next agent needs to know]
+
+### Pueue queue state
+[any jobs queued: `pueue status` output or summary]
+```
diff --git a/eval.py b/eval.py
new file mode 100644
index 0000000..ca46564
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,110 @@
+"""
+FROZEN: only edit in meta-mode (META_MODE=1).
+
+Frozen to prevent p-hacking, seed hacking, and eval-set leakage.
+All experiments must be compared using this exact eval logic.
+
+Appends one row to results.tsv per run.
+"""
+
+# FROZEN: only edit in meta-mode (META_MODE=1)
+
+import csv
+import os
+import subprocess
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+
+import torch
+import tyro
+from loguru import logger
+
+# {FILL_IN}: import your model
+# from model import Config, build_model
+
+
+EVAL_SEED = 42          # FROZEN: never change
+EVAL_SPLIT = "val"      # FROZEN: never change
+RESULTS_FILE = Path(__file__).parent / "results.tsv"  # NOTE(review): sits next to this eval.py, i.e. inside the worktree when run from one -- confirm vs. the root-copy rule
+
+
+@dataclass
+class EvalConfig:
+    checkpoint: str = ""          # path to checkpoint or empty for untrained baseline
+    # {FILL_IN}: add any eval-time config here (e.g. batch_size, max_seq_len)
+    batch_size: int = 32          # NOTE(review): not read anywhere in this file yet
+
+
+def get_git_info() -> dict:
+    def run(cmd):
+        return subprocess.check_output(cmd, text=True).strip()
+
+    return {
+        "commit": run(["git", "rev-parse", "--short", "HEAD"]),
+        "branch": run(["git", "rev-parse", "--abbrev-ref", "HEAD"]),
+        "worktree": Path.cwd().name,
+    }
+
+
+def load_eval_data():
+    """Load fixed evaluation data. FROZEN: do not change split or preprocessing."""
+    torch.manual_seed(EVAL_SEED)  # reseed torch's global RNG with the frozen eval seed
+    # {FILL_IN}: load your dataset
+    # e.g. dataset = load_dataset("openwebtext", split=EVAL_SPLIT)
+    raise NotImplementedError("{FILL_IN}: implement load_eval_data()")  # template stub: replace with the loaded data
+
+
+def evaluate(model, data) -> dict:
+    """Run evaluation and return a dict of metric values. FROZEN: do not change metric computation."""
+    model.eval()
+    torch.manual_seed(EVAL_SEED)  # reseed torch's global RNG with the frozen eval seed
+    # {FILL_IN}: compute your metric(s)
+    # e.g. return {"val_bpb": compute_bpb(model, data), "val_loss": compute_loss(model, data)}
+    raise NotImplementedError("{FILL_IN}: implement evaluate()")  # template stub: replace with the metrics dict
+
+
+def append_results(row: dict):
+    """Append one row to results.tsv. Creates file with header if missing."""
+    fieldnames = list(row.keys())
+    file_exists = RESULTS_FILE.exists()
+
+    with open(RESULTS_FILE, "a", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter="\t")
+        if not file_exists:
+            writer.writeheader()
+        writer.writerow(row)
+
+    logger.info(f"Results appended to {RESULTS_FILE}")
+
+
+def main(cfg: EvalConfig):
+    logger.info(f"Eval config: {cfg}")
+    git = get_git_info()  # commit / branch / worktree values for the results row
+
+    data = load_eval_data()
+
+    # {FILL_IN}: build and load your model
+    # model_cfg = Config()
+    # model = build_model(model_cfg)
+    # if cfg.checkpoint:
+    #     model.load_state_dict(torch.load(cfg.checkpoint))
+    raise NotImplementedError("{FILL_IN}: build and load model")  # delete once `model` is defined above; nothing below runs until then
+
+    metrics = evaluate(model, data)
+
+    row = {
+        "date": datetime.now().isoformat(timespec="seconds"),  # naive local time, second resolution
+        "commit": git["commit"],
+        "branch": git["branch"],
+        "worktree": git["worktree"],
+        **metrics,  # metric columns follow the provenance columns
+        # {FILL_IN}: add config fields you want tracked (e.g. model_cfg.n_layers)
+    }
+
+    logger.info(f"Metrics: {metrics}")
+    append_results(row)
+
+
+if __name__ == "__main__":
+    tyro.cli(main)
diff --git a/human_journal.md b/human_journal.md
new file mode 100644
index 0000000..c67a6cc
--- /dev/null
+++ b/human_journal.md
@@ -0,0 +1,52 @@
+<!-- Only editable by humans -->
+
+# 2026-04-04 11:31:20
+
+I'd like to turn this into a template repo for autoresearch
+
+first read the skills for 
+
+justfile pueue uv jaxtyping skill and gsd skill, vargdown 
+
+assume these are available and can be described in one or two lines then references. We assume that https://github.com/wassname/skills is installed to the machine.
+
+First principles
+- some files are SINGLETONS: they are only in the main branch, and protected by git hook reminders and/or gitignore
+- some are APPEND ONLY like journals
+- some are write only by automatic (eval.py), meta-agents (meta_journal.md, etc), agents (research journal), humans (human journal)
+
+then read my best practice guide here... oh I can't find it... highlight from memory
+- it's like https://github.com/karpathy/autoresearch/blob/master/program.md or https://github.com/NousResearch/autonovel/blob/master/program.md read these for context and outline (but be aware the 1st is simple, and the 2nd is for writing not ml expeiments like here)
+- but we add a lesson learned and gotcha section
+- we have a FROZEN / META files, with a comment at the top saying "only edit in meta mode" to the top of these files: 
+  - eval.py - this is frozen to stop 1) cheating 2) make sure results are compariable. Anything needed to stop p-hacking, seed hacking, changing the eval set, etc. While anything the agent can and should improve and compare should be in non frozen files
+  - meta_journal.md -
+  - program.md 
+    - this is symlinked to AGENTS.md and CLAUDE.md, to make sure it's loded in
+    - but it's frozen to stop the agent from changing its own instructions. Unless in metamode! 
+    - Normal non-meta agents are encourages to add meta-feedback to the research log.
+    - This should give agents a clear pseudo algo code of how to work
+      - If the ideas queue if <30, think for hours about wide and deep next steps, based on read at least one main files, and at least one paper, and one web search. Ideas should not by hyperparameter tuning, or sklearn slop. They should be novel, and bold, and you should be able to show an independant subagent, and it says it makes sense. should we in 2_ideas/{ts}_{slug}.md have a log at end for subagent feedback and log. This will get turned into lab reports, and we want to seperate observations from inferences and guesses.
+      - If the ideas queue is >30, pick the best one, and do it in a new worktree
+      - When ready to test 1) have a subagent do a code review against the idea 2) do a smoke test 3) run eval.py
+      - When finished improving write a short self contained lab report into 9_lab_reports/ with 
+        - metadata: date, commit, worktree, agent name, 
+        - intro: context, hypothesis, 
+        - experiment, observations, results
+        - diagnosis: be very careful here to not overclaim, 95% of the time in ML it failed due to bug, poor engineering, or a misconception.
+        - limitations, future work
+      - Then append a short summary to the research journal with learnings.
+- we have GLOBAL files that are 0) only in the root projected directory 1) untracked, gitignored 2) have git hooks to stop them being commited. This so that they don't get commited to worktrees, then overwrite the main one during merge.
+  - These are RESEARCH_JOURNAL.md - append only file that agents in all worktrees should write to. All entried should have date, commit, worktree, agent name. First line in a comment saying it's append only, and showing format
+  - results.tsv - automatically appended by eval.py, early colums have short values, later columns have long values, and the file is git-ignored.
+- our main is protected by a git hook reminding models to work in a worktree, and not commit directly to main (unless human has put them in meta-mode)
+- meta-mode, this is where the agent can take a step back, think about the overall workflow, perhaps do an exit interview. Here we improve the overall process by editing the FROZEN / META files, committing to main, and appending to the meta-journal.
+- Agents should work in worktrees. They should be committed and pushed. If it's a better result than any in results.tsv make it a PR for the human to merge.
+
+
+Generally we want to show not tell, and sometimes show and tell, but never tell but not show. That means that these files should show through specific examples
+
+
+Thinking: We need expanded files for some of the stages, in particular ideation: brainstorming/diagnosis are hard for current LLMs, and can fill up context or just fall into tunnel vision.
+
+Research: To help with ideation we have a 0_docs folder with a problem intro, and papers. Whenever an agent fetches a paper it should write the full markdown summary to 0_docs/papers/{slug}.md, and then add the main insights to the research journal. This way we have a growing set of knowledge that agents can draw on, and we can also track which papers were read when, and what was learned from them. We should marshall available skills, tools, and mcps for searching and fetching and priortise them so we can 1) search 2) download full text to files, and 3) have subagents with good epistemics, the vargdown skills, and knowledge of the problems and what we are looking for write summaries.
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..3b979ed
--- /dev/null
+++ b/justfile
@@ -0,0 +1,69 @@
+set shell := ["bash", "-cu"]
+
+WORKTREES := "5_worktrees"
+IDEAS := "1_ideas"
+REPORTS := "9_reports"
+
+# List all recipes
+default:
+    @just --list
+
+# --- Sanity checks ---
+
+# Fast smoke test: shape checks on CPU, short run (exits non-zero if train.py fails)
+smoke:
+    #!/usr/bin/env bash
+    set -euo pipefail  # shebang recipes do not stop on errors by themselves
+    BEARTYPE=1 JAX_PLATFORMS=cpu uv run python train.py --max_steps=1 && echo smoke passed
+
+# --- Evaluation (FROZEN logic -- see eval.py) ---
+
+# Run eval and append to results.tsv
+eval *ARGS:
+    uv run python eval.py {{ARGS}}
+
+# Show the first 30 lines of results.tsv as an aligned table
+results:
+    @column -t -s $'\t' results.tsv | head -30
+
+# --- Training ---
+
+# Train with default config
+train *ARGS:
+    uv run python train.py {{ARGS}}
+
+# Queue an experiment to pueue with a hypothesis label (label is shell-quoted, so apostrophes are safe)
+# Usage: just queue LABEL CMD...
+queue LABEL *CMD:
+    pueue add --label {{quote(LABEL)}} -- just {{CMD}}
+
+# Show pueue queue status
+status:
+    pueue status
+
+# --- Sweeps (pueue-based, one GPU, sequential) ---
+
+# Sweeps: write a recipe that loops over params and calls `pueue add --label Q -- just eval ...`
+# See justfile-sweeps skill for the full pattern.
+
+# --- Worktrees ---
+
+# Create a new experiment worktree at 5_worktrees/SLUG on a new branch exp/SLUG
+worktree SLUG:
+    git worktree add {{WORKTREES}}/{{SLUG}} -b exp/{{SLUG}}
+
+# List worktrees
+worktrees:
+    git worktree list
+
+# --- Reports ---
+
+# List the 20 most recently modified entries in 9_reports/ (the template is not filtered out)
+reports:
+    ls -t {{REPORTS}} | head -20 || true
+
+# --- Ideas ---
+
+# Count ideas in queue, excluding _TEMPLATE.md (program.md compares this number against 30)
+ideas:
+    ls {{IDEAS}} | grep -vc _TEMPLATE || true
diff --git a/meta_journal.md b/meta_journal.md
new file mode 100644
index 0000000..6864c2f
--- /dev/null
+++ b/meta_journal.md
@@ -0,0 +1,47 @@
+<!-- FROZEN: only edit in meta-mode (META_MODE=1) -->
+<!-- APPEND-ONLY: new entries at top -->
+<!-- Format: ## YYYY-MM-DD HH:MM | agent: NAME -->
+<!-- Purpose: meta-level observations about the research process itself -->
+<!-- (not about experiment results -- those go in RESEARCH_JOURNAL.md) -->
+
+# Meta Journal
+
+Observations about the research *process*: what's working, what's broken,
+what should change in program.md, eval.py, or the overall workflow.
+
+---
+
+## Template entry format
+
+```
+## YYYY-MM-DD HH:MM | agent: {name or "human"}
+
+### Observation
+[What I noticed about the process -- be specific]
+
+### Root cause
+[Why is this happening?]
+
+### Action taken
+[What was changed in FROZEN files, and why]
+
+### Expected effect
+[How should agent behavior change after this update?]
+```
+
+---
+
+## 2026-04-04 | agent: human
+
+### Observation
+Template repo initialized. program.md, eval.py written as FROZEN stubs.
+
+### Root cause
+Starting from scratch -- no prior experiments.
+
+### Action taken
+Created initial file structure. Set up git hooks to protect FROZEN files.
+
+### Expected effect
+Agents can now run the IDEATE/IMPLEMENT loop. Fill in {FILL_IN} markers
+before first real experiment.
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..144f925
--- /dev/null
+++ b/model.py
@@ -0,0 +1,74 @@
+"""
+Model definition and config.
+
+This file is NOT frozen -- agents modify it freely in worktrees.
+Keep the interface eval.py relies on stable: build_model(cfg) and model.forward(x).
+"""
+
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+import tyro
+from jaxtyping import Float
+from loguru import logger
+from torch import Tensor
+
+# beartype checking enabled only when BEARTYPE=1 (smoke tests)
+import os
+if os.environ.get("BEARTYPE"):
+    from beartype import beartype as typechecker
+    from jaxtyping import jaxtyped
+else:
+    def typechecker(f): return f
+    def jaxtyped(**_): return lambda f: f
+
+
+@dataclass
+class Config:
+    """Model and training hyperparameters. Edit freely."""
+
+    # model
+    d_model: int = 256
+    n_layers: int = 4
+    # {FILL_IN}: add your architecture params
+
+    # training
+    lr: float = 3e-4
+    batch_size: int = 32
+    max_steps: int = 1000
+    seed: int = 42
+
+    # data
+    # {FILL_IN}: add data params
+
+
+class Model(nn.Module):
+    """
+    {FILL_IN}: replace with your actual model.
+    """
+
+    def __init__(self, cfg: Config):
+        super().__init__()
+        self.cfg = cfg
+        # {FILL_IN}: define layers
+        # e.g. self.embed = nn.Embedding(vocab_size, cfg.d_model)
+
+    @jaxtyped(typechecker=typechecker)
+    def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]:
+        # {FILL_IN}: implement forward pass
+        raise NotImplementedError("{FILL_IN}: implement forward()")
+
+
+def build_model(cfg: Config) -> Model:
+    torch.manual_seed(cfg.seed)
+    model = Model(cfg)
+    n_params = sum(p.numel() for p in model.parameters())
+    logger.info(f"Model: {n_params:,} parameters")
+    return model
+
+
+if __name__ == "__main__":
+    cfg = tyro.cli(Config)
+    model = build_model(cfg)
+    logger.info(f"Config: {cfg}")
diff --git a/program.md b/program.md
new file mode 100644
index 0000000..85b6b38
--- /dev/null
+++ b/program.md
@@ -0,0 +1,205 @@
+<!-- FROZEN: only edit in meta-mode (META_MODE=1) -->
+<!-- AGENTS.md and CLAUDE.md are symlinks to this file -- always loaded by Claude/Cursor/etc. -->
+
+# Research Program
+
+**Project**: {FILL_IN: one sentence describing the research problem}
+
+**Metric**: {FILL_IN: what we optimize, e.g. val_bpb, accuracy, F1} (lower/higher is better)
+
+**Metric design requirements** (enforce before first real experiment):
+- Train + eval runs in 5-40 minutes on your GPU
+- Variance across seeds < effect size of a meaningful improvement (run baseline x3, check std)
+- Deterministic given same seed (fixed data order, fixed eval split)
+- If variance is too high: use more eval data, smaller model, or a proxy metric with less noise
+
+**Hypothesis space**: {FILL_IN: what class of approaches are in scope}
+
+Read `0_docs/problem.md` for full context.
+
+---
+
+## File Taxonomy
+
+| Type | Files | Rule |
+|------|-------|------|
+| FROZEN | `program.md`, `eval.py`, `meta_journal.md` | Never edit without `META_MODE=1` |
+| GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Only commit from main; worktrees append to root copy |
+| APPEND-ONLY | `*_journal.md`, `RESEARCH_JOURNAL.md` | New entries at top, never edit old ones |
+| REGULAR | everything else | Modify freely in your worktree |
+
+---
+
+## Agent Algorithm
+
+```
+YOU ARE AN AGENT. Follow this loop:
+
+read RESEARCH_JOURNAL.md          # what has been tried
+read 0_docs/problem.md            # what we're solving
+
+n_ideas = count files in 1_ideas/ (not _TEMPLATE.md)
+
+if n_ideas < 30:
+    ## IDEATE
+    - Read at least one file from 0_docs/papers/ (or fetch a new paper)
+    - Do at least one web search for recent approaches
+    - Fetch papers: use /semantic-search or /exa-search skills
+      -> save FULL paper text to 0_docs/papers/{slug}.md (not summaries -- full text)
+      -> optionally add a vargdown-style argument map to 0_docs/papers/{slug}_analysis.argdown
+      -> add key insight (1-3 observations with sources) to RESEARCH_JOURNAL.md
+    - Brainstorm ideas. Quality bar:
+        * Novel (not in RESEARCH_JOURNAL.md already)
+        * Mechanistically grounded (not just hyperparameter tuning)
+        * Not sklearn slop -- must be a real ML research contribution
+        * Bold enough that it could be a paper contribution
+    - For each idea:
+        write 1_ideas/{YYYY-MM-DD}_{slug}.md  (use _TEMPLATE.md format)
+        spawn subagent to critique the idea (prompt: "Is this idea sound?
+          What are the failure modes? Is the hypothesis testable?")
+        append subagent feedback to the idea file
+    - Append summary of new ideas + paper insights to RESEARCH_JOURNAL.md
+
+else:
+    ## IMPLEMENT
+    pick the best idea from 1_ideas/ based on:
+        - subagent rating (see feedback section in idea file)
+        - novelty relative to RESEARCH_JOURNAL.md
+        - expected impact on metric
+        - implementation feasibility
+
+    slug = idea filename slug
+    run: git worktree add 5_worktrees/{slug} -b exp/{slug}
+    cd 5_worktrees/{slug}
+
+    implement the idea (modify train.py, model.py, etc.)
+    do NOT modify: eval.py, program.md, meta_journal.md
+
+    ## TEST
+    spawn subagent: "Code review this against the idea doc 1_ideas/{YYYY-MM-DD}_{slug}.md.
+      Does the implementation match the hypothesis? Any bugs?"
+    run: just smoke                    # fast sanity check
+    run: just eval                     # appends to results.tsv
+
+    ## REPORT
+    write 9_reports/{YYYY-MM-DD}_{slug}.md  (use _TEMPLATE.md format)
+    append short summary to RESEARCH_JOURNAL.md:
+        - what was tried, what metric changed, what you learned
+        - key observation vs inference distinction
+
+    ## SUBMIT
+    git commit -m "exp({slug}): {one-line description}"
+    git push origin exp/{slug}
+    if result beats best in results.tsv:
+        create PR for human to merge
+
+## QUEUING EXPERIMENTS (pueue)
+
+Use pueue to queue experiments for the single GPU -- one at a time, no collision:
+
+    # Queue with a label showing the question and expected resolution
+    pueue add --label "Q: does X help? H: expect +0.05 metric" -- just eval --config=path
+
+    # Check queue / status / logs
+    pueue status
+    pueue log {task_id}       # full stdout
+    pueue follow {task_id}    # live tail
+
+Labels encode the hypothesis being tested. After the run, append observed vs expected
+to RESEARCH_JOURNAL.md. The label shows up in `pueue status` so you can track what
+question each running/queued job is answering.
+
+    # Example: multiple experiments queued with different hypotheses
+    pueue add --label "Q: rotary vs sinusoidal? H: rotary saves 0.1 bpb" -- just eval rotary
+    pueue add --label "Q: flash-attn memory? H: 2x batch size same speed" -- just eval flash
+    pueue add --label "Q: does layer norm placement matter? H: pre-norm better" -- just eval prenorm
+```
+
+---
+
+## Coding Conventions
+
+Fail fast. No defensive programming. No silent fallbacks.
+
+```python
+# shape ops: einops for clarity
+from einops import rearrange, reduce
+x = rearrange(x, 'b s h d -> b h s d')
+
+# einsum for explicit contraction
+out = torch.einsum('b h s d, b h d v -> b h s v', q, k)
+
+# jaxtyping on function boundaries (docs + smoke-test checking)
+from jaxtyping import Float
+from torch import Tensor
+def encode(x: Float[Tensor, 'b s d']) -> Float[Tensor, 'b s h']:
+    ...
+
+# logging: loguru not print
+from loguru import logger
+logger.info(f"loss={loss:.4f}")
+
+# dataframes: polars v1
+import polars as pl
+df.group_by("exp").agg(pl.col("metric").mean())
+
+# config: tyro dataclass
+import tyro
+from dataclasses import dataclass
+
+@dataclass
+class Config:
+    lr: float = 3e-4
+    # {FILL_IN}
+
+cfg = tyro.cli(Config)
+```
+
+---
+
+## Research Epistemics
+
+Separate observations from inferences:
+- **Observation**: "val_bpb dropped from 3.2 to 2.9 on run X" (measured fact)
+- **Inference**: "this suggests the attention head is learning positional structure" (interpretation)
+- **Claim from paper**: "authors claim X" -- not "X is true" unless you verified it
+
+For complex arguments, use `/vargdown` skill: verified argument maps with credences.
+
+Trust signals: community adoption > papers citing it > open source code > author reputation.
+
+---
+
+## Available Skills
+
+Assume installed at `~/.claude/skills/` (from https://github.com/wassname/skills):
+
+| Skill | Use for |
+|-------|---------|
+| `/semantic-search` | Search arXiv, Semantic Scholar, DBLP, OpenAlex |
+| `/arxiv-fetch` | Download full paper text given arXiv ID/URL |
+| `/exa-search` | Neural web search for recent approaches |
+| `/vargdown` | Verified argument maps with credences for complex reasoning |
+| `/gsd` | Get Shit Done: spec -> implement -> test -> review -> wrap |
+| `/jaxtyping` | Runtime tensor shape/dtype checking |
+| `/justfile` | Project recipes (`just smoke`, `just eval`, `just queue`) |
+| `/ml_debug` | ML convergence, gradient analysis, sweep methodology |
+| `/brainstorm` | Wide + deep ideation without tunnel vision |
+| `/external-review` | Code/plan review via a different model |
+| `pueue` | Queue GPU jobs sequentially; label each with Q/hypothesis |
+
+Also available: bibtex MCP (search_reference, fetch), wandb MCP (query runs).
+
+---
+
+## Meta-Mode
+
+Human sets `META_MODE=1` to enable editing of FROZEN files and committing to main.
+
+Use meta-mode to:
+- Revise this program.md (agent instructions)
+- Update eval.py (e.g., add new metric columns)
+- Reflect on the overall research process in meta_journal.md
+- Exit-interview style: what worked, what didn't, what would you change?
+
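+To enter: human writes `META_MODE=1` in a human_journal.md entry before asking the agent. Commits must then be run as `META_MODE=1 git commit ...`, because the pre-commit hook reads the environment variable, not the journal.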
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4fe2196
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,31 @@
+[project]
+name = "{FILL_IN}"  # replace with your project name
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+    "torch>=2.3",
+    "einops>=0.8",
+    "jaxtyping>=0.2",
+    "beartype>=0.18",
+    "loguru>=0.7",
+    "polars>=1.0",
+    "wandb>=0.17",
+    "tyro>=0.8",
+    "tabulate>=0.9",
+    # {FILL_IN}: add project-specific deps
+]
+
+[tool.uv]
+dev-dependencies = [
+    "pytest>=8",
+    "ipykernel",
+]
+
+[tool.ruff]
+line-length = 100
+
+[tool.ruff.lint]
+ignore = [
+    "F722",  # jaxtyping shape strings look like invalid syntax to ruff
+    "F821",  # jaxtyping forward refs
+]
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..974d381
--- /dev/null
+++ b/train.py
@@ -0,0 +1,51 @@
+"""
+Training loop. NOT frozen -- agents modify freely in worktrees.
+
+Convention: eval.py runs the frozen evaluation. This file handles training.
+Keep them separate so eval is never accidentally changed during experimentation.
+"""
+
+import torch
+import tyro
+import wandb
+from loguru import logger
+
+from model import Config, build_model
+
+
+def train(cfg: Config) -> None:
+    """Template training entry point: seed, W&B run, model + AdamW, placeholder step loop."""
+    torch.manual_seed(cfg.seed)
+
+    wandb.init(
+        project="{FILL_IN}",  # replace with your W&B project name
+        config=vars(cfg),  # cfg's instance attributes become the W&B run config
+        # group is set by justfile sweep recipes via WANDB_RUN_GROUP env var
+    )
+
+    model = build_model(cfg)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr)  # unused until step is filled in
+
+    # {FILL_IN}: load training data
+    # train_loader = ...
+
+    model.train()  # assumes build_model returns an nn.Module -- TODO confirm in model.py
+    for step in range(cfg.max_steps):
+        # {FILL_IN}: training step
+        # batch = next(train_iter)  # build train_iter = iter(train_loader) once, not per step
+        # loss = model(batch)
+        # optimizer.zero_grad(); loss.backward(); optimizer.step()
+        if step % 100 == 0:  # progress log every 100 steps, starting at step 0
+            logger.info(f"step={step}")
+            # wandb.log({"loss": loss.item(), "step": step})
+
+    # {FILL_IN}: save checkpoint
+    # torch.save(model.state_dict(), f"outputs/{wandb.run.id}.pt")
+
+    wandb.finish()
+    logger.info("Training complete")
+
+
+if __name__ == "__main__":
+    cfg = tyro.cli(Config)  # build a Config from command-line arguments
+    train(cfg)