init

2026-06-27 17:47:23 +08:00 · 2026-04-04 23:40:34 +08:00
parent c687a68f93
commit fc46d878cf
19 changed files with 995 additions and 0 deletions
@@ -0,0 +1,8 @@
 {
  "permissions": {
    "allow": [
      "Read(//home/wassname/.claude/skills/setup-repo/**)",
      "Read(//home/wassname/dev/my-skills/setup-repo/**)"
    ]
  }
 }
@@ -0,0 +1,62 @@
 #!/usr/bin/env bash
 # Pre-commit hook: enforce autoresearch repo invariants.
 # Install: git config core.hooksPath .githooks
 #
 # Checks, in order:
 #   1. Confirm before committing directly to main (skipped when META_MODE=1).
 #   2. Reject commits that stage a FROZEN file (skipped when META_MODE=1).
 #   3. Confirm before committing a GLOBAL file from a worktree or a non-main branch.
 set -euo pipefail
 FROZEN_FILES=("program.md" "eval.py" "meta_journal.md")
 GLOBAL_FILES=("RESEARCH_JOURNAL.md" "results.tsv")
 YELLOW='\033[1;33m'
 RED='\033[0;31m'
 NC='\033[0m'  # no color
 CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
 IS_WORKTREE=$(git rev-parse --git-dir | grep -q "worktrees" && echo "yes" || echo "no")
 META_MODE="${META_MODE:-0}"
 # Snapshot the staged paths once. Piping `git diff` straight into `grep -q` under
 # `pipefail` can report failure when grep exits on its first match and git receives
 # SIGPIPE; the `if` would then treat a staged file as unstaged and skip the check.
 STAGED_FILES=$(git diff --cached --name-only)
 # is_staged PATH: succeed when PATH is in the staged snapshot.
 # -F matches literally (the "." in "program.md" is not a wildcard); -x matches whole lines.
 is_staged() {
    grep -qxF -- "$1" <<< "$STAGED_FILES"
 }
 # confirm QUESTION: prompt on the controlling terminal; succeed only on y/Y.
 # Hooks do not get the user's stdin, hence /dev/tty. When no terminal can be opened
 # (e.g. a non-interactive agent), answer "no" with an explicit message instead of
 # letting `set -e` kill the hook with a bare redirection error.
 confirm() {
    local reply=""
    echo -n "  $1 (y/N) "
    if ! { read -r reply < /dev/tty; } 2>/dev/null; then
        echo ""
        echo "  [hook] no terminal available to confirm; treating as 'no'."
        return 1
    fi
    [[ "$reply" == "y" || "$reply" == "Y" ]]
 }
 # ── 1. Warn on direct commits to main ────────────────────────────────────────
 if [[ "$CURRENT_BRANCH" == "main" && "$META_MODE" != "1" ]]; then
    echo -e "${YELLOW}[hook] WARNING: committing directly to main.${NC}"
    echo "  Agents should work in worktrees: just worktree {slug}"
    echo "  To suppress: META_MODE=1 git commit ..."
    echo ""
    if ! confirm "Are you sure?"; then
        echo "Aborted."
        exit 1
    fi
 fi
 # ── 2. Protect FROZEN files ───────────────────────────────────────────────────
 if [[ "$META_MODE" != "1" ]]; then
    for f in "${FROZEN_FILES[@]}"; do
        if is_staged "$f"; then
            echo -e "${RED}[hook] ERROR: '$f' is FROZEN.${NC}"
            echo "  Only edit in meta-mode: META_MODE=1 git commit ..."
            echo "  See meta_journal.md for instructions."
            exit 1
        fi
    done
 fi
 # ── 3. Warn about GLOBAL files committed from worktrees ───────────────────────
 if [[ "$IS_WORKTREE" == "yes" || "$CURRENT_BRANCH" != "main" ]]; then
    # `git rev-parse --show-toplevel` inside a linked worktree returns that worktree,
    # not the root project. `git worktree list` prints the main worktree first, so
    # take the path from its first line for the hint below.
    MAIN_ROOT=$(git worktree list --porcelain 2>/dev/null | sed -n '1s/^worktree //p' || true)
    MAIN_ROOT="${MAIN_ROOT:-<root project dir>}"
    for f in "${GLOBAL_FILES[@]}"; do
        if is_staged "$f"; then
            echo -e "${YELLOW}[hook] WARNING: '$f' is a GLOBAL file.${NC}"
            echo "  It should only be committed from main (not from worktree branches)."
            echo "  Agents: append to the root project copy instead:"
            echo "    echo '...' >> ${MAIN_ROOT}/$f"
            echo ""
            if ! confirm "Commit anyway?"; then
                echo "Aborted. Unstage with: git reset HEAD $f"
                exit 1
            fi
        fi
    done
 fi
 exit 0
@@ -0,0 +1,26 @@
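 # GLOBAL files: live only in the root project directory, never committed from worktrees.
 # results.tsv is ignored here (untracked). RESEARCH_JOURNAL.md stays tracked; the pre-commit
 # hook asks for confirmation before it is committed off main -- see .githooks/pre-commit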
 results.tsv
 # worktrees are checked out under 5_worktrees/ and kept out of version control
 5_worktrees/
 # python
 __pycache__/
 *.py[cod]
 *.egg-info/
 .venv/
 dist/
 # ML artifacts
 outputs/
 wandb/
 data/
 *.ckpt
 *.pt
 *.safetensors
 # editor
 .vscode/
 .idea/
 *.swp
@@ -0,0 +1,53 @@
 # Problem Statement
 <!-- FILL IN: replace everything below with your actual problem description -->
 <!-- This file is read by agents at the start of every IDEATE session -->
 ## What we're trying to solve
 {FILL_IN: 2-3 sentences describing the research problem}
 ## Why it matters
 {FILL_IN: why does solving this matter? What's the impact?}
 ## Current best approaches
 {FILL_IN: brief description of SOTA and why it's insufficient}
 ## Our approach / hypothesis space
 {FILL_IN: what class of solutions are we exploring?}
 ## Metric
 We optimize: **{metric name}** ({lower/higher} is better)
 Baseline (untrained or current best): **{value}**
 ## Data
 {FILL_IN: dataset, size, splits}
 - Train: {description}
 - Val (eval.py uses this, FROZEN split): {description}
 ## Constraints
 - {FILL_IN: compute budget, e.g. "5 min training on single A100"}
 - {FILL_IN: any other hard constraints}
 ## Key papers
 See `0_docs/papers/` for full summaries. Quick list:
 | Paper | Key claim | Trust signal |
 |-------|-----------|--------------|
 | {FILL_IN} | {claim} | {citations / code / self-report} |
 ## Lessons so far
 {start empty, accumulate as experiments run}
 ## What has NOT worked
 {start empty, accumulate}
@@ -0,0 +1,64 @@
 # {slug}: {one-line title}
 <!-- Copy to: 1_ideas/YYYY-MM-DD_{slug}.md -->
 ## Metadata
 - **Date**: YYYY-MM-DD
 - **Author**: {agent or human name}
 - **Status**: draft | validated | in-progress | done | rejected
 ## Hypothesis
 **Question**: What happens if we {specific change}?
 **Prediction**: We expect {metric} to change by {amount} because {mechanism}.
 **Falsification**: If {metric} does NOT change by at least {threshold}, the hypothesis is wrong.
 ## Approach
 Brief description of the implementation. What files change? What's the core modification?
 ```python
 # pseudocode or key snippet showing the idea
 ```
 ## Why this is novel
 How does this differ from what's already in RESEARCH_JOURNAL.md? What paper/idea is it based on?
 ## Expected implementation effort
 Small (< 1 hour) / Medium (1-4 hours) / Large (> 4 hours)
 ## Pueue label (for queuing)
 ```
 Q: does {change} improve {metric}? H: expect {delta}
 ```
 ---
 ## Subagent Critique
 <!-- Append subagent feedback here before implementing -->
 <!-- Prompt used: "Is this idea sound? What are the failure modes? Is the hypothesis testable?" -->
 **Rating**: 1-5 (1=bad, 5=excellent)
 **Strengths**:
 -
 **Failure modes**:
 -
 **Verdict**: proceed / revise / reject
 ---
 ## Result (fill in after experiment)
 **Commit**: SHORT_SHA
 **Report**: [9_reports/YYYY-MM-DD_{slug}.md](../9_reports/YYYY-MM-DD_{slug}.md)
 **Observed**: {metric} = {value} (expected {expected})
 **Verdict**: confirmed / partially confirmed / refuted / inconclusive
@@ -0,0 +1,71 @@
 # Lab Report: {slug}
 <!-- Copy to: 9_reports/YYYY-MM-DD_{slug}.md -->
 <!-- Self-contained: a reader should understand the experiment without reading anything else -->
 ## Metadata
 | Field | Value |
 |-------|-------|
 | Date | YYYY-MM-DD |
 | Commit | SHORT_SHA |
 | Branch | exp/{slug} |
 | Worktree | 5_worktrees/{slug} |
 | Agent | {name} |
 | Idea doc | [1_ideas/YYYY-MM-DD_{slug}.md](../1_ideas/YYYY-MM-DD_{slug}.md) |
 ## Context
 {1-2 sentences: what problem are we solving and why does this experiment matter}
 ## Hypothesis
 **Question**: What happens if we {change}?
 **Prediction**: {metric} improves by {amount} because {mechanism}.
 ## Experiment
 What was changed vs the baseline:
 ```diff
 # key diff -- just the essential change, not the whole file
 ```
 Baseline: commit {SHORT_SHA} (or "untrained")
 ## Observations
 Measured facts only -- no interpretation here.
 | Run | Metric | Value | Delta vs baseline |
 |-----|--------|-------|-------------------|
 | baseline | {metric} | {value} | -- |
 | this exp | {metric} | {value} | {+/-delta} |
 ```
 # pueue log output or relevant stdout snippet
 ```
 ## Diagnosis
 > Caution: 95% of ML failures are bugs, engineering issues, or misconceptions -- not deep theory.
 > State credences explicitly. Don't overclaim.
 **Most likely explanation** (credence: X%): {explanation}
 **Alternative explanations**:
 - {alternative} (credence: Y%)
 **What would distinguish these**: {test or observation that would separate them}
 ## Limitations
 - {what this experiment doesn't test}
 - {confounds}
 ## Future work
 - {most promising next step given this result}
 - {if confirmed: what's the natural follow-up?}
 - {if refuted: what's the diagnosis and what would we try instead?}
@@ -0,0 +1 @@
 program.md
@@ -0,0 +1 @@
 program.md
@@ -0,0 +1,40 @@
 <!-- APPEND-ONLY: new entries at top. Never edit old entries. -->
 <!-- Tracked in main branch only. Git hook warns and asks for confirmation before committing it from a worktree or non-main branch. -->
 <!-- All worktrees append here: write to the root project copy. -->
 <!--
 Format:
 ## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME
 ### What I did
 [1-3 sentences: what experiment, what change]
 ### Observations (facts)
 - "val_bpb = 2.91 (baseline 3.02, delta -0.11)" -- measured, not interpreted
 ### Inferences (interpretations)
 - "suggests X because Y" -- label as inference, give credence
 ### What surprised me
 [anything unexpected -- good signal for future ideation]
 ### Next questions
 [what to try next based on this result]
 -->
 # Research Journal
 ---
 ## 2026-04-04 | commit: c687a68 | branch: main | agent: human
 ### What I did
 Initialized autoresearch template. No experiments yet.
 ### Observations
 - Repo structure created: program.md (FROZEN), eval.py (FROZEN), justfile, pyproject.toml
 - Git hooks configured to protect FROZEN files and warn on worktree commits of GLOBAL files
 ### Next questions
 - Fill in `{FILL_IN}` markers in program.md, eval.py, 0_docs/problem.md
 - Run `just smoke` to verify setup
 - Start IDEATE loop: read 0_docs/problem.md, search for papers, generate first ideas
@@ -0,0 +1,30 @@
 <!-- APPEND-ONLY: new entries at top. Never edit old entries. -->
 <!-- Written by agents (not humans -- see human_journal.md). -->
 <!-- Format: ## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME -->
 # Agent Journal
 Short-term notes from agents: what was attempted in this session,
 blockers hit, context for the next agent picking this up.
 For lasting research learnings, write to RESEARCH_JOURNAL.md instead.
 ---
 ## Template entry
 ```
 ## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME
 ### Session goal
 [what this agent was asked to do]
 ### What was done
 [brief -- the commit history has the details]
 ### Blockers / open questions
 [anything the next agent needs to know]
 ### Pueue queue state
 [any jobs queued: `pueue status` output or summary]
 ```
@@ -0,0 +1,110 @@
 """
 FROZEN: only edit in meta-mode (META_MODE=1).
 Frozen to prevent p-hacking, seed hacking, and eval-set leakage.
 All experiments must be compared using this exact eval logic.
 Appends one row to results.tsv per run.
 """
 # FROZEN: only edit in meta-mode (META_MODE=1)
 import csv
 import os
 import subprocess
 from dataclasses import asdict, dataclass
 from datetime import datetime
 from pathlib import Path
 import torch
 import tyro
 from loguru import logger
 # {FILL_IN}: import your model
 # from model import Config, build_model
 EVAL_SEED = 42          # FROZEN: never change
 EVAL_SPLIT = "val"      # FROZEN: never change
 # results.tsv sits in the same directory as this file.
 # NOTE(review): when eval.py runs from a copy inside a worktree, this resolves to that
 # worktree's directory, while program.md asks worktrees to append to the root copy --
 # confirm which is intended.
 RESULTS_FILE = Path(__file__).parent / "results.tsv"
@dataclass
class EvalConfig:
    """Options for a single evaluation run, parsed from the command line."""

    # Path to a checkpoint; leave empty to evaluate the untrained baseline.
    checkpoint: str = ""
    # {FILL_IN}: add any eval-time config here (e.g. batch_size, max_seq_len)
    batch_size: int = 32
 def get_git_info() -> dict:
    """Collect provenance for a results row.

    Returns:
        dict with "commit" (short hash of HEAD), "branch" (abbreviated ref of HEAD)
        and "worktree" (name of the current working directory).

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
    """
    def git(*args):
        # text=True gives str output; strip the trailing newline git prints.
        output = subprocess.check_output(["git", *args], text=True)
        return output.strip()
    info = {}
    info["commit"] = git("rev-parse", "--short", "HEAD")
    info["branch"] = git("rev-parse", "--abbrev-ref", "HEAD")
    info["worktree"] = Path.cwd().name
    return info
 def load_eval_data():
    """Return the fixed evaluation dataset (template stub).

    FROZEN: do not change the split or preprocessing between experiments.
    Seeds torch with EVAL_SEED, then raises NotImplementedError until the
    {FILL_IN} section is replaced with real loading code.
    """
    torch.manual_seed(EVAL_SEED)
    # {FILL_IN}: load your dataset
    # e.g. dataset = load_dataset("openwebtext", split=EVAL_SPLIT)
    todo = "{FILL_IN}: implement load_eval_data()"
    raise NotImplementedError(todo)
 def evaluate(model, data) -> dict:
    """Compute the evaluation metrics for `model` on `data` (template stub).

    FROZEN: do not change the metric computation between experiments.
    Puts the model in eval mode and reseeds torch with EVAL_SEED, then raises
    NotImplementedError until the {FILL_IN} section is implemented. The intended
    return value is a dict of metric name -> value.
    """
    model.eval()
    torch.manual_seed(EVAL_SEED)
    # {FILL_IN}: compute your metric(s)
    # e.g. return {"val_bpb": compute_bpb(model, data), "val_loss": compute_loss(model, data)}
    todo = "{FILL_IN}: implement evaluate()"
    raise NotImplementedError(todo)
 def append_results(row: dict):
    """Append one row to results.tsv, writing the header first if the file is new or empty.

    Args:
        row: column name -> value; the key order becomes the column order.

    The header is taken from this row's keys and is only written when the file does
    not exist yet or has zero bytes. Rows appended later are not checked against an
    existing header, so every run must produce the same keys in the same order.
    """
    fieldnames = list(row.keys())
    # An existing but zero-byte file (e.g. created with `touch`) still needs its header;
    # checking existence alone would leave the table without column names forever.
    needs_header = not RESULTS_FILE.exists() or RESULTS_FILE.stat().st_size == 0
    with open(RESULTS_FILE, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter="\t")
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
    logger.info(f"Results appended to {RESULTS_FILE}")
 def main(cfg: EvalConfig):
    """Run one evaluation and append its metrics to results.tsv.

    Logs the config, gathers git provenance, loads the eval data, then raises
    NotImplementedError at the {FILL_IN} model-loading step. The code after the
    raise is the intended continuation and only runs once the raise is replaced
    by code that defines `model`.
    """
    logger.info(f"Eval config: {cfg}")
    provenance = get_git_info()
    data = load_eval_data()
    # {FILL_IN}: build and load your model
    # model_cfg = Config()
    # model = build_model(model_cfg)
    # if cfg.checkpoint:
    #     model.load_state_dict(torch.load(cfg.checkpoint))
    raise NotImplementedError("{FILL_IN}: build and load model")
    metrics = evaluate(model, data)
    timestamp = datetime.now().isoformat(timespec="seconds")
    # Provenance columns first, then the metrics.
    row = {
        "date": timestamp,
        "commit": provenance["commit"],
        "branch": provenance["branch"],
        "worktree": provenance["worktree"],
    }
    row.update(metrics)
    # {FILL_IN}: add config fields you want tracked (e.g. model_cfg.n_layers)
    logger.info(f"Metrics: {metrics}")
    append_results(row)
 if __name__ == "__main__":
    tyro.cli(main)
@@ -0,0 +1,52 @@
 <!-- Only editable by humans -->
 # 2026-04-04 11:31:20
 I'd like to turn this into a template repo for autoresearch
 first read the skills for 
 justfile pueue uv jaxtyping skill and gsd skill, vargdown 
 assume these are available and can be described in one or two lines then references. We assume that https://github.com/wassname/skills is installed to the machine.
 First principles
 - some files a SINGLETONS, they are only in the main branch, and protected by git hook reminders, and or gitignore
 - some are APPEND ONLY like journals
 - some are write only by automatic (eval.py), meta-agents (meta_journal.md, etc), agents (research journal), humans (human journal)
 then read my best practice guide here... oh I can't find it... highlight from memory
 - it's like https://github.com/karpathy/autoresearch/blob/master/program.md or https://github.com/NousResearch/autonovel/blob/master/program.md read these for context and outline (but be aware the 1st is simple, and the 2nd is for writing not ml expeiments like here)
 - but we add a lesson learned and gotcha section
 - we have a FROZEN / META files, with a comment at the top saying "only edit in meta mode" to the top of these files: 
  - eval.py - this is frozen to stop 1) cheating 2) make sure results are compariable. Anything needed to stop p-hacking, seed hacking, changing the eval set, etc. While anything the agent can and should improve and compare should be in non frozen files
  - meta_journal.md -
  - program.md 
    - this is symlinked to AGENTS.md and CLAUDE.md, to make sure it's loded in
    - but it's frozen to stop the agent from changing its own instructions. Unless in metamode! 
    - Normal non-meta agents are encourages to add meta-feedback to the research log.
    - This should give agents a clear pseudo algo code of how to work
      - If the ideas queue if <30, think for hours about wide and deep next steps, based on read at least one main files, and at least one paper, and one web search. Ideas should not by hyperparameter tuning, or sklearn slop. They should be novel, and bold, and you should be able to show an independant subagent, and it says it makes sense. should we in 2_ideas/{ts}_{slug}.md have a log at end for subagent feedback and log. This will get turned into lab reports, and we want to seperate observations from inferences and guesses.
      - If the ideas queue is >30, pick the best one, and do it in a new worktree
      - When ready to test 1) have a subagent do a code review against the idea 2) do a smoke test 3) run eval.py
      - When finished improving write a short self contained lab report into 9_lab_reports/ with 
        - metadata: date, commit, worktree, agent name, 
        - intro: context, hypothesis, 
        - experiment, observations, results
        - diagnosis: be very careful here to not overclaim, 95% of the time in ML it failed due to bug, poor engineering, or a misconception.
        - limitations, future work
      - Then append a short summary to the research journal with learnings.
 - we have GLOBAL files that are 0) only in the root projected directory 1) untracked, gitignored 2) have git hooks to stop them being commited. This so that they don't get commited to worktrees, then overwrite the main one during merge.
  - These are RESEARCH_JOURNAL.md - append only file that agents in all worktrees should write to. All entried should have date, commit, worktree, agent name. First line in a comment saying it's append only, and showing format
  - results.tsv - automatically appended by eval.py, early colums have short values, later columns have long values, and the file is git-ignored.
 - our main is protected by a git hook remining models to work in a worktree, and not commit directly to main (unless human has put them in meta-mode)
 - meta-mode, this is where the agent can take a step back, think about the overall workflow, perhaps do an exist interview. Here we improve the overall process by editing the FROZEN / META files, commiting to main, and appending to the meta-journal. 
 - Agents should work in worktrees. They shouldbe commited and pushed. If it's a better result than any in results.tsv make it a PR for the human to merge.
 Generally we want to show not tell, and sometimes show and tell, but never tell but not show. That means that these files should show throught specific examples
 Thinking: We need expanded files for some of the stages, in particular ideation, brainstroming/diagnoses are hard for current LLM's, and can fill up context of just fall to tunnel vision. 
 Research: To help with ideation we have a 0_docs folder with a problem intro, and papers. Whenever an agent fetches a paper it should write the full markdown summary to 0_docs/papers/{slug}.md, and then add the main insights to the research journal. This way we have a growing set of knowledge that agents can draw on, and we can also track which papers were read when, and what was learned from them. We should marshall available skills, tools, and mcps for searching and fetching and priortise them so we can 1) search 2) download full text to files, and 3) have subagents with good epistemics, the vargdown skills, and knowledge of the problems and what we are looking for write summaries.
@@ -0,0 +1,69 @@
 set shell := ["bash", "-cu"]
 WORKTREES := "5_worktrees"
 IDEAS := "1_ideas"
 REPORTS := "9_reports"
 # List all recipes
 default:
    @just --list
 # --- Sanity checks ---
 # Fast smoke test: shape checks on CPU, short run
 smoke:
    #!/usr/bin/env bash
    # A shebang recipe runs as one script, so bash keeps going after a failed line
    # unless told to stop; without this a crashing train.py still printed "smoke passed".
    set -euo pipefail
    BEARTYPE=1 JAX_PLATFORMS=cpu uv run python train.py --max_steps=1
    echo smoke passed
 # --- Evaluation (FROZEN logic -- see eval.py) ---
 # Run eval and append to results.tsv
 eval *ARGS:
    uv run python eval.py {{ARGS}}
 # Show results table
 results:
    @column -t -s $'\t' results.tsv | head -30
 # --- Training ---
 # Train with default config
 train *ARGS:
    uv run python train.py {{ARGS}}
 # Queue an experiment to pueue with a hypothesis label
 # Usage: just queue LABEL CMD...
 queue LABEL *CMD:
    pueue add --label '{{LABEL}}' -- just {{CMD}}
 # Show pueue queue status
 status:
    pueue status
 # --- Sweeps (pueue-based, one GPU, sequential) ---
 # Sweeps: write a recipe that loops over params and calls `pueue add --label Q -- just eval ...`
 # See justfile-sweeps skill for the full pattern.
 # --- Worktrees ---
 # Create a new experiment worktree
 worktree SLUG:
    git worktree add {{WORKTREES}}/{{SLUG}} -b exp/{{SLUG}}
 # List worktrees
 worktrees:
    git worktree list
 # --- Reports ---
 # List lab reports (see 9_reports/ directory)
 reports:
    ls -t {{REPORTS}} | head -20 || true
 # --- Ideas ---
 # List the 10 newest ideas in the queue, excluding _TEMPLATE (see 1_ideas/ directory)
 ideas:
    ls -t {{IDEAS}} | grep -v _TEMPLATE | head -10 || true
@@ -0,0 +1,47 @@
 <!-- FROZEN: only edit in meta-mode (META_MODE=1) -->
 <!-- APPEND-ONLY: new entries at top -->
 <!-- Format: ## YYYY-MM-DD HH:MM | agent: NAME -->
 <!-- Purpose: meta-level observations about the research process itself -->
 <!-- (not about experiment results -- those go in RESEARCH_JOURNAL.md) -->
 # Meta Journal
 Observations about the research *process*: what's working, what's broken,
 what should change in program.md, eval.py, or the overall workflow.
 ---
 ## Template entry format
 ```
 ## YYYY-MM-DD HH:MM | agent: {name or "human"}
 ### Observation
 [What I noticed about the process -- be specific]
 ### Root cause
 [Why is this happening?]
 ### Action taken
 [What was changed in FROZEN files, and why]
 ### Expected effect
 [How should agent behavior change after this update?]
 ```
 ---
 ## 2026-04-04 | agent: human
 ### Observation
 Template repo initialized. program.md, eval.py written as FROZEN stubs.
 ### Root cause
 Starting from scratch -- no prior experiments.
 ### Action taken
 Created initial file structure. Set up git hooks to protect FROZEN files.
 ### Expected effect
 Agents can now run the IDEATE/IMPLEMENT loop. Fill in {FILL_IN} markers
 before first real experiment.
@@ -0,0 +1,74 @@
 """
 Model definition and config.
 This file is NOT frozen -- agents modify it freely in worktrees.
 Keep eval.py's evaluate() interface stable: it calls build_model(cfg) and model.forward(x).
 """
 from dataclasses import dataclass
 import torch
 import torch.nn as nn
 import tyro
 from jaxtyping import Float
 from loguru import logger
 from torch import Tensor
 # Runtime shape/type checking is opt-in: any non-empty BEARTYPE env var turns on
 # beartype + jaxtyping; otherwise both decorators are replaced by no-ops.
 import os
 if not os.environ.get("BEARTYPE"):
    def typechecker(f):
        """No-op stand-in for beartype: return the function unchanged."""
        return f
    def jaxtyped(**_):
        """No-op stand-in for jaxtyped: ignore the kwargs, return an identity decorator."""
        return lambda f: f
 else:
    from beartype import beartype as typechecker
    from jaxtyping import jaxtyped
@dataclass
class Config:
    """Model and training hyperparameters. Edit freely."""

    # --- model ---
    d_model: int = 256
    n_layers: int = 4
    # {FILL_IN}: add your architecture params

    # --- training ---
    lr: float = 3e-4
    batch_size: int = 32
    max_steps: int = 1000
    seed: int = 42

    # --- data ---
    # {FILL_IN}: add data params
 class Model(nn.Module):
    """Placeholder network. {FILL_IN}: replace with your actual model.

    Keeps the Config on ``self.cfg``; ``forward`` raises NotImplementedError
    until it is implemented.
    """
    def __init__(self, cfg: Config):
        """Store the config; no layers are defined yet."""
        super().__init__()
        self.cfg = cfg
        # {FILL_IN}: define layers
        # e.g. self.embed = nn.Embedding(vocab_size, cfg.d_model)
    @jaxtyped(typechecker=typechecker)
    def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]:
        """Map a float tensor of shape (b, s) to one of shape (b, s, d), per the annotations."""
        # {FILL_IN}: implement forward pass
        todo = "{FILL_IN}: implement forward()"
        raise NotImplementedError(todo)
 def build_model(cfg: Config) -> Model:
    """Seed torch from cfg.seed, construct Model(cfg), log its parameter count and return it."""
    torch.manual_seed(cfg.seed)
    net = Model(cfg)
    n_params = 0
    for param in net.parameters():
        n_params += param.numel()
    logger.info(f"Model: {n_params:,} parameters")
    return net
 if __name__ == "__main__":
    cfg = tyro.cli(Config)
    model = build_model(cfg)
    logger.info(f"Config: {cfg}")
@@ -0,0 +1,205 @@
 <!-- FROZEN: only edit in meta-mode (META_MODE=1) -->
 <!-- Symlinked to AGENTS.md and CLAUDE.md -- always loaded by Claude/Cursor/etc. -->
 # Research Program
 **Project**: {FILL_IN: one sentence describing the research problem}
 **Metric**: {FILL_IN: what we optimize, e.g. val_bpb, accuracy, F1} (lower/higher is better)
 **Metric design requirements** (enforce before first real experiment):
 - Train + eval runs in 5-40 minutes on your GPU
 - Variance across seeds < effect size of a meaningful improvement (run baseline x3, check std)
 - Deterministic given same seed (fixed data order, fixed eval split)
 - If variance is too high: use more eval data, smaller model, or a proxy metric with less noise
 **Hypothesis space**: {FILL_IN: what class of approaches are in scope}
 Read `0_docs/problem.md` for full context.
 ---
 ## File Taxonomy
 | Type | Files | Rule |
 |------|-------|------|
 | FROZEN | `program.md`, `eval.py`, `meta_journal.md` | Never edit without `META_MODE=1` |
 | GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Only commit from main; worktrees append to root copy |
 | APPEND-ONLY | `*_journal.md` | New entries at top, never edit old ones |
 | REGULAR | everything else | Modify freely in your worktree |
 ---
 ## Agent Algorithm
 ```
 YOU ARE AN AGENT. Follow this loop:
 read RESEARCH_JOURNAL.md          # what has been tried
 read 0_docs/problem.md            # what we're solving
 n_ideas = count files in 1_ideas/ (not _TEMPLATE.md)
 if n_ideas < 30:
    ## IDEATE
    - Read at least one file from 0_docs/papers/ (or fetch a new paper)
    - Do at least one web search for recent approaches
    - Fetch papers: use /semantic-search or /exa-search skills
      -> save FULL paper text to 0_docs/papers/{slug}.md (not summaries -- full text)
      -> optionally add a vargdown-style argument map to 0_docs/papers/{slug}_analysis.argdown
      -> add key insight (1-3 observations with sources) to RESEARCH_JOURNAL.md
    - Brainstorm ideas. Quality bar:
        * Novel (not in RESEARCH_JOURNAL.md already)
        * Mechanistically grounded (not just hyperparameter tuning)
        * Not sklearn slop -- must be a real ML research contribution
        * Bold enough that it could be a paper contribution
    - For each idea:
        write 1_ideas/{YYYY-MM-DD}_{slug}.md  (use _TEMPLATE.md format)
        spawn subagent to critique the idea (prompt: "Is this idea sound?
          What are the failure modes? Is the hypothesis testable?")
        append subagent feedback to the idea file
    - Append summary of new ideas + paper insights to RESEARCH_JOURNAL.md
 else:
    ## IMPLEMENT
    pick the best idea from 1_ideas/ based on:
        - subagent rating (see feedback section in idea file)
        - novelty relative to RESEARCH_JOURNAL.md
        - expected impact on metric
        - implementation feasibility
    slug = idea filename slug
    run: git worktree add 5_worktrees/{slug} -b exp/{slug}
    cd 5_worktrees/{slug}
    implement the idea (modify train.py, model.py, etc.)
    do NOT modify: eval.py, program.md, meta_journal.md
    ## TEST
    spawn subagent: "Code review this against the idea doc 1_ideas/{YYYY-MM-DD}_{slug}.md.
      Does the implementation match the hypothesis? Any bugs?"
    run: just smoke                    # fast sanity check
    run: just eval                     # appends to results.tsv
    ## REPORT
    write 9_reports/{YYYY-MM-DD}_{slug}.md  (use _TEMPLATE.md format)
    append short summary to RESEARCH_JOURNAL.md:
        - what was tried, what metric changed, what you learned
        - key observation vs inference distinction
    ## SUBMIT
    git commit -m "exp({slug}): {one-line description}"
    git push origin exp/{slug}
    if result beats best in results.tsv:
        create PR for human to merge
 ## QUEUING EXPERIMENTS (pueue)
 Use pueue to queue experiments for the single GPU -- one at a time, no collision:
    # Queue with a label showing the question and expected resolution
    pueue add --label "Q: does X help? H: expect +0.05 metric" -- just eval --config=path
    # Check queue / status / logs
    pueue status
    pueue log {task_id}       # full stdout
    pueue follow {task_id}    # live tail
 Labels encode the hypothesis being tested. After the run, append observed vs expected
 to RESEARCH_JOURNAL.md. The label shows up in `pueue status` so you can track what
 question each running/queued job is answering.
    # Example: multiple experiments queued with different hypotheses
    pueue add --label "Q: rotary vs sinusoidal? H: rotary saves 0.1 bpb" -- just eval rotary
    pueue add --label "Q: flash-attn memory? H: 2x batch size same speed" -- just eval flash
    pueue add --label "Q: does layer norm placement matter? H: pre-norm better" -- just eval prenorm
 ```
 ---
 ## Coding Conventions
 Fail fast. No defensive programming. No silent fallbacks.
 ```python
 # shape ops: einops for clarity
 from einops import rearrange, reduce
 x = rearrange(x, 'b s h d -> b h s d')
 # einsum for explicit contraction
 out = torch.einsum('b h s d, b h d v -> b h s v', q, k)
 # jaxtyping on function boundaries (docs + smoke-test checking)
 from jaxtyping import Float
 from torch import Tensor
 def encode(x: Float[Tensor, 'b s d']) -> Float[Tensor, 'b s h']:
    ...
 # logging: loguru not print
 from loguru import logger
 logger.info(f"loss={loss:.4f}")
 # dataframes: polars v1
 import polars as pl
 df.group_by("exp").agg(pl.col("metric").mean())
 # config: tyro dataclass
 import tyro
 from dataclasses import dataclass
@dataclass
 class Config:
    lr: float = 3e-4
    # {FILL_IN}
 cfg = tyro.cli(Config)
 ```
 ---
 ## Research Epistemics
 Separate observations from inferences:
 - **Observation**: "val_bpb dropped from 3.2 to 2.9 on run X" (measured fact)
 - **Inference**: "this suggests the attention head is learning positional structure" (interpretation)
 - **Claim from paper**: "authors claim X" -- not "X is true" unless you verified it
 For complex arguments, use `/vargdown` skill: verified argument maps with credences.
 Trust signals: community adoption > papers citing it > open source code > author reputation.
 ---
 ## Available Skills
 Assume installed at `~/.claude/skills/` (from https://github.com/wassname/skills):
 | Skill | Use for |
 |-------|---------|
 | `/semantic-search` | Search arXiv, Semantic Scholar, DBLP, OpenAlex |
 | `/arxiv-fetch` | Download full paper text given arXiv ID/URL |
 | `/exa-search` | Neural web search for recent approaches |
 | `/vargdown` | Verified argument maps with credences for complex reasoning |
 | `/gsd` | Get Shit Done: spec -> implement -> test -> review -> wrap |
 | `/jaxtyping` | Runtime tensor shape/dtype checking |
 | `/justfile` | Project recipes (`just smoke`, `just eval`, `just queue`) |
 | `/ml_debug` | ML convergence, gradient analysis, sweep methodology |
 | `/brainstorm` | Wide + deep ideation without tunnel vision |
 | `/external-review` | Code/plan review via a different model |
 | `pueue` | Queue GPU jobs sequentially; label each with Q/hypothesis |
 Also available: bibtex MCP (search_reference, fetch), wandb MCP (query runs).
 ---
 ## Meta-Mode
 Human sets `META_MODE=1` to enable editing of FROZEN files and committing to main.
 Use meta-mode to:
 - Revise this program.md (agent instructions)
 - Update eval.py (e.g., add new metric columns)
 - Reflect on the overall research process in meta_journal.md
 - Exit-interview style: what worked, what didn't, what would you change?
 To enter: human writes `META_MODE=1` in human_journal.md entry before asking agent.
@@ -0,0 +1,31 @@
 [project]
 name = "{FILL_IN}"  # replace with your project name
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
    "torch>=2.3",
    "einops>=0.8",
    "jaxtyping>=0.2",
    "beartype>=0.18",
    "loguru>=0.7",
    "polars>=1.0",
    "wandb>=0.17",
    "tyro>=0.8",
    "tabulate>=0.9",
    # {FILL_IN}: add project-specific deps
 ]
 [tool.uv]
 dev-dependencies = [
    "pytest>=8",
    "ipykernel",
 ]
 [tool.ruff]
 line-length = 100
 [tool.ruff.lint]
 ignore = [
    "F722",  # jaxtyping shape strings look like invalid syntax to ruff
    "F821",  # jaxtyping forward refs
 ]
@@ -0,0 +1,51 @@
 """
 Training loop. NOT frozen -- agents modify freely in worktrees.
 Convention: eval.py runs the frozen evaluation. This file handles training.
 Keep them separate so eval is never accidentally changed during experimentation.
 """
 import torch
 import tyro
 import wandb
 from loguru import logger
 from model import Config, build_model
 def train(cfg: Config):
    """Training-loop skeleton: seed, open a W&B run, build model and AdamW, loop, close the run.

    Args:
        cfg: hyperparameters; this function reads seed, lr and max_steps, and passes the
            whole config to wandb.init and build_model.

    Data loading, the training step and checkpointing are {FILL_IN} placeholders, so as
    written the loop only logs the step counter every 100 steps.
    """
    torch.manual_seed(cfg.seed)
    # group is set by justfile sweep recipes via WANDB_RUN_GROUP env var
    wandb_settings = {
        "project": "{FILL_IN}",  # replace with your W&B project name
        "config": vars(cfg),
    }
    wandb.init(**wandb_settings)
    model = build_model(cfg)
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr)
    # {FILL_IN}: load training data
    # train_loader = ...
    # batches = iter(train_loader)   # build the iterator once; calling iter() inside
    #                                # the loop would restart the loader on every step
    model.train()
    for step in range(cfg.max_steps):
        # {FILL_IN}: training step
        # batch = next(batches)
        # loss = model(batch)
        # optimizer.zero_grad(); loss.backward(); optimizer.step()
        should_log = step % 100 == 0
        if should_log:
            logger.info(f"step={step}")
            # wandb.log({"loss": loss.item(), "step": step})
    # {FILL_IN}: save checkpoint
    # torch.save(model.state_dict(), f"outputs/{wandb.run.id}.pt")
    wandb.finish()
    logger.info("Training complete")
 if __name__ == "__main__":
    cfg = tyro.cli(Config)
    train(cfg)