mirror of
https://github.com/wassname/autoresearch_template.git
synced 2026-06-27 16:14:27 +08:00
wip
This commit is contained in:
@@ -25,6 +25,13 @@ We optimize: **{metric name}** ({lower/higher} is better)
|
||||
|
||||
Baseline (untrained or current best): **{value}**
|
||||
|
||||
Baseline std across 3 seeds: **{value}** (must be < effect size of a meaningful improvement)
|
||||
|
||||
Train + eval wall time: **{X} minutes** (target: 5-40 min)
|
||||
|
||||
If variance is too high relative to effect size, use more eval data, a smaller proxy task,
|
||||
or a lower-variance metric -- before running any experiments.
|
||||
|
||||
## Data
|
||||
|
||||
{FILL_IN: dataset, size, splits}
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
<!-- APPEND-ONLY: new entries at top. Never edit old entries. -->
|
||||
<!-- Written by agents (not humans -- see human_journal.md). -->
|
||||
<!-- Format: ## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME -->
|
||||
|
||||
# Agent Journal
|
||||
|
||||
Short-term notes from agents: what was attempted in this session,
|
||||
blockers hit, context for the next agent picking this up.
|
||||
|
||||
For lasting research learnings, write to RESEARCH_JOURNAL.md instead.
|
||||
|
||||
---
|
||||
|
||||
## Template entry
|
||||
|
||||
```
|
||||
## YYYY-MM-DD HH:MM | commit: SHORT_SHA | branch: NAME | agent: NAME
|
||||
|
||||
### Session goal
|
||||
[what this agent was asked to do]
|
||||
|
||||
### What was done
|
||||
[brief -- the commit history has the details]
|
||||
|
||||
### Blockers / open questions
|
||||
[anything the next agent needs to know]
|
||||
|
||||
### Pueue queue state
|
||||
[any jobs queued: `pueue status` output or summary]
|
||||
```
|
||||
@@ -21,7 +21,7 @@ import tyro
|
||||
from loguru import logger
|
||||
|
||||
# {FILL_IN}: import your model
|
||||
# from model import Config, build_model
|
||||
# from train import Config, build_model
|
||||
|
||||
|
||||
EVAL_SEED = 42 # FROZEN: never change
|
||||
@@ -85,6 +85,7 @@ def main(cfg: EvalConfig):
|
||||
data = load_eval_data()
|
||||
|
||||
# {FILL_IN}: build and load your model
|
||||
# from train import Config, build_model
|
||||
# model_cfg = Config()
|
||||
# model = build_model(model_cfg)
|
||||
# if cfg.checkpoint:
|
||||
|
||||
@@ -13,7 +13,7 @@ default:
|
||||
# Fast smoke test: shape checks on CPU, short run
|
||||
smoke:
|
||||
#!/usr/bin/env bash
|
||||
BEARTYPE=1 JAX_PLATFORMS=cpu uv run python train.py --max_steps=1
|
||||
BEARTYPE=1 uv run python train.py --max_steps=2
|
||||
echo smoke passed
|
||||
|
||||
# --- Evaluation (FROZEN logic -- see eval.py) ---
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
"""
|
||||
Model definition and config.
|
||||
|
||||
This file is NOT frozen -- agents modify it freely in worktrees.
|
||||
Keep eval.py's evaluate() interface stable: it calls build_model(cfg) and model.forward(x).
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import tyro
|
||||
from jaxtyping import Float
|
||||
from loguru import logger
|
||||
from torch import Tensor
|
||||
|
||||
# beartype checking enabled only when BEARTYPE=1 (smoke tests)
|
||||
import os
|
||||
if os.environ.get("BEARTYPE"):
    # Real runtime checking: only paid for when BEARTYPE is set (smoke tests).
    from beartype import beartype as typechecker
    from jaxtyping import jaxtyped
else:
    # Zero-overhead stand-ins so decorated code runs unchanged without checking.

    def typechecker(f):
        """No-op replacement for ``beartype``: return ``f`` unchanged."""
        return f

    def jaxtyped(**_):
        """No-op replacement for ``jaxtyping.jaxtyped``.

        Accepts (and ignores) any keyword arguments, e.g. ``typechecker=...``,
        and returns an identity decorator.
        """
        return lambda f: f
|
||||
|
||||
|
||||
@dataclass
class Config:
    """Model and training hyperparameters. Edit freely.

    Every field has a default, so ``Config()`` is always constructible; the
    ``__main__`` block of this file fills it from the command line via
    ``tyro.cli(Config)``.
    """

    # model
    d_model: int = 256
    n_layers: int = 4
    # {FILL_IN}: add your architecture params

    # training
    lr: float = 3e-4
    batch_size: int = 32
    max_steps: int = 1000
    seed: int = 42  # passed to torch.manual_seed() in build_model()

    # data
    # {FILL_IN}: add data params
|
||||
|
||||
|
||||
class Model(nn.Module):
    """
    {FILL_IN}: replace with your actual model.

    Placeholder module: ``__init__`` only stores the config and ``forward``
    raises ``NotImplementedError`` until an architecture is filled in.
    """

    def __init__(self, cfg: Config):
        """Store ``cfg`` on the module; no layers are defined yet."""
        super().__init__()
        self.cfg = cfg
        # {FILL_IN}: define layers
        # e.g. self.embed = nn.Embedding(vocab_size, cfg.d_model)

    # Shape annotations are enforced at runtime only when BEARTYPE is set;
    # otherwise jaxtyped/typechecker are the no-op stand-ins defined above.
    @jaxtyped(typechecker=typechecker)
    def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]:
        """Map a ``(b, s)`` float tensor to a ``(b, s, d)`` float tensor.

        Raises:
            NotImplementedError: always, until the forward pass is implemented.
        """
        # {FILL_IN}: implement forward pass
        raise NotImplementedError("{FILL_IN}: implement forward()")
|
||||
|
||||
|
||||
def build_model(cfg: Config) -> Model:
    """Construct a ``Model`` from ``cfg`` and log its parameter count.

    Seeds torch's global RNG with ``cfg.seed`` before construction.

    Args:
        cfg: hyperparameters; ``cfg.seed`` is read here and the whole object
            is passed to ``Model``.

    Returns:
        The freshly constructed model.
    """
    torch.manual_seed(cfg.seed)
    model = Model(cfg)
    n_params = sum(p.numel() for p in model.parameters())
    logger.info(f"Model: {n_params:,} parameters")
    return model
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read Config fields from the command line, build the
    # model (which logs its parameter count), then log the resolved config.
    cfg = tyro.cli(Config)
    model = build_model(cfg)
    logger.info(f"Config: {cfg}")
|
||||
+44
-163
@@ -1,21 +1,15 @@
|
||||
<!-- FROZEN: only edit in meta-mode (META_MODE=1) -->
|
||||
<!-- Symlinked to AGENTS.md and CLAUDE.md -- always loaded by Claude/Cursor/etc. -->
|
||||
<!-- Symlinked to AGENTS.md and CLAUDE.md -- always loaded. -->
|
||||
|
||||
# Research Program
|
||||
|
||||
**Project**: {FILL_IN: one sentence describing the research problem}
|
||||
**Project**: {FILL_IN: one sentence}
|
||||
|
||||
**Metric**: {FILL_IN: what we optimize, e.g. val_bpb, accuracy, F1} (lower/higher is better)
|
||||
**Metric**: {FILL_IN: metric name} ({lower/higher} is better). Target: 5-40 min per run.
|
||||
|
||||
**Metric design requirements** (enforce before first real experiment):
|
||||
- Train + eval runs in 5-40 minutes on your GPU
|
||||
- Variance across seeds < effect size of a meaningful improvement (run baseline x3, check std)
|
||||
- Deterministic given same seed (fixed data order, fixed eval split)
|
||||
- If variance is too high: use more eval data, smaller model, or a proxy metric with less noise
|
||||
**Hypothesis space**: {FILL_IN: what class of approaches}
|
||||
|
||||
**Hypothesis space**: {FILL_IN: what class of approaches are in scope}
|
||||
|
||||
Read `0_docs/problem.md` for full context.
|
||||
See `0_docs/problem.md` for full problem context and baseline numbers.
|
||||
|
||||
---
|
||||
|
||||
@@ -24,182 +18,69 @@ Read `0_docs/problem.md` for full context.
|
||||
| Type | Files | Rule |
|
||||
|------|-------|------|
|
||||
| FROZEN | `program.md`, `eval.py`, `meta_journal.md` | Never edit without `META_MODE=1` |
|
||||
| GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Only commit from main; worktrees append to root copy |
|
||||
| APPEND-ONLY | `*_journal.md` | New entries at top, never edit old ones |
|
||||
| GLOBAL | `RESEARCH_JOURNAL.md`, `results.tsv` | Commit from main only; worktrees append to root copy |
|
||||
| APPEND-ONLY | `RESEARCH_JOURNAL.md`, `human_journal.md`, `meta_journal.md` | New entries at top, never edit old entries |
|
||||
| REGULAR | everything else | Modify freely in your worktree |
|
||||
|
||||
---
|
||||
|
||||
## Agent Algorithm
|
||||
|
||||
```
|
||||
YOU ARE AN AGENT. Follow this loop:
|
||||
```python
|
||||
# MUST read before starting:
|
||||
read RESEARCH_JOURNAL.md # what has been tried, what worked/failed
|
||||
read "Lessons Learned" below # gotchas -- don't repeat past failures
|
||||
|
||||
read RESEARCH_JOURNAL.md # what has been tried
|
||||
read 0_docs/problem.md # what we're solving
|
||||
|
||||
n_ideas = count files in 1_ideas/ (not _TEMPLATE.md)
|
||||
n_ideas = count(1_ideas/*.md) - 1 # minus _TEMPLATE.md
|
||||
|
||||
if n_ideas < 30:
|
||||
## IDEATE
|
||||
- Read at least one file from 0_docs/papers/ (or fetch a new paper)
|
||||
- Do at least one web search for recent approaches
|
||||
- Fetch papers: use /semantic-search or /exa-search skills
|
||||
-> save FULL paper text to 0_docs/papers/{slug}.md (not summaries -- full text)
|
||||
-> optionally add a vargdown-style argument map to 0_docs/papers/{slug}_analysis.argdown
|
||||
-> add key insight (1-3 observations with sources) to RESEARCH_JOURNAL.md
|
||||
- Brainstorm ideas. Quality bar:
|
||||
* Novel (not in RESEARCH_JOURNAL.md already)
|
||||
* Mechanistically grounded (not just hyperparameter tuning)
|
||||
* Not sklearn slop -- must be a real ML research contribution
|
||||
* Bold enough that it could be a paper contribution
|
||||
- For each idea:
|
||||
write 1_ideas/{YYYY-MM-DD}_{slug}.md (use _TEMPLATE.md format)
|
||||
spawn subagent to critique the idea (prompt: "Is this idea sound?
|
||||
What are the failure modes? Is the hypothesis testable?")
|
||||
append subagent feedback to the idea file
|
||||
- Append summary of new ideas + paper insights to RESEARCH_JOURNAL.md
|
||||
# IDEATE
|
||||
# MUST read: 0_docs/ideation_guide.md (epistemics, paper fetching, brainstorm discipline)
|
||||
read 0_docs/problem.md
|
||||
search 1+ papers (semantic-search, exa-search, bibtex MCP)
|
||||
save full paper text to 0_docs/papers/{slug}.md # full text, not summaries
|
||||
for each idea:
|
||||
write 1_ideas/{YYYY-MM-DD}_{slug}.md # use _TEMPLATE.md
|
||||
subagent critique: "Is this sound? Failure modes? Testable?"
|
||||
append subagent feedback to idea file
|
||||
append paper insights + ideas to RESEARCH_JOURNAL.md
|
||||
|
||||
else:
|
||||
## IMPLEMENT
|
||||
pick the best idea from 1_ideas/ based on:
|
||||
- subagent rating (see feedback section in idea file)
|
||||
- novelty relative to RESEARCH_JOURNAL.md
|
||||
- expected impact on metric
|
||||
- implementation feasibility
|
||||
# IMPLEMENT
|
||||
# MUST read: 0_docs/conventions.md (coding style)
|
||||
pick best idea (subagent rating + novelty + expected impact)
|
||||
just worktree {slug} # creates 5_worktrees/{slug} on branch exp/{slug}
|
||||
implement in worktree (edit train.py; do NOT touch FROZEN files: eval.py, program.md, meta_journal.md)
|
||||
|
||||
slug = idea filename slug
|
||||
run: git worktree add 5_worktrees/{slug} -b exp/{slug}
|
||||
cd 5_worktrees/{slug}
|
||||
# TEST
|
||||
subagent code review vs idea doc
|
||||
just smoke
|
||||
just eval # appends row to results.tsv
|
||||
|
||||
implement the idea (modify train.py, model.py, etc.)
|
||||
do NOT modify: eval.py, program.md, meta_journal.md
|
||||
# REPORT
|
||||
write 9_reports/{YYYY-MM-DD}_{slug}.md # use _TEMPLATE.md
|
||||
append to RESEARCH_JOURNAL.md: what tried, delta metric, observation vs inference
|
||||
|
||||
## TEST
|
||||
spawn subagent: "Code review this against the idea doc 1_ideas/{slug}.md.
|
||||
Does the implementation match the hypothesis? Any bugs?"
|
||||
run: just smoke # fast sanity check
|
||||
run: just eval # appends to results.tsv
|
||||
|
||||
## REPORT
|
||||
write 9_reports/{YYYY-MM-DD}_{slug}.md (use _TEMPLATE.md format)
|
||||
append short summary to RESEARCH_JOURNAL.md:
|
||||
- what was tried, what metric changed, what you learned
|
||||
- key observation vs inference distinction
|
||||
|
||||
## SUBMIT
|
||||
git commit -m "exp({slug}): {one-line description}"
|
||||
# SUBMIT
|
||||
git commit -m "exp({slug}): {one line}"
|
||||
git push origin exp/{slug}
|
||||
if result beats best in results.tsv:
|
||||
create PR for human to merge
|
||||
if beats best in results.tsv: open PR for human
|
||||
|
||||
## QUEUING EXPERIMENTS (pueue)
|
||||
|
||||
Use pueue to queue experiments for the single GPU -- one at a time, no collision:
|
||||
|
||||
# Queue with a label showing the question and expected resolution
|
||||
pueue add --label "Q: does X help? H: expect +0.05 metric" -- just eval --config=path
|
||||
|
||||
# Check queue / status / logs
|
||||
pueue status
|
||||
pueue log {task_id} # full stdout
|
||||
pueue follow {task_id} # live tail
|
||||
|
||||
Labels encode the hypothesis being tested. After the run, append observed vs expected
|
||||
to RESEARCH_JOURNAL.md. The label shows up in `pueue status` so you can track what
|
||||
question each running/queued job is answering.
|
||||
|
||||
# Example: multiple experiments queued with different hypotheses
|
||||
pueue add --label "Q: rotary vs sinusoidal? H: rotary saves 0.1 bpb" -- just eval rotary
|
||||
pueue add --label "Q: flash-attn memory? H: 2x batch size same speed" -- just eval flash
|
||||
pueue add --label "Q: does layer norm placement matter? H: pre-norm better" -- just eval prenorm
|
||||
# GPU QUEUE (pueue -- one GPU, no collision)
|
||||
just queue "Q: does X help? H: expect +delta" eval {args}
|
||||
pueue status # shows hypothesis label for each queued/running job
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Coding Conventions
|
||||
## Lessons Learned and Gotchas
|
||||
|
||||
Fail fast. No defensive programming. No silent fallbacks.
|
||||
Format: `YYYY-MM-DD | title | lesson (one line)`
|
||||
|
||||
```python
|
||||
# shape ops: einops for clarity
|
||||
from einops import rearrange, reduce
|
||||
x = rearrange(x, 'b s h d -> b h s d')
|
||||
|
||||
# einsum for explicit contraction
|
||||
out = torch.einsum('b h s d, b h d v -> b h s v', q, k)
|
||||
|
||||
# jaxtyping on function boundaries (docs + smoke-test checking)
|
||||
from jaxtyping import Float
|
||||
from torch import Tensor
|
||||
def encode(x: Float[Tensor, 'b s d']) -> Float[Tensor, 'b s h']:
|
||||
...
|
||||
|
||||
# logging: loguru not print
|
||||
from loguru import logger
|
||||
logger.info(f"loss={loss:.4f}")
|
||||
|
||||
# dataframes: polars v1
|
||||
import polars as pl
|
||||
df.group_by("exp").agg(pl.col("metric").mean())
|
||||
|
||||
# config: tyro dataclass
|
||||
import tyro
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
lr: float = 3e-4
|
||||
# {FILL_IN}
|
||||
|
||||
cfg = tyro.cli(Config)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Research Epistemics
|
||||
|
||||
Separate observations from inferences:
|
||||
- **Observation**: "val_bpb dropped from 3.2 to 2.9 on run X" (measured fact)
|
||||
- **Inference**: "this suggests the attention head is learning positional structure" (interpretation)
|
||||
- **Claim from paper**: "authors claim X" -- not "X is true" unless you verified it
|
||||
|
||||
For complex arguments, use `/vargdown` skill: verified argument maps with credences.
|
||||
|
||||
Trust signals: community adoption > papers citing it > open source code > author reputation.
|
||||
|
||||
---
|
||||
|
||||
## Available Skills
|
||||
|
||||
Assume installed at `~/.claude/skills/` (from https://github.com/wassname/skills):
|
||||
|
||||
| Skill | Use for |
|
||||
|-------|---------|
|
||||
| `/semantic-search` | Search arXiv, Semantic Scholar, DBLP, OpenAlex |
|
||||
| `/arxiv-fetch` | Download full paper text given arXiv ID/URL |
|
||||
| `/exa-search` | Neural web search for recent approaches |
|
||||
| `/vargdown` | Verified argument maps with credences for complex reasoning |
|
||||
| `/gsd` | Get Shit Done: spec -> implement -> test -> review -> wrap |
|
||||
| `/jaxtyping` | Runtime tensor shape/dtype checking |
|
||||
| `/justfile` | Project recipes (`just smoke`, `just eval`, `just queue`) |
|
||||
| `/ml_debug` | ML convergence, gradient analysis, sweep methodology |
|
||||
| `/brainstorm` | Wide + deep ideation without tunnel vision |
|
||||
| `/external-review` | Code/plan review via a different model |
|
||||
| `pueue` | Queue GPU jobs sequentially; label each with Q/hypothesis |
|
||||
|
||||
Also available: bibtex MCP (search_reference, fetch), wandb MCP (query runs).
|
||||
<!-- {FILL_IN: add entries as experiments run} -->
|
||||
|
||||
---
|
||||
|
||||
## Meta-Mode
|
||||
|
||||
Human sets `META_MODE=1` to enable editing of FROZEN files and committing to main.
|
||||
|
||||
Use meta-mode to:
|
||||
- Revise this program.md (agent instructions)
|
||||
- Update eval.py (e.g., add new metric columns)
|
||||
- Reflect on the overall research process in meta_journal.md
|
||||
- Exit-interview style: what worked, what didn't, what would you change?
|
||||
|
||||
To enter: human writes `META_MODE=1` in human_journal.md entry before asking agent.
|
||||
Human writes `META_MODE=1` in `human_journal.md` to unlock editing FROZEN files and committing to main. Use for: revising this program.md, updating eval.py, exit-interview style process reflection in meta_journal.md.
|
||||
|
||||
@@ -1,25 +1,90 @@
|
||||
"""
|
||||
Training loop. NOT frozen -- agents modify freely in worktrees.
|
||||
Model + training. NOT frozen -- agents modify freely in worktrees.
|
||||
|
||||
Convention: eval.py runs the frozen evaluation. This file handles training.
|
||||
Keep them separate so eval is never accidentally changed during experimentation.
|
||||
eval.py is frozen (anti-p-hacking). This file is not.
|
||||
Everything agents change lives here: architecture, optimizer, loss, data pipeline.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import tyro
|
||||
import wandb
|
||||
from einops import rearrange # noqa: F401 -- available for use
|
||||
from jaxtyping import Float
|
||||
from loguru import logger
|
||||
from torch import Tensor
|
||||
|
||||
from model import Config, build_model
|
||||
# beartype checking enabled only when BEARTYPE is set in the environment
if os.environ.get("BEARTYPE"):
    from beartype import beartype as typechecker
    from jaxtyping import jaxtyped
else:
    # Zero-overhead stand-ins so decorated code runs unchanged without checking.

    def typechecker(f):
        """No-op replacement for ``beartype``: return ``f`` unchanged."""
        return f

    def jaxtyped(**_):
        """No-op replacement for ``jaxtyping.jaxtyped``.

        Accepts (and ignores) any keyword arguments, e.g. ``typechecker=...``,
        and returns an identity decorator.
        """
        return lambda f: f
|
||||
|
||||
|
||||
# --- Config -------------------------------------------------------------------
|
||||
|
||||
@dataclass
class Config:
    """All hyperparameters. Edit freely. eval.py imports this.

    Every field has a default, so ``Config()`` is always constructible.
    """

    # model
    d_model: int = 256
    n_layers: int = 4
    # {FILL_IN}: add architecture params

    # training
    lr: float = 3e-4
    batch_size: int = 32
    max_steps: int = 1000
    seed: int = 42  # passed to torch.manual_seed() in build_model() and train()

    # data
    # {FILL_IN}: add data params

    # logging
    wandb_project: str = "{FILL_IN}"  # forwarded to wandb.init(project=...) in train()
|
||||
|
||||
|
||||
# --- Model --------------------------------------------------------------------
|
||||
|
||||
class Model(nn.Module):
    """
    {FILL_IN}: replace with your architecture.
    Agents: this is the main thing you modify between experiments.

    Placeholder module: ``__init__`` only stores the config and ``forward``
    raises ``NotImplementedError`` until an architecture is filled in.
    """

    def __init__(self, cfg: Config):
        """Store ``cfg`` on the module; no layers are defined yet."""
        super().__init__()
        self.cfg = cfg
        # {FILL_IN}: define layers

    # Shape annotations are enforced at runtime only when BEARTYPE is set;
    # otherwise jaxtyped/typechecker are the no-op stand-ins defined above.
    @jaxtyped(typechecker=typechecker)
    def forward(self, x: Float[Tensor, "b s"]) -> Float[Tensor, "b s d"]:
        """Map a ``(b, s)`` float tensor to a ``(b, s, d)`` float tensor.

        Raises:
            NotImplementedError: always, until the forward pass is implemented.
        """
        # {FILL_IN}: implement forward pass
        raise NotImplementedError("{FILL_IN}: implement forward()")
|
||||
|
||||
|
||||
def build_model(cfg: Config) -> Model:
    """Construct a ``Model`` from ``cfg`` and log its parameter count.

    Seeds torch's global RNG with ``cfg.seed`` before construction.

    Args:
        cfg: hyperparameters; ``cfg.seed`` is read here and the whole object
            is passed to ``Model``.

    Returns:
        The freshly constructed model.
    """
    torch.manual_seed(cfg.seed)
    model = Model(cfg)
    n_params = sum(p.numel() for p in model.parameters())
    logger.info(f"Model: {n_params:,} parameters")
    return model
|
||||
|
||||
|
||||
# --- Training -----------------------------------------------------------------
|
||||
|
||||
def train(cfg: Config):
|
||||
torch.manual_seed(cfg.seed)
|
||||
|
||||
wandb.init(
|
||||
project="{FILL_IN}", # replace with your W&B project name
|
||||
project=cfg.wandb_project,
|
||||
config=vars(cfg),
|
||||
# group is set by justfile sweep recipes via WANDB_RUN_GROUP env var
|
||||
# WANDB_RUN_GROUP env var set by justfile sweep recipes
|
||||
)
|
||||
|
||||
model = build_model(cfg)
|
||||
|
||||
Reference in New Issue
Block a user