From c42bea207a2d8f871b6eab72ab96ea403cdb57ff Mon Sep 17 00:00:00 2001 From: wassname Date: Fri, 15 May 2026 06:25:44 +0000 Subject: [PATCH] minicache 0.2.0: simplify to @cached(exclude=...) as primary API Removes the required positional `kind` and `cachedir` from the decorator, drops the `state_fn` concept entirely. New defaults: kind from fn.__name__, cachedir from DEFAULT_CACHEDIR = Path("cache"). Primary usage is now just: @cached(exclude=["model", "tok"]) def run_eval(model, tok, *, model_id, name, batch_size): ... Explicit-key form unchanged: cache_call("eval", "qwen-27b|nf4|classic|bs=16", lambda: ...) Version bumped to 0.2.0. Co-Authored-By: Claude Sonnet 4.6 --- src/minicache/__init__.py | 71 ++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/src/minicache/__init__.py b/src/minicache/__init__.py index 181df8b..3e0374c 100644 --- a/src/minicache/__init__.py +++ b/src/minicache/__init__.py @@ -1,38 +1,38 @@ """minicache — tiny disk cache for ML / research code. Wraps function calls and stores returns on disk (gzip + cloudpickle). Solves -the four pain points that stdlib `functools.lru_cache + pickle` and existing -function-cache libraries (anycache, cachier) hit on ML code: +four pain points that `functools.lru_cache + pickle` and existing function-cache +libs (anycache, cachier) hit on ML code: - **Loaded models can't be hashed** → arg blacklist (`exclude=["model", "tok"]`). Excluded args pass through to the function but never enter the cache key. - **Tensors / pandas / closures break stdlib pickle** → cloudpickle backend. - **Pickle files grow large** → gzip on disk (~3× smaller, free). -- **"Function source changed → invalidate" causes false invalidations on - reformat** → caller bumps an explicit `state` string when behavior actually - changes. No AST hashing magic. +- **No source-AST hashing** → false invalidation on reformat is the worst kind + of bug. Caller passes a `state` kwarg (or anything else) when behavior + changes. No magic. -## Quick use +## Usage from minicache import cached, cache_call - # 1. Decorator: hashes (state, included args). Excludes drop out of key. - @cached("eval", cachedir="out/cache", - state_fn=lambda *, model_id, **_: f"{model_id}|nf4|r00+r02", - exclude=["model", "tok"]) + # Decorator. Default cachedir = ./cache, default kind = fn.__name__. + @cached(exclude=["model", "tok"]) def run_eval(model, tok, *, model_id, name, batch_size): - return tinymfv_evaluate(model, tok, name=name, batch_size=batch_size) + return expensive_eval(model, tok, name=name, batch_size=batch_size) report = run_eval(model, tok, model_id="qwen-27b", name="classic", batch_size=16) + # second call with same args (any model/tok instance) → cache HIT - # 2. Explicit key: no introspection, you compose the key - key = "qwen-27b|nf4|r00+r02|eval|classic|bs=16" - report = cache_call("eval", key, lambda: tinymfv_evaluate(model, tok, ...), - cachedir="out/cache") + # Explicit-key form. Caller composes the key (no introspection). + # Useful when args alone don't determine the cache identity (e.g. you + # also want to pin to disk state walked at call time). + report = cache_call("eval", "qwen-27b|nf4|r00+r02|classic|bs=16", + lambda: expensive_eval(model, tok, name="classic")) -See also +See also: - anycache https://github.com/c0fec0de/anycache -- cachier https://github.com/python-cachier/cachier#working-with-unhashable-arguments +- cachier https://github.com/python-cachier/cachier """ from __future__ import annotations @@ -46,9 +46,10 @@ from typing import Any, Callable, Iterable import cloudpickle -__version__ = "0.1.0" -__all__ = ["cache_call", "cached"] +__version__ = "0.2.0" +__all__ = ["cache_call", "cached", "DEFAULT_CACHEDIR"] +DEFAULT_CACHEDIR = Path("cache") _EXT = ".pkl.gz" @@ -57,7 +58,7 @@ def _hash(payload: str) -> str: def cache_call(kind: str, key: str, fn: Callable[[], Any], - cachedir: Path | str) -> Any: + cachedir: Path | str = DEFAULT_CACHEDIR) -> Any: """Run-or-load. Cache file = `//.pkl.gz`. Hit: gunzip + cloudpickle.load → return. @@ -76,39 +77,34 @@ def cache_call(kind: str, key: str, fn: Callable[[], Any], def cached( - kind: str, *, - cachedir: Path | str, - state_fn: Callable[..., str] | None = None, exclude: Iterable[str] = (), + cachedir: Path | str = DEFAULT_CACHEDIR, + kind: str | None = None, ): - """Decorator. Cache key = sha256(kind | state_fn(**args) | included_args) - where included = signature(fn) \\ exclude. + """Decorator. Cache key = sha256(json(included args)). - `state_fn` lets you inject context that isn't a function arg (e.g. a model - fingerprint walked from disk). It receives ALL bound args by name; pull - out what you need with **kwargs unpacking. + `exclude` drops args from the key — use for unhashable / large / + instance-specific values (loaded models, GPU tensors, open files). They + still pass through to the function unchanged. - Args in `exclude` pass through to fn unchanged but never enter the key — - use this for unhashable / large / instance-specific things (loaded models, - open files, GPU tensors). + `kind` is the cache subdir (default = fn.__name__). """ excluded = set(exclude) def decorator(fn: Callable) -> Callable: sig = inspect.signature(fn) keep = [n for n in sig.parameters if n not in excluded] + sub = kind or fn.__name__ def wrapper(*args, **kwargs): bound = sig.bind(*args, **kwargs) bound.apply_defaults() included = {n: bound.arguments[n] for n in keep if n in bound.arguments} - state = state_fn(**bound.arguments) if state_fn else "" - payload = json.dumps({"k": kind, "s": state, "a": included}, - sort_keys=True, default=str) + payload = json.dumps(included, sort_keys=True, default=str) key = _hash(payload) - return cache_call(kind, key, lambda: fn(*args, **kwargs), cachedir) + return cache_call(sub, key, lambda: fn(*args, **kwargs), cachedir) wrapper.__wrapped__ = fn return wrapper @@ -116,17 +112,16 @@ def cached( if __name__ == "__main__": - # Smoke: hits cache on second call. import tempfile import time with tempfile.TemporaryDirectory() as td: - @cached("demo", cachedir=td, exclude=["expensive_obj"]) + @cached(exclude=["expensive_obj"], cachedir=td) def f(x, expensive_obj=None, y=10): time.sleep(0.5) return x + y t0 = time.time(); assert f(1, expensive_obj=object()) == 11 - t1 = time.time(); assert f(1, expensive_obj=object()) == 11 # cache HIT + t1 = time.time(); assert f(1, expensive_obj=object()) == 11 t2 = time.time() print(f"miss: {t1-t0:.3f}s, hit: {t2-t1:.4f}s (different obj instance, same key)")