mirror of
https://github.com/wassname/minicache.git
synced 2026-06-27 15:15:59 +08:00
minicache 0.2.0: simplify to @cached(exclude=...) as primary API
Removes the required positional `kind` and `cachedir` from the decorator,
drops the `state_fn` concept entirely. New defaults: kind from fn.__name__,
cachedir from DEFAULT_CACHEDIR = Path("cache").
Primary usage is now just:
@cached(exclude=["model", "tok"])
def run_eval(model, tok, *, model_id, name, batch_size):
...
Explicit-key form unchanged:
cache_call("eval", "qwen-27b|nf4|classic|bs=16", lambda: ...)
Version bumped to 0.2.0.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+33
-38
@@ -1,38 +1,38 @@
|
|||||||
"""minicache — tiny disk cache for ML / research code.
|
"""minicache — tiny disk cache for ML / research code.
|
||||||
|
|
||||||
Wraps function calls and stores returns on disk (gzip + cloudpickle). Solves
|
Wraps function calls and stores returns on disk (gzip + cloudpickle). Solves
|
||||||
the four pain points that stdlib `functools.lru_cache + pickle` and existing
|
four pain points that `functools.lru_cache + pickle` and existing function-cache
|
||||||
function-cache libraries (anycache, cachier) hit on ML code:
|
libs (anycache, cachier) hit on ML code:
|
||||||
|
|
||||||
- **Loaded models can't be hashed** → arg blacklist (`exclude=["model", "tok"]`).
|
- **Loaded models can't be hashed** → arg blacklist (`exclude=["model", "tok"]`).
|
||||||
Excluded args pass through to the function but never enter the cache key.
|
Excluded args pass through to the function but never enter the cache key.
|
||||||
- **Tensors / pandas / closures break stdlib pickle** → cloudpickle backend.
|
- **Tensors / pandas / closures break stdlib pickle** → cloudpickle backend.
|
||||||
- **Pickle files grow large** → gzip on disk (~3× smaller, free).
|
- **Pickle files grow large** → gzip on disk (~3× smaller, free).
|
||||||
- **"Function source changed → invalidate" causes false invalidations on
|
- **No source-AST hashing** → false invalidation on reformat is the worst kind
|
||||||
reformat** → caller bumps an explicit `state` string when behavior actually
|
of bug. Caller passes a `state` kwarg (or anything else) when behavior
|
||||||
changes. No AST hashing magic.
|
changes. No magic.
|
||||||
|
|
||||||
## Quick use
|
## Usage
|
||||||
|
|
||||||
from minicache import cached, cache_call
|
from minicache import cached, cache_call
|
||||||
|
|
||||||
# 1. Decorator: hashes (state, included args). Excludes drop out of key.
|
# Decorator. Default cachedir = ./cache, default kind = fn.__name__.
|
||||||
@cached("eval", cachedir="out/cache",
|
@cached(exclude=["model", "tok"])
|
||||||
state_fn=lambda *, model_id, **_: f"{model_id}|nf4|r00+r02",
|
|
||||||
exclude=["model", "tok"])
|
|
||||||
def run_eval(model, tok, *, model_id, name, batch_size):
|
def run_eval(model, tok, *, model_id, name, batch_size):
|
||||||
return tinymfv_evaluate(model, tok, name=name, batch_size=batch_size)
|
return expensive_eval(model, tok, name=name, batch_size=batch_size)
|
||||||
|
|
||||||
report = run_eval(model, tok, model_id="qwen-27b", name="classic", batch_size=16)
|
report = run_eval(model, tok, model_id="qwen-27b", name="classic", batch_size=16)
|
||||||
|
# second call with same args (any model/tok instance) → cache HIT
|
||||||
|
|
||||||
# 2. Explicit key: no introspection, you compose the key
|
# Explicit-key form. Caller composes the key (no introspection).
|
||||||
key = "qwen-27b|nf4|r00+r02|eval|classic|bs=16"
|
# Useful when args alone don't determine the cache identity (e.g. you
|
||||||
report = cache_call("eval", key, lambda: tinymfv_evaluate(model, tok, ...),
|
# also want to pin to disk state walked at call time).
|
||||||
cachedir="out/cache")
|
report = cache_call("eval", "qwen-27b|nf4|r00+r02|classic|bs=16",
|
||||||
|
lambda: expensive_eval(model, tok, name="classic"))
|
||||||
|
|
||||||
See also
|
See also:
|
||||||
- anycache https://github.com/c0fec0de/anycache
|
- anycache https://github.com/c0fec0de/anycache
|
||||||
- cachier https://github.com/python-cachier/cachier#working-with-unhashable-arguments
|
- cachier https://github.com/python-cachier/cachier
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@@ -46,9 +46,10 @@ from typing import Any, Callable, Iterable
|
|||||||
import cloudpickle
|
import cloudpickle
|
||||||
|
|
||||||
|
|
||||||
__version__ = "0.1.0"
|
__version__ = "0.2.0"
|
||||||
__all__ = ["cache_call", "cached"]
|
__all__ = ["cache_call", "cached", "DEFAULT_CACHEDIR"]
|
||||||
|
|
||||||
|
DEFAULT_CACHEDIR = Path("cache")
|
||||||
_EXT = ".pkl.gz"
|
_EXT = ".pkl.gz"
|
||||||
|
|
||||||
|
|
||||||
@@ -57,7 +58,7 @@ def _hash(payload: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def cache_call(kind: str, key: str, fn: Callable[[], Any],
|
def cache_call(kind: str, key: str, fn: Callable[[], Any],
|
||||||
cachedir: Path | str) -> Any:
|
cachedir: Path | str = DEFAULT_CACHEDIR) -> Any:
|
||||||
"""Run-or-load. Cache file = `<cachedir>/<kind>/<key>.pkl.gz`.
|
"""Run-or-load. Cache file = `<cachedir>/<kind>/<key>.pkl.gz`.
|
||||||
|
|
||||||
Hit: gunzip + cloudpickle.load → return.
|
Hit: gunzip + cloudpickle.load → return.
|
||||||
@@ -76,39 +77,34 @@ def cache_call(kind: str, key: str, fn: Callable[[], Any],
|
|||||||
|
|
||||||
|
|
||||||
def cached(
|
def cached(
|
||||||
kind: str,
|
|
||||||
*,
|
*,
|
||||||
cachedir: Path | str,
|
|
||||||
state_fn: Callable[..., str] | None = None,
|
|
||||||
exclude: Iterable[str] = (),
|
exclude: Iterable[str] = (),
|
||||||
|
cachedir: Path | str = DEFAULT_CACHEDIR,
|
||||||
|
kind: str | None = None,
|
||||||
):
|
):
|
||||||
"""Decorator. Cache key = sha256(kind | state_fn(**args) | included_args)
|
"""Decorator. Cache key = sha256(json(included args)).
|
||||||
where included = signature(fn) \\ exclude.
|
|
||||||
|
|
||||||
`state_fn` lets you inject context that isn't a function arg (e.g. a model
|
`exclude` drops args from the key — use for unhashable / large /
|
||||||
fingerprint walked from disk). It receives ALL bound args by name; pull
|
instance-specific values (loaded models, GPU tensors, open files). They
|
||||||
out what you need with **kwargs unpacking.
|
still pass through to the function unchanged.
|
||||||
|
|
||||||
Args in `exclude` pass through to fn unchanged but never enter the key —
|
`kind` is the cache subdir (default = fn.__name__).
|
||||||
use this for unhashable / large / instance-specific things (loaded models,
|
|
||||||
open files, GPU tensors).
|
|
||||||
"""
|
"""
|
||||||
excluded = set(exclude)
|
excluded = set(exclude)
|
||||||
|
|
||||||
def decorator(fn: Callable) -> Callable:
|
def decorator(fn: Callable) -> Callable:
|
||||||
sig = inspect.signature(fn)
|
sig = inspect.signature(fn)
|
||||||
keep = [n for n in sig.parameters if n not in excluded]
|
keep = [n for n in sig.parameters if n not in excluded]
|
||||||
|
sub = kind or fn.__name__
|
||||||
|
|
||||||
def wrapper(*args, **kwargs):
|
def wrapper(*args, **kwargs):
|
||||||
bound = sig.bind(*args, **kwargs)
|
bound = sig.bind(*args, **kwargs)
|
||||||
bound.apply_defaults()
|
bound.apply_defaults()
|
||||||
included = {n: bound.arguments[n] for n in keep
|
included = {n: bound.arguments[n] for n in keep
|
||||||
if n in bound.arguments}
|
if n in bound.arguments}
|
||||||
state = state_fn(**bound.arguments) if state_fn else ""
|
payload = json.dumps(included, sort_keys=True, default=str)
|
||||||
payload = json.dumps({"k": kind, "s": state, "a": included},
|
|
||||||
sort_keys=True, default=str)
|
|
||||||
key = _hash(payload)
|
key = _hash(payload)
|
||||||
return cache_call(kind, key, lambda: fn(*args, **kwargs), cachedir)
|
return cache_call(sub, key, lambda: fn(*args, **kwargs), cachedir)
|
||||||
|
|
||||||
wrapper.__wrapped__ = fn
|
wrapper.__wrapped__ = fn
|
||||||
return wrapper
|
return wrapper
|
||||||
@@ -116,17 +112,16 @@ def cached(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Smoke: hits cache on second call.
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
with tempfile.TemporaryDirectory() as td:
|
with tempfile.TemporaryDirectory() as td:
|
||||||
@cached("demo", cachedir=td, exclude=["expensive_obj"])
|
@cached(exclude=["expensive_obj"], cachedir=td)
|
||||||
def f(x, expensive_obj=None, y=10):
|
def f(x, expensive_obj=None, y=10):
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
return x + y
|
return x + y
|
||||||
|
|
||||||
t0 = time.time(); assert f(1, expensive_obj=object()) == 11
|
t0 = time.time(); assert f(1, expensive_obj=object()) == 11
|
||||||
t1 = time.time(); assert f(1, expensive_obj=object()) == 11 # cache HIT
|
t1 = time.time(); assert f(1, expensive_obj=object()) == 11
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
print(f"miss: {t1-t0:.3f}s, hit: {t2-t1:.4f}s (different obj instance, same key)")
|
print(f"miss: {t1-t0:.3f}s, hit: {t2-t1:.4f}s (different obj instance, same key)")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user