From 3e86c6a1a88f12859e35fa7e7f07cf4efb5a5335 Mon Sep 17 00:00:00 2001 From: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> Date: Thu, 11 Sep 2025 08:33:30 +0200 Subject: [PATCH 1/4] fix: hilarious typo Signed-off-by: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 32c90b2..bb61a35 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ f # num_rows: 20 # }) -# it has this sgaoe +# it has this shape ds_a = Dataset.from_parquet(str(f)).with_format("torch") ds_a[0:2]['hidden_states'].shape # [batch, layers, tokens, hidden_states] # torch.Size([2, 25, 1, 896]) From e7aa9e00b7078c54c9e515feb42ecf702a2f6db0 Mon Sep 17 00:00:00 2001 From: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> Date: Thu, 11 Sep 2025 08:33:45 +0200 Subject: [PATCH 2/4] doc: mention the nbs better Signed-off-by: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bb61a35..2dbd06f 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,12 @@ Install using pip install git+https://github.com/wassname/activation_store.git ``` -Example + +## Examples + +Full examples can be found in the [nbs folder](./nbs). + + ```py layer_groups = {'mlp.down_proj': [ 'model.layers.21.mlp.down_proj', @@ -47,9 +52,6 @@ git clone https//github.com/wassname/activation_store.git uv sync ``` -see examples in `nbs` folder. 
- - ## TODO: - [x] test compression: it's not worth the [complexity](https://github.com/EleutherAI/elk/blob/84e99a36a5050881d85f1510a2486ce46ac1f942/elk/extraction/extraction.py#L382) From e44e4fbd1abb5dd199b3c1d660e063d0669d558a Mon Sep 17 00:00:00 2001 From: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> Date: Thu, 11 Sep 2025 08:42:54 +0200 Subject: [PATCH 3/4] doc: use keyword arguments in the example Signed-off-by: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2dbd06f..c0e10b0 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ layer_groups = {'mlp.down_proj': [ 'model.layers.23.mlp.up_proj']} # collect activations into a huggingface dataset -f = activation_store(ds, model, layers=layer_groups) +f = activation_store(loader=ds, model=model, layers=layer_groups) f # > Generating train split: 0 examples [00:00, ? examples/s] # Dataset({ From a1dd6d3d8527817916a6e181292dfa0d5cb99f6e Mon Sep 17 00:00:00 2001 From: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> Date: Thu, 11 Sep 2025 08:43:42 +0200 Subject: [PATCH 4/4] wording Signed-off-by: thiswillbeyourgithub <26625900+thiswillbeyourgithub@users.noreply.github.com> --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c0e10b0..19dac9c 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # activation_store -Utility library to persistently store transformer activations on disk. +Utility library to persistently store transformer activations on disk as a huggingface dataset. -These activations can be quite large (layers x batch x sequence x hidden_size), so generating them to disk helps avoid out of memory errors. +As these activations can be quite large (layers x batch x sequence x hidden_size), generating them to disk helps avoid out of memory errors. 
-Install using +Install using ``` pip install git+https://github.com/wassname/activation_store.git ```