Files
activation_store/nbs/example.ipynb
T
wassname bcd47da026 tidy
2025-03-12 13:55:47 +08:00

318 lines
8.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"\n",
"from activation_store.collect import activation_store\n",
"\n",
"import torch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = \"Qwen/Qwen2.5-0.5B-Instruct\"\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" torch_dtype=torch.bfloat16,\n",
" device_map=\"auto\",\n",
" attn_implementation=\"eager\", # flex_attention flash_attention_2 sdpa eager\n",
")\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"if tokenizer.pad_token_id is None:\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
"tokenizer.paddding_side = \"left\"\n",
"tokenizer.truncation_side = \"left\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data and tokenize"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['attention_mask', 'input_ids'],\n",
" num_rows: 20\n",
"})"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"N = 20\n",
"max_length = 256\n",
"\n",
"imdb = load_dataset('wassname/imdb_dpo', split=f'test[:{N}]', keep_in_memory=False)\n",
"\n",
"\n",
"def proc(row):\n",
" messages = [\n",
" {\"role\":\"user\", \"content\": row['prompt'] },\n",
" {\"role\":\"assistant\", \"content\": row['chosen'] }\n",
" ]\n",
" return tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=False, return_dict=True, max_length=max_length)\n",
"\n",
"ds2 = imdb.map(proc).with_format(\"torch\")\n",
"new_cols = set(ds2.column_names) - set(imdb.column_names)\n",
"ds2 = ds2.select_columns(new_cols)\n",
"ds2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data loader"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<torch.utils.data.dataloader.DataLoader object at 0x7089f82ccb30>\n"
]
}
],
"source": [
"from torch.utils.data import DataLoader\n",
"from transformers.data import DataCollatorForLanguageModeling\n",
"collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n",
"ds = DataLoader(ds2, batch_size=4, num_workers=0, collate_fn=collate_fn)\n",
"print(ds)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collect activations"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['model.layers.0.mlp.down_proj',\n",
" 'model.layers.1.mlp.down_proj',\n",
" 'model.layers.2.mlp.down_proj',\n",
" 'model.layers.3.mlp.down_proj',\n",
" 'model.layers.4.mlp.down_proj',\n",
" 'model.layers.5.mlp.down_proj',\n",
" 'model.layers.6.mlp.down_proj',\n",
" 'model.layers.7.mlp.down_proj',\n",
" 'model.layers.8.mlp.down_proj',\n",
" 'model.layers.9.mlp.down_proj',\n",
" 'model.layers.10.mlp.down_proj',\n",
" 'model.layers.11.mlp.down_proj',\n",
" 'model.layers.12.mlp.down_proj',\n",
" 'model.layers.13.mlp.down_proj',\n",
" 'model.layers.14.mlp.down_proj',\n",
" 'model.layers.15.mlp.down_proj',\n",
" 'model.layers.16.mlp.down_proj',\n",
" 'model.layers.17.mlp.down_proj',\n",
" 'model.layers.18.mlp.down_proj',\n",
" 'model.layers.19.mlp.down_proj',\n",
" 'model.layers.20.mlp.down_proj',\n",
" 'model.layers.21.mlp.down_proj',\n",
" 'model.layers.22.mlp.down_proj',\n",
" 'model.layers.23.mlp.down_proj']"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# choose layers to cache\n",
"layers = [k for k,v in model.named_modules() if k.endswith('mlp.down_proj')]\n",
"layers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-02-16 09:36:37.315\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mactivation_store.collect\u001b[0m:\u001b[36mactivation_store\u001b[0m:\u001b[36m77\u001b[0m - \u001b[1mcreating dataset /media/wassname/SGIronWolf/projects5/elk/cache_transformer_acts/outputs/.ds/ds__fac086acb713a85e.parquet\u001b[0m\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8341bbff75634f0fb235e107abc2083d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"collecting activations: 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
},
{
"data": {
"text/plain": [
"PosixPath('/media/wassname/SGIronWolf/projects5/elk/cache_transformer_acts/outputs/.ds/ds__fac086acb713a85e.parquet')"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f = activation_store(ds, model, layers=layers)\n",
"f"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['act-model.layers.0.mlp.down_proj', 'act-model.layers.1.mlp.down_proj', 'act-model.layers.2.mlp.down_proj', 'act-model.layers.3.mlp.down_proj', 'act-model.layers.4.mlp.down_proj', 'act-model.layers.5.mlp.down_proj', 'act-model.layers.6.mlp.down_proj', 'act-model.layers.7.mlp.down_proj', 'act-model.layers.8.mlp.down_proj', 'act-model.layers.9.mlp.down_proj', 'act-model.layers.10.mlp.down_proj', 'act-model.layers.11.mlp.down_proj', 'act-model.layers.12.mlp.down_proj', 'act-model.layers.13.mlp.down_proj', 'act-model.layers.14.mlp.down_proj', 'act-model.layers.15.mlp.down_proj', 'act-model.layers.16.mlp.down_proj', 'act-model.layers.17.mlp.down_proj', 'act-model.layers.18.mlp.down_proj', 'act-model.layers.19.mlp.down_proj', 'act-model.layers.20.mlp.down_proj', 'act-model.layers.21.mlp.down_proj', 'act-model.layers.22.mlp.down_proj', 'act-model.layers.23.mlp.down_proj', 'logits', 'hidden_states'],\n",
" num_rows: 20\n",
"})"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from datasets import Dataset\n",
"ds_a = Dataset.from_parquet(str(f)).with_format(\"torch\")\n",
"ds_a"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([2, 25, 453, 896])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds_a[0:2]['hidden_states'].shape # [batch, layers, tokens, hidden_states]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([2, 453, 896])"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds_a[0:2]['act-model.layers.0.mlp.down_proj'].shape"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}