reformat run.ipynb
jbloom-md committed Apr 18, 2024
1 parent 11a71e1 commit 822882c
Showing 1 changed file with 3 additions and 11 deletions.
14 changes: 3 additions & 11 deletions scripts/run.ipynb
@@ -256,13 +256,11 @@
" d_in=1024, # the width of the mlp output.\n",
" dataset_path=\"apollo-research/roneneldan-TinyStories-tokenizer-gpt2\", # this is a tokenized language dataset on Huggingface for the Tiny Stories corpus.\n",
" is_dataset_tokenized=True,\n",
" \n",
" # SAE Parameters\n",
" mse_loss_normalization=None, # We won't normalize the mse loss,\n",
" expansion_factor=16, # the width of the SAE. Larger will result in better stats but slower training.\n",
" b_dec_init_method=\"geometric_median\", # The geometric median can be used to initialize the decoder weights.\n",
" apply_b_dec_to_input=False, # We won't apply the decoder to the input.\n",
" \n",
" # Training Parameters\n",
" lr=0.0008, # lower the better, we'll go fairly high to speed up the tutorial.\n",
" lr_scheduler_name=\"constant\", # constant learning rate with warmup. Could be better schedules out there.\n",
@@ -271,15 +269,13 @@
" lp_norm=1.0, # the L1 penalty (and not a Lp for p < 1)\n",
" train_batch_size=4096,\n",
" context_size=128, # will control the lenght of the prompts we feed to the model. Larger is better but slower.\n",
" \n",
" # Activation Store Parameters\n",
" n_batches_in_buffer=64, # controls how many activations we store / shuffle.\n",
" training_tokens=1_000_000 * 25, # 100 million tokens is quite a few, but we want to see good stats. Get a coffee, come back.\n",
" training_tokens=1_000_000\n",
" * 25, # 100 million tokens is quite a few, but we want to see good stats. Get a coffee, come back.\n",
" finetuning_method=\"decoder\",\n",
" finetuning_tokens=1_000_000 * 25,\n",
" store_batch_size=32,\n",
" \n",
" \n",
" # Resampling protocol\n",
" use_ghost_grads=False,\n",
" feature_sampling_window=1000, # this controls our reporting of feature sparsity stats\n",
@@ -620,8 +616,6 @@
}
],
"source": [
"\n",
"\n",
"cfg = LanguageModelSAERunnerConfig(\n",
" # Data Generating Function (Model + Training Distibuion)\n",
" model_name=\"gpt2-small\",\n",
@@ -630,7 +624,7 @@
" d_in=768,\n",
" dataset_path=\"apollo-research/Skylion007-openwebtext-tokenizer-gpt2\",\n",
" is_dataset_tokenized=True,\n",
" prepend_bos=True, # should experiment with turning this off.\n",
" prepend_bos=True, # should experiment with turning this off.\n",
" # SAE Parameters\n",
" expansion_factor=32, # determines the dimension of the SAE.\n",
" b_dec_init_method=\"geometric_median\", # geometric median is better but slower to get started\n",
@@ -650,13 +644,11 @@
" finetuning_method=\"decoder\",\n",
" finetuning_tokens=1_000_000 * 100,\n",
" store_batch_size=32,\n",
" \n",
" # Resampling protocol\n",
" use_ghost_grads=False,\n",
" feature_sampling_window=2500,\n",
" dead_feature_window=5000,\n",
" dead_feature_threshold=1e-8,\n",
" \n",
" # WANDB\n",
" log_to_wandb=True,\n",
" wandb_project=\"gpt2_small_experiments_april\",\n",
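
For orientation, the numbers implied by the first (tiny-stories) config can be worked out directly from the values visible in the diff. The sketch below is plain Python arithmetic rather than a call into the library whose config class is shown; the buffer-size line additionally assumes the activation store holds n_batches_in_buffer × store_batch_size × context_size tokens, which is an assumption about how the store is sized, not something shown in this commit.

```python
# Back-of-the-envelope figures implied by the tiny-stories config above.
# Plain arithmetic on values visible in the diff; no library imports required.

d_in = 1024                      # width of the MLP output being dictionary-learned
expansion_factor = 16            # SAE width multiplier
train_batch_size = 4096          # activations per optimizer step
context_size = 128               # tokens per prompt fed to the model
training_tokens = 1_000_000 * 25
n_batches_in_buffer = 64
store_batch_size = 32            # prompts per forward pass when filling the buffer

d_sae = d_in * expansion_factor                        # 16_384 SAE features
total_steps = training_tokens // train_batch_size      # ~6_103 optimizer steps
# Assumption: the activation buffer holds this many tokens at a time.
buffer_tokens = n_batches_in_buffer * store_batch_size * context_size  # 262_144

print(f"d_sae={d_sae}, steps={total_steps:,}, buffer tokens={buffer_tokens:,}")
```

If the second (gpt2-small) config uses comparable batch and buffer settings, the same arithmetic applies with d_in=768 and expansion_factor=32, giving a 24,576-wide SAE.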
