diff --git a/scripts/run.ipynb b/scripts/run.ipynb
index 824b431c..f13aac59 100644
--- a/scripts/run.ipynb
+++ b/scripts/run.ipynb
@@ -256,13 +256,11 @@
     "    d_in=1024,  # the width of the mlp output.\n",
     "    dataset_path=\"apollo-research/roneneldan-TinyStories-tokenizer-gpt2\",  # this is a tokenized language dataset on Huggingface for the Tiny Stories corpus.\n",
     "    is_dataset_tokenized=True,\n",
-    "    \n",
     "    # SAE Parameters\n",
     "    mse_loss_normalization=None,  # We won't normalize the mse loss,\n",
     "    expansion_factor=16,  # the width of the SAE. Larger will result in better stats but slower training.\n",
     "    b_dec_init_method=\"geometric_median\",  # The geometric median can be used to initialize the decoder weights.\n",
     "    apply_b_dec_to_input=False,  # We won't apply the decoder to the input.\n",
-    "    \n",
     "    # Training Parameters\n",
     "    lr=0.0008,  # lower the better, we'll go fairly high to speed up the tutorial.\n",
     "    lr_scheduler_name=\"constant\",  # constant learning rate with warmup. Could be better schedules out there.\n",
@@ -271,15 +269,13 @@
     "    lp_norm=1.0,  # the L1 penalty (and not a Lp for p < 1)\n",
     "    train_batch_size=4096,\n",
     "    context_size=128,  # will control the lenght of the prompts we feed to the model. Larger is better but slower.\n",
-    "    \n",
     "    # Activation Store Parameters\n",
     "    n_batches_in_buffer=64,  # controls how many activations we store / shuffle.\n",
-    "    training_tokens=1_000_000 * 25,  # 100 million tokens is quite a few, but we want to see good stats. Get a coffee, come back.\n",
+    "    training_tokens=1_000_000\n",
+    "    * 25,  # 100 million tokens is quite a few, but we want to see good stats. Get a coffee, come back.\n",
     "    finetuning_method=\"decoder\",\n",
     "    finetuning_tokens=1_000_000 * 25,\n",
     "    store_batch_size=32,\n",
-    "    \n",
-    "    \n",
     "    # Resampling protocol\n",
     "    use_ghost_grads=False,\n",
     "    feature_sampling_window=1000,  # this controls our reporting of feature sparsity stats\n",
@@ -620,8 +616,6 @@
     }
    ],
    "source": [
-    "\n",
-    "\n",
     "cfg = LanguageModelSAERunnerConfig(\n",
     "    # Data Generating Function (Model + Training Distibuion)\n",
     "    model_name=\"gpt2-small\",\n",
@@ -630,7 +624,7 @@
     "    d_in=768,\n",
     "    dataset_path=\"apollo-research/Skylion007-openwebtext-tokenizer-gpt2\",\n",
     "    is_dataset_tokenized=True,\n",
-    "    prepend_bos=True, # should experiment with turning this off.\n",
+    "    prepend_bos=True,  # should experiment with turning this off.\n",
     "    # SAE Parameters\n",
     "    expansion_factor=32,  # determines the dimension of the SAE.\n",
     "    b_dec_init_method=\"geometric_median\",  # geometric median is better but slower to get started\n",