Fix vocab: Ċ should be rendered as a line break. Also split the single token buffer setting into separate left and right buffers.

jbloomAus · Mar 29, 2024 · 205b1c1 · 205b1c1
1 parent b159010
commit 205b1c1
Showing 1 changed file with 7 additions and 4 deletions.
diff --git a/sae_analysis/neuronpedia_runner.py b/sae_analysis/neuronpedia_runner.py
@@ -45,15 +45,17 @@ def __init__(
         n_prompts_to_select: int = 4096 * 6,
         # sampling pars
         n_features_at_a_time: int = 1024,
-        buffer_tokens: int = 8,
+        buffer_tokens_left: int = 8,
+        buffer_tokens_right: int = 8,
     ):
         self.sae_path = sae_path
         if init_session:
             self.init_sae_session()
 
         self.feature_sparsity_path = feature_sparsity_path
         self.n_features_at_a_time = n_features_at_a_time
-        self.buffer_tokens = buffer_tokens
+        self.buffer_tokens_left = buffer_tokens_left
+        self.buffer_tokens_right = buffer_tokens_right
         self.n_batches_to_sample_from = n_batches_to_sample_from
         self.n_prompts_to_select = n_prompts_to_select
 
@@ -156,7 +158,8 @@ def run(self):
 
         vocab_dict = cast(Any, self.model.tokenizer).vocab
         vocab_dict = {
-            v: k.replace("Ġ", " ").replace("\n", "\\n") for k, v in vocab_dict.items()
+            v: k.replace("Ġ", " ").replace("\n", "\\n").replace("Ċ", "\n")
+            for k, v in vocab_dict.items()
         }
         # pad with blank tokens to the actual vocab size
         for i in range(len(vocab_dict), self.model.cfg.d_vocab):
@@ -174,7 +177,7 @@ def run(self):
                     minibatch_size_tokens=64,
                     first_group_size=20,
                     other_groups_size=5,
-                    buffer=(self.buffer_tokens, self.buffer_tokens),
+                    buffer=(self.buffer_tokens_left, self.buffer_tokens_right),
                     features=features_to_process,
                     verbose=False,
                     include_left_tables=True,