Skip to content

Commit

Permalink
Fix vocab: Ċ should be line break. Also set left and right buffers
Browse files: browse the repository at this point in the history
  • Loading branch information
hijohnnylin committed Mar 29, 2024
1 parent b159010 commit 205b1c1
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions sae_analysis/neuronpedia_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,17 @@ def __init__(
n_prompts_to_select: int = 4096 * 6,
# sampling pars
n_features_at_a_time: int = 1024,
buffer_tokens: int = 8,
buffer_tokens_left: int = 8,
buffer_tokens_right: int = 8,
):
self.sae_path = sae_path
if init_session:
self.init_sae_session()

self.feature_sparsity_path = feature_sparsity_path
self.n_features_at_a_time = n_features_at_a_time
self.buffer_tokens = buffer_tokens
self.buffer_tokens_left = buffer_tokens_left
self.buffer_tokens_right = buffer_tokens_right
self.n_batches_to_sample_from = n_batches_to_sample_from
self.n_prompts_to_select = n_prompts_to_select

Expand Down Expand Up @@ -156,7 +158,8 @@ def run(self):

vocab_dict = cast(Any, self.model.tokenizer).vocab
vocab_dict = {
v: k.replace("Ġ", " ").replace("\n", "\\n") for k, v in vocab_dict.items()
v: k.replace("Ġ", " ").replace("\n", "\\n").replace("Ċ", "\n")
for k, v in vocab_dict.items()
}
# pad with blank tokens to the actual vocab size
for i in range(len(vocab_dict), self.model.cfg.d_vocab):
Expand All @@ -174,7 +177,7 @@ def run(self):
minibatch_size_tokens=64,
first_group_size=20,
other_groups_size=5,
buffer=(self.buffer_tokens, self.buffer_tokens),
buffer=(self.buffer_tokens_left, self.buffer_tokens_right),
features=features_to_process,
verbose=False,
include_left_tables=True,
Expand Down

0 comments on commit 205b1c1

Please sign in to comment.