From 5e9a3394eaed54aabab4f991f9847c30e69e2857 Mon Sep 17 00:00:00 2001
From: Fabian Degen <106864199+degenfabian@users.noreply.github.com>
Date: Sat, 28 Dec 2024 01:43:07 +0100
Subject: [PATCH 1/3] Set prepend_bos to false by default for Qwen models (#815)

* Set prepend_bos to false by default for Qwen

* Fix typo in warning for center_unembed when logit softcap is activated

---------

Co-authored-by: Fabian Degen
---
 transformer_lens/HookedTransformer.py       | 2 +-
 transformer_lens/loading_from_pretrained.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py
index 500098c32..9fa4571ae 100644
--- a/transformer_lens/HookedTransformer.py
+++ b/transformer_lens/HookedTransformer.py
@@ -1311,7 +1311,7 @@ def from_pretrained(
             center_writing_weights = False
         if center_unembed and cfg.output_logits_soft_cap > 0.0:
             logging.warning(
-                "You tried to specify center_unembed=True for a model using logit softcap, but this can't be done! Softcapping is not invariant upon adding a constant"
+                "You tried to specify center_unembed=True for a model using logit softcap, but this can't be done! Softcapping is not invariant upon adding a constant "
                 "Setting center_unembed=False instead."
             )
             center_unembed = False
diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index b4ecc8d64..4d36c744f 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -1241,6 +1241,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "trust_remote_code": True,
             "final_rms": True,
             "gated_mlp": True,
+            "default_prepend_bos": False,
         }
     elif architecture == "Qwen2ForCausalLM":
         # Note that Qwen1.5 models have architecture type Qwen2ForCausalLM.
@@ -1265,6 +1266,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "tokenizer_prepends_bos": True,
             "final_rms": True,
             "gated_mlp": True,
+            "default_prepend_bos": False,
         }
     elif architecture == "PhiForCausalLM":
         # Architecture for microsoft/phi models
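
As a rough caller-side sketch (not part of the patch, and assuming the Qwen/Qwen2-0.5B checkpoint used later in this series is available locally), the new Qwen default behaves as follows:

    from transformer_lens import HookedTransformer

    # With "default_prepend_bos": False in the Qwen configs, to_tokens() no longer
    # prepends a BOS token unless explicitly asked to.
    model = HookedTransformer.from_pretrained("Qwen/Qwen2-0.5B")

    tokens = model.to_tokens("Hello world")                        # no BOS prepended
    tokens_bos = model.to_tokens("Hello world", prepend_bos=True)  # opt back in per call
    print(tokens.shape, tokens_bos.shape)
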
From d0d0750f36e216e4dc675b45311744d42fe5d295 Mon Sep 17 00:00:00 2001
From: Fabian Degen <106864199+degenfabian@users.noreply.github.com>
Date: Sat, 28 Dec 2024 03:35:33 +0100
Subject: [PATCH 2/3] Throw error when using attn_in with grouped query attention (#810)

* raise AssertionError when use_attn_in is used with GQA

* add test case for raising AssertionError

* make format

* rotary_base as int for gemma model to keep test from failing due to beartype error

* Test on Qwen model instead of Gemma

* Fixed beartype error by converting rotary_base to int in Qwen config

---------

Co-authored-by: Fabian Degen
---
 tests/integration/test_hooks.py             | 7 +++++++
 transformer_lens/HookedTransformer.py       | 3 +++
 transformer_lens/loading_from_pretrained.py | 4 ++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_hooks.py b/tests/integration/test_hooks.py
index 6a9880a67..29d5ff9ed 100644
--- a/tests/integration/test_hooks.py
+++ b/tests/integration/test_hooks.py
@@ -234,3 +234,10 @@ def set_to_randn(z, hook):

     # exactly when the zero hook is attached last XOR it is prepended
     assert torch.allclose(logits, model.unembed.b_U[None, :]) == logits_are_unembed_bias
+
+
+def test_use_attn_in_with_gqa_raises_error():
+    # Create model that uses GroupedQueryAttention
+    model = HookedTransformer.from_pretrained("Qwen/Qwen2-0.5B")
+    with pytest.raises(AssertionError):
+        model.set_use_attn_in(True)
diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py
index 9fa4571ae..a34a5c4a0 100644
--- a/transformer_lens/HookedTransformer.py
+++ b/transformer_lens/HookedTransformer.py
@@ -1969,6 +1969,9 @@ def set_use_attn_in(self, use_attn_in: bool):
         """
         Toggles whether to allow editing of inputs to each attention head.
         """
+        assert (
+            self.cfg.n_key_value_heads is None
+        ), "Can't use attn_in with GroupedQueryAttention, please use split_qkv_input instead"
         self.cfg.use_attn_in = use_attn_in

     def set_ungroup_grouped_query_attention(self, ungroup_grouped_query_attention: bool):
diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index 4d36c744f..623d1a43d 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -1260,7 +1260,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "initializer_range": hf_config.initializer_range,
             "normalization_type": "RMS",
             "positional_embedding_type": "rotary",
-            "rotary_base": hf_config.rope_theta,
+            "rotary_base": int(hf_config.rope_theta),
             "rotary_adjacent_pairs": False,
             "rotary_dim": hf_config.hidden_size // hf_config.num_attention_heads,
             "tokenizer_prepends_bos": True,
@@ -1327,7 +1327,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "act_fn": "gelu_new",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 10000.0,
+            "rotary_base": 10000,
             "rotary_dim": 256,
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
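
A similar caller-side sketch (again not part of the patch): on a grouped-query-attention model such as Qwen/Qwen2-0.5B, enabling attn_in now raises, and split_qkv_input is the supported alternative named in the error message:

    import pytest
    from transformer_lens import HookedTransformer

    # Qwen2-0.5B sets cfg.n_key_value_heads, so it uses grouped query attention.
    model = HookedTransformer.from_pretrained("Qwen/Qwen2-0.5B")

    with pytest.raises(AssertionError):
        model.set_use_attn_in(True)      # now fails loudly instead of silently misbehaving

    model.set_use_split_qkv_input(True)  # the suggested alternative still works
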
""" + assert ( + self.cfg.n_key_value_heads is None + ), "Can't use attn_in with GroupedQueryAttention, please use split_qkv_input instead" self.cfg.use_attn_in = use_attn_in def set_ungroup_grouped_query_attention(self, ungroup_grouped_query_attention: bool): diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py index 4d36c744f..623d1a43d 100644 --- a/transformer_lens/loading_from_pretrained.py +++ b/transformer_lens/loading_from_pretrained.py @@ -1260,7 +1260,7 @@ def convert_hf_model_config(model_name: str, **kwargs): "initializer_range": hf_config.initializer_range, "normalization_type": "RMS", "positional_embedding_type": "rotary", - "rotary_base": hf_config.rope_theta, + "rotary_base": int(hf_config.rope_theta), "rotary_adjacent_pairs": False, "rotary_dim": hf_config.hidden_size // hf_config.num_attention_heads, "tokenizer_prepends_bos": True, @@ -1327,7 +1327,7 @@ def convert_hf_model_config(model_name: str, **kwargs): "act_fn": "gelu_new", "initializer_range": 0.02, "normalization_type": "RMS", - "rotary_base": 10000.0, + "rotary_base": 10000, "rotary_dim": 256, "positional_embedding_type": "rotary", "use_attn_scale": True, From cc927d7fb4ec0496a7a271f72969c3ea3c0af6b9 Mon Sep 17 00:00:00 2001 From: Bryce Meyer Date: Tue, 31 Dec 2024 02:43:40 +0100 Subject: [PATCH 3/3] Feature llama 33 (#826) * added llama 3.3 config * fixed key * added debug point * updated model compatibility notebook * ran format * removed log point --- demos/Colab_Compatibility.ipynb | 5 ++-- transformer_lens/loading_from_pretrained.py | 33 ++++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/demos/Colab_Compatibility.ipynb b/demos/Colab_Compatibility.ipynb index 2fa9838eb..2f28a37a1 100644 --- a/demos/Colab_Compatibility.ipynb +++ b/demos/Colab_Compatibility.ipynb @@ -58,14 +58,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "TransformerLens currently supports 205 models out of the box.\n" + "TransformerLens currently supports 206 models out of the box.\n" ] } ], @@ -429,6 +429,7 @@ " \"meta-llama/Llama-2-70b-chat-hf\",\n", " \"meta-llama/Llama-3.1-70B\",\n", " \"meta-llama/Llama-3.1-70B-Instruct\",\n", + " \"meta-llama/Llama-3.3-70B-Instruct\",\n", " \"meta-llama/Meta-Llama-3-70B\",\n", " \"meta-llama/Meta-Llama-3-70B-Instruct\",\n", " \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n", diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py index 623d1a43d..17d32e8c7 100644 --- a/transformer_lens/loading_from_pretrained.py +++ b/transformer_lens/loading_from_pretrained.py @@ -151,14 +151,15 @@ "meta-llama/Meta-Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-70B", "meta-llama/Meta-Llama-3-70B-Instruct", - "meta-llama/Llama-3.2-1B", - "meta-llama/Llama-3.2-3B", - "meta-llama/Llama-3.2-1B-Instruct", - "meta-llama/Llama-3.2-3B-Instruct", "meta-llama/Llama-3.1-70B", "meta-llama/Llama-3.1-8B", "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.1-70B-Instruct", + "meta-llama/Llama-3.2-1B", + "meta-llama/Llama-3.2-3B", + "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Llama-3.2-3B-Instruct", + "meta-llama/Llama-3.3-70B-Instruct", "Baidicoot/Othello-GPT-Transformer-Lens", "bert-base-cased", "roneneldan/TinyStories-1M", @@ -960,6 +961,30 @@ def convert_hf_model_config(model_name: str, **kwargs): "NTK_by_parts_high_freq_factor": 4.0, "NTK_by_parts_factor": 32.0, } + elif 
"Llama-3.3-70B" in official_model_name: + cfg_dict = { + "d_model": 8192, + "d_head": 128, + "n_heads": 64, + "d_mlp": 28672, + "n_layers": 80, + "n_ctx": 2048, # capped due to memory issues + "eps": 1e-5, + "d_vocab": 128256, + "act_fn": "silu", + "n_key_value_heads": 8, + "normalization_type": "RMS", + "positional_embedding_type": "rotary", + "rotary_adjacent_pairs": False, + "rotary_dim": 32, + "final_rms": True, + "gated_mlp": True, + "rotary_base": 500000.0, + "use_NTK_by_parts_rope": True, + "NTK_by_parts_low_freq_factor": 1.0, + "NTK_by_parts_high_freq_factor": 4.0, + "NTK_by_parts_factor": 8.0, + } elif "Llama-3.1-8B" in official_model_name: cfg_dict = { "d_model": 4096,