Fix all Sphinx warnings (#422)
Fixes all warnings and will now error if new warnings are created (so we catch them in the CD).
alan-cooney authored Oct 19, 2023
1 parent 11cd1c3 commit c49739f
Showing 16 changed files with 177 additions and 162 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/gh-pages.yml
@@ -20,12 +20,10 @@ jobs:
- uses: actions/checkout@v2
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: 1.4.0
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: "3.9"
python-version: "3.11"
- name: Install dependencies
run: poetry install --with docs
- name: Build Docs
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -30,6 +30,7 @@
"gelu",
"githubpages",
"gptj",
"howpublished",
"huggingface",
"interpretability",
"isort",
@@ -41,6 +42,7 @@
"Nanda",
"neel",
"neox",
"Nitpicky",
"Olah",
"pagename",
"probs",
@@ -51,6 +53,7 @@
"templatedir",
"templatename",
"toctree",
"transformerlens",
"Unembed",
"unembedding"
],
28 changes: 18 additions & 10 deletions docs/make_docs.py
@@ -1,11 +1,4 @@
"""
Generate a markdown table summarizing properties of pretrained models.
This script extracts various properties of pretrained models from the
`easy_transformer` library, such as the number of parameters, layers, and heads,
among others, and generates a markdown table. This table is saved to the
docs directory.
"""
"""Build the API Documentation."""
import subprocess
from functools import lru_cache
from pathlib import Path
@@ -76,7 +69,12 @@ def get_property(name, model_name):


def generate_model_table():
"""Generate a markdown table summarizing properties of pretrained models."""
"""Generate a markdown table summarizing properties of pretrained models.
This script extracts various properties of pretrained models from the `easy_transformer`
library, such as the number of parameters, layers, and heads, among others, and generates a
markdown table.
"""

# Create the table
column_names = [
Expand Down Expand Up @@ -115,7 +113,17 @@ def generate_model_table():
def build_docs():
"""Build the docs."""
generate_model_table()
subprocess.run(["sphinx-build", SOURCE_PATH, BUILD_PATH], check=True)

subprocess.run(
[
"sphinx-build",
SOURCE_PATH,
BUILD_PATH,
# "-n", # Nitpicky mode (warn about all missing references)
"-W", # Turn warnings into errors
],
check=True,
)


def docs_hot_reload():
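For context (not part of the diff): with `-W`, a single Sphinx warning now fails the build, so a local run of `build_docs` surfaces the same errors the CD job would. A rough sketch, assuming the module is importable as `docs.make_docs`:

```python
# Sketch of a strict local docs build; the import path is an assumption.
from docs.make_docs import build_docs  # assumed import path

# With the "-W" flag added above, sphinx-build exits non-zero on the first
# warning; subprocess.run(check=True) then raises CalledProcessError, so the
# build fails instead of silently accumulating warnings.
build_docs()
```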
1 change: 0 additions & 1 deletion docs/source/conf.py
@@ -23,7 +23,6 @@
"sphinx.ext.napoleon",
"myst_parser",
"sphinx.ext.githubpages",
"sphinx.ext.apidoc",
]

source_suffix = {
17 changes: 9 additions & 8 deletions docs/source/content/citation.md
@@ -1,14 +1,15 @@

## Citation
# Citation

Please cite this library as:
```
@misc{nandatransformerlens2022,
title = {TransformerLens},
author = {Nanda, Neel},
url = {/~https://github.com/neelnanda-io/TransformerLens},
year = {2022}

```BibTeX
@misc{nanda2022transformerlens,
title = {TransformerLens},
author = {Neel Nanda},
year = {2022},
howpublished = {\url{/~https://github.com/neelnanda-io/TransformerLens}},
}
```
(This is my best guess for how citing software works, feel free to send a correction!)

Also, if you're actually using this for your research, I'd love to chat! Reach out at neelnanda27@gmail.com
10 changes: 5 additions & 5 deletions docs/source/content/development.md
@@ -1,10 +1,10 @@
## Local Development
# Local Development

### DevContainer
## DevContainer

For a one-click setup of your development environment, this project includes a [DevContainer](https://containers.dev/). It can be used locally with [VS Code](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or with [GitHub Codespaces](/~https://github.com/features/codespaces).

### Manual Setup
## Manual Setup

This project uses [Poetry](https://python-poetry.org/docs/#installation) for package management. Install as follows (this will also setup your virtual environment):

@@ -17,12 +17,12 @@ Optionally, if you want Jupyter Lab you can run `poetry run pip install jupyterlab`

Then the library can be imported as `import transformer_lens`.

### Testing
## Testing

If adding a feature, please add unit tests for it to the tests folder, and check that it hasn't broken anything major using the existing tests (install pytest and run it in the root TransformerLens/ directory).

To run tests, you can use the following command:

```
```shell
poetry run pytest -v transformer_lens/tests
```
3 changes: 2 additions & 1 deletion docs/source/content/gallery.md
@@ -1,5 +1,6 @@
## Gallery
# Gallery

User contributed examples of the library being used in action:

* [Induction Heads Phase Change Replication](https://colab.research.google.com/github/ckkissane/induction-heads-transformer-lens/blob/main/Induction_Heads_Phase_Change.ipynb): A partial replication of [In-Context Learning and Induction Heads](https://transformer-circuits.pub/2022/in-context-learning-and-induction-heads/index.html) from Connor Kissane
* [Decision Transformer Interpretability](/~https://github.com/jbloomAus/DecisionTransformerInterpretability): A set of scripts for training decision transformers which uses transformer lens to view intermediate activations, perform attribution and ablations. A write up of the initial work can be found [here](https://www.lesswrong.com/posts/bBuBDJBYHt39Q5zZy/decision-transformer-interpretability).
9 changes: 4 additions & 5 deletions docs/source/content/getting_started.md
@@ -1,22 +1,21 @@
## Getting Started
# Getting Started

**Start with the [main demo](https://neelnanda.io/transformer-lens-demo) to learn how the library works, and the basic features**.

To see what using it for exploratory analysis in practice looks like, check out [my notebook analysing Indirect Object Identification](https://neelnanda.io/exploratory-analysis-demo) or [my recording of myself doing research](https://www.youtube.com/watch?v=yo4QvDn-vsU)!

Mechanistic interpretability is a very young and small field, and there are a *lot* of open problems - if you would like to help, please try working on one! **Check out my [list of concrete open problems](https://docs.google.com/document/d/1WONBzNqfKIxERejrrPlQMyKqg7jSFW92x5UMXNrMdPo/edit) to figure out where to start.** It begins with advice on skilling up, and key resources to check out.

If you're new to transformers, check out my [what is a transformer tutorial](https://neelnanda.io/transformer-tutorial) and [tutorial on coding GPT-2 from scratch](https://neelnanda.io/transformer-tutorial-2) (with [an accompanying template](https://neelnanda.io/transformer-template) to write one yourself!)

### Advice for Reading the Code
## Advice for Reading the Code

One significant design decision made was to have a single transformer implementation that could support a range of subtly different GPT-style models. This has the upside of interpretability code just working for arbitrary models when you change the model name in `HookedTransformer.from_pretrained`! But it has the significant downside that the code implementing the model (in `HookedTransformer.py` and `components.py`) can be difficult to read. I recommend starting with my [Clean Transformer Demo](https://neelnanda.io/transformer-solution), which is a clean, minimal implementation of GPT-2 with the same internal architecture and activation names as HookedTransformer, but is significantly clearer and better documented.

### Installation
## Installation

`pip install git+/~https://github.com/neelnanda-io/TransformerLens`

Import the library with `import transformer_lens`

(Note: This library used to be known as EasyTransformer, and some breaking changes have been made since the rename. If you need to use the old version with some legacy code, run `pip install git+/~https://github.com/neelnanda-io/TransformerLens@v1`.)

6 changes: 3 additions & 3 deletions docs/source/content/tutorials.md
@@ -1,14 +1,14 @@
## Tutorials
# Tutorials

- **Start with the [main demo](https://neelnanda.io/transformer-lens-demo) to learn how the library works, and the basic features**.

### Where To Start
## Where To Start

- To see what using it for exploratory analysis in practice looks like, check out [my notebook analysing Indirect Object Identification](https://neelnanda.io/exploratory-analysis-demo) or [my recording of myself doing research](https://www.youtube.com/watch?v=yo4QvDn-vsU)!

- [What is a Transformer tutorial](https://neelnanda.io/transformer-tutorial)

### Demos
## Demos

- [**Activation Patching in TransformerLens**](https://colab.research.google.com/github/neelnanda-io/TransformerLens/blob/main/demos/Activation_Patching_in_TL_Demo.ipynb) - Accompanies the [Exploratory Analysis Demo](https://colab.research.google.com/github/neelnanda-io/TransformerLens/blob/main/demos/Exploratory Analysis Demo.ipynb). This demo explains how to use [Activation Patching](https://dynalist.io/d/n2ZWtnoYHrU1s4vnFSAQ519J#z=qeWBvs-R-taFfcCq-S_hgMqx) in TransformerLens, a mechanistic interpretability technique that uses causal intervention to identify which activations in a model matter for producing an output.

39 changes: 18 additions & 21 deletions transformer_lens/ActivationCache.py
@@ -268,24 +268,21 @@ def logit_attrs(
difference attributions for the residual stack if incorrect_tokens is provided.
Args:
residual_stack (Float[torch.Tensor, "num_components *batch_and_pos_dims d_model"]):
stack of components of residual stream to get logit attributions for.
tokens (Union[str, int, Int[torch.Tensor, ""], Int[torch.Tensor, "batch"],
Int[torch.Tensor, "batch position"]]): tokens to compute logit attributions on.
incorrect_tokens (Union[str, int, Int[torch.Tensor, ""], Int[torch.Tensor, "batch"],
Int[torch.Tensor, "batch position"]], optional): if provided, compute attributions
residual_stack: Stack of components of residual stream to get logit attributions for.
tokens: tokens to compute logit attributions on.
incorrect_tokens: if provided, compute attributions
on logit difference between tokens and incorrect_tokens. Must have the same shape as
tokens.
pos_slice (Slice, optional): The slice to apply layer norm scaling on. Defaults to None,
pos_slice: The slice to apply layer norm scaling on. Defaults to None,
do nothing.
batch_slice (Slice, optional): The slice to take on the batch dimension during layer
batch_slice: The slice to take on the batch dimension during layer
norm scaling. Defaults to None, do nothing.
has_batch_dim (bool, optional): Whether residual_stack has a batch dimension. Defaults
has_batch_dim: Whether residual_stack has a batch dimension. Defaults
to True.
Returns:
Components: A [num_components, *batch_and_pos_dims] tensor of the logit attributions or
logit difference attributions if incorrect_tokens was provided.
Components: A tensor of the logit attributions or logit difference attributions if
incorrect_tokens was provided.
"""
if not isinstance(pos_slice, Slice):
pos_slice = Slice(pos_slice)
@@ -352,25 +349,25 @@ def decompose_resid(
useful for attributing model behaviour to different components of the residual stream
Args:
layer (int): The layer to take components up to - by default includes
layer: The layer to take components up to - by default includes
resid_pre for that layer and excludes resid_mid and resid_post for that layer.
layer==n_layers means to return all layer outputs incl in the final layer, layer==0
means just embed and pos_embed. The indices are taken such that this gives the
accumulated streams up to the input to layer l
incl_mid (bool, optional): Whether to return resid_mid for all previous
incl_mid: Whether to return resid_mid for all previous
layers. Defaults to False.
mlp_input (bool, optional): Whether to include attn_out for the current
mlp_input: Whether to include attn_out for the current
layer - essentially decomposing the residual stream that's input to the MLP input
rather than the Attn input. Defaults to False.
mode (str): Values are "all", "mlp" or "attn". "all" returns all
mode: Values are "all", "mlp" or "attn". "all" returns all
components, "mlp" returns only the MLP components, and "attn" returns only the
attention components. Defaults to "all".
apply_ln (bool, optional): Whether to apply LayerNorm to the stack. Defaults to False.
pos_slice (Slice): A slice object to apply to the pos dimension.
apply_ln: Whether to apply LayerNorm to the stack. Defaults to False.
pos_slice: A slice object to apply to the pos dimension.
Defaults to None, do nothing.
incl_embeds (bool): Whether to include embed & pos_embed return_labels (bool, optional):
Whether to return a list of labels for
the residual stream components. Useful for labelling graphs. Defaults to True.
incl_embeds: Whether to include embed & pos_embed
return_labels: Whether to return a list of labels for the residual stream components.
Useful for labelling graphs. Defaults to True.
Returns:
Components: A [num_components, batch_size, pos, d_model] tensor of the accumulated
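For context (not part of the diff), a rough usage sketch of the two methods whose docstrings are cleaned up above; the model name, prompt and answer token are illustrative, and `run_with_cache` returning an `ActivationCache` is assumed:

```python
# Sketch: attribute the final-position logit of an answer token to each
# residual stream component, using decompose_resid and logit_attrs.
from transformer_lens import HookedTransformer

model = HookedTransformer.from_pretrained("gpt2")  # illustrative model
tokens = model.to_tokens("The Eiffel Tower is located in the city of")
_, cache = model.run_with_cache(tokens)

# Stack of per-component contributions to the residual stream, with labels.
residual_stack, labels = cache.decompose_resid(
    layer=model.cfg.n_layers,  # layer == n_layers: include every layer's outputs
    mode="all",
    return_labels=True,
)

# Project each component onto the logit direction of " Paris" (assumed to be
# a single token), applying layer norm scaling at the final position.
attributions = cache.logit_attrs(residual_stack, tokens=" Paris", pos_slice=-1)
for label, value in zip(labels, attributions.mean(dim=-1).tolist()):
    print(f"{label}: {value:.3f}")
```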
3 changes: 1 addition & 2 deletions transformer_lens/HookedTransformer.py
@@ -1338,7 +1338,6 @@ def from_pretrained_no_processing(
def init_weights(self):
"""Initialize weights.
Initialize weights matrices with a normal of std=initializer_range (default=0.02). This
roughly follows the GPT-2 paper's scheme (but with truncation, and not halving the std for
W_pos).
@@ -1348,7 +1347,7 @@ def init_weights():
Weight matrices are set to empty by default (to save space + compute, since they're the bulk
of the parameters), so it is important to call this if you are not loading in pretrained
weights! Note that this function assumes that weight names begin with W_
weights! Note that this function assumes that weight names begin with `W_`.
Set seed here to ensure determinism.
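For context (not part of the diff), a rough sketch of the situation the note above describes: building a model from a config rather than `from_pretrained`, with illustrative config values:

```python
# Sketch: init_weights matters when you are not loading pretrained weights.
from transformer_lens import HookedTransformer, HookedTransformerConfig

cfg = HookedTransformerConfig(
    n_layers=2,
    d_model=128,
    d_head=32,
    n_heads=4,
    n_ctx=256,
    d_vocab=50257,
    act_fn="gelu",
)
model = HookedTransformer(cfg)
# Draws each W_* matrix from a normal with std=cfg.initializer_range
# (roughly the GPT-2 scheme described in the docstring above).
model.init_weights()
```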
50 changes: 33 additions & 17 deletions transformer_lens/SVDInterpreter.py
@@ -32,30 +32,46 @@ def get_singular_vectors(
) -> torch.Tensor:
"""Gets the singular vectors for a given vector type, layer, and optionally head.
Options:
- OV: Get the singular vectors of the OV matrix for a particular layer and head.
- w_in: Get the singular vectors of the w_in matrix for a particular layer.
- w_out: Get the singular vectors of the w_out matrix for a particular layer.
Returns a (d_vocab, 1, num_vectors) tensor.
This tensor can then be plotted using Neel's PySvelte, as demonstrated in the demo for this feature. The demo also points out some "gotchas" in this feature - numerical instability means inconsistency across devices, and the default HookedTransformer parameters don't replicate the original SVD post very well. So I'd recommend checking out the demo if you want to use this!
This tensor can then be plotted using Neel's PySvelte, as demonstrated in the demo for this
feature. The demo also points out some "gotchas" in this feature - numerical instability
means inconsistency across devices, and the default HookedTransformer parameters don't
replicate the original SVD post very well. So I'd recommend checking out the demo if you
want to use this!
Example:
.. code-block:: python
from transformer_lens import HookedTransformer, SVDInterpreter

model = HookedTransformer.from_pretrained('gpt2-medium')
svd_interpreter = SVDInterpreter(model)
ov = svd_interpreter.get_singular_vectors('OV', layer_index=22, head_index=10)

all_tokens = [model.to_str_tokens(np.array([i])) for i in range(model.cfg.d_vocab)]
all_tokens = [all_tokens[i][0] for i in range(model.cfg.d_vocab)]

def plot_matrix(matrix, tokens, k=10, filter="topk"):
    pysvelte.TopKTable(
        tokens=all_tokens,
        activations=matrix,
        obj_type="SVD direction",
        k=k,
        filter=filter
    ).show()

plot_matrix(ov, all_tokens)
Args:
vector_type: Type of the vector:
- "OV": Singular vectors of the OV matrix for a particular layer and head.
- "w_in": Singular vectors of the w_in matrix for a particular layer.
- "w_out": Singular vectors of the w_out matrix for a particular layer.
layer_index: The index of the layer.
num_vectors: Number of vectors.
head_index: Index of the head.
"""

if head_index is None:
assert vector_type in [
26 changes: 13 additions & 13 deletions transformer_lens/evals.py
@@ -174,7 +174,7 @@ class IOIDataset(Dataset):
Paper: https://arxiv.org/pdf/2211.00593.pdf
Example:
--------
.. code-block:: python
>>> from transformer_lens.evals import ioi_eval, IOIDataset
@@ -281,22 +281,22 @@ def get_default_nouns():
}


# %%
@torch.inference_mode()
def ioi_eval(
model, dataset=None, batch_size=8, num_samples=1000, tokenizer=None, symmetric=False
):
"""
Evaluates the model on the Indirect Object Identification task.
dataset must be a torch Dataset that returns a dict:
{
'prompt': torch.LongTensor,
'IO': torch.LongTensor,
'S': torch.LongTensor
}
Returns average logit difference and accuracy.
"""Evaluate the Model on the Indirect Object Identification Task.
Args:
model: HookedTransformer model.
dataset: PyTorch Dataset that returns a dict with keys "prompt", "IO", and "S".
batch_size: Batch size to use.
num_samples: Number of samples to use.
tokenizer: Tokenizer to use.
symmetric: Whether to use the symmetric version of the task.
Returns:
Average logit difference and accuracy.
"""
if tokenizer is None:
tokenizer = model.tokenizer
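For context (not part of the diff), a minimal usage sketch for the rewritten `ioi_eval` docstring; the model name is illustrative, and the built-in default dataset is assumed when `dataset` is omitted:

```python
# Sketch: run the IOI eval with its default prompts and report the results.
from transformer_lens import HookedTransformer
from transformer_lens.evals import ioi_eval

model = HookedTransformer.from_pretrained("gpt2")  # illustrative model

# Per the docstring above, this returns the average logit difference between
# the indirect-object ("IO") and subject ("S") completions, plus accuracy.
results = ioi_eval(model, batch_size=8, num_samples=100)
print(results)
```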